In [None]:
import torchvision
import torch
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from dcai.score import ScoreTracker
from dcai.dataset import TrainDataset, ValidationDataset
from typing import List

In [None]:
# Team name handed to the score tracker.
TEAM_NAME = "example-clustering"

# The tracker follows the score as a function of the number of annotations bought.
score_tracker = ScoreTracker(team_name=TEAM_NAME)

# Load the MNIST training set used throughout the rest of the notebook.
train_dataset = TrainDataset()

### Indices tracking

We often need to keep track of indices into the original dataset, which can be a bit challenging. Below is an example of how this can be done. You can use the `np.nonzero` or `np.where` function for this.

`mask_generic_class` is a boolean vector that is `True` for the datapoints we want to select and `False` for all others. In this case we want to select the generic class `1`.

`idx_generic_class` is a vector with the indices of all elements selected by the mask. In this case, these are the indices of all datapoints with the generic class `1`.

In [None]:
# Boolean mask over the training labels: True where the label equals generic class 1.
mask_generic_class = train_dataset.y == 1

# np.nonzero returns a tuple with one index array per dimension; unpack the single
# array of positions where the mask is True (assumes the labels are a 1-D vector).
(idx_generic_class,) = np.nonzero(mask_generic_class)

# Preview the first entries of each vector, one per output line.
print(
    "The first 10 elements of these vectors:",
    f"train_dataset.y    = {train_dataset.y[:10]}",
    f"mask_generic_class = {mask_generic_class[:10]}",
    f"idx_generic_class  = {idx_generic_class[:10]}",
    sep="\n",
)

You can also consider creating a small pandas dataframe where you do the bookkeeping of all indices:

In [None]:
# Bookkeeping table with one row per training sample; the row index is the sample identifier.
df_bookkeeper = pd.DataFrame({"current_label": train_dataset.y}).assign(
    # Nothing is marked as bought when the table is created.
    bought=False,
    # True for the rows whose current label is the generic class 1.
    mask_generic_class=lambda frame: frame["current_label"] == 1,
)

In [None]:
# Row labels double as sample identifiers, so filtering on the boolean column and
# reading the index gives the ids of all generic-class samples.
df_bookkeeper[df_bookkeeper["mask_generic_class"]].index

In [None]:
# You can update the bought column by running the assignment below.
# `.loc[rows, column]` writes directly into df_bookkeeper. The chained form
# `df_bookkeeper.bought[i] = True` assigns through an intermediate Series, which
# raises chained-assignment warnings and leaves the frame unchanged under pandas
# Copy-on-Write. A single label-based assignment also avoids a Python loop.
# Assumes annotations_bought holds sample ids present in the frame's index -- TODO confirm.
df_bookkeeper.loc[list(train_dataset.annotations_bought), "bought"] = True

## Hands-on: cluster the generic class into two clusters

* Train a clustering algorithm with 2 clusters on the generic class
* Predict using the clustering algorithm. Find out which cluster is which class (e.g. by buying some annotations)
* Retrain using the labels from the clustering algorithm

In [None]:
from sklearn.cluster import ...

In [None]:
# Exercise placeholders: replace each `...` with the sample ids to buy annotations for,
# chosen from the samples assigned to cluster 0 and cluster 1 respectively.
# Both values are passed to train_dataset.buy_annotations in the next cell.
# NOTE(review): presumably these are indices into the original training set, as in
# the index-tracking examples earlier in the notebook -- confirm against buy_annotations.
sample_ids_for_cluster0 = ...
sample_ids_for_cluster1 = ...

In [None]:
# Buy annotations for the chosen samples of each cluster; comparing the results per
# cluster is how you find out which cluster corresponds to which class.
# NOTE(review): bought annotations presumably count towards the tracked score -- confirm.
labels_for_cluster0 = train_dataset.buy_annotations(sample_ids_for_cluster0)
labels_for_cluster1 = train_dataset.buy_annotations(sample_ids_for_cluster1)

In [None]:
# Exercise placeholder: replace `...` with the updated label vector for the training
# set. The intent is that the generic-class samples carry the labels derived from the
# clustering step, so the model can be retrained on them.
train_dataset.y = ...

In [None]:
# Train and score a new model on the training set with the updated labels.
score_tracker.train_and_score_model(train_dataset)

In [None]:
# Plot the tracked scores; the call's two return values are kept as the fitted model
# and its metrics for the follow-up questions below.
# NOTE(review): confirm that plot_scores (rather than train_and_score_model) is the
# call that returns the model and metrics.
fitted_model, metrics = score_tracker.plot_scores()

## Where to go from here?

Can you spot datapoints in the training data that are harder to predict?

Hint: use the raw probability outputs of the `fitted_model`. What does it mean if a probability of 0.5 is predicted for a specific class?