## Setup

In [None]:
import torchvision
import torch
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from dcai.score import ScoreTracker
from dcai.dataset import TrainDataset, ValidationDataset
from typing import List

In [None]:
# Start a score tracker to track the score as a function of the number of annotations bought
score_tracker = ScoreTracker(team_name="marcel-brute-force")

# Get an MNIST training data set
train_dataset = TrainDataset()

## Example of buying an annotation

In [None]:
# Show sample 15 as a small thumbnail, titled with the label currently given for it
plt.figure(figsize=(2, 2))
sample_label = train_dataset[15][1]
sample_image = train_dataset[15][0]
plt.title(f"Given label = {sample_label}")
plt.imshow(sample_image);

In [None]:
# Buy an annotation for sample 15 (the sample displayed above).
# NOTE(review): presumably this replaces the given label with the true one —
# confirm against train_dataset.__doc__.
train_dataset.buy_annotation(15)

In [None]:
# Show sample 15 again, titled with the label given for it after the purchase
plt.figure(figsize=(2, 2))
sample_label = train_dataset[15][1]
sample_image = train_dataset[15][0]
plt.title(f"Given label = {sample_label}")
plt.imshow(sample_image);

In [None]:
# Report how many samples currently carry each of the given labels 1, 2 and 7
for digit in (1, 2, 7):
    count = np.sum(train_dataset.y == digit)
    print(f"Num examples for given class {digit}: {count}")

## Train with the current data set and get the score

In [None]:
# Train on the current data set with the confusion matrix plotted; keep the returned model
model = score_tracker.train_and_score_model(train_dataset, plot_confusion_matrix=True)

In [None]:
# Print the model returned by the training call
print(model)

The scoring function returns the fitted model, so you can reuse it, for example for active learning. Using this model is optional.

## What else can you do?

In [None]:
# Print the data set's docstring to see what else it offers
print(train_dataset.__doc__)

## Naive/baseline approach: Randomly annotate!

Your method should at least outperform this baseline :D

In [None]:
def iterate_in_batches(it: List, batch_size: int):
    """Yield consecutive slices of ``it`` holding at most ``batch_size`` items.

    Args:
        it: A sized, sliceable sequence (a list, or e.g. a pandas Series).
        batch_size: Maximum number of items per batch; must be at least 1.

    Yields:
        ``it[start:start + batch_size]`` for ``start`` = 0, ``batch_size``,
        2 * ``batch_size``, ...; the last batch may be shorter. Nothing is
        yielded when ``it`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size < 1:
        # A negative size would make the slice end count from the back and
        # silently produce a wrong batch; zero would yield nothing at all.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    for start in range(0, len(it), batch_size):
        yield it[start:start + batch_size]

In [None]:
# Wrap the given labels in a Series and keep only the entries whose given label is 1
labels = pd.Series(train_dataset.y)
is_given_class_1 = labels == 1
labels_1 = labels[is_given_class_1]

# Exclude every data point with given label 1; they are included again
# only once an annotation has been bought for them
train_dataset.exclude_datapoints(labels_1.index)

In [None]:
# Work through the given-label-1 points in chunks of 2048: buy an annotation for
# every point in the chunk, include the chunk again, then train and score
for chunk in iterate_in_batches(labels_1, 2048):
    chunk_ids = chunk.index
    for sample_id in chunk_ids:
        train_dataset.buy_annotation(sample_id)
    train_dataset.include_datapoints(chunk_ids)

    score_tracker.train_and_score_model(train_dataset, plot_confusion_matrix=False)

In [None]:
# Train on the current data set once more, this time plotting the confusion matrix,
# then plot the scores held by the tracker
score_tracker.train_and_score_model(train_dataset, plot_confusion_matrix=True)
score_tracker.plot_scores()