# **DISTIL Installation**

In [None]:
!git clone https://github.com/decile-team/distil.git
!pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ submodlib
!cd distil && git checkout origin/scalable
%cd distil/examples/

**Imports**

In [None]:
import pandas as pd 
import numpy as np
from torch.utils.data import Dataset, Subset, ConcatDataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch import nn
from torchvision import transforms
from torchvision.datasets import cifar
import torch
import torch.optim as optim
from torch.autograd import Variable
import sys
import os
from google.colab import drive
sys.path.append('../')
from distil.active_learning_strategies import GLISTER, BADGE, EntropySampling, RandomSampling

from distil.utils.models.resnet import ResNet18
from distil.utils.train_helper import data_train
from distil.utils.utils import LabeledToUnlabeledDataset

# **Data, Model & Directory Configuration**

The CIFAR-10 dataset contains 60,000 32x32 color images in 10 different classes. The 10 classes represent airplanes, cars, birds, cats, deer, dogs, frogs, horses, ships, and trucks. There are 6,000 images of each class. The training set contains 50,000 images and the test set contains 10,000 images. We load the data with torchvision's `CIFAR10` dataset class and split the training set into an initial labeled set and an unlabeled pool.

In [None]:
# Dataset name and the folder torchvision downloads CIFAR-10 into.
data_set_name = 'CIFAR10'
download_path = '../downloaded_data/'

# Training images are augmented (random crop + horizontal flip); test images are only
# converted to tensors. Both pipelines end with the same per-channel normalisation.
cifar_training_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
cifar_test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

cifar10_full_train = cifar.CIFAR10(
    download_path, train=True, download=True, transform=cifar_training_transform
)
cifar10_test = cifar.CIFAR10(
    download_path, train=False, download=True, transform=cifar_test_transform
)

# Shape of one transformed training image.
dim = np.shape(cifar10_full_train[0][0])

# The first `train_size` training images form the initial labeled set;
# every remaining training image goes into the unlabeled pool.
train_size = 1000
cifar10_train = Subset(cifar10_full_train, list(range(train_size)))
cifar10_unlabeled = Subset(
    cifar10_full_train, list(range(train_size, len(cifar10_full_train)))
)

# Active-learning settings.
nclasses = 10
n_rounds = 9    # Number of rounds to run active learning
budget = 500    # Number of points requested from the strategy in every round

net = ResNet18()

In [None]:
# Folder that holds the base-model checkpoint (the accuracy logs are written here too).
base_dir = "/content/models/"
# Full path of the base-model checkpoint file.
model_directory = os.path.join(base_dir, 'base_model.pth')
# Create the folder if it is missing; an already existing folder is not an error.
os.makedirs(base_dir, exist_ok=True)

# **INITIAL MODEL TRAINING**
Run this cell only if you do not have a base model yet, i.e. when you are running the notebook for the first time. Otherwise, skip this cell and load the saved model by running the cell below.

In [None]:
# Settings handed to DISTIL's training helper.
args = {
    'n_epoch': 300,
    'lr': 0.01,
    'batch_size': 20,
    'max_accuracy': 0.99,
    'optimizer': 'sgd',
}

# Train on the initial labeled set, then store the resulting weights as the base model.
dt = data_train(cifar10_train, net, args)
clf = dt.train()
torch.save(clf.state_dict(), model_directory)

# **LOAD BASE MODEL**

In [None]:
# Restore the base-model weights from the checkpoint file.
base_dir = "/content/models/"
model_directory = os.path.join(base_dir, 'base_model.pth')
# map_location='cpu' lets a checkpoint written from a GPU run load on a CPU-only runtime
# as well; load_state_dict then copies the values into `net`'s own parameters.
net.load_state_dict(torch.load(model_directory, map_location='cpu'))
clf = net

# **RANDOM SAMPLING**
This strategy is often used as a baseline, where we pick a set of unlabeled points randomly. Here we create an instance of distil.active_learning_strategies.random_sampling.RandomSampling by passing the following parameters:

**training_dataset** – The labeled dataset

**unlabeled_dataset** – The unlabeled dataset, which has a wrapper around it that strips the label

**net (class object)** – Model architecture used for training. Could be instance of models defined in distil.utils.models or something similar.

**nclasses (int)** – No. of classes in the dataset

**args (dictionary)** – This dictionary should have ‘batch_size’ as a key. 'batch_size' should be such that one can exploit the benefits of tensorization while honouring the resource constraints. This ‘batch_size’ can therefore be different from the one used for training.


In [None]:
# Initialise the strategy with the labeled set and the label-stripped unlabeled pool.
strategy_args = {'batch_size' : 20}
strategy = RandomSampling(cifar10_train, LabeledToUnlabeledDataset(cifar10_unlabeled), net, nclasses, strategy_args)

# Training helper used for the per-round retraining. Nothing is trained here:
# `clf` is the base model prepared by the earlier cells.
args = {'n_epoch':300, 'lr':0.01, 'batch_size':20, 'max_accuracy':0.99, 'optimizer':'sgd'}
dt = data_train(cifar10_train, clf, args)

# Hand the current model to the strategy.
strategy.update_model(clf)

# acc[r] is the test accuracy after round r. Entries start as NaN so that rounds which
# never run (early stop) are not recorded or plotted as 0% accuracy.
acc = np.full(n_rounds, np.nan)
acc[0] = dt.get_acc_on_set(cifar10_test)
print('Initial Testing accuracy:', round(acc[0]*100, 2), flush=True)

# User-controlled active-learning loop
for rd in range(1, n_rounds):
    print('-------------------------------------------------')
    print('Round', rd) 
    print('-------------------------------------------------')

    # Select the next points with augmentation disabled. The finally-clause restores the
    # training transform even if select() raises, so a re-run is not left without augmentation.
    cifar10_full_train.transform = cifar_test_transform
    try:
        idx = strategy.select(budget)
    finally:
        cifar10_full_train.transform = cifar_training_transform

    # Move the selected points (indices into the current unlabeled subset) to the training set.
    cifar10_train = ConcatDataset([cifar10_train, Subset(cifar10_unlabeled, idx)])
    # sorted() keeps the remaining pool in a deterministic order.
    remaining_unlabeled_idx = sorted(set(range(len(cifar10_unlabeled))) - set(idx))
    cifar10_unlabeled = Subset(cifar10_unlabeled, remaining_unlabeled_idx)

    print('Number of training points -', len(cifar10_train))

    strategy.update_data(cifar10_train, LabeledToUnlabeledDataset(cifar10_unlabeled))
    dt.update_data(cifar10_train)

    # Retrain on the enlarged training set and evaluate on the test set.
    clf = dt.train()
    strategy.update_model(clf)
    acc[rd] = dt.get_acc_on_set(cifar10_test)
    print('Testing accuracy:', round(acc[rd]*100, 2), flush=True)
    if acc[rd] > 0.98:
        print('Testing accuracy reached above 98%, stopping training!')
        break

print('Training Completed')

# Save one accuracy per line for the visualisation cell.
with open(os.path.join(base_dir,'random.txt'), 'w') as f:
    for item in acc:
        f.write("%s\n" % item)

# **Uncertainty based Active learning Strategy**

The most basic active learning strategy, where we select samples about which the model is most uncertain. To quantify the uncertainty we use entropy, and therefore select the points which have maximum entropy. Let $z_i$ be the output from the model; then the corresponding softmax is $$\sigma(z_i) = \frac{e^{z_i}}{\sum_j e^{z_j}}$$ and the entropy can be calculated as $$ENTROPY = -\sum_j \sigma(z_j)\log(\sigma(z_j))$$

Here we create an instance of distil.active_learning_strategies.entropy_sampling.EntropySampling with the same parameters passed to distil.active_learning_strategies.random_sampling.RandomSampling.

**Reloading Base Model & Data**

In [None]:
# Re-create the datasets, the initial labeled/unlabeled split and a fresh ResNet18 so that
# this experiment does not inherit the points added to the training set by the previous run.
data_set_name = 'CIFAR10'
download_path = '../downloaded_data/'

# Augmented pipeline for training images, plain pipeline for test images;
# both end with the same per-channel normalisation.
cifar_training_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
cifar_test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

cifar10_full_train = cifar.CIFAR10(
    download_path, train=True, download=True, transform=cifar_training_transform
)
cifar10_test = cifar.CIFAR10(
    download_path, train=False, download=True, transform=cifar_test_transform
)

# Shape of one transformed training image.
dim = np.shape(cifar10_full_train[0][0])

# First `train_size` images are the initial labeled set; the rest is the unlabeled pool.
train_size = 1000
cifar10_train = Subset(cifar10_full_train, list(range(train_size)))
cifar10_unlabeled = Subset(
    cifar10_full_train, list(range(train_size, len(cifar10_full_train)))
)

# Active-learning settings.
nclasses = 10
n_rounds = 9    # Number of rounds to run active learning
budget = 500    # Number of points requested from the strategy in every round

net = ResNet18()

In [None]:
# Restore the base-model weights from the checkpoint file.
base_dir = "/content/models/"
model_directory = os.path.join(base_dir, 'base_model.pth')
# map_location='cpu' lets a checkpoint written from a GPU run load on a CPU-only runtime
# as well; load_state_dict then copies the values into `net`'s own parameters.
net.load_state_dict(torch.load(model_directory, map_location='cpu'))
clf = net

In [None]:
# Initialise the strategy with the labeled set and the label-stripped unlabeled pool.
strategy_args = {'batch_size' : 20}
strategy = EntropySampling(cifar10_train, LabeledToUnlabeledDataset(cifar10_unlabeled), net, nclasses, strategy_args)

# Training helper used for the per-round retraining. Nothing is trained here:
# `clf` is the base model loaded by the previous cell.
args = {'n_epoch':300, 'lr':0.01, 'batch_size':20, 'max_accuracy':0.99, 'optimizer':'sgd'}
dt = data_train(cifar10_train, clf, args)

# Hand the current model to the strategy.
strategy.update_model(clf)

# acc[r] is the test accuracy after round r. Entries start as NaN so that rounds which
# never run (early stop) are not recorded or plotted as 0% accuracy.
acc = np.full(n_rounds, np.nan)
acc[0] = dt.get_acc_on_set(cifar10_test)
print('Initial Testing accuracy:', round(acc[0]*100, 2), flush=True)

# User-controlled active-learning loop
for rd in range(1, n_rounds):
    print('-------------------------------------------------')
    print('Round', rd) 
    print('-------------------------------------------------')

    # Select the next points with augmentation disabled. The finally-clause restores the
    # training transform even if select() raises, so a re-run is not left without augmentation.
    cifar10_full_train.transform = cifar_test_transform
    try:
        idx = strategy.select(budget)
    finally:
        cifar10_full_train.transform = cifar_training_transform

    # Move the selected points (indices into the current unlabeled subset) to the training set.
    cifar10_train = ConcatDataset([cifar10_train, Subset(cifar10_unlabeled, idx)])
    # sorted() keeps the remaining pool in a deterministic order.
    remaining_unlabeled_idx = sorted(set(range(len(cifar10_unlabeled))) - set(idx))
    cifar10_unlabeled = Subset(cifar10_unlabeled, remaining_unlabeled_idx)

    print('Number of training points -', len(cifar10_train))

    strategy.update_data(cifar10_train, LabeledToUnlabeledDataset(cifar10_unlabeled))
    dt.update_data(cifar10_train)

    # Retrain on the enlarged training set and evaluate on the test set.
    clf = dt.train()
    strategy.update_model(clf)
    acc[rd] = dt.get_acc_on_set(cifar10_test)
    print('Testing accuracy:', round(acc[rd]*100, 2), flush=True)
    if acc[rd] > 0.98:
        print('Testing accuracy reached above 98%, stopping training!')
        break

print('Training Completed')

# Save one accuracy per line for the visualisation cell.
with open(os.path.join(base_dir,'entropy.txt'), 'w') as f:
    for item in acc:
        f.write("%s\n" % item)

# **BADGE**
This method is based on the paper [Deep Batch Active Learning by Diverse, Uncertain Gradient Lower Bounds](https://arxiv.org/abs/1906.03671). Here, at each round of selection, loss gradients are computed using the hypothesised labels. Then the points to be labeled are selected by applying k-means++ on these loss gradients. 

Here we create an instance of distil.active_learning_strategies.badge.BADGE with the same parameters passed to distil.active_learning_strategies.random_sampling.RandomSampling.

**Reloading Base Model & Data**

In [None]:
# Re-create the datasets, the initial labeled/unlabeled split and a fresh ResNet18 so that
# this experiment does not inherit the points added to the training set by the previous run.
data_set_name = 'CIFAR10'
download_path = '../downloaded_data/'

# Augmented pipeline for training images, plain pipeline for test images;
# both end with the same per-channel normalisation.
cifar_training_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
cifar_test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

cifar10_full_train = cifar.CIFAR10(
    download_path, train=True, download=True, transform=cifar_training_transform
)
cifar10_test = cifar.CIFAR10(
    download_path, train=False, download=True, transform=cifar_test_transform
)

# Shape of one transformed training image.
dim = np.shape(cifar10_full_train[0][0])

# First `train_size` images are the initial labeled set; the rest is the unlabeled pool.
train_size = 1000
cifar10_train = Subset(cifar10_full_train, list(range(train_size)))
cifar10_unlabeled = Subset(
    cifar10_full_train, list(range(train_size, len(cifar10_full_train)))
)

# Active-learning settings.
nclasses = 10
n_rounds = 9    # Number of rounds to run active learning
budget = 500    # Number of points requested from the strategy in every round

net = ResNet18()

In [None]:
# Restore the base-model weights from the checkpoint file.
base_dir = "/content/models/"
model_directory = os.path.join(base_dir, 'base_model.pth')
# map_location='cpu' lets a checkpoint written from a GPU run load on a CPU-only runtime
# as well; load_state_dict then copies the values into `net`'s own parameters.
net.load_state_dict(torch.load(model_directory, map_location='cpu'))
clf = net

In [None]:
# Initialise the strategy with the labeled set and the label-stripped unlabeled pool.
strategy_args = {'batch_size' : 20}
strategy = BADGE(cifar10_train, LabeledToUnlabeledDataset(cifar10_unlabeled), net, nclasses, strategy_args)

# Training helper used for the per-round retraining. Nothing is trained here:
# `clf` is the base model loaded by the previous cell.
args = {'n_epoch':300, 'lr':0.01, 'batch_size':20, 'max_accuracy':0.99, 'optimizer':'sgd'}
dt = data_train(cifar10_train, clf, args)

# Hand the current model to the strategy.
strategy.update_model(clf)

# acc[r] is the test accuracy after round r. Entries start as NaN so that rounds which
# never run (early stop) are not recorded or plotted as 0% accuracy.
acc = np.full(n_rounds, np.nan)
acc[0] = dt.get_acc_on_set(cifar10_test)
print('Initial Testing accuracy:', round(acc[0]*100, 2), flush=True)

# User-controlled active-learning loop
for rd in range(1, n_rounds):
    print('-------------------------------------------------')
    print('Round', rd) 
    print('-------------------------------------------------')

    # Select the next points with augmentation disabled. The finally-clause restores the
    # training transform even if select() raises, so a re-run is not left without augmentation.
    cifar10_full_train.transform = cifar_test_transform
    try:
        idx = strategy.select(budget)
    finally:
        cifar10_full_train.transform = cifar_training_transform

    # Move the selected points (indices into the current unlabeled subset) to the training set.
    cifar10_train = ConcatDataset([cifar10_train, Subset(cifar10_unlabeled, idx)])
    # sorted() keeps the remaining pool in a deterministic order.
    remaining_unlabeled_idx = sorted(set(range(len(cifar10_unlabeled))) - set(idx))
    cifar10_unlabeled = Subset(cifar10_unlabeled, remaining_unlabeled_idx)

    print('Number of training points -', len(cifar10_train))

    strategy.update_data(cifar10_train, LabeledToUnlabeledDataset(cifar10_unlabeled))
    dt.update_data(cifar10_train)

    # Retrain on the enlarged training set and evaluate on the test set.
    clf = dt.train()
    strategy.update_model(clf)
    acc[rd] = dt.get_acc_on_set(cifar10_test)
    print('Testing accuracy:', round(acc[rd]*100, 2), flush=True)
    if acc[rd] > 0.98:
        print('Testing accuracy reached above 98%, stopping training!')
        break

print('Training Completed')

# Save one accuracy per line for the visualisation cell.
with open(os.path.join(base_dir,'badge.txt'), 'w') as f:
    for item in acc:
        f.write("%s\n" % item)

# **GLISTER**
This is an implementation of GLISTER-ACTIVE from the paper [GLISTER: Generalization based Data Subset Selection for Efficient and Robust Learning](https://arxiv.org/abs/2012.10630). The GLISTER method tries to solve a bi-level optimisation problem.
\begin{equation*}
\overbrace{\underset{{S \subseteq {\mathcal U}, |S| \leq k}}{\operatorname{argmax\hspace{0.7mm}}} LL_V(\underbrace{\underset{\theta}{\operatorname{argmax\hspace{0.7mm}}} LL_T( \theta, S)}_{inner-level}, {\mathcal V})}^{outer-level}
\end{equation*}
where $S$ is the set of points selected at each round, ${\mathcal V}$ could be a dedicated validation set with labeled points or could be the union of labeled and unlabeled points with hypothesised labels, and $k$ is the budget.
To set ${\mathcal V}$ to be the validation set, set _valid=True_ and pass the validation set while calling the **GLISTER** class in the toolkit; otherwise set _valid=False_.

Solving this problem directly is almost impossible, therefore we resort to one-step approximations. We start with $S^0$ as the empty set and build it as $S^k = S^{k-1} \cup e$, where $e$ is $\underset{e}{\operatorname{argmax\hspace{0.7mm}}} G_{\theta}(e | S^k)$. We define $$G_{\theta}(e | S^k) = LL_{V}(\theta^{k}, {\mathcal V})$$ and update $$\theta^k \leftarrow \theta^{k-1} -  \eta \nabla_{\theta} LL_T(\hat{\theta}, e)$$ where $\hat{\theta}$ denotes the parameters of the model at the beginning of the selection.
To prevent overfitting, we can add a regularizer to GLISTER, which can be set by **_typeOf_**. **_typeOf_** can be set to **'none'** (which is the default) for normal GLISTER, **'Rand'** for replacing a **_lam_** fraction of the points by random points, **'Diversity'** for adding a diversity set function while computing the gain, and **'FacLoc'** for adding a Facility Location set function while computing the gain. **_lam_** for both **'Diversity'** and **'FacLoc'** determines the weightage given to them while computing the gain.

Here we create an instance of distil.active_learning_strategies.glister.GLISTER with the same parameters passed to distil.active_learning_strategies.random_sampling.RandomSampling, with the slight change that the **args** dictionary should have the keys ‘batch_size’ and ‘lr’. ‘lr’ should be the learning rate used for training. In addition to those, the following additional parameters may be passed:

**validation_dataset (torch.utils.data.Dataset, optional)** – An optional validation dataset

**typeOf (str, optional)** – Determines the type of regulariser to be used. Default is ‘none’. For random regulariser use ‘Rand’. To use Facility Location set function as a regulariser use ‘FacLoc’. To use Diversity set function as a regulariser use ‘Diversity’.

**lam (float, optional)** – Determines the amount of regularisation to be applied. Mandatory if typeOf is not ‘none’; by default it is set to None. For the random regulariser, values should be between 0 and 1, as it determines the fraction of points replaced by random points. For both ‘Diversity’ and ‘FacLoc’, lam determines the weightage given to them while computing the gain.

**kernel_batch_size (int, optional)** – For the 'Diversity' and 'FacLoc' regularizer versions, a similarity kernel is to be computed, which entails creating a 3d torch tensor of dimensions $kernel\_batch\_size^{2}*(feature\ dimension)$. Again, kernel_batch_size should be such that one can exploit the benefits of tensorization while honouring the resource constraints.

**Reloading Base Model & Data**

In [None]:
# Re-create the datasets, the initial labeled/unlabeled split and a fresh ResNet18 so that
# this experiment does not inherit the points added to the training set by the previous run.
data_set_name = 'CIFAR10'
download_path = '../downloaded_data/'

# Augmented pipeline for training images, plain pipeline for test images;
# both end with the same per-channel normalisation.
cifar_training_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
cifar_test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Unlike the earlier experiments, labels are converted with torch.tensor here.
# (The previously defined `cifar_target_transform = transforms.ToTensor()` was never
# used by any cell and has been dropped; torch.tensor is the transform actually applied.)
cifar10_full_train = cifar.CIFAR10(
    download_path, train=True, download=True,
    transform=cifar_training_transform, target_transform=torch.tensor,
)
cifar10_test = cifar.CIFAR10(
    download_path, train=False, download=True,
    transform=cifar_test_transform, target_transform=torch.tensor,
)

# Shape of one transformed training image.
dim = np.shape(cifar10_full_train[0][0])

# First `train_size` images are the initial labeled set; the rest is the unlabeled pool.
train_size = 1000
cifar10_train = Subset(cifar10_full_train, list(range(train_size)))
cifar10_unlabeled = Subset(
    cifar10_full_train, list(range(train_size, len(cifar10_full_train)))
)

# Active-learning settings.
nclasses = 10
n_rounds = 9    # Number of rounds to run active learning
budget = 500    # Number of points requested from the strategy in every round

net = ResNet18()

In [None]:
# Restore the base-model weights from the checkpoint file.
base_dir = "/content/models/"
model_directory = os.path.join(base_dir, 'base_model.pth')
# map_location='cpu' lets a checkpoint written from a GPU run load on a CPU-only runtime
# as well; load_state_dict then copies the values into `net`'s own parameters.
net.load_state_dict(torch.load(model_directory, map_location='cpu'))
clf = net

In [None]:
# Initialise the strategy with the labeled set and the label-stripped unlabeled pool.
# GLISTER additionally needs 'lr' (the learning rate used for training) in its args.
strategy_args = {'batch_size' : 20, 'lr' : 0.01}
strategy = GLISTER(cifar10_train, LabeledToUnlabeledDataset(cifar10_unlabeled), net, nclasses, strategy_args)

# Training helper used for the per-round retraining. Nothing is trained here:
# `clf` is the base model loaded by the previous cell.
args = {'n_epoch':300, 'lr':0.01, 'batch_size':20, 'max_accuracy':0.99, 'optimizer':'sgd'}
dt = data_train(cifar10_train, clf, args)

# Hand the current model to the strategy.
strategy.update_model(clf)

# acc[r] is the test accuracy after round r. Entries start as NaN so that rounds which
# never run (early stop) are not recorded or plotted as 0% accuracy.
acc = np.full(n_rounds, np.nan)
acc[0] = dt.get_acc_on_set(cifar10_test)
print('Initial Testing accuracy:', round(acc[0]*100, 2), flush=True)

# User-controlled active-learning loop
for rd in range(1, n_rounds):
    print('-------------------------------------------------')
    print('Round', rd) 
    print('-------------------------------------------------')

    # Select the next points with augmentation disabled. The finally-clause restores the
    # training transform even if select() raises, so a re-run is not left without augmentation.
    cifar10_full_train.transform = cifar_test_transform
    try:
        idx = strategy.select(budget)
    finally:
        cifar10_full_train.transform = cifar_training_transform

    # Move the selected points (indices into the current unlabeled subset) to the training set.
    cifar10_train = ConcatDataset([cifar10_train, Subset(cifar10_unlabeled, idx)])
    # sorted() keeps the remaining pool in a deterministic order.
    remaining_unlabeled_idx = sorted(set(range(len(cifar10_unlabeled))) - set(idx))
    cifar10_unlabeled = Subset(cifar10_unlabeled, remaining_unlabeled_idx)

    print('Number of training points -', len(cifar10_train))

    strategy.update_data(cifar10_train, LabeledToUnlabeledDataset(cifar10_unlabeled))
    dt.update_data(cifar10_train)

    # Retrain on the enlarged training set and evaluate on the test set.
    clf = dt.train()
    strategy.update_model(clf)
    acc[rd] = dt.get_acc_on_set(cifar10_test)
    print('Testing accuracy:', round(acc[rd]*100, 2), flush=True)
    if acc[rd] > 0.98:
        print('Testing accuracy reached above 98%, stopping training!')
        break

print('Training Completed')

# Save one accuracy per line for the visualisation cell.
with open(os.path.join(base_dir,'glister.txt'), 'w') as f:
    for item in acc:
        f.write("%s\n" % item)

**VISUALISATION**

In [None]:
import matplotlib.pyplot as plt


def load_accuracies(file_name):
    """Read the accuracies stored one per line in `base_dir/file_name`.

    Returns a list with each value converted to a percentage rounded to 2 decimals.
    """
    with open(os.path.join(base_dir, file_name), 'r') as f:
        return [round(float(line) * 100, 2) for line in f]


# Accuracy curves saved by the four strategy cells.
acc_en = load_accuracies('entropy.txt')
acc_bd = load_accuracies('badge.txt')
acc_gl = load_accuracies('glister.txt')
acc_rd = load_accuracies('random.txt')

# Labeled-set size after each round: the initial train_size plus `budget` points per round.
x_axis = np.array([train_size+budget*i for i in range(n_rounds)])

plt.figure()
# GLISTER is run above with its default regulariser (not 'Rand'), hence the plain label.
curves = [
    (acc_gl, 'b-', 'GLISTER'),
    (acc_en, 'g-', 'UNCERTAINITY'),
    (acc_bd, 'c', 'BADGE'),
    (acc_rd, 'r', 'RANDOM'),
]
for values, fmt, label in curves:
    # Align x and y so a file with a different number of rounds still plots.
    n_points = min(len(x_axis), len(values))
    plt.plot(x_axis[:n_points], values[:n_points], fmt, label=label, marker='o')

plt.legend()
plt.xlabel('No of Images')
plt.ylabel('Test Accuracy')
plt.title('DISTIL_CIFAR10')