# **DISTIL Installation**

In [None]:
# Fetch the DISTIL toolkit and the companion datasets repository into the current directory.
!git clone https://github.com/decile-team/distil.git
!git clone https://github.com/decile-team/datasets.git
# submodlib is pulled from TestPyPI; the extra index URL lets pip fall back to PyPI for anything not hosted there.
!pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ submodlib
# Work from inside the distil checkout -- presumably so `import distil` resolves without installing it.
# The absolute path assumes the clones above ran in /content (the Colab default); the data paths
# used later ('../datasets/...') are relative to this directory.
%cd /content/distil

# **Imports**

In [None]:
import numpy as np
import os
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch.nn.functional as F
from torch import nn
from torchvision import transforms
import torch
import torch.optim as optim
from torch.autograd import Variable
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from copy import deepcopy

from distil.active_learning_strategies import GLISTER, BADGE, EntropySampling, RandomSampling, SubmodularSampling
from distil.utils.models.simple_net import TwoLayerNet
from distil.utils.train_helper import data_train
from distil.utils.utils import LabeledToUnlabeledDataset

from matplotlib import pyplot as plt
from google.colab import drive

# **Data, Model & Directory Configuration**

In [None]:
def libsvm_file_load(path,dim, save_data=False):
    """Load a LIBSVM-format text file into dense NumPy arrays.

    Every non-blank line is expected to look like
    ``<label> <index>:<value> <index>:<value> ...`` with 1-based feature
    indices. Features that are not listed on a line are left at 0.

    Args:
        path: Path of the text file to read.
        dim: Number of feature columns in the dense output.
        save_data: When True, also write ``<path>.data.npy`` and
            ``<path>.label.npy`` next to the input file.

    Returns:
        Tuple ``(X_data, Y_label)``: a float32 array of shape
        ``(n_samples, dim)`` and an integer array of shape ``(n_samples,)``.
        Labels are returned as found in the file (not remapped to 0..K-1).

    Raises:
        IndexError: If a feature index is outside ``1..dim``.
        ValueError: If a label, index or value cannot be parsed.
    """
    data = []
    target = []
    with open(path) as fp:
        for line in fp:
            # split() without an argument tolerates tabs, repeated spaces and
            # trailing whitespace, all of which broke split(" ").
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            # Labels may be written as "3" or "3.0"; not assumed to be in (0, K-1).
            target.append(int(float(tokens[0])))
            row = [0]*dim
            for item in tokens[1:]:
                ind, val = item.split(':')
                col = int(ind) - 1
                # Without this check an index of 0 would become -1 and silently
                # overwrite the last feature.
                if not 0 <= col < dim:
                    raise IndexError(
                        "feature index %s is outside 1..%d in %s" % (ind, dim, path))
                row[col] = float(val)
            data.append(row)
    if data:
        X_data = np.array(data, dtype=np.float32)
    else:
        # Keep the result 2-D even when the file holds no samples.
        X_data = np.empty((0, dim), dtype=np.float32)
    # dtype=int matches what NumPy infers for a non-empty list of Python ints
    # and keeps an empty label array integral as well.
    Y_label = np.array(target, dtype=int)
    if save_data:
        # Save the numpy files to the folder where they come from
        np.save(path + '.data.npy', X_data)
        np.save(path + '.label.npy', Y_label)
    return (X_data, Y_label)

    
# Locations of the three satimage splits, relative to the current working directory.
trn_file = '../datasets/satimage/satimage.scale.trn'
val_file = '../datasets/satimage/satimage.scale.val'
tst_file = '../datasets/satimage/satimage.scale.tst'

# Feature dimensionality and number of classes used for this dataset.
data_dims = 36
nclasses = 6

# Parse each split into a dense (features, labels) pair.
(x_trn, y_trn), (x_val, y_val), (x_tst, y_tst) = (
    libsvm_file_load(split_path, dim=data_dims)
    for split_path in (trn_file, val_file, tst_file)
)

# Shift the labels down by one so that the first class is zero.
y_trn, y_val, y_tst = y_trn - 1, y_val - 1, y_tst - 1

# Standardise the features using statistics of the training split only,
# then apply the same transform to every split.
sc = StandardScaler().fit(x_trn)
x_trn, x_val, x_tst = sc.transform(x_trn), sc.transform(x_val), sc.transform(x_tst)

def init_weights(m):
    """Initialise a single module; intended for use with ``net.apply(init_weights)``.

    Modules whose type is exactly ``nn.Linear`` get Xavier-uniform weights and
    a constant bias of 0.01. Every other module is left untouched.

    Args:
        m: The module to (possibly) initialise.
    """
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have m.bias set to None; skip them
        # instead of failing on `None.data`.
        if m.bias is not None:
            m.bias.data.fill_(0.01)

In [None]:
# Start with the whole training split in the unlabeled pool. Copies are used so
# that x_trn / y_trn stay intact for the later "reinitialize" cells.
X_unlabeled = deepcopy(x_trn)
y_unlabeled = deepcopy(y_trn)
X_test = deepcopy(x_tst)
y_test = deepcopy(y_tst)

nSamps, dim = np.shape(X_unlabeled)

# Seed both NumPy (initial labeled subset) and PyTorch (weight initialisation,
# and any torch-side randomness used during training) so that the base model
# trained from this cell can be reproduced. Seeding only NumPy left the model
# initialisation different on every run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1% of the pool is drawn (without replacement) as the initial labeled set.
start_idxs = np.random.choice(nSamps, size=int(0.01*nSamps), replace=False)

# Move the drawn rows from the pool to the labeled set; features and labels
# are sliced with the same indices so they stay aligned.
X_tr = X_unlabeled[start_idxs]
X_unlabeled = np.delete(X_unlabeled, start_idxs, axis = 0)

y_tr = y_unlabeled[start_idxs]
y_unlabeled = np.delete(y_unlabeled, start_idxs, axis = 0)

# Labels are stored as torch.long tensors.
training_dataset = TensorDataset(torch.tensor(X_tr), torch.tensor(y_tr, dtype=torch.long))
unlabeled_dataset = TensorDataset(torch.tensor(X_unlabeled), torch.tensor(y_unlabeled, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(x_tst), torch.tensor(y_tst, dtype=torch.long))

n_rounds = 10    # Number of rounds to run active learning (round 0 is the base model)
budget = int(0.01*x_trn.shape[0])    # Points acquired per round: 1% of the training split

# Third argument is presumably the hidden-layer width -- confirm against distil's TwoLayerNet.
net = TwoLayerNet(data_dims, nclasses, 40)
net.apply(init_weights)

In [None]:
# All artefacts written by this notebook (base checkpoint, per-strategy accuracy
# logs) are stored under base_dir.
base_dir = "/content/satimage/"
os.makedirs(base_dir, exist_ok=True)

# Note: despite its name, model_directory is the path of the checkpoint *file*.
model_directory = os.path.join(base_dir, 'base_model.pth')

# **Initial Model Training**

In [None]:
# Training configuration handed to distil's data_train helper.
args = dict(n_epoch=500, lr=0.01, batch_size=16, max_accuracy=0.99, window_size=20)

# Fit the freshly initialised network on the initial labeled set ...
dt = data_train(training_dataset, net, args)
clf = dt.train()

# ... and persist its weights so every strategy can start from this same base model.
torch.save(clf.state_dict(), model_directory)

# **Load Base Model**

In [None]:
# Restore the saved base-model weights into `net` and use it as the current classifier.
base_state = torch.load(model_directory)
net.load_state_dict(base_state)
clf = net

# **Random**
This strategy is often used as a baseline, where we pick a set of unlabeled points randomly. Here we create an instance of distil.active_learning_strategies.random_sampling.RandomSampling by passing the following parameters:

**training_dataset** – The labeled dataset

**unlabeled_dataset** – The unlabeled dataset, which has a wrapper around it that strips the label

**net (class object)** – Model architecture used for training. Could be instance of models defined in distil.utils.models or something similar.

**nclasses (int)** – No. of classes in the dataset

**args (dictionary)** – This dictionary should have ‘batch_size’ as a key. 'batch_size' should be chosen so that one can exploit the benefits of tensorization while honouring the resource constraints. This ‘batch_size’ can therefore be different from the one used for training.

In [None]:
# Build the selection strategy on top of the current labeled / unlabeled split.
# The pool is wrapped in LabeledToUnlabeledDataset before it is handed over.
strategy_args = dict(batch_size=16, lr=0.01)
strategy = RandomSampling(
    training_dataset,
    LabeledToUnlabeledDataset(unlabeled_dataset),
    net,
    nclasses,
    strategy_args,
)

# Trainer used to refit the model after every acquisition round. dt.train() is
# not called before the loop: round 0 is evaluated with the weights `clf`
# already holds.
args = dict(n_epoch=500, lr=0.01, batch_size=16, max_accuracy=0.99, window_size=20)
dt = data_train(training_dataset, clf, args)

# Hand the current model to the strategy.
strategy.update_model(clf)

# acc[i] is the test accuracy after round i (index 0 = before any acquisition).
# Entries for rounds skipped by the early stop below stay at 0.
acc = np.zeros(n_rounds)
acc[0] = dt.get_acc_on_set(test_dataset)
print('Initial Testing accuracy:', round(acc[0]*100, 2), flush=True)

# Acquisition loop: select -> label -> retrain -> evaluate.
for rd in range(1, n_rounds):
    print('-------------------------------------------------')
    print('Round', rd)
    print('-------------------------------------------------')

    # Positions, within the current pool, of the points to label next.
    idx = strategy.select(budget)

    # "Human in the loop": the labels kept in y_unlabeled stand in for the
    # annotator. Selected rows are appended to the labeled arrays ...
    X_tr = np.concatenate((X_tr, X_unlabeled[idx]), axis=0)
    y_tr = np.concatenate((y_tr, y_unlabeled[idx]), axis=0)
    # ... and removed from the pool.
    X_unlabeled = np.delete(X_unlabeled, idx, axis=0)
    y_unlabeled = np.delete(y_unlabeled, idx, axis=0)
    print('Number of training points -',X_tr.shape[0])

    # Rebuild the torch datasets from the updated arrays.
    training_dataset = TensorDataset(
        torch.tensor(X_tr),
        torch.tensor(y_tr, dtype=torch.long),
    )
    unlabeled_dataset = TensorDataset(
        torch.tensor(X_unlabeled),
        torch.tensor(y_unlabeled, dtype=torch.long),
    )

    # Give both the strategy and the trainer the new split, then retrain.
    strategy.update_data(training_dataset, LabeledToUnlabeledDataset(unlabeled_dataset))
    dt.update_data(training_dataset)

    clf = dt.train()
    strategy.update_model(clf)
    acc[rd] = dt.get_acc_on_set(test_dataset)
    print('Testing accuracy:', round(acc[rd]*100, 2), flush=True)
    if acc[rd] > 0.98:
        print('Testing accuracy reached above 98%, stopping training!')
        break

print('Training Completed')

# Persist the per-round accuracies (one value per line) for the comparison plot.
with open(os.path.join(base_dir,'random.txt'), 'w') as f:
    f.writelines("%s\n" % item for item in acc)

# **Uncertainty based Active Learning Strategy**

The most basic active learning strategy, where we select the samples about which the model is most uncertain. To quantify the uncertainty we use entropy, and therefore select the points which have maximum entropy. Let $z_i$ be the output from the model; then the corresponding softmax is $$\sigma(z_i) = \frac{e^{z_i}}{\sum_j e^{z_j}}.$$ The entropy can then be calculated as $$ENTROPY = -\sum_j \sigma(z_j)\log(\sigma(z_j))$$

Here we create an instance of distil.active_learning_strategies.entropy_sampling.EntropySampling with the same parameters as passed to distil.active_learning_strategies.random_sampling.RandomSampling.

**Reinitialize Model & Data**

In [None]:
# Reset the experiment: the whole training split goes back into the pool.
X_unlabeled, y_unlabeled = deepcopy(x_trn), deepcopy(y_trn)
X_test, y_test = deepcopy(x_tst), deepcopy(y_tst)

nSamps, dim = X_unlabeled.shape

# Fixed seed, so the initial 1% labeled subset drawn here is the same on every run.
np.random.seed(42)
start_idxs = np.random.choice(nSamps, size=int(0.01*nSamps), replace=False)

# Carve the initial labeled set out of the pool; the same indices are used for
# features and labels so the two stay aligned.
X_tr, y_tr = X_unlabeled[start_idxs], y_unlabeled[start_idxs]
X_unlabeled = np.delete(X_unlabeled, start_idxs, axis=0)
y_unlabeled = np.delete(y_unlabeled, start_idxs, axis=0)

# Torch views of the three sets; labels are stored as torch.long.
training_dataset = TensorDataset(
    torch.tensor(X_tr),
    torch.tensor(y_tr, dtype=torch.long),
)
unlabeled_dataset = TensorDataset(
    torch.tensor(X_unlabeled),
    torch.tensor(y_unlabeled, dtype=torch.long),
)
test_dataset = TensorDataset(
    torch.tensor(x_tst),
    torch.tensor(y_tst, dtype=torch.long),
)

n_rounds = 10    # Number of active-learning rounds
budget = int(0.01*x_trn.shape[0])    # Points acquired per round: 1% of the training split

# Fresh network with the same constructor arguments as the base model.
net = TwoLayerNet(data_dims, nclasses, 40)

In [None]:
# Start this strategy from the saved base-model weights.
base_state = torch.load(model_directory)
net.load_state_dict(base_state)
clf = net

In [None]:
# Build the entropy-based selection strategy on top of the current split.
# The pool is wrapped in LabeledToUnlabeledDataset before it is handed over.
strategy_args = dict(batch_size=16, lr=0.01)
strategy = EntropySampling(
    training_dataset,
    LabeledToUnlabeledDataset(unlabeled_dataset),
    net,
    nclasses,
    strategy_args,
)

# Trainer used to refit the model after every acquisition round. dt.train() is
# not called before the loop: round 0 is evaluated with the weights `clf`
# already holds.
args = dict(n_epoch=500, lr=0.01, batch_size=16, max_accuracy=0.99, window_size=20)
dt = data_train(training_dataset, clf, args)

# Hand the current model to the strategy.
strategy.update_model(clf)

# acc[i] is the test accuracy after round i (index 0 = before any acquisition).
# Entries for rounds skipped by the early stop below stay at 0.
acc = np.zeros(n_rounds)
acc[0] = dt.get_acc_on_set(test_dataset)
print('Initial Testing accuracy:', round(acc[0]*100, 2), flush=True)

# Acquisition loop: select -> label -> retrain -> evaluate.
for rd in range(1, n_rounds):
    print('-------------------------------------------------')
    print('Round', rd)
    print('-------------------------------------------------')

    # Positions, within the current pool, of the points to label next.
    idx = strategy.select(budget)

    # "Human in the loop": the labels kept in y_unlabeled stand in for the
    # annotator. Selected rows are appended to the labeled arrays ...
    X_tr = np.concatenate((X_tr, X_unlabeled[idx]), axis=0)
    y_tr = np.concatenate((y_tr, y_unlabeled[idx]), axis=0)
    # ... and removed from the pool.
    X_unlabeled = np.delete(X_unlabeled, idx, axis=0)
    y_unlabeled = np.delete(y_unlabeled, idx, axis=0)
    print('Number of training points -',X_tr.shape[0])

    # Rebuild the torch datasets from the updated arrays.
    training_dataset = TensorDataset(
        torch.tensor(X_tr),
        torch.tensor(y_tr, dtype=torch.long),
    )
    unlabeled_dataset = TensorDataset(
        torch.tensor(X_unlabeled),
        torch.tensor(y_unlabeled, dtype=torch.long),
    )

    # Give both the strategy and the trainer the new split, then retrain.
    strategy.update_data(training_dataset, LabeledToUnlabeledDataset(unlabeled_dataset))
    dt.update_data(training_dataset)

    clf = dt.train()
    strategy.update_model(clf)
    acc[rd] = dt.get_acc_on_set(test_dataset)
    print('Testing accuracy:', round(acc[rd]*100, 2), flush=True)
    if acc[rd] > 0.98:
        print('Testing accuracy reached above 98%, stopping training!')
        break

print('Training Completed')

# Persist the per-round accuracies (one value per line) for the comparison plot.
with open(os.path.join(base_dir,'entropy.txt'), 'w') as f:
    f.writelines("%s\n" % item for item in acc)

# **BADGE**
This method is based on the paper [Deep Batch Active Learning by Diverse, Uncertain Gradient Lower Bounds](https://arxiv.org/abs/1906.03671). Here, at each round of selection, loss gradients are computed using the hypothesised labels. The points to be labeled are then selected by applying k-means++ on these loss gradients.

Here we create an instance of distil.active_learning_strategies.badge.BADGE with the same parameters as passed to distil.active_learning_strategies.random_sampling.RandomSampling.

**Reinitialize Model & Data**

In [None]:
# Reset the experiment: the whole training split goes back into the pool.
X_unlabeled, y_unlabeled = deepcopy(x_trn), deepcopy(y_trn)
X_test, y_test = deepcopy(x_tst), deepcopy(y_tst)

nSamps, dim = X_unlabeled.shape

# Fixed seed, so the initial 1% labeled subset drawn here is the same on every run.
np.random.seed(42)
start_idxs = np.random.choice(nSamps, size=int(0.01*nSamps), replace=False)

# Carve the initial labeled set out of the pool; the same indices are used for
# features and labels so the two stay aligned.
X_tr, y_tr = X_unlabeled[start_idxs], y_unlabeled[start_idxs]
X_unlabeled = np.delete(X_unlabeled, start_idxs, axis=0)
y_unlabeled = np.delete(y_unlabeled, start_idxs, axis=0)

# Torch views of the three sets; labels are stored as torch.long.
training_dataset = TensorDataset(
    torch.tensor(X_tr),
    torch.tensor(y_tr, dtype=torch.long),
)
unlabeled_dataset = TensorDataset(
    torch.tensor(X_unlabeled),
    torch.tensor(y_unlabeled, dtype=torch.long),
)
test_dataset = TensorDataset(
    torch.tensor(x_tst),
    torch.tensor(y_tst, dtype=torch.long),
)

n_rounds = 10    # Number of active-learning rounds
budget = int(0.01*x_trn.shape[0])    # Points acquired per round: 1% of the training split

# Fresh network with the same constructor arguments as the base model.
net = TwoLayerNet(data_dims, nclasses, 40)

In [None]:
# Start this strategy from the saved base-model weights.
base_state = torch.load(model_directory)
net.load_state_dict(base_state)
clf = net

In [None]:
# Build the BADGE selection strategy on top of the current split.
# The pool is wrapped in LabeledToUnlabeledDataset before it is handed over.
strategy_args = dict(batch_size=16, lr=0.01)
strategy = BADGE(
    training_dataset,
    LabeledToUnlabeledDataset(unlabeled_dataset),
    net,
    nclasses,
    strategy_args,
)

# Trainer used to refit the model after every acquisition round. dt.train() is
# not called before the loop: round 0 is evaluated with the weights `clf`
# already holds.
args = dict(n_epoch=500, lr=0.01, batch_size=16, max_accuracy=0.99, window_size=20)
dt = data_train(training_dataset, clf, args)

# Hand the current model to the strategy.
strategy.update_model(clf)

# acc[i] is the test accuracy after round i (index 0 = before any acquisition).
# Entries for rounds skipped by the early stop below stay at 0.
acc = np.zeros(n_rounds)
acc[0] = dt.get_acc_on_set(test_dataset)
print('Initial Testing accuracy:', round(acc[0]*100, 2), flush=True)

# Acquisition loop: select -> label -> retrain -> evaluate.
for rd in range(1, n_rounds):
    print('-------------------------------------------------')
    print('Round', rd)
    print('-------------------------------------------------')

    # Positions, within the current pool, of the points to label next.
    idx = strategy.select(budget)

    # "Human in the loop": the labels kept in y_unlabeled stand in for the
    # annotator. Selected rows are appended to the labeled arrays ...
    X_tr = np.concatenate((X_tr, X_unlabeled[idx]), axis=0)
    y_tr = np.concatenate((y_tr, y_unlabeled[idx]), axis=0)
    # ... and removed from the pool.
    X_unlabeled = np.delete(X_unlabeled, idx, axis=0)
    y_unlabeled = np.delete(y_unlabeled, idx, axis=0)
    print('Number of training points -',X_tr.shape[0])

    # Rebuild the torch datasets from the updated arrays.
    training_dataset = TensorDataset(
        torch.tensor(X_tr),
        torch.tensor(y_tr, dtype=torch.long),
    )
    unlabeled_dataset = TensorDataset(
        torch.tensor(X_unlabeled),
        torch.tensor(y_unlabeled, dtype=torch.long),
    )

    # Give both the strategy and the trainer the new split, then retrain.
    strategy.update_data(training_dataset, LabeledToUnlabeledDataset(unlabeled_dataset))
    dt.update_data(training_dataset)

    clf = dt.train()
    strategy.update_model(clf)
    acc[rd] = dt.get_acc_on_set(test_dataset)
    print('Testing accuracy:', round(acc[rd]*100, 2), flush=True)
    if acc[rd] > 0.98:
        print('Testing accuracy reached above 98%, stopping training!')
        break

print('Training Completed')

# Persist the per-round accuracies (one value per line) for the comparison plot.
with open(os.path.join(base_dir,'badge.txt'), 'w') as f:
    f.writelines("%s\n" % item for item in acc)

# **GLISTER**
This is an implementation of GLISTER-ACTIVE from the paper [GLISTER: Generalization based Data Subset Selection for Efficient and Robust Learning](https://arxiv.org/abs/2012.10630). GLISTER methods try to solve a bi-level optimisation problem.
\begin{equation*}
\overbrace{\underset{{S \subseteq {\mathcal U}, |S| \leq k}}{\operatorname{argmax\hspace{0.7mm}}} LL_V(\underbrace{\underset{\theta}{\operatorname{argmax\hspace{0.7mm}}} LL_T( \theta, S)}_{inner-level}, {\mathcal V})}^{outer-level}
\end{equation*}
where $S$ is the set of points selected at each round, ${\mathcal V}$ could be a dedicated validation set with labeled points or could be the union of labeled and unlabeled points with hypothesised labels, and $k$ is the budget.
To set ${\mathcal V}$ to be validation set, while calling **GLISTER** class in the toolkit set _valid=TRUE_ and pass validation set otherwise set _valid=False_.

Solving this problem directly is almost impossible, therefore we resort to one-step approximations. We start with $S^0$ as the empty set and build it as $S^k = S^{k-1} \cup e$, where $e$ is $\underset{e}{\operatorname{argmax\hspace{0.7mm}}} G_{\theta}(e | S^k)$. We define $$G_{\theta}(e | S^k) = LL_{V}(\theta^{k}, {\mathcal V})$$ and update $$\theta^k \leftarrow \theta^{k-1} -  \eta \nabla_{\theta} LL_T(\hat{\theta}, e)$$ where $\hat{\theta}$ denotes the parameters of the model at the beginning of the selection.
To prevent overfitting, we can add a regularizer to GLISTER, which can be set by **_typeOf_**. **_typeOf_** can be set to **'none'** (the default) for normal GLISTER, **'Rand'** for replacing a **_lam_** fraction of the selected points with random points, **'Diversity'** for adding a diversity set function while computing the gain, and **'FacLoc'** for adding a Facility Location set function while computing the gain. **_lam_** for both **'Diversity'** and **'FacLoc'** determines the weightage given to them while computing the gain.

Here we create an instance of distil.active_learning_strategies.glister.GLISTER with the same parameters as passed to distil.active_learning_strategies.random_sampling.RandomSampling, with the slight change that the **args** dictionary should have the keys ‘batch_size’ and ‘lr’. ‘lr’ should be the learning rate used for training. In addition to those, the following additional parameters may be passed:

**validation_dataset (torch.utils.data.Dataset, optional)** – An optional validation dataset

**typeOf (str, optional)** – Determines the type of regulariser to be used. Default is ‘none’. For the random regulariser use ‘Rand’. To use the Facility Location set function as a regulariser use ‘FacLoc’. To use the Diversity set function as a regulariser use ‘Diversity’.

**lam (float, optional)** – Determines the amount of regularisation to be applied. Mandatory if typeOf is not ’none’; by default it is set to None. For the random regulariser, values should be between 0 and 1, as lam determines the fraction of points replaced by random points. For both ‘Diversity’ and ‘FacLoc’, lam determines the weightage given to them while computing the gain.

**kernel_batch_size (int, optional)** – For the 'Diversity' and 'FacLoc' regularizer versions, a similarity kernel has to be computed, which entails creating a 3d torch tensor of dimensions $kernel\_batch\_size^{2}*(feature\ dimension)$. Again, kernel_batch_size should be chosen so that one can exploit the benefits of tensorization while honouring the resource constraints.

**Reinitialize Model & Data**

In [None]:
# Reset the experiment: the whole training split goes back into the pool.
X_unlabeled, y_unlabeled = deepcopy(x_trn), deepcopy(y_trn)
X_test, y_test = deepcopy(x_tst), deepcopy(y_tst)

nSamps, dim = X_unlabeled.shape

# Fixed seed, so the initial 1% labeled subset drawn here is the same on every run.
np.random.seed(42)
start_idxs = np.random.choice(nSamps, size=int(0.01*nSamps), replace=False)

# Carve the initial labeled set out of the pool; the same indices are used for
# features and labels so the two stay aligned.
X_tr, y_tr = X_unlabeled[start_idxs], y_unlabeled[start_idxs]
X_unlabeled = np.delete(X_unlabeled, start_idxs, axis=0)
y_unlabeled = np.delete(y_unlabeled, start_idxs, axis=0)

# Torch views of the three sets; labels are stored as torch.long.
training_dataset = TensorDataset(
    torch.tensor(X_tr),
    torch.tensor(y_tr, dtype=torch.long),
)
unlabeled_dataset = TensorDataset(
    torch.tensor(X_unlabeled),
    torch.tensor(y_unlabeled, dtype=torch.long),
)
test_dataset = TensorDataset(
    torch.tensor(x_tst),
    torch.tensor(y_tst, dtype=torch.long),
)

n_rounds = 10    # Number of active-learning rounds
budget = int(0.01*x_trn.shape[0])    # Points acquired per round: 1% of the training split

# Fresh network with the same constructor arguments as the base model.
net = TwoLayerNet(data_dims, nclasses, 40)

In [None]:
# Start this strategy from the saved base-model weights.
base_state = torch.load(model_directory)
net.load_state_dict(base_state)
clf = net

In [None]:
#Initializing Strategy Class
# 'lr' must be the learning rate used for training (GLISTER uses it for its
# one-step update); 'batch_size' is the batch size used by the strategy.
strategy_args = {'batch_size' : 16, 'lr':float(0.01)}

# This section previously instantiated BADGE, so 'glister.txt' and the
# "GLISTER RAND" curve were a second BADGE run. Use GLISTER with the random
# regulariser, which is what the plot legend refers to.
# No validation_dataset is passed, so GLISTER falls back to its default
# (no dedicated validation set).
# NOTE(review): 'Rand' is the spelling given in this notebook's parameter
# description -- confirm it against the installed distil version.
glister_lam = 0.1    # fraction of the selected points replaced by random points; must lie in (0, 1)
strategy = GLISTER(training_dataset, LabeledToUnlabeledDataset(unlabeled_dataset), net, nclasses, strategy_args,
                   typeOf='Rand', lam=glister_lam)

# Trainer used to refit the model after every acquisition round. dt.train() is
# not called before the loop: round 0 is evaluated with the base-model weights
# that `clf` already holds.
args = {'n_epoch':500, 'lr':float(0.01),'batch_size':16, 'max_accuracy':0.99, 'window_size':20}
dt = data_train(training_dataset, clf, args)

#Updating the trained model in strategy class
strategy.update_model(clf)

# acc[i] is the test accuracy after round i (index 0 = before any acquisition).
# Entries for rounds skipped by the early stop below stay at 0.
acc = np.zeros(n_rounds)
acc[0] = dt.get_acc_on_set(test_dataset)
print('Initial Testing accuracy:', round(acc[0]*100, 2), flush=True)

##User Controlled Loop
for rd in range(1, n_rounds):
    print('-------------------------------------------------')
    print('Round', rd)
    print('-------------------------------------------------')

    #Using select function for getting next set of data points
    # (positions within the current unlabeled pool)
    idx = strategy.select(budget)

    #Adding new points to training set
    X_tr = np.concatenate((X_tr, X_unlabeled[idx]), axis=0)
    X_unlabeled = np.delete(X_unlabeled, idx, axis = 0)

    #Human In Loop, Assuming user adds new labels here
    y_tr = np.concatenate((y_tr, y_unlabeled[idx]), axis = 0)
    y_unlabeled = np.delete(y_unlabeled, idx, axis = 0)
    print('Number of training points -',X_tr.shape[0])

    # Rebuild the torch datasets from the updated arrays.
    training_dataset = TensorDataset(torch.tensor(X_tr), torch.tensor(y_tr, dtype=torch.long))
    unlabeled_dataset = TensorDataset(torch.tensor(X_unlabeled), torch.tensor(y_unlabeled, dtype=torch.long))

    #Reload state and start training
    strategy.update_data(training_dataset, LabeledToUnlabeledDataset(unlabeled_dataset))
    dt.update_data(training_dataset)

    clf = dt.train()
    strategy.update_model(clf)
    acc[rd] = dt.get_acc_on_set(test_dataset)
    print('Testing accuracy:', round(acc[rd]*100, 2), flush=True)
    if acc[rd] > 0.98:
        print('Testing accuracy reached above 98%, stopping training!')
        break

print('Training Completed')

#Saving accuracies for further analysis
with open(os.path.join(base_dir,'glister.txt'), 'w') as f:
    for item in acc:
        f.write("%s\n" % item)

# **VISUALIZATION**

In [None]:
def _load_accuracies(file_name):
    """Read one accuracy per line from base_dir/file_name; return percentages rounded to 2 decimals."""
    with open(os.path.join(base_dir, file_name), 'r') as f:
        return [round(float(x)*100, 2) for x in f.readlines()]

#Loading accuracies
acc_en = _load_accuracies('entropy.txt')
acc_bd = _load_accuracies('badge.txt')
acc_gl = _load_accuracies('glister.txt')
acc_rd = _load_accuracies('random.txt')

#Plotting
# x-axis: size of the labeled set after each round = initial labeled set plus
# `budget` points per round. The values come from the run itself instead of
# hard-coded 31 / 10, so the axis stays correct if the split or the number of
# rounds changes, and `budget` / `n_rounds` are no longer overwritten here.
# Every strategy writes one accuracy per round, so any of the lists gives the
# number of plotted rounds.
x_axis = np.array([start_idxs.size + budget*i for i in range(len(acc_rd))])
plt.figure()
plt.plot(x_axis, acc_gl, 'b-', label='GLISTER RAND',marker='o')
plt.plot(x_axis, acc_en, 'g-', label='UNCERTAINITY',marker='o')
plt.plot(x_axis, acc_bd, 'c', label='BADGE',marker='o')
plt.plot(x_axis, acc_rd, 'r', label='RANDOM',marker='o')


plt.legend()
# The samples are satimage feature vectors, and the experiment runs on
# satimage rather than IJCNN1, so the axis label and title say so.
plt.xlabel('No of Labeled Points')
plt.ylabel('Test Accuracy')
plt.title('DISTIL_SATIMAGE')
plt.show()