From 4108c7fe8b81f9f198410bb6b2598f6833329a5e Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Thu, 29 Apr 2021 15:35:16 +0530
Subject: [PATCH 01/30] new models

---
 .../adversarial_deepfool.py                   |   1 +
 distil/utils/models/README.md                 |  19 +
 distil/utils/models/__init__.py               |  46 ++-
 distil/utils/models/cifar10net.py             |  38 +-
 distil/utils/models/densenet.py               | 215 ++++++-----
 distil/utils/models/dla.py                    |  43 ++-
 distil/utils/models/dla_simple.py             |  43 ++-
 distil/utils/models/dpn.py                    |  53 ++-
 distil/utils/models/efficientnet.py           |  37 +-
 distil/utils/models/googlenet.py              |  73 +++-
 distil/utils/models/lenet.py                  |  43 ++-
 distil/utils/models/logreg_net.py             |  18 +-
 distil/utils/models/mnist_net.py              |  47 ++-
 distil/utils/models/mobilenet.py              |  42 ++-
 distil/utils/models/mobilenetv2.py            |  48 ++-
 distil/utils/models/mobilenetv2tf.py          | 133 +++++++
 distil/utils/models/pnasnet.py                |  57 ++-
 distil/utils/models/preact_resnet.py          |  53 ++-
 distil/utils/models/regnet.py                 |  51 ++-
 distil/utils/models/resnet.py                 |  46 +--
 distil/utils/models/resnet164.py              | 339 ++++++++++++++++++
 distil/utils/models/resnext.py                |  41 ++-
 distil/utils/models/senet.py                  |  44 ++-
 distil/utils/models/shufflenet.py             |  52 ++-
 distil/utils/models/shufflenetv2.py           |  55 ++-
 distil/utils/models/simpleNN_net.py           |  63 ++++
 distil/utils/models/vgg.py                    |  27 +-
 .../source/ActStrategy/cords.utils.models.rst |  41 +++
 docs/source/ActStrategy/distil.utils.rst      |  18 +-
 29 files changed, 1452 insertions(+), 334 deletions(-)
 create mode 100644 distil/utils/models/README.md
 create mode 100644 distil/utils/models/mobilenetv2tf.py
 create mode 100644 distil/utils/models/resnet164.py
 create mode 100644 distil/utils/models/simpleNN_net.py
 create mode 100644 docs/source/ActStrategy/cords.utils.models.rst

diff --git a/distil/active_learning_strategies/adversarial_deepfool.py b/distil/active_learning_strategies/adversarial_deepfool.py
index 446d23a..a175ccb 100644
--- a/distil/active_learning_strategies/adversarial_deepfool.py
+++ b/distil/active_learning_strategies/adversarial_deepfool.py
@@ -47,6 +47,7 @@ def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
         super(AdversarialDeepFool, self).__init__(X, Y, unlabeled_x, net, handler, nclasses, args={})
 
     def cal_dis(self, x):
+
         nx = Variable(torch.unsqueeze(x, 0), requires_grad=True)
         eta = Variable(torch.zeros(nx.shape))
 
diff --git a/distil/utils/models/README.md b/distil/utils/models/README.md
new file mode 100644
index 0000000..8825142
--- /dev/null
+++ b/distil/utils/models/README.md
@@ -0,0 +1,19 @@
+### DISTIL incorporates popular model architectures such as DenseNet, ResNet, MobileNet and VGG.
+
+### To use a custom model architecture, modify it in the following way:
+
+  - The `forward` method should accept two additional arguments:
+
+    - A boolean variable `last` which -
+
+      - If true: returns the model output and the output of the second last layer (the embedding)
+      
+      - If false: returns only the model output
+    
+    - A boolean variable `freeze` which -
+      
+       - If true: computes every layer before the final one under `torch.no_grad()`, i.e. no gradients are tracked for (or propagated to) the weights of those layers
+      
+       - If false: gradients are tracked through the whole network as usual
+
+  - A `get_embedding_dim()` method which returns the number of hidden units in the second last layer, i.e. the size of the embedding returned when `last` is true.
diff --git a/distil/utils/models/__init__.py b/distil/utils/models/__init__.py
index 2ec4bd1..41c05f3 100644
--- a/distil/utils/models/__init__.py
+++ b/distil/utils/models/__init__.py
@@ -1,5 +1,41 @@
-# __init__.py
-# Author: Apurva Dani <apurvadani@gmail.com>
-
-
-__version__ = '0.0.1'
\ No newline at end of file
+from .cifar10net import CifarNet
+from .densenet import DenseNet121
+from .densenet import DenseNet161
+from .densenet import DenseNet169
+from .densenet import DenseNet201
+from .dla import DLA
+from .dla_simple import SimpleDLA
+from .dpn import DPN26
+from .dpn import DPN92
+from .efficientnet import EfficientNetB0
+from .googlenet import GoogLeNet
+from .googlenet import Inception
+from .lenet import LeNet
+from .logreg_net import LogisticRegNet
+from .mnist_net import MnistNet
+from .mobilenet import MobileNet
+from .mobilenetv2 import MobileNetV2
+from .pnasnet import PNASNetA
+from .pnasnet import PNASNetB
+from .preact_resnet import PreActResNet18
+from .preact_resnet import PreActResNet34
+from .preact_resnet import PreActResNet50
+from .preact_resnet import PreActResNet101
+from .preact_resnet import PreActResNet152
+from .regnet import RegNetX_200MF
+from .regnet import RegNetX_400MF
+from .regnet import RegNetY_400MF
+from .resnet import ResNet18
+from .resnet import ResNet34
+from .resnet import ResNet50
+from .resnet import ResNet101
+from .resnet import ResNet152
+from .senet import SENet18
+from .shufflenet import ShuffleNetG2
+from .shufflenet import ShuffleNetG3
+from .shufflenetv2 import ShuffleNetV2
+from .simpleNN_net import TwoLayerNet
+from .simpleNN_net import ThreeLayerNet
+from .vgg import VGG
+from .resnet164 import ResNet164
+from .mobilenetv2tf import MobileNet2
\ No newline at end of file
diff --git a/distil/utils/models/cifar10net.py b/distil/utils/models/cifar10net.py
index e270a75..ee5bc49 100644
--- a/distil/utils/models/cifar10net.py
+++ b/distil/utils/models/cifar10net.py
@@ -1,10 +1,12 @@
 import torch.nn as nn
 import torch.nn.functional as F
-
+import torch
 
 class CifarNet(nn.Module):
     def __init__(self):
         super(CifarNet, self).__init__()
+        self.embDim = 256
+        
         self.conv1 = nn.Conv2d(3,   64,  3)
         self.conv2 = nn.Conv2d(64,  128, 3)
         self.conv3 = nn.Conv2d(128, 256, 3)
@@ -13,17 +15,29 @@ def __init__(self):
         self.fc2 = nn.Linear(128, 256)
         self.fc3 = nn.Linear(256, 10)
 
-    def forward(self, x, last=False):
-        x = self.pool(F.relu(self.conv1(x)))
-        x = self.pool(F.relu(self.conv2(x)))
-        x = self.pool(F.relu(self.conv3(x)))
-        x = x.view(-1, 64 * 4 * 4)
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        output = self.fc3(x)
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = self.pool(F.relu(self.conv1(x)))
+                out = self.pool(F.relu(self.conv2(out)))
+                out = self.pool(F.relu(self.conv3(out)))
+                out = out.view(-1, 64 * 4 * 4)
+                out = F.relu(self.fc1(out))
+                e = F.relu(self.fc2(out))
+        else:
+            out = self.pool(F.relu(self.conv1(x)))
+            out = self.pool(F.relu(self.conv2(out)))
+            out = self.pool(F.relu(self.conv3(out)))
+            out = out.view(-1, 64 * 4 * 4)
+            out = F.relu(self.fc1(out))
+            e = F.relu(self.fc2(out))
+        out = self.fc3(e)
         if last:
-          return output, x
+            return out, e
         else:
-          return output
+            return out
+
+
     def get_embedding_dim(self):
-        return 256
+        return self.embDim
diff --git a/distil/utils/models/densenet.py b/distil/utils/models/densenet.py
index 983334f..e98149e 100644
--- a/distil/utils/models/densenet.py
+++ b/distil/utils/models/densenet.py
@@ -1,113 +1,156 @@
-'''DenseNet in PyTorch.'''
-import math
+'''DenseNet in PyTorch.
 
+Reference
+    Densely Connected Convolutional Networks 
+    https://arxiv.org/pdf/1608.06993.pdf
+'''
+
+
+import re
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+from collections import OrderedDict
+
 
+class _DenseLayer(nn.Sequential):
+    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
+        super(_DenseLayer, self).__init__()
+    
 
-class Bottleneck(nn.Module):
-    def __init__(self, in_planes, growth_rate):
-        super(Bottleneck, self).__init__()
-        self.bn1 = nn.BatchNorm2d(in_planes)
-        self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(4*growth_rate)
-        self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
+        self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
+        self.add_module('relu1', nn.ReLU(inplace=True)),
+        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
+                        growth_rate, kernel_size=1, stride=1, bias=False)),
+        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
+        self.add_module('relu2', nn.ReLU(inplace=True)),
+        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
+                        kernel_size=3, stride=1, padding=1, bias=False)),
+        self.drop_rate = drop_rate
 
     def forward(self, x):
-        out = self.conv1(F.relu(self.bn1(x)))
-        out = self.conv2(F.relu(self.bn2(out)))
-        out = torch.cat([out,x], 1)
-        return out
+        new_features = super(_DenseLayer, self).forward(x)
+        if self.drop_rate > 0:
+            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
+        return torch.cat([x, new_features], 1)
 
 
-class Transition(nn.Module):
-    def __init__(self, in_planes, out_planes):
-        super(Transition, self).__init__()
-        self.bn = nn.BatchNorm2d(in_planes)
-        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)
+class _DenseBlock(nn.Sequential):
+    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
+        super(_DenseBlock, self).__init__()
+        for i in range(num_layers):
+            layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
+            self.add_module('denselayer%d' % (i + 1), layer)
 
-    def forward(self, x):
-        out = self.conv(F.relu(self.bn(x)))
-        out = F.avg_pool2d(out, 2)
-        return out
+
+class _Transition(nn.Sequential):
+    def __init__(self, num_input_features, num_output_features):
+        super(_Transition, self).__init__()
+        self.add_module('norm', nn.BatchNorm2d(num_input_features))
+        self.add_module('relu', nn.ReLU(inplace=True))
+        self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
+                                          kernel_size=1, stride=1, bias=False))
+        self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
 
 
 class DenseNet(nn.Module):
-    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
+    """
+    Parameters
+    ----------
+    growth_rate: int 
+        how many filters to add each layer (`k` in paper)
+    block_config: list of 4 ints
+        how many layers in each pooling block
+    num_init_features: int
+        the number of filters to learn in the first convolution layer
+    bn_size: int
+        multiplicative factor for the width of the bottleneck layers
+          (i.e. bn_size * k features in the bottleneck layer)
+    drop_rate: float
+        dropout rate after each dense layer
+    num_classes: int
+        number of classification classes
+    """
+
+    def __init__(self, num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
+                 bn_size=4, drop_rate=0, num_classes=10):
+
         super(DenseNet, self).__init__()
-        self.growth_rate = growth_rate
-        self.embDim = num_planes
-        num_planes = 2*growth_rate
-        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)
-
-        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
-        num_planes += nblocks[0]*growth_rate
-        out_planes = int(math.floor(num_planes*reduction))
-        self.trans1 = Transition(num_planes, out_planes)
-        num_planes = out_planes
-
-        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
-        num_planes += nblocks[1]*growth_rate
-        out_planes = int(math.floor(num_planes*reduction))
-        self.trans2 = Transition(num_planes, out_planes)
-        num_planes = out_planes
-
-        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
-        num_planes += nblocks[2]*growth_rate
-        out_planes = int(math.floor(num_planes*reduction))
-        self.trans3 = Transition(num_planes, out_planes)
-        num_planes = out_planes
-
-        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
-        num_planes += nblocks[3]*growth_rate
-
-        self.bn = nn.BatchNorm2d(num_planes)
-        self.linear = nn.Linear(num_planes, num_classes)
-
-    def _make_dense_layers(self, block, in_planes, nblock):
-        layers = []
-        for i in range(nblock):
-            layers.append(block(in_planes, self.growth_rate))
-            in_planes += self.growth_rate
-        return nn.Sequential(*layers)
-
-    def forward(self, x, last=False):
-        out = self.conv1(x)
-        out = self.trans1(self.dense1(out))
-        out = self.trans2(self.dense2(out))
-        out = self.trans3(self.dense3(out))
-        out = self.dense4(out)
-        out = F.avg_pool2d(F.relu(self.bn(out)), 4)
-        e = out.view(out.size(0), -1)
-        out = self.linear(w)
+
+        # First convolution
+        self.features = nn.Sequential(OrderedDict([
+            ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
+            ('norm0', nn.BatchNorm2d(num_init_features)),
+            ('relu0', nn.ReLU(inplace=True)),
+            ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
+        ]))
+
+        # Each denseblock
+        num_features = num_init_features
+        for i, num_layers in enumerate(block_config):
+            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
+                                bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
+            self.features.add_module('denseblock%d' % (i + 1), block)
+            num_features = num_features + num_layers * growth_rate
+            if i != len(block_config) - 1:
+                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
+                self.features.add_module('transition%d' % (i + 1), trans)
+                num_features = num_features // 2
+
+        # Embedding dimension
+        self.embDim = num_features
+        
+        # Final batch norm
+        self.features.add_module('norm5', nn.BatchNorm2d(num_features))
+        
+        # Linear layer
+        self.classifier = nn.Linear(num_features, num_classes)
+
+        # Official init from torch repo.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.constant_(m.bias, 0)
+
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                features = self.features(x)
+                out = F.relu(features, inplace=True)
+                e = F.adaptive_avg_pool2d(out, (1, 1)).view(features.size(0), -1)
+        else:
+            features = self.features(x)
+            out = F.relu(features, inplace=True)
+            e = F.adaptive_avg_pool2d(out, (1, 1)).view(features.size(0), -1)
+        out = self.classifier(e)
         if last:
-            return out, e
+          return out, e
         else:
-            return out
+          return out
+
 
     def get_embedding_dim(self):
         return self.embDim
 
-def DenseNet121(num_classes = 10):
-    return DenseNet(Bottleneck, [6,12,24,16], 32, 0.5, num_classes)
 
-def DenseNet169(num_classes = 10):
-    return DenseNet(Bottleneck, [6,12,32,32], 32, 0.5, num_classes)
+def DenseNet121(**kwargs):
+    return DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), **kwargs)
 
-def DenseNet201(num_classes = 10):
-    return DenseNet(Bottleneck, [6,12,48,32], 32, 0.5, num_classes)
 
-def DenseNet161(num_classes = 10):
-    return DenseNet(Bottleneck, [6,12,36,24], 48, 0.5, num_classes)
+def DenseNet169(**kwargs):
+    return DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), **kwargs)
+    
 
-def densenet_cifar(num_classes = 10):
-    return DenseNet(Bottleneck, [6,12,24,16], 12, 0.5, num_classes)
+def DenseNet201(**kwargs):
+    return DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), **kwargs)
+    
 
-def test():
-    net = densenet_cifar()
-    x = torch.randn(1,3,32,32)
-    y = net(x)
-    print(y)
+def DenseNet161(**kwargs):
+    return DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24), **kwargs)
 
-# test()
diff --git a/distil/utils/models/dla.py b/distil/utils/models/dla.py
index fb2dcc7..9ed2f38 100644
--- a/distil/utils/models/dla.py
+++ b/distil/utils/models/dla.py
@@ -1,8 +1,11 @@
 '''DLA in PyTorch.
 
 Reference:
-    Deep Layer Aggregation. https://arxiv.org/abs/1707.06484
+    Deep Layer Aggregation
+    https://arxiv.org/abs/1707.06484
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -28,6 +31,7 @@ def __init__(self, in_planes, planes, stride=1):
                 nn.BatchNorm2d(self.expansion*planes)
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = self.bn2(self.conv2(out))
@@ -44,6 +48,7 @@ def __init__(self, in_channels, out_channels, kernel_size=1):
             stride=1, padding=(kernel_size - 1) // 2, bias=False)
         self.bn = nn.BatchNorm2d(out_channels)
 
+
     def forward(self, xs):
         x = torch.cat(xs, 1)
         out = F.relu(self.bn(self.conv(x)))
@@ -68,6 +73,7 @@ def __init__(self, block, in_channels, out_channels, level=1, stride=1):
             self.left_node = block(out_channels, out_channels, stride=1)
             self.right_node = block(out_channels, out_channels, stride=1)
 
+
     def forward(self, x):
         xs = [self.prev_root(x)] if self.level > 1 else []
         for i in reversed(range(1, self.level)):
@@ -86,6 +92,7 @@ class DLA(nn.Module):
     def __init__(self, num_classes=10, block=BasicBlock):
         super(DLA, self).__init__()
         self.embDim = 512
+        
         self.base = nn.Sequential(
             nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False),
             nn.BatchNorm2d(16),
@@ -110,22 +117,36 @@ def __init__(self, num_classes=10, block=BasicBlock):
         self.layer6 = Tree(block, 256, 512, level=1, stride=2)
         self.linear = nn.Linear(512, num_classes)
 
-    def forward(self, x, last=False):
-        out = self.base(x)
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = self.layer5(out)
-        out = self.layer6(out)
-        out = F.avg_pool2d(out, 4)
-        e = out.view(out.size(0), -1)
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = self.base(x)
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = self.layer4(out)
+                out = self.layer5(out)
+                out = self.layer6(out)
+                out = F.avg_pool2d(out, 4)
+                e = out.view(out.size(0), -1)
+        else:
+            out = self.base(x)
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = self.layer4(out)
+            out = self.layer5(out)
+            out = self.layer6(out)
+            out = F.avg_pool2d(out, 4)
+            e = out.view(out.size(0), -1)
         out = self.linear(e)
         if last:
             return out, e
         else:
             return out
 
+
     def get_embedding_dim(self):
         return self.embDim
 
diff --git a/distil/utils/models/dla_simple.py b/distil/utils/models/dla_simple.py
index 3a43d7e..71c7049 100644
--- a/distil/utils/models/dla_simple.py
+++ b/distil/utils/models/dla_simple.py
@@ -6,8 +6,11 @@
 See dla.py for the original paper version.
 
 Reference:
-    Deep Layer Aggregation. https://arxiv.org/abs/1707.06484
+    Deep Layer Aggregation
+    https://arxiv.org/abs/1707.06484
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -33,6 +36,7 @@ def __init__(self, in_planes, planes, stride=1):
                 nn.BatchNorm2d(self.expansion*planes)
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = self.bn2(self.conv2(out))
@@ -49,6 +53,7 @@ def __init__(self, in_channels, out_channels, kernel_size=1):
             stride=1, padding=(kernel_size - 1) // 2, bias=False)
         self.bn = nn.BatchNorm2d(out_channels)
 
+
     def forward(self, xs):
         x = torch.cat(xs, 1)
         out = F.relu(self.bn(self.conv(x)))
@@ -68,6 +73,7 @@ def __init__(self, block, in_channels, out_channels, level=1, stride=1):
             self.right_tree = Tree(block, out_channels,
                                    out_channels, level=level-1, stride=1)
 
+
     def forward(self, x):
         out1 = self.left_tree(x)
         out2 = self.right_tree(out1)
@@ -79,6 +85,7 @@ class SimpleDLA(nn.Module):
     def __init__(self, num_classes=10, block=BasicBlock):
         super(SimpleDLA, self).__init__()
         self.embDim = 512
+        
         self.base = nn.Sequential(
             nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False),
             nn.BatchNorm2d(16),
@@ -103,22 +110,36 @@ def __init__(self, num_classes=10, block=BasicBlock):
         self.layer6 = Tree(block, 256, 512, level=1, stride=2)
         self.linear = nn.Linear(512, num_classes)
 
-    def forward(self, x, last=False):
-        out = self.base(x)
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = self.layer5(out)
-        out = self.layer6(out)
-        out = F.avg_pool2d(out, 4)
-        e = out.view(out.size(0), -1)
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = self.base(x)
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = self.layer4(out)
+                out = self.layer5(out)
+                out = self.layer6(out)
+                out = F.avg_pool2d(out, 4)
+                e = out.view(out.size(0), -1)
+        else:
+            out = self.base(x)
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = self.layer4(out)
+            out = self.layer5(out)
+            out = self.layer6(out)
+            out = F.avg_pool2d(out, 4)
+            e = out.view(out.size(0), -1)
         out = self.linear(e)
         if last:
             return out, e
         else:
             return out
 
+
     def get_embedding_dim(self):
         return self.embDim
 
diff --git a/distil/utils/models/dpn.py b/distil/utils/models/dpn.py
index d334367..6d90a4b 100644
--- a/distil/utils/models/dpn.py
+++ b/distil/utils/models/dpn.py
@@ -1,4 +1,11 @@
-'''Dual Path Networks in PyTorch.'''
+'''DPN in PyTorch.
+
+Reference
+    Dual Path Networks
+    https://arxiv.org/abs/1707.01629
+'''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -24,6 +31,7 @@ def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, firs
                 nn.BatchNorm2d(out_planes+dense_depth)
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = F.relu(self.bn2(self.conv2(out)))
@@ -40,7 +48,8 @@ def __init__(self, cfg):
         super(DPN, self).__init__()
         in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
         num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']
-
+        self.embDim = out_planes[3]+(num_blocks[3]+1)*dense_depth[3]
+        
         self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(64)
         self.last_planes = 64
@@ -50,6 +59,7 @@ def __init__(self, cfg):
         self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
         self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10)
 
+
     def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
         strides = [stride] + [1]*(num_blocks-1)
         layers = []
@@ -58,16 +68,34 @@ def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
             self.last_planes = out_planes + (i+2) * dense_depth
         return nn.Sequential(*layers)
 
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = self.layer4(out)
+                out = F.avg_pool2d(out, 4)
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = self.layer4(out)
+            out = F.avg_pool2d(out, 4)
+            e = out.view(out.size(0), -1)
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+      
+      
+    def get_embedding_dim(self):
+        return self.embDim
 
 
 def DPN26():
@@ -79,6 +107,7 @@ def DPN26():
     }
     return DPN(cfg)
 
+
 def DPN92():
     cfg = {
         'in_planes': (96,192,384,768),
diff --git a/distil/utils/models/efficientnet.py b/distil/utils/models/efficientnet.py
index b3abdba..2532ae3 100644
--- a/distil/utils/models/efficientnet.py
+++ b/distil/utils/models/efficientnet.py
@@ -1,8 +1,8 @@
 '''EfficientNet in PyTorch.
 
-Paper: "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks".
-
-Reference: https://github.com/keras-team/keras-applications/blob/master/keras_applications/efficientnet.py
+Reference
+    EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks
+    https://github.com/keras-team/keras-applications/blob/master/keras_applications/efficientnet.py
 '''
 import torch
 import torch.nn as nn
@@ -32,6 +32,7 @@ def __init__(self, in_channels, se_channels):
         self.se2 = nn.Conv2d(se_channels, in_channels,
                              kernel_size=1, bias=True)
 
+
     def forward(self, x):
         out = F.adaptive_avg_pool2d(x, (1, 1))
         out = swish(self.se1(out))
@@ -92,6 +93,7 @@ def __init__(self,
         # Skip connection if in and out shapes are the same (MV-V2 style)
         self.has_skip = (stride == 1) and (in_channels == out_channels)
 
+
     def forward(self, x):
         out = x if self.expand_ratio == 1 else swish(self.bn1(self.conv1(x)))
         out = swish(self.bn2(self.conv2(out)))
@@ -119,6 +121,7 @@ def __init__(self, cfg, num_classes=10):
         self.layers = self._make_layers(in_channels=32)
         self.linear = nn.Linear(cfg['out_channels'][-1], num_classes)
 
+
     def _make_layers(self, in_channels):
         layers = []
         cfg = [self.cfg[k] for k in ['expansion', 'out_channels', 'num_blocks', 'kernel_size',
@@ -140,20 +143,32 @@ def _make_layers(self, in_channels):
                 in_channels = out_channels
         return nn.Sequential(*layers)
 
-    def forward(self, x, last=False):
-        out = swish(self.bn1(self.conv1(x)))
-        out = self.layers(out)
-        out = F.adaptive_avg_pool2d(out, 1)
-        e = out.view(out.size(0), -1)
-        dropout_rate = self.cfg['dropout_rate']
-        if self.training and dropout_rate > 0:
-            e = F.dropout(e, p=dropout_rate)
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = swish(self.bn1(self.conv1(x)))
+                out = self.layers(out)
+                out = F.adaptive_avg_pool2d(out, 1)
+                e = out.view(out.size(0), -1)
+                dropout_rate = self.cfg['dropout_rate']
+                if self.training and dropout_rate > 0:
+                    e = F.dropout(e, p=dropout_rate)
+        else:
+            out = swish(self.bn1(self.conv1(x)))
+            out = self.layers(out)
+            out = F.adaptive_avg_pool2d(out, 1)
+            e = out.view(out.size(0), -1)
+            dropout_rate = self.cfg['dropout_rate']
+            if self.training and dropout_rate > 0:
+                e = F.dropout(e, p=dropout_rate)
         out = self.linear(e)
         if last:
             return out, e
         else:
             return out
 
+
     def get_embedding_dim(self):
         return self.embDim
 
diff --git a/distil/utils/models/googlenet.py b/distil/utils/models/googlenet.py
index de036d8..9b849af 100644
--- a/distil/utils/models/googlenet.py
+++ b/distil/utils/models/googlenet.py
@@ -1,4 +1,11 @@
-'''GoogLeNet with PyTorch.'''
+'''GoogLeNet in PyTorch.
+
+Reference:
+    Going Deeper with Convolutions (GoogLeNet)
+    https://arxiv.org/abs/1409.4842
+'''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -45,6 +52,7 @@ def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
             nn.ReLU(True),
         )
 
+
     def forward(self, x):
         y1 = self.b1(x)
         y2 = self.b2(x)
@@ -56,12 +64,14 @@ def forward(self, x):
 class GoogLeNet(nn.Module):
     def __init__(self):
         super(GoogLeNet, self).__init__()
+        self.embDim = 1024
+        
         self.pre_layers = nn.Sequential(
             nn.Conv2d(3, 192, kernel_size=3, padding=1),
             nn.BatchNorm2d(192),
             nn.ReLU(True),
         )
-
+        
         self.a3 = Inception(192,  64,  96, 128, 16, 32, 32)
         self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)
 
@@ -79,23 +89,48 @@ def __init__(self):
         self.avgpool = nn.AvgPool2d(8, stride=1)
         self.linear = nn.Linear(1024, 10)
 
-    def forward(self, x):
-        out = self.pre_layers(x)
-        out = self.a3(out)
-        out = self.b3(out)
-        out = self.maxpool(out)
-        out = self.a4(out)
-        out = self.b4(out)
-        out = self.c4(out)
-        out = self.d4(out)
-        out = self.e4(out)
-        out = self.maxpool(out)
-        out = self.a5(out)
-        out = self.b5(out)
-        out = self.avgpool(out)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = self.pre_layers(x)
+                out = self.a3(out)
+                out = self.b3(out)
+                out = self.maxpool(out)
+                out = self.a4(out)
+                out = self.b4(out)
+                out = self.c4(out)
+                out = self.d4(out)
+                out = self.e4(out)
+                out = self.maxpool(out)
+                out = self.a5(out)
+                out = self.b5(out)
+                out = self.avgpool(out)
+                e = out.view(out.size(0), -1)
+        else:
+            out = self.pre_layers(x)
+            out = self.a3(out)
+            out = self.b3(out)
+            out = self.maxpool(out)
+            out = self.a4(out)
+            out = self.b4(out)
+            out = self.c4(out)
+            out = self.d4(out)
+            out = self.e4(out)
+            out = self.maxpool(out)
+            out = self.a5(out)
+            out = self.b5(out)
+            out = self.avgpool(out)
+            e = out.view(out.size(0), -1)
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+     
+     
+    def get_embedding_dim(self):
+        return self.embDim
 
 
 def test():
diff --git a/distil/utils/models/lenet.py b/distil/utils/models/lenet.py
index d657b74..ad2698c 100644
--- a/distil/utils/models/lenet.py
+++ b/distil/utils/models/lenet.py
@@ -1,23 +1,46 @@
 '''LeNet in PyTorch.'''
+
+
 import torch.nn as nn
 import torch.nn.functional as F
+import torch
 
 class LeNet(nn.Module):
     def __init__(self):
         super(LeNet, self).__init__()
+        self.embDim = 84
+        
         self.conv1 = nn.Conv2d(3, 6, 5)
         self.conv2 = nn.Conv2d(6, 16, 5)
         self.fc1   = nn.Linear(16*5*5, 120)
         self.fc2   = nn.Linear(120, 84)
         self.fc3   = nn.Linear(84, 10)
 
-    def forward(self, x):
-        out = F.relu(self.conv1(x))
-        out = F.max_pool2d(out, 2)
-        out = F.relu(self.conv2(out))
-        out = F.max_pool2d(out, 2)
-        out = out.view(out.size(0), -1)
-        out = F.relu(self.fc1(out))
-        out = F.relu(self.fc2(out))
-        out = self.fc3(out)
-        return out
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.conv1(x))
+                out = F.max_pool2d(out, 2)
+                out = F.relu(self.conv2(out))
+                out = F.max_pool2d(out, 2)
+                out = out.view(out.size(0), -1)
+                out = F.relu(self.fc1(out))
+                e = F.relu(self.fc2(out))
+        else:
+            out = F.relu(self.conv1(x))
+            out = F.max_pool2d(out, 2)
+            out = F.relu(self.conv2(out))
+            out = F.max_pool2d(out, 2)
+            out = out.view(out.size(0), -1)
+            out = F.relu(self.fc1(out))
+            e = F.relu(self.fc2(out))
+        out = self.fc3(e)
+        if last:
+            return out, e
+        else:
+            return out
+
+
+    def get_embedding_dim(self):
+        return self.embDim
\ No newline at end of file
diff --git a/distil/utils/models/logreg_net.py b/distil/utils/models/logreg_net.py
index fe89e78..72b1965 100644
--- a/distil/utils/models/logreg_net.py
+++ b/distil/utils/models/logreg_net.py
@@ -1,4 +1,5 @@
 import torch.nn as nn
+import torch
 
 ### Logisitic Regression model
 ### The softmax will be applied by the torch's CrossEntropyLoss loss function
@@ -7,17 +8,20 @@ class LogisticRegNet(nn.Module):
     def __init__(self, input_dim, num_classes):
         super(LogisticRegNet, self).__init__()
         self.linear = nn.Linear(input_dim, num_classes)
-        self.embd_dim = input_dim
+        self.feature_dim = input_dim
 
-    def forward(self, x, last=False):
-        scores = self.linear(x)
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                scores = self.linear(x)
+        else:
+            scores = self.linear(x)
         if last:
             return scores, x
         else:
             return scores
 
-    def get_embedding_dim(self):
-        return self.embd_dim
-
-
 
+    def get_embedding_dim(self):
+        return self.feature_dim
diff --git a/distil/utils/models/mnist_net.py b/distil/utils/models/mnist_net.py
index 48e16e1..ddc52a2 100644
--- a/distil/utils/models/mnist_net.py
+++ b/distil/utils/models/mnist_net.py
@@ -6,6 +6,8 @@
 class MnistNet(nn.Module):
     def __init__(self):
         super(MnistNet, self).__init__()
+        self.embDim = 128
+        
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
         self.dropout1 = nn.Dropout2d(0.25)
@@ -13,22 +15,37 @@ def __init__(self):
         self.fc1 = nn.Linear(9216, 128)
         self.fc2 = nn.Linear(128, 10)
 
-    def forward(self, x, last=False):
-        x = self.conv1(x)
-        x = F.relu(x)
-        x = self.conv2(x)
-        x = F.relu(x)
-        x = F.max_pool2d(x, 2)
-        x = self.dropout1(x)
-        x = torch.flatten(x, 1)
-        x = self.fc1(x)
-        x = F.relu(x)
-        x = self.dropout2(x)
-        output = self.fc2(x)
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = self.conv1(x)
+                out = F.relu(out)
+                out = self.conv2(out)
+                out = F.relu(out)
+                out = F.max_pool2d(out, 2)
+                out = self.dropout1(out)
+                out = torch.flatten(out, 1)
+                out = self.fc1(out)
+                out = F.relu(out)
+                e = self.dropout2(out) 
+        else:
+            out = self.conv1(x)
+            out = F.relu(out)
+            out = self.conv2(out)
+            out = F.relu(out)
+            out = F.max_pool2d(out, 2)
+            out = self.dropout1(out)
+            out = torch.flatten(out, 1)
+            out = self.fc1(out)
+            out = F.relu(out)
+            e = self.dropout2(out)
+        out = self.fc2(e)
         if last:
-            return output, x
+            return out, e
         else:
-            return output
+            return out
+
 
     def get_embedding_dim(self):
-        return 128
+        return self.embDim
diff --git a/distil/utils/models/mobilenet.py b/distil/utils/models/mobilenet.py
index 497ef1e..911067c 100644
--- a/distil/utils/models/mobilenet.py
+++ b/distil/utils/models/mobilenet.py
@@ -1,8 +1,11 @@
 '''MobileNet in PyTorch.
 
-See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
-for more details.
+Reference
+    MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications 
+    https://arxiv.org/abs/1704.04861
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -10,6 +13,7 @@
 
 class Block(nn.Module):
     '''Depthwise conv + Pointwise conv'''
+    
     def __init__(self, in_planes, out_planes, stride=1):
         super(Block, self).__init__()
         self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
@@ -17,6 +21,7 @@ def __init__(self, in_planes, out_planes, stride=1):
         self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
         self.bn2 = nn.BatchNorm2d(out_planes)
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = F.relu(self.bn2(self.conv2(out)))
@@ -29,11 +34,14 @@ class MobileNet(nn.Module):
 
     def __init__(self, num_classes=10):
         super(MobileNet, self).__init__()
+        self.embDim = 1024
+        
         self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(32)
         self.layers = self._make_layers(in_planes=32)
         self.linear = nn.Linear(1024, num_classes)
 
+
     def _make_layers(self, in_planes):
         layers = []
         for x in self.cfg:
@@ -43,18 +51,32 @@ def _make_layers(self, in_planes):
             in_planes = out_planes
         return nn.Sequential(*layers)
 
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layers(out)
-        out = F.avg_pool2d(out, 2)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
 
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                out = self.layers(out)
+                out = F.avg_pool2d(out, 2)
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            out = self.layers(out)
+            out = F.avg_pool2d(out, 2)
+            e = out.view(out.size(0), -1)
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+
+    def get_embedding_dim(self):
+        return self.embDim
 
+        
 def test():
     net = MobileNet()
-    x = torch.randn(1,3,32,32)
+    x = torch.randn(1, 3, 32, 32)
     y = net(x)
     print(y.size())
 
diff --git a/distil/utils/models/mobilenetv2.py b/distil/utils/models/mobilenetv2.py
index e4122d1..cf7a14f 100644
--- a/distil/utils/models/mobilenetv2.py
+++ b/distil/utils/models/mobilenetv2.py
@@ -1,8 +1,11 @@
 '''MobileNetV2 in PyTorch.
 
-See the paper "Inverted Residuals and Linear Bottlenecks:
-Mobile Networks for Classification, Detection and Segmentation" for more details.
+Reference
+    Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation
+    https://arxiv.org/abs/1801.04381
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -10,6 +13,7 @@
 
 class Block(nn.Module):
     '''expand + depthwise + pointwise'''
+    
     def __init__(self, in_planes, out_planes, expansion, stride):
         super(Block, self).__init__()
         self.stride = stride
@@ -29,6 +33,7 @@ def __init__(self, in_planes, out_planes, expansion, stride):
                 nn.BatchNorm2d(out_planes),
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = F.relu(self.bn2(self.conv2(out)))
@@ -47,10 +52,12 @@ class MobileNetV2(nn.Module):
            (6, 160, 3, 2),
            (6, 320, 1, 1)]
 
+
     def __init__(self, num_classes=10):
         super(MobileNetV2, self).__init__()
-        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
         self.embDim = 1280
+        
+        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
         self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(32)
         self.layers = self._make_layers(in_planes=32)
@@ -58,6 +65,7 @@ def __init__(self, num_classes=10):
         self.bn2 = nn.BatchNorm2d(1280)
         self.linear = nn.Linear(1280, num_classes)
 
+
     def _make_layers(self, in_planes):
         layers = []
         for expansion, out_planes, num_blocks, stride in self.cfg:
@@ -67,22 +75,36 @@ def _make_layers(self, in_planes):
                 in_planes = out_planes
         return nn.Sequential(*layers)
 
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layers(out)
-        out = F.relu(self.bn2(self.conv2(out)))
-        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                out = self.layers(out)
+                out = F.relu(self.bn2(self.conv2(out)))
+                # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
+                out = F.avg_pool2d(out, 4)
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            out = self.layers(out)
+            out = F.relu(self.bn2(self.conv2(out)))
+            # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
+            out = F.avg_pool2d(out, 4)
+            e = out.view(out.size(0), -1)
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
 
     def get_embedding_dim(self):
         return self.embDim
 
+
 def test():
     net = MobileNetV2()
-    x = torch.randn(2,3,32,32)
+    x = torch.randn(2, 3, 32, 32)
     y = net(x)
     print(y.size())
 
diff --git a/distil/utils/models/mobilenetv2tf.py b/distil/utils/models/mobilenetv2tf.py
new file mode 100644
index 0000000..ad2d877
--- /dev/null
+++ b/distil/utils/models/mobilenetv2tf.py
@@ -0,0 +1,133 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class BaseBlock(nn.Module):
+    alpha = 1  # width multiplier; __init__ scales both channel counts by it
+
+    def __init__(self, input_channel, output_channel, t=6, downsample=False):
+        """Build a 1x1 expand -> 3x3 depthwise -> 1x1 project block.
+            t:  expansion factor, t*input_channel is channel of expansion layer
+            downsample:  if True the depthwise conv uses stride 2, otherwise 1
+            (the width multiplier is read from the class attribute `alpha`)
+        """
+        super(BaseBlock, self).__init__()
+        self.stride = 2 if downsample else 1
+        self.downsample = downsample
+        self.shortcut = (not downsample) and (input_channel == output_channel)
+        # NOTE: the shortcut test above uses the channel arguments before scaling
+        # apply alpha
+        input_channel = int(self.alpha * input_channel)
+        output_channel = int(self.alpha * output_channel)
+
+        # for main path:
+        c = t * input_channel
+        # 1x1   point wise conv
+        self.conv1 = nn.Conv2d(input_channel, c, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(c)
+        # 3x3   depth wise conv
+        self.conv2 = nn.Conv2d(c, c, kernel_size=3, stride=self.stride, padding=1, groups=c, bias=False)
+        self.bn2 = nn.BatchNorm2d(c)
+        # 1x1   point wise conv
+        self.conv3 = nn.Conv2d(c, output_channel, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(output_channel)
+
+    def forward(self, inputs):
+        # main path: expand, depthwise, then projection (no activation after bn3)
+        x = F.relu6(self.bn1(self.conv1(inputs)), inplace=True)
+        x = F.relu6(self.bn2(self.conv2(x)), inplace=True)
+        x = self.bn3(self.conv3(x))
+
+        # shortcut path: residual add only when self.shortcut is set
+        x = x + inputs if self.shortcut else x
+
+        return x
+
+class MobileNet2(nn.Module):
+    def __init__(self, output_size, alpha=1):
+        super(MobileNet2, self).__init__()
+        self.output_size = output_size
+
+        # first conv layer 
+        self.conv0 = nn.Conv2d(3, int(32 * alpha), kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn0 = nn.BatchNorm2d(int(32 * alpha))
+
+        # build bottlenecks
+        BaseBlock.alpha = alpha
+        self.bottlenecks = nn.Sequential(
+            BaseBlock(32, 16, t=1, downsample=False),
+            BaseBlock(16, 24, downsample=False),
+            BaseBlock(24, 24),
+            BaseBlock(24, 32, downsample=False),
+            BaseBlock(32, 32),
+            BaseBlock(32, 32),
+            BaseBlock(32, 64, downsample=True),
+            BaseBlock(64, 64),
+            BaseBlock(64, 64),
+            BaseBlock(64, 64),
+            BaseBlock(64, 96, downsample=False),
+            BaseBlock(96, 96),
+            BaseBlock(96, 96),
+            BaseBlock(96, 160, downsample=True),
+            BaseBlock(160, 160),
+            BaseBlock(160, 160),
+            BaseBlock(160, 320, downsample=False))
+
+        # last conv layers and fc layer
+        self.conv1 = nn.Conv2d(int(320 * alpha), 1280, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(1280)
+        self.fc = nn.Linear(1280, output_size)
+        self.embDim = 1280
+        # weights init
+        self.weights_init()
+
+    def weights_init(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def forward(self, inputs, last=False, freeze=False):
+
+        if freeze:
+            with torch.no_grad():
+                # first conv layer
+                x = F.relu6(self.bn0(self.conv0(inputs)), inplace=True)
+                # assert x.shape[1:] == torch.Size([32, 32, 32])
+                # bottlenecks
+                x = self.bottlenecks(x)
+                # assert x.shape[1:] == torch.Size([320, 8, 8])
+                # last conv layer
+                x = F.relu6(self.bn1(self.conv1(x)), inplace=True)
+                # assert x.shape[1:] == torch.Size([1280,8,8])
+                # global pooling and fc (in place of conv 1x1 in paper)
+                x = F.adaptive_avg_pool2d(x, 1)
+                e = x.view(x.shape[0], -1)
+        else:
+            # first conv layer
+            x = F.relu6(self.bn0(self.conv0(inputs)), inplace=True)
+            # assert x.shape[1:] == torch.Size([32, 32, 32])
+            # bottlenecks
+            x = self.bottlenecks(x)
+            # assert x.shape[1:] == torch.Size([320, 8, 8])
+            # last conv layer
+            x = F.relu6(self.bn1(self.conv1(x)), inplace=True)
+            # assert x.shape[1:] == torch.Size([1280,8,8])
+            # global pooling and fc (in place of conv 1x1 in paper)
+            x = F.adaptive_avg_pool2d(x, 1)
+            e = x.view(x.shape[0], -1)
+        x = self.fc(e)
+
+        if last:
+            return x, e
+        else:
+            return x
+
+    def get_embedding_dim(self):
+        return self.embDim
diff --git a/distil/utils/models/pnasnet.py b/distil/utils/models/pnasnet.py
index de8c4d5..bcd03cb 100644
--- a/distil/utils/models/pnasnet.py
+++ b/distil/utils/models/pnasnet.py
@@ -1,7 +1,11 @@
 '''PNASNet in PyTorch.
 
-Paper: Progressive Neural Architecture Search
+Reference
+    Progressive Neural Architecture Search
+    https://arxiv.org/abs/1712.00559
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -17,6 +21,7 @@ def __init__(self, in_planes, out_planes, kernel_size, stride):
                                bias=False, groups=in_planes)
         self.bn1 = nn.BatchNorm2d(out_planes)
 
+
     def forward(self, x):
         return self.bn1(self.conv1(x))
 
@@ -30,6 +35,7 @@ def __init__(self, in_planes, out_planes, stride=1):
             self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
             self.bn1 = nn.BatchNorm2d(out_planes)
 
+
     def forward(self, x):
         y1 = self.sep_conv1(x)
         y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
@@ -37,6 +43,7 @@ def forward(self, x):
             y2 = self.bn1(self.conv1(y2))
         return F.relu(y1+y2)
 
+
 class CellB(nn.Module):
     def __init__(self, in_planes, out_planes, stride=1):
         super(CellB, self).__init__()
@@ -53,6 +60,7 @@ def __init__(self, in_planes, out_planes, stride=1):
         self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
         self.bn2 = nn.BatchNorm2d(out_planes)
 
+
     def forward(self, x):
         # Left branch
         y1 = self.sep_conv1(x)
@@ -68,12 +76,14 @@ def forward(self, x):
         y = torch.cat([b1,b2], 1)
         return F.relu(self.bn2(self.conv2(y)))
 
+
 class PNASNet(nn.Module):
     def __init__(self, cell_type, num_cells, num_planes):
         super(PNASNet, self).__init__()
         self.in_planes = num_planes
         self.cell_type = cell_type
-
+        self.embDim = self.in_planes * 4
+        
         self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(num_planes)
 
@@ -85,6 +95,7 @@ def __init__(self, cell_type, num_cells, num_planes):
 
         self.linear = nn.Linear(num_planes*4, 10)
 
+
     def _make_layer(self, planes, num_cells):
         layers = []
         for _ in range(num_cells):
@@ -92,26 +103,48 @@ def _make_layer(self, planes, num_cells):
             self.in_planes = planes
         return nn.Sequential(*layers)
 
+
     def _downsample(self, planes):
         layer = self.cell_type(self.in_planes, planes, stride=2)
         self.in_planes = planes
         return layer
 
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = self.layer5(out)
-        out = F.avg_pool2d(out, 8)
-        out = self.linear(out.view(out.size(0), -1))
-        return out
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = self.layer4(out)
+                out = self.layer5(out)
+                out = F.avg_pool2d(out, 8)
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = self.layer4(out)
+            out = self.layer5(out)
+            out = F.avg_pool2d(out, 8)
+            e = out.view(out.size(0), -1)
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+
+
+    def get_embedding_dim(self):
+        return self.embDim
 
 
 def PNASNetA():
     return PNASNet(CellA, num_cells=6, num_planes=44)
 
+
 def PNASNetB():
     return PNASNet(CellB, num_cells=6, num_planes=32)
 
diff --git a/distil/utils/models/preact_resnet.py b/distil/utils/models/preact_resnet.py
index abb1bc3..c979370 100644
--- a/distil/utils/models/preact_resnet.py
+++ b/distil/utils/models/preact_resnet.py
@@ -1,9 +1,11 @@
 '''Pre-activation ResNet in PyTorch.
 
 Reference:
-[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
+    Identity Mappings in Deep Residual Networks
+    https://arxiv.org/abs/1603.05027
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -25,6 +27,7 @@ def __init__(self, in_planes, planes, stride=1):
                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(x))
         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
@@ -52,6 +55,7 @@ def __init__(self, in_planes, planes, stride=1):
                 nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(x))
         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
@@ -66,7 +70,8 @@ class PreActResNet(nn.Module):
     def __init__(self, block, num_blocks, num_classes=10):
         super(PreActResNet, self).__init__()
         self.in_planes = 64
-
+        self.embDim = 8 * self.in_planes * block.expansion
+        
         self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
         self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
         self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
@@ -82,30 +87,52 @@ def _make_layer(self, block, planes, num_blocks, stride):
             self.in_planes = planes * block.expansion
         return nn.Sequential(*layers)
 
-    def forward(self, x):
-        out = self.conv1(x)
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = self.conv1(x)
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = self.layer4(out)
+                out = F.avg_pool2d(out, 4)
+                e = out.view(out.size(0), -1)
+        else:
+            out = self.conv1(x)
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = self.layer4(out)
+            out = F.avg_pool2d(out, 4)
+            e = out.view(out.size(0), -1)
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+
+
+    def get_embedding_dim(self):
+        return self.embDim
 
 
 def PreActResNet18():
     return PreActResNet(PreActBlock, [2,2,2,2])
 
+
 def PreActResNet34():
     return PreActResNet(PreActBlock, [3,4,6,3])
 
+
 def PreActResNet50():
     return PreActResNet(PreActBottleneck, [3,4,6,3])
 
+
 def PreActResNet101():
     return PreActResNet(PreActBottleneck, [3,4,23,3])
 
+
 def PreActResNet152():
     return PreActResNet(PreActBottleneck, [3,8,36,3])
 
diff --git a/distil/utils/models/regnet.py b/distil/utils/models/regnet.py
index 5d59c1a..d853d14 100644
--- a/distil/utils/models/regnet.py
+++ b/distil/utils/models/regnet.py
@@ -1,9 +1,11 @@
 '''RegNet in PyTorch.
 
-Paper: "Designing Network Design Spaces".
-
-Reference: https://github.com/keras-team/keras-applications/blob/master/keras_applications/efficientnet.py
+Reference
+    Designing Network Design Spaces
+    https://arxiv.org/abs/2003.13678
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -17,6 +19,7 @@ def __init__(self, in_planes, se_planes):
         self.se1 = nn.Conv2d(in_planes, se_planes, kernel_size=1, bias=True)
         self.se2 = nn.Conv2d(se_planes, in_planes, kernel_size=1, bias=True)
 
+
     def forward(self, x):
         out = F.adaptive_avg_pool2d(x, (1, 1))
         out = F.relu(self.se1(out))
@@ -54,6 +57,7 @@ def __init__(self, w_in, w_out, stride, group_width, bottleneck_ratio, se_ratio)
                 nn.BatchNorm2d(w_out)
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = F.relu(self.bn2(self.conv2(out)))
@@ -70,6 +74,8 @@ def __init__(self, cfg, num_classes=10):
         super(RegNet, self).__init__()
         self.cfg = cfg
         self.in_planes = 64
+        self.embDim = self.cfg['widths'][-1]
+        
         self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                                stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(64)
@@ -79,6 +85,7 @@ def __init__(self, cfg, num_classes=10):
         self.layer4 = self._make_layer(3)
         self.linear = nn.Linear(self.cfg['widths'][-1], num_classes)
 
+
     def _make_layer(self, idx):
         depth = self.cfg['depths'][idx]
         width = self.cfg['widths'][idx]
@@ -95,16 +102,34 @@ def _make_layer(self, idx):
             self.in_planes = width
         return nn.Sequential(*layers)
 
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = F.adaptive_avg_pool2d(out, (1, 1))
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = self.layer4(out)
+                out = F.adaptive_avg_pool2d(out, (1, 1))
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = self.layer4(out)
+            out = F.adaptive_avg_pool2d(out, (1, 1))
+            e = out.view(out.size(0), -1)
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+
+
+    def get_embedding_dim(self):
+        return self.embDim
 
 
 def RegNetX_200MF():
diff --git a/distil/utils/models/resnet.py b/distil/utils/models/resnet.py
index bc9803f..1182b66 100644
--- a/distil/utils/models/resnet.py
+++ b/distil/utils/models/resnet.py
@@ -1,15 +1,16 @@
 '''ResNet in PyTorch.
 
-For Pre-activation ResNet, see 'preact_resnet.py'.
-
-Reference:
-[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-    Deep Residual Learning for Image Recognition. arXiv:1512.03385
+Reference
+    Deep Residual Learning for Image Recognition
+    https://arxiv.org/abs/1512.03385
 '''
+
+
 import torch.nn as nn
 import torch.nn.functional as F
 import torch
 
+
 class BasicBlock(nn.Module):
     expansion = 1
 
@@ -54,6 +55,7 @@ def __init__(self, in_planes, planes, stride=1):
                 nn.BatchNorm2d(self.expansion*planes)
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = F.relu(self.bn2(self.conv2(out)))
@@ -64,11 +66,12 @@ def forward(self, x):
 
 
 class ResNet(nn.Module):
-    def __init__(self, block, num_blocks, num_classes=10, channels=3):
+    def __init__(self, block, num_blocks, num_classes=10):
         super(ResNet, self).__init__()
         self.in_planes = 64
         self.embDim = 8 * self.in_planes * block.expansion
-        self.conv1 = nn.Conv2d(channels, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(64)
         self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
         self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
@@ -76,6 +79,7 @@ def __init__(self, block, num_blocks, num_classes=10, channels=3):
         self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
         self.linear = nn.Linear(512*block.expansion, num_classes)
 
+
     def _make_layer(self, block, planes, num_blocks, stride):
         strides = [stride] + [1]*(num_blocks-1)
         layers = []
@@ -111,26 +115,24 @@ def forward(self, x, last=False, freeze=False):
     def get_embedding_dim(self):
         return self.embDim
 
-def ResNet18(num_classes=10, channels=3):
-    return ResNet(BasicBlock, [2,2,2,2], num_classes, channels)
 
-def ResNet34(num_classes=10, channels=3):
-    return ResNet(BasicBlock, [3,4,6,3], num_classes, channels)
+def ResNet18(num_classes=10):
+    return ResNet(BasicBlock, [2,2,2,2], num_classes)
+
+
+def ResNet34(num_classes=10):
+    return ResNet(BasicBlock, [3,4,6,3], num_classes)
+
 
-def ResNet50(num_classes=10, channels=3):
-    return ResNet(Bottleneck, [3,4,6,3], num_classes, channels)
+def ResNet50(num_classes=10):
+    return ResNet(Bottleneck, [3,4,6,3], num_classes)
 
-def ResNet101(num_classes=10, channels=3):
-    return ResNet(Bottleneck, [3,4,23,3], num_classes, channels)
 
-def ResNet152(num_classes=10, channels=3):
-    return ResNet(Bottleneck, [3,8,36,3], num_classes, channels)
+def ResNet101(num_classes=10):
+    return ResNet(Bottleneck, [3,4,23,3], num_classes)
 
 
-# def test():
-#     net = ResNet18()
-#     y = net(torch.randn(1,3,32,32))
-#     print(y)
-#     print(y.size())
+def ResNet152(num_classes=10):
+    return ResNet(Bottleneck, [3,8,36,3], num_classes)
 
 #test()
diff --git a/distil/utils/models/resnet164.py b/distil/utils/models/resnet164.py
new file mode 100644
index 0000000..7837906
--- /dev/null
+++ b/distil/utils/models/resnet164.py
@@ -0,0 +1,339 @@
+'''ResNet for cifar in pytorch
+
+Reference:
+    Deep residual learning for image recognition
+        https://arxiv.org/abs/1512.03385
+    Identity mappings in deep residual networks
+        https://arxiv.org/abs/1603.05027
+'''
+
+import torch
+import torch.nn as nn
+import math
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    " 3x3 convolution with padding "
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion=1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion=4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes*4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes*4)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class PreActBasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(PreActBasicBlock, self).__init__()
+        self.bn1 = nn.BatchNorm2d(inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = conv3x3(planes, planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.bn1(x)
+        out = self.relu(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(out)
+
+        out = self.conv1(out)
+
+        out = self.bn2(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+
+        out += residual
+
+        return out
+
+
+class PreActBottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(PreActBottleneck, self).__init__()
+        self.bn1 = nn.BatchNorm2d(inplanes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes*4, kernel_size=1, bias=False)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.bn1(x)
+        out = self.relu(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(out)
+
+        out = self.conv1(out)
+
+        out = self.bn2(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+
+        out = self.bn3(out)
+        out = self.relu(out)
+        out = self.conv3(out)
+
+        out += residual
+
+        return out
+
+
+class ResNet_Cifar(nn.Module):
+
+    def __init__(self, block, layers, num_classes=10):
+        super(ResNet_Cifar, self).__init__()
+        self.inplanes = 16
+        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(16)
+        self.relu = nn.ReLU(inplace=True)
+        self.layer1 = self._make_layer(block, 16, layers[0])
+        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
+        self.avgpool = nn.AvgPool2d(8, stride=1)
+        self.fc = nn.Linear(64 * block.expansion, num_classes)
+        self.embDim = 64 * block.expansion
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion)
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                x = self.conv1(x)
+                x = self.bn1(x)
+                x = self.relu(x)
+
+                x = self.layer1(x)
+                x = self.layer2(x)
+                x = self.layer3(x)
+
+                x = self.avgpool(x)
+                x = x.view(x.size(0), -1)
+        else:
+            x = self.conv1(x)
+            x = self.bn1(x)
+            x = self.relu(x)
+
+            x = self.layer1(x)
+            x = self.layer2(x)
+            x = self.layer3(x)
+
+            x = self.avgpool(x)
+            x = x.view(x.size(0), -1)
+        out = self.fc(x)
+        if last:
+            return out, x
+        else:
+            return out
+
+    def get_embedding_dim(self):
+        return self.embDim
+
+class PreAct_ResNet_Cifar(nn.Module):
+
+    def __init__(self, block, layers, num_classes=10):
+        super(PreAct_ResNet_Cifar, self).__init__()
+        self.inplanes = 16
+        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
+        self.layer1 = self._make_layer(block, 16, layers[0])
+        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
+        self.bn = nn.BatchNorm2d(64*block.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.avgpool = nn.AvgPool2d(8, stride=1)
+        self.fc = nn.Linear(64*block.expansion, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes*block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes*block.expansion, kernel_size=1, stride=stride, bias=False)
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes*block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        x = self.bn(x)
+        x = self.relu(x)
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
+
+
+
+def resnet20_cifar(**kwargs):
+    model = ResNet_Cifar(BasicBlock, [3, 3, 3], **kwargs)
+    return model
+
+
+def resnet32_cifar(**kwargs):
+    model = ResNet_Cifar(BasicBlock, [5, 5, 5], **kwargs)
+    return model
+
+
+def resnet44_cifar(**kwargs):
+    model = ResNet_Cifar(BasicBlock, [7, 7, 7], **kwargs)
+    return model
+
+
+def resnet56_cifar(**kwargs):
+    model = ResNet_Cifar(BasicBlock, [9, 9, 9], **kwargs)
+    return model
+
+
+def resnet110_cifar(**kwargs):
+    model = ResNet_Cifar(BasicBlock, [18, 18, 18], **kwargs)
+    return model
+
+
+def resnet1202_cifar(**kwargs):
+    model = ResNet_Cifar(BasicBlock, [200, 200, 200], **kwargs)
+    return model
+
+
+def ResNet164(num_class):
+    model = ResNet_Cifar(Bottleneck, [18, 18, 18], num_classes=num_class)
+    return model
+
+
+def resnet1001_cifar(**kwargs):
+    model = ResNet_Cifar(Bottleneck, [111, 111, 111], **kwargs)
+    return model
+
+
+def preact_resnet110_cifar(**kwargs):
+    model = PreAct_ResNet_Cifar(PreActBasicBlock, [18, 18, 18], **kwargs)
+    return model
+
+
+def preact_resnet164_cifar(**kwargs):
+    model = PreAct_ResNet_Cifar(PreActBottleneck, [18, 18, 18], **kwargs)
+    return model
+
+
+def preact_resnet1001_cifar(**kwargs):
+    model = PreAct_ResNet_Cifar(PreActBottleneck, [111, 111, 111], **kwargs)
+    return model
diff --git a/distil/utils/models/resnext.py b/distil/utils/models/resnext.py
index d6219ef..cfc8776 100644
--- a/distil/utils/models/resnext.py
+++ b/distil/utils/models/resnext.py
@@ -1,7 +1,11 @@
 '''ResNeXt in PyTorch.
 
-See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
+Reference
+    Aggregated Residual Transformations for Deep Neural Networks
+    https://arxiv.org/abs/1611.05431
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -27,6 +31,7 @@ def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
                 nn.BatchNorm2d(self.expansion*group_width)
             )
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = F.relu(self.bn2(self.conv2(out)))
@@ -43,6 +48,7 @@ def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
         self.bottleneck_width = bottleneck_width
         self.in_planes = 64
         self.embDim = cardinality*bottleneck_width*8
+        
         self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
         self.bn1 = nn.BatchNorm2d(64)
         self.layer1 = self._make_layer(num_blocks[0], 1)
@@ -51,6 +57,7 @@ def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
         # self.layer4 = self._make_layer(num_blocks[3], 2)
         self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)
 
+
     def _make_layer(self, num_blocks, stride):
         strides = [stride] + [1]*(num_blocks-1)
         layers = []
@@ -61,20 +68,32 @@ def _make_layer(self, num_blocks, stride):
         self.bottleneck_width *= 2
         return nn.Sequential(*layers)
 
-    def forward(self, x,last=False):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        # out = self.layer4(out)
-        out = F.avg_pool2d(out, 8)
-        e = out.view(out.size(0), -1)
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                # out = self.layer4(out)
+                out = F.avg_pool2d(out, 8)
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            # out = self.layer4(out)
+            out = F.avg_pool2d(out, 8)
+            e = out.view(out.size(0), -1) 
         out = self.linear(e)
         if last:
             return out, e
         else:
             return out
 
+
     def get_embedding_dim(self):
         return self.embDim
 
@@ -82,15 +101,19 @@ def get_embedding_dim(self):
 def ResNeXt29_2x64d(num_classes=10):
     return ResNeXt([3,3,3], 2, 64, num_classes)
 
+
 def ResNeXt29_4x64d(num_classes=10):
     return ResNeXt([3,3,3], 4, 64, num_classes)
 
+
 def ResNeXt29_8x64d(num_classes=10):
     return ResNeXt([3,3,3], 8, 64, num_classes)
 
+
 def ResNeXt29_32x4d(num_classes=10):
     return ResNeXt([3,3,3], 32, 4, num_classes)
 
+
 def test_resnext():
     net = ResNeXt29_2x64d()
     x = torch.randn(1,3,32,32)
diff --git a/distil/utils/models/senet.py b/distil/utils/models/senet.py
index 98bfa0c..ad93b05 100644
--- a/distil/utils/models/senet.py
+++ b/distil/utils/models/senet.py
@@ -26,6 +26,7 @@ def __init__(self, in_planes, planes, stride=1):
         self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
         self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = self.bn2(self.conv2(out))
@@ -59,6 +60,7 @@ def __init__(self, in_planes, planes, stride=1):
         self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
         self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
 
+
     def forward(self, x):
         out = F.relu(self.bn1(x))
         shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
@@ -80,7 +82,8 @@ class SENet(nn.Module):
     def __init__(self, block, num_blocks, num_classes=10):
         super(SENet, self).__init__()
         self.in_planes = 64
-
+        self.embDim = 512
+        
         self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(64)
         self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)
@@ -89,6 +92,7 @@ def __init__(self, block, num_blocks, num_classes=10):
         self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
         self.linear = nn.Linear(512, num_classes)
 
+
     def _make_layer(self, block, planes, num_blocks, stride):
         strides = [stride] + [1]*(num_blocks-1)
         layers = []
@@ -97,16 +101,34 @@ def _make_layer(self, block, planes, num_blocks, stride):
             self.in_planes = planes
         return nn.Sequential(*layers)
 
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = self.layer4(out)
+                out = F.avg_pool2d(out, 4)
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = self.layer4(out)
+            out = F.avg_pool2d(out, 4)
+            e = out.view(out.size(0), -1)     
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+
+
+    def get_embedding_dim(self):
+        return self.embDim
 
 
 def SENet18():
diff --git a/distil/utils/models/shufflenet.py b/distil/utils/models/shufflenet.py
index acff6f7..9eb1779 100644
--- a/distil/utils/models/shufflenet.py
+++ b/distil/utils/models/shufflenet.py
@@ -1,7 +1,11 @@
 '''ShuffleNet in PyTorch.
 
-See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
+Reference
+    ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices
+    https://arxiv.org/abs/1707.01083
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -12,6 +16,7 @@ def __init__(self, groups):
         super(ShuffleBlock, self).__init__()
         self.groups = groups
 
+
     def forward(self, x):
         '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]'''
         N,C,H,W = x.size()
@@ -23,9 +28,9 @@ class Bottleneck(nn.Module):
     def __init__(self, in_planes, out_planes, stride, groups):
         super(Bottleneck, self).__init__()
         self.stride = stride
-
-        mid_planes = out_planes/4
+        mid_planes = int(out_planes/4)
         g = 1 if in_planes==24 else groups
+        
         self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
         self.bn1 = nn.BatchNorm2d(mid_planes)
         self.shuffle1 = ShuffleBlock(groups=g)
@@ -38,6 +43,7 @@ def __init__(self, in_planes, out_planes, stride, groups):
         if stride == 2:
             self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))
 
+
     def forward(self, x):
         out = F.relu(self.bn1(self.conv1(x)))
         out = self.shuffle1(out)
@@ -54,7 +60,8 @@ def __init__(self, cfg):
         out_planes = cfg['out_planes']
         num_blocks = cfg['num_blocks']
         groups = cfg['groups']
-
+        self.embDim = out_planes[2]
+        
         self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
         self.bn1 = nn.BatchNorm2d(24)
         self.in_planes = 24
@@ -63,6 +70,7 @@ def __init__(self, cfg):
         self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
         self.linear = nn.Linear(out_planes[2], 10)
 
+
     def _make_layer(self, out_planes, num_blocks, groups):
         layers = []
         for i in range(num_blocks):
@@ -72,16 +80,33 @@ def _make_layer(self, out_planes, num_blocks, groups):
             self.in_planes = out_planes
         return nn.Sequential(*layers)
 
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
 
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = F.avg_pool2d(out, 4)
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = F.avg_pool2d(out, 4)
+            e = out.view(out.size(0), -1)            
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+
+
+    def get_embedding_dim(self):
+        return self.embDim
+        
 
 def ShuffleNetG2():
     cfg = {
@@ -91,6 +116,7 @@ def ShuffleNetG2():
     }
     return ShuffleNet(cfg)
 
+
 def ShuffleNetG3():
     cfg = {
         'out_planes': [240,480,960],
diff --git a/distil/utils/models/shufflenetv2.py b/distil/utils/models/shufflenetv2.py
index eefcda3..ab684cd 100644
--- a/distil/utils/models/shufflenetv2.py
+++ b/distil/utils/models/shufflenetv2.py
@@ -1,7 +1,11 @@
 '''ShuffleNetV2 in PyTorch.
 
-See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details.
+Reference
+    ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design
+    https://arxiv.org/abs/1807.11164
 '''
+
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -12,6 +16,7 @@ def __init__(self, groups=2):
         super(ShuffleBlock, self).__init__()
         self.groups = groups
 
+
     def forward(self, x):
         '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]'''
         N, C, H, W = x.size()
@@ -24,6 +29,7 @@ def __init__(self, ratio):
         super(SplitBlock, self).__init__()
         self.ratio = ratio
 
+
     def forward(self, x):
         c = int(x.size(1) * self.ratio)
         return x[:, :c, :, :], x[:, c:, :, :]
@@ -45,6 +51,7 @@ def __init__(self, in_channels, split_ratio=0.5):
         self.bn3 = nn.BatchNorm2d(in_channels)
         self.shuffle = ShuffleBlock()
 
+
     def forward(self, x):
         x1, x2 = self.split(x)
         out = F.relu(self.bn1(self.conv1(x2)))
@@ -79,6 +86,7 @@ def __init__(self, in_channels, out_channels):
 
         self.shuffle = ShuffleBlock()
 
+
     def forward(self, x):
         # left
         out1 = self.bn1(self.conv1(x))
@@ -98,7 +106,8 @@ def __init__(self, net_size):
         super(ShuffleNetV2, self).__init__()
         out_channels = configs[net_size]['out_channels']
         num_blocks = configs[net_size]['num_blocks']
-
+        self.embDim = out_channels[3]
+        
         self.conv1 = nn.Conv2d(3, 24, kernel_size=3,
                                stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(24)
@@ -111,6 +120,7 @@ def __init__(self, net_size):
         self.bn2 = nn.BatchNorm2d(out_channels[3])
         self.linear = nn.Linear(out_channels[3], 10)
 
+
     def _make_layer(self, out_channels, num_blocks):
         layers = [DownBlock(self.in_channels, out_channels)]
         for i in range(num_blocks):
@@ -118,18 +128,37 @@ def _make_layer(self, out_channels, num_blocks):
             self.in_channels = out_channels
         return nn.Sequential(*layers)
 
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        # out = F.max_pool2d(out, 3, stride=2, padding=1)
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = F.relu(self.bn2(self.conv2(out)))
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
 
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = F.relu(self.bn1(self.conv1(x)))
+                # out = F.max_pool2d(out, 3, stride=2, padding=1)
+                out = self.layer1(out)
+                out = self.layer2(out)
+                out = self.layer3(out)
+                out = F.relu(self.bn2(self.conv2(out)))
+                out = F.avg_pool2d(out, 4)
+                e = out.view(out.size(0), -1)
+        else:
+            out = F.relu(self.bn1(self.conv1(x)))
+            # out = F.max_pool2d(out, 3, stride=2, padding=1)
+            out = self.layer1(out)
+            out = self.layer2(out)
+            out = self.layer3(out)
+            out = F.relu(self.bn2(self.conv2(out)))
+            out = F.avg_pool2d(out, 4)
+            e = out.view(out.size(0), -1)
+        out = self.linear(e)
+        if last:
+            return out, e
+        else:
+            return out
+
+
+    def get_embedding_dim(self):
+        return self.embDim
+        
 
 configs = {
     0.5: {
diff --git a/distil/utils/models/simpleNN_net.py b/distil/utils/models/simpleNN_net.py
new file mode 100644
index 0000000..96f2ffd
--- /dev/null
+++ b/distil/utils/models/simpleNN_net.py
@@ -0,0 +1,63 @@
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+
+class TwoLayerNet(nn.Module):
+    def __init__(self, input_dim, num_classes, hidden_units):
+        super(TwoLayerNet, self).__init__()
+        self.linear1 = nn.Linear(input_dim, hidden_units)
+        self.linear2 = nn.Linear(hidden_units, num_classes)
+        self.feature_dim = hidden_units
+    
+
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                l1scores = F.relu(self.linear1(x))
+        else:
+            l1scores = F.relu(self.linear1(x))
+        scores = self.linear2(l1scores)
+        if last:
+            return scores, l1scores
+        else:
+            return scores
+
+
+    def get_feature_dim(self):
+        return self.feature_dim
+
+
+    def get_embedding_dim(self):
+        return self.feature_dim
+
+
+class ThreeLayerNet(nn.Module):
+    def __init__(self, input_dim, num_classes, h1, h2):
+        super(ThreeLayerNet, self).__init__()
+        self.linear1 = nn.Linear(input_dim, h1)
+        self.linear2 = nn.Linear(h1, h2)
+        self.linear3 = nn.Linear(h2, num_classes)
+        self.feature_dim = h2
+
+    
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                l1scores = F.relu(self.linear1(x))
+                l2scores = F.relu(self.linear2(l1scores))
+        else:
+            l1scores = F.relu(self.linear1(x))
+            l2scores = F.relu(self.linear2(l1scores))
+        scores = self.linear3(l2scores)
+        if last:
+            return scores, l2scores
+        else:
+            return scores
+
+
+    def get_feature_dim(self):
+        return self.feature_dim
+
+
+    def get_embedding_dim(self):
+        return self.feature_dim
\ No newline at end of file
diff --git a/distil/utils/models/vgg.py b/distil/utils/models/vgg.py
index 08347ff..9b3cd29 100644
--- a/distil/utils/models/vgg.py
+++ b/distil/utils/models/vgg.py
@@ -1,4 +1,6 @@
 '''VGG11/13/16/19 in Pytorch.'''
+
+
 import torch
 import torch.nn as nn
 
@@ -15,13 +17,9 @@ class VGG(nn.Module):
     def __init__(self, vgg_name):
         super(VGG, self).__init__()
         self.features = self._make_layers(cfg[vgg_name])
+        self.embDim = 512
         self.classifier = nn.Linear(512, 10)
 
-    def forward(self, x):
-        out = self.features(x)
-        out = out.view(out.size(0), -1)
-        out = self.classifier(out)
-        return out
 
     def _make_layers(self, cfg):
         layers = []
@@ -38,6 +36,25 @@ def _make_layers(self, cfg):
         return nn.Sequential(*layers)
 
 
+    def forward(self, x, last=False, freeze=False):
+        if freeze:
+            with torch.no_grad():
+                out = self.features(x)
+                e = out.view(out.size(0), -1)
+        else:
+            out = self.features(x)
+            e = out.view(out.size(0), -1)
+        out = self.classifier(e)
+        if last:
+            return out, e
+        else:
+            return out
+        
+
+    def get_embedding_dim(self):
+        return self.embDim
+
+
 def test():
     net = VGG('VGG11')
     x = torch.randn(2,3,32,32)
diff --git a/docs/source/ActStrategy/cords.utils.models.rst b/docs/source/ActStrategy/cords.utils.models.rst
new file mode 100644
index 0000000..28d7457
--- /dev/null
+++ b/docs/source/ActStrategy/cords.utils.models.rst
@@ -0,0 +1,41 @@
+distil.utils.models package
+===========================
+
+We have incorporated several neural network architectures in the DISTIL repository. The available architectures are listed below:
+ - densenet
+ - dla
+ - dla_simple
+ - dpn
+ - efficientnet
+ - googlenet
+ - lenet
+ - mobilenet
+ - mobilenetv2
+ - pnasnet
+ - preact_resnet
+ - regnet
+ - resnet
+ - resnext
+ - senet
+ - shufflenet
+ - shufflenetv2
+ - vgg
+
+
+**To use a custom model architecture, modify it in the following way:**
+
+The forward method should accept two additional parameters, and the model should define one extra method:
+
+#. A boolean variable *last* which -
+
+	* If *true*: returns the model output and the output of the second-to-last layer.
+
+	* If *false*: returns only the model output.
+
+#. A boolean variable *freeze* which -
+
+	* If *true*: runs every layer before the final layer under ``torch.no_grad()``, i.e. no gradients are computed for the weights of those layers.
+
+	* If *false*: gradients are tracked through all layers as usual.
+
+#. A ``get_embedding_dim()`` method which returns the number of hidden units in the last hidden layer, i.e. the size of the embedding returned when *last* is true.
diff --git a/docs/source/ActStrategy/distil.utils.rst b/docs/source/ActStrategy/distil.utils.rst
index 33408a1..7dbf3fc 100644
--- a/docs/source/ActStrategy/distil.utils.rst
+++ b/docs/source/ActStrategy/distil.utils.rst
@@ -10,7 +10,7 @@ utils
 DataHandler
 -------------------------------
 
-.. automodule:: distil.utils.DataHandler
+.. automodule:: distil.utils.data_handler
    :members:
    :undoc-members:
    :show-inheritance:
@@ -23,3 +23,19 @@ Dataset
    :undoc-members:
    :show-inheritance:
 
+Submodular Functions
+---------------------------
+
+.. automodule:: distil.utils.submodular
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Similarity Matrix
+---------------------------
+
+.. automodule:: distil.utils.similarity_mat
+   :members:
+   :undoc-members:
+   :show-inheritance:
+

From a55d0baf5966635ce34c4150d7d188abe9e775c9 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Thu, 29 Apr 2021 15:35:49 +0530
Subject: [PATCH 02/30] model new

---
 distil/active_learning_strategies/entropy_sampling.py         | 2 +-
 distil/active_learning_strategies/entropy_sampling_dropout.py | 2 +-
 distil/active_learning_strategies/least_confidence.py         | 2 +-
 distil/active_learning_strategies/least_confidence_dropout.py | 2 +-
 distil/active_learning_strategies/margin_sampling.py          | 2 +-
 distil/active_learning_strategies/margin_sampling_dropout.py  | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/distil/active_learning_strategies/entropy_sampling.py b/distil/active_learning_strategies/entropy_sampling.py
index 58555ff..f240ecc 100644
--- a/distil/active_learning_strategies/entropy_sampling.py
+++ b/distil/active_learning_strategies/entropy_sampling.py
@@ -9,7 +9,7 @@ class EntropySampling(Strategy):
     we use entropy and therefore select points which have maximum entropy. 
 
     Suppose the model has `nclasses` output nodes and each output node is denoted by :math:`z_j`. Thus,  
-    :math:`j \\in [1,nclasses]`. Then for a output node :math:`z_i` from the model, the correponding 
+    :math:`j \\in [1,nclasses]`. Then for a output node :math:`z_i` from the model, the corresponding
     softmax would be 
 
     .. math::
diff --git a/distil/active_learning_strategies/entropy_sampling_dropout.py b/distil/active_learning_strategies/entropy_sampling_dropout.py
index 56a46b5..d90d0d4 100644
--- a/distil/active_learning_strategies/entropy_sampling_dropout.py
+++ b/distil/active_learning_strategies/entropy_sampling_dropout.py
@@ -10,7 +10,7 @@ class EntropySamplingDropout(Strategy):
     which have maximum entropy. 
 
     Suppose the model has `nclasses` output nodes and each output node is denoted by :math:`z_j`. Thus,  
-    :math:`j \in [1,nclasses]`. Then for a output node :math:`z_i` from the model, the correponding 
+    :math:`j \in [1,nclasses]`. Then for a output node :math:`z_i` from the model, the corresponding 
     softmax would be 
 
     .. math::
diff --git a/distil/active_learning_strategies/least_confidence.py b/distil/active_learning_strategies/least_confidence.py
index c13c2a6..6fa58d4 100644
--- a/distil/active_learning_strategies/least_confidence.py
+++ b/distil/active_learning_strategies/least_confidence.py
@@ -9,7 +9,7 @@ class LeastConfidence(Strategy):
     
     Suppose the model has `nclasses` output nodes denoted by :math:`\\overrightarrow{\\boldsymbol{z}}` 
     and each output node is denoted by :math:`z_j`. Thus, :math:`j \\in [1, nclasses]`. 
-    Then for a output node :math:`z_i` from the model, the correponding softmax would be 
+    Then for a output node :math:`z_i` from the model, the corresponding softmax would be 
 
     .. math::
         \\sigma(z_i) = \\frac{e^{z_i}}{\\sum_j e^{z_j}} 
diff --git a/distil/active_learning_strategies/least_confidence_dropout.py b/distil/active_learning_strategies/least_confidence_dropout.py
index 4d94832..dde5e4d 100644
--- a/distil/active_learning_strategies/least_confidence_dropout.py
+++ b/distil/active_learning_strategies/least_confidence_dropout.py
@@ -8,7 +8,7 @@ class LeastConfidenceDropout(Strategy):
     
     Suppose the model has `nclasses` output nodes denoted by :math:`\\overrightarrow{\\boldsymbol{z}}` 
     and each output node is denoted by :math:`z_j`. Thus, :math:`j \\in [1, nclasses]`. 
-    Then for a output node :math:`z_i` from the model, the correponding softmax would be 
+    Then for a output node :math:`z_i` from the model, the corresponding softmax would be 
 
     .. math::
         \\sigma(z_i) = \\frac{e^{z_i}}{\\sum_j e^{z_j}} 
diff --git a/distil/active_learning_strategies/margin_sampling.py b/distil/active_learning_strategies/margin_sampling.py
index da8f60d..1676750 100644
--- a/distil/active_learning_strategies/margin_sampling.py
+++ b/distil/active_learning_strategies/margin_sampling.py
@@ -10,7 +10,7 @@ class MarginSampling(Strategy):
     
     Suppose the model has `nclasses` output nodes denoted by :math:`\\overrightarrow{\\boldsymbol{z}}` 
     and each output node is denoted by :math:`z_j`. Thus, :math:`j \\in [1, nclasses]`. 
-    Then for a output node :math:`z_i` from the model, the correponding softmax would be 
+    Then for a output node :math:`z_i` from the model, the corresponding softmax would be 
 
     .. math::
         \\sigma(z_i) = \\frac{e^{z_i}}{\\sum_j e^{z_j}} 
diff --git a/distil/active_learning_strategies/margin_sampling_dropout.py b/distil/active_learning_strategies/margin_sampling_dropout.py
index e98ead8..542d6ab 100644
--- a/distil/active_learning_strategies/margin_sampling_dropout.py
+++ b/distil/active_learning_strategies/margin_sampling_dropout.py
@@ -9,7 +9,7 @@ class MarginSamplingDropout(Strategy):
     
     Suppose the model has `nclasses` output nodes denoted by :math:`\\overrightarrow{\\boldsymbol{z}}` 
     and each output node is denoted by :math:`z_j`. Thus, :math:`j \\in [1, nclasses]`. 
-    Then for a output node :math:`z_i` from the model, the correponding softmax would be 
+    Then for a output node :math:`z_i` from the model, the corresponding softmax would be 
 
     .. math::
         \\sigma(z_i) = \\frac{e^{z_i}}{\\sum_j e^{z_j}} 

From 1cc56e0e368eea1583b97ccf670366490fca2efd Mon Sep 17 00:00:00 2001
From: ApurvaDani <37484263+ApurvaDani@users.noreply.github.com>
Date: Mon, 3 May 2021 21:19:31 +0530
Subject: [PATCH 03/30] Single channel support for resent

---
 distil/utils/models/resnet.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/distil/utils/models/resnet.py b/distil/utils/models/resnet.py
index 1182b66..44c6982 100644
--- a/distil/utils/models/resnet.py
+++ b/distil/utils/models/resnet.py
@@ -66,12 +66,12 @@ def forward(self, x):
 
 
 class ResNet(nn.Module):
-    def __init__(self, block, num_blocks, num_classes=10):
+    def __init__(self, block, num_blocks, num_classes=10, channels=3):
         super(ResNet, self).__init__()
         self.in_planes = 64
         self.embDim = 8 * self.in_planes * block.expansion
         
-        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.conv1 = nn.Conv2d(channels, 64, kernel_size=3, stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(64)
         self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
         self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
@@ -116,23 +116,23 @@ def get_embedding_dim(self):
         return self.embDim
 
 
-def ResNet18(num_classes=10):
-    return ResNet(BasicBlock, [2,2,2,2], num_classes)
+def ResNet18(num_classes=10, channels=3):
+    return ResNet(BasicBlock, [2,2,2,2], num_classes, channels)
 
 
-def ResNet34(num_classes=10):
-    return ResNet(BasicBlock, [3,4,6,3], num_classes)
+def ResNet34(num_classes=10, channels=3):
+    return ResNet(BasicBlock, [3,4,6,3], num_classes, channels)
 
 
-def ResNet50(num_classes=10):
-    return ResNet(Bottleneck, [3,4,6,3], num_classes)
+def ResNet50(num_classes=10, channels=3):
+    return ResNet(Bottleneck, [3,4,6,3], num_classes, channels)
 
 
-def ResNet101(num_classes=10):
-    return ResNet(Bottleneck, [3,4,23,3], num_classes)
+def ResNet101(num_classes=10, channels=3):
+    return ResNet(Bottleneck, [3,4,23,3], num_classes, channels)
 
 
-def ResNet152(num_classes=10):
-    return ResNet(Bottleneck, [3,8,36,3], num_classes)
+def ResNet152(num_classes=10, channels=3):
+    return ResNet(Bottleneck, [3,8,36,3], num_classes, channels)
 
 #test()

From 99c2a1a6ced205645cb104faf556a27976fa927d Mon Sep 17 00:00:00 2001
From: Rishabh Iyer <rishabhnad@gmail.com>
Date: Mon, 3 May 2021 11:14:35 -0500
Subject: [PATCH 04/30] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 023ed6a..2694f6a 100644
--- a/README.md
+++ b/README.md
@@ -240,4 +240,4 @@ https://groups.google.com/forum/#!forum/Decile_DISTIL_Dev/join
 [10] Gal, Yarin, Riashat Islam, and Zoubin Ghahramani. "Deep bayesian active learning with image data." International Conference on Machine Learning. PMLR, 2017.
 
 ## Acknowledgement
-This library takes inspiration and also uses pieces of code from [Kuan-Hao Huang's deep active learning repository](https://github.com/ej0cl6/deep-active-learning), [Jordan Ash's Badge repository](https://github.com/JordanAsh/badge), and [Andreas Kirsch's and Joost van Amersfoort's BatchBALD repository](https://github.com/BlackHC/batchbald_redux). Also, DISTIL uses [Apricot](https://github.com/jmschrei/apricot) for submodular optimization.
+This library takes inspiration, builds upon, and uses pieces of code from [Kuan-Hao Huang's deep active learning repository](https://github.com/ej0cl6/deep-active-learning), [Jordan Ash's Badge repository](https://github.com/JordanAsh/badge), and [Andreas Kirsch's and Joost van Amersfoort's BatchBALD repository](https://github.com/BlackHC/batchbald_redux). Also, DISTIL uses [Apricot](https://github.com/jmschrei/apricot) for submodular optimization.

From 3373efbd2368517cda7b4e3dd4815e49e5090fc6 Mon Sep 17 00:00:00 2001
From: Rishabh Iyer <rishabhnad@gmail.com>
Date: Mon, 3 May 2021 11:18:40 -0500
Subject: [PATCH 05/30] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2694f6a..1a18547 100644
--- a/README.md
+++ b/README.md
@@ -216,6 +216,8 @@ To receive updates about DISTIL and to be a part of the community, join the Deci
 ```
 https://groups.google.com/forum/#!forum/Decile_DISTIL_Dev/join 
 ```
+## Acknowledgement and Credits
+This library takes inspiration, builds upon, and uses pieces of code from several open source codebases. These include [Kuan-Hao Huang's deep active learning repository](https://github.com/ej0cl6/deep-active-learning), [Jordan Ash's Badge repository](https://github.com/JordanAsh/badge), and [Andreas Kirsch's and Joost van Amersfoort's BatchBALD repository](https://github.com/BlackHC/batchbald_redux). Also, DISTIL uses [Apricot](https://github.com/jmschrei/apricot) for submodular optimization.
 
 ## Publications
 
@@ -239,5 +241,3 @@ https://groups.google.com/forum/#!forum/Decile_DISTIL_Dev/join
 
 [10] Gal, Yarin, Riashat Islam, and Zoubin Ghahramani. "Deep bayesian active learning with image data." International Conference on Machine Learning. PMLR, 2017.
 
-## Acknowledgement
-This library takes inspiration, builds upon, and uses pieces of code from [Kuan-Hao Huang's deep active learning repository](https://github.com/ej0cl6/deep-active-learning), [Jordan Ash's Badge repository](https://github.com/JordanAsh/badge), and [Andreas Kirsch's and Joost van Amersfoort's BatchBALD repository](https://github.com/BlackHC/batchbald_redux). Also, DISTIL uses [Apricot](https://github.com/jmschrei/apricot) for submodular optimization.

From d9fd5b2b2b5a61671716442aaad3347f9738e938 Mon Sep 17 00:00:00 2001
From: Rishabh Iyer <rishabhnad@gmail.com>
Date: Mon, 3 May 2021 11:26:54 -0500
Subject: [PATCH 06/30] Update README.md

---
 README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1a18547..6fb7808 100644
--- a/README.md
+++ b/README.md
@@ -40,8 +40,9 @@
 - [Evaluation of Active Learning Strategies](#evaluation-of-active-learning-strategies)
 - [Testing Individual Strategies and Running Examples](#testing-individual-strategies-and-running-examples)
 - [Mailing List](#mailing-list)
-- [Publications](#publications)
 - [Acknowledgement](#acknowledgement)
+- [Team](#team)
+- [Publications](#publications)
 
 ## What is DISTIL?
 <p align="center">
@@ -216,9 +217,12 @@ To receive updates about DISTIL and to be a part of the community, join the Deci
 ```
 https://groups.google.com/forum/#!forum/Decile_DISTIL_Dev/join 
 ```
-## Acknowledgement and Credits
+## Acknowledgement
 This library takes inspiration, builds upon, and uses pieces of code from several open source codebases. These include [Kuan-Hao Huang's deep active learning repository](https://github.com/ej0cl6/deep-active-learning), [Jordan Ash's Badge repository](https://github.com/JordanAsh/badge), and [Andreas Kirsch's and Joost van Amersfoort's BatchBALD repository](https://github.com/BlackHC/batchbald_redux). Also, DISTIL uses [Apricot](https://github.com/jmschrei/apricot) for submodular optimization.
 
+## Team
+DISTIL is created and maintained by Nathan Beck, Durga Subramanian, [Apurva Dani](https://apurvadani.github.io/index.html), [Rishabh Iyer](rishiyer.com), and [Ganesh Ramakrishnan](https://www.cse.iitb.ac.in/~ganesh/). We look forward to have DISTIL more community driven. Please use it and contribute to it for your active learning research, and feel free to use it for your commercial projects. We will add the major contributors here.
+
 ## Publications
 
 [1] Settles, Burr. Active learning literature survey. University of Wisconsin-Madison Department of Computer Sciences, 2009.

From 36a4de64ac154f87633ef59968050426da90356d Mon Sep 17 00:00:00 2001
From: Rishabh Iyer <rishabhnad@gmail.com>
Date: Mon, 3 May 2021 11:42:41 -0500
Subject: [PATCH 07/30] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6fb7808..2dd1981 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ To get a clearer idea about how to incorporate DISTIL with your own models, refe
 
 ## Active Learning Benchmarks using DISTIL
 #### Experimentation Method
-The models used below were first trained on n randomly selected points, where n is the budget of the experiment. For each set of new points added, the model was trained from scratch until the training accuracy crossed the max accuracy threshold. The test accuracy was then reported before the next selection round.
+The models used below were first trained on an initial random set of points (equal in size to the budget). For each set of new points added, the model was trained from scratch until the training accuracy crossed the max accuracy threshold. The test accuracy was then reported before the next selection round. The results below are *preliminary*, each obtained from only a single run. We are conducting a more thorough benchmarking experiment with more runs, and will report standard deviations as well. We will also link to a preprint that includes the benchmarking results.
 
 #### CIFAR10
 Model: Resnet18

From ebb3ad29df28d481ac73791f871ac6e6667d3e24 Mon Sep 17 00:00:00 2001
From: Rishabh Iyer <rishabhnad@gmail.com>
Date: Mon, 3 May 2021 11:46:05 -0500
Subject: [PATCH 08/30] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2dd1981..15c2888 100644
--- a/README.md
+++ b/README.md
@@ -221,7 +221,7 @@ https://groups.google.com/forum/#!forum/Decile_DISTIL_Dev/join
 This library takes inspiration, builds upon, and uses pieces of code from several open source codebases. These include [Kuan-Hao Huang's deep active learning repository](https://github.com/ej0cl6/deep-active-learning), [Jordan Ash's Badge repository](https://github.com/JordanAsh/badge), and [Andreas Kirsch's and Joost van Amersfoort's BatchBALD repository](https://github.com/BlackHC/batchbald_redux). Also, DISTIL uses [Apricot](https://github.com/jmschrei/apricot) for submodular optimization.
 
 ## Team
-DISTIL is created and maintained by Nathan Beck, Durga Subramanian, [Apurva Dani](https://apurvadani.github.io/index.html), [Rishabh Iyer](rishiyer.com), and [Ganesh Ramakrishnan](https://www.cse.iitb.ac.in/~ganesh/). We look forward to have DISTIL more community driven. Please use it and contribute to it for your active learning research, and feel free to use it for your commercial projects. We will add the major contributors here.
+DISTIL is created and maintained by Nathan Beck, Durga Subramanian, [Apurva Dani](https://apurvadani.github.io/index.html), [Rishabh Iyer](https://www.rishiyer.com), and [Ganesh Ramakrishnan](https://www.cse.iitb.ac.in/~ganesh/). We look forward to making DISTIL more community-driven. Please use it and contribute to it for your active learning research, and feel free to use it for your commercial projects. We will add the major contributors here.
 
 ## Publications
 

From 7ae8c53a2758cf3773448bdf3f7b61e235b00153 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 08:06:54 +0530
Subject: [PATCH 09/30] doc

---
 .../adversarial_bim.py                           | 16 ++--------------
 .../adversarial_deepfool.py                      |  2 +-
 distil/active_learning_strategies/fass.py        | 11 +++++++++++
 .../least_confidence.py                          |  2 +-
 .../least_confidence_dropout.py                  |  2 +-
 .../margin_sampling.py                           |  4 ++--
 .../margin_sampling_dropout.py                   |  4 ++--
 ....utils.models.rst => distil.utils.models.rst} |  0
 8 files changed, 20 insertions(+), 21 deletions(-)
 rename docs/source/ActStrategy/{cords.utils.models.rst => distil.utils.models.rst} (100%)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index db84cd9..7fab5d8 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -8,27 +8,15 @@ def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
         """
         
         Implements Adversial Bim Strategy which is motivated by the fact that often the distance 
-        computation from decision boundary is often difficult and intractable for margin based 
+        computation from decision boundary is difficult and intractable for margin based 
         methods. This technique avoids estimating distance by using BIM(Basic Iterative Method) 
         :footcite:`tramer2017ensemble` to estimate how much adversarial perturbation is required 
         to cross the boundary. Smaller the required the perturbation, closer the point is to the 
         boundary. 
 
-        **Basic Iterative Method (BIM)**: Given a base input, the approach is to perturb each 
-        feature in the direction of the gradient by magnitude :math:`\\epsilon`, where is a 
-        parameter that determines perturbation size. For a model with loss 
-        :math:`\\nabla J(\\theta, x, y)`, where :math:`\\theta` represents the model parameters, 
-        x is the model input, and y is the label of x, the adversarial sample is generated 
-        iteratively as,
-
-        .. math:: 
-            x*{\\*}_0 = x,
-            x*{\\*}_i = clip_{x,e} (x*{\\*}_{i-1} + sign(\\nabla_{x*{\\*}_{i-1}} J(\\theta, x*{\\*}_{i-1} , y)))
-
-
         Parameters
         ----------
-        X: numpy array
+        X: numpy arrays
             Present training/labeled data   
         y: numpy array
             Labels of present training data
diff --git a/distil/active_learning_strategies/adversarial_deepfool.py b/distil/active_learning_strategies/adversarial_deepfool.py
index a175ccb..212c7b2 100644
--- a/distil/active_learning_strategies/adversarial_deepfool.py
+++ b/distil/active_learning_strategies/adversarial_deepfool.py
@@ -8,7 +8,7 @@ class AdversarialDeepFool(Strategy):
     Implements Adversial Deep Fool Strategy :footcite:`ducoffe2018adversarial`, a Deep-Fool based 
     Active Learning strategy that selects unlabeled samples with the smallest adversarial 
     perturbation. This technique is motivated by the fact that often the distance computation 
-    from decision boundary is often difficult and intractable for margin-based methods. This 
+    from decision boundary is difficult and intractable for margin-based methods. This 
     technique avoids estimating distance by using Deep-Fool :footcite:`Moosavi-Dezfooli_2016_CVPR` 
     like techniques to estimate how much adversarial perturbation is required to cross the boundary. 
     The smaller the required perturbation, the closer the point is to the boundary.
diff --git a/distil/active_learning_strategies/fass.py b/distil/active_learning_strategies/fass.py
index 8b37f68..47ca45d 100644
--- a/distil/active_learning_strategies/fass.py
+++ b/distil/active_learning_strategies/fass.py
@@ -11,6 +11,17 @@ class FASS(Strategy):
     'facility_location' , 'graph_cut', 'saturated_coverage', 'sum_redundancy', 'feature_based' 
     is applied to get the final set of points.
 
+    We select a subset :math:`F` of size :math:`\\beta` based on uncertainty sampling, such 
+    that :math:`\\beta \\ge k`.
+      
+    Then select a subset :math:`S` by solving 
+    
+    .. math::
+        \\max \\{f(S) \\text{ such that } |S| \\leq k, S \\subseteq F\\} 
+    
+    where :math:`k` is the `budget` and :math:`f` can be one of these functions - 
+    'facility location' , 'graph cut', 'saturated coverage', 'sum redundancy', 'feature based'. 
+
     Parameters
     ----------
     X: numpy array
diff --git a/distil/active_learning_strategies/least_confidence.py b/distil/active_learning_strategies/least_confidence.py
index 6fa58d4..4321437 100644
--- a/distil/active_learning_strategies/least_confidence.py
+++ b/distil/active_learning_strategies/least_confidence.py
@@ -18,7 +18,7 @@ class LeastConfidence(Strategy):
     confidence as follows, 
     
     .. math::
-        arg\\min_{{S \\subseteq {\\mathcal U}, |S| \\leq k}}{(arg\\max_j{(\\sigma(\\overrightarrow{\\boldsymbol{z}}))})}  
+        \\mbox{argmin}_{{S \\subseteq {\\mathcal U}, |S| \\leq k}}{\\sum_S(\\mbox{argmax}_j{(\\sigma(\\overrightarrow{\\boldsymbol{z}}))})}  
     
 
     where :math:`\\mathcal{U}` denotes the Data without lables i.e. `unlabeled_x` and :math:`k` is the `budget`.
diff --git a/distil/active_learning_strategies/least_confidence_dropout.py b/distil/active_learning_strategies/least_confidence_dropout.py
index dde5e4d..9b2b354 100644
--- a/distil/active_learning_strategies/least_confidence_dropout.py
+++ b/distil/active_learning_strategies/least_confidence_dropout.py
@@ -17,7 +17,7 @@ class LeastConfidenceDropout(Strategy):
     confidence as follows, 
     
     .. math::
-        arg\\min_{{S \\subseteq {\\mathcal U}, |S| \\leq k}}{(arg\\max_j{(\\sigma(\\overrightarrow{\\boldsymbol{z}}))})}  
+        \\mbox{argmin}_{{S \\subseteq {\\mathcal U}, |S| \\leq k}}{\\sum_S(\\mbox{argmax}_j{(\\sigma(\\overrightarrow{\\boldsymbol{z}}))})}  
     
 
     where :math:`\\mathcal{U}` denotes the Data without lables i.e. `unlabeled_x` and :math:`k` is the `budget`.
diff --git a/distil/active_learning_strategies/margin_sampling.py b/distil/active_learning_strategies/margin_sampling.py
index 1676750..49cf009 100644
--- a/distil/active_learning_strategies/margin_sampling.py
+++ b/distil/active_learning_strategies/margin_sampling.py
@@ -18,12 +18,12 @@ class MarginSampling(Strategy):
     Let,
 
     .. math::
-        m = arg\\max_j{(\\sigma(\\overrightarrow{\\boldsymbol{z}}))}
+        m = \\mbox{argmax}_j{(\\sigma(\\overrightarrow{\\boldsymbol{z}}))}
         
     Then using softmax, Margin Sampling Strategy would pick `budget` no. of elements as follows, 
     
     .. math::
-        arg\\min_{{S \\subseteq {\\mathcal U}, |S| \\leq k}}{(arg\\max_j {(\\sigma(\\overrightarrow{\\boldsymbol{z}}))}) - (arg\\max_{j \\ne m} {(\\sigma(\\overrightarrow{\\boldsymbol{z}}))})}  
+        \\mbox{argmin}_{{S \\subseteq {\\mathcal U}, |S| \\leq k}}{\\sum_S(\\mbox{argmax}_j {(\\sigma(\\overrightarrow{\\boldsymbol{z}}))}) - (\\mbox{argmax}_{j \\ne m} {(\\sigma(\\overrightarrow{\\boldsymbol{z}}))})}  
     
 
     where :math:`\\mathcal{U}` denotes the Data without lables i.e. `unlabeled_x` and :math:`k` is the `budget`.
diff --git a/distil/active_learning_strategies/margin_sampling_dropout.py b/distil/active_learning_strategies/margin_sampling_dropout.py
index 542d6ab..7f211c8 100644
--- a/distil/active_learning_strategies/margin_sampling_dropout.py
+++ b/distil/active_learning_strategies/margin_sampling_dropout.py
@@ -17,12 +17,12 @@ class MarginSamplingDropout(Strategy):
     Let,
 
     .. math::
-        m = arg\\max_j{(\\sigma(\\overrightarrow{\\boldsymbol{z}}))}
+        m = \\mbox{argmax}_j{(\\sigma(\\overrightarrow{\\boldsymbol{z}}))}
         
     Then using softmax, Margin Sampling Strategy would pick `budget` no. of elements as follows, 
     
     .. math::
-        arg\\min_{{S \\subseteq {\\mathcal U}, |S| \\leq k}}{(arg\\max_j {(\\sigma(\\overrightarrow{\\boldsymbol{z}}))}) - (arg\\max_{j \\ne m} {(\\sigma(\\overrightarrow{\\boldsymbol{z}}))})}  
+        \\mbox{argmin}_{{S \\subseteq {\\mathcal U}, |S| \\leq k}}{\\sum_S(\\mbox{argmax}_j {(\\sigma(\\overrightarrow{\\boldsymbol{z}}))}) - (\\mbox{argmax}_{j \\ne m} {(\\sigma(\\overrightarrow{\\boldsymbol{z}}))})}  
     
 
     where :math:`\\mathcal{U}` denotes the Data without lables i.e. `unlabeled_x` and :math:`k` is the `budget`.
diff --git a/docs/source/ActStrategy/cords.utils.models.rst b/docs/source/ActStrategy/distil.utils.models.rst
similarity index 100%
rename from docs/source/ActStrategy/cords.utils.models.rst
rename to docs/source/ActStrategy/distil.utils.models.rst

From 0cacf5cdd4fdddb4c44ec57c2359b159ae5f2a26 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 08:40:40 +0530
Subject: [PATCH 10/30] doc

---
 .../adversarial_bim.py                        | 13 +++++++-
 distil/active_learning_strategies/badge.py    | 32 ++++++++++---------
 distil/active_learning_strategies/core_set.py |  2 +-
 distil/active_learning_strategies/fass.py     | 14 ++++----
 docs/source/ActStrategy/distil.utils.rst      |  2 --
 docs/source/ActStrategy/modules.rst           |  1 +
 docs/source/conf.py                           |  4 +--
 7 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index 7fab5d8..e23765e 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -6,7 +6,6 @@
 class AdversarialBIM(Strategy):
     def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
         """
-        
         Implements Adversial Bim Strategy which is motivated by the fact that often the distance 
         computation from decision boundary is difficult and intractable for margin based 
         methods. This technique avoids estimating distance by using BIM(Basic Iterative Method) 
@@ -14,6 +13,18 @@ def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
         to cross the boundary. Smaller the required the perturbation, closer the point is to the 
         boundary. 
 
+        **Basic Iterative Method (BIM)**: Given a base input, the approach is to perturb each
+        feature in the direction of the gradient by magnitude :math:`\\epsilon`, where :math:`\\epsilon`
+        is a parameter that determines perturbation size. For a model with loss
+        :math:`J(\\theta, x, y)`, where :math:`\\theta` represents the model parameters,
+        x is the model input, and y is the label of x, the adversarial sample is generated
+        iteratively as,
+ 
+        .. math::
+            x^{*}_0 = x,
+            x^{*}_i = clip_{x,\\epsilon} (x^{*}_{i-1} + sign(\\nabla_{x^{*}_{i-1}} J(\\theta, x^{*}_{i-1} , y)))
+
+
         Parameters
         ----------
         X: numpy arrays
diff --git a/distil/active_learning_strategies/badge.py b/distil/active_learning_strategies/badge.py
index dd19c11..293b17f 100644
--- a/distil/active_learning_strategies/badge.py
+++ b/distil/active_learning_strategies/badge.py
@@ -57,22 +57,24 @@ class BADGE(Strategy):
     hypothesised labels. Then to select the points to be labeled are selected by applying 
     k-means++ on these loss gradients. 
     
-    Parameters.
+    Parameters
     ----------
-    X: Numpy array 
-        Features of the labled set of points 
-    Y: Numpy array
-        Lables of the labled set of points 
-    unlabeled_x: Numpy array
-        Features of the unlabled set of points 
-    net: class object
-        Model architecture used for training. Could be instance of models defined in `distil.utils.models` or something similar.
-    handler: class object
-        It should be a subclasses of torch.utils.data.Dataset i.e, have __getitem__ and __len__ methods implemented, so that is could be passed to pytorch DataLoader.Could be instance of handlers defined in `distil.utils.DataHandler` or something similar.
-    nclasses: int 
-        No. of classes in tha dataset
-    args: dictionary
-        This dictionary should have 'batch_size' as a key. 
+    X: numpy array
+        Present training/labeled data   
+    Y: numpy array
+        Labels of present training data
+    unlabeled_x: numpy array
+        Data without labels
+    net: class
+        Pytorch Model class
+    handler: class
+        Data Handler, which can load data even without labels.
+    nclasses: int
+        Number of unique target variables
+    args: dict
+        Specify optional parameters.
+        `batch_size` 
+        Batch size to be used inside strategy class (int, optional)
     """
 
     def __init__(self, X, Y, unlabeled_x, net, handler,nclasses, args):
diff --git a/distil/active_learning_strategies/core_set.py b/distil/active_learning_strategies/core_set.py
index 585a18b..47c3b2c 100644
--- a/distil/active_learning_strategies/core_set.py
+++ b/distil/active_learning_strategies/core_set.py
@@ -13,7 +13,7 @@ class CoreSet(Strategy):
     ----------
     X: numpy array
         Present training/labeled data   
-    y: numpy array
+    Y: numpy array
         Labels of present training data
     unlabeled_x: numpy array
         Data without labels
diff --git a/distil/active_learning_strategies/fass.py b/distil/active_learning_strategies/fass.py
index 47ca45d..570c16a 100644
--- a/distil/active_learning_strategies/fass.py
+++ b/distil/active_learning_strategies/fass.py
@@ -37,16 +37,14 @@ class FASS(Strategy):
     nclasses: int
         Number of unique target variables
     args: dict
-        Specify optional parameters
-        
-        batch_size 
+        Specify optional parameters - `batch_size` 
         Batch size to be used inside strategy class (int, optional)
 
-        submod: str
-        Choice of submodular function - 'facility_location' | 'graph_cut' | 'saturated_coverage' | 'sum_redundancy' | 'feature_based'
-        
-        selection_type: str
-        Choice of selection strategy - 'PerClass' | 'Supervised'
+    submod: str
+    Choice of submodular function - 'facility_location' | 'graph_cut' | 'saturated_coverage' | 'sum_redundancy' | 'feature_based'
+    
+    selection_type: str
+    Choice of selection strategy - 'PerClass' | 'Supervised'
     """
 
     def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
diff --git a/docs/source/ActStrategy/distil.utils.rst b/docs/source/ActStrategy/distil.utils.rst
index 7dbf3fc..83f1254 100644
--- a/docs/source/ActStrategy/distil.utils.rst
+++ b/docs/source/ActStrategy/distil.utils.rst
@@ -4,8 +4,6 @@ utils
 .. toctree::
    :maxdepth: 4
 
-   distil.utils.models
-
 
 DataHandler
 -------------------------------
diff --git a/docs/source/ActStrategy/modules.rst b/docs/source/ActStrategy/modules.rst
index af6d340..690614d 100644
--- a/docs/source/ActStrategy/modules.rst
+++ b/docs/source/ActStrategy/modules.rst
@@ -6,3 +6,4 @@ DISTIL
 
    distil.active_learning_strategies
    distil.utils
+   distil.utils.models
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 97d4f8f..5afd7e3 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,8 +22,8 @@
 # -- Project information -----------------------------------------------------
 
 project = 'DISTIL'
-copyright = '2021, Durga Sivasubramanian, Apurva Dani, Rishabh Iyer'
-author = 'Durga Sivasubramanian, Apurva Dani, Rishabh Iyer'
+copyright = '2021, Durga Sivasubramanian, Nathan Beck, Apurva Dani, Rishabh Iyer'
+author = 'Durga Sivasubramanian,Nathan Beck,Apurva Dani, Rishabh Iyer'
 
 # The full version, including alpha/beta/rc tags
 release = 'v0.1'

From 532b8048c885f8258833c107bd1ffef8ebc3f2a7 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 08:58:30 +0530
Subject: [PATCH 11/30] doc

---
 ...an_active_learning_disagreement_dropout.py |  8 +++---
 .../distil.active_learning_strategies.rst     | 27 ++++++++++++++-----
 docs/source/refs.bib                          |  9 +++++++
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/distil/active_learning_strategies/bayesian_active_learning_disagreement_dropout.py b/distil/active_learning_strategies/bayesian_active_learning_disagreement_dropout.py
index be13feb..30f712c 100644
--- a/distil/active_learning_strategies/bayesian_active_learning_disagreement_dropout.py
+++ b/distil/active_learning_strategies/bayesian_active_learning_disagreement_dropout.py
@@ -3,9 +3,11 @@
 
 class BALDDropout(Strategy):
     """
-    Implementation of BALDDropout Strategy.
-    This class extends :class:`active_learning_strategies.strategy.Strategy`
-    to include entropy sampling technique to select data points for active learning.
+    
+    Implements Bayesian Active Learning by Disagreement (BALD) Strategy :footcite:`houlsby2011bayesian`,
+    which assumes a Bayesian setting and selects points which maximise the mutual information 
+    between the predicted labels and model parameters. This implementation is an adaptation for a 
+    non-Bayesian setting, with the assumption that there is a dropout layer in the model. 
 
     Parameters
     ----------
diff --git a/docs/source/ActStrategy/distil.active_learning_strategies.rst b/docs/source/ActStrategy/distil.active_learning_strategies.rst
index 8803a41..87043e9 100644
--- a/docs/source/ActStrategy/distil.active_learning_strategies.rst
+++ b/docs/source/ActStrategy/distil.active_learning_strategies.rst
@@ -1,21 +1,29 @@
 Active Learning Strategies
 ===========================================
 
-BADGE
-------------------------------------------------
+Core-Set Approch
+----------------------------------------------------
 
-.. automodule:: distil.active_learning_strategies.badge
+.. automodule:: distil.active_learning_strategies.core_set
    :members:
+   :undoc-members:
    :show-inheritance:
 
-Core-Set Approch
+CRAIG-ACTIVE
 ----------------------------------------------------
-
-.. automodule:: distil.active_learning_strategies.core_set
+.. automodule:: distil.active_learning_strategies.craig_active
    :members:
    :undoc-members:
    :show-inheritance:
 
+
+BADGE
+------------------------------------------------
+
+.. automodule:: distil.active_learning_strategies.badge
+   :members:
+   :show-inheritance:
+
 Entropy Sampling 
 ------------------------------------------------------------
 
@@ -47,6 +55,13 @@ GLISTER
    :members:
    :show-inheritance:
 
+GRADAMATCH
+--------------------------------------------------
+
+.. automodule:: distil.active_learning_strategies.gradmatch_active
+   :members:
+   :show-inheritance:
+
 Least Confidence 
 ------------------------------------------------------------
 
diff --git a/docs/source/refs.bib b/docs/source/refs.bib
index d349aed..049c7cc 100644
--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
@@ -86,6 +86,15 @@ @inproceedings{dasgupta-etal-2013-summarization
     pages = "1014--1022",
 }
 
+@misc{houlsby2011bayesian,
+      title={Bayesian Active Learning for Classification and Preference Learning}, 
+      author={Neil Houlsby and Ferenc Huszár and Zoubin Ghahramani and Máté Lengyel},
+      year={2011},
+      eprint={1112.5745},
+      archivePrefix={arXiv},
+      primaryClass={stat.ML}
+}
+
 @article{tramer2017ensemble,
   title={Ensemble adversarial training: Attacks and defenses},
   author={Tram{\`e}r, Florian and Kurakin, Alexey and Papernot, Nicolas and Goodfellow, Ian and Boneh, Dan and McDaniel, Patrick},

From 80b676c8bdbf084b81eeef70a20c57856ea6b56c Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 09:02:18 +0530
Subject: [PATCH 12/30] doc

---
 .../adversarial_bim.py                        | 34 ++++++-------------
 .../distil.active_learning_strategies.rst     | 14 ++++----
 2 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index e23765e..b935e66 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -6,28 +6,17 @@
 class AdversarialBIM(Strategy):
     def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
         """
-        Implements Adversial Bim Strategy which is motivated by the fact that often the distance 
-        computation from decision boundary is difficult and intractable for margin based 
-        methods. This technique avoids estimating distance by using BIM(Basic Iterative Method) 
-        :footcite:`tramer2017ensemble` to estimate how much adversarial perturbation is required 
-        to cross the boundary. Smaller the required the perturbation, closer the point is to the 
-        boundary. 
-
-        **Basic Iterative Method (BIM)**: Given a base input, the approach is to perturb each
-        feature in the direction of the gradient by magnitude :math:`\\epsilon`, where is a
-        parameter that determines perturbation size. For a model with loss
-        :math:`\\nabla J(\\theta, x, y)`, where :math:`\\theta` represents the model parameters,
-        x is the model input, and y is the label of x, the adversarial sample is generated
-        iteratively as,
- 
-       .. math::
-            x*{\\*}_0 = x,
-            x*{\\*}_i = clip_{x,e} (x*{\\*}_{i-1} + sign(\\nabla_{x*{\\*}_{i-1}} J(\\theta, x*{\\*}_{i-1} , y)))
-
+        Implements Adversial Deep Fool Strategy :footcite:`ducoffe2018adversarial`, a Deep-Fool based 
+        Active Learning strategy that selects unlabeled samples with the smallest adversarial 
+        perturbation. This technique is motivated by the fact that often the distance computation 
+        from decision boundary is difficult and intractable for margin-based methods. This 
+        technique avoids estimating distance by using Deep-Fool :footcite:`Moosavi-Dezfooli_2016_CVPR` 
+        like techniques to estimate how much adversarial perturbation is required to cross the boundary. 
+        The smaller the required perturbation, the closer the point is to the boundary.
 
         Parameters
         ----------
-        X: numpy arrays
+        X: numpy array
             Present training/labeled data   
         y: numpy array
             Labels of present training data
@@ -42,13 +31,12 @@ def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
         args: dict
             Specify optional parameters
             
-            batch_size 
-            Batch size to be used inside strategy class (int, optional)
+            `batch_size`- Batch size to be used inside strategy class (int, optional)
 
-            eps
-            epsilon value for gradients
+            `eps`-epsilon value for gradients
         """
         
+        
         if 'eps' in args:
             self.eps = args['eps']
         else:
diff --git a/docs/source/ActStrategy/distil.active_learning_strategies.rst b/docs/source/ActStrategy/distil.active_learning_strategies.rst
index 87043e9..91e140a 100644
--- a/docs/source/ActStrategy/distil.active_learning_strategies.rst
+++ b/docs/source/ActStrategy/distil.active_learning_strategies.rst
@@ -1,6 +1,13 @@
 Active Learning Strategies
 ===========================================
 
+BADGE
+------------------------------------------------
+
+.. automodule:: distil.active_learning_strategies.badge
+   :members:
+   :show-inheritance:
+
 Core-Set Approch
 ----------------------------------------------------
 
@@ -17,13 +24,6 @@ CRAIG-ACTIVE
    :show-inheritance:
 
 
-BADGE
-------------------------------------------------
-
-.. automodule:: distil.active_learning_strategies.badge
-   :members:
-   :show-inheritance:
-
 Entropy Sampling 
 ------------------------------------------------------------
 

From 4974a7341cd61f804522b0b8be851ba21130d1c3 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 09:12:24 +0530
Subject: [PATCH 13/30] doc

---
 distil/active_learning_strategies/adversarial_bim.py          | 1 -
 docs/source/ActStrategy/distil.active_learning_strategies.rst | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index b935e66..3c4eed9 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -36,7 +36,6 @@ def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
             `eps`-epsilon value for gradients
         """
         
-        
         if 'eps' in args:
             self.eps = args['eps']
         else:
diff --git a/docs/source/ActStrategy/distil.active_learning_strategies.rst b/docs/source/ActStrategy/distil.active_learning_strategies.rst
index 91e140a..1ab05f2 100644
--- a/docs/source/ActStrategy/distil.active_learning_strategies.rst
+++ b/docs/source/ActStrategy/distil.active_learning_strategies.rst
@@ -115,6 +115,7 @@ Adversarial BIM
 
 .. automodule:: distil.active_learning_strategies.adversarial_bim
    :members:
+   :undoc-members:
    :show-inheritance:
 
 Adversarial DeepFool

From 3ccc477f42665ed0ab620020feab60e0fd13ac68 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 11:18:55 +0530
Subject: [PATCH 14/30] doc

---
 .../adversarial_bim.py                        | 70 +++++++++++--------
 .../distil.active_learning_strategies.rst     |  3 +-
 2 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index 3c4eed9..b4da56d 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -4,38 +4,50 @@
 from .strategy import Strategy
 
 class AdversarialBIM(Strategy):
+    """
+    Implements the Adversarial BIM strategy, which is motivated by the fact that the distance
+    computation from the decision boundary is often difficult and intractable for margin-based
+    methods. This technique avoids estimating distance by using BIM (Basic Iterative Method)
+    :footcite:`tramer2017ensemble` to estimate how much adversarial perturbation is required to
+    cross the boundary. The smaller the required perturbation, the closer the point is to the boundary.
+ 
+    **Basic Iterative Method (BIM)**: Given a base input, the approach is to perturb each
+    feature in the direction of the gradient by magnitude :math:`\\epsilon`, where
+    :math:`\\epsilon` is a parameter that determines perturbation size. For a model with loss
+    :math:`J(\\theta, x, y)`, where :math:`\\theta` represents the model parameters,
+    x is the model input, and y is the label of x, the adversarial sample is generated
+    iteratively as,
+
+    .. math::
+        x*{\\*}_0 = x,
+        x*{\\*}_i = clip_{x,e} (x*{\\*}_{i-1} + sign(\\nabla_{x*{\\*}_{i-1}} J(\\theta, x*{\\*}_{i-1} , y)))
+
+    Parameters
+    ----------
+    X: numpy array
+        Present training/labeled data   
+    y: numpy array
+        Labels of present training data
+    unlabeled_x: numpy array
+        Data without labels
+    net: class
+        Pytorch Model class
+    handler: class
+        Data Handler, which can load data even without labels.
+    nclasses: int
+        Number of unique target variables
+    args: dict
+        Specify optional parameters
+        
+        `batch_size`- Batch size to be used inside strategy class (int, optional)
+
+        `eps`-epsilon value for gradients
+    """
+    
     def __init__(self, X, Y, unlabeled_x, net, handler, nclasses, args={}):
         """
-        Implements Adversial Deep Fool Strategy :footcite:`ducoffe2018adversarial`, a Deep-Fool based 
-        Active Learning strategy that selects unlabeled samples with the smallest adversarial 
-        perturbation. This technique is motivated by the fact that often the distance computation 
-        from decision boundary is difficult and intractable for margin-based methods. This 
-        technique avoids estimating distance by using Deep-Fool :footcite:`Moosavi-Dezfooli_2016_CVPR` 
-        like techniques to estimate how much adversarial perturbation is required to cross the boundary. 
-        The smaller the required perturbation, the closer the point is to the boundary.
-
-        Parameters
-        ----------
-        X: numpy array
-            Present training/labeled data   
-        y: numpy array
-            Labels of present training data
-        unlabeled_x: numpy array
-            Data without labels
-        net: class
-            Pytorch Model class
-        handler: class
-            Data Handler, which can load data even without labels.
-        nclasses: int
-            Number of unique target variables
-        args: dict
-            Specify optional parameters
-            
-            `batch_size`- Batch size to be used inside strategy class (int, optional)
-
-            `eps`-epsilon value for gradients
+        Constructor method
         """
-        
         if 'eps' in args:
             self.eps = args['eps']
         else:
diff --git a/docs/source/ActStrategy/distil.active_learning_strategies.rst b/docs/source/ActStrategy/distil.active_learning_strategies.rst
index 1ab05f2..2b8b7f3 100644
--- a/docs/source/ActStrategy/distil.active_learning_strategies.rst
+++ b/docs/source/ActStrategy/distil.active_learning_strategies.rst
@@ -55,7 +55,7 @@ GLISTER
    :members:
    :show-inheritance:
 
-GRADAMATCH
+GRADMATCH
 --------------------------------------------------
 
 .. automodule:: distil.active_learning_strategies.gradmatch_active
@@ -115,7 +115,6 @@ Adversarial BIM
 
 .. automodule:: distil.active_learning_strategies.adversarial_bim
    :members:
-   :undoc-members:
    :show-inheritance:
 
 Adversarial DeepFool

From 823167c625e8e3cd316de8ffd5c4d1ce4dcbaee0 Mon Sep 17 00:00:00 2001
From: Apurva Dani <apurvadani98@gmail.com>
Date: Tue, 4 May 2021 11:22:41 +0530
Subject: [PATCH 15/30] Train class updated

---
 distil/utils/train_helper.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/distil/utils/train_helper.py b/distil/utils/train_helper.py
index e22844d..00f2536 100644
--- a/distil/utils/train_helper.py
+++ b/distil/utils/train_helper.py
@@ -20,7 +20,9 @@ def __init__(self, X, Y, net, handler, args):
         self.net = net
         self.handler = handler
         self.args = args
-        self.n_pool = len(Y)
+        
+        if Y is not None: #For initialization without data
+            self.n_pool = len(Y)
         
         if 'islogs' not in args:
             self.args['islogs'] = False

From e832a4005b67a7d0f6f29a72d6d858f779d53436 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 11:23:37 +0530
Subject: [PATCH 16/30] doc

---
 distil/active_learning_strategies/adversarial_bim.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index b4da56d..d812f5e 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -19,8 +19,8 @@ class AdversarialBIM(Strategy):
     iteratively as,
 
     .. math::
-        x*{\\*}_0 = x,
-        x*{\\*}_i = clip_{x,e} (x*{\\*}_{i-1} + sign(\\nabla_{x*{\\*}_{i-1}} J(\\theta, x*{\\*}_{i-1} , y)))
+        x^*_0 = x,
+        x^*_i = clip_{x,e} (x^*_{i-1} + sign(\\nabla_{x^*_{i-1}} J(\\theta, x^*_{i-1} , y)))
 
     Parameters
     ----------

From a59bc317a7ba6771eaf9c75031fb66ba074f8beb Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 11:24:48 +0530
Subject: [PATCH 17/30] doc

---
 distil/active_learning_strategies/adversarial_bim.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index d812f5e..843c869 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -19,7 +19,7 @@ class AdversarialBIM(Strategy):
     iteratively as,
 
     .. math::
-        x^*_0 = x,
+        x^*_0 = x,\\
         x^*_i = clip_{x,e} (x^*_{i-1} + sign(\\nabla_{x^*_{i-1}} J(\\theta, x^*_{i-1} , y)))
 
     Parameters

From 981723388ab9bd251347907a942cd140e798b10d Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Tue, 4 May 2021 11:28:52 +0530
Subject: [PATCH 18/30] doc

---
 distil/active_learning_strategies/adversarial_bim.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index 843c869..4aa003d 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -19,7 +19,8 @@ class AdversarialBIM(Strategy):
     iteratively as,
 
     .. math::
-        x^*_0 = x,\\
+        x^*_0 = x,
+    .. math::
         x^*_i = clip_{x,e} (x^*_{i-1} + sign(\\nabla_{x^*_{i-1}} J(\\theta, x^*_{i-1} , y)))
 
     Parameters

From aaacae1fd7445cf772d93bf2a3df8b5d4efd9c85 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 15:44:39 +0530
Subject: [PATCH 19/30] doc

---
 distil/active_learning_strategies/gradmatch_active.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index bd86032..dac765b 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -17,6 +17,9 @@ class GradMatchActive(Strategy):
     hypothesized labels of the loss function and are matched to either the full gradient of these hypothesized 
     examples or a supplied validation gradient. The indices returned are the ones selected by this algorithm.
 
+    .. math::
+        Err(X_t, L, L_T, \\theta_t) = \\left |\\left| \\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) - \\frac{k}{N} \\nabla_\\theta L(\\theta_t) \\right | \\right|
+
     
     Parameters
     ----------

From 4faec314784791dabe970f112d193f3e0ac56970 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 15:46:04 +0530
Subject: [PATCH 20/30] doc

---
 distil/active_learning_strategies/adversarial_bim.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index 4aa003d..b752a4a 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -20,7 +20,7 @@ class AdversarialBIM(Strategy):
 
     .. math::
         x^*_0 = x,
-    .. math::
+    
         x^*_i = clip_{x,e} (x^*_{i-1} + sign(\\nabla_{x^*_{i-1}} J(\\theta, x^*_{i-1} , y)))
 
     Parameters

From 96c26478033d67292c9ec7c69d25c65daebde628 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 15:56:36 +0530
Subject: [PATCH 21/30] doc

---
 distil/active_learning_strategies/adversarial_bim.py  |  5 +++--
 distil/active_learning_strategies/gradmatch_active.py | 11 ++++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index b752a4a..ad7a52f 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -19,9 +19,10 @@ class AdversarialBIM(Strategy):
     iteratively as,
 
     .. math::
-        x^*_0 = x,
+        x^*_0 &= x,
     
-        x^*_i = clip_{x,e} (x^*_{i-1} + sign(\\nabla_{x^*_{i-1}} J(\\theta, x^*_{i-1} , y)))
+        x^*_i &= clip_{x,e} (x^*_{i-1} + sign(\\nabla_{x^*_{i-1}} J(\\theta, x^*_{i-1} , y)))
+
 
     Parameters
     ----------
diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index dac765b..fd2e7cb 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -20,7 +20,16 @@ class GradMatchActive(Strategy):
     .. math::
         Err(X_t, L, L_T, \\theta_t) = \\left |\\left| \\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) - \\frac{k}{N} \\nabla_\\theta L(\\theta_t) \\right | \\right|
 
-    
+    where,
+    - Each gradient is computed with respect to the last layer's parameters
+    - :math:\\theta_t are the model parameters at selection round :math:t
+    - :math:X_t is the queried set of points to label at selection round :math:t
+    - :math:k is the budget
+    - :math:N is the number of points contributing to the full gradient :math:\\nabla_\\theta L(\\theta_t)
+    - :math:\\nabla_\\theta L(\\theta_t) is either the complete hypothesized gradient or a validation gradient
+    - :math:\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) is the subset's hypothesized gradient with :math:|X_t| = k
+
+
     Parameters
     ----------
     X: Numpy array 

From 7758ecd7b6625c679e6fa52a1c4e7cbcf115b488 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 16:02:41 +0530
Subject: [PATCH 22/30] doc

---
 .../active_learning_strategies/adversarial_bim.py  |  8 +++++---
 .../active_learning_strategies/gradmatch_active.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/distil/active_learning_strategies/adversarial_bim.py b/distil/active_learning_strategies/adversarial_bim.py
index ad7a52f..4653409 100644
--- a/distil/active_learning_strategies/adversarial_bim.py
+++ b/distil/active_learning_strategies/adversarial_bim.py
@@ -19,10 +19,12 @@ class AdversarialBIM(Strategy):
     iteratively as,
 
     .. math::
-        x^*_0 &= x,
-    
-        x^*_i &= clip_{x,e} (x^*_{i-1} + sign(\\nabla_{x^*_{i-1}} J(\\theta, x^*_{i-1} , y)))
 
+        \\begin{eqnarray}
+            x^*_0 & = &x,
+    
+            x^*_i & = & clip_{x,e} (x^*_{i-1} + sign(\\nabla_{x^*_{i-1}} J(\\theta, x^*_{i-1} , y)))
+        \\end{eqnarray}
 
     Parameters
     ----------
diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index fd2e7cb..34e899e 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -21,13 +21,13 @@ class GradMatchActive(Strategy):
         Err(X_t, L, L_T, \\theta_t) = \\left |\\left| \\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) - \\frac{k}{N} \\nabla_\\theta L(\\theta_t) \\right | \\right|
 
     where,
-    - Each gradient is computed with respect to the last layer's parameters
-    - :math:\\theta_t are the model parameters at selection round :math:t
-    - :math:X_t is the queried set of points to label at selection round :math:t
-    - :math:k is the budget
-    - :math:N is the number of points contributing to the full gradient :math:\\nabla_\\theta L(\\theta_t)
-    - :math:\\nabla_\\theta L(\\theta_t) is either the complete hypothesized gradient or a validation gradient
-    - :math:\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) is the subset's hypothesized gradient with :math:|X_t| = k
+        - Each gradient is computed with respect to the last layer's parameters
+        - :math:\\theta_t are the model parameters at selection round :math:t
+        - :math:X_t is the queried set of points to label at selection round :math:t
+        - :math:k is the budget
+        - :math:N is the number of points contributing to the full gradient :math:\\nabla_\\theta L(\\theta_t)
+        - :math:\\nabla_\\theta L(\\theta_t) is either the complete hypothesized gradient or a validation gradient
+        - :math:\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) is the subset's hypothesized gradient with :math:|X_t| = k
 
 
     Parameters

From a200b5f2d064a8527f2349c5c4fdb3f74438abed Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 16:07:20 +0530
Subject: [PATCH 23/30] doc

---
 .../active_learning_strategies/gradmatch_active.py   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index 34e899e..e160fd2 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -22,12 +22,12 @@ class GradMatchActive(Strategy):
 
     where,
         - Each gradient is computed with respect to the last layer's parameters
-        - :math:\\theta_t are the model parameters at selection round :math:t
-        - :math:X_t is the queried set of points to label at selection round :math:t
-        - :math:k is the budget
-        - :math:N is the number of points contributing to the full gradient :math:\\nabla_\\theta L(\\theta_t)
-        - :math:\\nabla_\\theta L(\\theta_t) is either the complete hypothesized gradient or a validation gradient
-        - :math:\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) is the subset's hypothesized gradient with :math:|X_t| = k
+        - :math:`\\theta_t` are the model parameters at selection round :math:`t`
+        - :math:`X_t` is the queried set of points to label at selection round :math:`t`
+        - :math:`k` is the budget
+        - :math:`N` is the number of points contributing to the full gradient :math:`\\nabla_\\theta L(\\theta_t)`
+        - :math:`\\nabla_\\theta L(\\theta_t)` is either the complete hypothesized gradient or a validation gradient
+        - :math:`\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t)` is the subset's hypothesized gradient with :math:`|X_t| = k`
 
 
     Parameters

From 3015876291f66091830186644338db06febb7d58 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 16:33:15 +0530
Subject: [PATCH 24/30] doc

---
 .../active_learning_strategies/gradmatch_active.py   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index e160fd2..1b081da 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -22,12 +22,12 @@ class GradMatchActive(Strategy):
 
     where,
         - Each gradient is computed with respect to the last layer's parameters
-        - :math:`\\theta_t` are the model parameters at selection round :math:`t`
-        - :math:`X_t` is the queried set of points to label at selection round :math:`t`
-        - :math:`k` is the budget
-        - :math:`N` is the number of points contributing to the full gradient :math:`\\nabla_\\theta L(\\theta_t)`
-        - :math:`\\nabla_\\theta L(\\theta_t)` is either the complete hypothesized gradient or a validation gradient
-        - :math:`\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t)` is the subset's hypothesized gradient with :math:`|X_t| = k`
+        - :math: `\\theta_t` are the model parameters at selection round :math: `t`
+        - :math: `X_t` is the queried set of points to label at selection round :math: `t`
+        - :math: `k` is the budget
+        - :math: `N` is the number of points contributing to the full gradient :math: `\\nabla_\\theta L(\\theta_t)`
+        - :math: `\\nabla_\\theta L(\\theta_t)` is either the complete hypothesized gradient or a validation gradient
+        - :math: `\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t)` is the subset's hypothesized gradient with :math: `|X_t| = k`
 
 
     Parameters

From 93211e0384fe2613acb3c5c785b7fe24e488a1c8 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 16:35:54 +0530
Subject: [PATCH 25/30] doc

---
 distil/active_learning_strategies/gradmatch_active.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index 1b081da..5dd8e6e 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -19,7 +19,7 @@ class GradMatchActive(Strategy):
 
     .. math::
         Err(X_t, L, L_T, \\theta_t) = \\left |\\left| \\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) - \\frac{k}{N} \\nabla_\\theta L(\\theta_t) \\right | \\right|
-
+    
     where,
         - Each gradient is computed with respect to the last layer's parameters
         - :math: `\\theta_t` are the model parameters at selection round :math: `t`

From 6c07f2c910e893d8bf6792dd33b6ee7ab41ce6e4 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 16:43:00 +0530
Subject: [PATCH 26/30] doc

---
 distil/active_learning_strategies/gradmatch_active.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index 5dd8e6e..1b081da 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -19,7 +19,7 @@ class GradMatchActive(Strategy):
 
     .. math::
         Err(X_t, L, L_T, \\theta_t) = \\left |\\left| \\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) - \\frac{k}{N} \\nabla_\\theta L(\\theta_t) \\right | \\right|
-    
+
     where,
         - Each gradient is computed with respect to the last layer's parameters
         - :math: `\\theta_t` are the model parameters at selection round :math: `t`

From 0e697ec5e1d54737f85d0d2dcb174190523a7a76 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 16:44:23 +0530
Subject: [PATCH 27/30] doc

---
 distil/active_learning_strategies/gradmatch_active.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index 1b081da..22afb76 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -21,6 +21,7 @@ class GradMatchActive(Strategy):
         Err(X_t, L, L_T, \\theta_t) = \\left |\\left| \\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) - \\frac{k}{N} \\nabla_\\theta L(\\theta_t) \\right | \\right|
 
     where,
+    
         - Each gradient is computed with respect to the last layer's parameters
         - :math: `\\theta_t` are the model parameters at selection round :math: `t`
         - :math: `X_t` is the queried set of points to label at selection round :math: `t`

From 8c3a47428648020fe593c8905d85222eb37caa64 Mon Sep 17 00:00:00 2001
From: durga <you@example.com>
Date: Wed, 5 May 2021 16:45:54 +0530
Subject: [PATCH 28/30] doc

---
 .../active_learning_strategies/gradmatch_active.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/distil/active_learning_strategies/gradmatch_active.py b/distil/active_learning_strategies/gradmatch_active.py
index 22afb76..b03f3c0 100644
--- a/distil/active_learning_strategies/gradmatch_active.py
+++ b/distil/active_learning_strategies/gradmatch_active.py
@@ -21,14 +21,14 @@ class GradMatchActive(Strategy):
         Err(X_t, L, L_T, \\theta_t) = \\left |\\left| \\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t) - \\frac{k}{N} \\nabla_\\theta L(\\theta_t) \\right | \\right|
 
     where,
-    
+
         - Each gradient is computed with respect to the last layer's parameters
-        - :math: `\\theta_t` are the model parameters at selection round :math: `t`
-        - :math: `X_t` is the queried set of points to label at selection round :math: `t`
-        - :math: `k` is the budget
-        - :math: `N` is the number of points contributing to the full gradient :math: `\\nabla_\\theta L(\\theta_t)`
-        - :math: `\\nabla_\\theta L(\\theta_t)` is either the complete hypothesized gradient or a validation gradient
-        - :math: `\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t)` is the subset's hypothesized gradient with :math: `|X_t| = k`
+        - :math:`\\theta_t` are the model parameters at selection round :math:`t`
+        - :math:`X_t` is the queried set of points to label at selection round :math:`t`
+        - :math:`k` is the budget
+        - :math:`N` is the number of points contributing to the full gradient :math:`\\nabla_\\theta L(\\theta_t)`
+        - :math:`\\nabla_\\theta L(\\theta_t)` is either the complete hypothesized gradient or a validation gradient
+        - :math:`\\sum_{i \\in X_t} \\nabla_\\theta L_T^i (\\theta_t)` is the subset's hypothesized gradient with :math:`|X_t| = k`
 
 
     Parameters

From c60b949f2fccd1483b95b6cbfd9220ae74423598 Mon Sep 17 00:00:00 2001
From: Apurva Dani <apurvadani98@gmail.com>
Date: Thu, 6 May 2021 18:18:33 +0530
Subject: [PATCH 29/30] Test transformation changes

---
 README.md                    | 4 +++-
 distil/utils/data_handler.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 15c2888..5241fbd 100644
--- a/README.md
+++ b/README.md
@@ -127,9 +127,11 @@ DISTIL makes it extremely easy to integrate your custom models with active learn
     * Check the models included in DISTIL for examples!
 
 * Data Handler
-    * Your DataHandler class should have a boolean attribute “select”:
+    * Your DataHandler class should have a boolean attribute “select” with default value True:
         * If True: Your __getitem__(self, index) method should return (input, index)
         * If False: Your __getitem__(self, index) method should return (input, label, index)
+    * Your DataHandler class should have a boolean attribute “use_test_transform” with default value False.
+    
     * Check the DataHandler classes included in DISTIL for examples!
 
 To get a clearer idea about how to incorporate DISTIL with your own models, refer to [Getting Started With DISTIL & Active Learning Blog](https://decile-research.medium.com/getting-started-with-distil-active-learning-ba7fafdbe6f3)
diff --git a/distil/utils/data_handler.py b/distil/utils/data_handler.py
index 47810c1..004f6a6 100644
--- a/distil/utils/data_handler.py
+++ b/distil/utils/data_handler.py
@@ -18,7 +18,7 @@ class DataHandler_Points(Dataset):
     select: bool
         True if loading data without labels, False otherwise
     """
-    def __init__(self, X, Y=None, select=True):
+    def __init__(self, X, Y=None, select=True, use_test_transform=False):
         """
         Constructor
         """

From 77813ae57653fea6ed593bc8974994b19e73d8df Mon Sep 17 00:00:00 2001
From: durgas16 <54654722+durgas16@users.noreply.github.com>
Date: Fri, 7 May 2021 00:07:47 +0530
Subject: [PATCH 30/30] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5241fbd..d55505b 100644
--- a/README.md
+++ b/README.md
@@ -223,7 +223,7 @@ https://groups.google.com/forum/#!forum/Decile_DISTIL_Dev/join
 This library takes inspiration, builds upon, and uses pieces of code from several open source codebases. These include [Kuan-Hao Huang's deep active learning repository](https://github.com/ej0cl6/deep-active-learning), [Jordan Ash's Badge repository](https://github.com/JordanAsh/badge), and [Andreas Kirsch's and Joost van Amersfoort's BatchBALD repository](https://github.com/BlackHC/batchbald_redux). Also, DISTIL uses [Apricot](https://github.com/jmschrei/apricot) for submodular optimization.
 
 ## Team
-DISTIL is created and maintained by Nathan Beck, Durga Subramanian, [Apurva Dani](https://apurvadani.github.io/index.html), [Rishabh Iyer](https://www.rishiyer.com), and [Ganesh Ramakrishnan](https://www.cse.iitb.ac.in/~ganesh/). We look forward to have DISTIL more community driven. Please use it and contribute to it for your active learning research, and feel free to use it for your commercial projects. We will add the major contributors here.
+DISTIL is created and maintained by Nathan Beck, [Durga Sivasubramanian](https://www.linkedin.com/in/durga-s-352831105), [Apurva Dani](https://apurvadani.github.io/index.html), [Rishabh Iyer](https://www.rishiyer.com), and [Ganesh Ramakrishnan](https://www.cse.iitb.ac.in/~ganesh/). We look forward to making DISTIL more community-driven. Please use it and contribute to it for your active learning research, and feel free to use it for your commercial projects. We will add the major contributors here.
 
 ## Publications