# Student Network

In [None]:
import logging as log
import os

import numpy as np
import pandas as pd
import torch as torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.autograd import Variable as var

import utils.common_utils
from gtsrb_dataset import GTSRB
from models.student_model import StudentNetwork
from models.teacher_model import TeacherNetwork

In [None]:
# Shell escape: print the GPUs visible to this kernel (driver/CUDA version, memory in use).
!nvidia-smi

Wed Jul  1 18:49:21 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    34W / 250W |   1047MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

# Utility Functions

## Save Model State

In [None]:
def saveModel(epoch, model,optimizer,loss,path):
  """Write a training checkpoint to ``path`` using ``torch.save``.

  The checkpoint is a dict with exactly four entries:
  ``'epoch'``, ``'model_state_dict'``, ``'optimizer_state_dict'`` and ``'loss'``.

  Args:
    epoch: value stored under ``'epoch'``.
    model: object whose ``state_dict()`` is stored.
    optimizer: object whose ``state_dict()`` is stored.
    loss: value stored under ``'loss'``.
    path: destination handed straight to ``torch.save``.
  """
  checkpoint = dict()
  checkpoint['epoch'] = epoch
  checkpoint['model_state_dict'] = model.state_dict()
  checkpoint['optimizer_state_dict'] = optimizer.state_dict()
  checkpoint['loss'] = loss
  torch.save(checkpoint, path)

## Load Model State

In [None]:
def loadModel(model,optimizer,path,map_location=None):
    """Restore a checkpoint written by ``saveModel`` into ``model`` and ``optimizer``.

    Args:
        model: module that receives ``checkpoint['model_state_dict']``.
        optimizer: optimizer that receives ``checkpoint['optimizer_state_dict']``.
        path: checkpoint location passed to ``torch.load``.
        map_location: optional device remapping forwarded to ``torch.load``
            (for example ``'cpu'`` to open a checkpoint saved from a GPU on a
            CPU-only machine). ``None`` keeps ``torch.load``'s default.

    Returns:
        Tuple ``(model, optimizer, epoch, loss)``; ``model`` and ``optimizer``
        are the same objects that were passed in, updated in place.
    """
    checkpoint = torch.load(path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

    print('Epoch: ',epoch,'Loss: ',loss)
    return model, optimizer, epoch, loss

# CUDA availability

In [None]:
# Select the compute device: first GPU when CUDA is usable, CPU otherwise.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Ask cuDNN to benchmark its convolution algorithms and keep the fastest one.
torch.backends.cudnn.benchmark = True

# Defining the Hyper-parameters for Student

In [None]:
# Optimiser settings (consumed by the Adam optimiser defined further down).
learning_rate = 1e-3
weight_decay = 1e-4

# Number of passes over the training set.
epochs = 200

# Growth rate of the teacher network; also reused as the DataLoader batch size.
k = 64

# Knowledge-distillation settings: `alpha` weights the teacher (soft) term
# against the label (hard) term, `T` is the softmax temperature.
alpha = 0.9
T = 20

## Loading GTSRB Data

In [None]:
import torchvision.transforms as transforms

# Preprocessing applied to every image: resize to 32 x 32, convert to a tensor,
# then normalise each channel (constants are presumably the GTSRB per-channel
# mean / std -- TODO confirm).
preprocessing_steps = [
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.3403, 0.3121, 0.3214),
                         (0.2724, 0.2608, 0.2669)),
]
transform = transforms.Compose(preprocessing_steps)

# Train and test splits share the same root directory and preprocessing.
train_data = GTSRB(root_dir='../input/gtsrbdata', train=True, transform=transform)
test_data = GTSRB(root_dir='../input/gtsrbdata', train=False, transform=transform)

# No shuffling is requested (DataLoader default). The student training loop
# looks up cached teacher outputs by batch index, so the batch order of
# `train_set` must stay the same on every pass.
train_set = torch.utils.data.DataLoader(train_data, batch_size=k)
test_set = torch.utils.data.DataLoader(test_data, batch_size=k)

# Defining the Teacher Network
The following sub-sections define the various parts of the Teacher Network.
We start by defining the Cell block, followed by the Stage module and finally the complete teacher model as defined in the paper \\
[Lightweight deep network for traffic sign classification, Zhang et al. (2019)](https://rdcu.be/b5aTv)

Before coding the network, we show the visual description of how the network looks with images taken from the above paper

## Cell Block

*The 1 × 1 kernels and the 3 × 3 kernels execute convolution
operations in parallel and splice all output results*

![Cell Block](https://i.imgur.com/RWMjelN.png)

Please note: the number 64 shown on each of the convolution operations is, as per our interpretation, used to denote that each convolution sees exactly half of the input (the batch size mentioned in the paper is 128).

In [None]:
class Cell(nn.Module):
  """Two-branch convolution cell.

  The input is split in half along the channel axis. One half goes through a
  1 x 1 convolution, the other through a 3 x 3 convolution; the two results are
  concatenated on the channel axis, batch-normalised and passed through ReLU.

  Args:
    cell_in_channels: number of input channels; each branch receives
      ``int(cell_in_channels/2)`` of them.
    cell_out_channels: number of output channels; each branch produces
      ``int(cell_out_channels/2)`` of them.
  """
  def __init__(self,cell_in_channels,cell_out_channels):
    super(Cell, self).__init__()

    branch_in = int(cell_in_channels/2)
    branch_out = int(cell_out_channels/2)

    self.activation_function = nn.ReLU()
    self.batch_norm = nn.BatchNorm2d(cell_out_channels)

    # The 3 x 3 branch uses reflect padding of 1 so its output keeps the same
    # height and width as the 1 x 1 branch and the two can be concatenated.
    self.cnn3 = nn.Conv2d(branch_in, branch_out,
                          kernel_size=3, stride=1,
                          padding=1, padding_mode='reflect')
    self.cnn1 = nn.Conv2d(branch_in, branch_out, kernel_size=1, stride=1)

    # Author's note: a single grouped convolution (groups=2) was considered,
    # but it cannot give the two parallel groups different kernel sizes, so
    # the branches are kept as two separate convolutions.

    # Channel count of each half produced by the split in forward().
    self.split_size = branch_in

  def forward(self,x):
    halves = torch.split(x, [self.split_size, self.split_size], dim=1)
    merged = torch.cat([self.cnn1(halves[0]), self.cnn3(halves[1])], 1)
    return self.activation_function(self.batch_norm(merged))

## Stage Module
*Six cells are
used to establish the direct connection between different
layers, making full use of the feature maps of each layer*

![stage](https://i.imgur.com/szNZQ9Cm.jpg)

The outputs from each of the cells, as well as the 1 x 1 convolution are accumulated into the input for the next cell

The two 1 x 1 convolutions are used to reduce the number of feature maps when connecting between the two stages

In [None]:
class Stage(nn.Module):
  """Stage module: a 1 x 1 stem, six densely connected cells, a 1 x 1 exit.

  Relies on the notebook-level growth rate ``k``.

  Args:
    cell_connections_in: indexable with the input channel count of each of the
      six cells.
    cell_connections_out: indexable with the output channel count of each of
      the six cells.
    stage_in: channel count of the tensor entering the stage.
    stage_out: channel count produced by the final 1 x 1 convolution.
  """
  def __init__(self,cell_connections_in, cell_connections_out,stage_in,stage_out):
    super(Stage,self).__init__()

    self.activation_function = nn.ReLU()
    # NOTE(review): this single BatchNorm2d(k) is applied after cnn1 AND after
    # cnn2 in forward(), so both positions share parameters and running
    # statistics, and stage_out has to equal k. Splitting it would change the
    # state-dict keys of already-trained checkpoints -- confirm before touching.
    self.batch_norm = nn.BatchNorm2d(k)

    # Stem: project the stage input to k feature maps.
    self.cnn1 = nn.Conv2d(stage_in, k, kernel_size=1, stride=1)

    # Exit: squeeze 7*k concatenated maps (stem output plus six cell outputs,
    # assuming every cell emits k maps) down to stage_out.
    self.cnn2 = nn.Conv2d(7*k, stage_out, kernel_size=1, stride=1)

    # Six densely connected cells.
    cell_list = []
    for i in range(6):
      cell_list.append(Cell(cell_connections_in[i], cell_connections_out[i]))
    self.cells = nn.ModuleList(cell_list)

  def forward(self,x):
    def norm_act(t):
      # Shared batch-norm followed by ReLU.
      return self.activation_function(self.batch_norm(t))

    # features[0] is the stem output; each cell appends its own output.
    features = [norm_act(self.cnn1(x))]
    for cell in self.cells:
      # Every cell consumes the concatenation of everything produced so far.
      features.append(cell(torch.cat(features,1)))

    return norm_act(self.cnn2(torch.cat(features,1)))

## Teacher Network
![teacher](https://i.imgur.com/bTH8KSCm.jpg)

Finally, we define the teacher network, which consists of 4 stage modules connected in a dense fashion, with each stage producing 'k' feature maps, where 'k' is the growth rate of the network.

Stage 0 takes the 3 x H x W input tensor and outputs a k x H x W tensor. Each remaining stage takes the concatenated outputs of all preceding stages (k, 2k and 3k feature maps respectively) as input and outputs 'k' feature maps.

Finally, the concatenated stage outputs are max-pooled with a stride of 2 (the implementation below uses a 2 x 2 kernel) and passed through a fully connected linear layer, which produces the class scores used for classification.

In [None]:
class TeacherNetwork(nn.Module):
  """Teacher classifier: four densely connected stages, max-pool, linear head.

  Args:
    cell_onnections_in: per-cell input channel counts, forwarded to every Stage.
    cell_connections_out: per-cell output channel counts, forwarded to every
      Stage.
    stage_connections_in: indexable with the input channel count of each of the
      four stages.
    stage_connections_out: indexable with the output channel count of each of
      the four stages.
  """
  def __init__(self,cell_onnections_in,cell_connections_out,stage_connections_in,stage_connections_out):
    super(TeacherNetwork, self).__init__()

    stage_list = []
    for i in range(4):
      stage_list.append(Stage(cell_onnections_in,
                              cell_connections_out,
                              stage_connections_in[i],
                              stage_connections_out[i]))
    self.stages = nn.ModuleList(stage_list)
    self.max_pool = torch.nn.MaxPool2d(kernel_size=2,stride=2)
    # Registered but not used in forward().
    self.activation_function = nn.ReLU()
    # 65536 inputs; presumably 4*k channels x 16 x 16 for k = 64 and 32 x 32
    # images -- TODO confirm if k or the input size changes.
    self.linear = nn.Linear(in_features=65536,out_features=43)

  def forward(self,x):
    stage_results = []
    for stage in self.stages:
      # The first stage sees the raw input; later stages see the channel-wise
      # concatenation of every earlier stage output.
      stage_input = torch.cat(stage_results,1) if stage_results else x
      x = stage(stage_input)
      stage_results.append(x)

    pooled = self.max_pool(torch.cat(stage_results,1))
    flat = pooled.view(pooled.size(0),-1)
    return self.linear(flat)

# Defining the Student Network

In [None]:
class StudentNetwork(nn.Module):
  """Compact student classifier with five conv layers and one linear head.

  Layout: two 32-channel 3 x 3 convs, max-pool, two 64-channel 3 x 3 convs,
  max-pool, one 128-channel 3 x 3 conv, average-pool, then a linear layer with
  2048 inputs and 43 outputs. Every convolution is followed by batch-norm and
  ReLU and uses reflect padding of 1.
  """
  def __init__(self):
    super(StudentNetwork,self).__init__()
    self.activation_function = nn.ReLU()

    # batch_norm1 .. batch_norm5, one per convolution.
    for position, width in enumerate((32, 32, 64, 64, 128), start=1):
      setattr(self, 'batch_norm%d' % position, nn.BatchNorm2d(width))

    self.max_pool = nn.MaxPool2d(kernel_size=2,stride=2,padding=0)
    self.avg_pool = nn.AvgPool2d(kernel_size=2,stride=2,padding=0)

    # cnn1 .. cnn5 as (in_channels, out_channels) pairs.
    conv_shapes = ((3, 32), (32, 32), (32, 64), (64, 64), (64, 128))
    for position, (c_in, c_out) in enumerate(conv_shapes, start=1):
      setattr(self, 'cnn%d' % position,
              nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3,
                        stride=1, padding=1, padding_mode='reflect'))

    self.fc = nn.Linear(in_features=2048,out_features=43)

  def forward(self,x):
    # (convolution, batch-norm, pooling applied after the activation or None)
    pipeline = (
        (self.cnn1, self.batch_norm1, None),
        (self.cnn2, self.batch_norm2, self.max_pool),
        (self.cnn3, self.batch_norm3, None),
        (self.cnn4, self.batch_norm4, self.max_pool),
        (self.cnn5, self.batch_norm5, self.avg_pool),
    )
    for conv, norm, pool in pipeline:
      x = self.activation_function(norm(conv(x)))
      if pool is not None:
        x = pool(x)

    # Flatten everything but the batch axis for the linear head.
    flat = x.view(x.size(0),-1)
    return self.fc(flat)




## Defining Knowledge Distillation Loss
### KD Loss Formula
![kdloss](https://i.imgur.com/v5HfR1N.png)


In [None]:
# NOTE: nn.KLDivLoss expects its first argument (the student side) as
# log-probabilities and its second argument (the teacher side) as probabilities.

import torch.nn.functional as F

# Hard-label criterion. CrossEntropyLoss applies log-softmax itself, so it must
# be given raw logits. It holds no tensors, so no device transfer is needed.
cec = nn.CrossEntropyLoss()

def computeKDLoss(teacher_output, student_output,labels,temperature=None,kd_weight=None):
  """Knowledge-distillation loss: weighted soft (teacher) + hard (label) terms.

  loss = KLDiv(log_softmax(student/T), softmax(teacher/T)) * (2 * alpha * T^2)
         + CrossEntropy(student, labels) * (1 - alpha)

  Args:
    teacher_output: teacher logits, classes on dim 1.
    student_output: student logits, classes on dim 1.
    labels: class indices accepted by ``nn.CrossEntropyLoss``.
    temperature: softmax temperature; defaults to the notebook-level ``T``.
    kd_weight: weight of the soft term; defaults to the notebook-level ``alpha``.

  Returns:
    Scalar loss tensor.
  """
  if temperature is None:
    temperature = T
  if kd_weight is None:
    kd_weight = alpha

  # NOTE(review): KLDivLoss() keeps its default reduction, which averages over
  # every element rather than over the batch; the PyTorch docs recommend
  # reduction='batchmean'. Left unchanged so the loss scale stays as before.
  soft_loss = nn.KLDivLoss()(F.log_softmax(student_output/temperature, dim=1),
                             F.softmax(teacher_output/temperature, dim=1))

  # Fix: pass raw logits. The previous code applied F.softmax first, so the
  # criterion effectively computed softmax twice and flattened the gradients.
  hard_loss = cec(student_output, labels)

  return soft_loss * (2. * kd_weight * temperature * temperature) + \
         hard_loss * (1. - kd_weight)

# Validation Function

Validates the model against the test data

In [None]:
def validate(model,data):
  """Return the classification accuracy of ``model`` on ``data`` in percent.

  The model is switched to evaluation mode for the duration of the call, so
  BatchNorm layers use their running statistics and are not updated by the
  evaluation data. The previous train/eval mode is restored on exit.

  Args:
    model: ``nn.Module`` producing class scores with classes on dim 1.
    data: iterable of ``(images, labels)`` batches.

  Returns:
    0-dim float tensor ``correct * 100 / total``.

  Raises:
    ZeroDivisionError: if ``data`` yields no batches.
  """
  total = 0
  correct = 0

  # Feed inputs on whatever device the model lives on (CPU when it has no
  # parameters) instead of assuming CUDA.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for images, labels in data:
        scores = model(images.to(target_device))
        # Index of the highest score per sample, compared on the CPU.
        pred = torch.max(scores,1)[1].cpu()
        total += scores.size(0)
        correct += torch.sum(pred == labels.cpu())
  finally:
    model.train(was_training)
  return correct*100./total

In [None]:
# Build the student on the device selected earlier (GPU when available, CPU
# otherwise) rather than hard-coding CUDA; the last line displays the layers.
student = StudentNetwork().to(device)
student

StudentNetwork(
  (activation_function): ReLU()
  (batch_norm1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (max_pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (avg_pool): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (cnn1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), padding_mode=reflect)
  (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), padding_mode=reflect)
  (cnn3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), padding_mode=reflect)
  (cnn4): Conv2d(64, 64,

## Parameter Count

In [None]:
# Total number of scalar parameters in the student: product of each parameter
# tensor's dimensions, summed over all parameter tensors.
per_tensor_counts = [np.prod(list(p.size())) for p in student.parameters()]
s = sum(per_tensor_counts)
print('Number of parameters: ',s)

Number of parameters:  228171


In [None]:
# Empty table meant to hold per-epoch training statistics (needs pandas bound to `pd`).
# NOTE(review): nothing visible in this notebook writes rows into it -- confirm it is still needed.
student_stats = pd.DataFrame(columns=['Epoch','Train Loss','Test Accuracy'])

## Defining the Optimizer
We use the Adam optimizer with a fixed initial learning rate and L2 weight decay.

In [None]:
# Adam over every student parameter, using the learning rate and weight decay
# from the hyper-parameter cell.
optimizer = optim.Adam(
    params=student.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

# Defining the Teacher Model

In [None]:
# Inside a stage every cell receives the stem output concatenated with the
# outputs of all preceding cells, so the input width grows linearly: the i-th
# cell (1-based) takes i * k feature maps, i.e. k, 2k, ..., 6k. Every cell
# outputs k feature maps.
cell_connections_in = [(i + 1) * k for i in range(6)]
cell_connections_out = [k for _ in range(6)]

# Stages are densely connected in the same way. The 1 x 1 convolution at the
# end of each stage reduces its output to k feature maps, so the stage inputs
# are 3 (image channels), k, 2k and 3k.
stage_connections_in = [3] + [i * k for i in range(1, 4)]
stage_connections_out = [k for _ in range(4)]

trained_teacher = TeacherNetwork(
    cell_connections_in,
    cell_connections_out,
    stage_connections_in,
    stage_connections_out,
).to(device)

In [None]:
# Load the fully pickled, pre-trained teacher and place it on `device`.
# NOTE(security): torch.load unpickles arbitrary objects; only open checkpoints
# from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects whole-module pickles; pass weights_only=False there if required.
trained_teacher = torch.load("../saved_models/trained_teacher_model.pth", map_location=device)

# The teacher is only used for inference: freeze BatchNorm statistics.
# eval() returns the module, so the cell still displays its layers.
trained_teacher.eval()

## Training Student Network
### Using trained Teacher Network
#### Using KD Loss from Teacher

In [None]:
def get_teacher_outputs(teacher=None, loader=None):
    """Run the teacher once over a loader and cache its raw outputs.

    The teacher is evaluated in eval mode (BatchNorm uses running statistics)
    under ``torch.no_grad()``; its previous train/eval mode is restored on exit.

    Args:
        teacher: module to run; defaults to the notebook-level
            ``trained_teacher``.
        loader: iterable of ``(images, labels)`` batches; defaults to the
            notebook-level ``train_set``. Labels are ignored.

    Returns:
        List with one NumPy array of teacher outputs per batch, in loader order.
    """
    if teacher is None:
        teacher = trained_teacher
    if loader is None:
        loader = train_set

    # Feed inputs on the teacher's own device instead of assuming CUDA.
    first_param = next(teacher.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = teacher.training
    teacher.eval()
    teacher_predictions = []
    try:
        with torch.no_grad():
            for images, _ in loader:
                pred = teacher(images.to(target_device)).cpu().numpy()
                teacher_predictions.append(pred)
    finally:
        teacher.train(was_training)
    return teacher_predictions

In [None]:
teacher_predictions = get_teacher_outputs()

0
200
400
600


In [None]:
## Student training
# Distil the teacher into the student: every batch is scored against the cached
# teacher outputs (soft targets) and the ground-truth labels (hard targets).
#
# Fix: the loop used to iterate over `range(epochs)[201:]`, which is empty for
# epochs = 200, so no training step was ever executed. Set `start_epoch` to a
# non-zero value only when resuming from a checkpoint.
start_epoch = 0
for epoch in range(start_epoch, epochs):
  # Make sure BatchNorm layers are in training mode at the start of each epoch.
  student.train()
  epoch_loss = 0.0
  for index,(images,labels) in enumerate(train_set):
    images = images.to(device)
    labels = labels.to(device)
    optimizer.zero_grad()

    # Cached teacher outputs are looked up by batch index, which is only valid
    # while `train_set` yields batches in the same order as when they were cached.
    teacher_pred = torch.from_numpy(teacher_predictions[index]).to(device)

    student_prediction = student(images)
    loss = computeKDLoss(teacher_output=teacher_pred,student_output=student_prediction,labels=labels)
    loss.backward()
    optimizer.step()

    # Weight the batch loss by the batch size so the epoch average is per sample.
    epoch_loss += student_prediction.shape[0] * loss.item()

  avg_epoch_loss = epoch_loss/len(train_data)
  accuracy = float(validate(student,test_set))
  print('Epoch :',epoch+1, 'Train Loss :',avg_epoch_loss,'Test Accuracy: ',accuracy,'%')


# Evaluating the trained model

In [None]:
# Test-set accuracy of the student in percent, converted to a plain float.
accuracy = validate(student,test_set)
accuracy = float(accuracy)
print('Accuracy = ',accuracy)

Accuracy =  2.399049997329712


In [None]:
### Save final
# Pickles the whole student module object (not just its state dict) to the
# working directory.
# NOTE(review): a whole-module pickle can presumably only be re-loaded where the
# StudentNetwork class is importable -- confirm this is the intended format.
torch.save(student,"final_student_model_2.pth")

In [None]:
## Evaluating the model
# Rebuild the teacher architecture on `device` (GPU when available, CPU
# otherwise) and restore its weights from a saved state dict.
trainedTeacher = TeacherNetwork(cell_connections_in,cell_connections_out,stage_connections_in,stage_connections_out).to(device)
# Alternative kept from the original notebook: restore from a full checkpoint.
# optimizer = optim.Adam(teacher.parameters(),lr=learning_rate,weight_decay=0.0001)
# trainedTeacher,optimizer,ep,loss = loadModel(trainedTeacher,optimizer,"./saved_models/teacher_saved_1.pth")
# print(ep,loss.item())

# map_location lets a state dict saved from a GPU be opened on a CPU-only host.
trainedstate = torch.load("./saved_models/final_saved_model_1.pth", map_location=device)
trainedTeacher.load_state_dict(trainedstate)
# Inference mode; eval() returns the module, so the cell displays its layers.
trainedTeacher.eval()


In [None]:
# Load the fully pickled teacher used for the confusion-matrix evaluation below.
# NOTE(security): torch.load unpickles arbitrary objects; only open checkpoints
# from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects whole-module pickles; pass weights_only=False there if required.
teacher = torch.load("../input/models/trained_teacher_model.pth", map_location=device)

# Evaluation only: BatchNorm must use its running statistics.
# eval() returns the module, so the cell still displays its layers.
teacher.eval()

TeacherNetwork(
  (stages): ModuleList(
    (0): Stage(
      (activation_function): ReLU()
      (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (cnn1): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1))
      (cnn2): Conv2d(448, 64, kernel_size=(1, 1), stride=(1, 1))
      (cells): ModuleList(
        (0): Cell(
          (activation_function): ReLU()
          (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (cnn3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), padding_mode=reflect)
          (cnn1): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
        )
        (1): Cell(
          (activation_function): ReLU()
          (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (cnn3): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), padding_mode=reflect)
          (cnn1): Conv2d(64, 32, kern

In [None]:
## Confusion matrix (CF). NOTE: the loop below evaluates `teacher`; swap in `student` to get the matrix for the trained student model.


# student.load_state_dict(studentstate)

def confusion_matrix(preds, labels, conf_matrix):
    """Accumulate one batch into ``conf_matrix`` (rows = predicted, cols = true).

    Args:
        preds: class scores with classes on dim 1; the arg-max of each row is
            taken as the predicted class.
        labels: true class indices, one per row of ``preds``.
        conf_matrix: square count matrix, updated in place.

    Returns:
        The same ``conf_matrix`` object, with ``conf_matrix[p, t]`` incremented
        once for every (predicted ``p``, true ``t``) pair in the batch.
    """
    # Index tensors must live on the same device as the matrix they index.
    predicted = torch.argmax(preds, 1).to(conf_matrix.device)
    actual = labels.to(device=conf_matrix.device, dtype=torch.long)

    # Pair predictions with labels up to the shorter length, like zip() did.
    n = min(predicted.shape[0], actual.shape[0])
    predicted, actual = predicted[:n], actual[:n]

    # accumulate=True adds one per pair even when the same (p, t) cell appears
    # several times in the batch, replacing the per-sample Python loop.
    ones = torch.ones(n, dtype=conf_matrix.dtype, device=conf_matrix.device)
    conf_matrix.index_put_((predicted, actual), ones, accumulate=True)

    return conf_matrix

# 43 x 43 count matrix (one row/column per class), filled batch by batch from
# the teacher's predictions on the test loader.
conf_matrix = torch.zeros(43, 43)

# Evaluation only: use BatchNorm running statistics and skip autograd
# bookkeeping so no computation graph is kept alive per batch.
teacher.eval()
with torch.no_grad():
  for index,(images, labels) in enumerate(test_set):
    images = images.to(device)
    labels = labels.to(device)
    output = teacher(images)
    conf_matrix = confusion_matrix(output, labels, conf_matrix)
    # Progress marker every 100 batches.
    if index% 100 == 0:
      print(index)

0
100


In [None]:
# Show the accumulated confusion matrix (rows = predicted class, columns = true class).
print(conf_matrix)

tensor([[ 60.,   0.,   0.,  ...,   2.,   0.,   0.],
        [  0., 714.,   4.,  ...,   0.,   0.,   0.],
        [  0.,   1., 744.,  ...,   0.,   0.,   0.],
        ...,
        [  0.,   0.,   0.,  ...,  84.,   0.,   0.],
        [  0.,   0.,   0.,  ...,   0.,  54.,  14.],
        [  0.,   0.,   0.,  ...,   0.,   0.,  72.]])


In [None]:
# One-vs-rest metrics per class, derived from the confusion matrix
# (rows = predicted class, columns = true class).
num_classes = conf_matrix.size(0)
TP = conf_matrix.diag()
total = conf_matrix.sum()
for c in range(num_classes):
  # Row c minus the diagonal: predicted c, but the true class is different.
  FP = conf_matrix[c, :].sum() - TP[c]
  # Column c minus the diagonal: true class c, but predicted something else.
  FN = conf_matrix[:, c].sum() - TP[c]
  # Everything that involves class c neither as prediction nor as truth.
  # Computed from totals, which avoids the deprecated uint8 mask indexing.
  TN = total - TP[c] - FP - FN

  # Counts are tensors, so a class that never occurs yields nan instead of
  # raising on division by zero.
  recall = (TP[c] / (TP[c]+FN))
  precision = (TP[c] / (TP[c]+FP))
  specificity = (TN / (TN+FP))
  f1Score = (2 * precision * recall)/(precision+recall)
  FPR = (FP/(FP+TN))
  accuracy = ((TP[c]+TN)/(TP[c]+TN+FP+FN))


  print('Class {}\tTP {}\tTN {}\tFP {}\tFN {}\tRecall {}\tPrecision {}\tF1_Score {}\tTrue_Positive_Rate {}\tFalse_Positive_Rate {}\tSpecificity {}\tAccuracy {}'.format(
            c, TP[c], TN, FP, FN,recall,precision,f1Score,recall,FPR, specificity,accuracy))
#   print('Sensitivity = {}'.format(sensitivity))
#   print('Specificity = {}'.format(specificity))

Class 0	TP 60.0	TN 12567.0	FP 3.0	FN 0.0	Recall 1.0	Precision 0.9523809552192688	F1_Score 0.9756097793579102	True_Positive_Rate 1.0	False_Positive_Rate 0.00023866348783485591	Specificity 0.9997613430023193	Accuracy 0.9997624754905701
Class 1	TP 714.0	TN 11898.0	FP 12.0	FN 6.0	Recall 0.9916666746139526	Precision 0.9834710955619812	F1_Score 0.9875519275665283	True_Positive_Rate 0.9916666746139526	False_Positive_Rate 0.0010075566824525595	Specificity 0.9989924430847168	Accuracy 0.9985747933387756
Class 2	TP 744.0	TN 11871.0	FP 9.0	FN 6.0	Recall 0.9919999837875366	Precision 0.9880478382110596	F1_Score 0.9900199770927429	True_Positive_Rate 0.9919999837875366	False_Positive_Rate 0.0007575757335871458	Specificity 0.9992424249649048	Accuracy 0.9988123774528503
Class 3	TP 442.0	TN 12166.0	FP 14.0	FN 8.0	Recall 0.9822221994400024	Precision 0.969298243522644	F1_Score 0.9757174253463745	True_Positive_Rate 0.9822221994400024	False_Positive_Rate 0.001149425283074379	Specificity 0.9988505840301514	Ac

