In [4]:
import torch as torch
import torch.optim as optim
import os
import torchvision
import torch.nn as nn
from torch.autograd import Variable as var
from gtsrb_dataset import GTSRB
import logging as log
import gc
import numpy as np
import torchvision.transforms as transforms

In [5]:
!nvidia-smi

Tue Jun 30 10:16:32 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    34W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Utility Functions

## Garbage collection

## Save Model State

In [7]:
def saveModel(epoch, model,optimizer,loss,path):
  torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'loss': loss
              }, path)

## Load Model State

In [8]:
def loadModel(model,optimizer,path):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

    print('Epoch: ',epoch,'Loss: ',loss)
    return model,optimizer, epoch, loss;

In [9]:
## Notifying
from twilio.rest import Client

def send_text(epoch,loss,acc):
  account_sid = 'ACddba861746e4cfc2970856833c4a0b2f'
  auth_token = 'b1daaceeec0758a3ff72d171cf386957'
  client = Client(account_sid, auth_token)
  txt = "Epoch: "+str(epoch)+" Loss: "+str(loss)+" Accuracy: "+str(acc)

  message = client.messages.create(
                                body=txt,
                                from_='whatsapp:+14155238886',
                                to='whatsapp:+31633459670'
                            )

  print(message.sid)

# CUDA availability

In [10]:
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

# Defining the Hyper-parameters for Teacher

In [31]:
learning_rate = 0.001
epochs = 1000
weight_decay = 0.0001
k = 64 ## Growth rate and batch size

## Loading GTSRB Data

In [15]:
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.3403, 0.3121, 0.3214),
                         (0.2724, 0.2608, 0.2669))
])

train_data = GTSRB(
    root_dir='./data/', train=True,  transform=transform)
test_data = GTSRB(
    root_dir='./data/', train=False,  transform=transform)

train_set = torch.utils.data.DataLoader(
    train_data, batch_size=k,shuffle=True,num_workers = 4, pin_memory=True)
test_set = torch.utils.data.DataLoader(
    test_data, batch_size=k,shuffle=False,num_workers = 4,pin_memory=True)

# Defining the Teacher Network
The following sub-sections define the various parts of the Teacher Network.
We start by defining the Cell block, followed by the Stage module and finally the complete teacher model as defined in the paper \\
[Lightweight deep network for traffic sign classification, Zhang et. al (2019)](https://rdcu.be/b5aTv)

Before coding the network, we show the visual description of how the network looks with images taken from the above paper

## Cell Block

*The 1 × 1 kernels and the 3 × 3 kernels execute convolution
operations in parallel and splice all output results*

![Cell Block](https://i.imgur.com/RWMjelN.png)

Please note, the numbers 64 on each on the convolution operations are, as per our interpretation, used to denote that each of the convolution operations see exactly half of the input(as the batch size mentioned in the paper is 128)

In [16]:
class Cell(nn.Module):
  def __init__(self,cell_in_channels,cell_out_channels):
    super(Cell, self).__init__()

    self.activation_function = nn.ReLU()
    self.batch_norm = nn.BatchNorm2d(cell_out_channels)

    ## Reflect padding is used for the 3 x 3 convolution as it creates a 
    ## feature map of size 30 X 30, and needs to be padded to 32 x 32
    ## in order to concatenate with the 1 x 1 conv tensor

    self.cnn3 = nn.Conv2d(in_channels=int(cell_in_channels/2),
                          out_channels=int(cell_out_channels/2),
                          kernel_size=3,padding=1,padding_mode='reflect', 
                          stride=1)
    
    self.cnn1 = nn.Conv2d(in_channels=int(cell_in_channels/2), 
                          out_channels=int(cell_out_channels/2),
                          kernel_size=1, stride=1)
    
    
    '''
    I had initially thought about directly using grouped convolution feature, but could not find an implemented way of using different sized kernels for the parallel groups
    '''
    # self.grouped_conv = nn.Conv2d(in_channels=int(cell_in_channels/2), 
    #                               out_channels=int(cell_out_channels/2),
    #                               kernel_size=1, stride=1,groups=2)


    ## Used to split the input tensor in half in order to run parallel convolution
    self.split_size = int(cell_in_channels/2)


  def forward(self,x):

    (path1,path2) = torch.split(x,split_size_or_sections=[self.split_size,self.split_size],dim=1)
    path1 = self.cnn1(path1)

    path2 = self.cnn3(path2)

    x = torch.cat([path1,path2],1)
    x = self.batch_norm(x)
    x = self.activation_function(x)

    return x

## Stage Module
*Six cells are
used to establish the direct connection between different
layers, making full use of the feature maps of each layer*

![stage](https://i.imgur.com/szNZQ9Cm.jpg)

The outputs from each of the cells, as well as the 1 x 1 convolution are accumulated into the input for the next cell

The two 1 x 1 convolutions are used to reduce the number of feature maps when connecting between the two stages

In [17]:
class Stage(nn.Module):
  def __init__(self,cell_connections_in, cell_connections_out,stage_in,stage_out):
    super(Stage,self).__init__()

    self.activation_function = nn.ReLU()
    self.batch_norm = nn.BatchNorm2d(k)

    self.cnn1 = nn.Conv2d(in_channels=stage_in,
                          out_channels=k,kernel_size=1,
                          stride=1)
    
    self.cnn2 = nn.Conv2d(in_channels=7*k,
                          out_channels=stage_out,kernel_size=1,
                          stride=1)
    
    ## Densely connected six cell blocks
    self.cells = nn.ModuleList([
                                Cell(cell_connections_in[i],
                                     cell_connections_out[i]) for i in range(6)
                                     ])      


  def forward(self,x):

    #print(x.size())
    cell_results = []
    x = self.cnn1(x)
    x = self.batch_norm(x)
    x = self.activation_function(x)

    cell_results.append(x)
    for i in range(6):
      #print('Cell ',i)
      x = torch.cat(cell_results,1)
      x = self.cells[i](x)
      cell_results.append(x);
      
    x = torch.cat(cell_results,1)

    x = self.cnn2(x)
    x = self.batch_norm(x)
    x = self.activation_function(x)

    return x

## Teacher Network
![teacher](https://i.imgur.com/bTH8KSCm.jpg)

Finally, we define the teacher network which consists of 4 stage modules connected in a dense fashion, with each stage producing a 'k' feature maps where 'k' is the growth rate of the network.

Stage 0 takes the input tensor which has 3 x H X W tensor and outputs a k x H x W tensor. The remaining Stages take 'k' feature maps as input and output 'k' feature maps

Finally, the Stage 3 output is pooled using a 3 x 3 max pooling with stride of 2
and finally a fully connected linear layer which produces the probability vector for classification.

In [18]:
class TeacherNetwork(nn.Module):
  def __init__(self,cell_onnections_in,cell_connections_out,stage_connections_in,stage_connections_out):
    super(TeacherNetwork, self).__init__()

    self.stages = nn.ModuleList([Stage(cell_onnections_in,cell_connections_out,stage_connections_in[i],stage_connections_out[i]) for i in range(4)])
    self.max_pool = torch.nn.MaxPool2d(kernel_size=2,stride=2)
    self.activation_function = nn.ReLU()
    self.linear = nn.Linear(in_features=65536,out_features=43)


  def forward(self,x):
    stage_results = []
    for i in range(4):
      #print('Stage ',i)
      if i != 0:
        x = torch.cat(stage_results,1)
        x = self.stages[i](x)
        stage_results.append(x);

      else:
        x = self.stages[0](x)
        stage_results.append(x)
    
    x = torch.cat(stage_results,1)
    x = self.max_pool(x)
    # print(x.size())
    x = x.view(x.size(0),-1)
    # print(x.size())
    x = self.linear(x)
    #print("Sending prediction now!")
    return x;

# Validation Function

Validates the model against the test data

In [48]:
def validate(model,data):
  # To get validation accuracy = (correct/total)*100.
  total = 0
  correct = 0
  
  with torch.no_grad():
    for i,(images,labels) in enumerate(data):
      images = var(images.cuda())
      x = model(images)
      value,pred = torch.max(x,1)
      pred = pred.data.cpu()
      total += x.size(0)
      correct += torch.sum(pred == labels)
    return correct*100./total

# Defining the Teacher Model

In [20]:
'''
The cells use feature maps from every preceding output in the subsequent cells,
increasing the number of feature maps for every next cell by k * 2^(i-1) where 
i is the cell number. Therefore, the first cell inputs 'k' feature maps and 
outputs k feature maps, and the last cell(6th) inputs 2^5 * k feature maps and 
outputs the same number
'''
cell_connections_in = [k,2*k,3*k,4*k,5*k,6*k]
cell_connections_out = [k] * 6

'''
The stages also use feature maps from every preceding output in the subsequent 
cells, increasing the number of feature maps for every next stage linearly. 
This is due to the fact that the 1 x 1 convolution at the end of every stage 
reduces the output feature maps to size 'k'
'''

stage_connections_in = [3,k,2*k,3*k]
stage_connections_out = [k] * 4

teacher = TeacherNetwork(cell_connections_in,cell_connections_out,stage_connections_in,stage_connections_out).to(device)

## Defining the Loss Function for Teacher

In [21]:
cec = nn.CrossEntropyLoss().cuda()

## Checking the Teacher Model

In [None]:
teacher

## Parameter Count

In [40]:
s  = sum(np.prod(list(p.size())) for p in teacher.parameters())
print('Number of parameters: ',s)

Number of parameters:  3823339


## Defining the Optimizer
We use an ADAM optimizer with a defined initial learning rate and a weight decay

In [24]:
optimizer = optim.Adam(teacher.parameters(),lr=learning_rate,weight_decay=weight_decay)

In [42]:
teacher

TeacherNetwork(
  (stages): ModuleList(
    (0): Stage(
      (activation_function): ReLU()
      (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (cnn1): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1))
      (cnn2): Conv2d(448, 64, kernel_size=(1, 1), stride=(1, 1))
      (cells): ModuleList(
        (0): Cell(
          (activation_function): ReLU()
          (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (cnn3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), padding_mode=reflect)
          (cnn1): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
        )
        (1): Cell(
          (activation_function): ReLU()
          (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (cnn3): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), padding_mode=reflect)
          (cnn1): Conv2d(64, 32, kern

In [28]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0.0001
)

# Training the Model

In [None]:
for e in range(epochs)[(epoch+1):]:
  epoch_loss = 0.0
  running_loss = 0.0
  for i,(images,labels) in enumerate(train_set):
    teacher.train()
    images = var(images.cuda())
    labels = var(labels.cuda())
    optimizer.zero_grad()

    prediction = teacher(images)
    loss = cec(prediction,labels)
    loss.backward()
    optimizer.step()

    epoch_loss += prediction.shape[0] * loss.item()
    
    running_loss += loss.item()
    
    if (i+1) % 100 == 0:
      print('Epoch :',e+1,'Batch :',(i+1),'Loss :',running_loss/100)
      running_loss = 0.0
  
  accuracy = float(validate(teacher,test_set))
  print('Epoch: ',e+1,'Loss: ',(epoch_loss/len(train_set)),'Accuracy: ',accuracy,'%')


# Evaluating the trained model

In [47]:
accuracy = float(validate(teacher,test_set))
print('Accuracy = ',accuracy)

Accuracy =  89.88915252685547


# Saving Model

In [None]:
torch.save(teacher,"./saved_models/trained_teacher_model.pth")