## Qusetion 1

a). Please fill in the missing code to train a 3-layer ConvNet on the CIFAR10 dataset.

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

batch_size = 32

train_transform =transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))])

test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))])


trainset = datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=train_transform)

train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True)

testset = datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=train_transform)

validation_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                          shuffle=True)

# for (x_train, y_train) in train_loader:
#     print (x_train.size())
#     break

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(3, 32, 3, 1, 1)
        
        self.bn1 = nn.BatchNorm2d(32)
        
        self.conv2 = nn.Conv2d(32, 64, 3, 1, 1)
        
        self.bn2 = nn.BatchNorm2d(64)
        
        self.conv3 = nn.Conv2d(64, 128, 3, 1, 1)
        
        self.bn3 = nn.BatchNorm2d(128)
        
        self.fc1 = nn.Linear(128*32*32, 10)


    def forward(self, x):
        
        x = self.conv1(x)
        x = self.bn1(x)
        
        ## forward x to relu activation function.
        x = F.relu(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        
        ## forward x to relu activation function.
        x = F.relu(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        
        ## forward x to relu activation function.
        x = F.relu(x)
        
        ## flatten operation
        x = torch.flatten(x, 1)
        
        x = self.fc1(x)
        
        ## forward x to softmax activation function.
        output = F.softmax(x, dim=1)
        
        return output
    
def train(epoch,log_interval=100):
    model.train()
    
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)
        
        ## zero gradient buffers
        optimizer.zero_grad()
        
        output = model(data)

        loss = criterion(output, target)

        ## backpropagate
        loss.backward()
        
        ## update weights
        optimizer.step()
        
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data.item()))

def validate(loss_vector, accuracy_vector):
    model.eval()
    val_loss, correct = 0, 0
    for data, target in validation_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        val_loss += criterion(output, target).data.item()
        
        pred = output.data.max(1)[1]
        
        correct += pred.eq(target.data).sum()

    val_loss /= len(validation_loader)
    loss_vector.append(val_loss)

    accuracy = 100. * correct.to(torch.float32) / len(validation_loader.dataset)
    accuracy_vector.append(accuracy)
    
    print('\nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        val_loss, correct, len(validation_loader.dataset), accuracy))


model = Net().to(device)

## define the SGD optimizer with lr=0.01 and momentum=0.5
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

## define the crosstntropy loss.
criterion = nn.CrossEntropyLoss()

epochs = 2
lossv, accv = [], []
for epoch in range(1, epochs + 1):
    train(epoch)
    validate(lossv, accv)


Files already downloaded and verified
Files already downloaded and verified

Validation set: Average loss: 2.1753, Accuracy: 2812/10000 (28.12%)


Validation set: Average loss: 2.1366, Accuracy: 3217/10000 (32.17%)



b). Please apply knowledge distillation by using ResNet18 as a teacher network to improve the accuracy of the above 3-layer ConvNet, which can be treated as a student network. Please fill in the missing code.

In [17]:
import os
import time
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

import resnet


## load the saved model for ResNet-18 so you do not to train ResNet-18
def load_checkpoint(checkpoint, model):

    if not os.path.exists(checkpoint):
        raise("File doesn't exist {}".format(checkpoint))

    checkpoint = torch.load(checkpoint, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])

    return checkpoint

def cal_loss_kd(T, alpha, output_student, output_teacher):
    """
    Compute the knowledge-distillation (KD) loss given outputs, labels.
    "Hyperparameters": temperature and alpha
    
    Hint: since the built-in CrossEntropy loss in pytorch only takes “(output, target)” 
    where the target is the class index, which is different from knowledge distillation loss,
    you can consider KL Divergence in pytorch. Please find out the definition of CrossEntropy,
    KL Divergence, their difference and usage in pytorch to carefully complete this part.

    """
    criterion_kd = nn.KLDivLoss()
    
    ## calculate the soft distribution of the student output with temperature T
    student_soft = F.log_softmax(output_student / T, dim=1)
    
    ## calculate the soft distribution of the teacher output with temperature T
    teacher_soft = F.softmax(output_teacher / T, dim=1)
    
    KD_loss = criterion_kd(student_soft,teacher_soft)* (alpha * T * T)
    
    return KD_loss


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")

batch_size = 32

train_transform =transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))])

test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))])

trainset = datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=train_transform)

train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True)

testset = datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=train_transform)

validation_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                          shuffle=True)


teacher_model = resnet.ResNet18().to(device)
teacher_checkpoint = 'best.pth.tar'
load_checkpoint(teacher_checkpoint, teacher_model)
teacher_model.eval()

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(3, 32, 3, 1, 1)
        
        self.bn1 = nn.BatchNorm2d(32)
        
        self.conv2 = nn.Conv2d(32, 64, 3, 1, 1)
        
        self.bn2 = nn.BatchNorm2d(64)
        
        self.conv3 = nn.Conv2d(64, 128, 3, 1, 1)
        
        self.bn3 = nn.BatchNorm2d(128)
        
        self.fc1 = nn.Linear(128*32*32, 10)


    def forward(self, x):
        
        x = self.conv1(x)
        x = self.bn1(x)
        
        ## forward x to relu activation function.
        x = F.relu(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        
        ## forward x to relu activation function.
        x = F.relu(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        
        ## forward x to relu activation function.
        x = F.relu(x)
        
        ## flatten operation
        x = torch.flatten(x, 1)
        
        x = self.fc1(x)
        
        ## forward x to softmax activation function.
        output = F.softmax(x, dim=1)
        
        return output


def train(epoch, log_interval=100):
    model.train()
    
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)

        ## zero gradient buffers
        optimizer.zero_grad()

        ## forward data through the student model
        output_student = model(data)

        ## forward data through the teacher model
        output_teacher = teacher_model(data)

        ## set the output of the teacher attribute .requires_grad as False
        output_teacher = Variable(output_teacher, requires_grad=False)

        T=5
        alpha = 0.5

        loss_kd = cal_loss_kd(T, alpha, output_student, output_teacher)

        loss_class = criterion(output_student, target)*(1-alpha)

        loss = loss_kd + loss_class

        ## backpropagate
        loss.backward()

        ## update weights
        optimizer.step()

        if (batch_idx+1) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss_kd: {:.6f}\tLoss_class: {:.6f}\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss_kd.data.item(),loss_class.data.item(),loss.data.item()),flush=True)


def validate(loss_vector, accuracy_vector):
    model.eval()
    val_loss, correct = 0, 0
    for data, target in validation_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        val_loss += criterion(output, target).data.item()

        pred = output.data.max(1)[1]

        correct += pred.eq(target.data).sum()

    val_loss /= len(validation_loader)
    loss_vector.append(val_loss)

    accuracy = 100. * correct.to(torch.float32) / len(validation_loader.dataset)
    accuracy_vector.append(accuracy)

    print('\nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        val_loss, correct, len(validation_loader.dataset), accuracy))


model = Net().to(device)

## define the SGD optimizer with lr=0.01 and momentum=0.5
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

## define the crosstntropy loss for the student model. Please note that we do not use softmax output in the network.
criterion = nn.CrossEntropyLoss()

epochs = 2
lossv, accv = [], []
for epoch in range(1, epochs + 1):
    train(epoch)
    validate(lossv, accv)



Files already downloaded and verified
Files already downloaded and verified

Validation set: Average loss: 2.0911, Accuracy: 3654/10000 (36.54%)


Validation set: Average loss: 2.0187, Accuracy: 4400/10000 (44.00%)

