In [2]:
import argparse
import os, sys
import time
import datetime
import numpy as np
import pandas as pd
from PIL import Image
# Import pytorch dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm

# You cannot change this line.
from tools.dataloader import CIFAR10

In [3]:
""" 
Assignment 2(a)
Build the LeNet-5 model by following table 1 or figure 1.

You can also insert batch normalization and leave the LeNet-5 
with batch normalization here for assignment 3(c).
"""
# Create the neural network module: VGG
class VGG(nn.Module):
    def __init__(self):
        super(VGG, self).__init__()
        self.conv1 = nn.Sequential(nn.Conv2d(3, 64, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=64))
        self.conv2 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=64))
        
        self.conv3 = nn.Sequential(nn.Conv2d(64, 128, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=128))
        self.conv4 = nn.Sequential(nn.Conv2d(128, 128, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=128))
        
        self.conv5 = nn.Sequential(nn.Conv2d(128, 256, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=256))
        self.conv6 = nn.Sequential(nn.Conv2d(256, 256, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=256))
        self.conv7 = nn.Sequential(nn.Conv2d(256, 256, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=256))
        self.conv8 = nn.Sequential(nn.Conv2d(256, 256, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=256))
        
        self.conv9 = nn.Sequential(nn.Conv2d(256, 512, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=512))
        self.conv10 = nn.Sequential(nn.Conv2d(512, 512, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=512))
        self.conv11 = nn.Sequential(nn.Conv2d(512, 512, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=512))
        self.conv12 = nn.Sequential(nn.Conv2d(512, 512, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=512))
        
        self.conv13 = nn.Sequential(nn.Conv2d(512, 512, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=512))
        self.conv14 = nn.Sequential(nn.Conv2d(512, 512, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=512))
        self.conv15 = nn.Sequential(nn.Conv2d(512, 512, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=512))
        self.conv16 = nn.Sequential(nn.Conv2d(512, 512, kernel_size=3, padding=1,bias=False),
                                   nn.BatchNorm2d(num_features=512))
        
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv3(out))
        out = F.relu(self.conv4(out))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv5(out))
        out = F.relu(self.conv6(out))
        out = F.relu(self.conv7(out))
        out = F.relu(self.conv8(out))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv9(out))
        out = F.relu(self.conv10(out))
        out = F.relu(self.conv11(out))
        out = F.relu(self.conv12(out))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv13(out))
        out = F.relu(self.conv14(out))
        out = F.relu(self.conv15(out))
        out = F.relu(self.conv16(out))
        out = F.max_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        return out

In [4]:
"""
Hyperparameter optimization in assignment 4(a), 4(b) can be 
conducted here.
Be sure to leave only your best hyperparameter combination
here and comment the original hyperparameter settings.
"""

# Setting some hyperparameters
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 100
INITIAL_LR = 0.01
MOMENTUM = 0.9
REG = 1e-4
EPOCHS = 30
DATAROOT = "./data"
CHECKPOINT_PATH = "./saved_model"

**Your answer:**

In [5]:
"""
Assignment 2(b)
Write functions to load dataset and preprocess the incoming data. 
We recommend that the preprocess scheme \textbf{must} include 
normalize, standardization, batch shuffling to make sure the training 
process goes smoothly. 
The preprocess scheme may also contain some data augmentation methods 
(e.g., random crop, random flip, etc.). 

Reference value for mean/std:

mean(RGB-format): (0.4914, 0.4822, 0.4465)
std(RGB-format): (0.2023, 0.1994, 0.2010)


NOTE: Considering this process has strong corrlelation with assignment 3(b), 
please leave the data preprocessing method which can achieve the highest 
validation accuracy here. You can include your original data augmentation
method as comments and denotes the accuracy difference between thest two 
methods.
"""
# Specify preprocessing function.
# Reference mean/std value for 
transform_train = transforms.Compose([transforms.RandomHorizontalFlip(), 
                                      transforms.ToTensor(), 
                                      transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),])

transform_val = transforms.Compose([transforms.RandomHorizontalFlip(), 
                                    transforms.ToTensor(), 
                                    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),])
transform_test = transforms.Compose([transforms.ToTensor(),
                                     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) 

**Your answer:**

In [6]:
class TEST10():
    def __init__(self, root, train=True, transform=None, download=False):
        self.transform = transform
        self.root = root
        self.data = []
        img_name = os.path.join(root, "cifar10-batches-images-test.npy")

        self.data = np.load(img_name)


    def __getitem__(self, index):
        img = self.data[index]

        img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        return img

    def __len__(self):
        return len(self.data)
    
    def download(self):
        try:
            download_and_extract_archive(self.url, self.root, filename=self.filename)
        except Exception as e:
            print("Interrupted during dataset downloading. "
                  "Cleaning up...")
            # Clean up
            cwd = os.getcwd()
            rm_path = os.path.join(cwd, self.root, "cifar10_trainval")
            shutil.rmtree(rm_path)
            raise e

        print('Files already downloaded and verified')
        
    def extra_repr(self):
        return "Split: {}".format("Train" if self.train is True else "Test")

In [7]:
# Call the dataset Loader
trainset = CIFAR10(root=DATAROOT, train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=1)
valset = CIFAR10(root=DATAROOT, train=False, download=True, transform=transform_val)
valloader = torch.utils.data.DataLoader(valset, batch_size=VAL_BATCH_SIZE, shuffle=False, num_workers=1)


Using downloaded and verified file: ./data/cifar10_trainval.tar.gz
Extracting ./data/cifar10_trainval.tar.gz to ./data
Files already downloaded and verified
Using downloaded and verified file: ./data/cifar10_trainval.tar.gz
Extracting ./data/cifar10_trainval.tar.gz to ./data
Files already downloaded and verified


In [8]:
testset = TEST10(root=DATAROOT, train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=VAL_BATCH_SIZE, shuffle=False, num_workers=1)

In [9]:
# Specify the device for computation
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = VGG()
net = net.to(device)
if device =='cuda':
    print("Train on GPU...")
else:
    print("Train on CPU...")

Train on GPU...


In [10]:
# FLAG for loading the pretrained model
TRAIN_FROM_SCRATCH = False
# Code for loading checkpoint and recover epoch id.
CKPT_PATH = "./saved_model/model.h5"
def get_checkpoint(ckpt_path):
    try:
        ckpt = torch.load(ckpt_path)
    except Exception as e:
        print(e)
        return None
    return ckpt

ckpt = get_checkpoint(CKPT_PATH)
if ckpt is None or TRAIN_FROM_SCRATCH:
    if not TRAIN_FROM_SCRATCH:
        print("Checkpoint not found.")
    print("Training from scratch ...")
    start_epoch = 0
    current_learning_rate = INITIAL_LR
else:
    print("Successfully loaded checkpoint: %s" %CKPT_PATH)
    net.load_state_dict(ckpt['net'])
    start_epoch = ckpt['epoch'] + 1
    current_learning_rate = ckpt['lr']
    print("Starting from epoch %d " %start_epoch)

print("Starting from learning rate %f:" %current_learning_rate)

[Errno 2] No such file or directory: './saved_model/model.h5'
Checkpoint not found.
Training from scratch ...
Starting from learning rate 0.010000:


In [11]:
"""
Assignment 2(c)
In the targeted classification task, we use cross entropy loss with L2 
regularization as the learning object.
You need to formulate the cross-entropy loss function in PyTorch.
You should also specify a PyTorch Optimizer to optimize this loss function.
We recommend you to use the SGD-momentum with an initial learning rate 0.01 
and momentum 0.9 as a start.
"""
# Create loss function and specify regularization
criterion = nn.CrossEntropyLoss()
# Add optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

In [12]:
"""
Assignment 3(a)
Start the training process over the whole CIFAR-10 training dataset. 
For sanity check, you are required to report the initial loss value at 
the beginning of the training process and briefly justify this value. 
Run the training process for \textbf{a maximum of 30} epochs and you 
should be able to reach around \textbf{65\%} accuracy on the validation 
dataset.
"""
# Start the training/validation process
# The process should take about 5 minutes on a GTX 1070-Ti
# if the code is written efficiently.
global_step = 0
best_val_acc = 0

for i in range(start_epoch, EPOCHS):
    print(datetime.datetime.now())
    # Switch to train mode
    net.train()
    print("Epoch %d:" %i)

    total_examples = 0
    correct_examples = 0

    train_loss = 0
    train_acc = 0
    # Train the training dataset for 1 epoch.
    print(len(trainloader))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # Copy inputs to device
        inputs = inputs.cuda()
        targets = targets.cuda()
        # Zero the gradient
        optimizer.zero_grad()
        # Generate output
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        # Now backward loss
        loss.backward()
        # Apply gradient
        optimizer.step()
        # Calculate predicted labels
        _, predicted = torch.max(outputs.data, 1)
        # Calculate accuracy
        total_examples += targets.size(0)
        correct_examples += (predicted == targets).sum().item()

        train_loss += loss

        global_step += 1
        if global_step % 100 == 0:
            avg_loss = train_loss / (batch_idx + 1)
        pass
    avg_acc = correct_examples / total_examples
    print("Training loss: %.4f, Training accuracy: %.4f" %(avg_loss, avg_acc))
    print(datetime.datetime.now())
    # Validate on the validation dataset
    print("Validation...")
    total_examples = 0
    correct_examples = 0
    
    net.eval()

    val_loss = 0
    val_acc = 0
    # Disable gradient during validation
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(valloader):
            # Copy inputs to device
            inputs = inputs.cuda()
            targets = targets.cuda()
            # Zero the gradient
            optimizer.zero_grad()
            # Generate output from the DNN.
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            # Calculate predicted labels
            _, predicted = torch.max(outputs.data, 1)
            # Calculate accuracy
            total_examples += targets.size(0)
            correct_examples += (predicted == targets).sum().item()
            val_loss += loss

    avg_loss = val_loss / len(valloader)
    avg_acc = correct_examples / total_examples
    print("Validation loss: %.4f, Validation accuracy: %.4f" % (avg_loss, avg_acc))
    
    if i == EPOCHS - 1:
        with torch.no_grad():
            pred_all = []
            for batch_idx, inputs in enumerate(testloader):
                inputs = inputs.cuda()
                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                pred_array = predicted.cpu().numpy()
                pred_all.extend(pred_array)
    '''
    """
    Assignment 4(b)
    Learning rate is an important hyperparameter to tune. Specify a 
    learning rate decay policy and apply it in your training process. 
    Briefly describe its impact on the learning curveduring your 
    training process.    
    Reference learning rate schedule: 
    decay 0.98 for every 2 epochs. You may tune this parameter but 
    minimal gain will be achieved.
    Assignment 4(c)
    As we can see from above, hyperparameter optimization is critical 
    to obtain a good performance of DNN models. Try to fine-tune the 
    model to over 70% accuracy. You may also increase the number of 
    epochs to up to 100 during the process. Briefly describe what you 
    have tried to improve the performance of the LeNet-5 model.
    '''
    DECAY_EPOCHS = 2
    DECAY = 1.00
    if i % DECAY_EPOCHS == 0 and i != 0:
        current_learning_rate = current_learning_rate * 0.98
        for param_group in optimizer.param_groups:
            # Assign the learning rate parameter
            param_group['lr'] = current_learning_rate
        print("Current learning rate has decayed to %f" %current_learning_rate)
    
    # Save for checkpoint
    if avg_acc > best_val_acc:
        best_val_acc = avg_acc
        if not os.path.exists(CHECKPOINT_PATH):
            os.makedirs(CHECKPOINT_PATH)
        print("Saving ...")
        state = {'net': net.state_dict(),
                 'epoch': i,
                 'lr': current_learning_rate}
        torch.save(state, os.path.join(CHECKPOINT_PATH, 'model.h5'))
    
print("Optimization finished.")

2019-10-04 16:40:03.287580
Epoch 0:
352
Training loss: 1.9307, Training accuracy: 0.4836
2019-10-04 16:40:22.885904
Validation...
Validation loss: 1.2280, Validation accuracy: 0.5848
Saving ...
2019-10-04 16:40:24.428211
Epoch 1:
352
Training loss: 0.9174, Training accuracy: 0.6891
2019-10-04 16:40:39.441714
Validation...
Validation loss: 0.8351, Validation accuracy: 0.7166
Saving ...
2019-10-04 16:40:41.013209
Epoch 2:
352
Training loss: 0.7042, Training accuracy: 0.7622
2019-10-04 16:40:56.666292
Validation...
Validation loss: 0.7192, Validation accuracy: 0.7548
Current learning rate has decayed to 0.009800
Saving ...
2019-10-04 16:40:58.195628
Epoch 3:
352
Training loss: 0.5730, Training accuracy: 0.8048
2019-10-04 16:41:13.527156
Validation...
Validation loss: 0.6357, Validation accuracy: 0.7882
Saving ...
2019-10-04 16:41:14.976655
Epoch 4:
352
Training loss: 0.4897, Training accuracy: 0.8334
2019-10-04 16:41:30.380144
Validation...
Validation loss: 0.6261, Validation accuracy: 0.

In [13]:
pred_pd = pd.DataFrame(pred_all)
pred_pd = pred_pd.reset_index()
pred_pd = pred_pd.rename(columns={"index": "Id", 0: "Category"})
pred_pd.to_csv("./test.csv", index=False)