In [1]:
from __future__ import print_function
from __future__ import division
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
# from tensorboardX import SummaryWriter  # for pytorch below 1.14
from torch.utils.tensorboard import SummaryWriter # for pytorch above or equal 1.14

# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

CUDA is available!  Training on GPU ...


In [2]:
print(torch.__version__)
print(torchvision.__version__)

1.4.0
0.5.0


In [3]:
BATCH_SIZE = 32 #mini_batch size

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) #torchvision.transforms.Normalize(mean, std)

## data augmentation for Q4
# transform = transforms.Compose([
#     transforms.RandomHorizontalFlip(), # randomly flip and rotate
#     transforms.RandomRotation(10),
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
#     ])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)

# images, labels = next(iter(trainloader))
# writer = SummaryWriter(log_dir='./log')
# # writer.add_images('images', grid)
# writer.add_graph(net, [images.cuda()])
# grid = torchvision.utils.make_grid(images)
# writer.add_images('images', grid)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [4]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv4 = nn.Conv2d(64, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

### Question 1
* Add a batch normalization layer after the first fully-connected layer(fc1) (8 points).
* Save the model after training(Checkout our tutorial on how to save your model).
Becareful that batch normalization layer performs differently between training and evalation process, make sure you understand how to convert your model between training mode and evaluation mode(you can find hints in my code).
* Observe the difference of final training/testing accuracy with/without batch normalization layer.

In [9]:
## Q1
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv4 = nn.Conv2d(64, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 10)
        self.bnorm = nn.BatchNorm1d(512)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = self.bnorm(x)
        x = self.fc2(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

### Question 2
* Modify our model by adding another fully connected layer with 512 nodes at the second-to-last layer (before the fc2 layer) (8 points).
* Apply the model weights you saved at step 1 to initialize to the new model(only up to fc2 layer since after that all layers are newly created) before training. 
* Train and save the model (Hint: check the end of the assignment description to see how to partially restore weights from a pretrained weights file).

In [3]:
## Q2-1
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv4 = nn.Conv2d(64, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 8 * 8, 512)
        self.fc_q2 = nn.Linear(512, 512)
        self.fc2 = nn.Linear(512, 10)
        self.bnorm = nn.BatchNorm1d(512)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = self.bnorm(x)
        x = F.relu(self.fc_q2(x))
        x = self.fc2(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [34]:
# Q2-2
def partially_restore_weights(filepath):
    pretrained_dict = torch.load(filepath)
    model_dict = net.state_dict()
    pretrained_dict = {key: val for key, val in pretrained_dict.items() if key in model_dict}
    model_dict.update(pretrained_dict)
    net.load_state_dict(model_dict)
    
    return net

### Question 4
* Try to tune your network in another way (e.g. add/remove a layer, change the activation function, add/remove regularizer, change the number of hidden units, more batch normalization layers) not described in the previous four. You can start from random initialization or previous results as you wish (8 points).

In [23]:
## Q4
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv4 = nn.Conv2d(64, 64, 3, padding=1)
        self.conv5 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv6 = nn.Conv2d(128, 128, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        self.fc_q2 = nn.Linear(512, 512)
        self.fc2 = nn.Linear(512, 10)
        self.bnorm1d1 = nn.BatchNorm1d(512)
        self.bnorm1d2 = nn.BatchNorm1d(512)
        self.bnorm2d1 = nn.BatchNorm2d(32)
        self.bnorm2d2 = nn.BatchNorm2d(64)
        self.bnorm2d3 = nn.BatchNorm2d(128)
#         self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        # Understanding the Disharmony between Dropout and Batch Normalization by Variance Shift, https://arxiv.org/abs/1801.05134
        x = F.relu(self.conv1(x))
        x = self.bnorm2d1(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
#         x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.bnorm2d2(x)
        x = F.relu(self.conv4(x))
        x = self.pool(x)
#         x = self.dropout(x)
        x = F.relu(self.conv5(x))
        x = self.bnorm2d3(x)
        x = F.relu(self.conv6(x))
        x = self.pool(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = self.bnorm1d1(x)         
        x = F.relu(self.fc_q2(x))
        x = self.bnorm1d2(x)
        x = self.fc2(x)
        
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [5]:
def eval_net(dataloader):
    correct = 0
    total = 0
    total_loss = 0
    net.eval() # Why would I do this?
    criterion = nn.CrossEntropyLoss(reduction='mean')
    for data in dataloader:
        images, labels = data
        images, labels = Variable(images).cuda(), Variable(labels).cuda()
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.data).sum()
        loss = criterion(outputs, labels)
        total_loss += loss.item()
    net.train() # Why would I do this?
    return total_loss / total, correct.float() / total

In [6]:
print('Building model...')
net = Net().cuda()
net

Building model...


Net(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=4096, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)

In [37]:
net = partially_restore_weights('mytraining1.pth')
net

Net(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=4096, out_features=512, bias=True)
  (fc_q2): Linear(in_features=512, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
  (bnorm): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

### Question 3
* Try to use an adaptive schedule to tune the learning rate, you can choose from RMSprop, Adagrad and Adam (Hint: you don't need to implement any of these, look at Pytorch documentation please) (8 points).

In [7]:
net.train() # Why would I do this?

# writer = SummaryWriter(log_dir='./log')
# writer.add_images('images', grid)
# writer.add_graph(net, [images])

images, labels = next(iter(trainloader))
grid = torchvision.utils.make_grid([images.cuda()])
writer = SummaryWriter(log_dir='./log')
writer.add_images('images', grid)
writer.add_graph(net, [images.cuda()])
# grid = torchvision.utils.make_grid([images.cuda()])
# writer.add_images('images', grid)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
# optimizer = optim.Adam(net.parameters(), lr=0.008) # Q3(lr=0.003) & Q4

In [8]:
MAX_EPOCH = 10 #maximum epoch to train

print('Start training...')
for epoch in range(MAX_EPOCH):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data

        # wrap them in Variable
        inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 500 == 499:    # print every 2000 mini-batches
            print('    Step: %5d avg_batch_loss: %.5f' %
                  (i + 1, running_loss / 500))
            running_loss = 0.0
    print('    Finish training this EPOCH, start evaluating...')
    train_loss, train_acc = eval_net(trainloader)
    test_loss, test_acc = eval_net(testloader)
    print('EPOCH: %d train_loss: %.5f train_acc: %.5f test_loss: %.5f test_acc %.5f' %
          (epoch+1, train_loss, train_acc, test_loss, test_acc))

    writer.add_scalar('train_loss', train_loss)
    writer.add_scalar('train_acc', train_acc)
    writer.add_scalar('test_loss', test_loss)
    writer.add_scalar('test_acc', test_acc)
    

writer.close()
print('Finished Training')
print('Saving model...')
torch.save(net.state_dict(), 'mytraining6.pth')

Start training...
    Step:   500 avg_batch_loss: 2.01295
    Step:  1000 avg_batch_loss: 1.52447
    Step:  1500 avg_batch_loss: 1.31798
    Finish training this EPOCH, start evaluating...
EPOCH: 1 train_loss: 0.03434 train_acc: 0.60480 test_loss: 0.03535 test_acc 0.59190
    Step:   500 avg_batch_loss: 1.10507
    Step:  1000 avg_batch_loss: 0.99943
    Step:  1500 avg_batch_loss: 0.93074
    Finish training this EPOCH, start evaluating...
EPOCH: 2 train_loss: 0.02455 train_acc: 0.72542 test_loss: 0.02841 test_acc 0.68330
    Step:   500 avg_batch_loss: 0.76452
    Step:  1000 avg_batch_loss: 0.75116
    Step:  1500 avg_batch_loss: 0.72206
    Finish training this EPOCH, start evaluating...
EPOCH: 3 train_loss: 0.01721 train_acc: 0.80966 test_loss: 0.02465 test_acc 0.72890
    Step:   500 avg_batch_loss: 0.54224
    Step:  1000 avg_batch_loss: 0.56735
    Step:  1500 avg_batch_loss: 0.56525
    Finish training this EPOCH, start evaluating...
EPOCH: 4 train_loss: 0.01161 train_acc: 0.

2020-02-27 11:48:56.801215: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-02-27 11:48:56.801276: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-02-27 11:48:56.801304: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.1.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [None]:
tensorboard --logdir ./