In [247]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
import torchvision.models as models
import os
import sys
import math
import argparse
import torch
import shutil
import setproctitle
import make_graph

In [44]:
"""
Like you concatenate the input and output of this network at the end
"""
# Bottleneck applies two BatchNorm2d + ReLU + Conv2d stages: the first expands
# the channels to interChannels, the second shrinks them to growthRate output
# channels. These new channels are then concatenated with the original input
# channels to form the final output.
class Bottleneck(nn.Module):
    """Dense layer with a 1x1 bottleneck in front of the 3x3 convolution.

    Pipeline: BN -> ReLU -> 1x1 conv (nChannels -> 4*growthRate), then
    BN -> ReLU -> 3x3 conv (4*growthRate -> growthRate). The growthRate new
    feature maps are appended to the input along the channel axis, so an
    (N, nChannels, H, W) input yields (N, nChannels + growthRate, H, W).
    """

    def __init__(self, nChannels, growthRate):
        super().__init__()
        hidden = growthRate * 4

        # Stage 1: per-channel normalisation, then widen to `hidden` channels.
        self.bn1 = nn.BatchNorm2d(nChannels)
        self.conv1 = nn.Conv2d(nChannels, hidden, kernel_size=1, bias=False)

        # Stage 2: normalise again, then narrow to growthRate channels.
        # padding=1 with a 3x3 kernel keeps H and W unchanged.
        self.bn2 = nn.BatchNorm2d(hidden)
        self.conv2 = nn.Conv2d(hidden, growthRate, kernel_size=3, padding=1,
                               bias=False)

    def forward(self, x):
        # x: (N, nChannels, H, W)
        widened = self.conv1(F.relu(self.bn1(x)))
        new_features = self.conv2(F.relu(self.bn2(widened)))
        # Input channels first, new features after them.
        return torch.cat([x, new_features], dim=1)
    
# SingleLayer applies one BatchNorm2d + ReLU + 3x3 Conv2d stage and returns
# the result concatenated with the input channels.
class SingleLayer(nn.Module):
    """Plain dense layer: BN -> ReLU -> 3x3 conv, concatenated with its input.

    Maps (N, nChannels, H, W) to (N, nChannels + growthRate, H, W).
    """

    def __init__(self, nChannels, growthRate):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(nChannels)
        # 3x3 kernel with padding=1 keeps the spatial size; produces
        # growthRate new feature maps.
        self.conv1 = nn.Conv2d(
            in_channels=nChannels,
            out_channels=growthRate,
            kernel_size=3,
            padding=1,
            bias=False,
        )

    def forward(self, x):
        activated = F.relu(self.bn1(x))
        new_features = self.conv1(activated)
        # Input channels first, new features after them.
        return torch.cat([x, new_features], dim=1)
    
# Transition halves H and W while changing the number of channels
# to a pre-specified value.
class Transition(nn.Module):
    """Stage between dense blocks: BN -> ReLU -> 1x1 conv -> 2x2 average pool.

    Maps (N, nChannels, H, W) to (N, nOutChannels, H // 2, W // 2).
    """

    def __init__(self, nChannels, nOutChannels):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(nChannels)
        # The 1x1 convolution only changes the channel count.
        self.conv1 = nn.Conv2d(nChannels, nOutChannels, kernel_size=1,
                               bias=False)

    def forward(self, x):
        projected = self.conv1(F.relu(self.bn1(x)))
        # avg_pool2d's stride defaults to the kernel size, so a 2x2 window
        # halves both spatial dimensions.
        return F.avg_pool2d(projected, 2)

In [146]:
class DenseNet(nn.Module):
    """DenseNet image classifier that returns per-class log-probabilities.

    Layout: 3x3 conv -> dense block -> Transition -> dense block ->
    Transition -> dense block -> BatchNorm + ReLU -> 8x8 average pool ->
    linear layer -> log_softmax.

    Args:
        growthRate: number of channels added by every layer of a dense block.
        depth: nominal network depth; each dense block gets
            ``(depth - 4) // 3`` layers, halved when ``bottleneck`` is set.
        reduction: factor applied (then floored) to the channel count by
            each Transition stage.
        nClasses: output size of the final linear layer.
        bottleneck: build dense blocks from Bottleneck layers when truthy,
            from SingleLayer otherwise.

    Note:
        The fixed 8x8 pooling window collapses the spatial axes to 1x1 only
        when the feature map reaching it is 8x8. With two Transition stages
        that each halve H and W, this corresponds to 32x32 inputs.
    """

    def __init__(self, growthRate, depth, reduction, nClasses, bottleneck):
        super(DenseNet, self).__init__()

        # Number of layers inside each of the three dense blocks.
        nDenseBlocks = (depth-4) // 3
        if bottleneck:
            # A Bottleneck layer contains two convolutions, so half as many
            # layers are used for the same nominal depth.
            nDenseBlocks //= 2

        self.nDenseBlocks = nDenseBlocks
        nChannels = 2*growthRate
        # input (N,3,H,W) -> output (N,2*growthRate,H,W)
        self.conv1 = nn.Conv2d(3, nChannels, kernel_size=3, padding=1, bias=False)
        # input (N,nChannels,H,W) -> output (N,nChannels+nDenseBlocks*growthRate,H,W)
        self.dense1 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        # track the channel count of the current output
        nChannels += nDenseBlocks * growthRate
        # the Transition stage reduces the channels by `reduction` and halves H and W
        nOutChannels = int(math.floor(nChannels*reduction))
        self.inchan_trans1 = nChannels
        self.outchan_trans1 = nOutChannels
        self.trans1 = Transition(nChannels, nOutChannels)

        # second dense block + transition
        nChannels = nOutChannels
        self.dense2 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        nChannels += nDenseBlocks * growthRate
        nOutChannels = int(math.floor(nChannels*reduction))
        self.trans2 = Transition(nChannels, nOutChannels)

        # third dense block (no transition after it)
        nChannels = nOutChannels
        self.dense3 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        nChannels += nDenseBlocks * growthRate

        self.bn1 = nn.BatchNorm2d(nChannels)
        # classification head: (N, nChannels) -> (N, nClasses)
        self.inchan_fc = nChannels
        self.outchan_fc = nClasses
        self.fc = nn.Linear(nChannels,nClasses)

    def _make_dense(self, nChannels, growthRate, nDenseBlocks, bottleneck):
        """Stack ``nDenseBlocks`` dense layers into one ``nn.Sequential``.

        Each layer maps (N,C,H,W) to (N,C+growthRate,H,W), so the input width
        of every following layer grows by ``growthRate``.
        """
        layers = []
        for i in range(int(nDenseBlocks)):
            if bottleneck:
                layers.append(Bottleneck(nChannels, growthRate))
            else:
                layers.append(SingleLayer(nChannels, growthRate))
            # the next layer sees growthRate additional channels
            nChannels += growthRate
        return nn.Sequential(*layers)

    def forward(self,x):
        """Return log-probabilities of shape (N, nClasses) for x of shape (N,3,H,W)."""
        # (N,3,H,W) -> (N,2*growthRate,H,W)
        out = self.conv1(x)
        # each dense block adds channels; each transition reduces channels
        # and halves H and W
        out = self.trans1(self.dense1(out))
        out = self.trans2(self.dense2(out))
        out = self.dense3(out)
        out = F.relu(self.bn1(out))
        # shrink H and W by a factor of 8
        out = F.avg_pool2d(out, 8)
        # Flatten everything except the batch axis. torch.squeeze() would also
        # remove the batch axis when N == 1, leaving a 1-D tensor on which
        # log_softmax(dim=1) fails. self.fc still requires the pooled map to
        # be 1x1, i.e. `out` must become (N, nChannels) here.
        out = out.reshape(out.size(0), -1)
        out = F.log_softmax(self.fc(out),dim=1)
        return out

In [175]:
# Run configuration. The options are declared as a table and registered in
# the same order as before.
parser = argparse.ArgumentParser()
for flag, spec in [
    ('--batchSz', {'type': int, 'default': 64}),
    ('--nEpochs', {'type': int, 'default': 300}),
    ('--no-cuda', {'action': 'store_true'}),
    ('--save', {}),
    ('--seed', {'type': int, 'default': 1}),
    ('--opt', {'type': str, 'default': 'sgd',
               'choices': ('sgd', 'adam', 'rmsprop')}),
]:
    parser.add_argument(flag, **spec)
# Parse an empty argument list so the notebook kernel's own argv is ignored.
args = parser.parse_args(args=[])

# Use the GPU only if it was not disabled and one is actually available.
args.cuda = (not args.no_cuda) and torch.cuda.is_available()
# Fall back to the default output directory when --save was not given.
if not args.save:
    args.save = 'work/densenet.base'
setproctitle.setproctitle(args.save)

# Seed the CPU generator, and the CUDA generator when it is in use.
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Start from an empty output directory: remove any previous one, then
# recreate it (exist_ok avoids an error if it is already there).
if os.path.exists(args.save):
    shutil.rmtree(args.save)
os.makedirs(args.save, exist_ok=True)

# Per-channel mean / std used to normalise the images.
normMean = [0.49139968, 0.48215827, 0.44653124]
normStd = [0.24703233, 0.24348505, 0.26158768]
normTransform = transforms.Normalize(normMean, normStd)

# During training:
# 1. Randomly cropped (and randomly flipped) sub-images are fed to the
#    network, which learns to classify from this partial information.
# 2. The network eventually learns 'many pathways' by which it can judge the
#    class of an object from different sub-parts of it.
# 3. An interesting open question: WHY does training on sub-images still work
#    when the network is later given an entire image?
trainAugmentations = [
    # random 32x32 crop taken after padding the image by 4 pixels
    transforms.RandomCrop(32, padding=4),
    # horizontal flip, with probability 0.5 by default
    transforms.RandomHorizontalFlip(),
]
trainTransform = transforms.Compose(
    trainAugmentations + [transforms.ToTensor(), normTransform])

# During testing the images are only converted to tensors and normalised
# before being fed to the trained network.
testTransform = transforms.Compose([transforms.ToTensor(), normTransform])

In [244]:
def train(args, epoch, net, trainLoader, optimizer, trainF):
    """Run one training epoch, logging loss and error for every batch.

    Args:
        args: run configuration; only ``args.cuda`` is read here.
        epoch: epoch number used for logging; the logged fractional epoch is
            ``epoch + batch_idx / len(trainLoader) - 1``.
        net: model whose output is passed to ``F.nll_loss``, so it is
            expected to produce log-probabilities.
        trainLoader: iterable of ``(data, target)`` batches that supports
            ``len()`` and exposes ``.dataset``.
        optimizer: optimizer updating ``net``'s parameters.
        trainF: writable text file; one ``partialEpoch,loss,err`` line is
            appended and flushed per batch.
    """
    # Put layers such as batch norm and dropout into training behaviour.
    net.train()
    nProcessed = 0
    nTrain = len(trainLoader.dataset)
    nBatches = len(trainLoader)
    for batch_idx, (data, target) in enumerate(trainLoader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        # Tensors take part in autograd directly; the deprecated Variable
        # wrapper is a no-op and is no longer used.
        optimizer.zero_grad()
        output = net(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

        batchSize = len(data)
        nProcessed += batchSize
        # index of the max log-probability for every sample
        pred = output.detach().max(1)[1]
        # .item() turns 0-dim tensors into plain Python numbers for logging
        incorrect = pred.ne(target).sum().item()
        err = 100.*incorrect/batchSize
        lossValue = loss.item()
        partialEpoch = epoch + batch_idx / nBatches - 1
        print('Train Epoch: {:.2f} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tError: {:.6f}'.format(
            partialEpoch, nProcessed, nTrain, 100. * batch_idx / nBatches,
            lossValue, err))

        trainF.write('{},{},{}\n'.format(partialEpoch, lossValue, err))
        trainF.flush()
        
def test(agrs, epoch, net, testLoader, optimizer, testF):
    net.eval()
    test_loss = 0
    incorrect = 0
    for data, target in testLoader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        with torch.no_grad(): 
            output = net(data)
            test_loss += F.nll_loss(output, target).data[0]
        pred = output.data.max(1)[1] # get the index of the max log-probability
        incorrect += pred.ne(target.data).cpu().sum()
        
    test_loss = test_loss
    test_loss /= len(testLoader) # loss function already averages over batch size
    nTotal = len(testLoader.dataset)
    err = 100.*incorrect/nTotal
    print('\nTest set: Average loss: {:.4f}, Error: {}/{} ({:.0f}%)\n'.format(
        test_loss, incorrect, nTotal, err))

    testF.write('{},{},{}\n'.format(epoch, test_loss, err))
    testF.flush()
    
# in-place modification of learning rate
def adjust_opt(optAlg, optimizer, epoch):
    """Set the step-decay learning rate for ``epoch`` on ``optimizer`` in place.

    Only the ``'sgd'`` algorithm is scheduled; for any other ``optAlg`` the
    optimizer is left untouched.

    Schedule: 1e-1 for epoch < 150, 1e-2 for 150 <= epoch < 225, and 1e-3
    from epoch 225 on. The rate is derived from ``epoch`` alone instead of
    being changed only at exactly epoch 150 and 225. A run resumed
    mid-schedule, or one that skips a call, therefore still gets the right
    rate; calling it once per consecutive epoch gives the same rates as before.

    Args:
        optAlg: name of the optimisation algorithm, e.g. ``'sgd'``.
        optimizer: object exposing ``param_groups``, a list of dicts with an
            ``'lr'`` entry.
        epoch: current epoch number.
    """
    if optAlg != 'sgd':
        return

    if epoch < 150:
        lr = 1e-1
    elif epoch < 225:
        lr = 1e-2
    else:
        lr = 1e-3

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [246]:
# Extra DataLoader options are only passed when running on the GPU.
kwargs = dict(num_workers=1, pin_memory=True) if args.cuda else {}

# Training split: augmented by trainTransform and reshuffled by the loader.
trainSet = dset.CIFAR10(root='cifar', train=True, download=True,
                        transform=trainTransform)
trainLoader = DataLoader(trainSet, batch_size=args.batchSz, shuffle=True,
                         **kwargs)

# Test split: only tensor conversion + normalisation, fixed order.
testSet = dset.CIFAR10(root='cifar', train=False, download=True,
                       transform=testTransform)
testLoader = DataLoader(testSet, batch_size=args.batchSz, shuffle=False,
                        **kwargs)

# Bottleneck DenseNet with growth rate 12, nominal depth 100, a channel
# reduction of 0.5 in the transitions, and 10 output classes.
net = DenseNet(growthRate=12, depth=100, reduction=0.5,
               bottleneck=True, nClasses=10)

Files already downloaded and verified
Files already downloaded and verified


In [245]:
# Report the model size (total number of parameter elements).
print('  Total Number of params: {}'.format(
        sum(p.numel() for p in net.parameters())))

if args.cuda:
    net = net.cuda()

# Build the optimizer selected by --opt; all variants use weight decay 1e-4.
if args.opt == 'sgd':
    optimizer = optim.SGD(net.parameters(), lr=1e-1,
                        momentum=0.9, weight_decay=1e-4)
elif args.opt == 'adam':
    optimizer = optim.Adam(net.parameters(), weight_decay=1e-4)
elif args.opt == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(), weight_decay=1e-4)

# Mode 'w' creates the log files if they do not exist yet (and truncates them
# otherwise). The context manager closes both files even when training is
# interrupted or raises, e.g. on a KeyboardInterrupt from the notebook.
with open(os.path.join(args.save, 'train.csv'), 'w') as trainF, \
        open(os.path.join(args.save, 'test.csv'), 'w') as testF:
    for epoch in range(1, args.nEpochs + 1):
        # update the learning rate, train for one epoch, then evaluate
        adjust_opt(args.opt, optimizer, epoch)
        train(args, epoch, net, trainLoader, optimizer, trainF)
        test(args, epoch, net, testLoader, optimizer, testF)
        # overwrite the checkpoint after every epoch
        torch.save(net, os.path.join(args.save, 'latest.pth'))
        # launch the plotting script in the background via the shell
        os.system('./plot.py {} &'.format(args.save))

  Total Number of params: 769162


KeyboardInterrupt: 