In [None]:
import torch
from torch.autograd import Variable
from torch import nn
import torch.nn.functional as F
from torch import optim
import torchvision
from torchvision import transforms, datasets
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import copy

In [None]:
# Device selection: use CUDA whenever it is available, unless force_cpu is
# flipped to True to keep everything on the CPU.
force_cpu = False
use_gpu = bool(torch.cuda.is_available() and not force_cpu)

# Legacy typed-tensor classes matching the chosen device; other cells cast
# batches to these with `.type(...)`.
FloatTensor = torch.cuda.FloatTensor if use_gpu else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_gpu else torch.LongTensor

In [None]:
# Mini-batch size shared by the training and the test loader.
batch_size = 32

# --- Transforms -----------------------------------------------------------
# Training images get a mild random crop/rescale and a random horizontal
# flip before being converted to tensors and normalised with mean 0.5 and
# std 0.5 on each of the three channels.
training_set_transform = transforms.Compose([
    transforms.RandomResizedCrop(32, scale=(0.9, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
# Test images are only converted and normalised (no augmentation).
test_set_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# --- Datasets -------------------------------------------------------------
# Both splits live under ./CIFAR10_data and are downloaded on first use.
training_set = datasets.CIFAR10(
    root='CIFAR10_data',
    train=True,
    transform=training_set_transform,
    download=True,
)
test_set = datasets.CIFAR10(
    root='CIFAR10_data',
    train=False,
    transform=test_set_transform,
    download=True,
)

# --- Loaders --------------------------------------------------------------
# Only the training loader shuffles; both use four worker processes.
training_set_loader = torch.utils.data.DataLoader(
    training_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)
test_set_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)

In [None]:
# Sanity check: pull a single (augmented) training mini-batch and display
# it as one image grid.
images, _ = next(iter(training_set_loader))
grid = torchvision.utils.make_grid(images, normalize=True)
# Move axis 0 to the end so the array has the axis order passed to imshow.
grid = grid.permute(1, 2, 0).numpy()
plt.imshow(grid)

In [None]:
def mv_avg(l, n):
    """Return the mean of the last ``n`` items of ``l``.

    When ``l`` holds fewer than ``n`` items, the mean of all of them is
    returned instead.

    Args:
        l: Sequence of numbers supporting ``len`` and slicing.
        n: Maximum number of trailing items to average; must be >= 1.

    Returns:
        The arithmetic mean of the selected items.

    Raises:
        ValueError: If ``l`` is empty or ``n`` is smaller than 1. (Without
            these checks an empty list or ``n == 0`` ends in a
            ZeroDivisionError and a negative ``n`` silently averages the
            wrong slice with a negative divisor.)
    """
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))
    if len(l) == 0:
        raise ValueError("cannot average an empty sequence")
    n = min(n, len(l))
    return sum(l[-n:]) / n

In [None]:
class _Scheduler:
    """Base class for learning-rate schedules that are stepped manually.

    Subclasses override ``get_lr``. Each ``step`` call writes the rate it
    returns into every parameter group of the wrapped optimizer, records it
    in ``lrs`` and advances the iteration counter ``i``.
    """

    def __init__(self, optimizer):
        # Number of step() calls made so far (subclasses may reset it).
        self.i = 0
        self.optimizer = optimizer
        # Every learning rate applied so far, in call order.
        self.lrs = []
        # History lists; nothing in this class writes to them.
        self.losses, self.mv_avg_losses = [], []
        self.accs, self.mv_avg_accs = [], []

    def get_lr(self):
        """Return the learning rate for iteration ``self.i`` (always 0 here)."""
        return 0

    def step(self):
        """Apply the current learning rate to the optimizer and advance."""
        current = self.get_lr()
        for group in self.optimizer.param_groups:
            group['lr'] = current
        self.lrs.append(current)
        self.i += 1

class CosineAnnealing(_Scheduler):
    """Cosine-annealed learning rate with restarts.

    Within one cycle the rate follows half a cosine wave, starting at
    ``max_lr`` on the first iteration and reaching ``min_lr`` on the last.
    When a cycle finishes, the iteration counter is reset and the cycle
    length is multiplied by ``cycle_mult``.

    Args:
        optimizer: Optimizer whose parameter groups receive the rate.
        min_lr: Rate at the end of each cycle.
        max_lr: Rate at the start of each cycle.
        cycle_len: Number of iterations in the first cycle.
        cycle_mult: Factor applied to ``cycle_len`` after every cycle.
    """

    def __init__(self, optimizer, min_lr, max_lr, cycle_len, cycle_mult):
        super().__init__(optimizer)
        self.min_lr = min_lr
        self.max_lr = max_lr
        self.cycle_len = cycle_len
        # Index of the last iteration of the current cycle.
        self.i_max = self.cycle_len - 1
        self.cycle_mult = cycle_mult

    def get_lr(self):
        """Return the rate for the current position inside the cycle."""
        if self.i_max <= 0:
            # A cycle of a single iteration has no span to anneal over and
            # the scaling below would divide by zero; such a cycle consists
            # only of its starting point, so use the starting rate.
            return self.max_lr
        # linearly scale iteration to be between 0 and pi
        # so cosine is between -1 and 1
        x = self.i / self.i_max * np.pi
        # take cosine of scaled iteration and linearly
        # scale it to be between min_lr and max_lr
        lr = (self.max_lr - self.min_lr) / 2 * (np.cos(x) + 1) + self.min_lr
        return lr

    def step(self):
        """Apply the rate, then start a new (rescaled) cycle if this one ended."""
        super().step()
        if self.i > self.i_max:
            self.i = 0
            self.cycle_len *= self.cycle_mult
            self.i_max = self.cycle_len - 1

class Exponential(_Scheduler):
    """Learning rate that is multiplied by a constant factor every step.

    Args:
        optimizer: Optimizer whose parameter groups receive the rate.
        base_lr: Rate used on the very first step.
        n: Multiplicative factor applied once per iteration.
    """

    def __init__(self, optimizer, base_lr=5e-6, n=1.01):
        super().__init__(optimizer)
        self.base_lr = base_lr
        self.n = n

    def get_lr(self):
        """Return ``base_lr * n ** i`` for the current iteration ``i``."""
        growth = self.n ** self.i
        return self.base_lr * growth

In [None]:
class PerformanceHistory:
    """Per-iteration loss/accuracy log with trailing moving averages.

    Args:
        window: Number of most recent values each moving average covers.
            Defaults to 32, the value that used to be hard-coded.

    Attributes are plain lists that grow by one entry per
    ``update_history`` call: ``losses`` / ``accs`` hold the raw values,
    ``mv_avg_losses`` / ``mv_avg_accs`` the moving averages at that point.
    """

    def __init__(self, window=32):
        self.window = window
        self.losses = []
        self.mv_avg_losses = []
        self.accs = []
        self.mv_avg_accs = []

    def update_history(self, loss, acc):
        """Record one iteration's ``loss`` and ``acc`` and refresh the averages."""
        self.losses.append(loss)
        self.mv_avg_losses.append(mv_avg(self.losses, self.window))
        self.accs.append(acc)
        self.mv_avg_accs.append(mv_avg(self.accs, self.window))

In [None]:
class ConvBnLayer(nn.Module):
    """2-D convolution followed by batch normalisation and a ReLU.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Padding applied by the convolution.
    """

    def __init__(self, in_channels, out_channels,
                 kernel_size, stride, padding):
        super().__init__()
        # The convolution is created without its own bias term.
        self.conv = nn.Conv2d(in_channels, out_channels,
                              kernel_size=kernel_size, stride=stride,
                              padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(num_features=out_channels)

    def forward(self, x):
        """Return ``relu(bn(conv(x)))``."""
        features = self.conv(x)
        normalised = self.bn(features)
        return F.relu(normalised)

class ResLayer(ConvBnLayer):
    """ConvBnLayer wrapped in an identity skip connection.

    The input is added to the layer's output, so both must have the same
    shape.
    """

    def forward(self, x):
        """Return ``x`` plus the conv/bn/relu branch applied to ``x``."""
        branch = super().forward(x)
        return x + branch

class CNN(nn.Module):
    """Small residual CNN producing 10 class scores per image.

    Layout: a 5x5 stem convolution (3 -> 32 channels), two stages that each
    start with a stride-2 convolution (32 -> 64, then 64 -> 128 channels)
    followed by two residual layers, adaptive average pooling to 4x4,
    dropout, and a linear classifier on the 128 * 4 * 4 = 2048 features.
    """

    def __init__(self):
        super().__init__()

        def make_stage(channels_in, channels_out):
            # One stride-2 convolution, then two residual layers that keep
            # the channel count.
            return nn.Sequential(
                ConvBnLayer(channels_in, channels_out, 3, 2, 1),
                ResLayer(channels_out, channels_out, 3, 1, 1),
                ResLayer(channels_out, channels_out, 3, 1, 1),
            )

        self.conv1 = ConvBnLayer(3, 32, 5, 1, 2)
        self.layer1 = make_stage(32, 64)
        self.layer2 = make_stage(64, 128)
        self.avgpool = nn.AdaptiveAvgPool2d(4)
        self.fc = nn.Linear(2048, 10)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 10)`` tensor of scores."""
        for stage in (self.conv1, self.layer1, self.layer2, self.avgpool):
            x = stage(x)
        # Flatten everything but the batch axis before the classifier.
        flat = x.view(x.size(0), -1)
        return self.fc(self.dropout(flat))

In [None]:
# Build the network and move it to the GPU *before* the optimizer is
# created: the torch.optim documentation asks for the model to be on its
# final device when the optimizer collects its parameters.
model = CNN()
if use_gpu:
    model = model.cuda()

# The lr given here is only a placeholder: the scheduler overwrites the
# rate of every parameter group each time it is stepped.
optimizer = optim.Adam(model.parameters(), lr=0.0002)
# Anneal between 5e-4 and 1e-5 over cycles of two epochs' worth of
# mini-batches; cycle_mult=1 keeps every cycle the same length.
scheduler = CosineAnnealing(optimizer, 1e-5, 5e-4,
                            len(training_set_loader)*2, 1)
train_history = PerformanceHistory()

In [None]:
def forward(data, model):
    """Run ``model`` on one mini-batch and score its predictions.

    Args:
        data: ``(inputs, targets)`` pair as produced by the data loaders.
        model: Callable mapping the input batch to per-class scores.

    Returns:
        A ``(loss, acc)`` tuple: ``loss`` is the cross-entropy loss tensor
        (callers may call ``backward`` on it) and ``acc`` is the fraction of
        correctly classified samples in this batch as a Python float.
    """
    inputs, targets = data
    # Cast to the device-specific tensor types chosen at the top of the
    # notebook.
    inputs = Variable(inputs.type(FloatTensor))
    targets = Variable(targets.type(LongTensor))
    outputs = model(inputs)
    # Index of the highest score along dim 1 is the predicted class.
    _, predictions = outputs.max(1)
    loss = F.cross_entropy(outputs, targets)
    # .item() replaces the old `.data[0]` idiom, which raises IndexError on
    # 0-dim tensors in PyTorch >= 0.5.
    correct = (predictions == targets).sum().item()
    # Divide by the real size of this batch: the last batch of an epoch can
    # be smaller than the configured batch size.
    acc = correct / targets.size(0)
    return loss, acc

def train(epoch, epochs):
    """Train the global ``model`` for one pass over ``training_set_loader``.

    Every mini-batch: compute the loss, back-propagate, let ``scheduler``
    set the learning rate, take an optimizer step and log the raw loss and
    accuracy in ``train_history``. The progress bar shows the moving
    averages.

    Args:
        epoch: Number of the current epoch (used in the progress caption).
        epochs: Total number of epochs (used in the progress caption).
    """
    # Make sure dropout / batch norm are in training mode, whatever mode the
    # model was left in by earlier cells.
    model.train()
    with tqdm(training_set_loader,
              desc="[train] Epoch %d/%d" % (epoch, epochs)) as t:
        for data in t:
            optimizer.zero_grad()
            loss, acc = forward(data, model)
            loss.backward()
            # The scheduler writes the rate that optimizer.step() will use.
            scheduler.step()
            # .item() replaces `.data[0]`, which raises IndexError on 0-dim
            # tensors in PyTorch >= 0.5.
            train_history.update_history(loss.item(), acc)
            optimizer.step()
            t.set_postfix(loss=train_history.mv_avg_losses[-1],
                          acc=train_history.mv_avg_accs[-1])
            
def test(epoch, epochs):
    """Evaluate the global ``model`` on ``test_set_loader``.

    The progress bar shows the running, sample-weighted mean loss and
    accuracy over the batches seen so far.

    Args:
        epoch: Number of the current epoch (used in the progress caption).
        epochs: Total number of epochs (used in the progress caption).
    """
    total_loss = 0.0
    total_acc = 0.0
    seen = 0
    # Evaluate with dropout disabled and batch norm using its running
    # statistics; the previous mode is restored afterwards so later training
    # is unaffected.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad(), tqdm(
                test_set_loader,
                desc="[test] Epoch %d/%d" % (epoch, epochs)) as t:
            for data in t:
                loss, acc = forward(data, model)
                # Weight each batch by its size: the last batch can be
                # smaller than the others.
                n = len(data[1])
                # .item() replaces `.data[0]`, which raises IndexError on
                # 0-dim tensors in PyTorch >= 0.5.
                total_loss += loss.item() * n
                total_acc += acc * n
                seen += n
                t.set_postfix(loss=total_loss/seen, acc=total_acc/seen)
    finally:
        model.train(was_training)
        
def lr_find(epoch, epochs):
    """Run a learning-rate range test on a throw-away copy of ``model``.

    A deep copy of the global model is trained with Adam while an
    ``Exponential`` scheduler raises the learning rate every mini-batch.
    The run stops at the end of the training loader, or earlier once the
    moving-average loss is non-finite or exceeds 1.3 times a reference
    value taken from the start of the run.

    Args:
        epoch: Unused; kept so the signature matches ``train`` / ``test``.
        epochs: Unused; kept so the signature matches ``train`` / ``test``.

    Returns:
        ``(scheduler, history)``: the scheduler (``.lrs`` holds every rate
        applied) and the ``PerformanceHistory`` recorded during the run.
    """
    lrf_model = copy.deepcopy(model)
    # The range test is a training run, so use training-mode layers even if
    # the source model is currently in eval mode.
    lrf_model.train()
    lrf_optimizer = optim.Adam(lrf_model.parameters())
    lrf_history = PerformanceHistory()
    lrf_scheduler = Exponential(lrf_optimizer)
    with tqdm(training_set_loader) as t:
        for data in t:
            lrf_optimizer.zero_grad()
            loss, acc = forward(data, lrf_model)
            loss.backward()
            lrf_scheduler.step()
            lrf_optimizer.step()
            # .item() replaces `.data[0]`, which raises IndexError on 0-dim
            # tensors in PyTorch >= 0.5.
            lrf_history.update_history(loss.item(), acc)
            current_loss = lrf_history.mv_avg_losses[-1]
            t.set_postfix(loss=current_loss,
                          acc=lrf_history.mv_avg_accs[-1],
                          lr=lrf_scheduler.lrs[-1])
            # Reference value: the moving average at index 31, or the latest
            # one while fewer than 32 values have been recorded.
            starting_loss_i = min(31, len(lrf_history.mv_avg_losses)-1)
            loss_threshold = lrf_history.mv_avg_losses[starting_loss_i] * 1.3
            # A NaN loss never compares greater than the threshold, so test
            # for non-finite values explicitly instead of running on.
            if not np.isfinite(current_loss) or current_loss > loss_threshold:
                break
    return lrf_scheduler, lrf_history

In [None]:
# Run the learning-rate range test. `s` is the scheduler (its `.lrs` list
# holds every rate applied) and `h` the recorded loss/accuracy history.
# lr_find does not read its two arguments.
s, h = lr_find(1, 1)

In [None]:
# Moving-average loss against learning rate (log-scaled x axis) for the
# range test. `s` and `h` come from the lr_find cell; the former
# `s, h = l` line referenced a name that is never defined and raised
# NameError, so it has been dropped.
plt.semilogx(s.lrs, h.mv_avg_losses)
plt.xlabel("Learning rate")
plt.ylabel("Loss (moving average)")
plt.show()

In [None]:
# Number of full passes over the training set.
epochs = 1
# Epochs are numbered from 1 so the progress captions read "Epoch 1/1".
for epoch in range(1, epochs+1):
    train(epoch, epochs)
    # Evaluate on the test set after every training pass.
    test(epoch, epochs)

In [None]:
# Three stacked panels sharing the mini-batch index as x axis: the applied
# learning rate, and the moving averages of training loss and accuracy.
fig, axes = plt.subplots(nrows=3)
panels = (
    (scheduler.lrs, "Learning Rate", "Blue"),
    (train_history.mv_avg_losses, "Loss", "Red"),
    (train_history.mv_avg_accs, "Accuracy", "Green"),
)
for ax, (series, label, colour) in zip(axes, panels):
    ax.plot(series, color=colour)
    ax.set_ylabel(label, color=colour)
# Only the bottom panel carries the x-axis label.
axes[2].set_xlabel("Mini-batch")
fig.tight_layout()
plt.show()