In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchvision import datasets , transforms
from torch.optim import lr_scheduler

from tqdm import tqdm

import random
import PIL 
from PIL import Image

from torch.autograd import Variable
from collections import OrderedDict
import math
from pathlib import Path
# Pick the compute device: GPU index 2 when CUDA is available, otherwise the CPU.
# NOTE(review): the GPU index is hard-coded here, and `device` is reassigned to a
# plain "cuda" torch.device in the model-setup cell further down - confirm which
# of the two is intended.
if torch.cuda.is_available():
    device = "cuda:2"
else:
    device = "cpu"

# Report library versions and the selected device.
print(f"torch version {torch.__version__}\nPIL version {PIL.__version__}\nDevice {device}")


torch version 1.2.0
PIL version 6.1.0
Device cuda:2


In [2]:
# Load the CIFAR-10 training split without any transform so the raw pixel array
# (`dataset.data`) can be inspected directly.
dataset = datasets.CIFAR10(root="data/", train=True, download=True)

# Reduce over axes 0-2, leaving one statistic per entry of the last axis.
# The values are on the raw pixel scale of `dataset.data`; they are not rescaled.
mean = dataset.data.mean(axis=(0, 1, 2))
std = dataset.data.std(axis=(0, 1, 2))
print(f"\nMean is {mean}\nStd dev is  {std}")

Files already downloaded and verified

Mean is [125.30691805 122.95039414 113.86538318]
Std dev is  [62.99321928 62.08870764 66.70489964]


In [3]:
# Training pipeline: random 32x32 crop (4 px symmetric padding), random horizontal
# flip, then conversion to a tensor.
# Channel normalisation (transforms.Normalize with `mean`/`std`) is currently disabled.
train_transforms = transforms.Compose(
    [
        transforms.RandomCrop(size=32, padding=4, padding_mode="symmetric", pad_if_needed=True),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
    ]
)

# Validation pipeline: tensor conversion only (normalisation is disabled here as well).
val_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
    ]
)

# Convert the channel statistics to float32 tensors.
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell can be
# re-run after `mean`/`std` have already been converted (torch.from_numpy would
# raise a TypeError on a tensor argument).
mean = torch.as_tensor(mean, dtype=torch.float32)
std = torch.as_tensor(std, dtype=torch.float32)

def denormalize(image):
  """Return `image` as an H*W*C NumPy array living on the CPU.

  The tensor is detached and copied first, so the argument itself is not
  modified. All size-1 dimensions are squeezed away and the leading (channel)
  axis is moved to the end. The mean/std rescaling step is currently disabled,
  so pixel values come back unchanged.
  """
  cpu_copy = image.detach().clone().cpu()  # leave the computational graph
  channels_last = cpu_copy.squeeze().permute(1, 2, 0)  # C*H*W -> H*W*C
  return channels_last.numpy()

# CIFAR-10 splits wired to their respective transform pipelines.
trainset = datasets.CIFAR10(
    root="data/",
    train=True,
    download=True,
    transform=train_transforms,
)
valset = datasets.CIFAR10(
    root="data/",
    train=False,
    download=True,
    transform=val_transforms,
)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
class DenseLayer(nn.Module):
    """Bottleneck layer: BN -> ReLU -> 1x1 conv -> BN -> ReLU -> 3x3 conv.

    Args:
        num_channels: number of channels of the concatenated input.
        growth_rate: number of channels produced by the layer.
        bn_size: multiplier; the 1x1 conv outputs ``int(growth_rate * bn_size)`` channels.
        drop_rate: dropout probability applied to the output (skipped unless > 0).
    """

    def __init__(self, num_channels, growth_rate, bn_size, drop_rate):
        super().__init__()
        bottleneck_width = int(growth_rate * bn_size)

        # Attribute assignment registers the sub-modules under the same names
        # (and in the same order) as explicit add_module calls would.
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(num_channels, bottleneck_width, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(bottleneck_width)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(bottleneck_width, growth_rate, kernel_size=3, padding=1, bias=False)
        self.drop_rate = drop_rate

    def forward(self, *prev_features):
        """Concatenate all inputs along dim 1 and return the newly computed features."""
        stacked = torch.cat(prev_features, 1)
        squeezed = self.conv1(self.relu1(self.bn1(stacked)))
        produced = self.conv2(self.relu2(self.bn2(squeezed)))
        if self.drop_rate > 0:
            # Dropout is only active while the module is in training mode.
            produced = F.dropout(produced, p=self.drop_rate, training=self.training)
        return produced


In [5]:
class Transition(nn.Module):
    """Transition between dense blocks: BN -> ReLU -> 1x1 conv -> 2x2 average pool.

    Args:
        num_channels: channels of the incoming feature map.
        num_out_channels: channels produced by the 1x1 convolution.
    """

    def __init__(self, num_channels, num_out_channels):
        super().__init__()
        self.bn = nn.BatchNorm2d(num_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv = nn.Conv2d(num_channels, num_out_channels, kernel_size=1, bias=False)
        # Kernel 2 / stride 2 halves each spatial dimension.
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Change the channel count with the 1x1 conv, then downsample."""
        projected = self.conv(self.relu(self.bn(x)))
        return self.pool(projected)

In [6]:
class DenseBlock(nn.Module):
    """A stack of `num_layers` DenseLayer modules with dense connectivity.

    Layer ``k`` (0-based) receives ``num_channels + k * growth_rate`` input
    channels, i.e. the block input plus the outputs of every earlier layer.
    """

    def __init__(self, num_layers, num_channels, growth_rate, bn_size, drop_rate):
        super().__init__()
        for index in range(num_layers):
            in_channels = num_channels + index * growth_rate
            self.add_module(
                f"denselayer{index+1}",
                DenseLayer(
                    num_channels=in_channels,
                    growth_rate=growth_rate,
                    bn_size=bn_size,
                    drop_rate=drop_rate,
                ),
            )

    def forward(self, init_features):
        """Run every layer on all features gathered so far; return them concatenated on dim 1."""
        collected = [init_features]
        for dense_layer in self.children():
            collected.append(dense_layer(*collected))
        return torch.cat(collected, 1)

In [7]:
class DenseNet(nn.Module):
    """DenseNet built from DenseBlock / Transition modules, followed by a linear classifier.

    Args:
        growth_rate: channels added by each dense layer.
        block_config: number of dense layers in each dense block.
        num_init_features: output channels of the first 3x3 convolution.
        bn_size: bottleneck multiplier handed to every dense layer.
        drop_rate: dropout probability handed to every dense layer.
        num_classes: size of the classifier output. The default is read from the
            module-level `dataset` once, when the class is defined.
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0.1, num_classes=len(dataset.classes)):
        super().__init__()

        # Stem: a single 3x3 convolution (no padding, no bias).
        stem = nn.Conv2d(3, num_init_features, kernel_size=3, bias=False)
        self.features = nn.Sequential(OrderedDict([("conv0", stem)]))

        channels = num_init_features
        last_index = len(block_config) - 1
        for index, depth in enumerate(block_config):
            dense_block = DenseBlock(
                num_layers=depth,
                num_channels=channels,
                growth_rate=growth_rate,
                bn_size=bn_size,
                drop_rate=drop_rate,
            )
            self.features.add_module(f"denseblock{index+1}", dense_block)
            channels += depth * growth_rate

            # Every block except the last is followed by a transition that
            # halves the channel count.
            if index < last_index:
                halved = channels // 2
                self.features.add_module(f"transition{index+1}", Transition(channels, halved))
                channels = halved

        self.features.add_module("norm5", nn.BatchNorm2d(channels))
        self.classifier = nn.Linear(channels, num_classes)

    def forward(self, x):
        """Return the classifier output for the input batch `x`."""
        activated = F.relu(self.features(x), inplace=True)
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))  # global average pool
        return self.classifier(torch.flatten(pooled, 1))

In [8]:
# 64 samples per visible GPU. device_count() is 0 on a CPU-only machine, which
# would give a batch size of 0 (rejected by DataLoader), so count at least one device.
batch_size = 64 * max(1, torch.cuda.device_count())
epochs = 10  # number of epochs to run without early-stopping
workers = 4  # number of workers for loading data in the DataLoader
lr = 1e-3  # learning rate
# NOTE(review): weight_decay is defined here but is not passed to the optimizers
# created in the model-setup cell.
weight_decay = 1e-4  # weight decay
# Number of target classes, taken from the training set's class list.
n_classes = len(trainset.classes)

In [9]:
# Settings shared by the loaders.
loader_param = { "batch_size":batch_size,
                 "pin_memory":True,
                 "num_workers":workers,
                "shuffle":True}

trainLoader = DataLoader(trainset,**loader_param)

# Evaluation gains nothing from random ordering; override `shuffle` for the
# validation loader only, so the evaluation order stays fixed between runs.
valLoader = DataLoader(valset, **{**loader_param, "shuffle": False})

# Loaders keyed by phase name, as expected by the training loops.
data_loader={"train":trainLoader , "val":valLoader}

In [23]:
import copy
import torch.optim as optim

# Default CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build one network and clone it before any training, so both experiments
# start from identical weights.
model = DenseNet()
model_super_conv = copy.deepcopy(model)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()

# Baseline run: constant learning rate.
model = torch.nn.DataParallel(model)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# One-cycle run: same architecture and initial weights, separate optimizer.
model_super_conv = torch.nn.DataParallel(model_super_conv)
model_super_conv = model_super_conv.to(device)
optimizer_super_conv = optim.Adam(model_super_conv.parameters(), lr=lr)


In [11]:
from tqdm import trange

def train(model , data_loader , criterion , optimizer , num_epochs=5, device=None):
  """Run `num_epochs` epochs, each with a training pass and a validation pass.

  Args:
    model: network to optimise; switched between train and eval mode per phase.
    data_loader: mapping with "train" and "val" entries, each yielding
      (data, target) batches and exposing a `.dataset` with a length.
    criterion: loss function called as ``criterion(output, target)``.
    optimizer: optimizer stepped once per training batch.
    num_epochs: number of epochs to run.
    device: device the batches are moved to. Defaults to the device of the
      model's first parameter (CPU if the model has no parameters), so the
      function no longer depends on a notebook-global `device`.

  Prints one list per epoch holding the train and val loss/accuracy strings.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  for epoch in trange(num_epochs,desc="Epochs"):
    result = []
    for phase in ['train', 'val']:
      is_training = phase == "train"
      # train mode for the training phase, eval mode for validation
      model.train(is_training)

      # keep track of training and validation loss
      running_loss = 0.0
      running_corrects = 0.0

      for data , target in data_loader[phase]:
        # load the data and target to the model's device
        data , target = data.to(device)  , target.to(device)

        # gradients are only tracked during the training phase
        with torch.set_grad_enabled(is_training):
          # feed the input
          output = model(data)
          # calculate the loss
          loss = criterion(output,target)
          preds = torch.argmax(output,1)

          if is_training:
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # update the model parameters
            optimizer.step()
            # zero the grad to stop it from accumulating
            optimizer.zero_grad()

        # statistics: the loss is weighted by batch size so the epoch value is a per-sample mean
        running_loss += loss.item() * data.size(0)
        running_corrects += torch.sum(preds == target.data).item()

      epoch_loss = running_loss / len(data_loader[phase].dataset)
      epoch_acc = running_corrects / len(data_loader[phase].dataset)

      result.append('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
    print(result)

In [12]:
# Baseline run: constant learning rate for `epochs` epochs.
train(
    model=model,
    data_loader=data_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=epochs,
)

Epochs:  10%|█         | 1/10 [01:28<13:17, 88.59s/it]

['train Loss: 1.4295 Acc: 0.4769', 'val Loss: 1.3306 Acc: 0.5422']


Epochs:  20%|██        | 2/10 [02:40<11:09, 83.73s/it]

['train Loss: 0.9521 Acc: 0.6616', 'val Loss: 0.9158 Acc: 0.6860']


Epochs:  30%|███       | 3/10 [03:52<09:20, 80.10s/it]

['train Loss: 0.7254 Acc: 0.7466', 'val Loss: 0.8087 Acc: 0.7457']


Epochs:  40%|████      | 4/10 [05:05<07:47, 77.90s/it]

['train Loss: 0.6033 Acc: 0.7917', 'val Loss: 0.7423 Acc: 0.7456']


Epochs:  50%|█████     | 5/10 [06:16<06:19, 75.96s/it]

['train Loss: 0.5112 Acc: 0.8235', 'val Loss: 0.5424 Acc: 0.8178']


Epochs:  60%|██████    | 6/10 [07:28<04:59, 74.82s/it]

['train Loss: 0.4433 Acc: 0.8471', 'val Loss: 0.6686 Acc: 0.7810']


Epochs:  70%|███████   | 7/10 [08:41<03:42, 74.07s/it]

['train Loss: 0.4110 Acc: 0.8575', 'val Loss: 0.4973 Acc: 0.8311']


Epochs:  80%|████████  | 8/10 [09:53<02:27, 73.52s/it]

['train Loss: 0.3714 Acc: 0.8727', 'val Loss: 0.3734 Acc: 0.8678']


Epochs:  90%|█████████ | 9/10 [11:05<01:13, 73.09s/it]

['train Loss: 0.3342 Acc: 0.8841', 'val Loss: 0.4605 Acc: 0.8484']


Epochs: 100%|██████████| 10/10 [12:18<00:00, 72.92s/it]

['train Loss: 0.3100 Acc: 0.8921', 'val Loss: 0.5318 Acc: 0.8355']





In [13]:
class Stepper():
    """Walk a value from `start` to `end` in `n_iter` steps along the curve given by `func`.

    `val` is a ``(start, end)`` pair. `func` is called as ``func(start, end, pct)``
    where ``pct`` is the fraction of steps taken so far.
    """

    def __init__(self, val, n_iter:int, func):
        start, end = val
        self.start = start
        self.end = end
        # At least one step, which also keeps the division in `step` well defined.
        self.n_iter = n_iter if n_iter > 1 else 1
        self.func = func
        self.n = 0

    def step(self):
        """Advance the counter by one and return the scheduled value at the new position."""
        self.n += 1
        progress = self.n / self.n_iter
        return self.func(self.start, self.end, progress)

    @property
    def is_done(self):
        """`True` once at least `n_iter` steps have been taken."""
        return self.n_iter <= self.n
    
# Annealing functions
def annealing_no(start, end, pct):
    """Constant schedule: `end` and `pct` are ignored and `start` is returned unchanged."""
    return start
  
def annealing_linear(start, end, pct):
    """Straight-line interpolation: `start` at pct=0.0, `end` at pct=1.0."""
    span = end - start
    return start + pct * span
  
def annealing_exp(start, end, pct):
    """Geometric interpolation: `start` at pct=0.0, `end` at pct=1.0."""
    ratio = end / start
    return start * ratio ** pct

def annealing_cos(start, end, pct):
    """Half-cosine interpolation: `start` at pct=0.0, `end` at pct=1.0."""
    half_span = (start - end) / 2
    # cos(pi * pct) + 1 runs from 2 down to 0 as pct goes from 0 to 1.
    return end + half_span * (np.cos(np.pi * pct) + 1)

In [21]:

class OneCyclePolicy:
  """Train a model while annealing learning rate and momentum over a single cycle.

  The schedule has two phases covering ``num_iteration * num_epochs`` steps:
  a linear phase over the first ``pct_start`` fraction (lr goes from
  ``max_lr / div_factor`` to ``max_lr``, momentum from ``momentum[0]`` to
  ``momentum[1]``), followed by a cosine phase (lr goes from ``max_lr`` to
  ``max_lr / div_factor / 1e4``, momentum back from ``momentum[1]`` to ``momentum[0]``).

  Args:
    model: network to train.
    optimizer: optimizer whose param groups are updated after every training step.
    criterion: loss function called as ``criterion(output, target)``.
    num_iteration: number of training batches per epoch.
    num_epochs: number of epochs to train for.
    max_lr: peak learning rate.
    momentum: ``(start, low)`` momentum pair.
    div_factor: ratio between the peak and the initial learning rate.
    pct_start: fraction of all steps spent in the first (linear) phase.
    device: device used for the model and the batches; defaults to the device
      of the model's first parameter.
  """

  def __init__(self,model , optimizer , criterion ,num_iteration,num_epochs,max_lr, momentum = (0.95,0.85) , div_factor=25 , pct_start=0.4, device=None ):

    self.model = model
    self.optimizer = optimizer
    self.criterion = criterion
    self.num_epochs = num_epochs
    if device is None:
      self.device = next(model.parameters()).device
    else:
      self.device = device

    total_steps = num_iteration * self.num_epochs
    warmup_steps = int(total_steps * pct_start)
    anneal_steps = total_steps - warmup_steps
    self.phases = ((warmup_steps, annealing_linear), (anneal_steps, annealing_cos))
    min_lr = max_lr / div_factor
    self.lr_scheds = self.steps((min_lr, max_lr), (max_lr, min_lr / 1e4))
    self.mom_scheds = self.steps(momentum, momentum[::-1])
    self.idx_s = 0  # index of the schedule phase currently in use
    # Start from the first value of the first phase.
    self.update_lr_mom(self.lr_scheds[0].start, self.mom_scheds[0].start)

  def steps(self, *steps):
    "Build anneal schedule for all of the parameters."
    return [Stepper(step, n_iter, func=func) for (step, (n_iter, func)) in zip(steps, self.phases)]

  def _advance_schedule(self):
    "Move the lr/momentum schedule forward by one optimizer step."
    if self.idx_s >= len(self.lr_scheds):
      # Every phase is finished (e.g. extra steps or a second call to `train`):
      # keep the last lr/momentum instead of indexing past the schedule list.
      return
    self.update_lr_mom(self.lr_scheds[self.idx_s].step(), self.mom_scheds[self.idx_s].step())
    if self.lr_scheds[self.idx_s].is_done:
      self.idx_s += 1

  def train(self, data_loader ):
    """Run the training/validation loop over `data_loader`.

    `data_loader` is a mapping with "train" and "val" entries, each yielding
    (data, target) batches and exposing a `.dataset` with a length. One list
    with the train and val loss/accuracy strings is printed per epoch.
    """
    self.model.to(self.device)
    for epoch in tqdm(range(self.num_epochs),desc="Epochs"):
      result = []
      for phase in ['train', 'val']:
        is_training = phase == "train"
        # Switch *this* policy's model (not a notebook-global one) between
        # training and evaluation mode.
        self.model.train(is_training)

        # keep track of training and validation loss
        running_loss = 0.0
        running_corrects = 0

        for data , target in data_loader[phase]:
          # load the data and target to this policy's device
          data , target = data.to(self.device)  , target.to(self.device)

          with torch.set_grad_enabled(is_training):
            # feed the input
            output = self.model(data)
            # calculate the loss
            loss = self.criterion(output,target)
            preds = torch.argmax(output,1)

            if is_training:
              # backward pass: compute gradient of the loss with respect to model parameters
              loss.backward()
              # update the model parameters
              self.optimizer.step()
              # zero the grad to stop it from accumulating
              self.optimizer.zero_grad()
              # set lr/momentum for the next step
              self._advance_schedule()

          # statistics: the loss is weighted by batch size so the epoch value is a per-sample mean
          running_loss += loss.item() * data.size(0)
          running_corrects += torch.sum(preds == target.data).item()

        epoch_loss = running_loss / len(data_loader[phase].dataset)
        epoch_acc = running_corrects/ len(data_loader[phase].dataset)

        result.append('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
      print(result)

  def update_lr_mom(self,lr=0.001,mom=0.99):
    """Write `lr` and `mom` into every param group of the optimizer.

    For Adam/Adamax the momentum is stored as the first beta; the second beta
    configured on the optimizer is kept as is. For SGD it is stored as
    `momentum`. Other optimizer types only get the learning rate updated.
    """
    for group in self.optimizer.param_groups:
      group["lr"] = lr
      if isinstance(self.optimizer , ( torch.optim.Adamax,torch.optim.Adam)):
        group["betas"] = (mom, group["betas"][1])
      elif isinstance(self.optimizer, torch.optim.SGD):
        group["momentum"] = mom

In [24]:
# One-cycle run on the cloned network; `num_iteration` is the number of
# training batches per epoch.
fit_one_cycle = OneCyclePolicy(
    model_super_conv,
    optimizer_super_conv,
    criterion,
    num_iteration=len(trainLoader),
    num_epochs=10,
    max_lr=0.01,
    device=device,
)
fit_one_cycle.train(data_loader)


Epochs:   0%|          | 0/10 [00:00<?, ?it/s][A
Epochs:  10%|█         | 1/10 [01:11<10:44, 71.65s/it][A

['train Loss: 1.4461 Acc: 0.4697', 'val Loss: 1.0721 Acc: 0.6178']



Epochs:  20%|██        | 2/10 [02:25<09:38, 72.35s/it][A

['train Loss: 1.0015 Acc: 0.6448', 'val Loss: 0.8674 Acc: 0.6971']



Epochs:  30%|███       | 3/10 [03:39<08:28, 72.67s/it][A

['train Loss: 0.8092 Acc: 0.7162', 'val Loss: 0.6769 Acc: 0.7651']



Epochs:  40%|████      | 4/10 [04:52<07:17, 72.95s/it][A

['train Loss: 0.6752 Acc: 0.7659', 'val Loss: 0.5890 Acc: 0.7985']



Epochs:  50%|█████     | 5/10 [06:06<06:06, 73.20s/it][A

['train Loss: 0.5759 Acc: 0.8013', 'val Loss: 0.4973 Acc: 0.8312']



Epochs:  60%|██████    | 6/10 [07:20<04:53, 73.39s/it][A

['train Loss: 0.4715 Acc: 0.8388', 'val Loss: 0.4445 Acc: 0.8460']



Epochs:  70%|███████   | 7/10 [08:33<03:40, 73.36s/it][A

['train Loss: 0.3879 Acc: 0.8655', 'val Loss: 0.3772 Acc: 0.8733']



Epochs:  80%|████████  | 8/10 [09:46<02:26, 73.26s/it][A

['train Loss: 0.3149 Acc: 0.8902', 'val Loss: 0.3235 Acc: 0.8907']



Epochs:  90%|█████████ | 9/10 [11:00<01:13, 73.44s/it][A

['train Loss: 0.2467 Acc: 0.9131', 'val Loss: 0.2872 Acc: 0.9028']



Epochs: 100%|██████████| 10/10 [12:14<00:00, 73.59s/it][A

['train Loss: 0.2066 Acc: 0.9283', 'val Loss: 0.2844 Acc: 0.9036']
