[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/devdastl/EVA-8_Phase-1_Assignment-10/blob/main/ConvMixer_experimentation_2/ConvMixer_exp_2.ipynb)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import time
import argparse

In [2]:
import torch.nn as nn

class residual(nn.Module):
    """Skip-connection wrapper.

    The wrapped module is applied to the input and the input itself is added
    back onto the result, so the wrapped module must return a tensor that can
    be added to its input.

    Args:
        res_block: The module to wrap with the skip connection.
    """

    def __init__(self, res_block):
        super(residual, self).__init__()
        # Stored as an attribute so nn.Module registers it as a sub-module.
        self.res_block = res_block

    def forward(self, x):
        """Return ``res_block(x) + x``."""
        branch = self.res_block(x)
        return branch + x

class MixerModel():
    """Factory that assembles a ConvMixer-style classifier as an ``nn.Sequential``.

    This class is not an ``nn.Module``; it only stores hyper-parameters.
    Call ``get_model()`` to obtain a freshly initialised network.

    Args:
        dim: Channel count used by every convolution after the patch embedding.
        depth: Number of mixer blocks stacked after the patch embedding.
        kernel_size: Kernel size of the depth-wise convolution.
        patch_size: Kernel size and stride of the patch-embedding convolution.
        n_classes: Output size of the final linear layer.
    """

    def __init__(self, dim, depth, kernel_size=5, patch_size=2, n_classes=10):
        self.depth = depth
        self.dim = dim
        self.kernel_size = kernel_size
        self.patch_size = patch_size
        self.n_classes = n_classes

    def _gelu_bn(self, conv):
        """Wrap ``conv`` as conv -> GELU -> BatchNorm2d(self.dim)."""
        return nn.Sequential(conv, nn.GELU(), nn.BatchNorm2d(self.dim))

    def depth_wise(self):
        """Return a new depth-wise stage.

        ``groups=self.dim`` gives every channel its own spatial filter, and
        ``padding="same"`` keeps the spatial size unchanged.
        """
        return self._gelu_bn(
            nn.Conv2d(self.dim, self.dim, kernel_size=self.kernel_size, groups=self.dim, padding="same")
        )

    def point_wise(self):
        """Return a new point-wise (1x1 convolution) stage that mixes channels."""
        return self._gelu_bn(nn.Conv2d(self.dim, self.dim, kernel_size=1))

    def get_model(self):
        """Build and return the full network.

        Layout: patch embedding (3 input channels) -> ``depth`` mixer blocks
        (residual depth-wise stage followed by a point-wise stage) -> global
        average pooling -> flatten -> linear classifier.

        Returns:
            An ``nn.Sequential`` with ``depth + 4`` top-level children.
        """
        # Non-overlapping patches: kernel size and stride are both patch_size.
        embedding_prep = self._gelu_bn(
            nn.Conv2d(3, self.dim, kernel_size=self.patch_size, stride=self.patch_size)
        )

        # Each block gets its own freshly constructed layers via the factory
        # methods; no throw-away layers are created along the way.
        mixer_blocks = [
            nn.Sequential(residual(self.depth_wise()), self.point_wise())
            for _ in range(self.depth)
        ]

        return nn.Sequential(
            embedding_prep,
            *mixer_blocks,
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(self.dim, self.n_classes)
        )


In [3]:
from torchsummary import summary

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(device)

# ConvMixer hyper-parameters.
depth, hdim = 10, 256      # number of mixer blocks, channel width
psize, conv_ks = 2, 5      # patch size, depth-wise kernel size
clip_norm = True           # not used in this cell

model = MixerModel(
    dim=hdim,
    depth=depth,
    kernel_size=conv_ks,
    patch_size=psize,
    n_classes=10,
).get_model()

# Move the network to the chosen device and print a per-layer summary for a
# 3x32x32 input.
summary(model.to(device), input_size=(3, 32, 32))

cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 256, 16, 16]           3,328
              GELU-2          [-1, 256, 16, 16]               0
       BatchNorm2d-3          [-1, 256, 16, 16]             512
            Conv2d-4          [-1, 256, 16, 16]           6,656
              GELU-5          [-1, 256, 16, 16]               0
       BatchNorm2d-6          [-1, 256, 16, 16]             512
          residual-7          [-1, 256, 16, 16]               0
            Conv2d-8          [-1, 256, 16, 16]          65,792
              GELU-9          [-1, 256, 16, 16]               0
      BatchNorm2d-10          [-1, 256, 16, 16]             512
           Conv2d-11          [-1, 256, 16, 16]           6,656
             GELU-12          [-1, 256, 16, 16]               0
      BatchNorm2d-13          [-1, 256, 16, 16]             512
         residual-14          [-1,

In [4]:
# Training length and batch size for the training loader
# (presumably 512 samples per GPU on a three-GPU run; confirm against the training cell).
epochs = 25
batch_size = 3 * 512

# Per-channel mean / std passed to Normalize in both pipelines.
cifar10_mean = (0.4914, 0.4822, 0.4465)
cifar10_std = (0.2471, 0.2435, 0.2616)

# Training pipeline: PIL-level augmentations first, then tensor conversion and
# normalisation, and finally random erasing on the normalised tensor.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(size=32, scale=(0.75, 1.0), ratio=(1.0, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandAugment(num_ops=1, magnitude=8),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=cifar10_mean, std=cifar10_std),
    transforms.RandomErasing(p=0.25),
])

# Evaluation pipeline: no augmentation, only tensor conversion and normalisation.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=cifar10_mean, std=cifar10_std),
])

# CIFAR-10 training split, reshuffled every epoch.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform
)
trainloader = torch.utils.data.DataLoader(
    dataset=trainset, batch_size=batch_size, shuffle=True, num_workers=4 * 3
)

# CIFAR-10 test split, iterated in a fixed order with a smaller batch size.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform
)
testloader = torch.utils.data.DataLoader(
    dataset=testset, batch_size=64, shuffle=False, num_workers=4 * 3
)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
# Display the name of CUDA device 0 (the cell's last expression is its output).
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

In [6]:
def lr_schedule(t):
    """Piecewise-linear learning rate at (fractional) epoch ``t``.

    Interpolates between the knots (0, 0), (epochs*2//5, 0.01),
    (epochs*4//5, 0.01/20) and (epochs, 0): a linear warm-up to the peak rate
    followed by a two-stage linear decay to zero.
    """
    return np.interp([t], [0, epochs*2//5, epochs*4//5, epochs],
                     [0, 0.01, 0.01/20.0, 0])[0]


# Network hyper-parameters for this run.
depth = 10
hdim = 256
psize = 2
conv_ks = 5
clip_norm = True  # clip the global gradient norm to 1.0 before every optimizer step

model = MixerModel(hdim, depth, patch_size=psize, kernel_size=conv_ks, n_classes=10).get_model()
# Replicate over up to three GPUs, but only over device indices that actually
# exist, so the cell also runs on a machine with one or two GPUs.
gpu_ids = list(range(min(3, torch.cuda.device_count())))
model = nn.DataParallel(model, device_ids=gpu_ids).cuda()

opt = optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()  # loss scaling for mixed-precision training

for epoch in range(epochs):
    start = time.time()
    # Running sums over the epoch; train_loss is accumulated but not printed.
    train_loss, train_acc, n = 0, 0, 0

    # Switch to training mode once per epoch (evaluation below sets eval mode).
    model.train()
    for i, (X, y) in enumerate(trainloader):
        X, y = X.cuda(), y.cuda()

        # The schedule is evaluated per batch, at a fractional epoch position.
        lr = lr_schedule(epoch + (i + 1)/len(trainloader))
        for group in opt.param_groups:
            group["lr"] = lr

        opt.zero_grad()
        with torch.cuda.amp.autocast():
            output = model(X)
            loss = criterion(output, y)

        scaler.scale(loss).backward()
        if clip_norm:
            # Gradients must be unscaled before clipping so the threshold
            # applies to their true magnitude.
            scaler.unscale_(opt)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(opt)
        scaler.update()

        # Weight the batch loss by batch size so the sum is per-sample.
        train_loss += loss.item() * y.size(0)
        train_acc += (output.max(1)[1] == y).sum().item()
        n += y.size(0)

    # Evaluate on the test set after every epoch.
    model.eval()
    test_acc, m = 0, 0
    with torch.no_grad():
        for i, (X, y) in enumerate(testloader):
            X, y = X.cuda(), y.cuda()
            with torch.cuda.amp.autocast():
                output = model(X)
            test_acc += (output.max(1)[1] == y).sum().item()
            m += y.size(0)

    # The reported time covers both the training and the evaluation pass.
    print(f'ConvMixer: Epoch: {epoch} | Train Acc: {train_acc/n:.4f}, Test Acc: {test_acc/m:.4f}, Time: {time.time() - start:.1f}, lr: {lr:.6f}')



ConvMixer: Epoch: 0 | Train Acc: 0.2912, Test Acc: 0.2866, Time: 35.2, lr: 0.001000
ConvMixer: Epoch: 1 | Train Acc: 0.4887, Test Acc: 0.5342, Time: 27.4, lr: 0.002000
ConvMixer: Epoch: 2 | Train Acc: 0.5808, Test Acc: 0.6012, Time: 27.2, lr: 0.003000
ConvMixer: Epoch: 3 | Train Acc: 0.6546, Test Acc: 0.6397, Time: 27.5, lr: 0.004000
ConvMixer: Epoch: 4 | Train Acc: 0.7028, Test Acc: 0.6840, Time: 27.0, lr: 0.005000
ConvMixer: Epoch: 5 | Train Acc: 0.7350, Test Acc: 0.7436, Time: 26.9, lr: 0.006000
ConvMixer: Epoch: 6 | Train Acc: 0.7612, Test Acc: 0.7489, Time: 27.6, lr: 0.007000
ConvMixer: Epoch: 7 | Train Acc: 0.7796, Test Acc: 0.7831, Time: 27.3, lr: 0.008000
ConvMixer: Epoch: 8 | Train Acc: 0.7901, Test Acc: 0.7813, Time: 27.5, lr: 0.009000
ConvMixer: Epoch: 9 | Train Acc: 0.8041, Test Acc: 0.8112, Time: 27.2, lr: 0.010000
ConvMixer: Epoch: 10 | Train Acc: 0.8146, Test Acc: 0.8281, Time: 27.5, lr: 0.009050
ConvMixer: Epoch: 11 | Train Acc: 0.8338, Test Acc: 0.8410, Time: 27.1, lr: