[View in Colaboratory](https://colab.research.google.com/github/erconui/cs501r/blob/master/first_dll.ipynb)

In [68]:
!pip3 install torch 
!pip3 install torchvision
!pip3 install tqdm

Collecting torchsummary
  Downloading https://files.pythonhosted.org/packages/57/a8/f935291ecb02228ad2a114a55ceb32345d6d722d27a2861d230fcca11096/torchsummary-1.5-py3-none-any.whl
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5


In [88]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
import pdb
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, utils, datasets
from tqdm import tqdm

assert torch.cuda.is_available() # You need to request a GPU from Runtime > Change Runtime Type

In [89]:
class Conv2d(nn.Module):
  """2-D convolution layer with selectable parameter initialisation.

  Thin wrapper around ``F.conv2d`` that owns its own ``weight`` and ``bias``
  parameters.

  Args:
    in_channels: number of input channels.
    out_channels: number of output channels (filters).
    kernel_size: ``(height, width)`` of the kernel; a single int is treated
      as a square kernel.
    stride, padding, dilation, groups: forwarded unchanged to ``F.conv2d``.
    bias: when False no bias parameter is created and ``self.bias`` is None.
    initialization: selects how the parameters are filled
      1 -> weights uniform in [-1, 1], zero bias
      2 -> Xavier/Glorot uniform weights, bias filled with 0.01
      3 -> orthogonal weights (see ``init_orthogonal``), zero bias
      any other value (default 0) -> weights and bias uniform in
      [-1/sqrt(fan_in), 1/sqrt(fan_in)]
  """

  def __init__(self, in_channels, out_channels, kernel_size, stride=1, 
               padding=0, dilation=1, groups=1, bias=True, initialization=0):
    super(Conv2d, self).__init__()

    # Allow ``kernel_size=3`` as shorthand for ``(3, 3)``.
    if isinstance(kernel_size, int):
      kernel_size = (kernel_size, kernel_size)
    kernel_size = tuple(kernel_size)

    # Explicit attributes rather than ``self.__dict__.update(locals())``,
    # which also stored a reference to ``self`` inside the module.
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = kernel_size
    self.stride = stride
    self.padding = padding
    self.dilation = dilation
    self.groups = groups
    self.initialization = initialization

    # Start from zeros so no code path can leave uninitialised memory behind.
    in_per_group = in_channels // groups
    self.weight = Parameter(torch.zeros(out_channels, in_per_group,
                                        *kernel_size))
    if bias:
      self.bias = Parameter(torch.zeros(out_channels))
    else:
      # F.conv2d accepts ``None`` for the bias argument.
      self.register_parameter('bias', None)

    receptive_field = kernel_size[0] * kernel_size[1]
    fan_in = in_per_group * receptive_field
    fan_out = out_channels * receptive_field

    if initialization == 1:
      ## Uniform Initialization (bias stays at zero)
      self.weight.data.uniform_(-1, 1)
    elif initialization == 2:
      ## Xavier Initialization
      # A constant fill would make every filter identical; draw from the
      # Glorot uniform range instead.
      bound = (6.0 / (fan_in + fan_out)) ** 0.5
      self.weight.data.uniform_(-bound, bound)
      if self.bias is not None:
        self.bias.data.fill_(0.01)
    elif initialization == 3:
      ## Orthogonal Initialization (bias stays at zero)
      self.init_orthogonal(in_per_group, out_channels, kernel_size)
    else:
      ## Default: small uniform values scaled by the fan-in
      bound = 1.0 / (fan_in ** 0.5)
      self.weight.data.uniform_(-bound, bound)
      if self.bias is not None:
        self.bias.data.uniform_(-bound, bound)

  def init_orthogonal(self, in_channels, out_channels, kernel_size):
    """Replace ``self.weight`` with an orthogonal matrix reshaped to 4-D.

    A random ``(out_channels, in_channels*kh*kw)`` matrix is decomposed with
    a reduced SVD; whichever factor has that same shape is used as the
    flattened weight.
    """
    width = in_channels*kernel_size[0]*kernel_size[1]
    X = np.random.random((out_channels, width))
    U, _, VT = np.linalg.svd(X,full_matrices=False)
    # With full_matrices=False exactly one of U / VT is (out_channels, width).
    if out_channels > width:
      weight = U
    else:
      weight = VT
    weight = weight.reshape((out_channels, in_channels, kernel_size[0], kernel_size[1]))
    self.weight = Parameter(torch.tensor(weight).float())
    
  def forward(self, x):
    """Convolve ``x`` with this layer's weight (and bias, if any)."""
    return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
                    self.dilation, self.groups)
  
  def extra_repr(self):
    return '501r is so cool'

class CrossEntropyLoss(nn.Module):
  """Mean cross-entropy between raw class scores and integer class labels.

  The constructor accepts the same keyword arguments as the signature below
  for call-site compatibility, but none of them is used: the loss is always
  the unweighted mean over the batch.
  """

  def __init__(self, weight=None, size_average=None, ignore_index=-100,reduce=None, reduction='elementwise_mean'):
    super(CrossEntropyLoss, self).__init__()
    
  def forward(self, x, labels):
    """Return the scalar mean negative log-likelihood.

    Args:
      x: 2-D tensor of unnormalised scores, one row per sample.
      labels: 1-D integer tensor holding the target column for each row.
    """
    # Shift every row by its own maximum. Shifting by the global maximum lets
    # a row that sits far below it underflow to exp(.) == 0 everywhere, which
    # turns the normalisation into 0/0.
    row_max = x.max(dim=1, keepdim=True)[0].detach()
    shifted = x - row_max
    # log-softmax written as a difference, so no probability is ever passed
    # through log() on its own.
    log_probs = shifted - torch.log(torch.exp(shifted).sum(1, keepdim=True))
    rows = torch.arange(x.size(0), device=x.device)
    return -log_probs[rows, labels].mean()
  
class ConvNetwork(nn.Module):
  """Small all-convolutional classifier producing 10 scores per image.

  A stack of padded 3x3 convolutions (which keep the spatial size) is
  followed by one convolution whose kernel covers the whole image, collapsing
  every feature map to a single value per output class.

  Args:
    dataset: indexable collection whose first item is ``(image, label)`` with
      ``image`` a ``(channels, height, width)`` tensor; only its shape is used.
    million_parameters: when True build the wide variant (three hidden layers
      of 100 channels); otherwise one hidden layer of 10 channels.
  """

  def __init__(self, dataset, million_parameters=False):
    super(ConvNetwork, self).__init__()
    x, _ = dataset[0]
    c,h,w = x.size()
    output = 10

    if million_parameters:
      hidden_channels, hidden_layers = 100, 3
    else:
      hidden_channels, hidden_layers = 10, 1

    layers = []
    in_ch = c
    for _ in range(hidden_layers):
      layers.append(nn.Conv2d(in_ch, hidden_channels, (3,3), padding=(1,1)))
      layers.append(nn.ReLU())
      in_ch = hidden_channels
    # The last kernel spans the full image, so it is sized from the sample
    # rather than fixed at 28x28.
    layers.append(nn.Conv2d(in_ch, output, (h,w), padding=(0,0)))
    self.net = nn.Sequential(*layers)
   
  def forward(self, x):
    """Map a ``(batch, c, h, w)`` tensor to ``(batch, 10)`` class scores."""
    # The network output is (batch, 10, 1, 1); drop the two singleton dims.
    return self.net(x).squeeze(2).squeeze(2)

class FashionMNISTProcessedDataset(Dataset):
  """Dataset wrapper exposing torchvision's FashionMNIST as ``(x, y)`` pairs.

  The underlying dataset is created with the ``ToTensor`` transform and
  ``download=True``.
  """

  def __init__(self, root, train=True):
    """Open the ``train`` or test split stored under ``root``."""
    to_tensor = transforms.ToTensor()
    self.data = datasets.FashionMNIST(
        root, train=train, transform=to_tensor, download=True)

  def __len__(self):
    """Number of samples in the wrapped split."""
    return len(self.data)

  def __getitem__(self, i):
    """Return the ``i``-th sample as an ``(input, label)`` pair."""
    image, label = self.data[i]
    return image, label

def count_parameters(model):
    """Return the number of trainable scalar values in ``model``.

    Only parameters with ``requires_grad`` set are counted. The result is a
    plain Python int (0 when the model has no trainable parameters).
    """
    return sum(param.numel() for param in model.parameters()
               if param.requires_grad)

In [90]:
def _evaluate(model, objective, loader, device):
    """Return (mean batch loss, mean batch accuracy) of ``model`` on ``loader``."""
    batch_losses = []
    batch_accuracies = []
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for validation; skipping them saves memory.
        with torch.no_grad():
            for x, y_truth in loader:
                x = x.to(device, non_blocking=True)
                y_truth = y_truth.to(device, non_blocking=True)
                y_hat = model(x)
                batch_losses.append(objective(y_hat, y_truth).item())
                batch_accuracies.append(
                    (y_hat.argmax(1) == y_truth).float().mean().item())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return np.mean(batch_losses), np.mean(batch_accuracies)


def train_data(model, objective, optimizer, train_loader, val_loader,
               num_epochs=1, val_every=50):
    """Train ``model`` and record loss/accuracy curves.

    Batches are moved to the device that holds the model's parameters, so the
    model must already be on its target device when this is called.

    Args:
        model: network mapping a batch of inputs to per-class scores.
        objective: callable ``objective(scores, labels)`` returning a scalar loss.
        optimizer: optimizer constructed over ``model``'s parameters.
        train_loader: iterable of ``(x, y)`` training batches; must support ``len``.
        val_loader: iterable of ``(x, y)`` validation batches.
        num_epochs: number of passes over ``train_loader``.
        val_every: run a full validation pass every this many training batches
            (counted within an epoch, starting at batch 0).

    Returns:
        ``(validations, losses, val_accuracies, accuracies)`` where ``losses``
        and ``accuracies`` hold one float per training step, and
        ``validations`` / ``val_accuracies`` hold ``(step, value)`` pairs with
        ``step`` the number of training steps completed at that point.
    """
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    losses = []
    validations = []
    accuracies = []
    val_accuracies = []
    for epoch in range(num_epochs):
        loop = tqdm(total=len(train_loader), position=0)
        for batch, (x, y_truth) in enumerate(train_loader):
            # ``non_blocking`` replaces the old ``async`` keyword argument,
            # which is a reserved word in current Python.
            x = x.to(device, non_blocking=True)
            y_truth = y_truth.to(device, non_blocking=True)

            optimizer.zero_grad()
            y_hat = model(x)
            loss = objective(y_hat, y_truth)
            loss.backward()

            # Store plain floats so the history neither pins device memory
            # nor trips up numpy/matplotlib later.
            loss_value = loss.item()
            losses.append(loss_value)
            # argmax of the raw scores equals argmax of their softmax.
            accuracies.append(
                (y_hat.argmax(1) == y_truth).float().mean().item())
            loop.set_description('loss:{:.4f}'.format(loss_value))
            loop.update(1)

            optimizer.step()

            if batch % val_every == 0:
                val_loss, val_accuracy = _evaluate(
                    model, objective, val_loader, device)
                val_accuracies.append((len(losses), val_accuracy))
                validations.append((len(losses), val_loss))

        loop.close()
    return validations, losses, val_accuracies, accuracies

loss:0.4600:  25%|██▍       | 351/1429 [00:47<00:55, 19.37it/s]

In [87]:
# Experiment: wide network trained with the hand-written CrossEntropyLoss.
train_dataset = FashionMNISTProcessedDataset('/tmp/fashionmnist', train=True)
val_dataset = FashionMNISTProcessedDataset('/tmp/fashionmnist', train=False)

model = ConvNetwork(train_dataset, million_parameters=True)
num_parameters = int(count_parameters(model))
print("Number of Parameters is: ", num_parameters)

model.cuda()
objective = CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_loader = DataLoader(train_dataset, batch_size=42, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=42, pin_memory=True)

validations, losses, val_accuracies, accuracies = train_data(model, objective, optimizer,
                                                             train_loader, val_loader)

# The per-step histories may hold 0-dim tensors; float() turns either those
# or plain numbers into values matplotlib can plot.
train_losses = [float(v) for v in losses]
train_accuracies = [float(v) for v in accuracies]

# Titles are built from the measured parameter count so they always describe
# the model that was actually trained.
val_steps, val_losses = zip(*validations)
plt.title("Loss with {:,} Parameters".format(num_parameters))
plt.plot(train_losses, label='train')
plt.plot(val_steps, [float(v) for v in val_losses], label='val')
plt.xlabel("training step")
plt.ylabel("loss")
plt.legend()
plt.show()

val_steps, val_accs = zip(*val_accuracies)
plt.title("Accuracy with {:,} Parameters".format(num_parameters))
plt.plot(train_accuracies, label='training accuracy')
plt.plot(val_steps, [float(v) for v in val_accs], label="validation accuracy")
plt.xlabel("training step")
plt.ylabel("accuracy")
plt.legend()
plt.show()

loss:2.3070:   0%|          | 0/1429 [00:00<?, ?it/s]

Number of Parameters is:  875110


loss:0.4600:  24%|██▍       | 350/1429 [00:32<00:55, 19.37it/s]

KeyboardInterrupt: 

In [None]:
# Done with Xavier Implementation
# Experiment: default (small) network trained with PyTorch's CrossEntropyLoss.
# Reuses train_dataset / val_dataset created in the earlier experiment cell.
model = ConvNetwork(train_dataset, )
num_parameters = int(count_parameters(model))
print("Number of Parameters is: ", num_parameters)

model.cuda()
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_loader = DataLoader(train_dataset, batch_size=42, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=42, pin_memory=True)

validations, losses, val_accuracies, accuracies = train_data(model, objective, optimizer,
                                                             train_loader, val_loader)

# The per-step histories may hold 0-dim tensors; float() turns either those
# or plain numbers into values matplotlib can plot.
train_losses = [float(v) for v in losses]
train_accuracies = [float(v) for v in accuracies]

# This cell builds the small network, so the title is derived from the
# measured parameter count instead of a fixed "1,000,000".
val_steps, val_losses = zip(*validations)
plt.title("Loss with {:,} Parameters".format(num_parameters))
plt.plot(train_losses, label='train')
plt.plot(val_steps, [float(v) for v in val_losses], label='val')
plt.xlabel("training step")
plt.ylabel("loss")
plt.legend()
plt.show()

val_steps, val_accs = zip(*val_accuracies)
plt.title("Accuracy with {:,} Parameters".format(num_parameters))
plt.plot(train_accuracies, label='training accuracy')
plt.plot(val_steps, [float(v) for v in val_accs], label="validation accuracy")
plt.xlabel("training step")
plt.ylabel("accuracy")
plt.legend()
plt.show()

In [None]:

# Experiment: default (small) network with PyTorch's CrossEntropyLoss,
# reloading the datasets so the cell can run on its own.
train_dataset = FashionMNISTProcessedDataset('/tmp/fashionmnist', train=True)
val_dataset = FashionMNISTProcessedDataset('/tmp/fashionmnist', train=False)

model = ConvNetwork(train_dataset)
num_parameters = int(count_parameters(model))
print("Number of Parameters is: ", num_parameters)

model.cuda()
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_loader = DataLoader(train_dataset, batch_size=42, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=42, pin_memory=True)

validations, losses, val_accuracies, accuracies = train_data(model, objective, optimizer,
                                                             train_loader, val_loader)

# The per-step histories may hold 0-dim tensors; float() turns either those
# or plain numbers into values matplotlib can plot.
train_losses = [float(v) for v in losses]
train_accuracies = [float(v) for v in accuracies]

# This cell builds the small network, so the title is derived from the
# measured parameter count instead of a fixed "1,000,000".
val_steps, val_losses = zip(*validations)
plt.title("Loss with {:,} Parameters".format(num_parameters))
plt.plot(train_losses, label='train')
plt.plot(val_steps, [float(v) for v in val_losses], label='val')
plt.xlabel("training step")
plt.ylabel("loss")
plt.legend()
plt.show()

val_steps, val_accs = zip(*val_accuracies)
plt.title("Accuracy with {:,} Parameters".format(num_parameters))
plt.plot(train_accuracies, label='training accuracy')
plt.plot(val_steps, [float(v) for v in val_accs], label="validation accuracy")
plt.xlabel("training step")
plt.ylabel("accuracy")
plt.legend()
plt.show()

In [5]:
## Cross Entropy loss
# check it outputs the same as pytorch's crossentropyloss

# Softmax along dim 1: exponentiate the scores, then divide by the sum of the
# exponentials so the values along that dim add up to 1.
a = torch.from_numpy(np.random.randn(3,4,1).astype(np.float32))
b = torch.exp(a)
# The numerator must be exp(a) (i.e. ``b``), not the raw scores ``a``.
z = b / b.sum(1, keepdim=True)

# Part 4
## Using a Kernel size of 3×3 what should the settings of your 2d convolution be that results in the following mappings (first answer given to you)

 (c=3, h=10, w=10) ⇒ (c=10, h=8, w=8) : (out_channels=10, kernel_size=(3, 3), padding=(0, 0))

(c=3, h=10, w=10) ⇒ (c=22, h=10, w=10) : (out_channels=22, kernel_size=(3,3), padding=(1,1))

(c=3, h=10, w=10) ⇒ (c=65, h=12, w=12) : (out_channels=65, kernel_size=(3,3), padding=(2,2))

(c=3, h=10, w=10) ⇒ (c=7, h=20, w=20) : (out_channels=7, kernel_size=(3,3), padding=(6,6))

## Using a Kernel size of 5×5:

 (c=3, h=10, w=10) ⇒ (c=10, h=8, w=8) : (out_channels=10, kernel_size=(5, 5), padding=(1, 1))

(c=3, h=10, w=10) ⇒ (c=100, h=10, w=10) : (out_channels=100, kernel_size=(5,5), padding=(2,2))

(c=3, h=10, w=10) ⇒ (c=23, h=12, w=12) : (out_channels=23, kernel_size=(5,5), padding=(3,3))

(c=3, h=10, w=10) ⇒ (c=5, h=24, w=24) : (out_channels=5, kernel_size=(5,5), padding=(9,9))

## Using Kernel size of 5×3:

 (c=3, h=10, w=10) ⇒ (c=10, h=8, w=8) : (out_channels=10, kernel_size=(5,3), padding=(1,0))

(c=3, h=10, w=10) ⇒ (c=100, h=10, w=10) : (out_channels=100, kernel_size=(5,3), padding=(2,1))

(c=3, h=10, w=10) ⇒ (c=23, h=12, w=12) : (out_channels=23, kernel_size=(5,3), padding=(3,2))

(c=3, h=10, w=10) ⇒ (c=5, h=24, w=24) : (out_channels=5, kernel_size=(5,3), padding=(9,8))

## Determine the kernel that requires the smallest padding size to make the following mappings possible:

 (c=3, h=10, w=10) ⇒ (c=10, h=9, w=7) : (out_channels=10, kernel_size=(2,4), padding=(0,0))

(c=3, h=10, w=10) ⇒ (c=22, h=10, w=10) : (out_channels=22, kernel_size=(1,1), padding=(0,0))