# Setup

In [None]:
# colab specific imports
import os
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# set up code and symlinks
# Guard on the current directory: this cell ends with a chdir into the repo, so
# re-running it unguarded would not find 'Capstone2021' relative to the new
# working directory and would clone a second, nested copy.
if os.path.basename(os.getcwd()) != 'Capstone2021':
  if not os.path.exists('Capstone2021'):
    # A non-zero result from os.system signals that the command failed; stop here
    # instead of surfacing the problem later as a confusing chdir error.
    if os.system('git clone https://github.com/Bulbasaurzc/Capstone2021') != 0:
      raise RuntimeError('git clone of Capstone2021 failed')
    # Link the Drive-hosted data and model folders into the fresh checkout.
    os.system('ln -s /content/gdrive/MyDrive/"CDS Capstone Project"/Data/torch_arrays_128/ Capstone2021/data/')
    os.system('ln -s /content/gdrive/MyDrive/"CDS Capstone Project"/Data/Raw Capstone2021/data/')
    os.system('ln -s /content/gdrive/MyDrive/"CDS Capstone Project"/Data/models Capstone2021/models/')
  os.chdir('Capstone2021')

Mounted at /content/gdrive


In [None]:
import os 
import numpy as np 
import pandas as pd
import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
import time
import json
import cv2

import matplotlib.pyplot as plt 
from torch.utils.data import Dataset, DataLoader
import torchvision
from tqdm.auto import tqdm

# Seed torch's RNG first so later random draws (e.g. loader shuffling) repeat.
seed = 12345
torch.manual_seed(seed)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

from src.data.torch_utils import MonkeyEyeballsDataset
from src.models.from_scratch import resnet_for_multimodal_regression as resnet

def train(dataloader_train, 
          dataloader_val, 
          model, 
          optimizer, 
          scheduler, 
          val_interval,
          save_interval, 
          save_folder,
          warm_start_epoch=0,
          loss=nn.MSELoss(reduction='sum'), 
          total_epochs=100):
    """Train ``model`` on ``dataloader_train`` with periodic validation and checkpoints.

    Every batch is a mapping holding a 'scan' tensor and an 'icp' tensor. Both
    are cast to float and shifted/scaled with the hard-coded constants below,
    and a channel axis is inserted at dim 1 of the scan before the forward pass.

    Args:
        dataloader_train: sized iterable of training batches.
        dataloader_val: iterable of validation batches (same batch layout).
        model: network to optimise; batches are moved to the device of its
            first parameter.
        optimizer: optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per epoch.
        val_interval: run validation whenever the global batch index is a
            multiple of this value.
        save_interval: write a checkpoint whenever the (non-zero) global batch
            index is a multiple of this value.
        save_folder: directory that receives ``epoch_{e}_batch_{b}.pth.tar``.
        warm_start_epoch: first epoch index to run (for resumed training).
        loss: callable ``loss(preds, target)`` returning a scalar tensor.
        total_epochs: exclusive upper bound of the epoch range.

    Returns:
        None. Progress is printed and checkpoints are written with torch.save.
    """
    # settings
    batches_per_epoch = len(dataloader_train)
    print('{} epochs in total, {} batches per epoch'.format(total_epochs, batches_per_epoch))

    # Follow the model's own device. A torch.device does not compare equal to
    # the string 'cuda', so a `device == 'cuda'` test never moves anything.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')
    if isinstance(loss, nn.Module):
        loss = loss.to(run_device)

    model.train()
    train_time_sp = time.time()
    # Counted separately from the global index so a warm start does not
    # deflate the reported average batch time.
    batches_done = 0

    for epoch in range(warm_start_epoch, total_epochs):
        print('Start epoch {}'.format(epoch))

        for batch_id, batch_data in enumerate(dataloader_train):
            # global batch index across epochs
            batch_id_sp = epoch * batches_per_epoch + batch_id
            icp = batch_data['icp'].float().unsqueeze(1).to(run_device)
            scan = batch_data['scan'].float().to(run_device)

            # standardize input
            scan = (scan - 30) / 19
            icp = (icp - 15) / 11

            optimizer.zero_grad()
            # add fake channel dimension as 5-D input is expected
            preds = model(scan.unsqueeze(1))

            # calculating loss
            loss_value = loss(preds, icp)
            loss_value.backward()
            optimizer.step()

            batches_done += 1
            avg_batch_time = (time.time() - train_time_sp) / batches_done
            print(
                'Batch: {}-{} ({}), loss = {:.3f}, avg_batch_time = {:.3f}'\
                .format(epoch, batch_id, batch_id_sp, loss_value.item(), avg_batch_time))

            # get validation loss
            if batch_id_sp % val_interval == 0:
                model.eval()
                print('')
                print('Validating...')
                # no autograd graph is needed while scoring the validation set
                with torch.no_grad():
                    for batch_data_val in dataloader_val:
                        icp_val = batch_data_val['icp'].float().unsqueeze(1).to(run_device)
                        scan_val = batch_data_val['scan'].float().to(run_device)
                        scan_val = (scan_val - 30) / 19
                        icp_val = (icp_val - 15) / 11

                        # same fake channel dimension as in the training pass
                        preds_val = model(scan_val.unsqueeze(1))
                        loss_value_val = loss(preds_val, icp_val)

                        print('Loss on validation: {:.3f}'.format(loss_value_val.item()))
                        print('')

                model.train()

            # save model
            if batch_id_sp != 0 and batch_id_sp % save_interval == 0:
                model_save_path = os.path.join(save_folder, 'epoch_{}_batch_{}.pth.tar'\
                                               .format(epoch, batch_id))
                os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

                print('Save checkpoints: epoch = {}, batch_id = {}'.format(epoch, batch_id)) 
                torch.save({
                            'epoch': epoch,
                            'batch_id': batch_id,
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()},
                            model_save_path)
        scheduler.step()
        # get_last_lr() is the supported accessor; get_lr() is deprecated for reporting
        print('lr = {}'.format(scheduler.get_last_lr()))

    print('Finished training')  

labels = pd.read_csv('data/monkey_data.csv')
# Keep rows that have a tensor on disk and both pressure readings present.
# .copy() makes the column assignments below act on an independent frame.
labels = labels[
    labels['torch_present']
    & labels['icp'].notnull()
    & labels['iop'].notnull()
].copy()
labels['icp'] = labels['icp'].astype('float')
labels['iop'] = labels['iop'].astype('float')
# Applied as its own step: `&` binds tighter than `>`, so writing
# `... & labels['icp'] > 0` compares the whole conjunction with 0 and the
# intended positive-ICP filter is never evaluated on the column itself.
labels = labels[labels['icp'] > 0]
train_labels = labels[labels['monkey_id'] != 14]
# 4 handpicked validation examples
# NOTE(review): presumably these ids belong to monkey 14 (held out above) - confirm,
# otherwise the validation scans are also part of the training set.
val_examples = [1751, 1754, 1761, 1766]
val_labels = labels[labels['id'].isin(val_examples)]

med_train = MonkeyEyeballsDataset('data/torch_arrays_128', train_labels)
med_val = MonkeyEyeballsDataset('data/torch_arrays_128', val_labels)

dataloader_train = DataLoader(med_train, batch_size=8, num_workers=2, shuffle=True, pin_memory=True) 
dataloader_val = DataLoader(med_val, batch_size=4, num_workers=2, shuffle=False, pin_memory=True)

model = resnet.resnet10(sample_input_D=128, sample_input_H=128, sample_input_W=512)
EPOCHS = 100
OPTIMIZER = torch.optim.SGD(model.parameters(), lr=1e-5, momentum=0.9, weight_decay=1e-3)
SCHEDULER = lr_scheduler.ExponentialLR(OPTIMIZER, gamma=0.99)
LOSS = nn.MSELoss(reduction='sum')

# warm start: restore model and optimizer state from a saved checkpoint.
# map_location='cpu' lets a checkpoint written on a GPU runtime load on any
# machine; load_state_dict then copies the values into the existing parameters.
warm_start = torch.load('models/models/epoch_0_batch_100.pth.tar', map_location='cpu')
model.load_state_dict(warm_start['state_dict'])
OPTIMIZER.load_state_dict(warm_start['optimizer'])

# resume from the epoch recorded in the checkpoint, if there is one
saved_epoch = warm_start.get('epoch')
current_epoch = 0 if saved_epoch is None else saved_epoch

train(dataloader_train=dataloader_train, 
      dataloader_val=dataloader_val,
      model=model, 
      optimizer=OPTIMIZER, 
      scheduler=SCHEDULER, 
      total_epochs=EPOCHS, 
      warm_start_epoch=current_epoch,
      save_interval=25, 
      save_folder='models/models/run_11_28_2021',
      val_interval=10,
      loss=LOSS)    

  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')


100 epochs in total, 150 batches per epoch
Start epoch 0


In [None]:
# Fetch only the first training batch; `i` ends up as the number of batches pulled.
i = 0
for batch_id, batch_data in enumerate(dataloader_train):
    i = batch_id + 1
    # the counter is 1 after the very first batch, so stop right away
    break
    

  cpuset_checked))


In [None]:
import os 
import numpy as np 
import pandas as pd
import torch
import json
import cv2

import matplotlib.pyplot as plt 
from torch.utils.data import Dataset, DataLoader
import torchvision
from tqdm.auto import tqdm

# Pick the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Fixed seed for torch's random number generator.
seed = 12345
torch.manual_seed(seed)

# colab specific imports
from google.colab import drive
# Mount Google Drive at /content/gdrive (remounting if it is already mounted).
drive.mount('/content/gdrive', force_remount=True)

import gspread
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user first; the default application credentials
# obtained below are then handed to gspread.
auth.authenticate_user()
# NOTE(review): the name `gc` also belongs to a standard-library module; fine
# as long as that module is not imported in this notebook.
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Read the first sheet of the 'Monkey Data' spreadsheet into a DataFrame:
# the first row supplies the column names, the remaining rows the records.
worksheet = gc.open('Monkey Data').sheet1
rows = worksheet.get_all_values()
data = pd.DataFrame.from_records(rows[1:])
data.columns = rows[0]

Mounted at /content/gdrive


In [None]:
# Anchor the data root at the Drive mount point ('/content/gdrive') rather than
# at the working directory: the setup cell changes directory into the cloned
# repository, after which a relative 'gdrive/...' path no longer resolves.
root_dir = '/content/gdrive/My Drive/CDS Capstone Project/Data/'
data_dir = os.path.join(root_dir, 'torch_arrays_128')

Build dataloader

In [None]:
from torch_utils import MonkeyEyeballsDataset

In [None]:
# Ids of the scans that exist on disk. Only `<id>.pt` entries are considered,
# and the extension is removed with splitext: str.strip('.pt') removes any of
# the characters '.', 'p', 't' from both ends, not the '.pt' suffix.
torch_present = [
    int(os.path.splitext(s)[0])
    for s in os.listdir(data_dir)
    if s.endswith('.pt')
]
# Sheet cells holding these markers instead of a number are treated as missing.
keywords = ['Pre', 'pre', 'Norm', 'norm']
# Single .loc selection + copy, so the assignments below modify an independent
# frame instead of a chained-indexing slice of `data`.
labels = data.loc[data['id'].astype(int).isin(torch_present), ['id', 'iop', 'icp']].copy()
labels['iop'] = np.where(labels['iop'].isin(keywords), '', labels['iop'])
labels['icp'] = np.where(labels['icp'].isin(keywords), '', labels['icp'])
# Blank / whitespace-only cells become NaN, then both columns are made numeric.
labels = labels.replace(r'^\s*$', np.nan, regex=True)
labels['icp'] = labels['icp'].astype('float')
labels['iop'] = labels['iop'].astype('float')
# Drop every row that is missing either reading.
labels = labels.dropna()

In [None]:
# Dataset over every labelled scan, served in shuffled mini-batches of 8.
med = MonkeyEyeballsDataset(data_dir, labels)
dataloader = DataLoader(
    dataset=med,
    batch_size=8,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

In [None]:
# Draw the 8-row toy subset with the notebook seed so the same samples (and the
# printed ICP values) come back on every run; an unseeded sample() changes the
# subset each time the cell executes.
labels_toy = labels.sample(8, random_state=seed)
print(labels_toy['icp'])
med_toy = MonkeyEyeballsDataset(data_dir, labels_toy)
# One batch holds the whole toy set, in a fixed order.
dataloader_toy = DataLoader(med_toy, batch_size=8, shuffle=False, pin_memory=True)

121     3.0
703    21.0
167    10.0
622    21.0
385     5.0
165    10.0
790    30.0
142    24.0
Name: icp, dtype: float64


Train model

In [None]:
# Build the 3-D ResNet-10 for 128 x 128 x 512 input volumes.
import resnet_for_multimodal_regression as resnet
model = resnet.resnet10(sample_input_D=128,
                 sample_input_H=128,
                 sample_input_W=512)
# NOTE(review): `device` is a torch.device, and comparing it with the string
# 'cuda' looks like it evaluates to False even on a GPU runtime, so the model
# appears to stay on the CPU (the tensors printed by later cells carry no cuda
# device tag). Confirm before changing this: the training loop and the
# prediction cells below feed CPU tensors and would need to move their inputs too.
if device == 'cuda':
  model = model.to(device)

  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')


In [None]:
import torch
import numpy as np
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
import time
from scipy import ndimage
import os
import resnet_for_multimodal_regression as resnet

# Optimisation setup for the runs below: summed squared error, SGD with
# momentum and weight decay, and a 1% multiplicative LR decay per scheduler step.
EPOCHS = 1
LOSS = nn.MSELoss(reduction='sum')
OPTIMIZER = optim.SGD(
    model.parameters(),
    lr=1e-5,
    momentum=0.9,
    weight_decay=1e-3,
)
SCHEDULER = lr_scheduler.ExponentialLR(optimizer=OPTIMIZER, gamma=0.99)

def train(data_loader, model, optimizer, scheduler, total_epochs, save_interval, save_folder, loss):
    """Train ``model`` on ``data_loader`` and checkpoint it periodically.

    Every batch is a mapping holding a 'scan' tensor and an 'icp' tensor. Both
    are cast to float and shifted/scaled with the hard-coded constants below,
    and a channel axis is inserted at dim 1 of the scan before the forward pass.

    Args:
        data_loader: sized iterable of training batches.
        model: network to optimise; batches are moved to the device of its
            first parameter.
        optimizer: optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per epoch.
        total_epochs: number of epochs to run.
        save_interval: write a checkpoint whenever the (non-zero) global batch
            index is a multiple of this value.
        save_folder: directory that receives ``epoch_{e}_batch_{b}.pth.tar``.
        loss: callable ``loss(preds, target)`` returning a scalar tensor.

    Returns:
        None. Progress is printed and checkpoints are written with torch.save.
    """
    # settings
    batches_per_epoch = len(data_loader)
    print('{} epochs in total, {} batches per epoch'.format(total_epochs, batches_per_epoch))

    # Follow the model's own device. A torch.device does not compare equal to
    # the string 'cuda', so a `device == 'cuda'` test never moves anything.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')
    if isinstance(loss, torch.nn.Module):
        loss = loss.to(run_device)

    model.train()
    train_time_sp = time.time()
    for epoch in range(total_epochs):
        print('Start epoch {}'.format(epoch))

        for batch_id, batch_data in enumerate(data_loader):
            # global batch index across epochs
            batch_id_sp = epoch * batches_per_epoch + batch_id
            icp = batch_data['icp'].float().unsqueeze(1).to(run_device)
            scan = batch_data['scan'].float().to(run_device)

            # standardize input
            scan = (scan - 30) / 19
            icp = (icp - 15) / 11

            optimizer.zero_grad()
            # add fake channel dimension as 5-D input is expected
            preds = model(scan.unsqueeze(1))

            # calculating loss
            loss_value = loss(preds, icp)
            loss_value.backward()
            optimizer.step()

            avg_batch_time = (time.time() - train_time_sp) / (1 + batch_id_sp)
            print(
                    'Batch: {}-{} ({}), loss = {:.3f}, avg_batch_time = {:.3f}'\
                    .format(epoch, batch_id, batch_id_sp, loss_value.item(), avg_batch_time))

            # save model
            if batch_id_sp != 0 and batch_id_sp % save_interval == 0:
                model_save_path = os.path.join(save_folder, 'epoch_{}_batch_{}.pth.tar'\
                                               .format(epoch, batch_id))
                os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

                print('Save checkpoints: epoch = {}, batch_id = {}'.format(epoch, batch_id)) 
                torch.save({
                            'epoch': epoch,
                            'batch_id': batch_id,
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()},
                            model_save_path)
        scheduler.step()
        # get_last_lr() is the supported accessor; get_lr() is deprecated for reporting
        print('lr = {}'.format(scheduler.get_last_lr()))

    print('Finished training')            

ModuleNotFoundError: ignored

In [None]:
# Full-data run: 10 epochs, checkpoints written under <root_dir>/models.
train(
    data_loader=dataloader,
    model=model,
    loss=LOSS,
    optimizer=OPTIMIZER,
    scheduler=SCHEDULER,
    total_epochs=10,
    save_interval=100,
    save_folder=os.path.join(root_dir, 'models'),
)

10 epochs in total, 125 batches per epoch
Start epoch 0
Batch: 0-0 (0), loss = 7.256, avg_batch_time = 137.217
Batch: 0-1 (1), loss = 15.114, avg_batch_time = 129.703
Batch: 0-2 (2), loss = 6.264, avg_batch_time = 126.608
Batch: 0-3 (3), loss = 17.440, avg_batch_time = 124.792
Batch: 0-4 (4), loss = 6.694, avg_batch_time = 123.639
Batch: 0-5 (5), loss = 7.626, avg_batch_time = 122.969
Batch: 0-6 (6), loss = 7.398, avg_batch_time = 122.258
Batch: 0-7 (7), loss = 12.130, avg_batch_time = 121.907
Batch: 0-8 (8), loss = 6.731, avg_batch_time = 121.434
Batch: 0-9 (9), loss = 11.450, avg_batch_time = 121.096
Batch: 0-10 (10), loss = 11.122, avg_batch_time = 120.706
Batch: 0-11 (11), loss = 8.668, avg_batch_time = 120.283
Batch: 0-12 (12), loss = 16.955, avg_batch_time = 120.129
Batch: 0-13 (13), loss = 6.261, avg_batch_time = 120.009
Batch: 0-14 (14), loss = 7.751, avg_batch_time = 119.861
Batch: 0-15 (15), loss = 5.457, avg_batch_time = 119.828
Batch: 0-16 (16), loss = 6.804, avg_batch_time



Batch: 1-0 (125), loss = 6.146, avg_batch_time = 118.885
Batch: 1-1 (126), loss = 4.650, avg_batch_time = 118.920
Batch: 1-2 (127), loss = 8.485, avg_batch_time = 118.950
Batch: 1-3 (128), loss = 7.275, avg_batch_time = 118.969
Batch: 1-4 (129), loss = 7.095, avg_batch_time = 118.995
Batch: 1-5 (130), loss = 8.246, avg_batch_time = 119.014
Batch: 1-6 (131), loss = 7.572, avg_batch_time = 119.042
Batch: 1-7 (132), loss = 4.830, avg_batch_time = 119.081
Batch: 1-8 (133), loss = 15.873, avg_batch_time = 119.097
Batch: 1-9 (134), loss = 8.208, avg_batch_time = 119.118
Batch: 1-10 (135), loss = 6.314, avg_batch_time = 119.140
Batch: 1-11 (136), loss = 10.616, avg_batch_time = 119.165
Batch: 1-12 (137), loss = 10.695, avg_batch_time = 119.188
Batch: 1-13 (138), loss = 6.262, avg_batch_time = 119.209
Batch: 1-14 (139), loss = 4.723, avg_batch_time = 119.229
Batch: 1-15 (140), loss = 3.624, avg_batch_time = 119.240
Batch: 1-16 (141), loss = 9.286, avg_batch_time = 119.256
Batch: 1-17 (142), lo

In [None]:
# Second launch of the same full-data configuration (10 epochs, same
# optimizer/scheduler objects, checkpoints under <root_dir>/models).
train(
    data_loader=dataloader,
    model=model,
    loss=LOSS,
    optimizer=OPTIMIZER,
    scheduler=SCHEDULER,
    total_epochs=10,
    save_interval=100,
    save_folder=os.path.join(root_dir, 'models'),
)

10 epochs in total, 125 batches per epoch
Start epoch 0
Batch: 0-0 (0), loss = 3.943, avg_batch_time = 154.437
Batch: 0-1 (0), loss = 2.725, avg_batch_time = 276.230
Batch: 0-2 (0), loss = 11.046, avg_batch_time = 399.578
Batch: 0-3 (0), loss = 9.860, avg_batch_time = 521.917
Batch: 0-4 (0), loss = 12.751, avg_batch_time = 644.145
Batch: 0-5 (0), loss = 7.252, avg_batch_time = 766.314
Batch: 0-6 (0), loss = 7.185, avg_batch_time = 890.304
Batch: 0-7 (0), loss = 7.130, avg_batch_time = 1012.247
Batch: 0-8 (0), loss = 4.522, avg_batch_time = 1134.873
Batch: 0-9 (0), loss = 8.529, avg_batch_time = 1255.715
Batch: 0-10 (0), loss = 5.453, avg_batch_time = 1376.416
Batch: 0-11 (0), loss = 1.983, avg_batch_time = 1496.725
Batch: 0-12 (0), loss = 18.831, avg_batch_time = 1618.205
Batch: 0-13 (0), loss = 6.627, avg_batch_time = 1739.805
Batch: 0-14 (0), loss = 7.120, avg_batch_time = 1864.680
Batch: 0-15 (0), loss = 3.252, avg_batch_time = 1988.287
Batch: 0-16 (0), loss = 15.135, avg_batch_time



Batch: 1-0 (125), loss = 8.970, avg_batch_time = 118.433
Batch: 1-1 (125), loss = 9.017, avg_batch_time = 119.384
Batch: 1-2 (125), loss = 5.375, avg_batch_time = 120.337
Batch: 1-3 (125), loss = 7.075, avg_batch_time = 121.284
Batch: 1-4 (125), loss = 7.481, avg_batch_time = 122.229
Batch: 1-5 (125), loss = 1.734, avg_batch_time = 123.177
Batch: 1-6 (125), loss = 19.267, avg_batch_time = 124.127
Batch: 1-7 (125), loss = 19.820, avg_batch_time = 125.075
Batch: 1-8 (125), loss = 10.516, avg_batch_time = 126.022
Batch: 1-9 (125), loss = 10.060, avg_batch_time = 126.974
Batch: 1-10 (125), loss = 7.029, avg_batch_time = 127.918
Batch: 1-11 (125), loss = 9.782, avg_batch_time = 128.862
Batch: 1-12 (125), loss = 17.294, avg_batch_time = 129.816
Batch: 1-13 (125), loss = 9.519, avg_batch_time = 130.759
Batch: 1-14 (125), loss = 24.435, avg_batch_time = 131.705
Batch: 1-15 (125), loss = 10.206, avg_batch_time = 132.658
Batch: 1-16 (125), loss = 11.990, avg_batch_time = 133.606
Batch: 1-17 (125

KeyboardInterrupt: ignored

In [None]:
def lmbda(epoch):
    """Multiplicative LR factor: always 1.00, i.e. the learning rate is left unchanged."""
    return 1.00

# Train on the toy loader for up to 1000 epochs with a constant learning rate.
scheduler_toy = lr_scheduler.MultiplicativeLR(OPTIMIZER, lr_lambda=lmbda)
train(
    data_loader=dataloader_toy,
    model=model,
    loss=LOSS,
    optimizer=OPTIMIZER,
    scheduler=scheduler_toy,
    total_epochs=1000,
    save_interval=100,
    save_folder=os.path.join(root_dir, 'models'),
)

1000 epochs in total, 1 batches per epoch
Start epoch 0
Batch: 0-0 (0), loss = 6.230, avg_batch_time = 140.363
lr = [1e-05]
Start epoch 1




Batch: 1-0 (1), loss = 6.179, avg_batch_time = 133.846
lr = [1e-05]
Start epoch 2
Batch: 2-0 (2), loss = 6.086, avg_batch_time = 131.606
lr = [1e-05]
Start epoch 3
Batch: 3-0 (3), loss = 5.963, avg_batch_time = 130.472
lr = [1e-05]
Start epoch 4
Batch: 4-0 (4), loss = 5.820, avg_batch_time = 129.705
lr = [1e-05]
Start epoch 5
Batch: 5-0 (5), loss = 5.673, avg_batch_time = 129.202
lr = [1e-05]
Start epoch 6
Batch: 6-0 (6), loss = 5.531, avg_batch_time = 128.844
lr = [1e-05]
Start epoch 7
Batch: 7-0 (7), loss = 5.401, avg_batch_time = 128.569
lr = [1e-05]
Start epoch 8
Batch: 8-0 (8), loss = 5.288, avg_batch_time = 128.373
lr = [1e-05]
Start epoch 9
Batch: 9-0 (9), loss = 5.195, avg_batch_time = 128.223
lr = [1e-05]
Start epoch 10
Batch: 10-0 (10), loss = 5.124, avg_batch_time = 128.019
lr = [1e-05]
Start epoch 11
Batch: 11-0 (11), loss = 5.073, avg_batch_time = 127.770
lr = [1e-05]
Start epoch 12
Batch: 12-0 (12), loss = 5.039, avg_batch_time = 127.173
lr = [1e-05]
Start epoch 13
Batch:

KeyboardInterrupt: ignored

In [None]:
# Predict for the first sample with the same input standardization the training
# loop applies ((scan - 30) / 19); feeding the raw scan gives the network
# inputs on a different scale than it was trained on. Eval mode and no_grad
# keep this a pure inference call; the previous train/eval mode is restored.
was_training = model.training
model.eval()
with torch.no_grad():
    sample_scan = (med[0]['scan'].float() - 30) / 19
    # add batch and channel axes: the network expects 5-D input
    sample_pred = model(sample_scan.unsqueeze(0).unsqueeze(0))
model.train(was_training)
sample_pred

tensor([[-0.1229]], grad_fn=<AddmmBackward0>)

In [None]:
# ICP label of the first sample after the same (x - 15) / 11 scaling the
# training loop applies to its targets, for comparison with the prediction.
(med[0]['icp'] - 15) / 11

-0.5909090909090909

In [None]:
# Freshly initialised network as a baseline for the trained model's prediction.
model2 = resnet.resnet10(sample_input_D=128,
                 sample_input_H=128,
                 sample_input_W=512)
# Use the training-time input standardization ((scan - 30) / 19) and a pure
# inference call (eval mode, no autograd graph) so the baseline sees the same
# input scale as the network does during training.
model2.eval()
with torch.no_grad():
    baseline_scan = (med[0]['scan'].float() - 30) / 19
    # add batch and channel axes: the network expects 5-D input
    baseline_pred = model2(baseline_scan.unsqueeze(0).unsqueeze(0))
baseline_pred

  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')


tensor([[0.0331]], grad_fn=<AddmmBackward0>)

In [None]:
# Sanity check of the loss on scalars: (32 - 1) ** 2 = 961.
LOSS(torch.tensor(32.), torch.tensor(1.))

tensor(961.)

In [None]:
# Re-check the prediction for the first sample, using the training-time input
# standardization ((scan - 30) / 19) in eval mode and without autograd; the
# previous train/eval mode is restored afterwards.
was_training = model.training
model.eval()
with torch.no_grad():
    sample_scan = (med[0]['scan'].float() - 30) / 19
    # add batch and channel axes: the network expects 5-D input
    sample_pred = model(sample_scan.unsqueeze(0).unsqueeze(0))
model.train(was_training)
sample_pred

tensor([[-18131.1230]], grad_fn=<AddmmBackward0>)

In [None]:
import torch 
import torchvision
import torch.nn as nn 
from IPython.display import Image 
from torchvision import transforms
import matplotlib.pyplot as plt
import seaborn as sns
import random
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
# Pick the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed both Python's and torch's random number generators.
seed = 12345
random.seed(seed)
torch.manual_seed(seed)

# Plot defaults: inline rendering, seaborn whitegrid style, 12x7 figures.
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 7)

# NOTE(review): Drive is mounted at /content/drive here, while the cells above
# mount it at /content/gdrive - confirm which mount point is intended.
from google.colab import drive
drive.mount('/content/drive')

# CIFAR-10 train and test sets, downloaded into ./cifar10 when missing.
# No transform is attached here (transform/target_transform stay at their
# None defaults).
cifar10_train = torchvision.datasets.CIFAR10(root='./cifar10', train=True, download=True)
cifar10_test = torchvision.datasets.CIFAR10(root='./cifar10', train=False, download=True)

# Divides the dataset into train and val so that we can use the val to choose our hyperparameters
split_generator = torch.Generator().manual_seed(12345)
train_dataset, val_dataset = torch.utils.data.random_split(
    cifar10_train,
    [40000, 10000],
    generator=split_generator,
)
test_dataset = cifar10_test

class MapDataset(torch.utils.data.Dataset):
    """
    Given a dataset, creates a dataset which applies a mapping function
    to its items (lazily, only when an item is called).

    Items of the wrapped dataset are indexable pairs: element 0 is the input
    that gets mapped, element 1 is the target that is returned untouched.

    Note that data is not cloned/copied from the initial dataset.
    """

    def __init__(self, dataset, map_fn):
        """Wrap ``dataset``; ``map_fn`` may be falsy (e.g. None) to disable mapping."""
        self.dataset = dataset
        self.map = map_fn

    def __getitem__(self, index):
        """Return ``(map_fn(x), y)`` for the item at ``index`` (``(x, y)`` without a map)."""
        # Index the wrapped dataset once: each lookup may redo loading work,
        # so fetching the item separately for x and for y doubles the cost.
        item = self.dataset[index]
        x, y = item[0], item[1]
        if self.map:
            x = self.map(x)
        return x, y

    def __len__(self):
        """Number of items, identical to the wrapped dataset."""
        return len(self.dataset)

# Notice that we apply the same mean and std normalization calculated on train, to both the train and test datasets.
# Evaluation pipeline: PIL image -> tensor -> per-channel normalization.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4373, 0.4434, 0.4725],
                         std=[0.1201, 0.1231, 0.1052]),
])

# Training pipeline: currently the same steps and statistics as evaluation.
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4373, 0.4434, 0.4725],
                         std=[0.1201, 0.1231, 0.1052]),
])

# Attach the transforms lazily to each split.
train_dataset_w_transform = MapDataset(train_dataset, train_transform)
val_dataset_w_transform = MapDataset(val_dataset, test_transform)
test_dataset_w_transform = MapDataset(test_dataset, test_transform)

bs = 128
torch.backends.cudnn.benchmark = True
# Only the training loader shuffles; all three keep the last partial batch.
train_loader = DataLoader(
    train_dataset_w_transform,
    batch_size=bs,
    shuffle=True,
    num_workers=10,
    pin_memory=True,
    drop_last=False,
)
val_loader = DataLoader(
    val_dataset_w_transform,
    batch_size=bs,
    shuffle=False,
    num_workers=10,
    pin_memory=True,
    drop_last=False,
)
test_loader = DataLoader(
    test_dataset_w_transform,
    batch_size=bs,
    shuffle=False,
    num_workers=10,
    pin_memory=True,
    drop_last=False,
)

def _epoch_pass(model, criterion, loader, device, optimizer=None):
    """Run one pass over ``loader``; weights are updated only when ``optimizer`` is given.

    Returns ``(mean per-sample loss, accuracy)`` over the samples seen.
    """
    is_training = optimizer is not None
    # Losses that already sum over the batch must not be re-weighted below.
    sums_over_batch = getattr(criterion, 'reduction', 'mean') == 'sum'
    total_loss, n_correct, n_seen = 0.0, 0, 0

    with torch.set_grad_enabled(is_training):
        for samples in loader:
            image = samples[0].to(device)
            label = samples[1].to(device)
            output = model(image)
            _, preds = torch.max(output, dim=1)
            loss = criterion(output, label)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = image.shape[0]
            # A batch-mean loss is weighted by its batch size so that dividing
            # by the sample count yields a true per-sample average; summing raw
            # batch means would shrink the figure by roughly the batch size.
            total_loss += loss.item() if sums_over_batch else loss.item() * batch_size
            n_seen += batch_size
            n_correct += torch.sum(preds == label).item()

    n_seen = max(n_seen, 1)  # an empty loader reports 0.0 instead of dividing by zero
    return total_loss / n_seen, n_correct / n_seen


def train_loop(model, criterion, optimizer,  train_loader, val_loader, device,
               num_epochs=50, max_patience=5, checkpoint_path='checkpoint.pth'):
    """
    Generic training loop with early stopping on validation accuracy

    Parameters
    ----------
    model : Object instance of your model class 
    criterion : Loss function 
    optimizer : Instance of optimizer class of your choice 
    train_loader : Training data loader 
    val_loader : Validation data loader
    device : Device the input and target batches are moved to
    num_epochs : Maximum number of epochs to run
    max_patience : Training stops once the validation accuracy has failed to
        improve for more than this many consecutive epochs
    checkpoint_path : File that receives the best model/optimizer/epoch

    Returns
    -------
    train_losses : List with mean per-sample train loss per epoch
    train_accuracies : List with train accuracy on dataset per epoch
    val_losses : List with mean per-sample validation loss per epoch
    val_accuracies : List with validation accuracy on dataset per epoch

    """
    best_val = 0.0
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    patience_counter = 0

    for t in tqdm(range(num_epochs)):
        # training pass (updates the weights)
        model.train()
        epoch_loss, epoch_acc = _epoch_pass(model, criterion, train_loader, device, optimizer)
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_acc)

        # validation pass (no gradients, no updates)
        model.eval()
        epoch_val_loss, epoch_val_acc = _epoch_pass(model, criterion, val_loader, device)
        val_losses.append(epoch_val_loss)
        val_accuracies.append(epoch_val_acc)

        # Report before the early-stopping check so the final epoch's metrics
        # are printed as well.
        print("[EPOCH]: %i, [TRAIN LOSS]: %.6f, [TRAIN ACCURACY]: %.3f" % (t, train_losses[-1], train_accuracies[-1]))
        print("[EPOCH]: %i, [VAL LOSS]: %.6f, [VAL ACCURACY]: %.3f \n" % (t, val_losses[-1] ,val_accuracies[-1]))

        if val_accuracies[-1] > best_val:
            best_val = val_accuracies[-1]
            patience_counter = 0
            # keep the best model, optimizer state and epoch number
            checkpoint = {
                'model':model.state_dict(),
                'epoch':t,
                'optimizer':optimizer.state_dict()
            }
            torch.save(checkpoint, checkpoint_path)
        else:
            patience_counter += 1    
            if patience_counter > max_patience: 
                break

    return train_losses, train_accuracies, val_losses, val_accuracies

class View(nn.Module):
    """Layer that reshapes its input to a fixed target shape via ``Tensor.view``."""

    def __init__(self, shape):
        super(View, self).__init__()
        # target dimensions, unpacked into view() on every forward call
        self.shape = shape

    def forward(self, x):
        target_dims = self.shape
        return x.view(*target_dims)

# Stand-alone Sequential version of the shallow CNN: three conv/ReLU/max-pool
# stages, a reshape to 256 features per row, and a 10-way linear layer.
ShallowNet = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(64, 128, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(128, 256, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(8),
    View((-1, 256)),
    nn.Linear(in_features=256, out_features=10),
)

class ShallowConvnet(nn.Module):
    """Shallow CNN: three conv/ReLU/max-pool stages followed by a linear classifier."""

    def __init__(self, input_channels, num_classes):
        """
        Build the layer stack.

        Parameters
        ----------
        input_channels : Number of input channels
        num_classes : Number of classes for the final prediction 
        """
        super().__init__()
        # The pools shrink each spatial side by 2 * 2 * 8 = 32, and the reshape
        # keeps 256 features per row - this presumably expects 32x32 inputs
        # (confirm against the data fed in).
        layers = [
            nn.Conv2d(input_channels, 64, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(8),
            View((-1, 256)),
            nn.Linear(in_features=256, out_features=num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """
        Run a batch through the layer stack.

        Parameters
        ----------
        x : Input batch

        Returns
        -------
        output : Result after running through the model
        """
        output = self.model(x)
        return output

# TODO : Initialize the model and cast to correct device
# Build the 3-channel, 10-class network and place it on the selected device.
model = ShallowConvnet(num_classes=10, input_channels=3).to(device)
# Cross-entropy over the class logits.
criterion = nn.CrossEntropyLoss()
# Plain SGD with learning rate 1e-3.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

# Train, keeping the per-epoch loss and accuracy histories.
loss_train, acc_train, loss_val, acc_val = train_loop(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
)

Mounted at /content/drive
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar10/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./cifar10/cifar-10-python.tar.gz to ./cifar10
Files already downloaded and verified


  cpuset_checked))


  0%|          | 0/50 [00:00<?, ?it/s]

[EPOCH]: 0, [TRAIN LOSS]: 0.017868, [TRAIN ACCURACY]: 0.133
[EPOCH]: 0, [VAL LOSS]: 0.017815, [VAL ACCURACY]: 0.161 

[EPOCH]: 1, [TRAIN LOSS]: 0.017435, [TRAIN ACCURACY]: 0.214
[EPOCH]: 1, [VAL LOSS]: 0.017377, [VAL ACCURACY]: 0.247 

[EPOCH]: 2, [TRAIN LOSS]: 0.016973, [TRAIN ACCURACY]: 0.260
[EPOCH]: 2, [VAL LOSS]: 0.016897, [VAL ACCURACY]: 0.261 

[EPOCH]: 3, [TRAIN LOSS]: 0.016488, [TRAIN ACCURACY]: 0.277
[EPOCH]: 3, [VAL LOSS]: 0.016421, [VAL ACCURACY]: 0.284 

[EPOCH]: 4, [TRAIN LOSS]: 0.016030, [TRAIN ACCURACY]: 0.292
[EPOCH]: 4, [VAL LOSS]: 0.015975, [VAL ACCURACY]: 0.296 

[EPOCH]: 5, [TRAIN LOSS]: 0.015633, [TRAIN ACCURACY]: 0.305
[EPOCH]: 5, [VAL LOSS]: 0.015607, [VAL ACCURACY]: 0.308 

[EPOCH]: 6, [TRAIN LOSS]: 0.015302, [TRAIN ACCURACY]: 0.317
[EPOCH]: 6, [VAL LOSS]: 0.015293, [VAL ACCURACY]: 0.316 

[EPOCH]: 7, [TRAIN LOSS]: 0.015010, [TRAIN ACCURACY]: 0.327
[EPOCH]: 7, [VAL LOSS]: 0.015011, [VAL ACCURACY]: 0.324 



KeyboardInterrupt: ignored

In [None]:
# memory footprint support libraries/code
# Expose nvidia-smi on the PATH, then install the helper packages used below.
# NOTE(review): the pip installs are unpinned, so versions may differ between runs.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
# NOTE(review): indexing [0] raises IndexError when the list above is empty.
gpu = GPUs[0]
def printm():
 """Print free system RAM, this process's resident size, and the memory stats of `gpu`."""
 # handle on the current (notebook kernel) process
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 # figures come from the module-level `gpu` object captured when this cell ran
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7411 sha256=61a3a59f66b8151048a6cd17fcafa54a158b57f841edc22e3c1d1aca2c53e458
  Stored in directory: /root/.cache/pip/wheels/6e/f8/83/534c52482d6da64622ddbf72cd93c35d2ef2881b78fd08ff0c
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 25.6 GB  | Proc size: 10.8 GB
GPU RAM Free: 14907MB | Used: 1373MB | Util   8% | Total 16280MB


In [None]:
# Report the name of CUDA device 0 and torch's memory use on it, in GB.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # memory_reserved() is the current name of the deprecated memory_cached()
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.3 GB




In [None]:
# Import packages
import os,sys,humanize,psutil,GPUtil

def mem_report():
  """Print the free system RAM, then one line per GPU listed by GPUtil."""
  free_ram = humanize.naturalsize( psutil.virtual_memory().available )
  print("CPU RAM Free: " + free_ram)

  for idx, card in enumerate(GPUtil.getGPUs()):
    line = 'GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'.format(
        idx, card.memoryFree, card.memoryTotal, card.memoryUtil*100)
    print(line)

# Execute function
mem_report()

CPU RAM Free: 25.6 GB
GPU 0 ... Mem Free: 14907MB / 16280MB | Utilization   8%


In [None]:
import torchvision.models as models

# Load the pretrained Wide-ResNet-50-2, move it to the GPU when one is
# available, then print the memory report again.
wide_resnet50_2 = models.wide_resnet50_2(pretrained=True)
wide_resnet50_2 = wide_resnet50_2.cuda() if torch.cuda.is_available() else wide_resnet50_2

mem_report()

CPU RAM Free: 25.6 GB
GPU 0 ... Mem Free: 14901MB / 16280MB | Utilization   8%
