In [1]:
## Fast Import
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys, os
import string
import pathlib
import time
import math, random
import pprint
import yaml
from collections import OrderedDict
from tqdm import tqdm

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

import torch, torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import albumentations as A

np.set_printoptions(precision=3)
curr_path = pathlib.Path(os.getcwd()).absolute()

# GPU card id(s) for this job, read from the SGE_HGR_gpu_card environment
# variable (presumably exported by the SGE scheduler -- confirm on the cluster).
# Reading os.environ instead of shelling out keeps this cell plain Python, and
# the fallback to card "0" lets the notebook run outside the scheduler, where
# the variable is unset and "cuda:" would be an invalid device string.
# `cards` stays a list of strings because mem() indexes it as cards[0].
cards = os.environ.get('SGE_HGR_gpu_card', '').split() or ['0']
device = torch.device(f"cuda:{cards[0]}" if torch.cuda.is_available() else 'cpu')
print(device)

cuda:2


In [2]:
class EpochMeters:
    """Accumulates per-iteration metrics and reports their running averages.

    Attributes:
        accums: dict mapping metric name -> sum of every value passed to
            ``update`` for that name.
        ns: dict mapping metric name -> sum of the ``n`` weights passed
            alongside those values.
    """

    def __init__(self):
        self.accums = {}
        self.ns = {}

    def update(self, metrics_d, n=1):
        """Add one iteration's metrics to the running totals.

        Args:
            metrics_d: dict of metric name -> value. Values only need to
                support ``+`` and ``/`` (numbers, NumPy arrays, tensors).
            n: weight added to each metric's count (e.g. the batch size).
                Values are summed as given; they are not multiplied by ``n``.
        """
        for k, item in metrics_d.items():
            if k in self.accums:
                # Out-of-place add: `+=` would mutate the object stored on the
                # first update, which is the caller's own array/tensor.
                self.accums[k] = self.accums[k] + item
                self.ns[k] = self.ns[k] + n
            else:
                self.accums[k] = item
                self.ns[k] = n

    def avg(self, no_avg=()):
        """Return a dict of metric name -> accumulated value divided by its count.

        Args:
            no_avg: container of metric names to return as raw sums instead
                of averages.
        """
        return {
            k: (v if k in no_avg else v / self.ns[k])
            for k, v in self.accums.items()
        }

def mem():
    """Return the memory in use on the primary GPU card, or -1. without CUDA.

    Runs ``nvidia-smi --query-gpu=memory.used`` and picks the entry whose
    position in the output equals ``int(cards[0])`` (the notebook-level card
    list). The raw number is divided by 1000; callers label the result "GB".
    NOTE(review): nvidia-smi presumably reports MiB here, which would make
    this an approximate GB figure -- confirm.
    """
    if torch.cuda.is_available():
        import subprocess
        query = [
            'nvidia-smi', '--query-gpu=memory.used',
            '--format=csv,nounits,noheader'
        ]
        raw = subprocess.check_output(query, encoding='utf-8')
        # One output line per card; key each reading by its line position.
        used_by_card = {
            idx: int(line)
            for idx, line in enumerate(raw.strip().split('\n'))
        }
        return used_by_card[int(cards[0])] / 1000
    return -1.

In [3]:
# --- Data -------------------------------------------------------------------
data_dir = '/afs/crc.nd.edu/user/y/yzhang46/datasets/CIFAR-10/torch'
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# Per-channel mean / std handed to T.Normalize for both splits.
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# Training split: random crop + horizontal flip, then tensor + normalize.
train_transform = T.Compose([
    T.RandomCrop(32, padding=4),
    T.RandomHorizontalFlip(p=0.5),
    # Disabled colour augmentation, kept for reference:
    # T.RandomChoice([T.ColorJitter(0.2, 0.2, 0.2),
    #                 T.RandomGrayscale(p=0.4)]),
    T.ToTensor(),
    T.Normalize(norm_mean, norm_std),
])
train_set = torchvision.datasets.CIFAR10(
    root=data_dir, train=True, download=True, transform=train_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=256, shuffle=True, num_workers=4)

# Test split: no augmentation, same tensor conversion and normalization.
test_transform = T.Compose([
    T.ToTensor(),
    T.Normalize(norm_mean, norm_std),
])
test_set = torchvision.datasets.CIFAR10(
    root=data_dir, train=False, download=True, transform=test_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=256, shuffle=False, num_workers=4)

# --- Model ------------------------------------------------------------------
# Earlier alternative (torchvision ResNet-50 with a 10-way head):
# model = torchvision.models.resnet50(pretrained=False)
# model.fc = nn.Linear(model.fc.in_features, 10, bias=True)
import resnet
model = resnet.ResNet50().to(device)

# --- Optimisation -----------------------------------------------------------
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
# T_max=200 matches the 200 iterations of the epoch loop in the training cell.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
def _imshow(img):
    """Debug helper: show a CHW image tensor with matplotlib.

    Applies ``img / 2 + 0.5`` before plotting, i.e. it undoes a
    mean=0.5 / std=0.5 normalization.
    NOTE(review): the loaders in this notebook normalize with different
    per-channel statistics, so colours will look off unless this is adapted.
    """
    plt.figure(figsize=(10, 30))
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()


def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and report accuracy.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. The caller is responsible for ``model.train()`` /
    ``model.eval()``.

    Args:
        loader: iterable of batches where ``batch[0]`` is the input tensor
            and ``batch[1]`` the integer class labels. ``len(loader)`` is
            only needed when ``train`` is true (progress printing).
        train: when true, back-propagates and steps the optimizer on every
            batch and prints progress every 50 iterations. When false, the
            forward pass runs with gradients disabled.

    Returns:
        dict with ``'loss'`` (sample-weighted mean of the per-batch loss,
        assuming ``criterion`` returns a batch mean), ``'acc'`` (percent
        correct), ``'correct'`` and ``'total'`` (ints). All are zero for an
        empty loader.
    """
    tot, corr, loss_sum = 0, 0, 0.
    for it, batch in enumerate(loader):
        X = batch[0].to(device)
        Y = batch[1].long().to(device, non_blocking=True)
        # _imshow(torchvision.utils.make_grid(X.cpu()))

        # Set grad mode from `train` so evaluation never builds a graph,
        # even if the caller forgets torch.no_grad().
        with torch.set_grad_enabled(train):
            out = model(X)
            loss = criterion(out, Y)

        if train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        batch_tot = X.shape[0]
        batch_corr = (out.argmax(dim=1) == Y).sum()
        # Keep the running sums as tensors; they are converted to Python
        # numbers once per epoch (and when printing) rather than per batch.
        corr += batch_corr
        tot += batch_tot
        loss_sum += loss.detach() * batch_tot

        if train and it % 50 == 0:
            print(
                f"    Iter {it+1}/{len(loader)} ({mem():.1f} GB) - "
                f"loss {loss.item():.3f}, acc = {int(batch_corr)}/{batch_tot}"
            )

    corr = int(corr)
    # Guard the empty-loader case instead of dividing by zero.
    acc = 100 * corr / tot if tot else 0.
    avg_loss = float(loss_sum) / tot if tot else 0.
    print(f"** Epoch Summary: acc = {corr}/{tot} {acc:.2f}%")
    return {'loss': avg_loss, 'acc': acc, 'correct': corr, 'total': tot}

# Main loop: one training pass and one evaluation pass per epoch, with the
# learning-rate schedule advanced after training and the weights checkpointed
# (overwriting the same file) at the end of every epoch.
for epoch in range(200):
    current_lr = optimizer.param_groups[0]['lr']
    print("\n=============================")
    print(f"Starting Epoch {epoch+1} (lr: {current_lr:.7f})")
    print("=============================")

    # --- train ---
    print('Training..')
    model.train()
    run_epoch(train_loader)
    scheduler.step()

    # --- evaluate (no gradients) ---
    print('Testing..')
    model.eval()
    with torch.no_grad():
        run_epoch(test_loader, train=False)

    weights = model.state_dict()
    torch.save(weights, 'resnet50_cifar10.pth')

In [12]:
"""
- 200 epochs, +color/gray, 0.5 norm, pretrain

- 200 epochs, no color, reg norm, pretrain
   SGD(lr=0.05)
    Train 99.81%, Test 89.12%
   SGD(lr=0.1) + Cosine Scheduler
    Train 99.7%, Test 89.49%
   SGD(lr=0.1) + Cosine Scheduler (random init)
    Train 99.8%, Test 88.4%
- 200 epochs, no color, reg norm, random init, new resnet
   SGD(lr=0.1) + Cosine Scheduler (random init)
    Train 100%, Test 94.99%
"""

# Parameter counts: a stock torchvision ResNet-50 first, then the notebook's model.
rnet = torchvision.models.resnet50()
for net in (rnet, model):
    print(sum(p.numel() for p in net.parameters()))

### Quick Eval
# Optional: reload the checkpoint written by the training loop first.
# model.load_state_dict(torch.load('resnet50_cifar10.pth'))
# model.eval()
with torch.no_grad():
    # Single forward pass on the first test batch; the output is discarded.
    # NOTE(review): the tensor shapes in the recorded cell output are
    # presumably printed from inside the model's forward -- not visible here.
    batch = next(iter(test_loader))
    X = batch[0].to(device)
    model(X)
    
    

25557032
23520842
torch.Size([256, 64, 32, 32])
torch.Size([256, 2048, 4, 4])
torch.Size([256, 2048, 1, 1])
torch.Size([256, 2048])
