In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
import torchvision
import torchvision.transforms as transforms

torch.set_printoptions(linewidth = 120) 
torch.set_grad_enabled(True) 

from itertools import product #computes cartesian product given multiple list inputs  
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter #allows to send data to tensorboard files 
from IPython.display import display, clear_output

import pandas as pd
import time
import json

from collections import OrderedDict
from collections import namedtuple
from itertools import product

In [3]:
class RunBuilder():
    @staticmethod #The main thing to note about using this class is that it has a static method called get_runs(). This method will get the runs for us that it builds based on the parameters we pass in.
    def get_runs(params):

        Run = namedtuple('Run', params.keys())

        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))

        return runs
        

In [4]:
def get_num_correct(preds, labels):
    return preds.argmax(dim=1).eq(labels).sum().item()

In [5]:
class Network(nn.Module):
        
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)


    def forward(self, t):
        
        # (1) input layer
        t = t
        
        # (2) hidden conv layer
        t = self.conv1(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size = 2, stride =2)

        # (3) hidden conv layer
        t = self.conv2(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size = 2, stride =2)

        # (4) hidden linear layer
        t = t.reshape(-1, 12*4*4)
        t = self.fc1(t)
        t = F.relu(t)
       
        # (5) hidden linear layer
        t = self.fc2(t)
        t = F.relu(t) 

        # (6) ouput layer
        t = self.out(t)
        return t

In [6]:
train_set = torchvision.datasets.FashionMNIST(
    root='./data'  
    ,train=True    
    ,download=True 
    ,transform=transforms.Compose([
        transforms.ToTensor()
    ])
)

In [7]:
train_loader = torch.utils.data.DataLoader(train_set, batch_size=100, shuffle = True)


RunManager helps us to pull out the following clutter of tensorboard calls and also add additional functionality


            tb.add_scalar('Loss', total_loss, epoch)
            tb.add_scalar('Number Correct', total_correct, epoch)
            tb.add_scalar('Accuracy', total_correct/len(train_set), epoch)

        for name,weight in network.named_parameters():
  #
            tb.add_histogram(name, weight, epoch)
   #
            tb.add_histogram(f'{name}.grad', weight.grad, epoch)


In [8]:
#Extract a class - refactoring techniques 

class Epoch():
    def __init__(self):
        self.count = 0
        self.loss = 0
        self.num_correct = 0
        self.start_time = None 

#Then, we'll replace these class variable with an instance of the Epoch class.
#We might even change the count variable to have a more intuitive name, like say number or id.

params = OrderedDict(
    lr = [.01]
    ,batch_size = [1000, 10000]   
)
m = RunManager()

for run in RunBuilder.get_runs(params):
    
    network = Network()
    train_loader = torch.utils.data.DataLoader(train_set, batch_size = run.batch_size)
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, train_loader)

    for epoch in range(2):
        m.begin_epoch()
        for batch in train_loader:

                images, labels = batch 
                preds = network(images) 
                loss = F.cross_entropy(preds, labels) 
                optimizer.zero_grad()               
                loss.backward()          
                optimizer.step()
                
                m.track_loss(loss)
                m.track_num_correct(preds, labels)

        m.end_epoch()
    m.end_run()

m.save('results')

# Pytorch and the GPU : CUDA

Moving to GPU

In [9]:
t = torch.ones(1,1,28,28)
network = Network()

In [10]:
t = t.cuda()
newtork = network.cuda()

In [11]:
gpu_pred = network(t)
gpu_pred.device

device(type='cuda', index=0)

Moving to CPU

In [12]:
t = t.cpu()
network = network.cpu()


In [13]:
cpu_pred = network(t)
cpu_pred.device

device(type='cpu')

Working with tensors 

In [14]:
t1 = torch.tensor([[1,2],[3,4]])

t2= torch.tensor([[5,6],[7,8]])


In [15]:
t1.device, t2.device

(device(type='cpu'), device(type='cpu'))

In [16]:
t1 = t1.to('cuda')
t1.device

device(type='cuda', index=0)

In [17]:
try : t1+t2
except Exception as e: print(e)

Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!


In [18]:
try : t2+t1
except Exception as e: print(e)

Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!


In [19]:
t2 = t2.to('cuda')

In [20]:
t1+t2

tensor([[ 6,  8],
        [10, 12]], device='cuda:0')

Working with NN modules 

what is putting network on a device ?

In [21]:
for name, param in network.named_parameters():
    print(name, '\t\t\t', param.shape)

conv1.weight 			 torch.Size([6, 1, 5, 5])
conv1.bias 			 torch.Size([6])
conv2.weight 			 torch.Size([12, 6, 5, 5])
conv2.bias 			 torch.Size([12])
fc1.weight 			 torch.Size([120, 192])
fc1.bias 			 torch.Size([120])
fc2.weight 			 torch.Size([60, 120])
fc2.bias 			 torch.Size([60])
out.weight 			 torch.Size([10, 60])
out.bias 			 torch.Size([10])


In [22]:
for name, p in network.named_parameters():
    print(p.device,'',name)

cpu  conv1.weight
cpu  conv1.bias
cpu  conv2.weight
cpu  conv2.bias
cpu  fc1.weight
cpu  fc1.bias
cpu  fc2.weight
cpu  fc2.bias
cpu  out.weight
cpu  out.bias


In [23]:
#network isnt on device, the paramters are on the device 

In [24]:
network.to('cpu')

Network(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (out): Linear(in_features=60, out_features=10, bias=True)
)

In [25]:
for name, p in network.named_parameters():
    print(p.device,'',name)

cpu  conv1.weight
cpu  conv1.bias
cpu  conv2.weight
cpu  conv2.bias
cpu  fc1.weight
cpu  fc1.bias
cpu  fc2.weight
cpu  fc2.bias
cpu  out.weight
cpu  out.bias


In [26]:
network.to('cuda')

Network(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (out): Linear(in_features=60, out_features=10, bias=True)
)

In [27]:
for name, p in network.named_parameters():
    print(p.device,'',name)

cuda:0  conv1.weight
cuda:0  conv1.bias
cuda:0  conv2.weight
cuda:0conv2.bias
cuda:0  fc1.weight
cuda:0  fc1.bias
cuda:0  fc2.weight
cuda:0  fc2.bias
cuda:0  out.weight
cuda:0  out.bias


In [28]:
sample = torch.ones(1,1,28,28)
sample.shape

torch.Size([1, 1, 28, 28])

In [29]:
try : network(sample)
except Exception as e : print(e)

Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same


In [30]:
try:
    pred = network(sample.to('cuda'))
    print(pred)
except Exception as e:
        print(e)

tensor([[ 0.0937,  0.0243,  0.0549,  0.1752, -0.0259,  0.0533, -0.1197, -0.0053,  0.1205,  0.0757]], device='cuda:0',
       grad_fn=<AddmmBackward>)


Checking for GPU

In [31]:
torch.cuda.is_available()

True

In [33]:
class RunManager():

    def __init__(self):

        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        self.run_params = None 
        self.run_count = 0
        self.run_data = []  #track param,results for each epoch for each run
        self.run_start_time = None  #run durations

        self.network = None #save network
        self.loader = None #save data loader for run
        self.tb = None #summarywriter for tensorboard


#Anytime we see this, we need to be thinking about removing these prefixes. (epoch_count, epoch_loss, ....)
# Data that belongs together should be together. 
# This is done by encapsulating the data inside of a class.
#this is done in next cell

    def begin_run(self, run, network, loader):

        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')

        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images', grid)
        self.tb.add_graph(self.network, images.to(getattr(run, 'device', 'cpu')))     ####### getattr is modified - check which device the run for 

    def end_run(self):
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        loss = self.epoch_loss / len(self.loader.dataset)
        accuracy = self.epoch_num_correct / len(self.loader.dataset)

        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

        results = OrderedDict()
        results['run'] = self.run_count
        results['epoch'] = self.epoch_count
        results['loss'] = loss
        results['accuracy'] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        for k,v in self.run_params._asdict().items(): results[k] = v
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')
        
        clear_output(wait=True) #specific to jy notebooks clear curr o/p and display new data frame
        display(df)
    
    def track_loss(self, loss):
        self.epoch_loss += loss.item() * batch[0].shape[0]

    def track_num_correct(self, preds, labels):
        self.epoch_num_correct += self._get_num_correct(preds, labels)
    
    def _get_num_correct(self, preds, labels):
        return preds.argmax(dim=1).eq(labels).sum().item()

    @torch.no_grad()

    def _get_num_correct(self, preds, labels):
        return preds.argmax(dim=1).eq(labels).sum().item()
    
    def save(self, fileName):

        pd.DataFrame.from_dict(self.run_data, orient='columns').to_csv(f'{fileName}.csv')
        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:json.dump(self.run_data, f, ensure_ascii=False, indent=4)


# Using the GPU : Test

In [34]:
params = OrderedDict(
    lr = [.01]
    ,batch_size = [1000, 10000, 20000]
    ,num_workers = [0,1]
    ,device = ['cuda', 'cpu']   
)
m = RunManager()

for run in RunBuilder.get_runs(params):
    
    device = torch.device(run.device) ## create a pytroch device and pass cpu or cuda
    network = Network().to(device) #initialize network on cuda
    train_loader = torch.utils.data.DataLoader(train_set, batch_size = run.batch_size)
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, train_loader)

    for epoch in range(1):
        m.begin_epoch()
        for batch in train_loader:

                images = batch[0].to(device) #unpack them & send to device 
                labels = batch[1].to(device) #unpack them & send to device
                preds = network(images) 
                loss = F.cross_entropy(preds, labels) 
                optimizer.zero_grad()               
                loss.backward()          
                optimizer.step()
                
                m.track_loss(loss)
                m.track_num_correct(preds, labels)

        m.end_epoch()
    m.end_run()

m.save('results')

TypeError: track_loss() takes 2 positional arguments but 3 were given

In [None]:
pd.DataFrame.from_dict(m.run_data, orient='columns').sort_values('epoch duration') #sorting by run duration 