In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as functional

import torchvision 
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from IPython.display import display,clear_output
import pandas as pd
import time
import json

from itertools import product
from collections import OrderedDict
from collections import namedtuple

In [2]:
train_set=torchvision.datasets.FashionMNIST(
    root="./data",train=True,download=True,
    transform=transforms.Compose([transforms.ToTensor()]))

In [3]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1=nn.Conv2d(in_channels=1,out_channels=6,kernel_size=5)
        self.conv2=nn.Conv2d(in_channels=6,out_channels=12,kernel_size=5)
        
        self.fc1=nn.Linear(in_features=12*4*4,out_features=120)
        self.fc2=nn.Linear(in_features=120,out_features=60)
        self.out=nn.Linear(in_features=60,out_features=10)
        
    def forward(self,t):
        # layer1-input layer
        t=t
        
        # layer2-hidden conv layer        
        t=self.conv1(t)
        t=functional.relu(t)
        t=functional.max_pool2d(t,kernel_size=2,stride=2)
        
        # layer3-hidden conv layer        
        t=self.conv2(t)
        t=functional.relu(t)
        t=functional.max_pool2d(t,kernel_size=2,stride=2)
        
        # layer4-hidden linear layer        
        t=t.reshape(-1,12*4*4)
        t=self.fc1(t)
        t=functional.relu(t)
        
        # layer5-hidden linear layer        
        t=self.fc2(t)
        t=functional.relu(t)
        
        # layer6-output layer        
        t=self.out(t)
        
        return t

#### Moving to GPU

In [4]:
t=torch.ones(1,1,28,28)
network=Network()

In [5]:
t=t.cuda()
network=network.cuda()

In [6]:
gpu_pred=network(t)
gpu_pred.device

device(type='cuda', index=0)

#### Moving to CPU

In [7]:
t=t.cpu()
network=network.cpu()

In [8]:
cpu_pred=network(t)
cpu_pred.device

device(type='cpu')

### Working with Tensors

In [9]:
t1=torch.tensor([[1,2],[3,4]])
t2=torch.tensor([[5,6],[7,8]])

In [10]:
t1.device,t2.device

(device(type='cpu'), device(type='cpu'))

In [11]:
t1=t1.cuda()
t1.device

device(type='cuda', index=0)

In [12]:
try: t1+t2
except Exception as e:print(e)

Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!


In [13]:
try: t2+t1
except Exception as e:print(e)

Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!


In [14]:
t2=t2.to('cuda')

In [15]:
t2=t2.cuda()

In [16]:
t1+t2

tensor([[ 6,  8],
        [10, 12]], device='cuda:0')

#### Working with NN Modules

In [17]:
network=Network()

In [18]:
for name,param in network.named_parameters():
    print(name,'\t\t',param.device,'\t\t',param.shape)

conv1.weight 		 cpu 		 torch.Size([6, 1, 5, 5])
conv1.bias 		 cpu 		 torch.Size([6])
conv2.weight 		 cpu 		 torch.Size([12, 6, 5, 5])
conv2.bias 		 cpu 		 torch.Size([12])
fc1.weight 		 cpu 		 torch.Size([120, 192])
fc1.bias 		 cpu 		 torch.Size([120])
fc2.weight 		 cpu 		 torch.Size([60, 120])
fc2.bias 		 cpu 		 torch.Size([60])
out.weight 		 cpu 		 torch.Size([10, 60])
out.bias 		 cpu 		 torch.Size([10])


In [19]:
network.to('cuda')

Network(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (out): Linear(in_features=60, out_features=10, bias=True)
)

In [20]:
for name,params in network.named_parameters():
    print(name,'\t\t',param.device,'\t\t',param.shape)

conv1.weight 		 cuda:0 		 torch.Size([10])
conv1.bias 		 cuda:0 		 torch.Size([10])
conv2.weight 		 cuda:0 		 torch.Size([10])
conv2.bias 		 cuda:0 		 torch.Size([10])
fc1.weight 		 cuda:0 		 torch.Size([10])
fc1.bias 		 cuda:0 		 torch.Size([10])
fc2.weight 		 cuda:0 		 torch.Size([10])
fc2.bias 		 cuda:0 		 torch.Size([10])
out.weight 		 cuda:0 		 torch.Size([10])
out.bias 		 cuda:0 		 torch.Size([10])


In [21]:
sample=torch.ones(1,1,28,28)
sample.shape

torch.Size([1, 1, 28, 28])

In [22]:
try: network(sample)
except Exception as e: print(e)

Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same


In [23]:
try:
    pred=network(sample.to('cuda'))
    print(pred)
except Exception as e:
    print(e)

tensor([[ 0.0520, -0.0532,  0.0976, -0.1109,  0.0837, -0.0172, -0.0454, -0.0684,
         -0.1101,  0.0482]], device='cuda:0', grad_fn=<AddmmBackward>)


#### checking GPU

In [24]:
torch.cuda.is_available()

True

#### Using the Gpu: Test

In [25]:
class RunBuilder():
    @staticmethod
    def get_runs(params):
        Run=namedtuple("Run",params.keys())
        
        runs=[]
        for v in product(*params.values()):
            runs.append(Run(*v))
        
        return runs

In [26]:
class RunManager():
    def __init__(self):
        self.epoch_count=0
        self.epoch_loss=0
        self.epoch_num_correct=0
        self.epoch_start_time=None
        
        self.run_params=None
        self.run_count=0
        self.run_data=[]
        self.run_start_time=None
        
        self.network=None
        self.loader=None
        self.tb=None
        
    def begin_run(self,run,network,loader):
        self.run_start_time=time.time()
        self.run_params=run
        self.run_count+=1
        
        self.network=network
        self.loader=loader
        self.tb=SummaryWriter(comment=f"-{run}")
        
        images,labels=next(iter(self.loader))
        grid=torchvision.utils.make_grid(images)
        
        self.tb.add_image("images",grid)
        self.tb.add_graph(self.network,images.to(getattr(run,'device','cpu')))
        
    def end_run(self):
        self.tb.close()
        self.epoch_count=0
        
    def begin_epoch(self):
        self.epoch_start_time=time.time()
        self.epoch_count+=1
        self.epoch_loss=0
        self.epoch_num_correct=0
        
    def end_epoch(self):
        epoch_duration=time.time()-self.epoch_start_time
        run_duration=time.time()-self.run_start_time
        
        loss=self.epoch_loss/len(self.loader.dataset)
        accuracy=self.epoch_num_correct/len(self.loader.dataset)
        
        self.tb.add_scalar("Loss",loss,self.epoch_count)
        self.tb.add_scalar("Accuracy",accuracy,self.epoch_count)
        
        for name,param in self.network.named_parameters():
            self.tb.add_histogram(name,param,self.epoch_count)
            self.tb.add_histogram(f"{name}.grad",param.grad,self.epoch_count)
        
        results=OrderedDict()
        results["run"]=self.run_count
        results["epoch"]=self.epoch_count
        results["loss"]=loss
        results["accuracy"]=accuracy
        results["epoch duration"]=epoch_duration
        results["run duration"]=run_duration
        
        for k,v in self.run_params._asdict().items(): results[k]=v
        self.run_data.append(results)
        
        df=pd.DataFrame.from_dict(self.run_data,orient="columns")
        
        clear_output(wait=True)
        display(df)
        
    def track_loss(self,loss,batch):
        self.epoch_loss+=loss.item()*batch[0].shape[0]

    def track_num_correct(self,preds,labels):
        self.epoch_num_correct+=self._get_num_correct(preds,labels)

    @torch.no_grad()
    def _get_num_correct(self,preds,labels):
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self,filename):
        pd.DataFrame.from_dict(self.run_data,orient="columns").to_csv(f"{filename}.csv")

        with open(f"{filename}.json","w",encoding="utf-8") as f:
            json.dump(self.run_data,f,ensure_ascii=False,indent=4)
            

In [28]:
params=OrderedDict( lr=[0.01],
                    batch_size=[1000,10000,20000],
                    num_workers=[0,1],
                    device=['cuda','cpu'])

m=RunManager()
for run in RunBuilder.get_runs(params):
    device=torch.device(run.device)
    network=Network().to(device)
    loader=DataLoader(train_set,batch_size=run.batch_size,num_workers=run.num_workers)
    optimizer=optim.Adam(network.parameters(),lr=run.lr)
    
    m.begin_run(run,network,loader)
    for epoch in range(1):
        m.begin_epoch()
        for batch in loader:
            images=batch[0].to(device)
            labels=batch[1].to(device)
            preds=network(images)
            loss=functional.cross_entropy(preds,labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            m.track_loss(loss,batch)
            m.track_num_correct(preds,labels)
        m.end_epoch()
    m.end_run()
m.save('results')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device
0,1,1,0.972195,0.630267,6.219584,7.202937,0.01,1000,0,cuda
1,2,1,0.975532,0.6263,12.531101,13.320128,0.01,1000,0,cpu
2,3,1,1.026296,0.611767,6.083094,7.028283,0.01,1000,1,cuda
3,4,1,0.989903,0.616567,10.554919,11.799164,0.01,1000,1,cpu
4,5,1,2.210332,0.194167,8.273864,12.540068,0.01,10000,0,cuda
5,6,1,2.184789,0.182867,13.544657,20.219034,0.01,10000,0,cpu
6,7,1,2.198799,0.226617,6.87904,11.875301,0.01,10000,1,cuda
7,8,1,2.117992,0.218133,9.519657,17.062767,0.01,10000,1,cpu
8,9,1,2.274265,0.135317,11.671031,19.492289,0.01,20000,0,cuda
9,10,1,2.28391,0.115467,13.236289,25.076207,0.01,20000,0,cpu


In [29]:
pd.DataFrame.from_dict(m.run_data,orient='columns').sort_values('epoch duration')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device
2,3,1,1.026296,0.611767,6.083094,7.028283,0.01,1000,1,cuda
0,1,1,0.972195,0.630267,6.219584,7.202937,0.01,1000,0,cuda
10,11,1,2.28507,0.179567,6.718554,17.165303,0.01,20000,1,cuda
6,7,1,2.198799,0.226617,6.87904,11.875301,0.01,10000,1,cuda
4,5,1,2.210332,0.194167,8.273864,12.540068,0.01,10000,0,cuda
7,8,1,2.117992,0.218133,9.519657,17.062767,0.01,10000,1,cpu
11,12,1,2.297009,0.131417,10.105667,24.794647,0.01,20000,1,cpu
3,4,1,0.989903,0.616567,10.554919,11.799164,0.01,1000,1,cpu
8,9,1,2.274265,0.135317,11.671031,19.492289,0.01,20000,0,cuda
1,2,1,0.975532,0.6263,12.531101,13.320128,0.01,1000,0,cpu
