In [1]:
%matplotlib inline

import torch
import matplotlib.pyplot as plt
import torchvision
from torchvision import transforms
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
import time

  warn(f"Failed to load image Python extension: {e}")


## Exploring training capabilities with GPUs and pytorch

### First define the network and datasets

For this example we have a classification problem with only 2 classes, dogs and cats. We have a train dataset of 18745 items and a test dataset of 6253 items.

In [2]:
#Select device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Dataset location
TRAIN_DATA_PATH = "../Data/CATS_DOGS/train"
TEST_DATA_PATH = "../Data/CATS_DOGS/test"

TRANSFORM_IMG = transforms.Compose([
    transforms.Resize(150),
    transforms.CenterCrop(150),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225] )
    ])


We use a simple custom CNN for this problem

In [3]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 20, kernel_size=3)
        self.conv2 = torch.nn.Conv2d(20, 32, kernel_size=3)
        self.conv3 = torch.nn.Conv2d(32, 64, kernel_size=3)
        self.fc1 = torch.nn.Linear((64*17*17), 50)
        self.fc2 = torch.nn.Linear(50, 1)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.relu(F.max_pool2d(self.conv3(x), 2))
        x = x.view(-1, (64*17*17))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.sigmoid(x)

#### This command show interesting information about the network

In [4]:
testNet = Network()
testNet.to(device)
summary(testNet, (3, 150, 150))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 20, 148, 148]             560
            Conv2d-2           [-1, 32, 72, 72]           5,792
            Conv2d-3           [-1, 64, 34, 34]          18,496
            Linear-4                   [-1, 50]         924,850
            Linear-5                    [-1, 1]              51
Total params: 949,749
Trainable params: 949,749
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.26
Forward/backward pass size (MB): 5.17
Params size (MB): 3.62
Estimated Total Size (MB): 9.05
----------------------------------------------------------------




This shows an estimated total size of arround 9M
This should be the memory requirements but only for the model.

#### What is the amount of memory needed for training or test dataset?

In [22]:
print("This is the shape of the training batch")
print(img.shape)
print("And every element is of type")
print(img.dtype)
print("So the memory requirements for this batch are: {} MB".format((img.shape[0] * img.shape[1] * img.shape[2] * img.shape[3] * 4) / 1024 /1024))

This is the shape of the training batch
torch.Size([64, 3, 160, 160])
And every element is of type
torch.float32
So the memory requirements for this batch are: 18.75 MB


#### Define test and training functions

In [5]:
def test():
    #Sets the module in evaluation mode
    network.eval()
    test_loss = 0
    correct = 0
    #correct.to(device)
    
    #dont update dynamic computation graph
    with torch.no_grad():
        #for every example in test
        for data, target in test_data_loader:
            
            target = target.view(-1, 1).float()
            target.to(device)
            
            #evaluate the model
            output = network(data.to(device))
            
            #acumulate the loss
            test_loss += F.binary_cross_entropy(output, target.to(device)).item()
            
            pred_cls = output.round()
            correct += pred_cls.eq(target.view(-1, 1).to(device)).sum() 
        
    test_loss /= len(test_data_loader.dataset)
    test_losses.append(test_loss)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_data_loader.dataset), 100. * correct / len(test_data_loader.dataset)))

In [6]:
def train(test_on_every_epoch = False):
    
    for epoch in range(1, num_epochs+1):
    
        
        if test_on_every_epoch:
            test()
        
        start = time.time()
  
        for batch_idx, (data, target) in enumerate(train_data_loader):
        
            network.train()
        
            #forward pass
            out = network(data.to(device))
        
            #Use negative log likelihood loss.
            loss = criterion(out, target.view(-1, 1).float().to(device))
        
        
            #with this gradients are calculated
            loss.backward()
    
            #update gradients
            optimizer.step()
        
            #Set gradients to zero
            optimizer.zero_grad()

            #Display iteration statistics
            if batch_idx % log_interval == 0:
            
                #print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx * len(data), len(train_data_loader.dataset),100. * batch_idx / len(train_data_loader), loss.item()))
    
        end = time.time()
        print('Time: {} '.format(end - start))
    

### Before analisis, train model to check accuracy

In [8]:
# Hyper parameters
num_epochs = 4
batchsize = 32
batchsize_test = 32
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [9]:
train(True)


Test set: Avg. loss: 0.0218, Accuracy: 3125/6251 (50%)





Time: 5.971116304397583 

Test set: Avg. loss: 0.0192, Accuracy: 4156/6251 (66%)





Time: 5.925740957260132 

Test set: Avg. loss: 0.0177, Accuracy: 4482/6251 (72%)





Time: 5.991588115692139 

Test set: Avg. loss: 0.0167, Accuracy: 4587/6251 (73%)





Time: 5.989449977874756 


In [10]:
test()


Test set: Avg. loss: 0.0155, Accuracy: 4741/6251 (76%)



### How number of workers affect GPU utilization

#### One worker

In [10]:
# Hyper parameters
num_epochs = 3
batchsize = 220
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=1)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=1) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [11]:
train()





Time: 45.21055483818054 




Time: 45.3118040561676 




Time: 45.38182520866394 


![image-2.png](attachment:image-2.png)

- GPU utilization: 5% avg
- Time spent per epoch: 45s on average

#### Four workers

In [15]:
# Hyper parameters
num_epochs = 3
batchsize = 220
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=4)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=4) 

In [16]:
train()





Time: 12.162627458572388 




Time: 12.263587951660156 




Time: 12.302703142166138 


![image.png](attachment:image.png)

- GPU utilization: 20% avg
- Time spent per epoch: 12s on average

#### Eight workers

In [19]:
# Hyper parameters
num_epochs = 3
batchsize = 220
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=8)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=8) 

In [20]:
train()





Time: 6.958216428756714 




Time: 7.237610578536987 




Time: 6.975800275802612 


![image.png](attachment:image.png)

- GPU utilization: 40% avg
- Time spent per epoch: 7s on average

#### Sixteen workers, max for my machine

In [25]:
# Hyper parameters
num_epochs = 3
batchsize = 220
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16)

In [27]:
train()





Time: 6.089748859405518 




Time: 6.118439435958862 




Time: 6.225910425186157 


![image.png](attachment:image.png)

- GPU utilization: 48% avg
- Time spent per epoch: 6.1s on average

#### Why incrementing workers in loaders, also increment GPU utilization?

GPU, as everithing has limited resources, if you want to use all of those, you have to provide it with work. During training this work is sent by pytorch in the form of, one, the model to be trained, and second, the input batch in form of tensors. The input is stored in the host computer in the form of images.
GPU might not be as fast as CPU in doing one task, but it is great when doing multiple task, so for keep the GPU working you have to provide a lot of parallel work, this means you have to read lots of images from host. Here is were workers comes in. 
There is a sweet point in which when GPU finishes its work and is ready to process more, and host has to be ready to send more work. If you have enough workers to satisfy to be in this point, then you are maximizing GPU in terms of workers in loaders, in my example I only have 8 cores and 16 threads, so my limit is 16, but if I check the diference in performance between 8 an 16, it's clear that between those is the sweet point of workers.

### How batch size affect GPU utilization

#### Batch of 1

In [18]:
# Hyper parameters
num_epochs = 1
batchsize = 1
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [19]:
train()







Time: 36.830082178115845 


![image.png](attachment:image.png)

- GPU utilization: 29% avg
- Time spent per epoch: 36.8s on average

#### Batch of 4

In [20]:
# Hyper parameters
num_epochs = 1
batchsize = 4
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [22]:
train()





Time: 12.07694411277771 


![image.png](attachment:image.png)

- GPU utilization: 37% avg
- Time spent per epoch: 12s on average

#### Batch of 8

In [27]:
# Hyper parameters
num_epochs = 1
batchsize = 8
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [26]:
train()





Time: 8.347815752029419 


![image.png](attachment:image.png)

- GPU utilization: 42% avg
- Time spent per epoch: 8.3s on average

#### Batch of 16

In [28]:
# Hyper parameters
num_epochs = 1
batchsize = 16
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [29]:
train()





Time: 6.758173227310181 


![image.png](attachment:image.png)

- GPU utilization: 44% avg
- Time spent per epoch: 6.75s on average

#### Bigger batch

In [55]:
# Hyper parameters
num_epochs = 1
batchsize = 400
batchsize_test = 64
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [57]:
train()



Time: 6.820275545120239 


- Batch 32: gpu 52% and 6.05s
- Batch 64: gpu 56% and 6.07s
- Batch 128: gpu 59% and 6.28s
- Batch 256: gpu 58% and 6.36s

#### So what?

Interesting, if we increase batch size from 1 up to 16 with the current model and input image, time and GPU utilization have a big increase each step, but then it seems to reach the celing, getting it best time performance at 32 batch size. On the other hand having a bigger batch size seems to also increase GPU utilization, but this doesn't help in time.

I think this behavior is because my CPU is the bottle neck, the max number of images I can process asynchronuslly is 16, that's why increasing batch size bigger than that doesn't help a lot.

However, having a big batch size is not always the best option to increase accuracy, next I'll try changing image size input.

### Changing input size 250 * 250

In [7]:
TRANSFORM_IMG = transforms.Compose([
    transforms.Resize(250),
    transforms.CenterCrop(250),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225] )
    ])

In [8]:
class Network2(torch.nn.Module):
    def __init__(self):
        super(Network2, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 20, kernel_size=3)
        self.conv2 = torch.nn.Conv2d(20, 32, kernel_size=3)
        self.conv3 = torch.nn.Conv2d(32, 64, kernel_size=3)
        self.fc1 = torch.nn.Linear((64*29*29), 50)
        self.fc2 = torch.nn.Linear(50, 1)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.relu(F.max_pool2d(self.conv3(x), 2))
        x = x.view(-1, (64*29*29))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.sigmoid(x)

In [9]:
# Hyper parameters
num_epochs = 1
batchsize = 32
batchsize_test = 32
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network2()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [11]:
summary(network, (3, 250, 250))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 20, 248, 248]             560
            Conv2d-2         [-1, 32, 122, 122]           5,792
            Conv2d-3           [-1, 64, 59, 59]          18,496
            Linear-4                   [-1, 50]       2,691,250
            Linear-5                    [-1, 1]              51
Total params: 2,716,149
Trainable params: 2,716,149
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.72
Forward/backward pass size (MB): 14.72
Params size (MB): 10.36
Estimated Total Size (MB): 25.80
----------------------------------------------------------------




In [13]:
train()





Time: 11.01717758178711 


![image.png](attachment:image.png)

- GPU utilization: 72% avg
- Time spent per epoch: 11s on average

### Changing input size 300 * 300

In [14]:
TRANSFORM_IMG = transforms.Compose([
    transforms.Resize(300),
    transforms.CenterCrop(300),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225] )
    ])

In [23]:
class Network3(torch.nn.Module):
    def __init__(self):
        super(Network3, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 20, kernel_size=3)
        self.conv2 = torch.nn.Conv2d(20, 32, kernel_size=3)
        self.conv3 = torch.nn.Conv2d(32, 64, kernel_size=3)
        self.fc1 = torch.nn.Linear((64*35*35), 50)
        self.fc2 = torch.nn.Linear(50, 1)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.relu(F.max_pool2d(self.conv3(x), 2))
        x = x.view(-1, (64*35*35))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.sigmoid(x)

In [24]:
# Hyper parameters
num_epochs = 1
batchsize = 32
batchsize_test = 32
learning_rate = 0.01
momentum = 0.5
log_interval = 60

#Network model
network = Network3()
network.to(device)

#Stochastic gradient decent
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
#Binary cross entropy loss
criterion = torch.nn.BCELoss()

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = torch.utils.data.DataLoader(test_data, batch_size=batchsize_test, shuffle=True, num_workers=16) 

train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_data_loader.dataset) for i in range(num_epochs + 1)]

In [25]:
summary(network, (3, 300, 300))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 20, 298, 298]             560
            Conv2d-2         [-1, 32, 147, 147]           5,792
            Conv2d-3           [-1, 64, 71, 71]          18,496
            Linear-4                   [-1, 50]       3,920,050
            Linear-5                    [-1, 1]              51
Total params: 3,944,949
Trainable params: 3,944,949
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 1.03
Forward/backward pass size (MB): 21.29
Params size (MB): 15.05
Estimated Total Size (MB): 37.37
----------------------------------------------------------------


In [26]:
train()





Time: 14.696714639663696 


![image.png](attachment:image.png)

- GPU utilization: 79% avg
- Time spent per epoch: 14s on average

#### So what?

When changing input image but maintain the network architecture (only changing the size of some layers due to the change in the input), the number of parameters and the memory of the model increases linearly. Also GPU utilization reaches almost 80%. This is because the size of the matrix multiplication increases and more work can be done in parallel. However increasing the number of the input might not be the best solution specially with big models.

So far GPU memory usage has been moderate, but for big models this won't be the case.
Next, I'll try to use a big model for this problem and 

In [15]:
#just for testing
TRANSFORM_IMG = transforms.Compose([
    transforms.Resize(300),
    transforms.CenterCrop(300),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225] )
    ])
train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batchsize, shuffle=True,  num_workers=16)

test = iter(train_data_loader)
img, target= next(test)

In [16]:
img.shape

torch.Size([32, 3, 300, 300])

In [17]:
conv1 = torch.nn.Conv2d(3, 20, kernel_size=3)
conv2 = torch.nn.Conv2d(20, 32, kernel_size=3)
conv3 = torch.nn.Conv2d(32, 64, kernel_size=3)
fc1 = torch.nn.Linear((64*17*17), 50)
fc2 = torch.nn.Linear(50, 1)

x = F.relu(F.max_pool2d(conv1(img), 2))
x = F.relu(F.max_pool2d(conv2(x), 2))
x = F.relu(F.max_pool2d(conv3(x), 2))
#x = x.view(-1, (64*23*23))

x.shape
#x = F.relu(self.fc1(x))
#x = F.dropout(x)

torch.Size([32, 64, 35, 35])

### Train model

In [11]:
print("torch.cuda.memory_allocated: %fMB"%(torch.cuda.memory_allocated(0)/1024/1024))
print("torch.cuda.memory_reserved: %fMB"%(torch.cuda.memory_reserved(0)/1024/1024))
print("torch.cuda.max_memory_reserved: %fMB"%(torch.cuda.max_memory_reserved(0)/1024/1024))


torch.cuda.memory_allocated: 1016.760254MB
torch.cuda.memory_reserved: 1718.000000MB
torch.cuda.max_memory_reserved: 1718.000000MB


### ToDo

- Create a section in which the network is described
- Create sections to show how changing num of workers, batch size, optimizes training time
- Check other kind of accuaracy measurements
- Check how to maximize utilization of GPU, does this affect training results?
- Monitor gpu usage, memory, SM usage?