# Transfer Learning

In [1]:
%matplotlib inline

In [7]:
pip install requests



In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from tqdm import tqdm
from tqdm.notebook import tqdm

In [9]:
from __future__ import print_function, division

import numpy as np
import matplotlib.pyplot as plt
import time
import os
import copy
plt.ion()   # interactive mode
import requests

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

import torchvision
from torchvision import datasets, models, transforms

## Load Data

In [5]:
# Create a folder for our data
# The ! prefix runs this as a shell command
!mkdir data
!mkdir data/israeli_politicians

In [10]:
# Download our dataset and extract it
# import requests
from zipfile import ZipFile

url = 'https://github.com/omriallouche/ydata_deep_learning_2021/blob/main/data/israeli_politicians.zip?raw=true'
r = requests.get(url, allow_redirects=True)
open('./data/israeli_politicians.zip', 'wb').write(r.content)

with ZipFile('./data/israeli_politicians.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall(path='./data/israeli_politicians/')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### Datasets definition

In [112]:
means = [0.485, 0.456, 0.406]
stds = [0.229, 0.224, 0.225]

data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(means, stds)
    ]),
    'val': transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(means, stds)
    ]),
}

In [113]:
data_dir = r'./data/israeli_politicians/'

image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}

A data loader wraps an iterable around the Dataset to enable easy access to the samples. It combines a dataset and a sampler.

In [114]:
dataloaders = {
    'train': torch.utils.data.DataLoader(image_datasets['train'], batch_size=16,
                                             shuffle=True, num_workers=4),
    'val': torch.utils.data.DataLoader(image_datasets['val'], batch_size=16,
                                          shuffle=False, num_workers=4)
  }

In [115]:
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
print('dataset_sizes: ', dataset_sizes)

class_names = image_datasets['train'].classes
print('class_names:', class_names)

dataset_sizes:  {'train': 929, 'val': 234}
class_names: ['ayelet_shaked', 'benjamin_netanyahu', 'benny_gantz', 'danny_danon', 'gideon_saar', 'kostya_kilimnik', 'naftali_bennett', 'ofir_akunis', 'yair_lapid']


In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

### Datasets and Dataloaders
Let's examine the dataloaders and datasets and learn more about their attributes and functions.  

In [30]:
train_dataloader = dataloaders['train']

In [18]:
train_dataloader.dataset.class_to_idx

{'ayelet_shaked': 0,
 'benjamin_netanyahu': 1,
 'benny_gantz': 2,
 'danny_danon': 3,
 'gideon_saar': 4,
 'kostya_kilimnik': 5,
 'naftali_bennett': 6,
 'ofir_akunis': 7,
 'yair_lapid': 8}

In [63]:
import copy
def train_model(model, dataloaders, criterion, optimizer, scheduler, num_epochs=25, freeze_all = False):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    train_res= np.zeros((2,num_epochs))
    val_res=np.zeros((2,num_epochs))
    dict_res={'train':train_res, 'val':val_res}

    for epoch in tqdm(range(num_epochs)):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  
            else:
                model.eval()   

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()

                with torch.set_grad_enabled((phase=='train') & (not freeze_all)):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if ((phase == 'train') & (not freeze_all)):
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            if phase == 'train':
                scheduler.step()  

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            dict_res[phase][0,epoch]=epoch_loss
            dict_res[phase][1,epoch]=epoch_acc

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {(time_elapsed // 60):.0f}m {(time_elapsed % 60):.0f}s')
    print(f'Best val Acc: {best_acc:4f}')

    model.load_state_dict(best_model_wts)
    
    return model, dict_res


## Pretrained Vgg16 model with layers' weights frozen
requires_grad = False

In [64]:
model = models.vgg16(pretrained=True)
model.classifier[6] = nn.Linear(in_features=4096, out_features=9)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
num_epochs = 8

for param in model.parameters():
    param.requires_grad = False

In [45]:
%%time
model,dict_res = train_model(model.to('cuda'), 
                    dataloaders,
                       criterion, 
                       optimizer_ft, 
                       exp_lr_scheduler,
                       num_epochs=num_epochs, freeze_all=True)

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0/7
----------
train Loss: 2.2189 Acc: 0.1399
val Loss: 2.1801 Acc: 0.0769

Epoch 1/7
----------
train Loss: 2.2241 Acc: 0.1259
val Loss: 2.1801 Acc: 0.0769

Epoch 2/7
----------
train Loss: 2.2151 Acc: 0.1356
val Loss: 2.1801 Acc: 0.0769

Epoch 3/7
----------
train Loss: 2.2137 Acc: 0.1367
val Loss: 2.1801 Acc: 0.0769

Epoch 4/7
----------
train Loss: 2.2292 Acc: 0.1281
val Loss: 2.1801 Acc: 0.0769

Epoch 5/7
----------
train Loss: 2.2273 Acc: 0.1313
val Loss: 2.1801 Acc: 0.0769

Epoch 6/7
----------
train Loss: 2.2308 Acc: 0.1302
val Loss: 2.1801 Acc: 0.0769

Epoch 7/7
----------
train Loss: 2.2210 Acc: 0.1152
val Loss: 2.1801 Acc: 0.0769

Training complete in 2m 54s
Best val Acc: 0.076923
CPU times: user 2min 32s, sys: 4.76 s, total: 2min 37s
Wall time: 2min 54s


## Pretrained Vgg16 model with 25 first layers weights frozen 

In [116]:
model = models.vgg16(pretrained=True)
model.classifier[6] = nn.Linear(in_features=4096, out_features=9)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
num_epochs = 8

In [117]:
layer_counter = 0
for (name, module) in model.named_children():
    if name == 'features':
        for layer in module.children():
            if(layer_counter >= 25):
                break
            for param in layer.parameters():
                param.requires_grad = False
            layer_counter+=1

optimizer_ft = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [118]:
%%time
model,dict_res = train_model(model.to('cuda'), 
                    dataloaders,
                       criterion, 
                       optimizer_ft, 
                       exp_lr_scheduler,
                       num_epochs=num_epochs, freeze_all=False)

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0/7
----------
train Loss: 1.8157 Acc: 0.3434
val Loss: 1.4742 Acc: 0.4658

Epoch 1/7
----------
train Loss: 1.2203 Acc: 0.5791
val Loss: 1.1157 Acc: 0.6282

Epoch 2/7
----------
train Loss: 0.7259 Acc: 0.7643
val Loss: 1.5214 Acc: 0.6026

Epoch 3/7
----------
train Loss: 0.4909 Acc: 0.8461
val Loss: 0.9637 Acc: 0.7094

Epoch 4/7
----------
train Loss: 0.3885 Acc: 0.8611
val Loss: 0.9462 Acc: 0.7051

Epoch 5/7
----------
train Loss: 0.2069 Acc: 0.9419
val Loss: 0.7910 Acc: 0.7650

Epoch 6/7
----------
train Loss: 0.1066 Acc: 0.9731
val Loss: 0.8821 Acc: 0.7350

Epoch 7/7
----------
train Loss: 0.0480 Acc: 0.9892
val Loss: 0.8097 Acc: 0.7521

Training complete in 4m 23s
Best val Acc: 0.764957
CPU times: user 4min 7s, sys: 5.41 s, total: 4min 12s
Wall time: 4min 22s


##Training the network from scratch (without pretrained weights)

In [110]:
model = models.vgg16(pretrained=False)
model.classifier[6] = nn.Linear(in_features=4096, out_features=9)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
num_epochs = 8

In [111]:
%%time
model,dict_res = train_model(model.to('cuda'), 
                    dataloaders,
                       criterion, 
                       optimizer_ft, 
                       exp_lr_scheduler,
                       num_epochs=num_epochs, freeze_all=False)

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0/7
----------
train Loss: 2.1057 Acc: 0.2433
val Loss: 2.0715 Acc: 0.2479

Epoch 1/7
----------
train Loss: 2.0729 Acc: 0.2551
val Loss: 2.0560 Acc: 0.2479

Epoch 2/7
----------
train Loss: 2.0541 Acc: 0.2551
val Loss: 2.0664 Acc: 0.2479

Epoch 3/7
----------
train Loss: 2.0443 Acc: 0.2530
val Loss: 2.0467 Acc: 0.2479

Epoch 4/7
----------
train Loss: 2.0471 Acc: 0.2551
val Loss: 2.0328 Acc: 0.2692

Epoch 5/7
----------
train Loss: 2.0164 Acc: 0.2357
val Loss: 2.0195 Acc: 0.2564

Epoch 6/7
----------
train Loss: 2.0008 Acc: 0.2583
val Loss: 2.0210 Acc: 0.2778

Epoch 7/7
----------
train Loss: 1.9685 Acc: 0.2766
val Loss: 2.0055 Acc: 0.2735

Training complete in 9m 10s
Best val Acc: 0.277778
CPU times: user 8min 55s, sys: 5.81 s, total: 9min
Wall time: 9min 9s


## Pretrained Vgg16 model with unfrozen weights
requires_grad = True (by defalut)


In [49]:
model = models.vgg16(pretrained=True)
model.classifier[6] = nn.Linear(in_features=4096, out_features=9)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

num_epochs = 8

In [50]:
%%time
model,dict_res = train_model(model.to('cuda'), 
                    dataloaders,
                       criterion, 
                       optimizer_ft, 
                       exp_lr_scheduler,
                       num_epochs=num_epochs)

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0/7
----------
train Loss: 1.7647 Acc: 0.3724
val Loss: 1.3821 Acc: 0.5128

Epoch 1/7
----------
train Loss: 1.0065 Acc: 0.6631
val Loss: 0.9215 Acc: 0.6752

Epoch 2/7
----------
train Loss: 0.6802 Acc: 0.7696
val Loss: 0.8964 Acc: 0.6880

Epoch 3/7
----------
train Loss: 0.2966 Acc: 0.8999
val Loss: 1.4140 Acc: 0.6282

Epoch 4/7
----------
train Loss: 0.8737 Acc: 0.7266
val Loss: 0.7498 Acc: 0.7479

Epoch 5/7
----------
train Loss: 0.3523 Acc: 0.8816
val Loss: 0.7291 Acc: 0.7778

Epoch 6/7
----------
train Loss: 0.1534 Acc: 0.9559
val Loss: 0.7222 Acc: 0.7991

Epoch 7/7
----------
train Loss: 0.0611 Acc: 0.9839
val Loss: 0.6568 Acc: 0.8162

Training complete in 7m 32s
Best val Acc: 0.816239
CPU times: user 7min 10s, sys: 5.5 s, total: 7min 16s
Wall time: 7min 31s


# Conclusions
We got the following results: <br>
Pretrained Vgg16 model with frozen layers: Acc = **0.07** <br>
Pretrained Vgg16 model with 25 first layers' weights frozen: best Acc = **0.76**<br>
Trained from scratch network with untrained weights: best Acc = **0.24**<br>
Pretrained Vgg16 model with unfrozen weights:  best Acc = **0.81** <br><br>

Not surprisingly we got the best accuracy for the pretrained model with unfrozen weights because  it's already trained and could even improve the performance by training the weights. <br>
We got the worst results for the pretrained frozen model because the filters can't be trained and stay with bad weights.
For the trained from scratch model we got a bit better performance because the filters were trained and succeeded to improve weights.
Regarding the model with first 25 frozen layers - we got surprisingly not bad results which means that probably those frozen layers are not so important and the the last layers can easily be trained.