<a href="https://colab.research.google.com/github/kode-git/FER-Visual-Transformers/blob/main/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training

This notebook is used to train transformers and deep neural networks.

## Install Dependencies and Import Libraries

In [10]:
# Install the third-party packages this notebook imports: timm (used by
# timm.create_model) and fvcore (used for the FLOP analysis).
!pip install timm
!pip install fvcore


Collecting fvcore
  Downloading fvcore-0.1.5.post20220512.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 2.8 MB/s 
Collecting yacs>=0.1.6
  Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 10.1 MB/s 
Collecting iopath>=0.1.7
  Downloading iopath-0.1.9-py3-none-any.whl (27 kB)
Collecting portalocker
  Downloading portalocker-2.4.0-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: fvcore
  Building wheel for fvcore (setup.py) ... [?25l[?25hdone
  Created wheel for fvcore: filename=fvcore-0.1.5.post20220512-py3-none-any.whl size=61288 sha256=98873706aad03f4fb2b03dfa9dfe39248eddcb9c2ecfa073433fe8068550bf05
  Stored in directory: /root/.cache/pip/wheels/68/20/f9/a11a0dd63f4c13678b2a5ec488e48078756505c7777b75b29e
Successfully built fvcore
Installing colle

In [11]:
# future statements first, by convention (no-ops on Python 3).
from __future__ import print_function
from __future__ import division

# classic libraries for collections.
import pandas as pd
import numpy as np

# utility library.
import random, time, copy
import shutil

# plot libraries.
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# libraries for image processing.
import os, cv2, glob, imageio, sys
from PIL import Image
# warning library for service warnings.
import warnings

# machine learning libraries .
import timm, torch, torchvision
from torchsummary import summary

# image dataset loading and transformations.
from torchvision import datasets, models, transforms

# optimizer libraries.
from torch.optim import lr_scheduler
import torch.optim as optim
from sam.sam import SAM

# library for basic building blocks.
import torch.nn as nn

# library for saving and loading checkpoints.
import pickle

# libraries for metrics and evaluation phase.
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

# libraries for flop analysis.
from fvcore.nn import FlopCountAnalysis, flop_count_table, flop_count_str

# colab library.
from google.colab import drive

In [7]:
# Print the installed PyTorch and torchvision versions.
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

PyTorch Version:  1.11.0+cu113
Torchvision Version:  0.12.0+cu113


In [8]:
# Mount Google Drive at /content/drive; the dataset and model paths used
# later in this notebook are located under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


## GPU Configuration

The transformers are trained on a Google Colab Pro GPU (NVIDIA Tesla P100).

In [1]:
# Show the GPU attached to this runtime (name, driver, memory usage).
!nvidia-smi

Sat May 14 21:26:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
# Detect if we have a GPU available: use the first CUDA device when present,
# otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Common utilities

In [32]:
def mkdir_model(base_dir, name_model, counter):
  """
  Create a fresh directory for the model dump and return its path.

  `base_dir/name_model` is tried first. If it already exists, the counter is
  incremented and `base_dir/name_model_<counter>` is tried, and so on until an
  unused name is found.

  Args:
    base_dir: existing parent directory.
    name_model: base name of the directory to create.
    counter: starting value of the suffix counter; the first suffix used
      after a collision is `counter + 1`.

  Returns:
    Path of the directory that was actually created.

  Raises:
    OSError: any `os.mkdir` failure other than "already exists"
      (e.g. missing parent directory, permission denied).
  """
  base_name = str(name_model)
  candidate = base_name
  while True:
    d = os.path.join(base_dir, candidate)
    try:
      os.mkdir(d)
      return d
    except FileExistsError:
      # Build the next candidate from the *base* name so suffixes do not
      # accumulate (name_1_2_3 ...) across successive collisions.
      counter += 1
      candidate = base_name + "_" + str(counter)

def save_history(history, filename):
  """
  Pickle `history` into `filename + ".pkl"`, replacing any previous dump.

  Args:
    history: any picklable object (e.g. a list or dict of metric values).
    filename: target path WITHOUT the ".pkl" extension.
  """
  target = filename + ".pkl"
  # Mode "wb" truncates an existing dump, so no explicit removal is needed
  # (and the extension-less `filename` itself is never touched). The context
  # manager closes the handle even when pickling fails.
  with open(target, "wb") as file_handler:
    pickle.dump(history, file_handler)


def load_history(filename):
  """
  Load the object previously pickled into `filename + ".pkl"`.

  Args:
    filename: source path WITHOUT the ".pkl" extension.

  Returns:
    The unpickled object.

  Raises:
    FileNotFoundError: if `filename + ".pkl"` does not exist.
  """
  # NOTE(review): pickle.load can execute arbitrary code; only load history
  # files produced by this notebook, never files from untrusted sources.
  with open(filename + ".pkl", "rb") as file_handler:
    return pickle.load(file_handler)


def train_model(model, dataloaders, criterion, optimizer,lr_scheduler, num_epochs=25, is_inception=False, 
                is_loaded = False, load_state_ws=None, history_file_acc="history_accuracy",
                history_file_loss="history_loss", n_partial=0, model_folder="", best_acc=0.0 ):
    """
    PyTorch training loop with checkpoint loading and per-epoch dumps.

    Runs `num_epochs` epochs, each made of a 'train' and a 'val' phase, and
    returns the model restored to the weights that reached the best
    validation accuracy.

    Args:
        model: the network to train; batches are moved to the device of its
            first parameter.
        dataloaders: dict with 'train' and 'val' data loaders yielding
            (inputs, labels) batches and exposing a sized `.dataset`.
        criterion: loss callable applied as `criterion(outputs, labels)`.
        optimizer: optimizer to step; a `SAM` instance is stepped through a
            closure that recomputes the loss.
        lr_scheduler: scheduler stepped once per epoch (skipped when falsy).
            Note that inside this function the name shadows the
            `torch.optim.lr_scheduler` module.
        num_epochs: number of epochs; numeric strings are accepted.
        is_inception: unused, kept for interface compatibility.
        is_loaded: when True and `load_state_ws` is given, the state dict
            stored at that path is loaded before training.
        load_state_ws: path of the state dict to load.
        history_file_acc: file-name suffix for the accuracy history dump.
        history_file_loss: file-name suffix for the loss history dump.
        n_partial: unused, kept for interface compatibility.
        model_folder: directory receiving the per-epoch dumps
            ("" means the current working directory).
        best_acc: validation accuracy to beat (e.g. from a previous run).

    Returns:
        (model, train accuracy history, val accuracy history, best val
        accuracy). Accuracies are plain Python floats so that the histories
        can be pickled and reloaded on any machine.
    """
    num_epochs = int(num_epochs)   # tolerate "25" coming straight from input().
    best_acc = float(best_acc)     # tolerate a 0-dim tensor from an earlier run.

    # Use the device the model already lives on instead of a notebook global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # Folder name used to tag the dump files; normpath drops a trailing
    # separator, which would otherwise make basename() return "".
    model_tag = os.path.basename(os.path.normpath(model_folder)) if model_folder else ""

    history = {'val' : [], 'train' : []}
    loss_history = {'val' : [], 'train' : []}

    if is_loaded and load_state_ws is not None:
        # load the model (map_location lets a GPU checkpoint load on CPU too).
        state_dict = torch.load(load_state_ws, map_location=run_device)
        model.load_state_dict(state_dict)
        model.eval()
        print('Model loaded correctly')

    print('Starting Training')
    print('-' * 12)

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    for epoch in range(num_epochs):
        epoch_since = time.time()
        print(f'Epoch {epoch + 1}/{num_epochs}')
        print('-' * 12)
        local_optima = False
        # Each epoch has a training and validation phase.
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode.
            else:
                model.eval()   # Set model to evaluate mode.

            dl = dataloaders[phase]
            n_samples = len(dl.dataset)
            running_loss = 0.0
            running_corrects = 0
            totalIm = 0
            # Iterate over data.
            for inputs, labels in dl:
                totalIm += len(inputs)
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)

                # zero the parameter gradients.
                optimizer.zero_grad()

                # forward; gradients are tracked only in the training phase.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase.
                    if is_train:
                        loss.backward()
                        if isinstance(optimizer, SAM):
                            # SAM is handed a closure that redoes the
                            # forward/backward pass, as the original code did.
                            # NOTE(review): assumes SAM.step(closure) calls
                            # the closure itself between its two update
                            # steps - confirm against the sam package.
                            def closure():
                                second_loss = criterion(model(inputs), labels)
                                second_loss.backward()
                                return second_loss
                            optimizer.step(closure)
                        else:
                            optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data).item()
                epoch_loss = running_loss / totalIm
                epoch_acc = running_corrects / totalIm
                # status update: samples seen so far / samples in the dataset.
                sys.stdout.write("\r" + f"{epoch + 1}/{num_epochs} - {phase} step : " + str(totalIm) + "/" +  str(n_samples) + " - " + 
                                 "{}_accuracy : ".format(phase) + "{:4f}".format(epoch_acc) + " - {}_loss : ".format(phase) + "{:4f}".format(epoch_loss))
                sys.stdout.flush()
            epoch_loss = running_loss / totalIm
            epoch_acc = running_corrects / totalIm
            print() # avoid result cleaning .
            history[phase].append(epoch_acc)
            loss_history[phase].append(epoch_loss)

            # deep copy the model only when the validation accuracy improves.
            if not is_train and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                local_optima = True

        # Increases the internal counter.
        if lr_scheduler:
            lr_scheduler.step()
        lr = optimizer.param_groups[0]['lr']
        interval_epoch = time.time() - epoch_since 
        print('\nEpoch {} complete in. {:.0f}m {:.0f}s {} and with a learning rate of {}'.format(epoch + 1, interval_epoch // 60, interval_epoch % 60, "with best local accuracy" if local_optima else "",lr))
        # Per-epoch dumps: loss history, accuracy history and current weights.
        save_history(loss_history, os.path.join(model_folder, model_tag + "_" + history_file_loss))
        save_history(history, os.path.join(model_folder, model_tag + "_" + history_file_acc))
        torch.save(model.state_dict(), os.path.join(model_folder, "epoch_{}_{}".format(epoch + 1, model_tag)))
        print("-" * 12)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val accuracy: {:4f}'.format(best_acc))

    # load best model weights.
    model.load_state_dict(best_model_wts)
    return model, history['train'], history['val'], best_acc

## Dataset Loading

In [13]:
# input and batch size specification.
input_size = (224,224)
batch_size = 60

# dataset directory (expected to contain a 'train' and a 'val' sub-folder).
data_dir= "/content/drive/MyDrive/Datasets/VFER/"

# removing possible .ipynb_checkpoints: ImageFolder treats every directory
# inside a split folder as a class, so hidden directories must go.
for split_dir in glob.glob(os.path.join(data_dir, "*")):
  for hidden in glob.glob(os.path.join(split_dir, ".*")):
    if not os.path.isdir(hidden):
      continue  # hidden files (not directories) are left untouched.
    if os.path.basename(hidden) == ".ipynb_checkpoints":
      shutil.rmtree(hidden)  # may contain checkpoint files, so rmdir is not enough.
    else:
      os.rmdir(hidden)       # other hidden directories: removed only when empty.

# loading training and validation set.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
}


print("Initializing Datasets and Dataloaders...")

# Create training and validation datasets.
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val']}

# Never request more loader workers than the runtime has CPUs.
num_workers = min(8, os.cpu_count() or 1)

# Create training and validation dataloaders; only the training set is shuffled.
dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=(x == 'train'),
                                                   num_workers=num_workers, pin_memory=True) for x in ['train', 'val']}

Initializing Datasets and Dataloaders...


  cpuset_checked))


In [36]:
# specify the total number of classes.
NUM_CLASSES = 8
# timm architecture identifier; the head-replacement cell further down
# special-cases 'resnet18' and treats every other name as a model with `.head`.
model_name = 'vit_base_patch16_224'
# model_name = 'resnet18'
# loading pretrained model.
model = timm.create_model(model_name, pretrained=True)

In [37]:
# flops analysis.
# A random single-image batch (1 x 3 x 224 x 224) is used only to trace the model.
# The analysis runs before the classification head is replaced, so the
# figures refer to the model as created by timm.
inputs = (torch.randn((1, 3, 224, 224)))
model.eval() 
print('-'*40)

# flop data display: per-module table (depth 4), annotated model string, total.
flop = FlopCountAnalysis(model, inputs)
print(flop_count_table(flop, max_depth=4))
print(flop_count_str(flop))
print("Tot. flops:", flop.total())

----------------------------------------
| module                     | #parameters or shape   | #flops     |
|:---------------------------|:-----------------------|:-----------|
| model                      | 86.568M                | 17.583G    |
|  cls_token                 |  (1, 1, 768)           |            |
|  pos_embed                 |  (1, 197, 768)         |            |
|  patch_embed.proj          |  0.591M                |  0.116G    |
|   patch_embed.proj.weight  |   (768, 3, 16, 16)     |            |
|   patch_embed.proj.bias    |   (768,)               |            |
|  blocks                    |  85.054M               |  17.466G   |
|   blocks.0                 |   7.088M               |   1.455G   |
|    blocks.0.norm1          |    1.536K              |    0.756M  |
|     blocks.0.norm1.weight  |     (768,)             |            |
|     blocks.0.norm1.bias    |     (768,)             |            |
|    blocks.0.attn           |    2.362M              |    0.5

In [38]:
# adapting head for NUM_CLASSES classes (fine-tuning).
# The input width is read from the layer being replaced instead of being
# hard-coded, so the cell keeps working if a different model size is chosen.
if model_name == 'resnet18':
  model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
else: 
  model.head = nn.Linear(model.head.in_features, NUM_CLASSES)
  
# display modified model.
model.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (drop1): Dropout(p=0.0, inplace=False)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,),

In [39]:
# Ask which optimizer to use: the exact answer "0" selects SGD, anything
# else selects SAM.
optimizer_set = "SGD" if input('Digit 0 for SGD or other values for SAM: ') == "0" else "SAM"
print('Chosen {} for the model training.'.format(optimizer_set))

Digit 0 for SGD or other values for SAM: 0
Chosen SGD for the model training.


In [40]:
# Pick the GPU when one is available, otherwise the CPU, and move the model there.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)
feature_extract=False

# All model parameters are handed to the optimizer (full fine-tuning).
# The loop below only reports which of them currently require gradients.
params_to_update = model.parameters()
print("Params to learn:")

for name,param in model.named_parameters():
    if param.requires_grad:
        print("\t",name)

print('-'*40)

# optimizer hyper-parameters.
lr_in = 0.001
momentum_in = 0.9

if optimizer_set != "SGD":
  # sharpness-aware minimizer wrapping SGD as its base optimizer.
  optimizer_base = optim.SGD
  optimizer_ft = SAM(params_to_update, optimizer_base, lr=lr_in, momentum=momentum_in)
else:
  # stochastic gradient descent.
  optimizer_ft = optim.SGD(params_to_update, lr=lr_in, momentum=momentum_in)

print(optimizer_ft)

cuda:0
Params to learn:
	 cls_token
	 pos_embed
	 patch_embed.proj.weight
	 patch_embed.proj.bias
	 blocks.0.norm1.weight
	 blocks.0.norm1.bias
	 blocks.0.attn.qkv.weight
	 blocks.0.attn.qkv.bias
	 blocks.0.attn.proj.weight
	 blocks.0.attn.proj.bias
	 blocks.0.norm2.weight
	 blocks.0.norm2.bias
	 blocks.0.mlp.fc1.weight
	 blocks.0.mlp.fc1.bias
	 blocks.0.mlp.fc2.weight
	 blocks.0.mlp.fc2.bias
	 blocks.1.norm1.weight
	 blocks.1.norm1.bias
	 blocks.1.attn.qkv.weight
	 blocks.1.attn.qkv.bias
	 blocks.1.attn.proj.weight
	 blocks.1.attn.proj.bias
	 blocks.1.norm2.weight
	 blocks.1.norm2.bias
	 blocks.1.mlp.fc1.weight
	 blocks.1.mlp.fc1.bias
	 blocks.1.mlp.fc2.weight
	 blocks.1.mlp.fc2.bias
	 blocks.2.norm1.weight
	 blocks.2.norm1.bias
	 blocks.2.attn.qkv.weight
	 blocks.2.attn.qkv.bias
	 blocks.2.attn.proj.weight
	 blocks.2.attn.proj.bias
	 blocks.2.norm2.weight
	 blocks.2.norm2.bias
	 blocks.2.mlp.fc1.weight
	 blocks.2.mlp.fc1.bias
	 blocks.2.mlp.fc2.weight
	 blocks.2.mlp.fc2.bias
	 blocks

## Start Training

In [45]:
warnings.filterwarnings('ignore')

# Setup the loss fxn
criterion = nn.CrossEntropyLoss()

# Ask for the number of epochs. The parsed integer is stored back into
# num_epochs (it would otherwise remain a string and break range()).
# Non-numeric or non-positive answers fall back to 10 epochs.
num_epochs = input('Digits the initial number of epochs, invalid values are equals to 10 epochs: ')
try:
  num_epochs = int(num_epochs)
  if num_epochs <= 0:
    raise ValueError
except ValueError:
  print('Default number of 10 epochs selected.')
  num_epochs = 10

Digits the initial number of epochs, invalid values are equals to 10 epochs: 25


In [None]:
# name of this run and the directory that contains all model folders.
name_model = "vfer_small_5"
base_dir = "/content/drive/MyDrive/Models/"

# every file of this run is stored in <base_dir><name_model>/ and is
# prefixed with the run name.
model_folder = f"{base_dir}{name_model}/"
model_file = f"{model_folder}{name_model}.pth"
train_history = f"{model_folder}{name_model}_history_train"
val_history = f"{model_folder}{name_model}_history_val"


# Learning-rate schedule: multiply the learning rate by `gamma`
# every `step_size` epochs.
scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.1)

In [None]:
# Create the folder for this run.
mkdir_model(base_dir, name_model, 0)
# Train and evaluate. model_folder is passed explicitly so that the per-epoch
# dumps written by train_model land in the run folder rather than in the
# current working directory.
model, train_hist, val_hist, best_acc = train_model(model, dataloaders_dict, criterion, optimizer_ft,scheduler, num_epochs=num_epochs, 
                                          is_inception=False, model_folder=model_folder)
#Saving the updated model for the inference phase
torch.save(model.state_dict(), model_file)

# Save histories data
save_history(train_hist, train_history)
save_history(val_hist, val_history)

In [None]:
# Num epochs for this snippet
num_epochs = 10

# model general info
name_model = "vfer_small_15"
base_dir = "/content/drive/MyDrive/Models/"
mkdir_model(base_dir, name_model, 0)

# model files for saving history and model data
model_folder = base_dir + name_model + "/"
model_file = model_folder + name_model + ".pth"
train_history = model_folder + name_model + "_" + "history_train"
val_history = model_folder + name_model + "_" + "history_val"

# changing starting lr
lr_in = 0.001
# Rebuild the optimizer with the same choice made at the optimizer prompt
# (SGD or SAM), so that a resumed run keeps the optimizer it started with.
if optimizer_set == "SGD":
  optimizer_ft = optim.SGD(model.parameters(), lr=lr_in, momentum=momentum_in)
else:
  optimizer_ft = SAM(model.parameters(), optim.SGD, lr=lr_in, momentum=momentum_in)
scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.1)

# Train and evaluate, starting from the weights saved by the previous run
# and keeping its best validation accuracy as the value to beat.
model, train_hist, val_hist, best_acc = train_model(model, dataloaders_dict, criterion, optimizer_ft,scheduler, num_epochs=num_epochs, 
                                          is_inception=False, is_loaded=True, model_folder= model_folder, best_acc=best_acc,
                                          load_state_ws="/content/drive/MyDrive/Models/vfer_small_5/vfer_small_5.pth")


#Saving the updated model for the inference phase
torch.save(model.state_dict(), model_file)

# Save histories data
save_history(train_hist, train_history)
save_history(val_hist, val_history)

In [None]:
# model general info
name_model = "vfer_sam_25"
base_dir = "/content/drive/MyDrive/Models/"
mkdir_model(base_dir, name_model, 0)

# model files for saving history and model data
model_folder = base_dir + name_model + "/"
model_file = model_folder + name_model + ".pth"
train_history = model_folder + name_model + "_" + "history_train"
val_history = model_folder + name_model + "_" + "history_val"

# updating num_epochs
num_epochs = 5
# changing starting lr
lr_in = 0.001
# Rebuild the optimizer with the same choice made at the optimizer prompt
# (SGD or SAM), so that a resumed run keeps the optimizer it started with.
if optimizer_set == "SGD":
  optimizer_ft = optim.SGD(model.parameters(), lr=lr_in, momentum=momentum_in)
else:
  optimizer_ft = SAM(model.parameters(), optim.SGD, lr=lr_in, momentum=momentum_in)
scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.1)

# Train and evaluate, starting from the weights saved by the previous run
# and keeping its best validation accuracy as the value to beat.
model, train_hist, val_hist, best_acc = train_model(model, dataloaders_dict, criterion, optimizer_ft,scheduler, num_epochs=num_epochs, 
                                          is_inception=False, is_loaded=True, model_folder= model_folder,
                                          load_state_ws="/content/drive/MyDrive/Models/vfer_sam_10/vfer_sam_10.pth", best_acc=best_acc )


#Saving the updated model for the inference phase
torch.save(model.state_dict(), model_file)

# Save histories data
save_history(train_hist, train_history)
save_history(val_hist, val_history)