<a href="https://colab.research.google.com/github/csar95/Food-Images-Classification-DL/blob/main/complex_model_%26_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Obtención de datos
Vamos a usar un dataset de imágenes de comida disponible en [Kaggle](https://www.kaggle.com/trolukovich/food11-image-dataset).

In [1]:
import os

os.environ["KAGGLE_USERNAME"] = "cesargutic"
os.environ["KAGGLE_KEY"] = ""

!kaggle datasets download trolukovich/food11-image-dataset --unzip

Downloading food11-image-dataset.zip to /content
100% 1.08G/1.08G [00:39<00:00, 32.7MB/s]
100% 1.08G/1.08G [00:39<00:00, 29.4MB/s]


Se habrán descargado 3 carpetas diferentes para los datos de entrenamiento, validación y evaluación, dentro de las cuales se encuentra una subcarpeta para cada una de las 11 clases de comida.

Con el fin de hacer este problema más accesible, vamos a centrarnos solo en seis de las clases de comida disponibles: Bread, Dairy product, Dessert, Egg, Fried food y Meat.

In [2]:
TRAINDIR = "/content/training"
VALDIR = "/content/validation"
TESTDIR = "/content/evaluation"

In [3]:
from glob import glob
import os

valid_classes = {"Bread", "Dairy product", "Dessert", "Egg", "Fried food", "Meat"}
datasets = {TRAINDIR, VALDIR, TESTDIR}

for dataset in datasets:
    for classdir in glob(f"{dataset}/*"):  # Find subfolders with classes
        if classdir.split("/")[-1] not in valid_classes:  # Ignore those in valid_classes
            print(f"Deleting {classdir}...")
            for fname in glob(f"{classdir}/*.jpg"):  # Remove each image file
                os.remove(fname)
            os.rmdir(classdir)  # Remove folder

Deleting /content/validation/Vegetable-Fruit...
Deleting /content/validation/Seafood...
Deleting /content/validation/Noodles-Pasta...
Deleting /content/validation/Rice...
Deleting /content/validation/Soup...
Deleting /content/evaluation/Vegetable-Fruit...
Deleting /content/evaluation/Seafood...
Deleting /content/evaluation/Noodles-Pasta...
Deleting /content/evaluation/Rice...
Deleting /content/evaluation/Soup...
Deleting /content/training/Vegetable-Fruit...
Deleting /content/training/Seafood...
Deleting /content/training/Noodles-Pasta...
Deleting /content/training/Rice...
Deleting /content/training/Soup...


In [4]:
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

In [5]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [6]:
torch.backends.cudnn.benchmark = True

In [7]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Modelo 1
- Data augmentation
- Batch normalization layers
- 4 convolutional blocks
- 3 linear layers

## Preprocesado

In [None]:
image_size = 128
batch_size = 256

transformations = {
    "train": transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=30),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
    "val": transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
    "test": transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
}

imageFolders = {
    _set: datasets.ImageFolder(eval(f"{_set.upper()}DIR"), transform=transformations[_set])
    for _set in ["train", "val", "test"]
}

dataLoaders = {
    _set: DataLoader(imageFolders[_set], batch_size=batch_size, shuffle=True)
    for _set in ["train", "val", "test"]
}

## Construcción del modelo

In [None]:
class DeepLearningNet(nn.Module):
    
    def __init__(self):
        super(DeepLearningNet, self).__init__()
        # Input size: (BatchSize, 128, 128, 3)
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(5,5)),   # (BatchSize, 124, 124, 64)
            nn.BatchNorm2d(64, momentum=0.9),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,2)),                                # (BatchSize, 62, 62, 64)

            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(5,5)),  # (BatchSize, 58, 58, 64)
            nn.BatchNorm2d(64, momentum=0.9),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,2)),                                # (BatchSize, 29, 29, 64)

            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,3)),  # (BatchSize, 27, 27, 64)
            nn.BatchNorm2d(64, momentum=0.9),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,2)),                                # (BatchSize, 13, 13, 64)

            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,3)),  # (BatchSize, 11, 11, 64)
            nn.BatchNorm2d(64, momentum=0.9),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,2))                                 # (BatchSize, 5, 5, 64)
        )
        self.flatten = nn.Flatten(start_dim=1)  # dim=0 es el batch. dim=1 para aplanar cada imagen del batch
        self.classifier = nn.Sequential(                                    # (BatchSize, 1600)
            nn.Linear(in_features=1600, out_features=1024),                 # (BatchSize, 1024)
            nn.ReLU(),
            nn.Dropout(p=0.4),
            nn.Linear(in_features=1024, out_features=1024),                 # (BatchSize, 1024)
            nn.ReLU(),
            nn.Dropout(p=0.4),
            nn.Linear(in_features=1024, out_features=6),                    # (BatchSize, 6)
            nn.Softmax(dim=1)
        )
        
    def forward(self, input):
        output = self.features(input)        
        output = self.flatten(output)        
        output = self.classifier(output)
        
        return output

In [None]:
model = DeepLearningNet()#.to(device)
model

DeepLearningNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(5, 5), stride=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 64, kernel_size=(5, 5), stride=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (9): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (12): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (13): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): MaxPo

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Entrenamiento

In [None]:
import copy

num_epochs = 100
patience = 7  # Number of epochs with no improvement after which training will be stopped
early_stopping = False
best_loss = float('inf')
epochs_with_no_improvement = 0

for epoch in range(num_epochs):

    epoch_results = {}

    for phase in ["train", "val"]:
        # This sets the execution mode and informs layers (e.g., Dropout, BatchNorm) designed to behave differently during training and evaluation
        if phase == "train":
            model.train()
        else:
            model.eval()

        running_loss = 0.0
        correct_in_dataset = 0
        
        # For each batch of images update model parameters / weights
        for i, (inputs, labels) in enumerate(dataLoaders[phase]):        
            optimizer.zero_grad()               # Sets the gradients of all optimized tensors to zero. Same as model.zero_grad() if all model parameters are in the optimizer
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if phase == "train":
                loss.backward()                 # Computes the gradient of loss w.r.t all the parameters in loss that have requires_grad=True and store them in x.grad (x.grad += dloss/dx)
                optimizer.step()                # Performs a single optimization step (parameter update based on the gradients)
            
            running_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            correct_in_dataset += (predicted == labels).sum().item()
        
        if phase == "train":
            epoch_results['train_loss'] = running_loss/len(dataLoaders[phase])
            epoch_results['train_accuracy'] = correct_in_dataset/len(imageFolders[phase])

        if phase == "val":
            epoch_results['val_loss'] = running_loss/len(dataLoaders[phase])
            epoch_results['val_accuracy'] = correct_in_dataset/len(imageFolders[phase])
            
            if epoch_results['val_loss'] < best_loss:
                best_loss = epoch_results['val_loss']
                best_model = copy.deepcopy(model.state_dict())
                epochs_with_no_improvement = 0
            else:
                epochs_with_no_improvement += 1

            if epochs_with_no_improvement == patience:
                model.load_state_dict(best_model)
                early_stopping = True

    print(f"Epoch {epoch+1}/{num_epochs} | loss: {epoch_results['train_loss']:.4} - accuracy: {epoch_results['train_accuracy']:.4} - val_loss: {epoch_results['val_loss']:.4} - val_accuracy: {epoch_results['val_accuracy']:.4}")

    if early_stopping:
        print('Early stopping!')
        break

Epoch 1/100 | loss: 1.685 - accuracy: 0.3288 - val_loss: 1.641 - val_accuracy: 0.3814
Epoch 2/100 | loss: 1.648 - accuracy: 0.3775 - val_loss: 1.615 - val_accuracy: 0.4156
Epoch 3/100 | loss: 1.624 - accuracy: 0.4035 - val_loss: 1.621 - val_accuracy: 0.4122
Epoch 4/100 | loss: 1.611 - accuracy: 0.4196 - val_loss: 1.59 - val_accuracy: 0.4355
Epoch 5/100 | loss: 1.612 - accuracy: 0.4189 - val_loss: 1.576 - val_accuracy: 0.4492
Epoch 6/100 | loss: 1.608 - accuracy: 0.4222 - val_loss: 1.584 - val_accuracy: 0.4521
Epoch 7/100 | loss: 1.61 - accuracy: 0.4265 - val_loss: 1.589 - val_accuracy: 0.4331
Epoch 8/100 | loss: 1.596 - accuracy: 0.4405 - val_loss: 1.583 - val_accuracy: 0.444
Epoch 9/100 | loss: 1.593 - accuracy: 0.442 - val_loss: 1.575 - val_accuracy: 0.4602
Epoch 10/100 | loss: 1.571 - accuracy: 0.4653 - val_loss: 1.554 - val_accuracy: 0.4905
Epoch 11/100 | loss: 1.569 - accuracy: 0.4646 - val_loss: 1.574 - val_accuracy: 0.454
Epoch 12/100 | loss: 1.561 - accuracy: 0.4722 - val_loss:

## Evaluación

In [None]:
correct = 0

with torch.no_grad():
    for images, labels in dataLoaders['test']:        
        outputs = model(images)        
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()
        
print(f"Accuracy: {correct / len(imageFolders['test']):.2%}")

Accuracy: 55.99%


# Modelo 2
- Fine-tuning from EfficientNet
- 3 linear layers

(*) Debido a la limitación computacional de Google Colab se ha tenido que reducir el tamaño de las imágenes y el del batch lo cual tiene un efecto directo en el rendimiento de este modelo. Otra opción es probar con modelos más sencillos como AlexNet.

## Preprocesado

In [8]:
image_size = 64
batch_size = 128

transformations = {
    "train": transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=30),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
    "val": transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
    "test": transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
}

imageFolders = {
    _set: datasets.ImageFolder(eval(f"{_set.upper()}DIR"), transform=transformations[_set])
    for _set in ["train", "val", "test"]
}

dataLoaders = {
    _set: DataLoader(imageFolders[_set], batch_size=batch_size, shuffle=True)
    for _set in ["train", "val", "test"]
}

## Construcción del modelo

In [9]:
from torchvision.models import efficientnet_v2_m


efficientnet_model = efficientnet_v2_m(weights='DEFAULT')

for param in efficientnet_model.parameters():
    param.requires_grad = False

efficientnet_model

Downloading: "https://download.pytorch.org/models/efficientnet_v2_m-dc08266a.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_v2_m-dc08266a.pth


  0%|          | 0.00/208M [00:00<?, ?B/s]

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  

In [10]:
in_features = efficientnet_model.classifier[1].in_features
num_classes = len(imageFolders["train"].class_to_idx)
print(in_features, num_classes)

1280 6


In [11]:
efficientnet_model.classifier[1] = nn.Linear(in_features=in_features, out_features=2048)

In [12]:
class MyEfficientNet(nn.Module):
    def __init__(self, pretrained_model, num_classes):
        super(MyEfficientNet, self).__init__()
        self.extension = nn.Sequential(
            pretrained_model,
            nn.ReLU(),
            nn.Dropout(p=0.4),
            nn.Linear(in_features=2048, out_features=num_classes),
            nn.Softmax(dim=1)
        )
    
    def forward(self, input):
        output = self.extension(input)
        return output

my_extended_model = MyEfficientNet(pretrained_model=efficientnet_model, num_classes=num_classes)
my_extended_model

MyEfficientNet(
  (extension): Sequential(
    (0): EfficientNet(
      (features): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (2): SiLU(inplace=True)
        )
        (1): Sequential(
          (0): FusedMBConv(
            (block): Sequential(
              (0): Conv2dNormActivation(
                (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
                (1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
                (2): SiLU(inplace=True)
              )
            )
            (stochastic_depth): StochasticDepth(p=0.0, mode=row)
          )
          (1): FusedMBConv(
            (block): Sequential(
              (0): Conv2dNormActivation(
                (0): Conv2d(24, 24, kernel_size=(3, 3), stride

In [13]:
parameters_to_update = []

for name, parameter in my_extended_model.named_parameters():
    if parameter.requires_grad == True:
        parameters_to_update.append(parameter)
        print(name)

extension.0.classifier.1.weight
extension.0.classifier.1.bias
extension.3.weight
extension.3.bias


In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(parameters_to_update, lr=0.001)

## Entrenamiento

In [15]:
import copy

num_epochs = 100
patience = 5  # Number of epochs with no improvement after which training will be stopped
early_stopping = False
best_loss = float('inf')
epochs_with_no_improvement = 0

for epoch in range(num_epochs):

    epoch_results = {}

    for phase in ["train", "val"]:
        # This sets the execution mode and informs layers (e.g., Dropout, BatchNorm) designed to behave differently during training and evaluation
        if phase == "train":
            my_extended_model.train()
        else:
            my_extended_model.eval()

        running_loss = 0.0
        correct_in_dataset = 0
        
        # For each batch of images update model parameters / weights
        for i, (inputs, labels) in enumerate(dataLoaders[phase]):        
            optimizer.zero_grad()               # Sets the gradients of all optimized tensors to zero. Same as model.zero_grad() if all model parameters are in the optimizer
            outputs = my_extended_model(inputs)
            loss = criterion(outputs, labels)

            if phase == "train":
                loss.backward()                 # Computes the gradient of loss w.r.t all the parameters in loss that have requires_grad=True and store them in x.grad (x.grad += dloss/dx)
                optimizer.step()                # Performs a single optimization step (parameter update based on the gradients)
            
            running_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            correct_in_dataset += (predicted == labels).sum().item()
        
        if phase == "train":
            epoch_results['train_loss'] = running_loss/len(dataLoaders[phase])
            epoch_results['train_accuracy'] = correct_in_dataset/len(imageFolders[phase])

        if phase == "val":
            epoch_results['val_loss'] = running_loss/len(dataLoaders[phase])
            epoch_results['val_accuracy'] = correct_in_dataset/len(imageFolders[phase])
            
            if epoch_results['val_loss'] < best_loss:
                best_loss = epoch_results['val_loss']
                best_model = copy.deepcopy(my_extended_model.state_dict())
                epochs_with_no_improvement = 0
            else:
                epochs_with_no_improvement += 1

            if epochs_with_no_improvement == patience:
                my_extended_model.load_state_dict(best_model)
                early_stopping = True

    print(f"Epoch {epoch+1}/{num_epochs} | loss: {epoch_results['train_loss']:.4} - accuracy: {epoch_results['train_accuracy']:.4} - val_loss: {epoch_results['val_loss']:.4} - val_accuracy: {epoch_results['val_accuracy']:.4}")

    if early_stopping:
        print('Early stopping!')
        break

Epoch 1/100 | loss: 1.744 - accuracy: 0.2683 - val_loss: 1.731 - val_accuracy: 0.2865
Epoch 2/100 | loss: 1.722 - accuracy: 0.2927 - val_loss: 1.696 - val_accuracy: 0.3245
Epoch 3/100 | loss: 1.711 - accuracy: 0.3085 - val_loss: 1.688 - val_accuracy: 0.3335
Epoch 4/100 | loss: 1.703 - accuracy: 0.3168 - val_loss: 1.68 - val_accuracy: 0.3416
Epoch 5/100 | loss: 1.702 - accuracy: 0.3224 - val_loss: 1.677 - val_accuracy: 0.3406
Epoch 6/100 | loss: 1.702 - accuracy: 0.3173 - val_loss: 1.678 - val_accuracy: 0.3496
Epoch 7/100 | loss: 1.709 - accuracy: 0.3131 - val_loss: 1.669 - val_accuracy: 0.3596
Epoch 8/100 | loss: 1.695 - accuracy: 0.3283 - val_loss: 1.666 - val_accuracy: 0.3553
Epoch 9/100 | loss: 1.698 - accuracy: 0.3249 - val_loss: 1.67 - val_accuracy: 0.3596
Epoch 10/100 | loss: 1.71 - accuracy: 0.3114 - val_loss: 1.68 - val_accuracy: 0.3496
Epoch 11/100 | loss: 1.703 - accuracy: 0.3218 - val_loss: 1.668 - val_accuracy: 0.3601
Epoch 12/100 | loss: 1.695 - accuracy: 0.3339 - val_loss

## Evaluación

In [16]:
correct = 0

with torch.no_grad():
    for images, labels in dataLoaders['test']:        
        outputs = my_extended_model(images)        
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()
        
print(f"Accuracy: {correct / len(imageFolders['test']):.2%}")

Accuracy: 34.98%
