In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from transformers import AutoFeatureExtractor, AutoModelForImageClassification
import os
import time
import copy
from tqdm.auto import tqdm # For nice progress bars


print("All libraries imported.")

All libraries imported.


In [2]:
# --- 0. Reproducibility ---
# Seed torch's global RNG before any stochastic step. It drives DataLoader
# shuffling and RandomHorizontalFlip, so a Restart & Run All produces the same
# batches. (GPU kernels may still introduce small nondeterminism.)
SEED = 42
torch.manual_seed(SEED)

# --- 1. Setup Device ---
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 2. Define Data Paths ---
DATA_DIR = '../data/MELD_processed/faces/'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
DEV_DIR = os.path.join(DATA_DIR, 'dev') # We'll use the 'dev' set for validation

# --- 3. Define Image Transforms ---
# Reuse the pretrained checkpoint's own preprocessing settings (input size and
# normalization statistics) so fine-tuning inputs match what the model expects.
feature_extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")
image_mean = feature_extractor.image_mean
image_std = feature_extractor.image_std
# (height, width) shared by both pipelines; defined once so they cannot drift apart.
target_size = (feature_extractor.size['height'], feature_extractor.size['width'])

data_transforms = {
    # Training: resize + random horizontal flip as light augmentation.
    'train': transforms.Compose([
        transforms.Resize(target_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std),
    ]),
    # Validation: deterministic preprocessing only (no augmentation).
    'val': transforms.Compose([
        transforms.Resize(target_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std),
    ]),
}

print("Transforms defined.")

Using device: cuda:0
Transforms defined.




In [3]:
import os

# Sanity check: list the split folders under 'faces' and the per-emotion class
# folders under 'faces/train' before building datasets.
# DATA_DIR and TRAIN_DIR are reused from the setup cell instead of being
# re-declared here, so the paths have a single source of truth.
print(f"Checking contents of: {DATA_DIR}")
print(sorted(os.listdir(DATA_DIR)))  # os.listdir order is arbitrary; sort for stable output

print(f"\nChecking contents of: {TRAIN_DIR}")
print(sorted(os.listdir(TRAIN_DIR)))

Checking contents of: ../data/MELD_processed/faces/
['dev', 'test', 'train']

Checking contents of: ../data/MELD_processed/faces/train
['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']


In [4]:
# --- 4. Create Datasets ---
print("Loading datasets using ImageFolder...")
image_datasets = {
    'train': datasets.ImageFolder(TRAIN_DIR, data_transforms['train']),
    'val': datasets.ImageFolder(DEV_DIR, data_transforms['val']),
}

# ImageFolder assigns integer labels independently for each root directory,
# based on the sub-folders it finds there. If 'dev' is missing (or has an extra)
# class folder, its indices shift and validation labels silently stop matching
# the training labels. Fail loudly instead of reporting meaningless metrics.
train_class_to_idx = image_datasets['train'].class_to_idx
val_class_to_idx = image_datasets['val'].class_to_idx
if val_class_to_idx != train_class_to_idx:
    raise ValueError(
        "Train and validation class folders do not match: "
        f"train={train_class_to_idx}, val={val_class_to_idx}"
    )

# --- 5. Create DataLoaders ---
# DataLoaders turn our dataset into batches to feed to the GPU.
# num_workers=0 keeps loading in the main process.
BATCH_SIZE = 32
dataloaders = {
    'train': DataLoader(image_datasets['train'], batch_size=BATCH_SIZE, shuffle=True, num_workers=0),
    'val': DataLoader(image_datasets['val'], batch_size=BATCH_SIZE, shuffle=False, num_workers=0),
}

dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

print(f"Training data size: {dataset_sizes['train']}")
print(f"Validation data size: {dataset_sizes['val']}")
print(f"Found {len(class_names)} classes: {class_names}")

Loading datasets using ImageFolder...
Training data size: 2571
Validation data size: 1256
Found 7 classes: ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']


In [5]:
# --- 6. Load Pretrained Model ---
model_name = "trpakov/vit-face-expression"
model = AutoModelForImageClassification.from_pretrained(model_name)

# --- 7. Map MELD Labels to Model Labels ---
# Print both vocabularies so naming differences are easy to see.
model_labels = list(model.config.id2label.values())
print(f"Original model labels: {model_labels}")
print(f"Our dataset labels:    {class_names} (from ImageFolder)")

# Snapshot of the checkpoint's original name -> output-index mapping, taken
# before the config is overwritten below.
model_label2id = dict(model.config.label2id)

# Dataset folder name -> label name used by the pretrained checkpoint.
remap = {
    'anger': 'angry',
    'disgust': 'disgust',
    'fear': 'fear',
    'joy': 'happy',
    'neutral': 'neutral',
    'sadness': 'sad',
    'surprise': 'surprise'
}

unmapped = [label for label in class_names if remap.get(label) not in model_label2id]
if unmapped:
    raise ValueError(f"No pretrained label found for dataset classes: {unmapped}")

# head_order[i] is the pretrained output index for dataset class i.
# Rewriting config.label2id / id2label only renames the outputs; it does NOT move
# any classifier weights. So the pretrained head is only reused correctly if this
# order is the identity; otherwise the head rows must be permuted to match.
head_order = [int(model_label2id[remap[label]]) for label in class_names]
if head_order != list(range(len(head_order))):
    classifier = getattr(model, "classifier", None)
    is_permutation = (
        isinstance(classifier, nn.Linear)
        and sorted(head_order) == list(range(classifier.out_features))
    )
    if not is_permutation:
        raise ValueError(
            "Dataset class order does not match the pretrained head and it cannot be "
            f"reordered automatically (pretrained indices per dataset class: {head_order})."
        )
    with torch.no_grad():
        order = torch.tensor(head_order, device=classifier.weight.device)
        # Advanced indexing returns a copy, so copying it back in place is safe.
        classifier.weight.copy_(classifier.weight[order])
        if classifier.bias is not None:
            classifier.bias.copy_(classifier.bias[order])
    print(f"Reordered classifier outputs to match dataset class order: {head_order}")

# Now the config label maps can be rewritten to follow the dataset's class indices.
model.config.label2id = {remap[label]: i for i, label in enumerate(class_names)}
model.config.id2label = {i: remap[label] for i, label in enumerate(class_names)}

print(f"\nUpdated model label2id: {model.config.label2id}")

# Move model to the selected device
model = model.to(device)
print("\nModel loaded and moved to GPU.")

Original model labels: ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
Our dataset labels:    ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'] (from ImageFolder)

Updated model label2id: {'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprise': 6}

Model loaded and moved to GPU.


In [6]:
# --- 8. Define Optimizer and Loss ---
# Full fine-tuning: every model parameter (not only the classifier head) is
# handed to AdamW, using a small learning rate and a weight decay of 1e-2.
optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-5,
    weight_decay=1e-2,
)

# Cross-entropy loss, applied to the model's logits in the training loop
criterion = nn.CrossEntropyLoss()

print("Optimizer and Loss Function defined.")

Optimizer and Loss Function defined.


In [8]:
#------- 9: The Training & Validation Loop    -------#

import time
import copy
from tqdm.auto import tqdm

def train_model(model, criterion, optimizer, num_epochs=10, loaders=None):
    """Fine-tune ``model`` and return it with the best validation weights loaded.

    Every epoch runs a 'train' pass (with gradient updates) followed by a 'val'
    pass (gradients disabled). The weights from the epoch with the highest
    validation accuracy are kept and restored before returning.

    Args:
        model: Module whose forward pass returns an object exposing ``.logits``.
            It must have at least one parameter; batches are moved to the
            device of its first parameter.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of train/val epochs to run.
        loaders: Optional mapping with 'train' and 'val' iterables yielding
            ``(inputs, labels)`` batches. Defaults to the notebook-level
            ``dataloaders`` dict.

    Returns:
        The same ``model`` object with the best validation weights loaded.

    Raises:
        ValueError: If a phase yields no samples.
    """
    if loaders is None:
        loaders = dataloaders  # notebook-level dict built in the DataLoader cell
    # Follow the model's own device rather than depending on a global `device`.
    run_device = next(model.parameters()).device

    since = time.time()

    # Snapshot of the best weights seen so far (starts as the initial weights).
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'\n--- Epoch {epoch+1}/{num_epochs} ---')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            for batch in tqdm(loaders[phase], desc=f"Processing {phase} batches"):
                inputs = batch[0].to(run_device)  # images
                labels = batch[1].to(run_device)  # integer class labels

                # Gradients are only accumulated (and so only need clearing) in training.
                if is_train:
                    optimizer.zero_grad()

                # Track autograd history only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    logits = outputs.logits

                    _, preds = torch.max(logits, 1)
                    loss = criterion(logits, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Accumulate plain Python numbers so the epoch metrics (and
                # best_acc) stay ordinary floats rather than device tensors.
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += (preds == labels).sum().item()
                samples_seen += batch_size

            if samples_seen == 0:
                raise ValueError(f"No samples were produced for phase '{phase}'.")

            # Average over the samples actually iterated, which stays correct
            # even if the loader drops or subsamples items.
            epoch_loss = running_loss / samples_seen
            epoch_acc = running_corrects / samples_seen

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Keep a copy of the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                print(f'New best val acc: {best_acc:.4f}!')

    time_elapsed = time.time() - since
    print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

# --- 10. Fine-tune the model ---
print("Starting model fine-tuning...")

# Run four epochs; train_model hands back the model with its best
# validation weights already loaded.
model_ft = train_model(model, criterion, optimizer, num_epochs=4)

# --- 11. Persist the best weights ---
import os

SAVE_PATH = '../models/fer_model_finetuned.pth'
save_dir = os.path.dirname(SAVE_PATH)
os.makedirs(save_dir, exist_ok=True)  # create the target folder if it is missing

best_state = model_ft.state_dict()
torch.save(best_state, SAVE_PATH)

print("\n--- Training Finished ---")
print(f"Best model saved to {SAVE_PATH}")

Starting model fine-tuning...

--- Epoch 1/4 ---
----------


Processing train batches:   0%|          | 0/81 [00:00<?, ?it/s]

train Loss: 1.5483 Acc: 0.4395


Processing val batches:   0%|          | 0/40 [00:00<?, ?it/s]

val Loss: 2.1345 Acc: 0.1815
New best val acc: 0.1815!

--- Epoch 2/4 ---
----------


Processing train batches:   0%|          | 0/81 [00:00<?, ?it/s]

train Loss: 1.3208 Acc: 0.5294


Processing val batches:   0%|          | 0/40 [00:00<?, ?it/s]

val Loss: 2.2372 Acc: 0.1704

--- Epoch 3/4 ---
----------


Processing train batches:   0%|          | 0/81 [00:00<?, ?it/s]

train Loss: 1.0992 Acc: 0.6352


Processing val batches:   0%|          | 0/40 [00:00<?, ?it/s]

val Loss: 2.2923 Acc: 0.1752

--- Epoch 4/4 ---
----------


Processing train batches:   0%|          | 0/81 [00:00<?, ?it/s]

train Loss: 0.8913 Acc: 0.7165


Processing val batches:   0%|          | 0/40 [00:00<?, ?it/s]

val Loss: 2.2632 Acc: 0.1847
New best val acc: 0.1847!

Training complete in 30m 35s
Best val Acc: 0.184713

--- Training Finished ---
Best model saved to ../models/fer_model_finetuned.pth
