In [None]:
import pandas as pd 
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.optim as optim
from itertools import product
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import StratifiedKFold


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
class VisualClassifier(nn.Module):
    def __init__(self):
        super(VisualClassifier, self).__init__()
        self.fc1 = nn.Linear(2048, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(0.7)
        self.fc2 = nn.Linear(256, 4)
        self.bn2 = nn.BatchNorm1d(4)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        return x

In [None]:
class TextClassifier(nn.Module):
    def __init__(self, input_dim = 768, output_dim = 4, hidden_dim = 256):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(hidden_dim // 2, output_dim)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x


In [None]:
class AudialClassifier(nn.Module):
    def __init__(self):
        super(AudialClassifier, self).__init__()
        self.fc1 = nn.Linear(128, 16)
        self.bn1 = nn.BatchNorm1d(16)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(16, 4)
        self.bn2 = nn.BatchNorm1d(4)
   
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        return x

In [None]:
def train_model(model, dataloaders, optimizer, criterion, device, num_epochs=50, patience=10):

    """
    Trains and validates the model.
    
    Args:
    - model (torch.nn.Module): The PyTorch model to train.
    - dataloaders (dict): A dictionary containing 'train' and 'val' DataLoaders.
    - optimizer (torch.optim.Optimizer): The optimizer to use for training.
    - criterion (torch.nn.Module): The loss function.
    - num_epochs (int): The number of epochs to train for.
    - patience (int): The patience for early stopping.
    """
    best_val_f1 = -float('inf')  
    patience_counter = 0
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in dataloaders['train']:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        model.eval()
        val_preds = []
        val_labels = []
        val_probs = []
        with torch.no_grad():
            for inputs, labels in dataloaders['val']:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
                val_probs.extend(outputs.cpu().numpy())


        val_accuracy = np.mean(np.array(val_preds) == np.array(val_labels))
        val_f1 = f1_score(val_labels, val_preds, average='micro')
        
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1  
            patience_counter = 0  
            # print(f"Validation F1 improved. Saving model...")
            # torch.save(model.state_dict(), 'best_model_checkpoint.pth')
        else:
            patience_counter += 1 
            # print(f'Validation F1 did not improve. Patience: {patience_counter}/{patience}')
        
        # Early stopping check
        if patience_counter >= patience:
            print("Early stopping triggered")
            break  


    print(f'Validation Accuracy: {val_accuracy:.4f}, Best Validation F1 Score: {best_val_f1:.4f}')
    
    return val_accuracy, best_val_f1, np.array(val_probs), np.array(val_preds), np.array(val_labels)


In [None]:
def extract_and_pool_features(df, feature_types, base_path="../data/"):
    """
    Extracts features from specified columns in the DataFrame, applies mean pooling,
    and updates the DataFrame with new columns for these processed features.
    
    Args:
    - df (DataFrame): The pandas DataFrame containing the features.
    - feature_types (dict): A dictionary mapping from 'visual' and 'audio' to their respective column names in df.
    - base_path (str): Base path where the feature files are stored.
    """
    
    for key, column in feature_types.items():
        features_list = []
        for _, row in df.iterrows():
            file_path = row[column]
            features = np.load(f"{base_path}{file_path}")
            features_list.append(np.mean(features, axis=0) if key != 'text' else features)
        
        df[f'extracted_{key}_features'] = features_list


In [None]:
def prepare_datasets_and_loaders(df, feature_columns, label_column='emotion_labels', batch_size = 4, test_size = 0.2):
    """
    Prepares datasets and dataloaders for training and validation.
    
    Args:
    - df (DataFrame): The pandas DataFrame containing the pooled features and labels.
    - feature_columns (list): List of column names for the features to be used.
    - label_column (str): The column name where the label data is stored.
    - batch_size (int): Batch size for the dataloaders.
    - test_size (float): Proportion of the dataset to include in the test split.
    
    Returns:
    - A dictionary of dataloaders for training and validation for each feature type.
    """

    dataloaders = {}
    y = torch.tensor(df[label_column].values, dtype = torch.long)

    for feature_type in feature_columns:
        X = np.array(df[feature_type].tolist(), dtype = np.float32)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = test_size, random_state = 42)
        
        train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train), y_train)
        val_dataset = torch.utils.data.TensorDataset(torch.tensor(X_val), y_val)
        
        train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
        val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
        
        dataloaders[f'{feature_type}_train'] = train_loader
        dataloaders[f'{feature_type}_val'] = val_loader

    return dataloaders



### Handle Class Imbalances

In [None]:
df = pd.read_csv('../data/csv/dataset.csv')

labels = df['emotion_labels'].values
classes = np.unique(labels)
class_weights = compute_class_weight('balanced', classes=classes, y=labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
class_weights_tensor = class_weights_tensor.to('cuda')
class_weights_tensor, class_weights_tensor.shape


### RUN CLASSIFIER

### PARAM GRID SEARCH

In [None]:
# param_grid = {
#     'learning_rate': [0.0001, 0.001, 0.01],
#     'optimizer': [optim.Adam],
#     'criterion': [nn.CrossEntropyLoss],
#     'epochs': [30, 50],
#     'batch_size': [4, 16, 32],
#     'patience': [5, 10, 15],
#     'weight_decay': [0, 1e-4, 1e-2],
# }

param_grid = {
    'learning_rate': [1e-4],
    'optimizer': [optim.Adam],
    'criterion': [nn.CrossEntropyLoss],
    'epochs': [50],
    'batch_size': [32],
    'patience': [15],
    'weight_decay': [0],
}



def get_optimizer(optimizer_class, parameters, lr, weight_decay, momentum=None):
    if optimizer_class == optim.Adam:
        return optim.Adam(parameters, lr=lr, weight_decay=weight_decay)
    elif optimizer_class == optim.SGD:
        # Ensure momentum is provided for SGD; otherwise, default to 0
        return optim.SGD(parameters, lr=lr, momentum=momentum if momentum is not None else 0, weight_decay=weight_decay)


def get_criterion(criterion_class):
    if criterion_class == nn.CrossEntropyLoss:
        return nn.CrossEntropyLoss()
    elif criterion_class == nn.NLLLoss:
        return nn.NLLLoss()


In [None]:
def grid_search(df, feature_columns, param_grid, device='cuda'):
    max_vis_acc, max_aud_acc, max_text_acc = -np.inf, -np.inf, -np.inf
    best_params_vis, best_params_aud, best_params_text = None, None, None

    combinations = list(product(*param_grid.values()))

    for combination in tqdm(combinations):
        lr, optimizer_class, criterion_class, epochs, batch_size, patience, wd = combination
        
        dataloaders = prepare_datasets_and_loaders(df, feature_columns, batch_size=batch_size)
        
        model_vis = VisualClassifier().to(device)
        optimizer_vis = optim.Adam(model_vis.parameters(), lr=lr, weight_decay=wd)
        
        model_aud = AudialClassifier().to(device)
        optimizer_aud = optim.Adam(model_aud.parameters(), lr=lr, weight_decay=wd)

        model_text = TextClassifier().to(device)
        optimizer_text = optim.Adam(model_text.parameters(), lr=lr, weight_decay=wd)

        criterion = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        
        models_optimizers = {
            'extracted_visual_features': (model_vis, optimizer_vis),
            'extracted_audio_features': (model_aud, optimizer_aud),
            'extracted_text_features': (model_text, optimizer_text),
        }

        for feature_type, (model, optimizer) in models_optimizers.items():
            print(f"\nTraining {feature_type.split('_')[1].capitalize()} Model with lr={lr}, optimizer={optimizer_class.__name__}, criterion={criterion_class.__name__}, epochs={epochs}, batch_size={batch_size}, Patience={patience}, Weight decay={wd}")
            val_accuracy, best_val_f1, val_probs, val_preds, val_labels = train_model(
                model, 
                {'train': dataloaders[f'{feature_type}_train'], 'val': dataloaders[f'{feature_type}_val']}, 
                optimizer, criterion, device=device, num_epochs=epochs, patience=patience
            )

            if feature_type == 'extracted_visual_features' and val_accuracy > max_vis_acc:
                max_vis_acc = val_accuracy
                best_params_vis = {'learning_rate': lr, 'optimizer': optimizer_class.__name__, 'criterion': criterion_class.__name__, 'epochs': epochs, 'batch_size': batch_size, 'patience': patience, 'weight_decay': wd, 'validation_accuracy': val_accuracy}
            
            elif feature_type == 'extracted_audio_features' and val_accuracy > max_aud_acc:
                max_aud_acc = val_accuracy
                best_params_aud = {'learning_rate': lr, 'optimizer': optimizer_class.__name__, 'criterion': criterion_class.__name__, 'epochs': epochs, 'batch_size': batch_size, 'patience': patience, 'weight_decay': wd, 'validation_accuracy': val_accuracy}
            
            elif feature_type == 'extracted_text_features' and val_accuracy > max_text_acc:
                max_text_acc = val_accuracy
                best_params_text = {'learning_rate': lr, 'optimizer': optimizer_class.__name__, 'criterion': criterion_class.__name__, 'epochs': epochs, 'batch_size': batch_size, 'patience': patience, 'weight_decay': wd, 'validation_accuracy': val_accuracy}

    return best_params_vis, best_params_aud, best_params_text

In [None]:
model_aud = AudialClassifier()
model_aud = model_aud.to(device)

model_vis = VisualClassifier()
model_vis = model_vis.to(device)

model_text = TextClassifier()
model_text = model_text.to(device)

criterion = torch.nn.CrossEntropyLoss(weight = class_weights_tensor)

feature_types = {'visual': 'visual_features', 'audio': 'acoustic_features', 'text':'lexical_features'}
feature_columns = ['extracted_visual_features', 'extracted_audio_features','extracted_text_features']

extract_and_pool_features(df, feature_types)
dataloaders = prepare_datasets_and_loaders(df, feature_columns, batch_size = 4)

optimizer_aud = optim.Adam(model_aud.parameters(), lr = 0.001, weight_decay = 1e-4)
optimizer_vis = optim.Adam(model_vis.parameters(), lr = 0.001, weight_decay = 0)
optimizer_text = optim.Adam(model_vis.parameters(), lr = 0.001, weight_decay = 1e-4)


models_optimizers = {
    'extracted_visual_features': (model_vis, optimizer_vis),
    'extracted_audio_features': (model_aud, optimizer_aud),
    'extracted_text_features': (model_text, optimizer_text),
}

model_outputs = {}
for feature_type, (model, optimizer) in models_optimizers.items():
    print(f"Training with {feature_type}:")
    val_accuracy, best_val_f1, val_probs, val_preds, val_labels = train_model(
        model, 
        {'train': dataloaders[f'{feature_type}_train'], 'val': dataloaders[f'{feature_type}_val']}, 
        optimizer, criterion, device = device, num_epochs = 30, patience = 15)

    model_outputs[feature_type] = {
        'val_accuracy': val_accuracy,
        'best_val_f1': best_val_f1,
        'val_preds': val_preds,
        'val_labels': val_labels
    }

len(model_outputs['extracted_text_features']['val_labels'])

In [None]:
vis_params, aud_params, text_params = grid_search(df, feature_columns, param_grid, device='cuda')
print("Best Visual Model Params:", vis_params)
print("Best Audio Model Params:", aud_params)
print("Best Text Model Params:", text_params)


In [None]:
vis_params , aud_params

## Lack of alignment temporally and lack of similarity of shapes of data 

### Early Fusion

In [None]:
class ComplexConcatModel(nn.Module):
    def __init__(self, input_dim, output_dim=4):
        super(ComplexConcatModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.dropout1 = nn.Dropout(0.5)
        
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.5)
        
        self.fc3 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(0.5)
        
        self.fc4 = nn.Linear(256, 128)
        self.bn4 = nn.BatchNorm1d(128)
        self.dropout4 = nn.Dropout(0.5)

        self.fc5 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        x = F.relu(self.bn4(self.fc4(x)))
        x = self.dropout4(x)
        x = self.fc5(x)
        return x


In [None]:
class ConcatDataset(Dataset):
    def __init__(self, features, labels):
        """
        features: Numpy array of concatenated features.
        labels: Numpy array of labels.
        """
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float), torch.tensor(self.labels[idx], dtype=torch.long)

# Concatenate features
concatenated_features = np.hstack((
    np.array(df['extracted_visual_features'].tolist()),
    np.array(df['extracted_audio_features'].tolist()),
    np.array(df['extracted_text_features'].tolist())
))

print(concatenated_features.shape)
labels = df['emotion_labels'].values

# Split the data
X_train, X_val, y_train, y_val = train_test_split(concatenated_features, labels, test_size=0.2, random_state=42)  ### USE 5-FOLD CROSS VALIDATION 

clf = LogisticRegression(random_state = 0, max_iter=1000).fit(X_train, y_train)
yPred_clf = clf.predict(X_val)
# print(accuracy_score(y_val, yPred_clf))


train_dataset = ConcatDataset(X_train, y_train)
val_dataset = ConcatDataset(X_val, y_val)

dataloaders = {
    'train': DataLoader(train_dataset, batch_size = 16, shuffle = True),
    'val': DataLoader(val_dataset, batch_size = 16, shuffle = False)
}

model = ComplexConcatModel(input_dim = concatenated_features.shape[1]).to(device)  
optimizer = optim.Adam(model.parameters(), lr = 0.001, weight_decay = 1e-4)
criterion = torch.nn.CrossEntropyLoss(weight = class_weights_tensor)

_, _, _, _, _ = train_model(
    model = model,
    dataloaders = dataloaders,
    optimizer = optimizer,
    criterion = criterion,  
    device = device,
    num_epochs = 50,
    patience = 15
)


###  Late fusion 

In [None]:
predictions_aud = model_outputs['extracted_audio_features']['val_preds']
predictions_vis = model_outputs['extracted_visual_features']['val_preds']
predictions_text = model_outputs['extracted_text_features']['val_preds']


# final_predictions = (predictions_vis + predictions_aud + predictions_text) / 3
weight_dict = {
    'weight_vis': [i/100 for i in range(11, 100)],
    'weight_aud': [i/100 for i in range(11, 100)],
    'weight_text': [i/100 for i in range(11, 100)],
}

weight_combinations = list(product(*weight_dict.values()))
max_params = -np.inf
best_weights = None

for weights in tqdm(weight_combinations):
    # print(weights)
    weight_vis, weight_aud, weight_text = weights
    final_predictions_weighted = (weight_vis * predictions_vis + weight_aud * predictions_aud + weight_text * predictions_text)
    # print(final_predictions_weighted)
    final_predicted_classes = np.argmax(final_predictions_weighted)
    acc = (np.mean(final_predicted_classes == model_outputs['extracted_text_features']['val_labels'] ))

    if acc > max_params:
        max_params = acc
        best_weights = weights
final_predictions_weighted.shape, final_predicted_classes.shape

In [None]:
predictions_aud = model_outputs['extracted_audio_features']['val_preds']
predictions_vis = model_outputs['extracted_visual_features']['val_preds']
predictions_text = model_outputs['extracted_text_features']['val_preds']


# final_predictions = (predictions_vis + predictions_aud + predictions_text) / 3
weight_vis, weight_aud, weight_text  = best_weights
print(weight_vis, weight_aud, weight_text)
final_predictions_weighted = (weight_vis * predictions_vis + weight_aud * predictions_aud + weight_text * predictions_text)
final_predicted_classes = np.argmax(final_predictions_weighted, axis = 0)
print(np.mean(final_predicted_classes == model_outputs['extracted_text_features']['val_labels'] ))

# final_predicted_classes.shape

In [None]:
print(np.mean(np.argmax(model_outputs['extracted_text_features']['val_preds'], axis=0) == model_outputs['extracted_text_features']['val_labels'] ))

print(np.mean(np.argmax(model_outputs['extracted_visual_features']['val_preds'], axis=0) == model_outputs['extracted_text_features']['val_labels'] ))
print(np.mean(np.argmax(model_outputs['extracted_audio_features']['val_preds'] , axis=0) == model_outputs['extracted_text_features']['val_labels'] ))



### CONFUSION MATRIX

In [None]:
classes = ['Class1', 'Class2', 'Class3', 'Class4']

for feature_type, outputs in model_outputs.items():
    cm = confusion_matrix(outputs['val_preds'], outputs['val_labels'])

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    modality = feature_type.split('_')[1]  # Extract modality name from feature_type
    plt.title(f'Confusion Matrix for {modality.capitalize()} Model')
    plt.show()

In [108]:
def train_model_for_fold(model, dataloaders, optimizer, criterion, device,feature_type, num_epochs=50, patience=10):

    """
    Trains the model for a single fold in the cross-validation setup.

    Args:
    - model (torch.nn.Module): The PyTorch model to train.
    - dataloaders (dict): A dictionary containing 'train' and 'val' DataLoaders for the fold.
    - optimizer (torch.optim.Optimizer): The optimizer to use for training.
    - criterion (torch.nn.Module): The loss function.
    - device (torch.device): The device to train the model on.
    - num_epochs (int): The number of epochs to train for.
    - patience (int): The patience for early stopping.
    
    Returns:
    - fold_best_val_f1 (float): The best F1 score achieved on the validation set for the fold.
    """
    best_val_f1 = 0
    patience_counter = 0
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        # print('dataloaders: ', dataloaders)
        #for inputs, labels in dataloaders['extracted_visual_features_train']:
        for inputs, labels in dataloaders[f'{feature_type}_train']:

            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation phase
        val_labels = []
        val_preds = []
        model.eval()
        with torch.no_grad():
            for inputs, labels in dataloaders[f'{feature_type}_val']:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                val_labels.extend(labels.cpu().numpy())
                val_preds.extend(preds.cpu().numpy())

        val_f1 = f1_score(val_labels, val_preds, average='micro')
        
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

    return best_val_f1

g

In [107]:
model_names_dict = {'aud': AudialClassifier, 'vis' : VisualClassifier, 'text' : TextClassifier}
model_names = VisualClassifier# AudialClassifier#, VisualClassifier, TextClassifier
feature_model_dict = {'visual_features': ('extracted_visual_features', VisualClassifier), 'audio_features': ('extracted_audio_features', AudialClassifier), 'lexical_features': ('extracted_text_features', TextClassifier)}
feature_columns = ['extracted_visual_features', 'extracted_audio_features','extracted_text_features']

cross_validate_model( df, model_class = VisualClassifier, feature_type = 'extracted_visual_features', column='visual_features',  n_splits = 5)
cross_validate_model( df, model_class = AudialClassifier, feature_type = 'extracted_audio_features', column='acoustic_features',  n_splits = 5)
cross_validate_model( df, model_class = TextClassifier,   feature_type = 'extracted_text_features', column='lexical_features',  n_splits = 5)



curr f1: 0.5298507462686567
curr f1: 0.5340909090909091
curr f1: 0.5454545454545454
curr f1: 0.5454545454545454
curr f1: 0.5871212121212122
Mean F1 across folds: 0.5484 ± 0.0203
Early stopping triggered at epoch 46
curr f1: 0.5932835820895522
curr f1: 0.5606060606060606
curr f1: 0.571969696969697
curr f1: 0.553030303030303
curr f1: 0.6136363636363636
Mean F1 across folds: 0.5785 ± 0.0222
curr f1: 0.667910447761194
curr f1: 0.6363636363636364
curr f1: 0.6590909090909091
curr f1: 0.6515151515151515
curr f1: 0.6212121212121212
Mean F1 across folds: 0.6472 ± 0.0166
