# Natural Language Processing - Project part 1

### Setup and Input

In [2]:
!pip install nltk
!pip install pandas
!pip install gensim
!pip install re
!pip install numpy
!pip install tensorflow

[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for re[0m[31m


In [3]:

import gensim
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.downloader import load
import torch
import pandas as pd
from torch.utils.data import Dataset
import math

In [4]:
import sys
import os
import torch
from torch.utils.data import DataLoader
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive', force_remount=True)
DRIVE_NAME = "NLPproject1"
ROOT_PATH = os.path.join('/content/drive/MyDrive/', DRIVE_NAME)
SCRIPTS_PATH = f'{ROOT_PATH}/Scripts'
DATA_PATH = f'{ROOT_PATH}/Dataset'

# Make the helper scripts importable; the membership check keeps sys.path free
# of duplicates when this cell is re-run.
if SCRIPTS_PATH not in sys.path:
    sys.path.append(SCRIPTS_PATH)

# --- Import the project's custom classes (presumably modules under SCRIPTS_PATH) ---
from preprocessing import GloveEmbedder
from dataset import EmpathyDataset
from ann_model import DeepEmpathyNet

print("✅ Setup completato. Tutte le classi sono state importate.")

Mounted at /content/drive
✅ Setup completato. Tutte le classi sono state importate.


### Data Preparation

In [5]:
# Embedder initialization (GloVe model); the same embedder instance is shared
# by the training and evaluation datasets below.
glove_embedder = GloveEmbedder(model_name="glove-wiki-gigaword-100")

# Data paths
train_csv_path = f"{DATA_PATH}/trac2_CONVT_train.csv"
eval_csv_path = f"{DATA_PATH}/trac2_CONVT_dev.csv"

# Create the Dataset instances.
# Each instance is built from a CSV path and the embedder; loading, cleaning
# and embedding presumably happen inside EmpathyDataset (defined in dataset.py).
print("\n--- Training Dataset Creation ---")
train_dataset = EmpathyDataset(csv_path=train_csv_path, embedder=glove_embedder)

print("\n--- Evaluation Dataset Creation ---")
eval_dataset = EmpathyDataset(csv_path=eval_csv_path, embedder=glove_embedder)

# DataLoaders: only the training data is shuffled; evaluation order stays fixed.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"\n✅ DataLoaders successfully created. Batch size: {BATCH_SIZE}")

Caricamento del modello GloVe 'glove-wiki-gigaword-100'...
Modello GloVe caricato con successo.

--- Training Dataset Creation ---
Loaded and cleaned data from /content/drive/MyDrive/NLPproject1/Dataset/trac2_CONVT_train.csv. Number of samples:11090
Generating embeddings for 11090 samples...

--- Evaluation Dataset Creation ---
Loaded and cleaned data from /content/drive/MyDrive/NLPproject1/Dataset/trac2_CONVT_dev.csv. Number of samples:987
Generating embeddings for 987 samples...

✅ DataLoaders successfully created. Batch size: 32


### Model Initialization


In [6]:
# One dict per experiment. Each dict is passed whole to DeepEmpathyNet, and its
# "learning_rate" entry is also read when the optimizer is created.
experiment_configs = [
    {
        "experiment_name": "baseline_relu_dropout_0.3",
        "input_dim": glove_embedder.vector_size, # Expected to be 100 for the 100-d GloVe model — TODO confirm
        "hidden_dims": [256, 128],
        "dropout": 0.3,
        "activation": "relu",
        "norm_type": "layernorm",
        "num_classes": 4,
        "learning_rate": 1e-3,
    },
    {
        "experiment_name": "deep_gelu_dropout_0.5",
        "input_dim": glove_embedder.vector_size,
        "hidden_dims": [512, 256, 128], # Deeper network
        "dropout": 0.5,                  # More aggressive dropout
        "activation": "gelu",            # Different activation
        "norm_type": "batchnorm",
        "num_classes": 4,
        "learning_rate": 1e-4,           # Lower learning rate
    },
    {
        "experiment_name": "shallow_leakyrelu_low_dropout",
        "input_dim": glove_embedder.vector_size,
        "hidden_dims": [512],           # Shallower network
        "dropout": 0.2,
        "activation": "leakyrelu",
        "norm_type": "layernorm",
        "num_classes": 4,
        "learning_rate": 1e-3,
    }
    # Add other configs
]

# Number of epochs each experiment is trained for.
NUM_EPOCHS = 10
# Directory (on the mounted Drive) where the best checkpoint of each experiment is written.
MODELS_SAVE_PATH = f"{ROOT_PATH}/Saved Models"
os.makedirs(MODELS_SAVE_PATH, exist_ok=True)

print(f"Read {len(experiment_configs)} esperimenti.")
print(f"I modelli migliori verranno salvati in: {MODELS_SAVE_PATH}")

Read 3 esperimenti.
I modelli migliori verranno salvati in: /content/drive/MyDrive/NLPproject1/Saved Models


### Training and Evaluation

In [7]:
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def train_one_epoch(model, dataloader, optimizer, loss_fns, device, loss_weights=None):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module whose forward pass returns a dict with the keys
            ``'intensity'``, ``'empathy'`` and ``'polarity'``.
        dataloader: Iterable of batches; each batch is a dict with a
            ``'features'`` tensor and a ``'labels'`` dict of tensors keyed
            ``'intensity'``, ``'empathy'`` and ``'polarity'``.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fns: Dict with a ``'regression'`` loss (applied to intensity and
            empathy) and a ``'classification'`` loss (applied to polarity).
        device: Device the batch tensors are moved to.
        loss_weights: Optional dict with the weights of the three loss terms
            (keys ``'intensity'``, ``'empathy'``, ``'polarity'``). Defaults to
            0.2 / 0.2 / 0.6, the values previously hard-coded here.

    Returns:
        float: Sum of the per-batch combined losses divided by the number of
        batches processed.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if loss_weights is None:
        loss_weights = {'intensity': 0.2, 'empathy': 0.2, 'polarity': 0.6}

    model.train()
    total_loss = 0.0
    # Batches are counted while iterating so loaders that do not implement
    # __len__ (e.g. over an iterable-style dataset) are supported as well.
    num_batches = 0

    for batch in dataloader:
        features = batch['features'].to(device)
        labels = {k: v.to(device) for k, v in batch['labels'].items()}

        # Forward pass
        outputs = model(features)

        # Per-task losses.
        # NOTE(review): with nn.MSELoss a (B, 1) prediction against a (B,) target
        # broadcasts silently to (B, B) — confirm model outputs and labels share
        # the same shape.
        loss_intensity = loss_fns['regression'](outputs['intensity'], labels['intensity'])
        loss_empathy = loss_fns['regression'](outputs['empathy'], labels['empathy'])
        loss_polarity = loss_fns['classification'](outputs['polarity'], labels['polarity'])

        # Weighted multi-task loss
        combined_loss = (loss_weights['intensity'] * loss_intensity +
                         loss_weights['empathy'] * loss_empathy +
                         loss_weights['polarity'] * loss_polarity)

        # Backward pass and parameter update
        optimizer.zero_grad()
        combined_loss.backward()
        optimizer.step()

        total_loss += combined_loss.item()
        num_batches += 1

    if num_batches == 0:
        # Previously surfaced as an opaque ZeroDivisionError.
        raise ValueError("train_one_epoch received an empty dataloader")

    return total_loss / num_batches

def evaluate_performance(model, dataloader, loss_fns, device, loss_weights=None):
    """Evaluate ``model`` on ``dataloader`` and return loss and task metrics.

    Args:
        model: Module whose forward pass returns a dict with the keys
            ``'intensity'``, ``'empathy'`` and ``'polarity'``.
        dataloader: Iterable of batches; each batch is a dict with a
            ``'features'`` tensor and a ``'labels'`` dict of tensors keyed
            ``'intensity'``, ``'empathy'`` and ``'polarity'``.
        loss_fns: Dict with a ``'regression'`` loss (intensity, empathy) and a
            ``'classification'`` loss (polarity).
        device: Device the batch tensors are moved to.
        loss_weights: Optional dict with the weights of the three loss terms
            (keys ``'intensity'``, ``'empathy'``, ``'polarity'``). Defaults to
            0.2 / 0.2 / 0.6, the values previously hard-coded here.

    Returns:
        dict: ``val_loss`` (mean combined loss per batch), ``intensity_mae``,
        ``empathy_mae``, ``polarity_accuracy``, ``polarity_precision``,
        ``polarity_recall`` and ``polarity_f1`` (the last three are
        support-weighted averages).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if loss_weights is None:
        loss_weights = {'intensity': 0.2, 'empathy': 0.2, 'polarity': 0.6}

    model.eval()
    total_loss = 0.0
    # Counted while iterating so loaders without __len__ work as well.
    num_batches = 0

    all_intensity_preds, all_intensity_labels = [], []
    all_empathy_preds, all_empathy_labels = [], []
    all_polarity_preds, all_polarity_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            features = batch['features'].to(device)
            labels = {k: v.to(device) for k, v in batch['labels'].items()}

            outputs = model(features)

            loss_intensity = loss_fns['regression'](outputs['intensity'], labels['intensity'])
            loss_empathy = loss_fns['regression'](outputs['empathy'], labels['empathy'])
            loss_polarity = loss_fns['classification'](outputs['polarity'], labels['polarity'])
            combined_loss = (loss_weights['intensity'] * loss_intensity +
                             loss_weights['empathy'] * loss_empathy +
                             loss_weights['polarity'] * loss_polarity)
            total_loss += combined_loss.item()
            num_batches += 1

            # --- Keep predictions and labels (on CPU) for the metric computation ---
            # Regression heads
            all_intensity_preds.append(outputs['intensity'].cpu())
            all_intensity_labels.append(labels['intensity'].cpu())
            all_empathy_preds.append(outputs['empathy'].cpu())
            all_empathy_labels.append(labels['empathy'].cpu())

            # Classification head: predicted class = index of the largest logit
            polarity_preds = torch.argmax(outputs['polarity'], dim=1)
            all_polarity_preds.append(polarity_preds.cpu())
            all_polarity_labels.append(labels['polarity'].cpu())

    if num_batches == 0:
        # Previously this failed inside torch.cat with an unrelated-looking error.
        raise ValueError("evaluate_performance received an empty dataloader")

    # Concatenate the per-batch results
    all_intensity_preds = torch.cat(all_intensity_preds)
    all_intensity_labels = torch.cat(all_intensity_labels)
    all_empathy_preds = torch.cat(all_empathy_preds)
    all_empathy_labels = torch.cat(all_empathy_labels)
    all_polarity_preds = torch.cat(all_polarity_preds)
    all_polarity_labels = torch.cat(all_polarity_labels)

    # --- Metrics ---
    # MAE for the regression heads (PyTorch L1 loss over the whole set).
    # NOTE(review): l1_loss broadcasts if predictions are (N, 1) and labels (N,)
    # — confirm the shapes match.
    mae_intensity = nn.functional.l1_loss(all_intensity_preds, all_intensity_labels).item()
    mae_empathy = nn.functional.l1_loss(all_empathy_preds, all_empathy_labels).item()

    # Classification metrics (scikit-learn); hand it NumPy arrays explicitly
    # instead of relying on implicit tensor conversion.
    polarity_true = all_polarity_labels.numpy()
    polarity_pred = all_polarity_preds.numpy()
    accuracy_polarity = accuracy_score(polarity_true, polarity_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        polarity_true, polarity_pred, average='weighted', zero_division=0
    )

    # Collect every metric in a single dict
    metrics = {
        "val_loss": total_loss / num_batches,
        "intensity_mae": mae_intensity,
        "empathy_mae": mae_empathy,
        "polarity_accuracy": accuracy_polarity,
        "polarity_precision": precision,
        "polarity_recall": recall,
        "polarity_f1": f1
    }

    return metrics

print("✅ Training and evaluation function defined.")

✅ Training and evaluation function defined.


In [8]:
# ============= #
# EXPERIMENT 1  #
# ============= #
# Train every configuration in `experiment_configs`, keep the checkpoint of the
# epoch with the lowest validation loss for each one, and collect that epoch's
# metrics in `results`.

device = "cuda" if torch.cuda.is_available() else "cpu"
results = []
best_overall_val_loss = float('inf')

for config in experiment_configs:
    print(f"\n{'='*20} START EXPERIMENT: {config['experiment_name']} {'='*20}")

    model = DeepEmpathyNet(config).to(device)

    loss_functions = {
        'regression': nn.MSELoss(),
        'classification': nn.CrossEntropyLoss()
    }
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])

    # Resolve the checkpoint path before the epoch loop. It used to be assigned
    # only when an epoch improved, so an experiment that never improved (e.g. a
    # NaN validation loss) either hit a NameError or silently reported the
    # previous experiment's checkpoint path.
    model_save_path = os.path.join(MODELS_SAVE_PATH, f"{config['experiment_name']}_best.pth")
    checkpoint_saved = False

    best_epoch_metrics = {"val_loss": float('inf')}

    for epoch in range(NUM_EPOCHS):
        train_loss = train_one_epoch(model, train_loader, optimizer, loss_functions, device)
        val_metrics = evaluate_performance(model, eval_loader, loss_functions, device)

        print(f"Epoch {epoch+1}/{NUM_EPOCHS} -> Train Loss: {train_loss:.4f} | Val Loss: {val_metrics['val_loss']:.4f} | "
              f"Polarity F1: {val_metrics['polarity_f1']:.4f} | Intensity MAE: {val_metrics['intensity_mae']:.4f}")

        # Checkpoint on validation-loss improvement
        if val_metrics['val_loss'] < best_epoch_metrics['val_loss']:
            best_epoch_metrics = val_metrics  # keep every metric of the best epoch
            torch.save(model.state_dict(), model_save_path)
            checkpoint_saved = True
            print(f"  -> New best model for this experiment! Saved in {model_save_path}")

    if not checkpoint_saved:
        print(f"  -> WARNING: no checkpoint was saved for '{config['experiment_name']}' "
              f"(validation loss never improved).")

    # Record the best epoch's metrics; only advertise a checkpoint path if a
    # file was actually written for this experiment.
    final_result = {
        "experiment_name": config['experiment_name'],
        "model_path": model_save_path if checkpoint_saved else None,
    }
    final_result.update(best_epoch_metrics)
    results.append(final_result)

    if best_epoch_metrics['val_loss'] < best_overall_val_loss:
        best_overall_val_loss = best_epoch_metrics['val_loss']
        print(f"\nNEW BEST MODEL OVERALL: {config['experiment_name']} 🏆")

print(f"\n{'='*20} END OF ALL EXPERIMENTS {'='*20}")


Epoch 1/10 -> Train Loss: 0.8119 | Val Loss: 0.7568 | Polarity F1: 0.6156 | Intensity MAE: 0.5231
  -> New best model for this experiment! Saved in /content/drive/MyDrive/NLPproject1/Saved Models/baseline_relu_dropout_0.3_best.pth
Epoch 2/10 -> Train Loss: 0.7127 | Val Loss: 0.7426 | Polarity F1: 0.6191 | Intensity MAE: 0.5334
  -> New best model for this experiment! Saved in /content/drive/MyDrive/NLPproject1/Saved Models/baseline_relu_dropout_0.3_best.pth
Epoch 3/10 -> Train Loss: 0.6964 | Val Loss: 0.7388 | Polarity F1: 0.6395 | Intensity MAE: 0.5110
  -> New best model for this experiment! Saved in /content/drive/MyDrive/NLPproject1/Saved Models/baseline_relu_dropout_0.3_best.pth
Epoch 4/10 -> Train Loss: 0.6876 | Val Loss: 0.7307 | Polarity F1: 0.6388 | Intensity MAE: 0.5086
  -> New best model for this experiment! Saved in /content/drive/MyDrive/NLPproject1/Saved Models/baseline_relu_dropout_0.3_best.pth
Epoch 5/10 -> Train Loss: 0.6751 | Val Loss: 0.7355 | Polarity F1: 0.6562 |

### Analysis Results

In [9]:
from datetime import datetime
import pandas as pd

# Build a per-experiment summary table, save it as a dated CSV under
# ROOT_PATH/Report, and announce the experiment with the lowest validation loss.
REPORT_PATH = os.path.join(ROOT_PATH, 'Report')
os.makedirs(REPORT_PATH, exist_ok=True)

# The file name carries only the date, so re-running on the same day
# overwrites that day's report.
timestamp = datetime.now().strftime("%Y%m%d")
report_filename = f"experiment_results_{timestamp}.csv"
report_filepath = os.path.join(REPORT_PATH, report_filename)

results_df = pd.DataFrame(results)

# Select and rename the columns for readability
display_columns = {
    'experiment_name': 'Experiment',
    'val_loss': 'Validation Loss',
    'polarity_f1': 'Polarity F1',
    'polarity_accuracy': 'Polarity Acc.',
    'intensity_mae': 'Intensity MAE',
    'empathy_mae': 'Empathy MAE'
}
results_df_display = results_df[list(display_columns.keys())].rename(columns=display_columns)
results_df_sorted = results_df_display.sort_values(by="Validation Loss", ascending=True).reset_index(drop=True)

try:
    results_df_sorted.to_csv(report_filepath, index=False)
except OSError as e:
    # The save failed: report it and preview the in-memory table instead of
    # reading back a file that is missing or stale.
    print(f"Errore durante il salvataggio del report: {e}")
    saved_report = results_df_sorted
else:
    print(f"Report salvato con successo!")
    print(f"Percorso del file: {report_filepath}")
    # Read the file back so the preview shows what was actually written.
    saved_report = pd.read_csv(report_filepath)

print("\n--- Report preview ---")
print(saved_report.head())

# The table is sorted by validation loss, so row 0 is the best experiment.
best_experiment_name = results_df_sorted.iloc[0]['Experiment']
best_experiment_full_info = next(item for item in results if item["experiment_name"] == best_experiment_name)

print(f"\nThe best experiment is '{best_experiment_full_info['experiment_name']}'")
print(f"with a validation loss: {best_experiment_full_info['val_loss']:.4f}")
print(f"The corresponding model is saved in: {best_experiment_full_info['model_path']}")


# To load the best model later:
# model_config = ... (the entry of experiment_configs for the best experiment)
# best_model = DeepEmpathyNet(model_config)
# best_model.load_state_dict(torch.load(best_experiment_full_info['model_path'], map_location=device))
# best_model.to(device)

Report salvato con successo!
Percorso del file: /content/drive/MyDrive/NLPproject1/Report/experiment_results_20251014.csv

--- Report preview ---
                      Experiment  Validation Loss  Polarity F1  Polarity Acc.  \
0      baseline_relu_dropout_0.3         0.713343     0.643797       0.654509   
1  shallow_leakyrelu_low_dropout         0.720213     0.642761       0.656535   
2          deep_gelu_dropout_0.5         0.785186     0.634318       0.651469   

   Intensity MAE  Empathy MAE  
0       0.505874     0.745510  
1       0.522459     0.758093  
2       0.612291     0.772994  

The best experiment is 'baseline_relu_dropout_0.3'
with a validation loss: 0.7133
The corresponding model is saved in: /content/drive/MyDrive/NLPproject1/Saved Models/baseline_relu_dropout_0.3_best.pth


### Test

In [10]:
from dataset import InferenceDataset

# Run the best experiment's checkpoint on the test set and write the
# predictions to a CSV file.
TEST_CSV_PATH = f"{DATA_PATH}/trac2_CONVT_test.csv"
SUBMISSION_PATH = os.path.join(REPORT_PATH, 'ann_report.csv')
ID_COLUMN_NAME = 'id'       # Check the name of the ID column in the test file
TEXT_COLUMN_NAME = 'text'   # Check the name of the text column

print("--- Start inference on test set ---")

# Look up the configuration and checkpoint path of the best experiment
best_experiment_name = results_df_sorted.iloc[0]['Experiment']
best_experiment_info = next(item for item in experiment_configs if item["experiment_name"] == best_experiment_name)
best_model_path = os.path.join(MODELS_SAVE_PATH, f"{best_experiment_name}_best.pth")

print(f"Best model: '{best_experiment_name}'")
model = DeepEmpathyNet(best_experiment_info).to(device)
# map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()


test_dataset = InferenceDataset(
    csv_path=TEST_CSV_PATH,
    embedder=glove_embedder,
    id_column=ID_COLUMN_NAME,
    text_column=TEXT_COLUMN_NAME
)
# shuffle=False keeps the predictions in the same order as the input rows.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


all_ids = []
all_emotion_preds = []
all_empathy_preds = []
all_polarity_preds = []

print("Predictions running...")
with torch.no_grad():  # no gradients needed for inference
    for batch in test_loader:
        # NOTE(review): `.numpy()` below assumes the ids are collated into a
        # tensor (numeric ids) — confirm against InferenceDataset.
        ids = batch['id']
        features = batch['features'].to(device)

        outputs = model(features)

        # reshape(-1) always yields a 1-D array. squeeze() collapsed a final
        # batch of size 1 to a 0-d array, which list.extend cannot iterate.
        emotion_preds = outputs['intensity'].reshape(-1).cpu().numpy()
        empathy_preds = outputs['empathy'].reshape(-1).cpu().numpy()
        polarity_preds = torch.argmax(outputs['polarity'], dim=1).cpu().numpy()

        all_ids.extend(ids.numpy())
        all_emotion_preds.extend(emotion_preds)
        all_empathy_preds.extend(empathy_preds)
        all_polarity_preds.extend(polarity_preds)

print("Predictions completed.")

submission_df = pd.DataFrame({
    'id': all_ids,
    'Emotion': all_emotion_preds,
    'EmotionalPolarity': all_polarity_preds,
    'Empathy': all_empathy_preds
})

submission_df['EmotionalPolarity'] = submission_df['EmotionalPolarity'].astype(int)


submission_df.to_csv(SUBMISSION_PATH, index=False)

print(f"\n✅ File di submission creato con successo!")
print(f"Percorso del file: {SUBMISSION_PATH}")

print("\n--- Anteprima del file di submission ---")
print(submission_df.head())

--- Start inference on test set ---
Best model: 'baseline_relu_dropout_0.3'
Generazione degli embedding per 2311 campioni di test...
Predictions running...
Predictions completed.

✅ File di submission creato con successo!
Percorso del file: /content/drive/MyDrive/NLPproject1/Report/ann_report.csv

--- Anteprima del file di submission ---
   id   Emotion  EmotionalPolarity   Empathy
0   1  2.195595                  2  2.051561
1   2  2.463433                  2  2.298273
2   3  2.412533                  2  2.224228
3   4  2.161638                  2  2.134687
4   5  2.682936                  2  2.594292
