# 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from time import time
import warnings
import gc # Garbage collector

# NLTK for basic tokenization and vocab building
import nltk
from nltk.tokenize import word_tokenize

# Importing word_tokenize never raises LookupError: the missing-resource error
# only surfaces when the tokenizer is first *called*. Check for the tokenizer
# data up front and download whatever is absent. Newer NLTK releases read the
# "punkt_tab" resource instead of "punkt", so both are checked.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_resource}")
    except LookupError:
        nltk.download(_nltk_resource)

# Scikit-learn for metrics
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

# Ignore warnings
warnings.filterwarnings('ignore')

# 2. Configuration

In [None]:
# --- Paths ---
# Each domain is read from DATA_DIR/<domain>/{train,validation,test}.csv.
DATA_DIR = "../data/processed"
MODEL_SAVE_DIR = "../models/dl"
RESULTS_SAVE_DIR = "../results"
RESULTS_CSV_FILE = os.path.join(RESULTS_SAVE_DIR, "dl_results_summary.csv")
# --- Path to pre-trained embeddings (only read when EMBEDDING_TYPE == "GloVe") ---
# Download from https://nlp.stanford.edu/projects/glove/
GLOVE_PATH = "../embeddings/glove.6B.100d.txt" # Example path; vector size must equal EMBEDDING_DIM

# --- Experiment Setup ---
DOMAINS = ["book_reviews", "financial_news"] # Sub-folder names under DATA_DIR
# Models to run. Names are matched by the experiment loop; the two *StaticEmb
# entries are aliases that instantiate CNNClassifier / LSTMClassifier.
DL_MODELS_TO_RUN = [
    "MLPClassifier",
    "RNNClassifier", # Simple RNN (optional, usually worse than LSTM)
    "CNNClassifier",
    "LSTMClassifier",
    "BiLSTMClassifier",
    # Advanced level (intended for EMBEDDING_TYPE='GloVe' or similar)
    "CNNStaticEmbClassifier",
    "LSTMStaticEmbClassifier",
    "CNNLSTMHybridClassifier",
    "WordCharParallelClassifier" # Most complex: word BiLSTM + character CNN
]
EMBEDDING_TYPE = "Learned" # Options: "Learned", "GloVe", "FastText", "Word2Vec" (only "GloVe" triggers vector loading in this file)

# --- Model Hyperparameters ---
VOCAB_SIZE = 20000       # Max word-vocabulary size, including <pad> and <unk>
EMBEDDING_DIM = 100      # Word embedding size (must match the pre-trained vectors when using GloVe etc.)
HIDDEN_DIM_RNN = 128     # Hidden units per direction for RNN/LSTM layers
NUM_FILTERS_CNN = 100    # CNN filters per kernel size
KERNEL_SIZES_CNN = [3, 4, 5] # CNN kernel sizes (the hybrid model uses only the first)
CHAR_EMBEDDING_DIM = 25  # Character embedding dimension
CHAR_CNN_FILTERS = 50    # Filters for character CNN
CHAR_KERNEL_SIZE = 3     # Kernel size for character CNN
OUTPUT_DIM = 1           # Binary classification (single logit)
DROPOUT_PROB = 0.5

# --- Training Hyperparameters ---
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 5           # Upper bound on epochs; adjust based on convergence
PATIENCE = 2             # Epochs without validation-loss improvement before early stopping

# --- Reproducibility & Device ---
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    # Seed every visible GPU and pin cuDNN to deterministic kernels.
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
print(f"Using device: {DEVICE}")

# Create the output roots first, then one sub-directory per domain in each.
for output_root in (MODEL_SAVE_DIR, RESULTS_SAVE_DIR):
    os.makedirs(output_root, exist_ok=True)
for domain in DOMAINS:
    for output_root in (MODEL_SAVE_DIR, RESULTS_SAVE_DIR):
        os.makedirs(os.path.join(output_root, domain), exist_ok=True)

# 3. Data Loading and Preprocessing Helpers

In [None]:
def load_data(domain_name):
    """Load the train, validation and test splits for one domain.

    Reads ``train.csv``, ``validation.csv`` and ``test.csv`` from
    ``DATA_DIR/<domain_name>/``. Missing values in the ``text`` column are
    replaced with empty strings and the column is coerced to ``str`` so the
    tokenizer always receives text.

    Args:
        domain_name: Name of the sub-directory of ``DATA_DIR`` holding the splits.

    Returns:
        ``(train_df, val_df, test_df)``, or ``(None, None, None)`` when any of
        the three files does not exist.

    Raises:
        ValueError: If a split lacks a ``text`` or ``label`` column.
    """
    print(f"\nLoading data for domain: {domain_name}...")
    # (label used in error messages, file name on disk)
    splits = (("train", "train.csv"), ("val", "validation.csv"), ("test", "test.csv"))

    frames = []
    try:
        for _, file_name in splits:
            frames.append(pd.read_csv(os.path.join(DATA_DIR, domain_name, file_name)))
    except FileNotFoundError as e:
        print(f"Error loading data for {domain_name}: {e}")
        return None, None, None

    for (split_label, _), df in zip(splits, frames):
        if 'text' not in df.columns or 'label' not in df.columns:
            raise ValueError(f"Missing 'text' or 'label' column in {split_label} data")
        # Assign back instead of fillna(inplace=True) on the column selection:
        # the chained in-place form is deprecated and does not modify the
        # frame under pandas Copy-on-Write.
        df['text'] = df['text'].fillna('').astype(str)

    train_df, val_df, test_df = frames
    print(f"Train shape: {train_df.shape}, Val shape: {val_df.shape}, Test shape: {test_df.shape}")
    return train_df, val_df, test_df

def simple_tokenizer(text):
    """Lowercase ``text`` and split it into word tokens with NLTK."""
    lowered = text.lower()
    return word_tokenize(lowered)

def build_vocab(texts, max_size):
    """Build a word-to-index vocabulary from an iterable of texts.

    Indices 0 and 1 are reserved for ``<pad>`` and ``<unk>``; the
    ``max_size - 2`` most frequent tokens receive indices from 2 upward in
    frequency order.

    Returns:
        ``(vocab, size)`` where ``size == len(vocab)``.
    """
    frequencies = Counter(
        token for text in texts for token in simple_tokenizer(text)
    )
    vocab = {}
    # Two slots are held back for the special tokens added below.
    for index, (word, _count) in enumerate(frequencies.most_common(max_size - 2), start=2):
        vocab[word] = index
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab, len(vocab)

def text_to_sequence(texts, vocab):
    """Encode each text as a list of vocabulary indices.

    Tokens absent from ``vocab`` are mapped to the ``<unk>`` index.
    """
    encoded = []
    for text in texts:
        token_ids = []
        for token in simple_tokenizer(text):
            token_ids.append(vocab.get(token, vocab['<unk>']))
        encoded.append(token_ids)
    return encoded

def pad_sequences(sequences, max_len, padding_value=0):
    """Right-pad (or truncate) every sequence to exactly ``max_len`` items.

    Returns:
        An ``int64`` array of shape ``(len(sequences), max_len)``.
    """
    batch = np.full((len(sequences), max_len), padding_value, dtype=np.int64)
    # Each ``row`` is a view into ``batch``, so writing to it fills the result.
    for row, tokens in zip(batch, sequences):
        kept = tokens[:max_len]
        if len(kept) > 0:
            row[:len(kept)] = kept
    return batch

# --- Character Preprocessing (for WordCharParallelClassifier) ---
def build_char_vocab(texts):
    """Build a character-to-index vocabulary from an iterable of texts.

    Every distinct character is kept. Indices 0 and 1 are reserved for
    ``<c_pad>`` and ``<c_unk>``; characters are numbered from 2 in sorted
    order.

    Returns:
        ``(char_vocab, size)`` where ``size == len(char_vocab)``.
    """
    seen = set()
    for text in texts:
        seen.update(text)
    char_vocab = dict(zip(sorted(seen), range(2, len(seen) + 2)))
    char_vocab['<c_pad>'] = 0
    char_vocab['<c_unk>'] = 1
    return char_vocab, len(char_vocab)

def word_to_char_sequence(word, char_vocab, max_word_len):
    """Encode one word as a fixed-length array of character indices.

    Unknown characters map to ``<c_unk>``; the word is truncated or
    right-padded with ``<c_pad>`` to ``max_word_len``.

    Returns:
        An ``int64`` array of length ``max_word_len``.
    """
    char_ids = [char_vocab.get(ch, char_vocab['<c_unk>']) for ch in word][:max_word_len]
    encoded = np.full(max_word_len, char_vocab['<c_pad>'], dtype=np.int64)
    if char_ids:
        encoded[:len(char_ids)] = char_ids
    return encoded

def texts_to_char_sequences(texts, char_vocab, max_seq_len, max_word_len):
    """Encode texts as per-word character index arrays.

    Each text is tokenized; the first ``max_seq_len`` tokens are each encoded
    to ``max_word_len`` character indices. Unused slots hold ``<c_pad>``.

    Returns:
        An ``int64`` array of shape ``(len(texts), max_seq_len, max_word_len)``.
    """
    encoded = np.full(
        (len(texts), max_seq_len, max_word_len),
        char_vocab['<c_pad>'],
        dtype=np.int64,
    )
    for text_idx, text in enumerate(texts):
        kept_tokens = simple_tokenizer(text)[:max_seq_len]
        for token_idx, token in enumerate(kept_tokens):
            encoded[text_idx, token_idx, :] = word_to_char_sequence(token, char_vocab, max_word_len)
    return encoded

# 4. Loading Pre-trained Embeddings (Optional)

In [None]:
def load_glove_embeddings(glove_path, word_to_idx, embedding_dim):
    """Build an embedding matrix for ``word_to_idx`` from a GloVe text file.

    The file is expected to hold one ``word v1 ... vN`` entry per line. Lines
    that do not contain exactly ``embedding_dim`` numeric values (blank lines,
    word2vec-style headers, vectors of another size) are skipped rather than
    aborting the load. Only vectors for words in ``word_to_idx`` are kept in
    memory.

    Args:
        glove_path: Path to the GloVe ``.txt`` file.
        word_to_idx: Mapping from word to row index; indices are assumed to
            lie in ``[0, len(word_to_idx))``.
        embedding_dim: Expected vector size.

    Returns:
        A ``float32`` tensor of shape ``(len(word_to_idx), embedding_dim)``,
        or ``None`` if the file is missing or cannot be read. Words without a
        pre-trained vector keep a small random initialisation; the ``<pad>``
        row, when present, is all zeros.
    """
    if not os.path.exists(glove_path):
        print(f"Warning: GloVe path not found: {glove_path}. Using random embeddings.")
        return None

    print(f"Loading GloVe embeddings from {glove_path}...")
    vocab_vectors = {}   # row index -> vector, only for in-vocabulary words
    num_vectors = 0      # well-formed vectors seen in the file
    num_skipped = 0      # non-blank lines that could not be used
    try:
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if len(values) != embedding_dim + 1:
                    if values:
                        num_skipped += 1
                    continue
                try:
                    vector = np.asarray(values[1:], dtype='float32')
                except ValueError:
                    num_skipped += 1
                    continue
                num_vectors += 1
                row = word_to_idx.get(values[0])
                if row is not None:
                    vocab_vectors[row] = vector
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error loading GloVe file: {e}")
        return None
    print(f"Found {num_vectors} word vectors.")
    if num_skipped:
        print(f"Warning: skipped {num_skipped} lines that did not hold {embedding_dim} values.")

    vocab_size = len(word_to_idx)
    # Initialize with small random values
    embedding_matrix = np.random.uniform(-0.05, 0.05, (vocab_size, embedding_dim)).astype(np.float32)
    # Words not found in the file keep their random init.
    for row, vector in vocab_vectors.items():
        embedding_matrix[row] = vector
    hits = len(vocab_vectors)

    # Callers copy this matrix over an nn.Embedding created with padding_idx=0,
    # which would replace its zero padding vector; keep the <pad> row at zero.
    pad_row = word_to_idx.get('<pad>')
    if pad_row is not None:
        embedding_matrix[pad_row] = 0.0

    coverage = hits / vocab_size * 100 if vocab_size else 0.0
    print(f"Converted {hits} of {vocab_size} words ({coverage:.2f}%)")
    return torch.tensor(embedding_matrix)

# 5. PyTorch Dataset

In [None]:
class SentimentDataset(Dataset):
    """Dataset of padded word sequences, labels and optional char sequences.

    Each item is a dict with ``'words'`` (long tensor) and ``'labels'`` (float
    tensor, as required by ``BCEWithLogitsLoss``); ``'chars'`` (long tensor)
    is included only when character sequences were supplied.
    """

    def __init__(self, word_sequences, labels, char_sequences=None):
        self.word_sequences = word_sequences
        self.labels = labels
        self.char_sequences = char_sequences
        self.has_char = char_sequences is not None

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        word_tensor = torch.tensor(self.word_sequences[idx], dtype=torch.long)
        # Float labels for BCEWithLogitsLoss; use torch.long for
        # CrossEntropyLoss if this ever becomes multi-class.
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float)
        sample = {'words': word_tensor, 'labels': label_tensor}
        if not self.has_char:
            return sample
        sample['chars'] = torch.tensor(self.char_sequences[idx], dtype=torch.long)
        return sample

# 6. Model Definitions (PyTorch)

In [None]:
# --- 1. MLP on Averaged Embeddings ---
class MLPClassifier(nn.Module):
    """Two-layer MLP over the mean of the non-padding word embeddings.

    When ``pretrained_embeddings`` is given it is copied into the embedding
    table and the table is frozen.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
            # Pre-trained vectors are typically kept frozen.
            self.embedding.weight.requires_grad_(False)
        half_dim = embedding_dim // 2
        self.fc1 = nn.Linear(embedding_dim, half_dim)
        self.fc2 = nn.Linear(half_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, words):
        # words: (batch, seq_len) -> token_vectors: (batch, seq_len, emb_dim)
        token_vectors = self.dropout(self.embedding(words))
        # 1.0 where a real token is present, 0.0 on padding (index 0).
        keep = words.ne(0).unsqueeze(-1).float()
        # Clamp so an all-padding row divides by a tiny number instead of zero.
        token_count = keep.sum(dim=1).clamp(min=1e-9)
        mean_vector = (token_vectors * keep).sum(dim=1) / token_count  # (batch, emb_dim)
        activated = torch.relu(self.fc1(mean_vector))
        return self.fc2(self.dropout(activated))

# --- 2. Simple RNN ---
class RNNClassifier(nn.Module):
    """Single-layer Elman RNN classifier over word embeddings.

    The classification uses the hidden state at each sequence's last real
    token. Sequences are assumed to be right-padded with index 0.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, words):
        # words: (batch, seq_len)
        embedded = self.dropout(self.embedding(words))  # (batch, seq_len, emb_dim)
        # Pack by true length so the final hidden state is taken at the last
        # real token rather than after the trailing padding. All-padding rows
        # are treated as length 1 because packing rejects zero lengths, and
        # the lengths tensor must live on the CPU.
        lengths = (words != 0).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, hidden = self.rnn(packed)  # hidden: (1, batch, hidden_dim)
        return self.fc(hidden[-1])

# --- 3. CNN Classifier ---
class CNNClassifier(nn.Module):
    """Multi-kernel 1-D CNN text classifier with max-over-time pooling.

    One convolution per entry of ``filter_sizes`` is applied to the embedded
    sequence; each feature map is max-pooled over time and the pooled vectors
    are concatenated before the linear output layer.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        # Conv1d raises if the input is shorter than its kernel, so forward()
        # right-pads any shorter batch up to the widest kernel.
        self.min_seq_len = max(filter_sizes)

    def forward(self, words):
        # words: (batch, seq_len)
        shortfall = self.min_seq_len - words.shape[1]
        if shortfall > 0:
            # Extend with the padding index (0) so every kernel has a window.
            words = nn.functional.pad(words, (0, shortfall), value=0)
        embedded = self.dropout(self.embedding(words)).permute(0, 2, 1)  # (B, Emb, Seq)
        conved = [torch.relu(conv(embedded)) for conv in self.convs]  # List[(B, Filters, Seq')]
        # Max over the whole time axis of each feature map.
        pooled = [torch.max_pool1d(fmap, fmap.shape[-1]).squeeze(-1) for fmap in conved]  # List[(B, Filters)]
        cat = self.dropout(torch.cat(pooled, dim=-1))  # (B, Filters * Num_Kernels)
        return self.fc(cat)

# --- 4. LSTM Classifier ---
class LSTMClassifier(nn.Module):
    """Single-layer unidirectional LSTM classifier over word embeddings.

    The classification uses the hidden state at each sequence's last real
    token. Sequences are assumed to be right-padded with index 0.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
        # nn.LSTM's own ``dropout`` argument only acts between stacked layers;
        # on a single layer it does nothing and triggers a warning, so
        # dropout is applied explicitly through self.dropout instead.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=1)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, words):
        # words: (batch, seq_len)
        embedded = self.dropout(self.embedding(words))  # (B, Seq, Emb)
        # Pack by true length so the final hidden state is taken at the last
        # real token rather than after the trailing padding. All-padding rows
        # are treated as length 1 because packing rejects zero lengths, and
        # the lengths tensor must live on the CPU.
        lengths = (words != 0).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        # hidden: (num_layers * num_directions, batch, hidden_dim); take the last layer.
        last_hidden = self.dropout(hidden[-1])  # (B, Hidden)
        return self.fc(last_hidden)

# --- 5. BiLSTM Classifier ---
class BiLSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM classifier over word embeddings.

    The final forward and backward hidden states are concatenated and fed to
    the linear output layer. Sequences are assumed to be right-padded with
    index 0.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
        # nn.LSTM's own ``dropout`` argument only acts between stacked layers;
        # on a single layer it does nothing and triggers a warning, so
        # dropout is applied explicitly through self.dropout instead.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=1, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional
        self.dropout = nn.Dropout(dropout)

    def forward(self, words):
        # words: (batch, seq_len)
        embedded = self.dropout(self.embedding(words))  # (B, Seq, Emb)
        # Pack by true length: the forward pass then stops at the last real
        # token and the backward pass starts there instead of on padding.
        # All-padding rows are treated as length 1 because packing rejects
        # zero lengths; the lengths tensor must live on the CPU.
        lengths = (words != 0).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        # hidden: (num_layers * 2, batch, hidden_dim)
        # hidden[-2] is the final forward state, hidden[-1] the final backward state.
        last_hidden_concat = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))  # (B, Hidden*2)
        return self.fc(last_hidden_concat)

# --- 6. CNN-LSTM Hybrid Classifier ---
class CNNLSTMHybridClassifier(nn.Module):
    """Conv1d feature extractor followed by a bidirectional LSTM.

    A single convolution of width ``filter_size`` turns the embedded sequence
    into n-gram features; a BiLSTM reads those features and its final forward
    and backward states are concatenated for classification. Sequences are
    assumed to be right-padded with index 0.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_size, hidden_dim_lstm, output_dim, dropout, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
        self.conv = nn.Conv1d(embedding_dim, n_filters, kernel_size=filter_size)
        self.lstm = nn.LSTM(n_filters, hidden_dim_lstm, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim_lstm * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, words):
        # words: (batch, seq_len)
        kernel = self.conv.kernel_size[0]
        shortfall = kernel - words.shape[1]
        if shortfall > 0:
            # Conv1d raises if the input is shorter than its kernel; extend
            # with the padding index (0) so at least one window exists.
            words = nn.functional.pad(words, (0, shortfall), value=0)

        embedded = self.dropout(self.embedding(words)).permute(0, 2, 1)  # (B, Emb, Seq)
        conved = torch.relu(self.conv(embedded))  # (B, Filters, Seq')
        conved = conved.permute(0, 2, 1)  # (B, Seq', Filters) - ready for LSTM

        # A sequence of L real tokens has L - kernel + 1 windows made only of
        # real tokens. Pack to that length so the LSTM does not read windows
        # that lie in the padding; at least one window is always kept.
        token_lengths = (words != 0).sum(dim=1)
        conv_lengths = (token_lengths - kernel + 1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            conved, conv_lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        # hidden[-2] is the final forward state, hidden[-1] the final backward state.
        hidden_concat = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))
        return self.fc(hidden_concat)

# --- 7. Word + Character Parallel Classifier ---
class WordCharParallelClassifier(nn.Module):
    """Two-stream classifier: a word-level BiLSTM plus a character-level CNN.

    Word stream: embeddings -> BiLSTM -> concatenated final forward/backward
    states. Character stream: per-word character embeddings -> Conv1d ->
    max-pool over characters -> mean over the real (non-padding) words.
    The two feature vectors are concatenated and fed to a linear layer.

    ``forward`` expects ``words`` of shape ``(batch, seq_len)`` and ``chars``
    of shape ``(batch, seq_len, word_len)`` with the same ``seq_len``, both
    right-padded with index 0.
    """

    def __init__(self, word_vocab_size, char_vocab_size, word_emb_dim, char_emb_dim,
                 hidden_dim_lstm, char_cnn_filters, char_kernel_size, output_dim, dropout,
                 pretrained_word_embeddings=None):
        super().__init__()
        # Word Stream
        self.word_embedding = nn.Embedding(word_vocab_size, word_emb_dim, padding_idx=0)
        if pretrained_word_embeddings is not None:
            self.word_embedding.weight.data.copy_(pretrained_word_embeddings)
        self.word_lstm = nn.LSTM(word_emb_dim, hidden_dim_lstm, batch_first=True, bidirectional=True)

        # Character Stream
        self.char_embedding = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        self.char_conv = nn.Conv1d(char_emb_dim, char_cnn_filters, kernel_size=char_kernel_size)

        # Combined feed-forward: BiLSTM state (2 * hidden) + pooled char-CNN features.
        fc_input_dim = hidden_dim_lstm * 2 + char_cnn_filters
        self.fc = nn.Linear(fc_input_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, words, chars):
        # words: (batch_size, seq_len)
        # chars: (batch_size, seq_len, word_len)
        token_mask = words != 0  # True on real tokens, False on padding

        # --- Word Stream ---
        word_embedded = self.dropout(self.word_embedding(words))  # (B, Seq, WordEmb)
        # Pack by true length so the final states come from real tokens only.
        # All-padding rows are treated as length 1 because packing rejects
        # zero lengths; the lengths tensor must live on the CPU.
        lengths = token_mask.sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            word_embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (word_hidden, _) = self.word_lstm(packed)
        # Concatenate final forward and backward hidden states
        word_features = self.dropout(torch.cat((word_hidden[-2], word_hidden[-1]), dim=1))  # (B, Hidden*2)

        # --- Character Stream ---
        batch_size, seq_len, word_len = chars.shape
        kernel = self.char_conv.kernel_size[0]
        if word_len < kernel:
            # Conv1d raises if a word is shorter than the kernel; extend with
            # the character padding index (0).
            chars = nn.functional.pad(chars, (0, kernel - word_len), value=0)
            word_len = kernel
        chars_embedded = self.dropout(self.char_embedding(chars))  # (B, Seq, WordLen, CharEmb)
        # Fold the word axis into the batch axis so each word is convolved on
        # its own. Sizes are inferred from the tensor (-1) rather than read
        # from module-level constants, so any constructor values work.
        chars_embedded = chars_embedded.reshape(batch_size * seq_len, word_len, -1)
        chars_embedded = chars_embedded.permute(0, 2, 1)  # (B*Seq, CharEmb, WordLen)

        char_conved = torch.relu(self.char_conv(chars_embedded))  # (B*Seq, CharFilters, WordLen')
        # Max pool over the word-length dimension
        char_pooled = torch.max_pool1d(char_conved, char_conved.shape[-1]).squeeze(-1)  # (B*Seq, CharFilters)
        # Restore the sequence axis
        char_word_features = char_pooled.reshape(batch_size, seq_len, -1)  # (B, Seq, CharFilters)

        # Mean over real words only: padded word slots still produce non-zero
        # features (relu of the conv bias) and would otherwise dilute the mean.
        mask = token_mask.unsqueeze(-1).to(char_word_features.dtype)  # (B, Seq, 1)
        char_features = (char_word_features * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)  # (B, CharFilters)

        # --- Fusion ---
        combined_features = torch.cat((word_features, char_features), dim=1)

        # --- Classification ---
        return self.fc(combined_features)


# 7. Training and Evaluation Functions (PyTorch)

In [None]:
# train_epoch runs one optimisation pass over a DataLoader and returns the
# epoch's training loss and accuracy. evaluate scores a model without gradient
# updates and returns a dictionary of metrics.
# Both read 'words' and 'labels' from each batch; when model_name is
# "WordCharParallelClassifier" they also pass the batch's 'chars' tensor to the model.

def train_epoch(model, iterator, optimizer, criterion, model_name=""):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Module returning logits of shape ``(batch, 1)``.
        iterator: Iterable of batches, each a dict with ``'words'`` and
            ``'labels'`` (and ``'chars'`` for the parallel model).
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss on ``(logits, labels)``. The per-sample weighting
            below assumes it returns the batch mean (the default reduction
            of ``BCEWithLogitsLoss``).
        model_name: ``"WordCharParallelClassifier"`` makes the character
            tensor be passed as a second model input.

    Returns:
        ``(mean_loss, accuracy)`` averaged over all samples seen.

    Raises:
        ValueError: If ``iterator`` yields no samples.
    """
    model.train()
    is_parallel_model = model_name == "WordCharParallelClassifier"
    total_loss = 0.0
    total_correct = 0.0
    total_samples = 0

    for batch in iterator:
        words = batch['words'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        optimizer.zero_grad()

        if is_parallel_model:
            chars = batch['chars'].to(DEVICE)
            predictions = model(words, chars).squeeze(1)
        else:
            predictions = model(words).squeeze(1)

        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        # Weight by batch size: averaging per-batch means would over-weight a
        # smaller final batch.
        batch_size = labels.size(0)
        with torch.no_grad():  # metric only; keep it out of the autograd graph
            predicted_classes = torch.round(torch.sigmoid(predictions))
            total_correct += (predicted_classes == labels).sum().item()
        total_loss += loss.item() * batch_size
        total_samples += batch_size

    if total_samples == 0:
        raise ValueError("train_epoch received an empty iterator")
    return total_loss / total_samples, total_correct / total_samples

def evaluate(model, iterator, criterion, model_name=""):
    """Evaluate ``model`` on ``iterator`` without updating its weights.

    Args:
        model: Module returning logits of shape ``(batch, 1)``.
        iterator: Iterable of batches, each a dict with ``'words'`` and
            ``'labels'`` (and ``'chars'`` for the parallel model).
        criterion: Loss on ``(logits, labels)``. The per-sample weighting
            below assumes it returns the batch mean (the default reduction
            of ``BCEWithLogitsLoss``).
        model_name: ``"WordCharParallelClassifier"`` makes the character
            tensor be passed as a second model input.

    Returns:
        Dict with ``"Loss"``, ``"Accuracy"`` and macro/weighted
        precision, recall and F1.

    Raises:
        ValueError: If ``iterator`` yields no samples.
    """
    model.eval()
    is_parallel_model = model_name == "WordCharParallelClassifier"
    total_loss = 0.0
    total_samples = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in iterator:
            words = batch['words'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            if is_parallel_model:
                chars = batch['chars'].to(DEVICE)
                predictions = model(words, chars).squeeze(1)
            else:
                predictions = model(words).squeeze(1)

            loss = criterion(predictions, labels)
            predicted_classes = torch.round(torch.sigmoid(predictions))

            # Weight by batch size: averaging per-batch means would
            # over-weight a smaller final batch.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            total_samples += batch_size
            all_preds.extend(predicted_classes.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if total_samples == 0:
        raise ValueError("evaluate received an empty iterator")

    report = classification_report(all_labels, all_preds, output_dict=True, zero_division=0)
    eval_results = {
        "Loss": total_loss / total_samples,
        "Accuracy": accuracy_score(all_labels, all_preds), # Use sklearn accuracy
        "Precision (Macro)": report['macro avg']['precision'],
        "Recall (Macro)": report['macro avg']['recall'],
        "F1 (Macro)": report['macro avg']['f1-score'],
        "Precision (Weighted)": report['weighted avg']['precision'],
        "Recall (Weighted)": report['weighted avg']['recall'],
        "F1 (Weighted)": report['weighted avg']['f1-score'],
    }
    print(f'\tEval Loss: {eval_results["Loss"]:.3f} | Eval Acc: {eval_results["Accuracy"]*100:.2f}%')
    return eval_results


# 8. Main Experiment Loop

In [None]:
# Collects one result dict per (domain, model) run. The experiment loop appends
# to it and the final cell turns it into the summary DataFrame / CSV.
all_results_list = []

In [None]:
# --- Main experiment loop ---
# For each domain: build the vocabularies from the training split, encode and
# pad all three splits, then train and evaluate every model listed in
# DL_MODELS_TO_RUN, appending one result row per model to all_results_list.
# Pre-trained vectors depend on the domain vocabulary, so they are loaded
# inside the domain loop.
pretrained_embedding_matrix = None

for domain in DOMAINS:
    train_df, val_df, test_df = load_data(domain)
    if train_df is None:
        continue

    print(f"\n{'='*10} Processing Domain: {domain} {'='*10}")
    needs_char_inputs = "WordCharParallelClassifier" in DL_MODELS_TO_RUN

    # --- Build Vocabularies ---
    word_vocab, current_word_vocab_size = build_vocab(train_df['text'], VOCAB_SIZE)
    print(f"Word Vocabulary size: {current_word_vocab_size}")
    # Build char vocab if needed for parallel model
    char_vocab, current_char_vocab_size = None, 0
    if needs_char_inputs:
        char_vocab, current_char_vocab_size = build_char_vocab(train_df['text'])
        print(f"Character Vocabulary size: {current_char_vocab_size}")

    # --- Load Pre-trained Embeddings (if specified) ---
    pretrained_embedding_matrix = None
    if EMBEDDING_TYPE == "GloVe":
        pretrained_embedding_matrix = load_glove_embeddings(GLOVE_PATH, word_vocab, EMBEDDING_DIM)
        if pretrained_embedding_matrix is not None:
            pretrained_embedding_matrix = pretrained_embedding_matrix.to(DEVICE) # Move to device once

    # --- Convert Texts to Sequences ---
    train_word_seqs = text_to_sequence(train_df['text'], word_vocab)
    val_word_seqs = text_to_sequence(val_df['text'], word_vocab)
    test_word_seqs = text_to_sequence(test_df['text'], word_vocab)

    # Determine max sequence length (words)
    # Consider calculating based on 95th percentile for efficiency
    # max_len_word = int(np.percentile([len(s) for s in train_word_seqs], 95))
    max_len_word = max(len(s) for s in train_word_seqs) if train_word_seqs else 50 # Fallback length
    # Cap the length, but never go below 1: zero-width inputs break every model.
    max_len_word = max(1, min(max_len_word, 300))
    print(f"Max sequence length (words): {max_len_word}")

    # Pad word sequences
    X_train_word_pad = pad_sequences(train_word_seqs, max_len_word)
    X_val_word_pad = pad_sequences(val_word_seqs, max_len_word)
    X_test_word_pad = pad_sequences(test_word_seqs, max_len_word)

    # --- Prepare Character Sequences (if needed) ---
    X_train_char_pad, X_val_char_pad, X_test_char_pad = None, None, None
    if needs_char_inputs:
        # Determine max word length (chars)
        all_tokens = [token for text in train_df['text'] for token in simple_tokenizer(text)]
        max_len_char = max(len(w) for w in all_tokens) if all_tokens else 15 # Fallback length
        max_len_char = max(1, min(max_len_char, 25)) # Cap max char length
        print(f"Max word length (chars): {max_len_char}")

        X_train_char_pad = texts_to_char_sequences(train_df['text'], char_vocab, max_len_word, max_len_char)
        X_val_char_pad = texts_to_char_sequences(val_df['text'], char_vocab, max_len_word, max_len_char)
        X_test_char_pad = texts_to_char_sequences(test_df['text'], char_vocab, max_len_word, max_len_char)

    # --- Get Labels ---
    y_train = train_df['label'].values
    y_val = val_df['label'].values
    y_test = test_df['label'].values

    # --- Create Datasets and DataLoaders ---
    train_dataset = SentimentDataset(X_train_word_pad, y_train, X_train_char_pad)
    val_dataset = SentimentDataset(X_val_word_pad, y_val, X_val_char_pad)
    test_dataset = SentimentDataset(X_test_word_pad, y_test, X_test_char_pad)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # --- Loop Through Models ---
    for model_name in DL_MODELS_TO_RUN:
        print(f"\n--- Running Model: {model_name} | Embedding: {EMBEDDING_TYPE} ---")

        current_embedding_matrix = None
        if EMBEDDING_TYPE != "Learned":
            current_embedding_matrix = pretrained_embedding_matrix # Use loaded GloVe/etc.
            # MLP and the parallel model are allowed to fall back to learned embeddings.
            if current_embedding_matrix is None and model_name not in ["MLPClassifier", "WordCharParallelClassifier"]:
                print(f"Skipping {model_name} as pre-trained embeddings ({EMBEDDING_TYPE}) failed to load.")
                continue

        # Bind the names before the try block so the cleanup in ``finally``
        # is safe even when construction fails or a ``continue`` fires early.
        model = optimizer = criterion = None
        try:
            emb = current_embedding_matrix
            # NOTE(review): the *StaticEmb aliases build the same classes as
            # their plain counterparts and do not freeze the embeddings.
            if model_name == "MLPClassifier":
                model = MLPClassifier(current_word_vocab_size, EMBEDDING_DIM, OUTPUT_DIM, DROPOUT_PROB, emb)
            elif model_name == "RNNClassifier":
                model = RNNClassifier(current_word_vocab_size, EMBEDDING_DIM, HIDDEN_DIM_RNN, OUTPUT_DIM, DROPOUT_PROB, emb)
            elif model_name in ("CNNClassifier", "CNNStaticEmbClassifier"):
                model = CNNClassifier(current_word_vocab_size, EMBEDDING_DIM, NUM_FILTERS_CNN, KERNEL_SIZES_CNN, OUTPUT_DIM, DROPOUT_PROB, emb)
            elif model_name in ("LSTMClassifier", "LSTMStaticEmbClassifier"):
                model = LSTMClassifier(current_word_vocab_size, EMBEDDING_DIM, HIDDEN_DIM_RNN, OUTPUT_DIM, DROPOUT_PROB, emb)
            elif model_name == "BiLSTMClassifier":
                model = BiLSTMClassifier(current_word_vocab_size, EMBEDDING_DIM, HIDDEN_DIM_RNN, OUTPUT_DIM, DROPOUT_PROB, emb)
            elif model_name == "CNNLSTMHybridClassifier":
                # Uses the first kernel size for the CNN part.
                model = CNNLSTMHybridClassifier(current_word_vocab_size, EMBEDDING_DIM, NUM_FILTERS_CNN, KERNEL_SIZES_CNN[0], HIDDEN_DIM_RNN, OUTPUT_DIM, DROPOUT_PROB, emb)
            elif model_name == "WordCharParallelClassifier":
                if char_vocab is None:
                    print("Skipping WordCharParallelClassifier as char vocab not built.")
                    continue
                model = WordCharParallelClassifier(current_word_vocab_size, current_char_vocab_size, EMBEDDING_DIM, CHAR_EMBEDDING_DIM, HIDDEN_DIM_RNN, CHAR_CNN_FILTERS, CHAR_KERNEL_SIZE, OUTPUT_DIM, DROPOUT_PROB, emb)
            else:
                print(f"Model {model_name} not defined. Skipping.")
                continue

            model = model.to(DEVICE)
            optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
            criterion = nn.BCEWithLogitsLoss().to(DEVICE) # Binary classification

            # --- Training Loop with Early Stopping ---
            best_val_loss = float('inf')
            epochs_no_improve = 0
            total_train_time = 0
            # Tracks whether THIS run wrote a checkpoint, so a file left by an
            # earlier run is never mistaken for the current best model.
            checkpoint_saved = False
            model_save_path = os.path.join(MODEL_SAVE_DIR, domain, f"{model_name.lower()}_{EMBEDDING_TYPE.lower()}.pt")

            for epoch in range(NUM_EPOCHS):
                t_epoch_start = time()
                train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, model_name)
                val_results = evaluate(model, val_loader, criterion, model_name)
                val_loss = val_results["Loss"]
                epoch_time = time() - t_epoch_start
                total_train_time += epoch_time

                print(f'Epoch: {epoch+1:02}/{NUM_EPOCHS} | Time: {epoch_time:.2f}s')
                print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
                # Val results printed in evaluate()

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    torch.save(model.state_dict(), model_save_path)
                    checkpoint_saved = True
                    epochs_no_improve = 0
                    print(f"\tValidation loss decreased ({best_val_loss:.4f}). Saving model...")
                else:
                    epochs_no_improve += 1
                    print(f"\tValidation loss did not improve. Patience: {epochs_no_improve}/{PATIENCE}")

                if epochs_no_improve >= PATIENCE:
                    print("\tEarly stopping!")
                    break

            # --- Final Evaluation on Test Set ---
            if checkpoint_saved:
                print(f"\nLoading best model from {model_save_path} and evaluating on test set...")
                model.load_state_dict(torch.load(model_save_path, map_location=DEVICE))
            else:
                # No epoch improved on the initial loss (e.g. NaN losses or
                # NUM_EPOCHS == 0): evaluate the weights as they are now.
                print(f"Warning: No checkpoint was saved for {model_name} in this run. Evaluating last state.")

            t_eval_start = time()
            test_results = evaluate(model, test_loader, criterion, model_name)
            eval_time = time() - t_eval_start
            print(f"Final Test Evaluation Time: {eval_time:.2f}s")

            # Store results
            final_model_results = {
                "Domain": domain,
                "Model": model_name,
                "Embedding": EMBEDDING_TYPE,
                "Accuracy": test_results["Accuracy"],
                "Precision (Macro)": test_results["Precision (Macro)"],
                "Recall (Macro)": test_results["Recall (Macro)"],
                "F1 (Macro)": test_results["F1 (Macro)"],
                "Precision (Weighted)": test_results["Precision (Weighted)"],
                "Recall (Weighted)": test_results["Recall (Weighted)"],
                "F1 (Weighted)": test_results["F1 (Weighted)"],
                "Train Time (s)": total_train_time,
                "Eval Time (s)": eval_time,
                "Best Val Loss": best_val_loss if best_val_loss != float('inf') else np.nan
            }
            all_results_list.append(final_model_results)

        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print(f"!!! ERROR running model {model_name} on domain {domain}: {e}")
            import traceback
            traceback.print_exc() # Print detailed error traceback

        finally:
            # Drop the references so the model and optimizer state can be
            # freed before the next run. Rebinding to None, unlike ``del``,
            # cannot raise NameError when one of them was never assigned.
            model = optimizer = criterion = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()


# 9. Aggregate and Save Results

In [None]:
print("\n--- Experiment Finished ---")
if not all_results_list:
    print("No results were generated.")
else:
    # Desired column order for the summary table.
    cols_order = [
        "Domain", "Model", "Embedding", "Accuracy",
        "F1 (Macro)", "Precision (Macro)", "Recall (Macro)",
        "F1 (Weighted)", "Precision (Weighted)", "Recall (Weighted)",
        "Train Time (s)", "Eval Time (s)", "Best Val Loss",
    ]
    # reindex both orders the columns and fills any missing one with NaN.
    results_df = pd.DataFrame(all_results_list).reindex(columns=cols_order)

    print("\nAggregated Results:")
    print(results_df.to_string()) # Print full dataframe

    # Save to CSV
    results_df.to_csv(RESULTS_CSV_FILE, index=False)
    print(f"\nResults saved to {RESULTS_CSV_FILE}")