# 1. Setup and Configuration

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split 
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix
    
)
import time
import os
import joblib
import logging
import warnings
import gc
import psutil

# --- Basic Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# --- Limit CPU Usage ---
p = psutil.Process()
p.cpu_affinity([1, 2])

In [2]:
# --- Project Directory Structure ---
BASE_DIR = ".." 
DATA_DIR = os.path.join(BASE_DIR, "data", "processed")

MODEL_OUTPUT_BASE_DIR = os.path.join(BASE_DIR, "models", "dl")
RESULT_DIR = os.path.join(BASE_DIR, "result")

# --- Specific Dataset Paths ---
BOOK_REVIEW_DATA_DIR = os.path.join(DATA_DIR, "book_reviews")
FINANCIAL_NEWS_DATA_DIR = os.path.join(DATA_DIR, "financial_news")

# --- Model/Result Output Dirs (Ensure they exist) ---
BOOK_REVIEW_MODEL_DIR = os.path.join(MODEL_OUTPUT_BASE_DIR, "book_reviews")
FINANCIAL_NEWS_MODEL_DIR = os.path.join(MODEL_OUTPUT_BASE_DIR, "financial_news")
BOOK_REVIEW_RESULT_DIR = os.path.join(RESULT_DIR, "book_reviews")
FINANCIAL_NEWS_RESULT_DIR = os.path.join(RESULT_DIR, "financial_news")

os.makedirs(BOOK_REVIEW_MODEL_DIR, exist_ok=True)
os.makedirs(FINANCIAL_NEWS_MODEL_DIR, exist_ok=True)
os.makedirs(BOOK_REVIEW_RESULT_DIR, exist_ok=True)
os.makedirs(FINANCIAL_NEWS_RESULT_DIR, exist_ok=True)

# --- GloVe Path ---
GLOVE_PATH = os.path.join(BASE_DIR, "data", "embeddings", "glove.6B.100d.txt")

# --- File Names ---
TRAIN_FN = "train.csv"
VAL_FN = "val.csv"
TEST_FN = "test.csv"

# --- Column Names ---
TEXT_COLUMN = "text"
TARGET_COLUMN = "score"

# Activate GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")



2025-05-01 10:01:14,499 - INFO - Using device: cuda


In [3]:
# --- Model & Training Hyperparameters ---
RANDOM_STATE = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {DEVICE}")

# Vocabulary params
MIN_WORD_FREQ = 3 # Minimum frequency for a word to be included in the vocabulary

# Embedding params
EMBEDDING_DIM = 100 # Must match GloVe dimension if using pre-trained GloVe
LEARNED_EMBEDDING_DIM = 100 # Dimension for embeddings learned from scratch

# Model Arch params (can be tuned)
HIDDEN_DIM_RNN_LSTM = 64
N_LAYERS_RNN_LSTM = 3
DROPOUT = 0.5
N_FILTERS_CNN = 100
FILTER_SIZES_CNN = [3, 4, 5] # Kernel sizes for CNN

# Training params
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 30 # Increase for better performance, but takes longer
GRADIENT_CLIP = 1.0 # Helps prevent exploding gradients in RNNs/LSTMs

# --- Evaluation Metrics ---
METRICS_TO_CALCULATE = [
    "Accuracy",
    "F1 (Macro)", "Precision (Macro)", "Recall (Macro)",
    "F1 (Weighted)", "Precision (Weighted)", "Recall (Weighted)",
    "Train Time (Epoch, s)", "Eval Time (s)" # Train time per epoch is more practical for DL
]

# --- Label Mapping (For PyTorch CrossEntropyLoss) ---
LABEL_MAP = {'negative': 0, 'neutral': 1, 'positive': 2} # Example mapping
NUM_CLASSES = len(LABEL_MAP)

# --- Datasets Configuration ---
DATASETS_TO_PROCESS = {
    "Book Review": {
        "train_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{TRAIN_FN}'),
        "val_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{VAL_FN}'),
        "test_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{TEST_FN}'),
        "model_dir": BOOK_REVIEW_MODEL_DIR,
        "result_dir": BOOK_REVIEW_RESULT_DIR,
        "vocab_path": os.path.join(BOOK_REVIEW_MODEL_DIR, "vocab.pt"), # Save vocab per dataset
    },
    "Financial News": {
        "train_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{TRAIN_FN}'),
        "val_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{VAL_FN}'),
        "test_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{TEST_FN}'),
        "model_dir": FINANCIAL_NEWS_MODEL_DIR,
        "result_dir": FINANCIAL_NEWS_RESULT_DIR,
         "vocab_path": os.path.join(FINANCIAL_NEWS_MODEL_DIR, "vocab.pt"),
    }
}

2025-05-01 10:01:14,512 - INFO - Using device: cuda


# 2. Utility Functions and Classes

In [4]:
def load_data(path):
    """Loads data from CSV and handles basic cleaning."""
    try:
        df = pd.read_csv(path)
        df = df.dropna(subset=[TEXT_COLUMN, TARGET_COLUMN]) # Drop rows with NaNs in critical columns
        df[TEXT_COLUMN] = df[TEXT_COLUMN].astype(str) # Ensure text is string
        df[TARGET_COLUMN] = df[TARGET_COLUMN].astype(str) # Ensure labels are string before mapping
        # Map labels to integers
        df[TARGET_COLUMN] = df[TARGET_COLUMN].map(LABEL_MAP)
        # Verify mapping worked - check for NaNs introduced if a label wasn't in LABEL_MAP
        if df[TARGET_COLUMN].isnull().any():
            logging.warning(f"NaNs found in target column after mapping for {path}. Check LABEL_MAP and data labels.")
            # Option: Drop rows with unmapped labels
            original_count = len(df)
            df = df.dropna(subset=[TARGET_COLUMN])
            logging.warning(f"Dropped {original_count - len(df)} rows with unmappable labels.")
        df[TARGET_COLUMN] = df[TARGET_COLUMN].astype(int) # Convert to int after mapping
        return df
    except FileNotFoundError:
        logging.error(f"File not found: {path}")
        return None
    except Exception as e:
        logging.error(f"Error loading data from {path}: {e}")
        return None

def tokenize(text):
    """Simple whitespace tokenizer."""
    return text.lower().split()

def build_vocab(texts, min_freq=MIN_WORD_FREQ):
    """Builds a vocabulary from a list of texts."""
    word_counts = Counter()
    for text in texts:
        word_counts.update(tokenize(text))

    # Create vocab mapping: word -> index
    # Add special tokens: <pad> for padding, <unk> for unknown words
    vocab = {"<pad>": 0, "<unk>": 1}
    idx = 2
    for word, count in word_counts.items():
        if count >= min_freq:
            vocab[word] = idx
            idx += 1
    logging.info(f"Built vocabulary with {len(vocab)} words (min freq: {min_freq}).")
    return vocab

class SentimentDataset(Dataset):
    """PyTorch Dataset for sentiment analysis."""
    def __init__(self, texts, labels, vocab, max_len=None): # max_len can be added for truncation
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.vocab_stoi = vocab # word -> index
        self.vocab_itos = {i: w for w, i in vocab.items()} # index -> word
        self.unk_idx = vocab.get("<unk>", 1)
        # self.max_len = max_len # Optional: truncate sequences

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = tokenize(text)
        # Convert tokens to indices
        token_ids = [self.vocab_stoi.get(token, self.unk_idx) for token in tokens]

        # Optional Truncation:
        # if self.max_len:
        #     token_ids = token_ids[:self.max_len]

        return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

def collate_batch(batch):
    """Collates data samples into batches with padding."""
    label_list, text_list, lengths = [], [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(_text, dtype=torch.long)
        text_list.append(processed_text)
        lengths.append(len(processed_text)) # Store original lengths

    # Pad sequences to the max length in this batch
    # batch_first=True means output shape is (batch_size, seq_len)
    text_list_padded = pad_sequence(text_list, batch_first=True, padding_value=0) # Use PAD index 0

    label_list = torch.tensor(label_list, dtype=torch.long)
    lengths = torch.tensor(lengths, dtype=torch.long) # Useful for packed sequences later if needed

    return text_list_padded, label_list, lengths


def load_glove_embeddings(glove_path, vocab_stoi, embedding_dim):
    """Loads GloVe embeddings for words in the vocabulary."""
    if not os.path.exists(glove_path):
        logging.error(f"GloVe file not found at: {glove_path}")
        return None

    logging.info(f"Loading GloVe embeddings from {glove_path}")
    embeddings_index = {}
    try:
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                try:
                    vector = np.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = vector
                except ValueError:
                    logging.debug(f"Skipping line in GloVe file (could not parse vector): {line[:50]}...")
                    continue # Skip lines that might not parse correctly
    except Exception as e:
        logging.error(f"Error reading GloVe file: {e}")
        return None

    logging.info(f"Found {len(embeddings_index)} word vectors in GloVe file.")

    vocab_size = len(vocab_stoi)
    # Initialize embedding matrix with zeros or small random values
    # np.random.seed(RANDOM_STATE)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    # Or random init: embedding_matrix = np.random.rand(vocab_size, embedding_dim) * 0.02 - 0.01

    found_count = 0
    for word, i in vocab_stoi.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will remain zeros (or random).
            embedding_matrix[i] = embedding_vector
            found_count += 1
        else:
            # Handle <unk> and <pad> specifically
            if word == "<unk>": # Initialize <unk> token vector (e.g., average or random)
                embedding_matrix[i] = np.random.rand(embedding_dim) * 0.02 - 0.01 # Small random
                # pass
            elif word == "<pad>":
                embedding_matrix[i] = np.zeros(embedding_dim) # Ensure PAD is zeros

    logging.info(f"Initialized embedding matrix. Shape: {embedding_matrix.shape}")
    logging.info(f"Found pre-trained vectors for {found_count}/{vocab_size} words in vocabulary.")
    return torch.tensor(embedding_matrix, dtype=torch.float)


def calculate_metrics(y_true, y_pred):
    """Calculates evaluation metrics."""
    accuracy = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    return {
        "Accuracy": accuracy,
        "F1 (Macro)": f1_macro,
        "Precision (Macro)": precision_macro,
        "Recall (Macro)": recall_macro,
        "F1 (Weighted)": f1_weighted,
        "Precision (Weighted)": precision_weighted,
        "Recall (Weighted)": recall_weighted,
    }

# 3. Model Definitions

In [5]:
# --- Base Model with Embedding Handling ---
class BaseModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx, pretrained_embeddings=None, freeze_embeddings=False):
        super().__init__()
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(
                pretrained_embeddings,
                freeze=freeze_embeddings,
                padding_idx=pad_idx
            )
            logging.info(f"Using pre-trained embeddings. Freeze: {freeze_embeddings}")
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
            logging.info("Using learned embeddings.")
        self.output_dim = output_dim

In [6]:
# --- 1. MLP on Averaged Embeddings ---
# Note: This averages embeddings before passing to MLP, simpler than sequence processing.
class MLPAveraged(BaseModel):
     def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx, hidden_dim1=64, hidden_dim2=32, dropout=DROPOUT, pretrained_embeddings=None, freeze_embeddings=False):
         # embedding_dim is input_dim for MLP part
         super().__init__(vocab_size, embedding_dim, output_dim, pad_idx, pretrained_embeddings, freeze_embeddings)
         self.fc1 = nn.Linear(embedding_dim, hidden_dim1)
         self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
         self.fc3 = nn.Linear(hidden_dim2, output_dim)
         self.dropout = nn.Dropout(dropout)
         self.relu = nn.ReLU()

     def forward(self, text, text_lengths=None): # text_lengths unused here but kept for consistency
         # text shape: (batch_size, seq_len)
         embedded = self.embedding(text)
         # embedded shape: (batch_size, seq_len, embedding_dim)

         # Average embeddings across sequence length dimension
         # Need to handle padding: Mask out pad tokens before averaging
         pad_mask = (text != self.embedding.padding_idx).float().unsqueeze(-1) # (batch_size, seq_len, 1)
         embedded = embedded * pad_mask # Zero out embeddings for pad tokens
         # Sum embeddings and divide by actual lengths (excluding pad tokens)
         # Calculate actual lengths (sum of non-pad tokens)
         actual_lengths = pad_mask.sum(dim=1)
         actual_lengths = torch.max(actual_lengths, torch.ones_like(actual_lengths)) # Avoid division by zero for empty sequences

         pooled = embedded.sum(dim=1) / actual_lengths # Shape: (batch_size, embedding_dim)

         x = self.dropout(self.relu(self.fc1(pooled)))
         x = self.dropout(self.relu(self.fc2(x)))
         output = self.fc3(x) # Shape: (batch_size, output_dim)
         return output

In [7]:
# --- 2. Basic RNN ---
class RNNModel(BaseModel):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx, bidirectional=False, pretrained_embeddings=None, freeze_embeddings=False):
        super().__init__(vocab_size, embedding_dim, output_dim, pad_idx, pretrained_embeddings, freeze_embeddings)
        self.rnn = nn.RNN(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True, # Input shape: (batch_size, seq_len, embed_dim)
                          dropout=dropout if n_layers > 1 else 0) # Dropout only between layers
        # Adjust linear layer input size for bidirectional
        fc_in_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths): # text_lengths useful for PackedSequence but not used here
        # text shape: (batch_size, seq_len)
        embedded = self.dropout(self.embedding(text))
        # embedded shape: (batch_size, seq_len, embedding_dim)

        # No packing used here for simplicity, RNN processes padded sequences
        # Output shape: (batch_size, seq_len, num_directions * hidden_dim)
        # Hidden shape: (n_layers * num_directions, batch_size, hidden_dim)
        rnn_output, hidden = self.rnn(embedded)

        # Get output from the last time step (or concatenate final forward/backward hidden states)
        # hidden[-1] is the hidden state of the last layer (forward)
        # hidden[-2] would be the last backward state if bidirectional
        if self.rnn.bidirectional:
            # Concatenate the final hidden states of the last layer from both directions
            # hidden shape: (n_layers * 2, batch, hidden_dim)
            # hidden[-2,:,:] is last layer's forward, hidden[-1,:,:] is last layer's backward
            hidden_fwd = hidden[-2,:,:]
            hidden_bwd = hidden[-1,:,:]
            hidden_cat = torch.cat((hidden_fwd, hidden_bwd), dim=1)
        else:
            # hidden shape: (n_layers * 1, batch, hidden_dim)
            hidden_cat = hidden[-1,:,:]

        # Apply dropout and final linear layer
        output = self.fc(self.dropout(hidden_cat)) # Shape: (batch_size, output_dim)
        return output

In [8]:
# --- 3. LSTM Model ---
class LSTMModel(BaseModel):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx, bidirectional=True, pretrained_embeddings=None, freeze_embeddings=False):
        super().__init__(vocab_size, embedding_dim, output_dim, pad_idx, pretrained_embeddings, freeze_embeddings)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            batch_first=True,
                            dropout=dropout if n_layers > 1 else 0)
        fc_in_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths): # text_lengths can be used with pack_padded_sequence
        embedded = self.dropout(self.embedding(text))

        # Optional: Use packed sequences for efficiency (handles padding)
        # packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
        # packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        # Using padded sequence directly (simpler for this example):
        lstm_output, (hidden, cell) = self.lstm(embedded)

        # hidden shape: (n_layers * num_directions, batch_size, hidden_dim)
        # cell shape: (n_layers * num_directions, batch_size, hidden_dim)
        if self.lstm.bidirectional:
            hidden_fwd = hidden[-2,:,:]
            hidden_bwd = hidden[-1,:,:]
            hidden_cat = torch.cat((hidden_fwd, hidden_bwd), dim=1)
        else:
            hidden_cat = hidden[-1,:,:]

        output = self.fc(self.dropout(hidden_cat))
        return output

In [9]:
# --- 4. CNN Model (1D Convolution) ---
class CNNModel(BaseModel):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx, pretrained_embeddings=None, freeze_embeddings=False):
        super().__init__(vocab_size, embedding_dim, output_dim, pad_idx, pretrained_embeddings, freeze_embeddings)
        # Create multiple convolutional layers with different kernel sizes
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=n_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        # The output dimension after concatenating pooled features from all kernel sizes
        fc_in_dim = len(filter_sizes) * n_filters
        self.fc = nn.Linear(fc_in_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, text, text_lengths=None):
        # text: [batch size, seq len]
        embedded = self.dropout(self.embedding(text))
        # embedded: [batch size, seq len, emb dim]

        # Conv1d expects input shape: (batch_size, channels, seq_len)
        # So, permute dimensions: (batch_size, emb dim, seq len)
        embedded = embedded.permute(0, 2, 1)

        # Apply convolutions and pooling
        conved = [self.relu(conv(embedded)) for conv in self.convs]
        # conved[n]: [batch size, n filters, seq len - filter_sizes[n] + 1]

        # Apply max pooling over time (sequence length dimension)
        # Pool size should cover the entire sequence length dimension after convolution
        pooled = [torch.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # pooled[n]: [batch size, n filters]

        # Concatenate the pooled features from different filter sizes
        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat: [batch size, n filters * len(filter_sizes)]

        return self.fc(cat)

In [10]:
# --- 5. CNN-LSTM Hybrid Model ---
class CNNLSTMModel(BaseModel):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_size_cnn, # Single filter size for simplicity here
                 hidden_dim_lstm, output_dim, n_layers_lstm, dropout, pad_idx,
                 pretrained_embeddings=None, freeze_embeddings=False):
        super().__init__(vocab_size, embedding_dim, output_dim, pad_idx, pretrained_embeddings, freeze_embeddings)
        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=filter_size_cnn)
        self.relu = nn.ReLU()
        # Input to LSTM is the output channels of CNN
        self.lstm = nn.LSTM(n_filters, # Input features = CNN output channels
                            hidden_dim_lstm,
                            num_layers=n_layers_lstm,
                            bidirectional=True, # Often good to use bidirectional
                            batch_first=True,
                            dropout=dropout if n_layers_lstm > 1 else 0)
        fc_in_dim = hidden_dim_lstm * 2 # Bidirectional LSTM
        self.fc = nn.Linear(fc_in_dim, output_dim)
        self.dropout_embed = nn.Dropout(dropout)
        self.dropout_final = nn.Dropout(dropout)

    def forward(self, text, text_lengths=None):
        # text: [batch size, seq len]
        embedded = self.dropout_embed(self.embedding(text))
        # embedded: [batch size, seq len, emb dim]

        # --- CNN Part ---
        # Permute for Conv1d: [batch size, emb dim, seq len]
        embedded_permuted = embedded.permute(0, 2, 1)
        conved = self.relu(self.conv(embedded_permuted))
        # conved: [batch size, n filters, new seq len]
        # Permute back for LSTM: [batch size, new seq len, n filters]
        conved_permuted = conved.permute(0, 2, 1)

        # --- LSTM Part ---
        lstm_output, (hidden, cell) = self.lstm(conved_permuted)
        # lstm_output: [batch size, seq len, num directions * hidden dim]
        # hidden: [n layers * num directions, batch size, hidden dim]

        # Concatenate final forward and backward hidden states
        hidden_fwd = hidden[-2,:,:]
        hidden_bwd = hidden[-1,:,:]
        hidden_cat = torch.cat((hidden_fwd, hidden_bwd), dim=1)

        # --- Final Output ---
        output = self.fc(self.dropout_final(hidden_cat))
        return output

# 4. Training and Evaluation Functions

In [11]:
def train_epoch(model, iterator, optimizer, criterion, device, grad_clip=None):
    """Trains the model for one epoch."""
    model.train()
    epoch_loss = 0
    start_time = time.time()

    for batch_idx, (text, labels, lengths) in enumerate(iterator):
        text, labels = text.to(device), labels.to(device)
        lengths = lengths.to('cpu') # lengths for pack_padded_sequence must be on CPU

        optimizer.zero_grad()

        # Forward pass
        predictions = model(text, lengths) # Pass lengths if model uses them

        # Calculate loss
        loss = criterion(predictions, labels)

        # Backward pass and optimization
        loss.backward()

        # Gradient clipping
        if grad_clip:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        optimizer.step()

        epoch_loss += loss.item()

        # Optional: Print batch progress
        if batch_idx % 100 == 0:
            logging.debug(f"Batch {batch_idx}/{len(iterator)}, Loss: {loss.item():.4f}")

    end_time = time.time()
    train_time_epoch = end_time - start_time
    return epoch_loss / len(iterator), train_time_epoch


def evaluate(model, iterator, criterion, device):
    """Evaluates the model on a given dataset iterator."""
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    start_time = time.time()

    with torch.no_grad():
        for batch_idx, (text, labels, lengths) in enumerate(iterator):
            text, labels = text.to(device), labels.to(device)
            lengths = lengths.to('cpu')

            predictions = model(text, lengths)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()

            # Get predicted labels
            preds = torch.argmax(predictions, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    end_time = time.time()
    eval_time = end_time - start_time
    metrics = calculate_metrics(all_labels, all_preds)
    avg_loss = epoch_loss / len(iterator)

    conf_matrix = confusion_matrix(all_labels, all_preds)

    return avg_loss, metrics, eval_time, conf_matrix

# 5. Run Experiments

In [12]:
all_results = []

In [13]:
# --- Loop through each dataset defined in the configuration ---
for dataset_name, config in DATASETS_TO_PROCESS.items():
    print(f"\n{'='*20} Processing Dataset: {dataset_name} {'='*20}")
    logging.info(f"Processing Dataset: {dataset_name}")

    # 1. Load Data
    train_df = load_data(config['train_path'])
    val_df = load_data(config['val_path'])
    test_df = load_data(config['test_path'])

    if train_df is None or val_df is None or test_df is None:
        logging.error(f"Skipping dataset {dataset_name} due to data loading errors.")
        continue

    # 2. Build or Load Vocabulary
    if os.path.exists(config['vocab_path']):
        vocab = joblib.load(config['vocab_path'])
        logging.info(f"Loaded existing vocabulary from {config['vocab_path']}")
        # Check if special tokens exist, add if missing (backward compatibility)
        if '<pad>' not in vocab: vocab['<pad>'] = 0
        if '<unk>' not in vocab: vocab['<unk>'] = 1
    else:
        vocab = build_vocab(train_df[TEXT_COLUMN].tolist(), min_freq=MIN_WORD_FREQ)
        joblib.dump(vocab, config['vocab_path'])
        logging.info(f"Built and saved vocabulary to {config['vocab_path']}")

    vocab_size = len(vocab)
    pad_idx = vocab['<pad>']

    # 3. Create Datasets and DataLoaders
    train_dataset = SentimentDataset(train_df[TEXT_COLUMN].tolist(), train_df[TARGET_COLUMN].tolist(), vocab)
    val_dataset = SentimentDataset(val_df[TEXT_COLUMN].tolist(), val_df[TARGET_COLUMN].tolist(), vocab)
    test_dataset = SentimentDataset(test_df[TEXT_COLUMN].tolist(), test_df[TARGET_COLUMN].tolist(), vocab)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

    # 4. Load Pre-trained Embeddings (if needed)
    glove_embeddings = None
    if os.path.exists(GLOVE_PATH):
        glove_embeddings = load_glove_embeddings(GLOVE_PATH, vocab, EMBEDDING_DIM)
        if glove_embeddings is None:
            logging.warning("Failed to load GloVe embeddings. Models requiring them will use learned embeddings.")
    else:
        logging.warning(f"GloVe path not found: {GLOVE_PATH}. Pre-trained embeddings disabled.")


    # --- Define Models to Run ---
    models_to_run = {
        # --- Mid Level ---
        # Name: (ModelClass, {kwargs}, use_pretrained_embed, freeze_embed)
        "MLP (Avg Learned Emb)": (MLPAveraged, {'hidden_dim1': 64, 'hidden_dim2': 32, 'dropout': DROPOUT, 'embedding_dim': LEARNED_EMBEDDING_DIM}, False, False),
        "RNN (Learned Emb)": (RNNModel, {'hidden_dim': HIDDEN_DIM_RNN_LSTM, 'n_layers': N_LAYERS_RNN_LSTM, 'dropout': DROPOUT, 'bidirectional': False, 'embedding_dim': LEARNED_EMBEDDING_DIM}, False, False),
        "LSTM (Learned Emb)": (LSTMModel, {'hidden_dim': HIDDEN_DIM_RNN_LSTM, 'n_layers': N_LAYERS_RNN_LSTM, 'dropout': DROPOUT, 'bidirectional': False, 'embedding_dim': LEARNED_EMBEDDING_DIM}, False, False),
        "BiLSTM (Learned Emb)": (LSTMModel, {'hidden_dim': HIDDEN_DIM_RNN_LSTM, 'n_layers': N_LAYERS_RNN_LSTM, 'dropout': DROPOUT, 'bidirectional': True, 'embedding_dim': LEARNED_EMBEDDING_DIM}, False, False),
        "CNN (Learned Emb)": (CNNModel, {'n_filters': N_FILTERS_CNN, 'filter_sizes': FILTER_SIZES_CNN, 'dropout': DROPOUT, 'embedding_dim': LEARNED_EMBEDDING_DIM}, False, False),

        # --- Advanced Level (Using Pre-trained) ---
        # Requires GloVe embeddings to be loaded successfully
        "MLP (Avg GloVe Emb)": (MLPAveraged, {'hidden_dim1': 64, 'hidden_dim2': 32, 'dropout': DROPOUT, 'embedding_dim': EMBEDDING_DIM}, True, True), # Freeze GloVe
        "CNN (GloVe Emb)": (CNNModel, {'n_filters': N_FILTERS_CNN, 'filter_sizes': FILTER_SIZES_CNN, 'dropout': DROPOUT, 'embedding_dim': EMBEDDING_DIM}, True, True), # Freeze GloVe
        "LSTM (GloVe Emb)": (LSTMModel, {'hidden_dim': HIDDEN_DIM_RNN_LSTM, 'n_layers': N_LAYERS_RNN_LSTM, 'dropout': DROPOUT, 'bidirectional': False, 'embedding_dim': EMBEDDING_DIM}, True, True), # Freeze GloVe
        "BiLSTM (GloVe Emb)": (LSTMModel, {'hidden_dim': HIDDEN_DIM_RNN_LSTM, 'n_layers': N_LAYERS_RNN_LSTM, 'dropout': DROPOUT, 'bidirectional': True, 'embedding_dim': EMBEDDING_DIM}, True, True), # Freeze GloVe
        "CNN-LSTM (GloVe Emb)": (CNNLSTMModel, {'n_filters': N_FILTERS_CNN, 'filter_size_cnn': 3, 'hidden_dim_lstm': HIDDEN_DIM_RNN_LSTM, 'n_layers_lstm': N_LAYERS_RNN_LSTM, 'dropout': DROPOUT, 'embedding_dim': EMBEDDING_DIM}, True, True), # Freeze GloVe
    }

    # --- Loop through each model configuration ---
    for model_name, (ModelClass, model_kwargs, use_pretrained, freeze_embed) in models_to_run.items():

        # Skip models requiring GloVe if loading failed
        if use_pretrained and glove_embeddings is None:
            logging.warning(f"Skipping model '{model_name}' as pre-trained GloVe embeddings were not loaded.")
            continue

        print(f"\n--- Training Model: {model_name} ---")
        logging.info(f"Starting training for {model_name} on {dataset_name}")
        results = {"Dataset": dataset_name, "Model": model_name}

        try:
            # Instantiate model
            current_embedding_dim = model_kwargs['embedding_dim'] # Get dim from kwargs
            current_pretrained_embeddings = glove_embeddings if use_pretrained else None

            model = ModelClass(
                vocab_size=vocab_size,
                output_dim=NUM_CLASSES,
                pad_idx=pad_idx,
                pretrained_embeddings=current_pretrained_embeddings,
                freeze_embeddings=freeze_embed,
                **model_kwargs # Pass specific model architecture args
            ).to(DEVICE)

            # Count parameters
            num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            logging.info(f"Model: {model_name}, Trainable Parameters: {num_params:,}")

            # Define optimizer and criterion
            optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
            criterion = nn.CrossEntropyLoss().to(DEVICE) # Handles softmax internally

            best_val_loss = float('inf')
            total_train_time = 0
            model_save_path = os.path.join(config['model_dir'], f"{dataset_name.replace(' ', '_')}_{model_name.replace(' ', '_')}_best.pt")

            # Training loop
            for epoch in range(NUM_EPOCHS):
                start_epoch_time = time.time()

                train_loss, train_time_epoch = train_epoch(model, train_loader, optimizer, criterion, DEVICE, GRADIENT_CLIP)
                val_loss, val_metrics, _, _ = evaluate(model, val_loader, criterion, DEVICE)

                total_train_time += train_time_epoch
                end_epoch_time = time.time()
                epoch_mins, epoch_secs = divmod(end_epoch_time - start_epoch_time, 60)

                logging.info(f'Epoch: {epoch+1:02} | Time: {int(epoch_mins)}m {epoch_secs:.0f}s')
                logging.info(f'\tTrain Loss: {train_loss:.3f}')
                logging.info(f'\t Val. Loss: {val_loss:.3f} | Val. F1 (Macro): {val_metrics["F1 (Macro)"]:.4f}')

                # Save best model based on validation loss
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    torch.save(model.state_dict(), model_save_path)
                    logging.info(f"Saved best model to {model_save_path} (Epoch {epoch+1})")

            results["Train Time (Epoch, s)"] = round(total_train_time / NUM_EPOCHS, 3) # Avg time per epoch

            # Load best model and evaluate on Test set
            model.load_state_dict(torch.load(model_save_path))
            logging.info(f"Loaded best model from {model_save_path} for final test evaluation.")

            test_loss, test_metrics, test_eval_time, test_conf_matrix = evaluate(model, test_loader, criterion, DEVICE)
            results.update(test_metrics)
            results["Eval Time (s)"] = round(test_eval_time, 3)

            logging.info("Test Set Performance:")
            for key, value in test_metrics.items():
                logging.info(f"\t{key}: {value:.4f}")
            logging.info(f"\tTest Loss: {test_loss:.3f}")
            logging.info(f"\tEval Time: {test_eval_time:.3f}s")

            # --- Save Confusion Matrix CSV ---
            cm_filename = f"{dataset_name.replace(' ', '_')}_{model_name.replace(' ', '_')}_confusion_matrix.csv" # Change extension to .csv
            cm_save_path = os.path.join(config['result_dir'], cm_filename)
            try:
                # Convert numpy array to DataFrame for better CSV formatting with labels
                cm_df = pd.DataFrame(test_conf_matrix, 
                                    index=LABEL_MAP.keys(), # Rows are True Labels
                                    columns=LABEL_MAP.keys()) # Columns are Predicted Labels
                cm_df.index.name = 'True Label'
                cm_df.columns.name = 'Predicted Label'
                
                # Save to CSV
                cm_df.to_csv(cm_save_path, index=True, mode='w+') # index=True to include row/column names
                
                logging.info(f"Saved confusion matrix CSV to {cm_save_path}")
            except Exception as cm_save_e:
                logging.error(f"Failed to save confusion matrix CSV for {model_name}: {cm_save_e}")
        # --- End Save Confusion Matrix CSV ---


        except Exception as e:
            logging.error(f"!!! An error occurred while processing {model_name} for {dataset_name}: {e}", exc_info=True) # Log traceback
            # Record partial results if possible
            results["Accuracy"] = np.nan
            results["F1 (Macro)"] = np.nan
            # Fill other metrics with NaN or error messages
            for metric in METRICS_TO_CALCULATE:
                if metric not in results:
                    results[metric] = np.nan if metric not in ["Train Time (Epoch, s)", "Eval Time (s)"] else 0.0
        finally:
            all_results.append(results)
            # Clean up memory
            del model
            if 'optimizer' in locals(): del optimizer
            if 'criterion' in locals(): del criterion   
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

# --- Combine results into a DataFrame ---
results_df = pd.DataFrame(all_results)

2025-05-01 10:01:14,636 - INFO - Processing Dataset: Book Review





2025-05-01 10:01:16,627 - INFO - Loaded existing vocabulary from ..\models\dl\book_reviews\vocab.pt
2025-05-01 10:01:16,651 - INFO - Loading GloVe embeddings from ..\data\embeddings\glove.6B.100d.txt
2025-05-01 10:01:22,456 - INFO - Found 400000 word vectors in GloVe file.
2025-05-01 10:01:22,552 - INFO - Initialized embedding matrix. Shape: (86590, 100)
2025-05-01 10:01:22,553 - INFO - Found pre-trained vectors for 72856/86590 words in vocabulary.
2025-05-01 10:01:22,625 - INFO - Starting training for MLP (Avg Learned Emb) on Book Review
2025-05-01 10:01:22,659 - INFO - Using learned embeddings.
2025-05-01 10:01:22,737 - INFO - Model: MLP (Avg Learned Emb), Trainable Parameters: 8,667,643



--- Training Model: MLP (Avg Learned Emb) ---


2025-05-01 10:01:57,224 - INFO - Epoch: 01 | Time: 0m 34s
2025-05-01 10:01:57,225 - INFO - 	Train Loss: 0.496
2025-05-01 10:01:57,225 - INFO - 	 Val. Loss: 0.421 | Val. F1 (Macro): 0.5111
2025-05-01 10:01:57,304 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_MLP_(Avg_Learned_Emb)_best.pt (Epoch 1)
2025-05-01 10:02:31,496 - INFO - Epoch: 02 | Time: 0m 34s
2025-05-01 10:02:31,497 - INFO - 	Train Loss: 0.404
2025-05-01 10:02:31,498 - INFO - 	 Val. Loss: 0.404 | Val. F1 (Macro): 0.5195
2025-05-01 10:02:31,561 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_MLP_(Avg_Learned_Emb)_best.pt (Epoch 2)
2025-05-01 10:03:04,637 - INFO - Epoch: 03 | Time: 0m 33s
2025-05-01 10:03:04,638 - INFO - 	Train Loss: 0.372
2025-05-01 10:03:04,638 - INFO - 	 Val. Loss: 0.397 | Val. F1 (Macro): 0.5560
2025-05-01 10:03:04,693 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_MLP_(Avg_Learned_Emb)_best.pt (Epoch 3)
2025-05-01 10:03:38,201 - INFO - Epoch: 04 


--- Training Model: RNN (Learned Emb) ---


2025-05-01 10:18:53,157 - INFO - Epoch: 01 | Time: 0m 57s
2025-05-01 10:18:53,158 - INFO - 	Train Loss: 0.649
2025-05-01 10:18:53,158 - INFO - 	 Val. Loss: 0.643 | Val. F1 (Macro): 0.2958
2025-05-01 10:18:53,213 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_RNN_(Learned_Emb)_best.pt (Epoch 1)
2025-05-01 10:19:48,200 - INFO - Epoch: 02 | Time: 0m 55s
2025-05-01 10:19:48,201 - INFO - 	Train Loss: 0.647
2025-05-01 10:19:48,202 - INFO - 	 Val. Loss: 0.644 | Val. F1 (Macro): 0.2958
2025-05-01 10:20:42,546 - INFO - Epoch: 03 | Time: 0m 54s
2025-05-01 10:20:42,547 - INFO - 	Train Loss: 0.645
2025-05-01 10:20:42,548 - INFO - 	 Val. Loss: 0.642 | Val. F1 (Macro): 0.2958
2025-05-01 10:20:42,599 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_RNN_(Learned_Emb)_best.pt (Epoch 3)
2025-05-01 10:21:36,912 - INFO - Epoch: 04 | Time: 0m 54s
2025-05-01 10:21:36,913 - INFO - 	Train Loss: 0.644
2025-05-01 10:21:36,913 - INFO - 	 Val. Loss: 0.643 | Val. F1 (Macro): 0


--- Training Model: LSTM (Learned Emb) ---


2025-05-01 10:46:40,318 - INFO - Epoch: 01 | Time: 1m 20s
2025-05-01 10:46:40,319 - INFO - 	Train Loss: 0.650
2025-05-01 10:46:40,320 - INFO - 	 Val. Loss: 0.644 | Val. F1 (Macro): 0.2958
2025-05-01 10:46:40,380 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_LSTM_(Learned_Emb)_best.pt (Epoch 1)
2025-05-01 10:48:00,132 - INFO - Epoch: 02 | Time: 1m 20s
2025-05-01 10:48:00,133 - INFO - 	Train Loss: 0.645
2025-05-01 10:48:00,133 - INFO - 	 Val. Loss: 0.642 | Val. F1 (Macro): 0.2958
2025-05-01 10:48:00,191 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_LSTM_(Learned_Emb)_best.pt (Epoch 2)
2025-05-01 10:49:20,146 - INFO - Epoch: 03 | Time: 1m 20s
2025-05-01 10:49:20,147 - INFO - 	Train Loss: 0.643
2025-05-01 10:49:20,148 - INFO - 	 Val. Loss: 0.641 | Val. F1 (Macro): 0.2958
2025-05-01 10:49:20,203 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_LSTM_(Learned_Emb)_best.pt (Epoch 3)
2025-05-01 10:50:39,824 - INFO - Epoch: 04 | Time: 1


--- Training Model: BiLSTM (Learned Emb) ---


2025-05-01 11:34:17,090 - INFO - Epoch: 01 | Time: 2m 41s
2025-05-01 11:34:17,091 - INFO - 	Train Loss: 0.575
2025-05-01 11:34:17,091 - INFO - 	 Val. Loss: 0.580 | Val. F1 (Macro): 0.4282
2025-05-01 11:34:17,148 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_BiLSTM_(Learned_Emb)_best.pt (Epoch 1)
2025-05-01 11:36:51,097 - INFO - Epoch: 02 | Time: 2m 34s
2025-05-01 11:36:51,098 - INFO - 	Train Loss: 0.466
2025-05-01 11:36:51,098 - INFO - 	 Val. Loss: 0.491 | Val. F1 (Macro): 0.4796
2025-05-01 11:36:51,155 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_BiLSTM_(Learned_Emb)_best.pt (Epoch 2)
2025-05-01 11:39:25,776 - INFO - Epoch: 03 | Time: 2m 35s
2025-05-01 11:39:25,777 - INFO - 	Train Loss: 0.415
2025-05-01 11:39:25,778 - INFO - 	 Val. Loss: 0.461 | Val. F1 (Macro): 0.4998
2025-05-01 11:39:25,836 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_BiLSTM_(Learned_Emb)_best.pt (Epoch 3)
2025-05-01 11:41:59,678 - INFO - Epoch: 04 | T


--- Training Model: CNN (Learned Emb) ---


2025-05-01 12:52:45,713 - INFO - Epoch: 01 | Time: 2m 6s
2025-05-01 12:52:45,714 - INFO - 	Train Loss: 0.609
2025-05-01 12:52:45,714 - INFO - 	 Val. Loss: 0.464 | Val. F1 (Macro): 0.4674
2025-05-01 12:52:45,775 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN_(Learned_Emb)_best.pt (Epoch 1)
2025-05-01 12:54:30,697 - INFO - Epoch: 02 | Time: 1m 45s
2025-05-01 12:54:30,698 - INFO - 	Train Loss: 0.516
2025-05-01 12:54:30,698 - INFO - 	 Val. Loss: 0.430 | Val. F1 (Macro): 0.5146
2025-05-01 12:54:30,774 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN_(Learned_Emb)_best.pt (Epoch 2)
2025-05-01 12:56:10,898 - INFO - Epoch: 03 | Time: 1m 40s
2025-05-01 12:56:10,900 - INFO - 	Train Loss: 0.475
2025-05-01 12:56:10,900 - INFO - 	 Val. Loss: 0.408 | Val. F1 (Macro): 0.5400
2025-05-01 12:56:10,993 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN_(Learned_Emb)_best.pt (Epoch 3)
2025-05-01 12:57:48,828 - INFO - Epoch: 04 | Time: 1m 38


--- Training Model: MLP (Avg GloVe Emb) ---


2025-05-01 13:38:01,045 - INFO - Epoch: 01 | Time: 0m 31s
2025-05-01 13:38:01,046 - INFO - 	Train Loss: 0.580
2025-05-01 13:38:01,047 - INFO - 	 Val. Loss: 0.539 | Val. F1 (Macro): 0.2958
2025-05-01 13:38:01,103 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_MLP_(Avg_GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 13:38:32,093 - INFO - Epoch: 02 | Time: 0m 31s
2025-05-01 13:38:32,094 - INFO - 	Train Loss: 0.544
2025-05-01 13:38:32,095 - INFO - 	 Val. Loss: 0.528 | Val. F1 (Macro): 0.3482
2025-05-01 13:38:32,181 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_MLP_(Avg_GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 13:39:03,556 - INFO - Epoch: 03 | Time: 0m 31s
2025-05-01 13:39:03,558 - INFO - 	Train Loss: 0.539
2025-05-01 13:39:03,559 - INFO - 	 Val. Loss: 0.522 | Val. F1 (Macro): 0.3857
2025-05-01 13:39:03,635 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_MLP_(Avg_GloVe_Emb)_best.pt (Epoch 3)
2025-05-01 13:39:35,686 - INFO - Epoch: 04 | Time


--- Training Model: CNN (GloVe Emb) ---


2025-05-01 13:54:33,974 - INFO - Epoch: 01 | Time: 1m 4s
2025-05-01 13:54:33,976 - INFO - 	Train Loss: 0.566
2025-05-01 13:54:33,976 - INFO - 	 Val. Loss: 0.486 | Val. F1 (Macro): 0.4719
2025-05-01 13:54:34,028 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN_(GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 13:55:41,246 - INFO - Epoch: 02 | Time: 1m 7s
2025-05-01 13:55:41,247 - INFO - 	Train Loss: 0.529
2025-05-01 13:55:41,247 - INFO - 	 Val. Loss: 0.473 | Val. F1 (Macro): 0.4747
2025-05-01 13:55:41,300 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN_(GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 13:56:51,758 - INFO - Epoch: 03 | Time: 1m 10s
2025-05-01 13:56:51,759 - INFO - 	Train Loss: 0.518
2025-05-01 13:56:51,760 - INFO - 	 Val. Loss: 0.461 | Val. F1 (Macro): 0.4793
2025-05-01 13:56:51,834 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN_(GloVe_Emb)_best.pt (Epoch 3)
2025-05-01 13:58:01,584 - INFO - Epoch: 04 | Time: 1m 10s
2025-


--- Training Model: LSTM (GloVe Emb) ---


2025-05-01 14:27:47,930 - INFO - Epoch: 01 | Time: 1m 21s
2025-05-01 14:27:47,932 - INFO - 	Train Loss: 0.650
2025-05-01 14:27:47,933 - INFO - 	 Val. Loss: 0.641 | Val. F1 (Macro): 0.2958
2025-05-01 14:27:48,018 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_LSTM_(GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 14:29:07,098 - INFO - Epoch: 02 | Time: 1m 19s
2025-05-01 14:29:07,100 - INFO - 	Train Loss: 0.645
2025-05-01 14:29:07,100 - INFO - 	 Val. Loss: 0.643 | Val. F1 (Macro): 0.2958
2025-05-01 14:30:26,867 - INFO - Epoch: 03 | Time: 1m 20s
2025-05-01 14:30:26,868 - INFO - 	Train Loss: 0.643
2025-05-01 14:30:26,868 - INFO - 	 Val. Loss: 0.642 | Val. F1 (Macro): 0.2958
2025-05-01 14:31:45,297 - INFO - Epoch: 04 | Time: 1m 18s
2025-05-01 14:31:45,300 - INFO - 	Train Loss: 0.643
2025-05-01 14:31:45,301 - INFO - 	 Val. Loss: 0.642 | Val. F1 (Macro): 0.2958
2025-05-01 14:33:04,852 - INFO - Epoch: 05 | Time: 1m 20s
2025-05-01 14:33:04,854 - INFO - 	Train Loss: 0.640
2025-05-01 1


--- Training Model: BiLSTM (GloVe Emb) ---


2025-05-01 15:14:50,687 - INFO - Epoch: 01 | Time: 6m 49s
2025-05-01 15:14:50,688 - INFO - 	Train Loss: 0.565
2025-05-01 15:14:50,688 - INFO - 	 Val. Loss: 0.471 | Val. F1 (Macro): 0.4500
2025-05-01 15:14:50,739 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_BiLSTM_(GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 15:28:48,214 - INFO - Epoch: 02 | Time: 13m 57s
2025-05-01 15:28:48,215 - INFO - 	Train Loss: 0.478
2025-05-01 15:28:48,215 - INFO - 	 Val. Loss: 0.445 | Val. F1 (Macro): 0.4956
2025-05-01 15:28:48,282 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_BiLSTM_(GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 15:42:34,959 - INFO - Epoch: 03 | Time: 13m 47s
2025-05-01 15:42:34,963 - INFO - 	Train Loss: 0.448
2025-05-01 15:42:34,964 - INFO - 	 Val. Loss: 0.420 | Val. F1 (Macro): 0.5093
2025-05-01 15:42:35,019 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_BiLSTM_(GloVe_Emb)_best.pt (Epoch 3)
2025-05-01 15:56:22,739 - INFO - Epoch: 04 | Time:


--- Training Model: CNN-LSTM (GloVe Emb) ---


2025-05-01 20:01:52,810 - INFO - Epoch: 01 | Time: 2m 11s
2025-05-01 20:01:52,811 - INFO - 	Train Loss: 0.565
2025-05-01 20:01:52,812 - INFO - 	 Val. Loss: 0.465 | Val. F1 (Macro): 0.4815
2025-05-01 20:01:52,862 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN-LSTM_(GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 20:04:02,955 - INFO - Epoch: 02 | Time: 2m 10s
2025-05-01 20:04:02,956 - INFO - 	Train Loss: 0.482
2025-05-01 20:04:02,956 - INFO - 	 Val. Loss: 0.441 | Val. F1 (Macro): 0.4977
2025-05-01 20:04:03,012 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN-LSTM_(GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 20:06:13,510 - INFO - Epoch: 03 | Time: 2m 10s
2025-05-01 20:06:13,511 - INFO - 	Train Loss: 0.456
2025-05-01 20:06:13,512 - INFO - 	 Val. Loss: 0.427 | Val. F1 (Macro): 0.5161
2025-05-01 20:06:13,566 - INFO - Saved best model to ..\models\dl\book_reviews\Book_Review_CNN-LSTM_(GloVe_Emb)_best.pt (Epoch 3)
2025-05-01 20:08:24,448 - INFO - Epoch: 04 | T




2025-05-01 21:05:12,825 - INFO - Loading GloVe embeddings from ..\data\embeddings\glove.6B.100d.txt
2025-05-01 21:05:18,364 - INFO - Found 400000 word vectors in GloVe file.
2025-05-01 21:05:18,368 - INFO - Initialized embedding matrix. Shape: (2845, 100)
2025-05-01 21:05:18,369 - INFO - Found pre-trained vectors for 2672/2845 words in vocabulary.
2025-05-01 21:05:18,468 - INFO - Starting training for MLP (Avg Learned Emb) on Financial News
2025-05-01 21:05:18,473 - INFO - Using learned embeddings.
2025-05-01 21:05:18,476 - INFO - Model: MLP (Avg Learned Emb), Trainable Parameters: 293,143



--- Training Model: MLP (Avg Learned Emb) ---


2025-05-01 21:05:18,876 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:05:18,877 - INFO - 	Train Loss: 0.974
2025-05-01 21:05:18,877 - INFO - 	 Val. Loss: 0.871 | Val. F1 (Macro): 0.2485
2025-05-01 21:05:18,882 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_MLP_(Avg_Learned_Emb)_best.pt (Epoch 1)
2025-05-01 21:05:19,279 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:05:19,279 - INFO - 	Train Loss: 0.890
2025-05-01 21:05:19,280 - INFO - 	 Val. Loss: 0.834 | Val. F1 (Macro): 0.3005
2025-05-01 21:05:19,284 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_MLP_(Avg_Learned_Emb)_best.pt (Epoch 2)
2025-05-01 21:05:19,615 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:05:19,616 - INFO - 	Train Loss: 0.852
2025-05-01 21:05:19,616 - INFO - 	 Val. Loss: 0.815 | Val. F1 (Macro): 0.3608
2025-05-01 21:05:19,620 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_MLP_(Avg_Learned_Emb)_best.pt (Epoch 3)
2025-05-01 21:05:19,962 - INFO 


--- Training Model: RNN (Learned Emb) ---


2025-05-01 21:05:28,842 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:05:28,843 - INFO - 	Train Loss: 0.953
2025-05-01 21:05:28,843 - INFO - 	 Val. Loss: 0.918 | Val. F1 (Macro): 0.2485
2025-05-01 21:05:28,847 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_RNN_(Learned_Emb)_best.pt (Epoch 1)
2025-05-01 21:05:29,209 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:05:29,210 - INFO - 	Train Loss: 0.942
2025-05-01 21:05:29,210 - INFO - 	 Val. Loss: 0.916 | Val. F1 (Macro): 0.2485
2025-05-01 21:05:29,214 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_RNN_(Learned_Emb)_best.pt (Epoch 2)
2025-05-01 21:05:29,547 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:05:29,548 - INFO - 	Train Loss: 0.934
2025-05-01 21:05:29,548 - INFO - 	 Val. Loss: 0.918 | Val. F1 (Macro): 0.2485
2025-05-01 21:05:29,844 - INFO - Epoch: 04 | Time: 0m 0s
2025-05-01 21:05:29,845 - INFO - 	Train Loss: 0.938
2025-05-01 21:05:29,846 - INFO - 	 Val. Loss: 0.918 | Val. F1 (Mac


--- Training Model: LSTM (Learned Emb) ---


2025-05-01 21:05:38,617 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:05:38,617 - INFO - 	Train Loss: 0.965
2025-05-01 21:05:38,618 - INFO - 	 Val. Loss: 0.915 | Val. F1 (Macro): 0.2485
2025-05-01 21:05:38,622 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_LSTM_(Learned_Emb)_best.pt (Epoch 1)
2025-05-01 21:05:38,961 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:05:38,962 - INFO - 	Train Loss: 0.926
2025-05-01 21:05:38,962 - INFO - 	 Val. Loss: 0.902 | Val. F1 (Macro): 0.2954
2025-05-01 21:05:38,967 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_LSTM_(Learned_Emb)_best.pt (Epoch 2)
2025-05-01 21:05:39,287 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:05:39,288 - INFO - 	Train Loss: 0.884
2025-05-01 21:05:39,288 - INFO - 	 Val. Loss: 0.870 | Val. F1 (Macro): 0.3716
2025-05-01 21:05:39,292 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_LSTM_(Learned_Emb)_best.pt (Epoch 3)
2025-05-01 21:05:39,610 - INFO - Epoch: 


--- Training Model: BiLSTM (Learned Emb) ---


2025-05-01 21:05:48,098 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:05:48,099 - INFO - 	Train Loss: 0.937
2025-05-01 21:05:48,100 - INFO - 	 Val. Loss: 0.882 | Val. F1 (Macro): 0.2485
2025-05-01 21:05:48,107 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_BiLSTM_(Learned_Emb)_best.pt (Epoch 1)
2025-05-01 21:05:48,486 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:05:48,487 - INFO - 	Train Loss: 0.875
2025-05-01 21:05:48,488 - INFO - 	 Val. Loss: 0.809 | Val. F1 (Macro): 0.3985
2025-05-01 21:05:48,492 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_BiLSTM_(Learned_Emb)_best.pt (Epoch 2)
2025-05-01 21:05:48,855 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:05:48,856 - INFO - 	Train Loss: 0.820
2025-05-01 21:05:48,856 - INFO - 	 Val. Loss: 0.773 | Val. F1 (Macro): 0.4011
2025-05-01 21:05:48,861 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_BiLSTM_(Learned_Emb)_best.pt (Epoch 3)
2025-05-01 21:05:49,239 - INFO - E


--- Training Model: CNN (Learned Emb) ---


2025-05-01 21:05:59,756 - INFO - Epoch: 01 | Time: 0m 2s
2025-05-01 21:05:59,757 - INFO - 	Train Loss: 1.094
2025-05-01 21:05:59,757 - INFO - 	 Val. Loss: 0.811 | Val. F1 (Macro): 0.4216
2025-05-01 21:05:59,762 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_CNN_(Learned_Emb)_best.pt (Epoch 1)
2025-05-01 21:06:00,248 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:06:00,249 - INFO - 	Train Loss: 0.954
2025-05-01 21:06:00,249 - INFO - 	 Val. Loss: 0.786 | Val. F1 (Macro): 0.4705
2025-05-01 21:06:00,253 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_CNN_(Learned_Emb)_best.pt (Epoch 2)
2025-05-01 21:06:00,602 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:06:00,602 - INFO - 	Train Loss: 0.892
2025-05-01 21:06:00,603 - INFO - 	 Val. Loss: 0.771 | Val. F1 (Macro): 0.4958
2025-05-01 21:06:00,607 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_CNN_(Learned_Emb)_best.pt (Epoch 3)
2025-05-01 21:06:00,975 - INFO - Epoch: 04 


--- Training Model: MLP (Avg GloVe Emb) ---


2025-05-01 21:06:10,002 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:06:10,003 - INFO - 	Train Loss: 0.977
2025-05-01 21:06:10,003 - INFO - 	 Val. Loss: 0.902 | Val. F1 (Macro): 0.2485
2025-05-01 21:06:10,006 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_MLP_(Avg_GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 21:06:10,249 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:06:10,249 - INFO - 	Train Loss: 0.910
2025-05-01 21:06:10,250 - INFO - 	 Val. Loss: 0.866 | Val. F1 (Macro): 0.2519
2025-05-01 21:06:10,254 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_MLP_(Avg_GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 21:06:10,474 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:06:10,475 - INFO - 	Train Loss: 0.893
2025-05-01 21:06:10,475 - INFO - 	 Val. Loss: 0.849 | Val. F1 (Macro): 0.3552
2025-05-01 21:06:10,479 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_MLP_(Avg_GloVe_Emb)_best.pt (Epoch 3)
2025-05-01 21:06:10,690 - INFO - Epoc


--- Training Model: CNN (GloVe Emb) ---


2025-05-01 21:06:17,532 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:06:17,532 - INFO - 	Train Loss: 0.934
2025-05-01 21:06:17,532 - INFO - 	 Val. Loss: 0.818 | Val. F1 (Macro): 0.4365
2025-05-01 21:06:17,537 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_CNN_(GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 21:06:17,809 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:06:17,810 - INFO - 	Train Loss: 0.834
2025-05-01 21:06:17,810 - INFO - 	 Val. Loss: 0.787 | Val. F1 (Macro): 0.4462
2025-05-01 21:06:17,815 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_CNN_(GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 21:06:18,106 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:06:18,107 - INFO - 	Train Loss: 0.797
2025-05-01 21:06:18,107 - INFO - 	 Val. Loss: 0.763 | Val. F1 (Macro): 0.4686
2025-05-01 21:06:18,111 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_CNN_(GloVe_Emb)_best.pt (Epoch 3)
2025-05-01 21:06:18,424 - INFO - Epoch: 04 | Time


--- Training Model: LSTM (GloVe Emb) ---


2025-05-01 21:06:25,712 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:06:25,713 - INFO - 	Train Loss: 0.969
2025-05-01 21:06:25,713 - INFO - 	 Val. Loss: 0.915 | Val. F1 (Macro): 0.2485
2025-05-01 21:06:25,717 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_LSTM_(GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 21:06:26,051 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:06:26,051 - INFO - 	Train Loss: 0.932
2025-05-01 21:06:26,052 - INFO - 	 Val. Loss: 0.902 | Val. F1 (Macro): 0.2706
2025-05-01 21:06:26,055 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_LSTM_(GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 21:06:26,351 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:06:26,352 - INFO - 	Train Loss: 0.899
2025-05-01 21:06:26,353 - INFO - 	 Val. Loss: 0.886 | Val. F1 (Macro): 0.2485
2025-05-01 21:06:26,357 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_LSTM_(GloVe_Emb)_best.pt (Epoch 3)
2025-05-01 21:06:26,610 - INFO - Epoch: 04 | T


--- Training Model: BiLSTM (GloVe Emb) ---


2025-05-01 21:06:34,220 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:06:34,220 - INFO - 	Train Loss: 0.916
2025-05-01 21:06:34,221 - INFO - 	 Val. Loss: 0.850 | Val. F1 (Macro): 0.3922
2025-05-01 21:06:34,225 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_BiLSTM_(GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 21:06:34,560 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:06:34,561 - INFO - 	Train Loss: 0.845
2025-05-01 21:06:34,562 - INFO - 	 Val. Loss: 0.815 | Val. F1 (Macro): 0.3957
2025-05-01 21:06:34,567 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_BiLSTM_(GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 21:06:34,892 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:06:34,893 - INFO - 	Train Loss: 0.814
2025-05-01 21:06:34,895 - INFO - 	 Val. Loss: 0.779 | Val. F1 (Macro): 0.4085
2025-05-01 21:06:34,900 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_BiLSTM_(GloVe_Emb)_best.pt (Epoch 3)
2025-05-01 21:06:35,297 - INFO - Epoch: 


--- Training Model: CNN-LSTM (GloVe Emb) ---


2025-05-01 21:06:44,021 - INFO - Epoch: 01 | Time: 0m 0s
2025-05-01 21:06:44,022 - INFO - 	Train Loss: 0.946
2025-05-01 21:06:44,022 - INFO - 	 Val. Loss: 0.871 | Val. F1 (Macro): 0.2485
2025-05-01 21:06:44,027 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_CNN-LSTM_(GloVe_Emb)_best.pt (Epoch 1)
2025-05-01 21:06:44,410 - INFO - Epoch: 02 | Time: 0m 0s
2025-05-01 21:06:44,410 - INFO - 	Train Loss: 0.855
2025-05-01 21:06:44,411 - INFO - 	 Val. Loss: 0.795 | Val. F1 (Macro): 0.4130
2025-05-01 21:06:44,416 - INFO - Saved best model to ..\models\dl\financial_news\Financial_News_CNN-LSTM_(GloVe_Emb)_best.pt (Epoch 2)
2025-05-01 21:06:44,764 - INFO - Epoch: 03 | Time: 0m 0s
2025-05-01 21:06:44,765 - INFO - 	Train Loss: 0.838
2025-05-01 21:06:44,766 - INFO - 	 Val. Loss: 0.848 | Val. F1 (Macro): 0.3389
2025-05-01 21:06:45,135 - INFO - Epoch: 04 | Time: 0m 0s
2025-05-01 21:06:45,136 - INFO - 	Train Loss: 0.818
2025-05-01 21:06:45,137 - INFO - 	 Val. Loss: 0.805 | Val. F

# 6. Results Summary and Saving

In [14]:
print("\n\n===== Overall Deep Learning Results Summary =====")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1200)
pd.set_option('display.max_colwidth', 80) # Adjust if needed
pd.set_option('display.float_format', '{:.4f}'.format)

# Ensure all expected columns exist, fill with NaN if necessary
for col in METRICS_TO_CALCULATE:
    if col not in results_df.columns:
        results_df[col] = np.nan

# Reorder columns for clarity
column_order = ["Dataset", "Model"] + METRICS_TO_CALCULATE
# Filter out columns not present if something went wrong during creation
column_order = [col for col in column_order if col in results_df.columns]
results_df = results_df[column_order]


print(results_df)

# --- Save results to CSV for each dataset ---
for dataset_name, config in DATASETS_TO_PROCESS.items():
    dataset_results_df = results_df[results_df['Dataset'] == dataset_name]
    if not dataset_results_df.empty:
        results_filename = f"{dataset_name.replace(' ', '_')}_dl_pytorch_results.csv"
        results_save_path = os.path.join(config['result_dir'], results_filename)
        try:
            dataset_results_df.to_csv(results_save_path, index=False, mode='w+')
            print(f"\nResults for {dataset_name} saved to {results_save_path}")
        except Exception as e:
            print(f"\nError saving results for {dataset_name} to {results_save_path}: {e}")

# --- Save combined results ---
combined_results_path = os.path.join(RESULT_DIR, "combined_dl_pytorch_results.csv")
try:
    results_df.to_csv(combined_results_path, index=False, mode='w+')
    print(f"\nCombined results saved to {combined_results_path}")
except Exception as e:
    print(f"\nError saving combined results to {combined_results_path}: {e}")



===== Overall Deep Learning Results Summary =====
           Dataset                  Model  Accuracy  F1 (Macro)  Precision (Macro)  Recall (Macro)  F1 (Weighted)  Precision (Weighted)  Recall (Weighted)  Train Time (Epoch, s)  Eval Time (s)
0      Book Review  MLP (Avg Learned Emb)    0.8567      0.5505             0.6616          0.5465         0.8226                0.8193             0.8567                29.8710         3.1720
1      Book Review      RNN (Learned Emb)    0.7976      0.2958             0.2659          0.3333         0.7079                0.6362             0.7976                50.9510         3.8210
2      Book Review     LSTM (Learned Emb)    0.8615      0.5323             0.6266          0.5367         0.8205                0.8094             0.8615                83.1910        11.5990
3      Book Review   BiLSTM (Learned Emb)    0.8660      0.5945             0.6987          0.5644         0.8389                0.8350             0.8660               143.588