# 1. Setup and Configuration

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel, # For feature extraction
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType # For LoRA
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix
)
from sklearn.linear_model import LogisticRegression
import evaluate 
import time
import os
import joblib 
import logging
import warnings
import gc
import psutil

# --- Basic Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# --- Limit CPU Usage ---
p = psutil.Process()
p.cpu_affinity([1, 2, 3, 4, 5, 6, 7])

In [2]:
# --- Project Directory Structure ---
BASE_DIR = ".." # Assuming the notebook is in a 'notebooks' or similar folder
DATA_DIR = os.path.join(BASE_DIR, "data", "processed")
# Models and results saved within dataset-specific folders
MODEL_OUTPUT_BASE_DIR = os.path.join(BASE_DIR, "models", "llm")
RESULT_DIR = os.path.join(BASE_DIR, "result")

# --- Specific Dataset Paths ---
BOOK_REVIEW_DATA_DIR = os.path.join(DATA_DIR, "book_reviews")
FINANCIAL_NEWS_DATA_DIR = os.path.join(DATA_DIR, "financial_news")

# --- Model/Result Output Dirs (Ensure they exist) ---
BOOK_REVIEW_MODEL_DIR = os.path.join(MODEL_OUTPUT_BASE_DIR, "book_reviews")
FINANCIAL_NEWS_MODEL_DIR = os.path.join(MODEL_OUTPUT_BASE_DIR, "financial_news")
BOOK_REVIEW_RESULT_DIR = os.path.join(RESULT_DIR, "book_reviews")
FINANCIAL_NEWS_RESULT_DIR = os.path.join(RESULT_DIR, "financial_news")

os.makedirs(BOOK_REVIEW_MODEL_DIR, exist_ok=True)
os.makedirs(FINANCIAL_NEWS_MODEL_DIR, exist_ok=True)
os.makedirs(BOOK_REVIEW_RESULT_DIR, exist_ok=True)
os.makedirs(FINANCIAL_NEWS_RESULT_DIR, exist_ok=True)

# --- File Names ---
TRAIN_FN = "train.csv"
VAL_FN = "val.csv"
TEST_FN = "test.csv"

# --- Column Names ---
TEXT_COLUMN = "text"
TARGET_COLUMN = "score" # Assumes string labels like 'positive', 'negative', 'neutral'

In [3]:
# --- Model & Training Hyperparameters ---
RANDOM_STATE = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {DEVICE}")

# Tokenizer params
MAX_LENGTH = 256 # Max sequence length for transformers

# Feature Extractor Params
FEATURE_EXTRACTOR_BATCH_SIZE = 16 # Batch size for extracting features
LOGREG_MAX_ITER = 1000 # Max iterations for Logistic Regression on features

# Fine-tuning params (adjust based on resources and dataset size)
LEARNING_RATE = 2e-5 # Common starting point for transformers
WEIGHT_DECAY = 0.01
TRAIN_BATCH_SIZE = 16 # Adjust based on GPU memory
EVAL_BATCH_SIZE = 32
NUM_EPOCHS = 3 # Usually fewer epochs needed for fine-tuning
FP16 = torch.cuda.is_available() # Enable mixed precision if GPU available

# PEFT (LoRA) params
USE_LORA = True # Flag to control if LoRA runs are included
LORA_R = 8 # LoRA rank (dimension)
LORA_ALPHA = 16 # LoRA alpha scaling
LORA_DROPOUT = 0.1
# Target modules vary by model, common ones for BERT/RoBERTa:
LORA_TARGET_MODULES = ["query", "value"] # Common target layers for attention

# --- Label Mapping (Essential for Transformers) ---
LABEL_LIST = ['negative', 'neutral', 'positive'] # Define explicit order
LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)}
ID2LABEL = {i: label for i, label in enumerate(LABEL_LIST)}
NUM_CLASSES = len(LABEL_LIST)

# --- Evaluation Metrics ---
METRICS_TO_CALCULATE = [
    "Accuracy",
    "F1 (Macro)", "Precision (Macro)", "Recall (Macro)",
    "F1 (Weighted)", "Precision (Weighted)", "Recall (Weighted)",
    "Train Time (s)", "Eval Time (s)"
]
METRIC_FOR_BEST_MODEL = "f1_macro" # Metric to monitor for early stopping/best model saving

# --- Datasets Configuration ---
DATASETS_TO_PROCESS = {
    "Book Review": {
        "train_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{TRAIN_FN}'),
        "val_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{VAL_FN}'),
        "test_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{TEST_FN}'),
        "model_dir": BOOK_REVIEW_MODEL_DIR,
        "result_dir": BOOK_REVIEW_RESULT_DIR,
    },
    "Financial News": {
        "train_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{TRAIN_FN}'),
        "val_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{VAL_FN}'),
        "test_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{TEST_FN}'),
        "model_dir": FINANCIAL_NEWS_MODEL_DIR,
        "result_dir": FINANCIAL_NEWS_RESULT_DIR,
    }
}

# --- Model Configurations to Run ---
# Define the models and settings for the experiment loop
# Format: ('Experiment Name', 'HuggingFace Model ID', use_lora_flag, is_feature_extractor_run)
MODEL_CONFIGURATIONS = [
    # Feature Extractors
    ('DistilBERT Feature Extractor + LR', 'distilbert-base-uncased', False, True),
    ('BERT Feature Extractor + LR',       'bert-base-uncased',       False, True),

    # Full Fine-tuning
    ('DistilBERT Full FT', 'distilbert-base-uncased', False, False),
    ('BERT Full FT',       'bert-base-uncased',       False, False),
    ('RoBERTa Full FT',    'roberta-base',            False, False),
    ('FinBERT Full FT',    'ProsusAI/finbert',        False, False), # Domain-specific

    # LoRA Fine-tuning (only run if USE_LORA is True)
    ('BERT LoRA FT',       'bert-base-uncased',       True, False),
    ('RoBERTa LoRA FT',    'roberta-base',            True, False),
    # ('FinBERT LoRA FT',    'ProsusAI/finbert',        True, False), # Can also apply LoRA to FinBERT
] if USE_LORA else [ # Exclude LoRA runs if USE_LORA is False
    ('DistilBERT Feature Extractor + LR', 'distilbert-base-uncased', False, True),
    ('BERT Feature Extractor + LR',       'bert-base-uncased',       False, True),
    ('DistilBERT Full FT', 'distilbert-base-uncased', False, False),
    ('BERT Full FT',       'bert-base-uncased',       False, False),
    ('RoBERTa Full FT',    'roberta-base',            False, False),
    ('FinBERT Full FT',    'ProsusAI/finbert',        False, False),
]

# Check if FinBERT model ID needs adjustment (sometimes name changes)
# Example alternative: 'yiyanghkust/finbert-tone'
FINBERT_MODEL_ID = 'ProsusAI/finbert'

2025-05-01 22:48:08,826 - INFO - Using device: cuda


# 2. Utility Functions

In [4]:
def load_data_hf(path):
    """Loads a single CSV into a Hugging Face Dataset."""
    try:
        # Load directly using datasets library
        dataset = load_dataset('csv', data_files=path, split='train')
        # Rename target column to 'label' (expected by Trainer) and map string labels to integers
        if TARGET_COLUMN != 'label':
            dataset = dataset.rename_column(TARGET_COLUMN, 'label')
        dataset = dataset.map(lambda examples: {'label': LABEL2ID.get(str(examples['label']), -1)}, # Handle potential non-string labels robustly
                              desc="Mapping labels to IDs")
        # Filter out examples where label mapping failed (label == -1)
        original_size = len(dataset)
        dataset = dataset.filter(lambda example: example['label'] != -1, desc="Filtering invalid labels")
        if len(dataset) < original_size:
            logging.warning(f"Filtered out {original_size - len(dataset)} examples with invalid labels from {path}.")
        return dataset
    except Exception as e:
        logging.error(f"Error loading dataset from {path}: {e}", exc_info=True)
        return None

def create_dataset_dict(train_path, val_path, test_path):
    """Loads train, validation, and test CSVs into a DatasetDict."""
    train_ds = load_data_hf(train_path)
    val_ds = load_data_hf(val_path)
    test_ds = load_data_hf(test_path)
    if train_ds and val_ds and test_ds:
        logging.info(f"Loaded Train data: {len(train_ds)} examples")
        logging.info(f"Loaded Validation data: {len(val_ds)} examples")
        logging.info(f"Loaded Test data: {len(test_ds)} examples")
        return DatasetDict({
            'train': train_ds,
            'validation': val_ds,
            'test': test_ds
        })
    else:
        return None

def preprocess_function(examples, tokenizer):
    """Tokenizes text data."""
    # Ensure text is string, handle potential None values
    texts = [str(text) if text is not None else "" for text in examples[TEXT_COLUMN]]
    return tokenizer(texts, truncation=True, padding=False, max_length=MAX_LENGTH) # Padding handled by DataCollator

# Define metric computation function for Trainer
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)

    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    prec_macro = precision_score(labels, preds, average='macro', zero_division=0)
    rec_macro = recall_score(labels, preds, average='macro', zero_division=0)
    f1_weighted = f1_score(labels, preds, average='weighted', zero_division=0)
    prec_weighted = precision_score(labels, preds, average='weighted', zero_division=0)
    rec_weighted = recall_score(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1_macro': f1_macro,
        'precision_macro': prec_macro,
        'recall_macro': rec_macro,
        'f1_weighted': f1_weighted,
        'precision_weighted': prec_weighted,
        'recall_weighted': rec_weighted,
    }

def calculate_metrics_from_preds(y_true, y_pred):
    """Calculates evaluation metrics from direct predictions."""
    accuracy = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    # Print the classification report for detailed metrics
    

    return {
        "Accuracy": accuracy,
        "F1 (Macro)": f1_macro,
        "Precision (Macro)": precision_macro,
        "Recall (Macro)": recall_macro,
        "F1 (Weighted)": f1_weighted,
        "Precision (Weighted)": precision_weighted,
        "Recall (Weighted)": recall_weighted,
    }

# Function to extract features (CLS token)
def extract_hidden_states(batch, model, tokenizer, device):
    # Ensure input_ids and attention_mask are tensors on the correct device
    inputs = {k: v.to(device) for k, v in batch.items()
              if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return the representation of the [CLS] token (first token)
    # Move back to CPU to accumulate results if needed outside GPU loop
    return last_hidden_state[:, 0].cpu().numpy()

# 3. Run Experiments

In [5]:
all_results = []

In [6]:
# --- Loop through each dataset defined in the configuration ---
for dataset_name, config in DATASETS_TO_PROCESS.items():
    print(f"\n{'='*25} Processing Dataset: {dataset_name} {'='*25}")
    logging.info(f"Processing Dataset: {dataset_name}")

    # 1. Load Data using Hugging Face Datasets
    raw_datasets = create_dataset_dict(config['train_path'], config['val_path'], config['test_path'])
    if not raw_datasets:
        logging.error(f"Could not load data for {dataset_name}. Skipping.")
        continue

    # --- Loop through each model configuration ---
    for model_label, model_id, use_lora, is_feature_extractor in MODEL_CONFIGURATIONS:

        # --- Skip FinBERT for non-financial data ---
        if model_id == FINBERT_MODEL_ID and dataset_name != "Financial News":
            logging.info(f"Skipping {model_label} for {dataset_name} (Model is domain-specific).")
            continue

        # --- Skip LoRA runs if flag is off ---
        if use_lora and not USE_LORA:
            logging.info(f"Skipping LoRA run {model_label} as USE_LORA is False.")
            continue

        print(f"\n--- Processing Model: {model_label} ---")
        logging.info(f"Starting run for {model_label} on {dataset_name}")
        run_results = {"Dataset": dataset_name, "Model": model_label}
        train_time = 0.0
        eval_time = 0.0

        # Create specific output dirs for this run's checkpoints/models
        run_model_dir = os.path.join(config['model_dir'], model_label.replace(' ', '_').replace('+', ''))
        os.makedirs(run_model_dir, exist_ok=True)

        try:
            # 2. Load Tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_id)

            # 3. Tokenize Datasets
            logging.info(f"Tokenizing data using {model_id} tokenizer...")
            # Apply tokenization in batches
            tokenized_datasets = raw_datasets.map(
                lambda batch: preprocess_function(batch, tokenizer),
                batched=True,
                remove_columns=[TEXT_COLUMN], # Remove original text column
                desc="Running tokenizer on dataset"
            )
            # Data collator handles dynamic padding
            data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
            logging.info("Tokenization complete.")

            # ===========================================
            # === 4.A Feature Extraction + Classifier ===
            # ===========================================
            if is_feature_extractor:
                logging.info("Running in Feature Extraction mode.")
                # Load base model (no classification head)
                model = AutoModel.from_pretrained(model_id).to(DEVICE)
                model.eval() # Set to evaluation mode

                # --- Extract Features ---
                logging.info("Extracting features from datasets...")
                start_extract_time = time.time()

                # Need dataloaders for batching feature extraction
                tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])
                train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=FEATURE_EXTRACTOR_BATCH_SIZE, collate_fn=data_collator)
                val_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=FEATURE_EXTRACTOR_BATCH_SIZE, collate_fn=data_collator)
                test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=FEATURE_EXTRACTOR_BATCH_SIZE, collate_fn=data_collator)

                X_train_features = []
                y_train_labels = []
                for batch in train_dataloader:
                    y_train_labels.extend(batch['labels'].tolist())
                    batch_features = extract_hidden_states(batch, model, tokenizer, DEVICE)
                    X_train_features.append(batch_features)

                X_val_features = []
                y_val_labels = []
                for batch in val_dataloader:
                    y_val_labels.extend(batch['labels'].tolist())
                    batch_features = extract_hidden_states(batch, model, tokenizer, DEVICE)
                    X_val_features.append(batch_features)

                X_test_features = []
                y_test_labels = []
                for batch in test_dataloader:
                    y_test_labels.extend(batch['labels'].tolist())
                    batch_features = extract_hidden_states(batch, model, tokenizer, DEVICE)
                    X_test_features.append(batch_features)

                X_train_features = np.concatenate(X_train_features)
                X_val_features = np.concatenate(X_val_features)
                X_test_features = np.concatenate(X_test_features)
                end_extract_time = time.time()
                logging.info(f"Feature extraction took {end_extract_time - start_extract_time:.2f}s")
                logging.info(f"Train features shape: {X_train_features.shape}")

                # --- Train Classifier ---
                logging.info("Training Logistic Regression classifier...")
                classifier = LogisticRegression(max_iter=LOGREG_MAX_ITER, random_state=RANDOM_STATE, class_weight='balanced', n_jobs=-1)
                start_train_time = time.time()
                # Combine train + val features for final classifier training? Or tune on val? Simpler: train on train, eval on test.
                classifier.fit(X_train_features, y_train_labels)
                end_train_time = time.time()
                train_time = end_train_time - start_train_time
                logging.info(f"Classifier training took {train_time:.2f}s")

                # --- Evaluate Classifier ---
                start_eval_time = time.time()
                y_pred_test = classifier.predict(X_test_features)
                end_eval_time = time.time()
                eval_time = end_eval_time - start_eval_time

                test_metrics = calculate_metrics_from_preds(y_test_labels, y_pred_test)
                run_results.update(test_metrics)

                logging.info("Feature Extractor + LR - Test Set Performance:")
                report_str = classification_report(y_test_labels, y_pred_test, target_names=LABEL_LIST, zero_division=0)
                print(report_str)

                cm = confusion_matrix(y_test_labels, y_pred_test, labels=list(range(NUM_CLASSES))) # Ensure labels are ordered
                cm_df = pd.DataFrame(cm, index=LABEL_LIST, columns=LABEL_LIST)
                print("Confusion Matrix (Test Set):")
                print(cm_df)

                cm_filename = f"{dataset_name.replace(' ', '_')}_{model_label.replace(' ', '_').replace('+','')}_confusion_matrix.csv"
                cm_save_path = os.path.join(config['result_dir'], cm_filename)
                try:
                    cm_df.to_csv(cm_save_path)
                    logging.info(f"Confusion matrix saved to {cm_save_path}")
                except Exception as cm_e:
                    logging.error(f"Failed to save confusion matrix to {cm_save_path}: {cm_e}")


                # Save the classifier
                clf_save_path = os.path.join(run_model_dir, f"{dataset_name.replace(' ', '_')}_{model_label.replace(' ', '_')}_LR_classifier.joblib")
                joblib.dump(classifier, clf_save_path)
                logging.info(f"Logistic Regression classifier saved to {clf_save_path}")

                # Cleanup GPU memory used by the base model
                del model
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()


            # ===========================================
            # === 4.B Fine-tuning (Full or LoRA)     ====
            # ===========================================
            else:
                logging.info(f"Running in Fine-tuning mode (LoRA: {use_lora}).")
                # Load model with sequence classification head
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_id,
                    num_labels=NUM_CLASSES,
                    id2label=ID2LABEL,
                    label2id=LABEL2ID
                )

                # --- Apply LoRA if specified ---
                if use_lora:
                    logging.info("Applying LoRA configuration...")
                    peft_config = LoraConfig(
                        task_type=TaskType.SEQ_CLS, # Specify task type
                        r=LORA_R,
                        lora_alpha=LORA_ALPHA,
                        lora_dropout=LORA_DROPOUT,
                        target_modules=LORA_TARGET_MODULES,
                        bias="none" # Usually set bias to 'none' or 'all'
                    )
                    model = get_peft_model(model, peft_config)
                    model.print_trainable_parameters() # Verify LoRA application

                model.to(DEVICE) # Move model to GPU before Trainer

                # --- Define Training Arguments ---
                training_args = TrainingArguments(
                    output_dir=os.path.join(run_model_dir, "checkpoints"),
                    logging_dir=os.path.join(run_model_dir, "logs"),
                    report_to="none", # Disable wandb/tensorboard reporting unless configured
                    num_train_epochs=NUM_EPOCHS,
                    learning_rate=LEARNING_RATE,
                    weight_decay=WEIGHT_DECAY,
                    per_device_train_batch_size=TRAIN_BATCH_SIZE,
                    per_device_eval_batch_size=EVAL_BATCH_SIZE,

                    # --- Strategies ---
                    eval_strategy="epoch",  # Evaluate at the end of each epoch
                    save_strategy="epoch",        # Save a checkpoint at the end of each epoch
                    logging_strategy="epoch",     # Log metrics at the end of each epoch (consistent)

                    # --- Explicitly disable step-based intervals when using epoch strategy ---
                    eval_steps=None,              # Do not evaluate every N steps
                    save_steps=None,              # Do not save every N steps
                    logging_steps=None,           # Do not log every N steps (use logging_strategy="epoch")
                    # Note: If you WANT step-based logging while using epoch eval/save, you can set
                    # logging_strategy="steps" and provide a value for logging_steps, but keep
                    # eval_steps=None and save_steps=None.

                    # --- Best model loading ---
                    load_best_model_at_end=True, # Load the best model based on metric_for_best_model
                    metric_for_best_model=METRIC_FOR_BEST_MODEL, # e.g., "f1_macro"
                    greater_is_better=True,      # F1 score is better when higher
                    save_total_limit=2,          # Only keep the best and the latest checkpoint

                    # --- Other settings ---
                    fp16=FP16,                   # Enable mixed precision training if GPU supports it
                    # logging_steps=50,          # Remove or comment out if using logging_strategy="epoch"
                    # dataloader_num_workers=2,  # Optional
                    gradient_accumulation_steps=1,
                    seed=RANDOM_STATE,
                    remove_unused_columns=True, # Default is True, good practice
                )

                print(f"Using evaluation_strategy: {training_args.eval_strategy}") # Add this print statement
                print(f"Using save_strategy: {training_args.save_strategy}")
                print(f"Using load_best_model_at_end: {training_args.load_best_model_at_end}")

                # --- Define Trainer ---
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_datasets["train"],
                    eval_dataset=tokenized_datasets["validation"], # Use validation set for evaluation during training
                    tokenizer=tokenizer,
                    data_collator=data_collator,
                    compute_metrics=compute_metrics,
                    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)] # Stop if metric doesn't improve enough
                )

                # --- Train the Model ---
                logging.info("Starting fine-tuning...")
                start_train_time = time.time()
                train_result = trainer.train()
                end_train_time = time.time()
                train_time = end_train_time - start_train_time
                logging.info(f"Fine-tuning completed in {train_time:.2f}s")

                # --- Evaluate on Test Set ---
                logging.info("Evaluating model on the test set...")
                start_eval_time = time.time()
                # Evaluate first to get metrics like loss
                test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
                # Then predict to get raw predictions for confusion matrix
                predictions = trainer.predict(tokenized_datasets["test"])
                end_eval_time = time.time()
                eval_time = end_eval_time - start_eval_time

                # Extract predictions and labels
                y_pred_test = np.argmax(predictions.predictions, axis=1)
                y_true_test = predictions.label_ids

                # Map trainer metric names (e.g., 'eval_f1_macro') to our standard names
                run_results["Accuracy"] = test_results.get('eval_accuracy', np.nan)
                run_results["F1 (Macro)"] = test_results.get('eval_f1_macro', np.nan)
                run_results["Precision (Macro)"] = test_results.get('eval_precision_macro', np.nan)
                run_results["Recall (Macro)"] = test_results.get('eval_recall_macro', np.nan)
                run_results["F1 (Weighted)"] = test_results.get('eval_f1_weighted', np.nan)
                run_results["Precision (Weighted)"] = test_results.get('eval_precision_weighted', np.nan)
                run_results["Recall (Weighted)"] = test_results.get('eval_recall_weighted', np.nan)

                logging.info("Fine-tuning - Test Set Performance:")
                logging.info(test_results) # Log the full results dict from trainer

                # Generate, print, and save the confusion matrix using predictions
                logging.info("Fine-tuning - Test Set Confusion Matrix:")
                cm = confusion_matrix(y_true_test, y_pred_test, labels=list(range(NUM_CLASSES))) # Ensure labels are ordered
                cm_df = pd.DataFrame(cm, index=LABEL_LIST, columns=LABEL_LIST)
                print(cm_df)

                # Save the confusion matrix
                cm_filename = f"{dataset_name.replace(' ', '_')}_{model_label.replace(' ', '_').replace('+','')}_confusion_matrix.csv"
                cm_save_path = os.path.join(config['result_dir'], cm_filename)
                try:
                    cm_df.to_csv(cm_save_path, mode='w+')
                    logging.info(f"Confusion matrix saved to {cm_save_path}")
                except Exception as cm_e:
                    logging.error(f"Failed to save confusion matrix to {cm_save_path}: {cm_e}")

                # --- Save the Final Model & Tokenizer ---
                # Trainer already saved the best checkpoint based on validation set.
                # For LoRA, the main model is saved by Trainer, adapters need separate save
                final_model_save_path = os.path.join(run_model_dir, "final_model")
                if use_lora:
                    logging.info(f"Saving LoRA adapter model to {final_model_save_path}")
                    model.save_pretrained(final_model_save_path) # Saves only the adapter
                else:
                    # If not LoRA, trainer saved the full best model, we can optionally save it again here
                    # under a consistent name if needed, but load_best_model_at_end handles loading it.
                    # Saving explicitly:
                    # trainer.save_model(final_model_save_path)
                    logging.info(f"Best model loaded by Trainer. Checkpoint saved in {training_args.output_dir}")


                tokenizer.save_pretrained(final_model_save_path) # Save tokenizer with the model/adapter
                logging.info(f"Tokenizer saved to {final_model_save_path}")

            # --- Store Timings and Finalize Results ---
            run_results["Train Time (s)"] = round(train_time, 3)
            run_results["Eval Time (s)"] = round(eval_time, 3)


        except Exception as e:
            logging.error(f"!!! An error occurred while processing {model_label} for {dataset_name}: {e}", exc_info=True)
            # Record partial results if possible
            run_results["Accuracy"] = np.nan
            run_results["F1 (Macro)"] = np.nan
            # Fill other metrics with NaN or error messages
            for metric in METRICS_TO_CALCULATE:
                if metric not in run_results:
                    run_results[metric] = np.nan if metric not in ["Train Time (s)", "Eval Time (s)"] else 0.0
        finally:
            all_results.append(run_results)
            # Clean up memory aggressively after each run
            del tokenizer
            if 'model' in locals(): del model
            if 'trainer' in locals(): del trainer
            if 'classifier' in locals(): del classifier
            if 'tokenized_datasets' in locals(): del tokenized_datasets
            # if 'raw_datasets' in locals(): del raw_datasets
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()


# --- Combine results into a DataFrame ---
results_df = pd.DataFrame(all_results)

2025-05-01 22:48:08,874 - INFO - Processing Dataset: Book Review





Generating train split: 0 examples [00:00, ? examples/s]

Mapping labels to IDs:   0%|          | 0/210000 [00:00<?, ? examples/s]

Filtering invalid labels:   0%|          | 0/210000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Mapping labels to IDs:   0%|          | 0/45000 [00:00<?, ? examples/s]

Filtering invalid labels:   0%|          | 0/45000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Mapping labels to IDs:   0%|          | 0/45000 [00:00<?, ? examples/s]

Filtering invalid labels:   0%|          | 0/45000 [00:00<?, ? examples/s]

2025-05-01 22:48:19,542 - INFO - Loaded Train data: 210000 examples
2025-05-01 22:48:19,543 - INFO - Loaded Validation data: 45000 examples
2025-05-01 22:48:19,543 - INFO - Loaded Test data: 45000 examples
2025-05-01 22:48:19,543 - INFO - Starting run for DistilBERT Feature Extractor + LR on Book Review



--- Processing Model: DistilBERT Feature Extractor + LR ---


2025-05-01 22:48:19,943 - INFO - Tokenizing data using distilbert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/210000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

2025-05-01 22:48:51,687 - INFO - Tokenization complete.
2025-05-01 22:48:51,687 - INFO - Running in Feature Extraction mode.
2025-05-01 22:48:52,532 - INFO - Extracting features from datasets...
2025-05-01 23:09:13,878 - INFO - Feature extraction took 1221.34s
2025-05-01 23:09:13,880 - INFO - Train features shape: (210000, 768)
2025-05-01 23:09:13,880 - INFO - Training Logistic Regression classifier...
2025-05-01 23:25:17,987 - INFO - Classifier training took 964.11s
2025-05-01 23:25:18,197 - INFO - Feature Extractor + LR - Test Set Performance:
2025-05-01 23:25:18,228 - INFO - Confusion matrix saved to ..\result\book_reviews\Book_Review_DistilBERT_Feature_Extractor__LR_confusion_matrix.csv
2025-05-01 23:25:18,230 - INFO - Logistic Regression classifier saved to ..\models\llm\book_reviews\DistilBERT_Feature_Extractor__LR\Book_Review_DistilBERT_Feature_Extractor_+_LR_LR_classifier.joblib


              precision    recall  f1-score   support

    negative       0.51      0.74      0.61      5292
     neutral       0.22      0.57      0.32      3814
    positive       0.96      0.74      0.84     35894

    accuracy                           0.73     45000
   macro avg       0.57      0.68      0.59     45000
weighted avg       0.85      0.73      0.77     45000

Confusion Matrix (Test Set):
          negative  neutral  positive
negative      3891     1041       360
neutral        892     2166       756
positive      2777     6455     26662


2025-05-01 23:25:18,505 - INFO - Starting run for BERT Feature Extractor + LR on Book Review



--- Processing Model: BERT Feature Extractor + LR ---


2025-05-01 23:25:18,882 - INFO - Tokenizing data using bert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/210000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

2025-05-01 23:25:54,104 - INFO - Tokenization complete.
2025-05-01 23:25:54,104 - INFO - Running in Feature Extraction mode.
2025-05-01 23:25:55,160 - INFO - Extracting features from datasets...
2025-05-02 00:05:55,867 - INFO - Feature extraction took 2400.71s
2025-05-02 00:05:55,867 - INFO - Train features shape: (210000, 768)
2025-05-02 00:05:55,868 - INFO - Training Logistic Regression classifier...
2025-05-02 00:15:02,122 - INFO - Classifier training took 546.25s
2025-05-02 00:15:02,331 - INFO - Feature Extractor + LR - Test Set Performance:
2025-05-02 00:15:02,361 - INFO - Confusion matrix saved to ..\result\book_reviews\Book_Review_BERT_Feature_Extractor__LR_confusion_matrix.csv
2025-05-02 00:15:02,363 - INFO - Logistic Regression classifier saved to ..\models\llm\book_reviews\BERT_Feature_Extractor__LR\Book_Review_BERT_Feature_Extractor_+_LR_LR_classifier.joblib


              precision    recall  f1-score   support

    negative       0.50      0.71      0.59      5292
     neutral       0.22      0.54      0.31      3814
    positive       0.96      0.74      0.84     35894

    accuracy                           0.72     45000
   macro avg       0.56      0.66      0.58     45000
weighted avg       0.84      0.72      0.76     45000

Confusion Matrix (Test Set):
          negative  neutral  positive
negative      3773     1149       370
neutral        924     2055       835
positive      2871     6347     26676


2025-05-02 00:15:02,609 - INFO - Starting run for DistilBERT Full FT on Book Review



--- Processing Model: DistilBERT Full FT ---


2025-05-02 00:15:02,972 - INFO - Tokenizing data using distilbert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

2025-05-02 00:15:07,497 - INFO - Tokenization complete.
2025-05-02 00:15:07,498 - INFO - Running in Fine-tuning mode (LoRA: False).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 00:15:08,266 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.3357,0.305599,0.894244,0.673707,0.777665,0.655781,0.874733,0.878653,0.894244
2,0.2493,0.311123,0.899956,0.736509,0.757797,0.722791,0.894618,0.891421,0.899956
3,0.19,0.357838,0.899289,0.74428,0.76113,0.729603,0.896114,0.893703,0.899289


2025-05-02 01:26:29,636 - INFO - Fine-tuning completed in 4281.37s
2025-05-02 01:26:29,637 - INFO - Evaluating model on the test set...


2025-05-02 01:28:53,537 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 01:28:53,537 - INFO - {'eval_loss': 0.35942190885543823, 'eval_accuracy': 0.8997555555555555, 'eval_f1_macro': 0.745369681444788, 'eval_precision_macro': 0.7617655517570192, 'eval_recall_macro': 0.7312301620847061, 'eval_f1_weighted': 0.8965515704112923, 'eval_precision_weighted': 0.8941068533411948, 'eval_recall_weighted': 0.8997555555555555, 'eval_runtime': 71.1588, 'eval_samples_per_second': 632.389, 'eval_steps_per_second': 19.773, 'epoch': 3.0}
2025-05-02 01:28:53,538 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 01:28:53,542 - INFO - Confusion matrix saved to ..\result\book_reviews\Book_Review_DistilBERT_Full_FT_confusion_matrix.csv
2025-05-02 01:28:53,542 - INFO - Best model loaded by Trainer. Checkpoint saved in ..\models\llm\book_reviews\DistilBERT_Full_FT\checkpoints
2025-05-02 01:28:53,551 - INFO - Tokenizer saved to ..\models\llm\book_reviews\DistilBERT_Full_FT\final_model
2025-05

          negative  neutral  positive
negative      4065      668       559
neutral        618     1753      1443
positive       366      857     34671

--- Processing Model: BERT Full FT ---


2025-05-02 01:28:54,079 - INFO - Tokenizing data using bert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

2025-05-02 01:28:59,119 - INFO - Tokenization complete.
2025-05-02 01:28:59,119 - INFO - Running in Fine-tuning mode (LoRA: False).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 01:28:59,758 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.3218,0.2958,0.898756,0.680368,0.806721,0.661056,0.878278,0.886741,0.898756
2,0.2318,0.313083,0.9048,0.74296,0.76845,0.730481,0.898599,0.89597,0.9048
3,0.1692,0.365307,0.905067,0.759235,0.772842,0.74694,0.902772,0.900955,0.905067


2025-05-02 03:44:54,243 - INFO - Fine-tuning completed in 8154.48s
2025-05-02 03:44:54,244 - INFO - Evaluating model on the test set...


2025-05-02 03:49:32,035 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 03:49:32,036 - INFO - {'eval_loss': 0.3702978193759918, 'eval_accuracy': 0.9046888888888889, 'eval_f1_macro': 0.7564766966548792, 'eval_precision_macro': 0.7710917206735477, 'eval_recall_macro': 0.7434999096948102, 'eval_f1_weighted': 0.9020696812985897, 'eval_precision_weighted': 0.9000194929634077, 'eval_recall_weighted': 0.9046888888888889, 'eval_runtime': 137.9553, 'eval_samples_per_second': 326.193, 'eval_steps_per_second': 10.199, 'epoch': 3.0}
2025-05-02 03:49:32,037 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 03:49:32,041 - INFO - Confusion matrix saved to ..\result\book_reviews\Book_Review_BERT_Full_FT_confusion_matrix.csv
2025-05-02 03:49:32,042 - INFO - Best model loaded by Trainer. Checkpoint saved in ..\models\llm\book_reviews\BERT_Full_FT\checkpoints
2025-05-02 03:49:32,051 - INFO - Tokenizer saved to ..\models\llm\book_reviews\BERT_Full_FT\final_model
2025-05-02 03:49:32,228 

          negative  neutral  positive
negative      4130      679       483
neutral        620     1839      1355
positive       295      857     34742

--- Processing Model: RoBERTa Full FT ---


2025-05-02 03:49:32,698 - INFO - Tokenizing data using roberta-base tokenizer...


Running tokenizer on dataset:   0%|          | 0/210000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

2025-05-02 03:50:01,689 - INFO - Tokenization complete.
2025-05-02 03:50:01,690 - INFO - Running in Fine-tuning mode (LoRA: False).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 03:50:02,842 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.3133,0.276434,0.905289,0.712384,0.792399,0.694928,0.890691,0.892989,0.905289
2,0.2426,0.29679,0.910711,0.75827,0.78543,0.741912,0.905009,0.90232,0.910711
3,0.1997,0.329621,0.910733,0.76644,0.783063,0.75289,0.907401,0.904995,0.910733


2025-05-02 06:08:54,284 - INFO - Fine-tuning completed in 8331.44s
2025-05-02 06:08:54,285 - INFO - Evaluating model on the test set...


2025-05-02 06:13:28,926 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 06:13:28,927 - INFO - {'eval_loss': 0.3335813283920288, 'eval_accuracy': 0.9101333333333333, 'eval_f1_macro': 0.7641207689206952, 'eval_precision_macro': 0.7811966277037125, 'eval_recall_macro': 0.7504179059829492, 'eval_f1_weighted': 0.9066477422094331, 'eval_precision_weighted': 0.9041685072460681, 'eval_recall_weighted': 0.9101333333333333, 'eval_runtime': 136.5746, 'eval_samples_per_second': 329.49, 'eval_steps_per_second': 10.302, 'epoch': 3.0}
2025-05-02 06:13:28,927 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 06:13:28,931 - INFO - Confusion matrix saved to ..\result\book_reviews\Book_Review_RoBERTa_Full_FT_confusion_matrix.csv
2025-05-02 06:13:28,931 - INFO - Best model loaded by Trainer. Checkpoint saved in ..\models\llm\book_reviews\RoBERTa_Full_FT\checkpoints
2025-05-02 06:13:28,967 - INFO - Tokenizer saved to ..\models\llm\book_reviews\RoBERTa_Full_FT\final_model


          negative  neutral  positive
negative      4295      611       386
neutral        648     1785      1381
positive       277      741     34876


2025-05-02 06:13:29,152 - INFO - Skipping FinBERT Full FT for Book Review (Model is domain-specific).
2025-05-02 06:13:29,152 - INFO - Starting run for BERT LoRA FT on Book Review



--- Processing Model: BERT LoRA FT ---


2025-05-02 06:13:29,553 - INFO - Tokenizing data using bert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

2025-05-02 06:13:34,535 - INFO - Tokenization complete.
2025-05-02 06:13:34,536 - INFO - Running in Fine-tuning mode (LoRA: True).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 06:13:34,866 - INFO - Applying LoRA configuration...


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
2025-05-02 06:13:35,205 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.4153,0.349596,0.8706,0.638505,0.688276,0.616487,0.854782,0.847475,0.8706
2,0.3529,0.336474,0.876133,0.659953,0.704863,0.63877,0.863008,0.856953,0.876133
3,0.3436,0.333177,0.875933,0.66442,0.702842,0.643693,0.864383,0.858214,0.875933


2025-05-02 07:56:19,903 - INFO - Fine-tuning completed in 6164.70s
2025-05-02 07:56:19,903 - INFO - Evaluating model on the test set...


2025-05-02 08:01:25,096 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 08:01:25,096 - INFO - {'eval_loss': 0.3368000388145447, 'eval_accuracy': 0.8787777777777778, 'eval_f1_macro': 0.6715997397846097, 'eval_precision_macro': 0.7174696168092218, 'eval_recall_macro': 0.6482436466612698, 'eval_f1_weighted': 0.8667640116706523, 'eval_precision_weighted': 0.8614690282078064, 'eval_recall_weighted': 0.8787777777777778, 'eval_runtime': 151.8622, 'eval_samples_per_second': 296.321, 'eval_steps_per_second': 9.265, 'epoch': 3.0}
2025-05-02 08:01:25,097 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 08:01:25,101 - INFO - Confusion matrix saved to ..\result\book_reviews\Book_Review_BERT_LoRA_FT_confusion_matrix.csv
2025-05-02 08:01:25,101 - INFO - Saving LoRA adapter model to ..\models\llm\book_reviews\BERT_LoRA_FT\final_model


          negative  neutral  positive
negative      3736      526      1030
neutral        724     1029      2061
positive       582      532     34780


2025-05-02 08:01:26,627 - INFO - Tokenizer saved to ..\models\llm\book_reviews\BERT_LoRA_FT\final_model
2025-05-02 08:01:26,804 - INFO - Starting run for RoBERTa LoRA FT on Book Review



--- Processing Model: RoBERTa LoRA FT ---


2025-05-02 08:01:27,400 - INFO - Tokenizing data using roberta-base tokenizer...


Running tokenizer on dataset:   0%|          | 0/45000 [00:00<?, ? examples/s]

2025-05-02 08:01:31,855 - INFO - Tokenization complete.
2025-05-02 08:01:31,855 - INFO - Running in Fine-tuning mode (LoRA: True).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 08:01:32,169 - INFO - Applying LoRA configuration...


trainable params: 887,811 || all params: 125,535,750 || trainable%: 0.7072


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
2025-05-02 08:01:32,539 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.3332,0.2894,0.894911,0.696182,0.74249,0.689228,0.882855,0.881458,0.894911
2,0.2973,0.284263,0.898067,0.717739,0.749934,0.705,0.889508,0.886298,0.898067
3,0.2906,0.282006,0.898556,0.721433,0.751624,0.705719,0.890698,0.886906,0.898556


2025-05-02 09:44:02,944 - INFO - Fine-tuning completed in 6150.40s
2025-05-02 09:44:02,945 - INFO - Evaluating model on the test set...


2025-05-02 09:49:02,399 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 09:49:02,399 - INFO - {'eval_loss': 0.2877999544143677, 'eval_accuracy': 0.8984222222222222, 'eval_f1_macro': 0.719141877540694, 'eval_precision_macro': 0.7526124651662789, 'eval_recall_macro': 0.7037834429886437, 'eval_f1_weighted': 0.8899143076637963, 'eval_precision_weighted': 0.8864349284837768, 'eval_recall_weighted': 0.8984222222222222, 'eval_runtime': 149.1208, 'eval_samples_per_second': 301.769, 'eval_steps_per_second': 9.435, 'epoch': 3.0}
2025-05-02 09:49:02,400 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 09:49:02,404 - INFO - Confusion matrix saved to ..\result\book_reviews\Book_Review_RoBERTa_LoRA_FT_confusion_matrix.csv
2025-05-02 09:49:02,405 - INFO - Saving LoRA adapter model to ..\models\llm\book_reviews\RoBERTa_LoRA_FT\final_model


          negative  neutral  positive
negative      4261      454       577
neutral        875     1274      1665
positive       413      587     34894


2025-05-02 09:49:03,054 - INFO - Tokenizer saved to ..\models\llm\book_reviews\RoBERTa_LoRA_FT\final_model
2025-05-02 09:49:03,240 - INFO - Processing Dataset: Financial News





Generating train split: 0 examples [00:00, ? examples/s]

Mapping labels to IDs:   0%|          | 0/3392 [00:00<?, ? examples/s]

Filtering invalid labels:   0%|          | 0/3392 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Mapping labels to IDs:   0%|          | 0/727 [00:00<?, ? examples/s]

Filtering invalid labels:   0%|          | 0/727 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Mapping labels to IDs:   0%|          | 0/727 [00:00<?, ? examples/s]

Filtering invalid labels:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:49:04,992 - INFO - Loaded Train data: 3392 examples
2025-05-02 09:49:04,993 - INFO - Loaded Validation data: 727 examples
2025-05-02 09:49:04,993 - INFO - Loaded Test data: 727 examples
2025-05-02 09:49:05,035 - INFO - Starting run for DistilBERT Feature Extractor + LR on Financial News



--- Processing Model: DistilBERT Feature Extractor + LR ---


2025-05-02 09:49:05,344 - INFO - Tokenizing data using distilbert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/3392 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:49:05,557 - INFO - Tokenization complete.
2025-05-02 09:49:05,558 - INFO - Running in Feature Extraction mode.
2025-05-02 09:49:05,969 - INFO - Extracting features from datasets...
2025-05-02 09:49:11,258 - INFO - Feature extraction took 5.29s
2025-05-02 09:49:11,259 - INFO - Train features shape: (3392, 768)
2025-05-02 09:49:11,259 - INFO - Training Logistic Regression classifier...
2025-05-02 09:49:20,539 - INFO - Classifier training took 9.28s
2025-05-02 09:49:20,550 - INFO - Feature Extractor + LR - Test Set Performance:
2025-05-02 09:49:20,556 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_DistilBERT_Feature_Extractor__LR_confusion_matrix.csv
2025-05-02 09:49:20,558 - INFO - Logistic Regression classifier saved to ..\models\llm\financial_news\DistilBERT_Feature_Extractor__LR\Financial_News_DistilBERT_Feature_Extractor_+_LR_LR_classifier.joblib


              precision    recall  f1-score   support

    negative       0.59      0.85      0.69        91
     neutral       0.87      0.76      0.81       432
    positive       0.63      0.68      0.65       204

    accuracy                           0.75       727
   macro avg       0.70      0.76      0.72       727
weighted avg       0.77      0.75      0.75       727

Confusion Matrix (Test Set):
          negative  neutral  positive
negative        77        7         7
neutral         32      327        73
positive        22       44       138


2025-05-02 09:49:20,816 - INFO - Starting run for BERT Feature Extractor + LR on Financial News



--- Processing Model: BERT Feature Extractor + LR ---


2025-05-02 09:49:21,098 - INFO - Tokenizing data using bert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/3392 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:49:21,312 - INFO - Tokenization complete.
2025-05-02 09:49:21,312 - INFO - Running in Feature Extraction mode.
2025-05-02 09:49:21,895 - INFO - Extracting features from datasets...
2025-05-02 09:49:31,490 - INFO - Feature extraction took 9.60s
2025-05-02 09:49:31,490 - INFO - Train features shape: (3392, 768)
2025-05-02 09:49:31,491 - INFO - Training Logistic Regression classifier...
2025-05-02 09:49:34,188 - INFO - Classifier training took 2.70s
2025-05-02 09:49:34,197 - INFO - Feature Extractor + LR - Test Set Performance:
2025-05-02 09:49:34,205 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_BERT_Feature_Extractor__LR_confusion_matrix.csv
2025-05-02 09:49:34,206 - INFO - Logistic Regression classifier saved to ..\models\llm\financial_news\BERT_Feature_Extractor__LR\Financial_News_BERT_Feature_Extractor_+_LR_LR_classifier.joblib


              precision    recall  f1-score   support

    negative       0.57      0.80      0.66        91
     neutral       0.88      0.75      0.81       432
    positive       0.63      0.71      0.67       204

    accuracy                           0.74       727
   macro avg       0.69      0.75      0.71       727
weighted avg       0.77      0.74      0.75       727

Confusion Matrix (Test Set):
          negative  neutral  positive
negative        73        7        11
neutral         35      323        74
positive        21       38       145


2025-05-02 09:49:34,507 - INFO - Starting run for DistilBERT Full FT on Financial News



--- Processing Model: DistilBERT Full FT ---


2025-05-02 09:49:34,791 - INFO - Tokenizing data using distilbert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:49:34,849 - INFO - Tokenization complete.
2025-05-02 09:49:34,850 - INFO - Running in Fine-tuning mode (LoRA: False).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 09:49:35,322 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.6185,0.473743,0.814305,0.76633,0.786861,0.782528,0.808538,0.829345,0.814305
2,0.3048,0.419204,0.834938,0.812727,0.80298,0.823754,0.835882,0.837628,0.834938
3,0.2039,0.452261,0.837689,0.811669,0.799239,0.827744,0.83849,0.84082,0.837689


2025-05-02 09:50:09,106 - INFO - Fine-tuning completed in 33.78s
2025-05-02 09:50:09,106 - INFO - Evaluating model on the test set...


2025-05-02 09:50:09,907 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 09:50:09,907 - INFO - {'eval_loss': 0.41072723269462585, 'eval_accuracy': 0.8404401650618982, 'eval_f1_macro': 0.8235341655157988, 'eval_precision_macro': 0.8062278440633666, 'eval_recall_macro': 0.8465264310852546, 'eval_f1_weighted': 0.8413002255133367, 'eval_precision_weighted': 0.8447563226746198, 'eval_recall_weighted': 0.8404401650618982, 'eval_runtime': 0.3951, 'eval_samples_per_second': 1840.263, 'eval_steps_per_second': 58.22, 'epoch': 3.0}
2025-05-02 09:50:09,908 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 09:50:09,911 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_DistilBERT_Full_FT_confusion_matrix.csv
2025-05-02 09:50:09,912 - INFO - Best model loaded by Trainer. Checkpoint saved in ..\models\llm\financial_news\DistilBERT_Full_FT\checkpoints
2025-05-02 09:50:09,924 - INFO - Tokenizer saved to ..\models\llm\financial_news\DistilBERT_Full_FT\final_mode

          negative  neutral  positive
negative        82        5         4
neutral         23      369        40
positive         6       38       160

--- Processing Model: BERT Full FT ---


2025-05-02 09:50:10,349 - INFO - Tokenizing data using bert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:50:10,415 - INFO - Tokenization complete.
2025-05-02 09:50:10,416 - INFO - Running in Fine-tuning mode (LoRA: False).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 09:50:10,992 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.6668,0.451912,0.832187,0.797065,0.797914,0.8072,0.830745,0.836454,0.832187
2,0.3081,0.400538,0.85282,0.829387,0.82013,0.839766,0.853723,0.855397,0.85282
3,0.1794,0.455921,0.844567,0.820974,0.8033,0.843933,0.846357,0.851185,0.844567


2025-05-02 09:51:09,956 - INFO - Fine-tuning completed in 58.96s
2025-05-02 09:51:09,956 - INFO - Evaluating model on the test set...


2025-05-02 09:51:11,357 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 09:51:11,357 - INFO - {'eval_loss': 0.4168113172054291, 'eval_accuracy': 0.8514442916093535, 'eval_f1_macro': 0.8353445165945166, 'eval_precision_macro': 0.8233709649729319, 'eval_recall_macro': 0.8491993663562291, 'eval_f1_weighted': 0.8522584734156261, 'eval_precision_weighted': 0.8541837561826048, 'eval_recall_weighted': 0.8514442916093535, 'eval_runtime': 0.682, 'eval_samples_per_second': 1065.987, 'eval_steps_per_second': 33.724, 'epoch': 3.0}
2025-05-02 09:51:11,357 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 09:51:11,360 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_BERT_Full_FT_confusion_matrix.csv
2025-05-02 09:51:11,360 - INFO - Best model loaded by Trainer. Checkpoint saved in ..\models\llm\financial_news\BERT_Full_FT\checkpoints
2025-05-02 09:51:11,369 - INFO - Tokenizer saved to ..\models\llm\financial_news\BERT_Full_FT\final_model
2025-05-02 09:51:

          negative  neutral  positive
negative        79        7         5
neutral         18      374        40
positive         4       34       166

--- Processing Model: RoBERTa Full FT ---


2025-05-02 09:51:11,864 - INFO - Tokenizing data using roberta-base tokenizer...


Running tokenizer on dataset:   0%|          | 0/3392 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:51:12,124 - INFO - Tokenization complete.
2025-05-02 09:51:12,125 - INFO - Running in Fine-tuning mode (LoRA: False).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 09:51:12,708 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.5913,0.410667,0.854195,0.833312,0.820603,0.85912,0.854057,0.859955,0.854195
2,0.2945,0.391283,0.85282,0.842024,0.82703,0.861262,0.854265,0.859679,0.85282
3,0.1962,0.479494,0.851444,0.841589,0.826037,0.860373,0.852377,0.855485,0.851444


2025-05-02 09:52:15,605 - INFO - Fine-tuning completed in 62.90s
2025-05-02 09:52:15,606 - INFO - Evaluating model on the test set...


2025-05-02 09:52:17,045 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 09:52:17,046 - INFO - {'eval_loss': 0.3728979825973511, 'eval_accuracy': 0.859697386519945, 'eval_f1_macro': 0.8553361473716953, 'eval_precision_macro': 0.8376828151850101, 'eval_recall_macro': 0.8772140622630819, 'eval_f1_weighted': 0.8605430153813676, 'eval_precision_weighted': 0.864450299028611, 'eval_recall_weighted': 0.859697386519945, 'eval_runtime': 0.697, 'eval_samples_per_second': 1043.046, 'eval_steps_per_second': 32.999, 'epoch': 3.0}
2025-05-02 09:52:17,046 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 09:52:17,049 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_RoBERTa_Full_FT_confusion_matrix.csv
2025-05-02 09:52:17,049 - INFO - Best model loaded by Trainer. Checkpoint saved in ..\models\llm\financial_news\RoBERTa_Full_FT\checkpoints
2025-05-02 09:52:17,080 - INFO - Tokenizer saved to ..\models\llm\financial_news\RoBERTa_Full_FT\final_model
2025-05-02 

          negative  neutral  positive
negative        85        5         1
neutral         18      367        47
positive         1       30       173

--- Processing Model: FinBERT Full FT ---


2025-05-02 09:52:17,553 - INFO - Tokenizing data using ProsusAI/finbert tokenizer...


Running tokenizer on dataset:   0%|          | 0/3392 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:52:17,772 - INFO - Tokenization complete.
2025-05-02 09:52:17,773 - INFO - Running in Fine-tuning mode (LoRA: False).
2025-05-02 09:52:19,015 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.6275,0.372209,0.859697,0.840124,0.819596,0.874906,0.860752,0.867856,0.859697
2,0.2082,0.369647,0.877579,0.870019,0.855398,0.887733,0.878667,0.882373,0.877579
3,0.1026,0.440914,0.877579,0.866078,0.849126,0.886879,0.878622,0.882269,0.877579


2025-05-02 09:53:17,362 - INFO - Fine-tuning completed in 58.35s
2025-05-02 09:53:17,362 - INFO - Evaluating model on the test set...


2025-05-02 09:53:18,760 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 09:53:18,760 - INFO - {'eval_loss': 0.36533597111701965, 'eval_accuracy': 0.8707015130674003, 'eval_f1_macro': 0.8574163395911162, 'eval_precision_macro': 0.8447316300257476, 'eval_recall_macro': 0.8723794361049263, 'eval_f1_weighted': 0.8716577590195416, 'eval_precision_weighted': 0.8745258230006187, 'eval_recall_weighted': 0.8707015130674003, 'eval_runtime': 0.68, 'eval_samples_per_second': 1069.119, 'eval_steps_per_second': 33.824, 'epoch': 3.0}
2025-05-02 09:53:18,761 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 09:53:18,764 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_FinBERT_Full_FT_confusion_matrix.csv
2025-05-02 09:53:18,764 - INFO - Best model loaded by Trainer. Checkpoint saved in ..\models\llm\financial_news\FinBERT_Full_FT\checkpoints
2025-05-02 09:53:18,773 - INFO - Tokenizer saved to ..\models\llm\financial_news\FinBERT_Full_FT\final_model
2025-05-

          negative  neutral  positive
negative        80        8         3
neutral         16      376        40
positive         3       24       177

--- Processing Model: BERT LoRA FT ---


2025-05-02 09:53:19,199 - INFO - Tokenizing data using bert-base-uncased tokenizer...


Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:53:19,258 - INFO - Tokenization complete.
2025-05-02 09:53:19,258 - INFO - Running in Fine-tuning mode (LoRA: True).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 09:53:19,585 - INFO - Applying LoRA configuration...


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
2025-05-02 09:53:19,871 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,1.0628,0.9336,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
2,0.9294,0.922395,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
3,0.9194,0.918918,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223


2025-05-02 09:53:53,077 - INFO - Fine-tuning completed in 33.21s
2025-05-02 09:53:53,077 - INFO - Evaluating model on the test set...


2025-05-02 09:53:54,584 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 09:53:54,584 - INFO - {'eval_loss': 0.935846745967865, 'eval_accuracy': 0.5914718019257221, 'eval_f1_macro': 0.24776721405934887, 'eval_precision_macro': 0.19770114942528735, 'eval_recall_macro': 0.3317901234567901, 'eval_f1_weighted': 0.44168680800676224, 'eval_precision_weighted': 0.3524356116302234, 'eval_recall_weighted': 0.5914718019257221, 'eval_runtime': 0.742, 'eval_samples_per_second': 979.785, 'eval_steps_per_second': 30.997, 'epoch': 3.0}
2025-05-02 09:53:54,584 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 09:53:54,587 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_BERT_LoRA_FT_confusion_matrix.csv
2025-05-02 09:53:54,587 - INFO - Saving LoRA adapter model to ..\models\llm\financial_news\BERT_LoRA_FT\final_model


          negative  neutral  positive
negative         0       91         0
neutral          1      430         1
positive         0      204         0


2025-05-02 09:53:55,110 - INFO - Tokenizer saved to ..\models\llm\financial_news\BERT_LoRA_FT\final_model
2025-05-02 09:53:55,276 - INFO - Starting run for RoBERTa LoRA FT on Financial News



--- Processing Model: RoBERTa LoRA FT ---


2025-05-02 09:53:55,606 - INFO - Tokenizing data using roberta-base tokenizer...


Running tokenizer on dataset:   0%|          | 0/727 [00:00<?, ? examples/s]

2025-05-02 09:53:55,707 - INFO - Tokenization complete.
2025-05-02 09:53:55,708 - INFO - Running in Fine-tuning mode (LoRA: True).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-02 09:53:56,010 - INFO - Applying LoRA configuration...


trainable params: 887,811 || all params: 125,535,750 || trainable%: 0.7072


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
2025-05-02 09:53:56,318 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,0.9673,0.921489,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
2,0.9295,0.907223,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
3,0.9111,0.895944,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223


2025-05-02 09:54:30,788 - INFO - Fine-tuning completed in 34.47s
2025-05-02 09:54:30,789 - INFO - Evaluating model on the test set...


2025-05-02 09:54:32,373 - INFO - Fine-tuning - Test Set Performance:
2025-05-02 09:54:32,374 - INFO - {'eval_loss': 0.9231343269348145, 'eval_accuracy': 0.594222833562586, 'eval_f1_macro': 0.24849007765314926, 'eval_precision_macro': 0.19807427785419532, 'eval_recall_macro': 0.3333333333333333, 'eval_f1_weighted': 0.44297543416572416, 'eval_precision_weighted': 0.3531007759271487, 'eval_recall_weighted': 0.594222833562586, 'eval_runtime': 0.7895, 'eval_samples_per_second': 920.792, 'eval_steps_per_second': 29.131, 'epoch': 3.0}
2025-05-02 09:54:32,374 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-02 09:54:32,377 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_RoBERTa_LoRA_FT_confusion_matrix.csv
2025-05-02 09:54:32,378 - INFO - Saving LoRA adapter model to ..\models\llm\financial_news\RoBERTa_LoRA_FT\final_model


          negative  neutral  positive
negative         0       91         0
neutral          0      432         0
positive         0      204         0


2025-05-02 09:54:32,926 - INFO - Tokenizer saved to ..\models\llm\financial_news\RoBERTa_LoRA_FT\final_model


# 4. Results Summary and Saving

In [7]:
print("\n\n===== Overall LLM Results Summary =====")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1400) # Wider display
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.float_format', '{:.4f}'.format)

# Ensure all expected columns exist, fill with NaN if necessary
for col in METRICS_TO_CALCULATE:
    if col not in results_df.columns:
        results_df[col] = np.nan

# Reorder columns for clarity
column_order = ["Dataset", "Model"] + METRICS_TO_CALCULATE
# Filter out columns not present if something went wrong during creation
column_order = [col for col in column_order if col in results_df.columns]
results_df = results_df[column_order]


print(results_df)

# --- Save results to CSV for each dataset ---
for dataset_name, config in DATASETS_TO_PROCESS.items():
    dataset_results_df = results_df[results_df['Dataset'] == dataset_name]
    if not dataset_results_df.empty:
        results_filename = f"{dataset_name.replace(' ', '_')}_llm_transformers_results.csv"
        results_save_path = os.path.join(config['result_dir'], results_filename)
        try:
            dataset_results_df.to_csv(results_save_path, index=False, mode='w+')
            print(f"\nResults for {dataset_name} saved to {results_save_path}")
        except Exception as e:
            print(f"\nError saving results for {dataset_name} to {results_save_path}: {e}")

# --- Save combined results ---
combined_results_path = os.path.join(RESULT_DIR, "combined_llm_transformers_results.csv")
try:
    results_df.to_csv(combined_results_path, index=False, mode='w+')
    print(f"\nCombined results saved to {combined_results_path}")
except Exception as e:
    print(f"\nError saving combined results to {combined_results_path}: {e}")




===== Overall LLM Results Summary =====
           Dataset                              Model  Accuracy  F1 (Macro)  Precision (Macro)  Recall (Macro)  F1 (Weighted)  Precision (Weighted)  Recall (Weighted)  Train Time (s)  Eval Time (s)
0      Book Review  DistilBERT Feature Extractor + LR    0.7271      0.5881             0.5662          0.6820         0.7665                0.8451             0.7271        964.1050         0.1320
1      Book Review        BERT Feature Extractor + LR    0.7223      0.5770             0.5568          0.6650         0.7624                0.8400             0.7223        546.2540         0.1310
2      Book Review                 DistilBERT Full FT    0.8998      0.7454             0.7618          0.7312         0.8966                0.8941             0.8998       4281.3700       143.8980
3      Book Review                       BERT Full FT    0.9047      0.7565             0.7711          0.7435         0.9021                0.9000             0.9047