# 1. Setup and Configuration

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel, # For feature extraction
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType # For LoRA
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix
)
from sklearn.linear_model import LogisticRegression
import evaluate 
import time
import os
import joblib 
import logging
import warnings
import gc
import psutil

# --- Basic Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# --- Limit CPU Usage ---
p = psutil.Process()
p.cpu_affinity([1, 2, 3, 4, 5, 6, 7])

In [2]:
# --- Project Directory Structure ---
BASE_DIR = ".." # Assuming the notebook is in a 'notebooks' or similar folder
DATA_DIR = os.path.join(BASE_DIR, "data", "processed")
# Models and results saved within dataset-specific folders
MODEL_OUTPUT_BASE_DIR = os.path.join(BASE_DIR, "models", "llm")
RESULT_DIR = os.path.join(BASE_DIR, "result")

# --- Specific Dataset Paths ---
BOOK_REVIEW_DATA_DIR = os.path.join(DATA_DIR, "book_reviews")
FINANCIAL_NEWS_DATA_DIR = os.path.join(DATA_DIR, "financial_news")

# --- Model/Result Output Dirs (Ensure they exist) ---
BOOK_REVIEW_MODEL_DIR = os.path.join(MODEL_OUTPUT_BASE_DIR, "book_reviews")
FINANCIAL_NEWS_MODEL_DIR = os.path.join(MODEL_OUTPUT_BASE_DIR, "financial_news")
BOOK_REVIEW_RESULT_DIR = os.path.join(RESULT_DIR, "book_reviews")
FINANCIAL_NEWS_RESULT_DIR = os.path.join(RESULT_DIR, "financial_news")

os.makedirs(BOOK_REVIEW_MODEL_DIR, exist_ok=True)
os.makedirs(FINANCIAL_NEWS_MODEL_DIR, exist_ok=True)
os.makedirs(BOOK_REVIEW_RESULT_DIR, exist_ok=True)
os.makedirs(FINANCIAL_NEWS_RESULT_DIR, exist_ok=True)

# --- File Names ---
TRAIN_FN = "train.csv"
VAL_FN = "val.csv"
TEST_FN = "test.csv"

# --- Column Names ---
TEXT_COLUMN = "text"
TARGET_COLUMN = "score" # Assumes string labels like 'positive', 'negative', 'neutral'

In [3]:
# --- Model & Training Hyperparameters ---
RANDOM_STATE = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {DEVICE}")

# Tokenizer params
MAX_LENGTH = 256

# Feature Extractor Params
FEATURE_EXTRACTOR_BATCH_SIZE = 64
LOGREG_MAX_ITER = 2000

# Fine-tuning params
LEARNING_RATE = 2e-5 
WEIGHT_DECAY = 0.01
TRAIN_BATCH_SIZE = 128 
EVAL_BATCH_SIZE = 32
NUM_EPOCHS = 300
FP16 = torch.cuda.is_available()

# PEFT (LoRA) params
USE_LORA = True # Flag to control if LoRA runs are included
LORA_R = 8 # LoRA rank (dimension)
LORA_ALPHA = 16 # LoRA alpha scaling
LORA_DROPOUT = 0.1
# Target modules vary by model, common ones for BERT/RoBERTa:
LORA_TARGET_MODULES = ["query", "value"] # Common target layers for attention

# --- Label Mapping (Essential for Transformers) ---
LABEL_LIST = ['negative', 'neutral', 'positive'] # Define explicit order
LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)}
ID2LABEL = {i: label for i, label in enumerate(LABEL_LIST)}
NUM_CLASSES = len(LABEL_LIST)

# --- Evaluation Metrics ---
METRICS_TO_CALCULATE = [
    "Accuracy",
    "F1 (Macro)", "Precision (Macro)", "Recall (Macro)",
    "F1 (Weighted)", "Precision (Weighted)", "Recall (Weighted)",
    "Train Time (s)", "Eval Time (s)"
]
METRIC_FOR_BEST_MODEL = "f1_macro" # Metric to monitor for early stopping/best model saving

# --- Datasets Configuration ---
DATASETS_TO_PROCESS = {
    # "Book Review": {
    #     "train_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{TRAIN_FN}'),
    #     "val_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{VAL_FN}'),
    #     "test_path": os.path.join(BOOK_REVIEW_DATA_DIR, f'book_reviews_{TEST_FN}'),
    #     "model_dir": BOOK_REVIEW_MODEL_DIR,
    #     "result_dir": BOOK_REVIEW_RESULT_DIR,
    # },
    "Financial News": {
        "train_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{TRAIN_FN}'),
        "val_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{VAL_FN}'),
        "test_path": os.path.join(FINANCIAL_NEWS_DATA_DIR, f'financial_news_{TEST_FN}'),
        "model_dir": FINANCIAL_NEWS_MODEL_DIR,
        "result_dir": FINANCIAL_NEWS_RESULT_DIR,
    }
}

# --- Model Configurations to Run ---
# Define the models and settings for the experiment loop
# Format: ('Experiment Name', 'HuggingFace Model ID', use_lora_flag, is_feature_extractor_run)
MODEL_CONFIGURATIONS = [
    # Feature Extractors
    # ('DistilBERT Feature Extractor + LR', 'distilbert-base-uncased', False, True),
    # ('BERT Feature Extractor + LR',       'bert-base-uncased',       False, True),

    # # Full Fine-tuning
    # ('DistilBERT Full FT', 'distilbert-base-uncased', False, False),
    # ('BERT Full FT',       'bert-base-uncased',       False, False),
    # ('RoBERTa Full FT',    'roberta-base',            False, False),
    # ('FinBERT Full FT',    'ProsusAI/finbert',        False, False), # Domain-specific

    # LoRA Fine-tuning (only run if USE_LORA is True)
    ('BERT LoRA FT',       'bert-base-uncased',       True, False),
    ('RoBERTa LoRA FT',    'roberta-base',            True, False),
    # ('FinBERT LoRA FT',    'ProsusAI/finbert',        True, False), # Can also apply LoRA to FinBERT
] if USE_LORA else [ # Exclude LoRA runs if USE_LORA is False
    ('DistilBERT Feature Extractor + LR', 'distilbert-base-uncased', False, True),
    ('BERT Feature Extractor + LR',       'bert-base-uncased',       False, True),
    ('DistilBERT Full FT', 'distilbert-base-uncased', False, False),
    ('BERT Full FT',       'bert-base-uncased',       False, False),
    ('RoBERTa Full FT',    'roberta-base',            False, False),
    ('FinBERT Full FT',    'ProsusAI/finbert',        False, False),
]

# Check if FinBERT model ID needs adjustment (sometimes name changes)
# Example alternative: 'yiyanghkust/finbert-tone'
FINBERT_MODEL_ID = 'ProsusAI/finbert'

2025-05-03 09:24:54,431 - INFO - Using device: cuda


# 2. Utility Functions

In [4]:
def load_data_hf(path):
    """Loads a single CSV into a Hugging Face Dataset."""
    try:
        # Load directly using datasets library
        dataset = load_dataset('csv', data_files=path, split='train')
        # Rename target column to 'label' (expected by Trainer) and map string labels to integers
        if TARGET_COLUMN != 'label':
            dataset = dataset.rename_column(TARGET_COLUMN, 'label')
        dataset = dataset.map(lambda examples: {'label': LABEL2ID.get(str(examples['label']), -1)}, # Handle potential non-string labels robustly
                              desc="Mapping labels to IDs")
        # Filter out examples where label mapping failed (label == -1)
        original_size = len(dataset)
        dataset = dataset.filter(lambda example: example['label'] != -1, desc="Filtering invalid labels")
        if len(dataset) < original_size:
            logging.warning(f"Filtered out {original_size - len(dataset)} examples with invalid labels from {path}.")
        return dataset
    except Exception as e:
        logging.error(f"Error loading dataset from {path}: {e}", exc_info=True)
        return None

def create_dataset_dict(train_path, val_path, test_path):
    """Loads train, validation, and test CSVs into a DatasetDict."""
    train_ds = load_data_hf(train_path)
    val_ds = load_data_hf(val_path)
    test_ds = load_data_hf(test_path)
    if train_ds and val_ds and test_ds:
        logging.info(f"Loaded Train data: {len(train_ds)} examples")
        logging.info(f"Loaded Validation data: {len(val_ds)} examples")
        logging.info(f"Loaded Test data: {len(test_ds)} examples")
        return DatasetDict({
            'train': train_ds,
            'validation': val_ds,
            'test': test_ds
        })
    else:
        return None

def preprocess_function(examples, tokenizer):
    """Tokenizes text data."""
    # Ensure text is string, handle potential None values
    texts = [str(text) if text is not None else "" for text in examples[TEXT_COLUMN]]
    return tokenizer(texts, truncation=True, padding=False, max_length=MAX_LENGTH) # Padding handled by DataCollator

# Define metric computation function for Trainer
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)

    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    prec_macro = precision_score(labels, preds, average='macro', zero_division=0)
    rec_macro = recall_score(labels, preds, average='macro', zero_division=0)
    f1_weighted = f1_score(labels, preds, average='weighted', zero_division=0)
    prec_weighted = precision_score(labels, preds, average='weighted', zero_division=0)
    rec_weighted = recall_score(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1_macro': f1_macro,
        'precision_macro': prec_macro,
        'recall_macro': rec_macro,
        'f1_weighted': f1_weighted,
        'precision_weighted': prec_weighted,
        'recall_weighted': rec_weighted,
    }

def calculate_metrics_from_preds(y_true, y_pred):
    """Calculates evaluation metrics from direct predictions."""
    accuracy = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    # Print the classification report for detailed metrics
    

    return {
        "Accuracy": accuracy,
        "F1 (Macro)": f1_macro,
        "Precision (Macro)": precision_macro,
        "Recall (Macro)": recall_macro,
        "F1 (Weighted)": f1_weighted,
        "Precision (Weighted)": precision_weighted,
        "Recall (Weighted)": recall_weighted,
    }

# Function to extract features (CLS token)
def extract_hidden_states(batch, model, tokenizer, device):
    # Ensure input_ids and attention_mask are tensors on the correct device
    inputs = {k: v.to(device) for k, v in batch.items()
              if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return the representation of the [CLS] token (first token)
    # Move back to CPU to accumulate results if needed outside GPU loop
    return last_hidden_state[:, 0].cpu().numpy()

# 3. Run Experiments

In [5]:
all_results = []

In [6]:
# --- Loop through each dataset defined in the configuration ---
for dataset_name, config in DATASETS_TO_PROCESS.items():
    print(f"\n{'='*25} Processing Dataset: {dataset_name} {'='*25}")
    logging.info(f"Processing Dataset: {dataset_name}")

    # 1. Load Data using Hugging Face Datasets
    raw_datasets = create_dataset_dict(config['train_path'], config['val_path'], config['test_path'])
    if not raw_datasets:
        logging.error(f"Could not load data for {dataset_name}. Skipping.")
        continue

    # --- Loop through each model configuration ---
    for model_label, model_id, use_lora, is_feature_extractor in MODEL_CONFIGURATIONS:

        # --- Skip FinBERT for non-financial data ---
        if model_id == FINBERT_MODEL_ID and dataset_name != "Financial News":
            logging.info(f"Skipping {model_label} for {dataset_name} (Model is domain-specific).")
            continue

        # --- Skip LoRA runs if flag is off ---
        if use_lora and not USE_LORA:
            logging.info(f"Skipping LoRA run {model_label} as USE_LORA is False.")
            continue

        print(f"\n--- Processing Model: {model_label} ---")
        logging.info(f"Starting run for {model_label} on {dataset_name}")
        run_results = {"Dataset": dataset_name, "Model": model_label}
        train_time = 0.0
        eval_time = 0.0

        # Create specific output dirs for this run's checkpoints/models
        run_model_dir = os.path.join(config['model_dir'], model_label.replace(' ', '_').replace('+', ''))
        os.makedirs(run_model_dir, exist_ok=True)

        try:
            # 2. Load Tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_id)

            # 3. Tokenize Datasets
            logging.info(f"Tokenizing data using {model_id} tokenizer...")
            # Apply tokenization in batches
            tokenized_datasets = raw_datasets.map(
                lambda batch: preprocess_function(batch, tokenizer),
                batched=True,
                remove_columns=[TEXT_COLUMN], # Remove original text column
                desc="Running tokenizer on dataset"
            )
            # Data collator handles dynamic padding
            data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
            logging.info("Tokenization complete.")

            # ===========================================
            # === 4.A Feature Extraction + Classifier ===
            # ===========================================
            if is_feature_extractor:
                logging.info("Running in Feature Extraction mode.")
                # Load base model (no classification head)
                model = AutoModel.from_pretrained(model_id).to(DEVICE)
                model.eval() # Set to evaluation mode

                # --- Extract Features ---
                logging.info("Extracting features from datasets...")
                start_extract_time = time.time()

                # Need dataloaders for batching feature extraction
                tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])
                train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=FEATURE_EXTRACTOR_BATCH_SIZE, collate_fn=data_collator)
                val_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=FEATURE_EXTRACTOR_BATCH_SIZE, collate_fn=data_collator)
                test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=FEATURE_EXTRACTOR_BATCH_SIZE, collate_fn=data_collator)

                X_train_features = []
                y_train_labels = []
                for batch in train_dataloader:
                    y_train_labels.extend(batch['labels'].tolist())
                    batch_features = extract_hidden_states(batch, model, tokenizer, DEVICE)
                    X_train_features.append(batch_features)

                X_val_features = []
                y_val_labels = []
                for batch in val_dataloader:
                    y_val_labels.extend(batch['labels'].tolist())
                    batch_features = extract_hidden_states(batch, model, tokenizer, DEVICE)
                    X_val_features.append(batch_features)

                X_test_features = []
                y_test_labels = []
                for batch in test_dataloader:
                    y_test_labels.extend(batch['labels'].tolist())
                    batch_features = extract_hidden_states(batch, model, tokenizer, DEVICE)
                    X_test_features.append(batch_features)

                X_train_features = np.concatenate(X_train_features)
                X_val_features = np.concatenate(X_val_features)
                X_test_features = np.concatenate(X_test_features)
                end_extract_time = time.time()
                logging.info(f"Feature extraction took {end_extract_time - start_extract_time:.2f}s")
                logging.info(f"Train features shape: {X_train_features.shape}")

                # --- Train Classifier ---
                logging.info("Training Logistic Regression classifier...")
                classifier = LogisticRegression(max_iter=LOGREG_MAX_ITER, random_state=RANDOM_STATE, class_weight='balanced', n_jobs=-1)
                start_train_time = time.time()
                # Combine train + val features for final classifier training? Or tune on val? Simpler: train on train, eval on test.
                classifier.fit(X_train_features, y_train_labels)
                end_train_time = time.time()
                train_time = end_train_time - start_train_time
                logging.info(f"Classifier training took {train_time:.2f}s")

                # --- Evaluate Classifier ---
                start_eval_time = time.time()
                y_pred_test = classifier.predict(X_test_features)
                end_eval_time = time.time()
                eval_time = end_eval_time - start_eval_time

                test_metrics = calculate_metrics_from_preds(y_test_labels, y_pred_test)
                run_results.update(test_metrics)

                logging.info("Feature Extractor + LR - Test Set Performance:")
                report_str = classification_report(y_test_labels, y_pred_test, target_names=LABEL_LIST, zero_division=0)
                print(report_str)

                cm = confusion_matrix(y_test_labels, y_pred_test, labels=list(range(NUM_CLASSES))) # Ensure labels are ordered
                cm_df = pd.DataFrame(cm, index=LABEL_LIST, columns=LABEL_LIST)
                print("Confusion Matrix (Test Set):")
                print(cm_df)

                cm_filename = f"{dataset_name.replace(' ', '_')}_{model_label.replace(' ', '_').replace('+','')}_confusion_matrix.csv"
                cm_save_path = os.path.join(config['result_dir'], cm_filename)
                try:
                    cm_df.to_csv(cm_save_path)
                    logging.info(f"Confusion matrix saved to {cm_save_path}")
                except Exception as cm_e:
                    logging.error(f"Failed to save confusion matrix to {cm_save_path}: {cm_e}")


                # Save the classifier
                clf_save_path = os.path.join(run_model_dir, f"{dataset_name.replace(' ', '_')}_{model_label.replace(' ', '_')}_LR_classifier.joblib")
                joblib.dump(classifier, clf_save_path)
                logging.info(f"Logistic Regression classifier saved to {clf_save_path}")

                # Cleanup GPU memory used by the base model
                del model
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()


            # ===========================================
            # === 4.B Fine-tuning (Full or LoRA)     ====
            # ===========================================
            else:
                logging.info(f"Running in Fine-tuning mode (LoRA: {use_lora}).")
                # Load model with sequence classification head
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_id,
                    num_labels=NUM_CLASSES,
                    id2label=ID2LABEL,
                    label2id=LABEL2ID
                )

                # --- Apply LoRA if specified ---
                if use_lora:
                    logging.info("Applying LoRA configuration...")
                    peft_config = LoraConfig(
                        task_type=TaskType.SEQ_CLS, # Specify task type
                        r=LORA_R,
                        lora_alpha=LORA_ALPHA,
                        lora_dropout=LORA_DROPOUT,
                        target_modules=LORA_TARGET_MODULES,
                        bias="none" # Usually set bias to 'none' or 'all'
                    )
                    model = get_peft_model(model, peft_config)
                    model.print_trainable_parameters() # Verify LoRA application

                model.to(DEVICE) # Move model to GPU before Trainer

                # --- Define Training Arguments ---
                training_args = TrainingArguments(
                    output_dir=os.path.join(run_model_dir, "checkpoints"),
                    logging_dir=os.path.join(run_model_dir, "logs"),
                    report_to="none", # Disable wandb/tensorboard reporting unless configured
                    num_train_epochs=NUM_EPOCHS,
                    learning_rate=LEARNING_RATE,
                    weight_decay=WEIGHT_DECAY,
                    per_device_train_batch_size=TRAIN_BATCH_SIZE,
                    per_device_eval_batch_size=EVAL_BATCH_SIZE,

                    # --- Strategies ---
                    eval_strategy="epoch",  # Evaluate at the end of each epoch
                    save_strategy="epoch",        # Save a checkpoint at the end of each epoch
                    logging_strategy="epoch",     # Log metrics at the end of each epoch (consistent)

                    # --- Explicitly disable step-based intervals when using epoch strategy ---
                    eval_steps=None,              # Do not evaluate every N steps
                    save_steps=None,              # Do not save every N steps
                    logging_steps=None,           # Do not log every N steps (use logging_strategy="epoch")
                    # Note: If you WANT step-based logging while using epoch eval/save, you can set
                    # logging_strategy="steps" and provide a value for logging_steps, but keep
                    # eval_steps=None and save_steps=None.

                    # --- Best model loading ---
                    load_best_model_at_end=True, # Load the best model based on metric_for_best_model
                    metric_for_best_model=METRIC_FOR_BEST_MODEL, # e.g., "f1_macro"
                    greater_is_better=True,      # F1 score is better when higher
                    save_total_limit=2,          # Only keep the best and the latest checkpoint

                    # --- Other settings ---
                    fp16=FP16,                   # Enable mixed precision training if GPU supports it
                    # logging_steps=50,          # Remove or comment out if using logging_strategy="epoch"
                    # dataloader_num_workers=2,  # Optional
                    gradient_accumulation_steps=1,
                    seed=RANDOM_STATE,
                    remove_unused_columns=True, # Default is True, good practice
                )

                print(f"Using evaluation_strategy: {training_args.eval_strategy}") # Add this print statement
                print(f"Using save_strategy: {training_args.save_strategy}")
                print(f"Using load_best_model_at_end: {training_args.load_best_model_at_end}")

                # --- Define Trainer ---
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_datasets["train"],
                    eval_dataset=tokenized_datasets["validation"], # Use validation set for evaluation during training
                    tokenizer=tokenizer,
                    data_collator=data_collator,
                    compute_metrics=compute_metrics,
                    callbacks=[EarlyStoppingCallback(early_stopping_patience=10, early_stopping_threshold=0.001)] # Stop if metric doesn't improve enough
                )

                # --- Train the Model ---
                logging.info("Starting fine-tuning...")
                start_train_time = time.time()
                train_result = trainer.train()
                end_train_time = time.time()
                train_time = end_train_time - start_train_time
                logging.info(f"Fine-tuning completed in {train_time:.2f}s")

                # --- Evaluate on Test Set ---
                logging.info("Evaluating model on the test set...")
                start_eval_time = time.time()
                # Evaluate first to get metrics like loss
                test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
                # Then predict to get raw predictions for confusion matrix
                predictions = trainer.predict(tokenized_datasets["test"])
                end_eval_time = time.time()
                eval_time = end_eval_time - start_eval_time

                # Extract predictions and labels
                y_pred_test = np.argmax(predictions.predictions, axis=1)
                y_true_test = predictions.label_ids

                # Map trainer metric names (e.g., 'eval_f1_macro') to our standard names
                run_results["Accuracy"] = test_results.get('eval_accuracy', np.nan)
                run_results["F1 (Macro)"] = test_results.get('eval_f1_macro', np.nan)
                run_results["Precision (Macro)"] = test_results.get('eval_precision_macro', np.nan)
                run_results["Recall (Macro)"] = test_results.get('eval_recall_macro', np.nan)
                run_results["F1 (Weighted)"] = test_results.get('eval_f1_weighted', np.nan)
                run_results["Precision (Weighted)"] = test_results.get('eval_precision_weighted', np.nan)
                run_results["Recall (Weighted)"] = test_results.get('eval_recall_weighted', np.nan)

                logging.info("Fine-tuning - Test Set Performance:")
                logging.info(test_results) # Log the full results dict from trainer

                # Generate, print, and save the confusion matrix using predictions
                logging.info("Fine-tuning - Test Set Confusion Matrix:")
                cm = confusion_matrix(y_true_test, y_pred_test, labels=list(range(NUM_CLASSES))) # Ensure labels are ordered
                cm_df = pd.DataFrame(cm, index=LABEL_LIST, columns=LABEL_LIST)
                print(cm_df)

                # Save the confusion matrix
                cm_filename = f"{dataset_name.replace(' ', '_')}_{model_label.replace(' ', '_').replace('+','')}_confusion_matrix.csv"
                cm_save_path = os.path.join(config['result_dir'], cm_filename)
                try:
                    cm_df.to_csv(cm_save_path, mode='w+')
                    logging.info(f"Confusion matrix saved to {cm_save_path}")
                except Exception as cm_e:
                    logging.error(f"Failed to save confusion matrix to {cm_save_path}: {cm_e}")

                # --- Save the Final Model & Tokenizer ---
                # Trainer already saved the best checkpoint based on validation set.
                # For LoRA, the main model is saved by Trainer, adapters need separate save
                final_model_save_path = os.path.join(run_model_dir, "final_model")
                if use_lora:
                    logging.info(f"Saving LoRA adapter model to {final_model_save_path}")
                    model.save_pretrained(final_model_save_path) # Saves only the adapter
                else:
                    # If not LoRA, trainer saved the full best model, we can optionally save it again here
                    # under a consistent name if needed, but load_best_model_at_end handles loading it.
                    # Saving explicitly:
                    # trainer.save_model(final_model_save_path)
                    logging.info(f"Best model loaded by Trainer. Checkpoint saved in {training_args.output_dir}")


                tokenizer.save_pretrained(final_model_save_path) # Save tokenizer with the model/adapter
                logging.info(f"Tokenizer saved to {final_model_save_path}")

            # --- Store Timings and Finalize Results ---
            run_results["Train Time (s)"] = round(train_time, 3)
            run_results["Eval Time (s)"] = round(eval_time, 3)


        except Exception as e:
            logging.error(f"!!! An error occurred while processing {model_label} for {dataset_name}: {e}", exc_info=True)
            # Record partial results if possible
            run_results["Accuracy"] = np.nan
            run_results["F1 (Macro)"] = np.nan
            # Fill other metrics with NaN or error messages
            for metric in METRICS_TO_CALCULATE:
                if metric not in run_results:
                    run_results[metric] = np.nan if metric not in ["Train Time (s)", "Eval Time (s)"] else 0.0
        finally:
            all_results.append(run_results)
            # Clean up memory aggressively after each run
            del tokenizer
            if 'model' in locals(): del model
            if 'trainer' in locals(): del trainer
            if 'classifier' in locals(): del classifier
            if 'tokenized_datasets' in locals(): del tokenized_datasets
            # if 'raw_datasets' in locals(): del raw_datasets
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()


# --- Combine results into a DataFrame ---
results_df = pd.DataFrame(all_results)

2025-05-03 09:24:54,497 - INFO - Processing Dataset: Financial News





2025-05-03 09:24:55,963 - INFO - Loaded Train data: 3392 examples
2025-05-03 09:24:55,964 - INFO - Loaded Validation data: 727 examples
2025-05-03 09:24:55,964 - INFO - Loaded Test data: 727 examples
2025-05-03 09:24:55,965 - INFO - Starting run for BERT LoRA FT on Financial News



--- Processing Model: BERT LoRA FT ---


2025-05-03 09:24:56,368 - INFO - Tokenizing data using bert-base-uncased tokenizer...
2025-05-03 09:24:56,384 - INFO - Tokenization complete.
2025-05-03 09:24:56,384 - INFO - Running in Fine-tuning mode (LoRA: True).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-03 09:24:56,761 - INFO - Applying LoRA configuration...


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
2025-05-03 09:24:57,423 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,1.1409,1.080298,0.464924,0.255785,0.221725,0.304784,0.400419,0.352236,0.464924
2,1.0368,0.992572,0.591472,0.247982,0.197974,0.33179,0.442069,0.352922,0.591472
3,0.9707,0.947288,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
4,0.9394,0.933011,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
5,0.9296,0.926836,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
6,0.924,0.921469,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
7,0.9166,0.914479,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
8,0.9072,0.905603,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
9,0.8925,0.894867,0.596974,0.255361,0.531954,0.336585,0.44919,0.636056,0.596974
10,0.8824,0.87889,0.598349,0.259262,0.366389,0.338211,0.453299,0.49703,0.598349


2025-05-03 09:36:58,271 - INFO - Fine-tuning completed in 720.85s
2025-05-03 09:36:58,272 - INFO - Evaluating model on the test set...


2025-05-03 09:36:59,800 - INFO - Fine-tuning - Test Set Performance:
2025-05-03 09:36:59,801 - INFO - {'eval_loss': 0.42158979177474976, 'eval_accuracy': 0.8376891334250344, 'eval_f1_macro': 0.8225959271751351, 'eval_precision_macro': 0.8025425425425426, 'eval_recall_macro': 0.8492951311578762, 'eval_f1_weighted': 0.8391884172319162, 'eval_precision_weighted': 0.8445922401080587, 'eval_recall_weighted': 0.8376891334250344, 'eval_runtime': 0.7595, 'eval_samples_per_second': 957.225, 'eval_steps_per_second': 30.284, 'epoch': 76.0}
2025-05-03 09:36:59,802 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-03 09:36:59,811 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_BERT_LoRA_FT_confusion_matrix.csv
2025-05-03 09:36:59,812 - INFO - Saving LoRA adapter model to ..\models\llm\financial_news\BERT_LoRA_FT\final_model


          negative  neutral  positive
negative        82        6         3
neutral         22      362        48
positive         7       32       165


2025-05-03 09:37:00,628 - INFO - Tokenizer saved to ..\models\llm\financial_news\BERT_LoRA_FT\final_model
2025-05-03 09:37:00,771 - INFO - Starting run for RoBERTa LoRA FT on Financial News



--- Processing Model: RoBERTa LoRA FT ---


2025-05-03 09:37:01,131 - INFO - Tokenizing data using roberta-base tokenizer...
2025-05-03 09:37:01,188 - INFO - Tokenization complete.
2025-05-03 09:37:01,188 - INFO - Running in Fine-tuning mode (LoRA: True).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-03 09:37:01,516 - INFO - Applying LoRA configuration...


trainable params: 887,811 || all params: 125,535,750 || trainable%: 0.7072


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
2025-05-03 09:37:01,833 - INFO - Starting fine-tuning...


Using evaluation_strategy: IntervalStrategy.EPOCH
Using save_strategy: SaveStrategy.EPOCH
Using load_best_model_at_end: True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,F1 Weighted,Precision Weighted,Recall Weighted
1,1.0143,0.972216,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
2,0.9592,0.934804,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
3,0.9409,0.923067,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
4,0.9362,0.915565,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
5,0.9259,0.905991,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
6,0.9047,0.884697,0.594223,0.24849,0.198074,0.333333,0.442975,0.353101,0.594223
7,0.866,0.830294,0.64099,0.347677,0.407659,0.388618,0.539366,0.546785,0.64099
8,0.7908,0.730383,0.661623,0.423181,0.403319,0.449748,0.608941,0.567793,0.661623
9,0.6975,0.628773,0.716644,0.635998,0.71198,0.605236,0.694377,0.709727,0.716644
10,0.5818,0.534381,0.746905,0.709729,0.71647,0.713942,0.739543,0.741494,0.746905


2025-05-03 09:45:15,163 - INFO - Fine-tuning completed in 493.33s
2025-05-03 09:45:15,163 - INFO - Evaluating model on the test set...


2025-05-03 09:45:16,720 - INFO - Fine-tuning - Test Set Performance:
2025-05-03 09:45:16,720 - INFO - {'eval_loss': 0.36540836095809937, 'eval_accuracy': 0.8404401650618982, 'eval_f1_macro': 0.8337329377480232, 'eval_precision_macro': 0.8142779142779143, 'eval_recall_macro': 0.8589544079740158, 'eval_f1_weighted': 0.8409677564831212, 'eval_precision_weighted': 0.8444349063331181, 'eval_recall_weighted': 0.8404401650618982, 'eval_runtime': 0.7678, 'eval_samples_per_second': 946.902, 'eval_steps_per_second': 29.957, 'epoch': 51.0}
2025-05-03 09:45:16,721 - INFO - Fine-tuning - Test Set Confusion Matrix:
2025-05-03 09:45:16,724 - INFO - Confusion matrix saved to ..\result\financial_news\Financial_News_RoBERTa_LoRA_FT_confusion_matrix.csv
2025-05-03 09:45:16,725 - INFO - Saving LoRA adapter model to ..\models\llm\financial_news\RoBERTa_LoRA_FT\final_model


          negative  neutral  positive
negative        86        4         1
neutral         20      364        48
positive         4       39       161


2025-05-03 09:45:17,288 - INFO - Tokenizer saved to ..\models\llm\financial_news\RoBERTa_LoRA_FT\final_model


# 4. Results Summary and Saving

In [7]:
print("\n\n===== Overall LLM Results Summary =====")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1400) # Wider display
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.float_format', '{:.4f}'.format)

# Ensure all expected columns exist, fill with NaN if necessary
for col in METRICS_TO_CALCULATE:
    if col not in results_df.columns:
        results_df[col] = np.nan

# Reorder columns for clarity
column_order = ["Dataset", "Model"] + METRICS_TO_CALCULATE
# Filter out columns not present if something went wrong during creation
column_order = [col for col in column_order if col in results_df.columns]
results_df = results_df[column_order]


print(results_df)

# --- Save results to CSV for each dataset ---
for dataset_name, config in DATASETS_TO_PROCESS.items():
    dataset_results_df = results_df[results_df['Dataset'] == dataset_name]
    if not dataset_results_df.empty:
        results_filename = f"{dataset_name.replace(' ', '_')}_llm_transformers_results.csv"
        results_save_path = os.path.join(config['result_dir'], results_filename)
        try:
            dataset_results_df.to_csv(results_save_path, index=False, mode='w+')
            print(f"\nResults for {dataset_name} saved to {results_save_path}")
        except Exception as e:
            print(f"\nError saving results for {dataset_name} to {results_save_path}: {e}")

# --- Save combined results ---
combined_results_path = os.path.join(RESULT_DIR, "combined_llm_transformers_results.csv")
try:
    results_df.to_csv(combined_results_path, index=False, mode='w+')
    print(f"\nCombined results saved to {combined_results_path}")
except Exception as e:
    print(f"\nError saving combined results to {combined_results_path}: {e}")




===== Overall LLM Results Summary =====
          Dataset            Model  Accuracy  F1 (Macro)  Precision (Macro)  Recall (Macro)  F1 (Weighted)  Precision (Weighted)  Recall (Weighted)  Train Time (s)  Eval Time (s)
0  Financial News     BERT LoRA FT    0.8377      0.8226             0.8025          0.8493         0.8392                0.8446             0.8377        720.8480         1.5270
1  Financial News  RoBERTa LoRA FT    0.8404      0.8337             0.8143          0.8590         0.8410                0.8444             0.8404        493.3290         1.5560

Results for Financial News saved to ..\result\financial_news\Financial_News_llm_transformers_results.csv

Combined results saved to ..\result\combined_llm_transformers_results.csv
