# 1. Setup and Imports

In [None]:
import copy
import gc
import os
import warnings
from time import time

import joblib
import numpy as np
import pandas as pd
import torch

# Hugging Face Libraries
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType

# Evaluation
import evaluate # Hugging Face evaluate
from sklearn.base import clone # Unfitted copies of the configured classifier
from sklearn.metrics import accuracy_score # For feature extraction part
from sklearn.metrics import classification_report as sk_classification_report # For feature extraction part
from sklearn.linear_model import LogisticRegression # For feature extraction part

# Ignore warnings
warnings.filterwarnings('ignore')

# 2. Configuration

In [None]:
# --- Paths ---
DATA_DIR = "../data/processed"
MODEL_SAVE_DIR = "../models/llm"  # Dedicated folder for LLM outputs
RESULTS_SAVE_DIR = "../results"
RESULTS_CSV_FILE = os.path.join(RESULTS_SAVE_DIR, "llm_results_summary.csv")

# --- Experiment Setup ---
DOMAINS = ["book_reviews", "financial_news"]  # Domain folder names under DATA_DIR


# --- Models & Methods Configuration ---
def _experiment(method, model_id, short_name, **extra):
    """Build one EXPERIMENTS entry; `extra` carries optional keys such as `domain_filter`."""
    return {"method": method, "model_id": model_id, "short_name": short_name, **extra}


# Each entry pairs a Hugging Face model id with the method applied to it.
EXPERIMENTS = [
    # Feature Extraction
    _experiment("feature_extraction", "distilbert-base-uncased", "DistilBERT_FE"),
    _experiment("feature_extraction", "bert-base-uncased", "BERT_FE"),
    # Full Fine-tuning
    _experiment("finetune", "distilbert-base-uncased", "DistilBERT_FT"),
    _experiment("finetune", "bert-base-uncased", "BERT_FT"),
    _experiment("finetune", "roberta-base", "RoBERTa_FT"),
    _experiment("finetune", "microsoft/deberta-v3-base", "DeBERTaV3_FT"),
    # Domain specific: only run on the domain named in `domain_filter`
    _experiment("finetune", "ProsusAI/finbert", "FinBERT_FT", domain_filter="financial_news"),
    # LoRA Fine-tuning
    _experiment("lora", "bert-base-uncased", "BERT_LoRA"),
    _experiment("lora", "roberta-base", "RoBERTa_LoRA"),
    # Add more experiments as needed
]

# --- Training Arguments (Defaults - can be overridden per experiment if needed) ---
DEFAULT_TRAINING_ARGS = dict(
    output_dir=os.path.join(MODEL_SAVE_DIR, "training_output"),  # Base output dir for trainer
    num_train_epochs=3,  # Adjust as needed
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    per_device_eval_batch_size=32,  # Adjust based on GPU memory
    gradient_accumulation_steps=2,  # Effective train batch = 16 * 2 per device
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir=os.path.join(MODEL_SAVE_DIR, "logs"),
    logging_steps=50,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Checkpoint at the end of each epoch
    load_best_model_at_end=True,  # Reload the best checkpoint once training ends
    metric_for_best_model="f1",  # Metric key used to pick the best checkpoint
    greater_is_better=True,
    report_to="none",  # Disable wandb/tensorboard logging unless configured
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    # bf16=torch.cuda.is_bf16_supported(),  # Use bfloat16 if supported (Ampere GPUs+)
)

# --- PEFT Configuration (LoRA) ---
# target_modules is left unset so it can be identified automatically for common
# models; others may need it spelled out, e.g. ["query", "value"] for BERT.
DEFAULT_LORA_CONFIG = LoraConfig(
    task_type=TaskType.SEQUENCE_CLASSIFICATION,
    inference_mode=False,
    r=8,  # Rank of the update matrices
    lora_alpha=16,  # Alpha scaling factor
    lora_dropout=0.1,
)

# --- Feature Extraction Configuration ---
# Classifier trained on top of the extracted transformer features
ML_CLASSIFIER = LogisticRegression(max_iter=1000, random_state=42)

# --- Tokenization ---
MAX_LENGTH = 256  # Max sequence length for tokenizer

# --- Reproducibility & Device ---
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

# Create directories: the two roots first, then one sub-folder per domain in each
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
os.makedirs(RESULTS_SAVE_DIR, exist_ok=True)
for domain in DOMAINS:
    for _root_dir in (MODEL_SAVE_DIR, RESULTS_SAVE_DIR):
        os.makedirs(os.path.join(_root_dir, domain), exist_ok=True)

# 3. Helper Functions

In [None]:
def load_data_hf(domain_name):
    """Load the train, validation and test CSVs of a domain as Hugging Face datasets.

    Each split is read from ``DATA_DIR/<domain_name>/<split>.csv``. Rows with a
    missing ``text`` or ``label`` are dropped, ``label`` is cast to ``int`` and
    ``text`` to ``str``, and only those two columns are kept.

    Args:
        domain_name: Name of the domain sub-folder inside ``DATA_DIR``.

    Returns:
        A ``DatasetDict`` with the keys ``train``, ``validation`` and ``test``,
        or ``None`` when a file is missing or any other error occurs while
        loading (the error is printed, not raised).
    """
    print(f"\nLoading data for domain: {domain_name}...")
    # Split name -> CSV file name inside the domain folder
    split_files = {
        'train': "train.csv",
        'validation': "validation.csv",
        'test': "test.csv",
    }
    try:
        splits = {}
        for split_name, file_name in split_files.items():
            csv_path = os.path.join(DATA_DIR, domain_name, file_name)

            # Load using pandas first to easily handle potential NaNs
            frame = pd.read_csv(csv_path).dropna(subset=['text', 'label'])
            frame = frame[['text', 'label']].astype({'text': str, 'label': int})

            # dropna leaves gaps in the index; preserve_index=False keeps that
            # index from being stored as an extra dataset column.
            splits[split_name] = Dataset.from_pandas(frame, preserve_index=False)

        dataset_dict = DatasetDict(splits)
        print(f"Dataset loaded: {dataset_dict}")
        return dataset_dict
    except FileNotFoundError as e:
        print(f"Error loading data for {domain_name}: {e}")
        return None
    except Exception as e:
        print(f"An error occurred during data loading for {domain_name}: {e}")
        return None


def preprocess_function(examples, tokenizer, max_length=None):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated to ``max_length`` but deliberately NOT padded here:
    padding every example to the maximum length wastes compute, and the
    experiment loop already pads each batch to its longest member through
    ``DataCollatorWithPadding``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to tokenize.
        tokenizer: Callable tokenizer accepting ``truncation`` and ``max_length``.
        max_length: Truncation length; defaults to the module-level ``MAX_LENGTH``.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# --- Evaluation Metrics ---
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the metric dict reported by the Trainer.

    Predictions are the argmax of the logits over the last axis. Accuracy is
    reported once; F1, precision and recall are reported with both macro and
    weighted averaging.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def _averaged(metric, key, average):
        # One averaged score from a loaded `evaluate` metric
        return metric.compute(predictions=predictions, references=labels, average=average)[key]

    families = (("f1", f1_metric), ("precision", precision_metric), ("recall", recall_metric))
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    macro = {key: _averaged(metric, key, "macro") for key, metric in families}
    weighted = {key: _averaged(metric, key, "weighted") for key, metric in families}

    return {
        "accuracy": accuracy,
        "f1_macro": macro["f1"],
        "precision_macro": macro["precision"],
        "recall_macro": macro["recall"],
        # Weighted F1 is exposed under the plain key "f1", the name that
        # DEFAULT_TRAINING_ARGS["metric_for_best_model"] selects.
        "f1": weighted["f1"],
        "precision_weighted": weighted["precision"],
        "recall_weighted": weighted["recall"],
    }

# --- Feature Extraction Function ---
def extract_transformer_features(model, tokenizer, texts, batch_size=32):
    """Embed texts with a pre-trained model, one batch at a time.

    The model is put in eval mode and moved to ``DEVICE``. For every batch the
    last hidden state of the first token (the [CLS] position) is taken as the
    feature vector; ``outputs.pooler_output`` would be an alternative where the
    model provides it.

    Args:
        model: Transformer whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        texts: Sliceable sequence of input strings.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A NumPy array stacking the per-batch feature matrices row-wise.
    """
    model.eval()
    model.to(DEVICE)
    total = len(texts)
    print(f"Extracting features for {total} texts...")

    feature_chunks = []
    for batch_number, start in enumerate(range(0, total, batch_size), start=1):
        chunk = texts[start : start + batch_size]
        encoded = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH)
        encoded = encoded.to(DEVICE)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Keep only the first token's vector and move it off the device
        feature_chunks.append(hidden_states[:, 0, :].cpu().numpy())
        if batch_number % 50 == 0:
            print(f"  Processed batch {batch_number}")

    return np.vstack(feature_chunks)

# 4. Main Experiment Loop

In [None]:
# Collects one result-row dict per completed experiment; the experiment loop
# appends to it and the aggregation cell turns it into a DataFrame. It lives in
# its own cell, so re-running only the loop cell keeps earlier rows.
all_results_list = []

In [None]:
# Number of consecutive evaluations without improvement in the configured
# `metric_for_best_model` that early stopping tolerates. Adjust as needed.
PATIENCE = 2

for domain in DOMAINS:
    # Load data using Hugging Face datasets
    raw_datasets = load_data_hf(domain)
    if raw_datasets is None:
        print(f"Skipping domain {domain} due to data loading error.")
        continue

    # Determine the label set from the training split. Sorting makes the
    # label -> class-index mapping deterministic rather than order-of-appearance.
    label_list = sorted(raw_datasets["train"].unique("label"))
    num_labels = len(label_list)
    print(f"Domain: {domain} | Number of labels: {num_labels} | Labels: {label_list}")

    label_to_index = {label: index for index, label in enumerate(label_list)}
    if label_list != list(range(num_labels)):
        # The classification head needs targets in 0..num_labels-1, so labels
        # that are not already contiguous ids are rewritten in every split.
        print(f"Remapping labels to contiguous ids: {label_to_index}")
        try:
            raw_datasets = raw_datasets.map(
                lambda example: {"label": label_to_index[example["label"]]}
            )
        except Exception as e:
            # e.g. a validation/test label that never occurs in the training split
            print(f"!!! ERROR remapping labels for domain {domain}: {e}. Skipping domain.")
            continue

    # Model config metadata: class index <-> readable label name
    id2label = {index: str(label) for label, index in label_to_index.items()}
    label2id = {name: index for index, name in id2label.items()}

    for exp_config in EXPERIMENTS:
        model_id = exp_config["model_id"]
        method = exp_config["method"]
        short_name = exp_config["short_name"]

        # Check if model is domain-specific
        domain_filter = exp_config.get("domain_filter")
        if domain_filter and domain != domain_filter:
            print(f"Skipping {short_name} for domain {domain} as it's domain-specific.")
            continue

        print(f"\n{'='*10} Domain: {domain} | Model: {model_id} | Method: {method} {'='*10}")

        # --- Load Tokenizer ---
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_id)
        except Exception as e:
            print(f"!!! ERROR loading tokenizer {model_id}: {e}. Skipping experiment.")
            continue

        # --- Tokenize Datasets ---
        # Only the Trainer-based methods consume tokenized datasets; feature
        # extraction tokenizes the raw text batch by batch on its own.
        tokenized_datasets = None
        data_collator = None
        if method in ("finetune", "lora"):
            try:
                tokenized_datasets = raw_datasets.map(lambda x: preprocess_function(x, tokenizer), batched=True)
                data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Dynamic padding
            except Exception as e:
                print(f"!!! ERROR tokenizing data for {model_id}: {e}. Skipping experiment.")
                continue

        # --- Setup Model Save Path ---
        model_output_dir = os.path.join(MODEL_SAVE_DIR, domain, short_name)
        os.makedirs(model_output_dir, exist_ok=True)

        # --- Run Experiment based on Method ---
        exp_train_time = None
        exp_eval_time = None
        exp_results = {}
        model = None
        trainer = None

        try:
            if method == "feature_extraction":
                print("Method: Feature Extraction + ML Classifier")
                # Load base model (no classification head)
                model = AutoModel.from_pretrained(model_id)

                # Extract features (columns materialized as plain Python lists)
                train_texts = list(raw_datasets["train"]["text"])
                test_texts = list(raw_datasets["test"]["text"])
                t0 = time()
                X_train_features = extract_transformer_features(model, tokenizer, train_texts)
                X_test_features = extract_transformer_features(model, tokenizer, test_texts)
                feature_ext_time = time() - t0
                print(f"Feature extraction time: {feature_ext_time:.2f}s")

                y_train = list(raw_datasets["train"]["label"])
                y_test = list(raw_datasets["test"]["label"])

                # Train ML Classifier
                print(f"Training ML Classifier ({ML_CLASSIFIER.__class__.__name__})...")
                t0 = time()
                # clone() yields an unfitted estimator with the same parameters,
                # so experiments never share or overwrite one fitted instance.
                ml_classifier = clone(ML_CLASSIFIER)
                ml_classifier.fit(X_train_features, y_train)
                exp_train_time = time() - t0
                print(f"ML Training time: {exp_train_time:.2f}s")

                # Evaluate ML Classifier
                t0 = time()
                y_pred = ml_classifier.predict(X_test_features)
                exp_eval_time = time() - t0
                print(f"ML Evaluation time: {exp_eval_time:.2f}s")

                report_dict = sk_classification_report(y_test, y_pred, output_dict=True, zero_division=0)
                exp_results = {
                    "accuracy": accuracy_score(y_test, y_pred),
                    "f1_macro": report_dict['macro avg']['f1-score'],
                    "precision_macro": report_dict['macro avg']['precision'],
                    "recall_macro": report_dict['macro avg']['recall'],
                    "f1_weighted": report_dict['weighted avg']['f1-score'],
                    "precision_weighted": report_dict['weighted avg']['precision'],
                    "recall_weighted": report_dict['weighted avg']['recall'],
                }
                print(sk_classification_report(y_test, y_pred, zero_division=0))

                # Save ML model
                ml_model_save_path = os.path.join(model_output_dir, "ml_classifier.joblib")
                joblib.dump(ml_classifier, ml_model_save_path)
                print(f"Saved ML classifier to {ml_model_save_path}")

            elif method == "finetune" or method == "lora":
                print(f"Method: {method.upper()}")
                # Load model for sequence classification. ignore_mismatched_sizes
                # lets a checkpoint that already ships a classification head
                # load when its class count differs from num_labels.
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_id,
                    num_labels=num_labels,
                    id2label=id2label,
                    label2id=label2id,
                    ignore_mismatched_sizes=True,
                )

                # Apply LoRA if specified
                if method == "lora":
                    print("Applying LoRA configuration...")
                    # Work on a private copy so nothing written to the config
                    # for this model carries over to the next experiment.
                    lora_config = copy.deepcopy(DEFAULT_LORA_CONFIG)
                    # Ensure task type matches
                    lora_config.task_type = TaskType.SEQUENCE_CLASSIFICATION
                    model = get_peft_model(model, lora_config)
                    model.print_trainable_parameters()

                # Define Training Arguments. The defaults already carry an
                # "output_dir" key, so the per-run directory is merged into the
                # dict; passing it as a second keyword would raise TypeError.
                training_kwargs = {
                    **DEFAULT_TRAINING_ARGS,
                    "output_dir": os.path.join(model_output_dir, "training_output"),
                    # Example override: "learning_rate": 5e-5
                }
                training_args = TrainingArguments(**training_kwargs)

                # Initialize Trainer
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_datasets["train"],
                    eval_dataset=tokenized_datasets["validation"], # Use validation set for eval during training
                    tokenizer=tokenizer,
                    data_collator=data_collator,
                    compute_metrics=compute_metrics,
                    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)] # Add early stopping
                )

                # Train
                print("Starting training...")
                t0 = time()
                trainer.train()
                exp_train_time = time() - t0
                print(f"Training finished in {exp_train_time:.2f}s")
                trainer.save_model(model_output_dir) # Saves best model/adapters
                print(f"Saved best model/adapters to {model_output_dir}")

                # Evaluate on Test Set
                print("Evaluating on test set...")
                t0 = time()
                eval_result = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
                exp_eval_time = time() - t0
                print(f"Test evaluation finished in {exp_eval_time:.2f}s")
                print(f"Test Results: {eval_result}")

                # Store metrics (key names match compute_metrics output)
                exp_results = {
                    "accuracy": eval_result.get("eval_accuracy", np.nan),
                    "f1_macro": eval_result.get("eval_f1_macro", np.nan),
                    "precision_macro": eval_result.get("eval_precision_macro", np.nan),
                    "recall_macro": eval_result.get("eval_recall_macro", np.nan),
                    "f1_weighted": eval_result.get("eval_f1", np.nan), # Note: key is 'f1' from compute_metrics
                    "precision_weighted": eval_result.get("eval_precision_weighted", np.nan),
                    "recall_weighted": eval_result.get("eval_recall_weighted", np.nan),
                }

            else:
                print(f"Unknown method: {method}. Skipping.")
                continue

            # Append results for this experiment
            result_row = {
                "Domain": domain,
                "Model ID": model_id,
                "Method": method,
                "Short Name": short_name,
                "Accuracy": exp_results.get("accuracy"),
                "F1 (Macro)": exp_results.get("f1_macro"),
                "Precision (Macro)": exp_results.get("precision_macro"),
                "Recall (Macro)": exp_results.get("recall_macro"),
                "F1 (Weighted)": exp_results.get("f1_weighted"),
                "Precision (Weighted)": exp_results.get("precision_weighted"),
                "Recall (Weighted)": exp_results.get("recall_weighted"),
                "Train Time (s)": exp_train_time,
                "Eval Time (s)": exp_eval_time
            }
            all_results_list.append(result_row)

        except Exception as e:
            print(f"!!! ERROR during experiment {short_name} on domain {domain}: {e}")
            import traceback
            traceback.print_exc() # Print detailed error traceback

        finally:
            # Drop the model/trainer references on every path (success, error,
            # skip) so the memory can actually be reclaimed below.
            model = None
            trainer = None
            # Clean up GPU memory
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

# 5. Aggregate and Save Results

In [None]:
print("\n--- Experiment Finished ---")
if not all_results_list:
    print("No results were generated.")
else:
    # Column layout of the summary table
    cols_order = [
        "Domain", "Model ID", "Method", "Short Name",
        "Accuracy",
        "F1 (Macro)", "Precision (Macro)", "Recall (Macro)",
        "F1 (Weighted)", "Precision (Weighted)", "Recall (Weighted)",
        "Train Time (s)", "Eval Time (s)",
    ]
    # reindex puts the columns in the desired order and fills any column that
    # no result row provided with NaN.
    results_df = pd.DataFrame(all_results_list).reindex(columns=cols_order)

    print("\nAggregated Results:")
    print(results_df.to_string()) # Full table, no truncation

    # Persist the summary next to the other results
    results_df.to_csv(RESULTS_CSV_FILE, index=False)
    print(f"\nResults saved to {RESULTS_CSV_FILE}")