# In [None]:
# SMS Scam Detection - Large Language Model Fine-tuning
# =====================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
import re
import os
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
)

from datasets import Dataset, DatasetDict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, roc_auc_score, average_precision_score,
    confusion_matrix, classification_report, roc_curve, precision_recall_curve
)
import evaluate
import warnings
import optuna
from pathlib import Path

# Global plotting defaults: seaborn "whitegrid" theme and a 12x8 default figure size.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)

# Seed NumPy and PyTorch (CPU, and every CUDA device when available) with the
# same value for reproducibility.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Device used when moving models and inputs: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Debugging aid: set CUDA_LAUNCH_BLOCKING for better CUDA error messages.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Silence FutureWarning messages raised from the transformers package only.
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")

# Project root (a Google Drive path — presumably mounted in Colab; verify).
# NOTE: changing the working directory makes every relative path below, and the
# "results/..." paths used by the functions in this file, resolve under it.
project_dir = '/content/drive/MyDrive/sms-scam-detection'
os.chdir(project_dir)

# Relative locations (under project_dir) for input CSVs, model artifacts and outputs.
data_dir = "data/processed/"
model_dir = "models/llm/"
results_dir = "results/"

# Make sure the output directories exist before anything is written to them.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(os.path.join(results_dir, "metrics"), exist_ok=True)
os.makedirs(os.path.join(results_dir, "visualizations"), exist_ok=True)

# Class-id <-> class-name mappings passed to the classification models;
# class 1 ("Scam") is the positive class.
id2label = {0: "Legitimate", 1: "Scam"}
label2id = {"Legitimate": 0, "Scam": 1}

def load_data():
    """Read the train/val/test CSV splits and wrap them in a ``DatasetDict``.

    The CSVs are read from ``data_dir`` and must contain a ``label`` column
    (0 = legitimate, 1 = scam). Split sizes, per-split class percentages and
    the training-set imbalance ratio are printed as a side effect.

    Returns:
        DatasetDict with the keys ``"train"``, ``"valid"`` and ``"test"``.
    """
    train_df, val_df, test_df = (
        pd.read_csv(os.path.join(data_dir, csv_name))
        for csv_name in ("train.csv", "val.csv", "test.csv")
    )

    print(f"Loaded data: Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")

    # Per-split class percentages.
    print("\nClass Distribution:")
    named_splits = (("Training", train_df), ("Validation", val_df), ("Test", test_df))
    for split_title, frame in named_splits:
        print(f"{split_title} Set:")
        print(frame['label'].value_counts(normalize=True) * 100)

    # Legitimate-to-scam ratio on the training split; infinite when no scam rows exist.
    legit_total = (train_df['label'] == 0).sum()
    scam_total = (train_df['label'] == 1).sum()
    if scam_total > 0:
        imbalance_ratio = legit_total / scam_total
    else:
        imbalance_ratio = float('inf')
    print(f"\nImbalance ratio (legitimate:scam): {imbalance_ratio:.2f}:1")

    # Wrap each pandas frame as a Hugging Face dataset.
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "valid": Dataset.from_pandas(val_df),
        "test": Dataset.from_pandas(test_df)
    })

    print("\nDataset structure:")
    print(dataset_dict)

    return dataset_dict

def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize a batch of messages to a fixed length.

    Args:
        examples: Mapping of column name to values. The ``"cleaned_text"``
            column is used when present, otherwise ``"message"``.
        tokenizer: Callable tokenizer applied to the selected text column.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the selected column.
    """
    if "cleaned_text" in examples:
        source_column = "cleaned_text"
    else:
        source_column = "message"

    tokenizer_options = {
        "max_length": max_length,
        "padding": 'max_length',
        "truncation": True,
    }
    return tokenizer(examples[source_column], **tokenizer_options)

def tokenize_data(dataset_dict, tokenizer, max_length=128):
    """Apply ``preprocess_function`` to every split of ``dataset_dict``.

    Args:
        dataset_dict: Object exposing ``map(function, batched=...)``.
        tokenizer: Tokenizer forwarded to ``preprocess_function``.
        max_length: Fixed sequence length forwarded to ``preprocess_function``.

    Returns:
        The result of the batched ``map`` call.
    """
    def _tokenize_batch(batch):
        # Bind the tokenizer and length so ``map`` only has to pass the batch.
        return preprocess_function(batch, tokenizer, max_length)

    return dataset_dict.map(_tokenize_batch, batched=True)

def compute_metrics(eval_pred):
    """Score evaluation outputs; passed to the Trainer as ``compute_metrics``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where ``predictions``
            holds per-class logits and ``labels`` the true class ids.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``mcc``,
        ``roc_auc`` and ``pr_auc``. Threshold metrics use the argmax class;
        the two AUC metrics use the softmax probability of class 1.
    """
    logits, references = eval_pred

    # Softmax over the class axis; column 1 is the positive ("Scam") class.
    class_probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    scam_scores = class_probs[:, 1]
    hard_predictions = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(references, hard_predictions),
        "precision": precision_score(references, hard_predictions),
        "recall": recall_score(references, hard_predictions),
        "f1": f1_score(references, hard_predictions),
        "mcc": matthews_corrcoef(references, hard_predictions),
        "roc_auc": roc_auc_score(references, scam_scores),
        "pr_auc": average_precision_score(references, scam_scores)
    }

def create_data_collator(tokenizer):
    """Build the ``DataCollatorWithPadding`` used to batch tokenized examples.

    Args:
        tokenizer: Tokenizer the collator uses for padding.

    Returns:
        A ``DataCollatorWithPadding`` bound to ``tokenizer``.
    """
    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return padding_collator

def visualize_training_metrics(metrics, model_name):
    """Plot loss and validation-metric curves from a training log history.

    Figures are saved under ``results/visualizations/<model_name>/`` and shown.

    Args:
        metrics: List of log dicts (``trainer.state.log_history`` in this
            file). Training records carry a ``'loss'`` key; evaluation records
            carry ``'eval_loss'`` plus ``'eval_accuracy'`` / ``'eval_f1'`` /
            ``'eval_mcc'`` when those metrics were computed.
        model_name: Sub-directory name for the saved figures.
    """
    results_dir = Path(f"results/visualizations/{model_name}")
    results_dir.mkdir(parents=True, exist_ok=True)

    train_logs = [entry for entry in metrics if 'loss' in entry]
    eval_logs = [entry for entry in metrics if 'eval_loss' in entry]

    if train_logs and eval_logs:
        train_loss = [entry['loss'] for entry in train_logs]
        eval_loss = [entry['eval_loss'] for entry in eval_logs]

        if all('epoch' in entry for entry in train_logs + eval_logs):
            # Training loss is logged every few steps while evaluation runs
            # once per epoch, so the two series have different lengths. Plot
            # each against its own logged epoch instead of pairing the first
            # N training points with N evaluation points.
            train_x = [entry['epoch'] for entry in train_logs]
            eval_x = [entry['epoch'] for entry in eval_logs]
        else:
            # No epoch information available: fall back to positional pairing.
            min_length = min(len(train_loss), len(eval_loss))
            train_loss = train_loss[:min_length]
            eval_loss = eval_loss[:min_length]
            train_x = eval_x = range(1, min_length + 1)

        plt.figure(figsize=(12, 6))
        plt.plot(train_x, train_loss, label='Training Loss', marker='o')
        plt.plot(eval_x, eval_loss, label='Validation Loss', marker='o')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title('Training Loss vs Validation Loss')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(results_dir / "loss_curve.png")
        plt.show()

    # One curve per validation metric. The metric is looked up in every
    # evaluation record rather than only in metrics[0]: the first record of a
    # log history is normally a training-loss entry, so gating on it would
    # suppress all of these plots.
    curve_specs = (
        ('eval_accuracy', 'Accuracy', 'Validation Accuracy', 'blue', "accuracy_curve.png"),
        ('eval_f1', 'F1 Score', 'Validation F1 Score', 'green', "f1_curve.png"),
        ('eval_mcc', 'MCC', 'Validation MCC', 'purple', "mcc_curve.png"),
    )
    for metric_key, legend_label, title, color, filename in curve_specs:
        values = [entry[metric_key] for entry in eval_logs
                  if entry.get(metric_key) is not None]
        if not values:
            continue

        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(values) + 1), values, label=legend_label, marker='o', color=color)
        plt.xlabel('Epochs')
        plt.ylabel('Score')
        plt.title(title)
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(results_dir / filename)
        plt.show()

def plot_confusion_matrix(cm, classes, model_name):
    """Draw, save and show a confusion-matrix heatmap.

    Args:
        cm: Confusion matrix of integer counts (annotated with ``fmt="d"``).
        classes: Class names used for both axes' tick labels.
        model_name: Sub-directory of ``results/visualizations`` to save into.
    """
    output_dir = Path(f"results/visualizations/{model_name}")
    output_dir.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_xlabel("Predicted Labels")
    ax.set_ylabel("True Labels")
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    fig.savefig(output_dir / "confusion_matrix.png")
    plt.show()

def plot_roc_curve(fpr, tpr, roc_auc, model_name):
    """Draw, save and show a ROC curve with a dashed diagonal reference line.

    Args:
        fpr: False-positive rates (x values).
        tpr: True-positive rates (y values).
        roc_auc: Area under the curve, shown in the legend.
        model_name: Sub-directory of ``results/visualizations`` to save into.
    """
    output_dir = Path(f"results/visualizations/{model_name}")
    output_dir.mkdir(parents=True, exist_ok=True)

    curve_label = f'ROC Curve (AUC = {roc_auc:.3f})'

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(fpr, tpr, color='blue', lw=2, label=curve_label)
    # Diagonal from (0, 0) to (1, 1) for visual reference.
    ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("Receiver Operating Characteristic (ROC) Curve")
    ax.legend(loc="lower right")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(output_dir / "roc_curve.png")
    plt.show()

def plot_pr_curve(precision, recall, pr_auc, model_name):
    """Draw, save and show a precision-recall step curve with a shaded area.

    Args:
        precision: Precision values (y values).
        recall: Recall values (x values).
        pr_auc: Area/average-precision score shown in the legend.
        model_name: Sub-directory of ``results/visualizations`` to save into.
    """
    output_dir = Path(f"results/visualizations/{model_name}")
    output_dir.mkdir(parents=True, exist_ok=True)

    curve_label = f'PR Curve (AUC = {pr_auc:.3f})'

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.step(recall, precision, color='green', lw=2, where='post', label=curve_label)
    ax.fill_between(recall, precision, alpha=0.2, color='green', step='post')
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title("Precision-Recall Curve")
    ax.legend(loc="lower left")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(output_dir / "pr_curve.png")
    plt.show()

def train_model(model, tokenizer, tokenized_data, training_args, model_name):
    """Fine-tune ``model`` and evaluate it on the validation split.

    The trained model is saved to ``results/<model_name>/final_model``, the
    training curves are plotted, and timing plus evaluation results are
    written to ``results/<model_name>/training_metrics.json``.

    Args:
        model: Model to train.
        tokenizer: Tokenizer handed to the Trainer and the data collator.
        tokenized_data: Mapping with ``"train"`` and ``"valid"`` splits.
        training_args: ``TrainingArguments`` for the run.
        model_name: Run name used for output paths and log messages.

    Returns:
        The ``Trainer`` instance after training.
    """
    print(f"\n=== Training {model_name} ===")

    run_dir = Path(f"results/{model_name}")
    run_dir.mkdir(parents=True, exist_ok=True)

    trainer_options = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["valid"],
        tokenizer=tokenizer,
        data_collator=create_data_collator(tokenizer),
        compute_metrics=compute_metrics,
    )
    trainer = Trainer(**trainer_options)

    # Wall-clock timing of the training call only.
    started_at = time.time()
    train_output = trainer.train()
    elapsed_seconds = time.time() - started_at
    elapsed_minutes = round(elapsed_seconds / 60, 2)

    print(f"\nTraining completed in {elapsed_minutes} minutes")
    print(f"Training Loss: {train_output.training_loss:.4f}")

    validation_scores = trainer.evaluate()
    print("\nValidation Results:")
    for metric_name in validation_scores:
        print(f"{metric_name}: {validation_scores[metric_name]:.4f}")

    final_model_dir = run_dir / "final_model"
    trainer.save_model(str(final_model_dir))
    print(f"Model saved to {final_model_dir}")

    visualize_training_metrics(trainer.state.log_history, model_name)

    summary = {
        "training_time_seconds": elapsed_seconds,
        "training_time_minutes": elapsed_minutes,
        "final_train_loss": train_output.training_loss,
        "eval_results": validation_scores
    }
    with open(run_dir / "training_metrics.json", "w") as summary_file:
        json.dump(summary, summary_file, indent=4)

    return trainer

def evaluate_model(trainer, tokenized_test_data, model_name):
    """Score a trained model on the test split and persist the results.

    Writes ``test_metrics.csv`` and ``classification_report.txt`` under
    ``results/<model_name>/`` and plots the confusion matrix, ROC curve and
    precision-recall curve.

    Args:
        trainer: Trained ``Trainer`` whose ``predict`` method is called.
        tokenized_test_data: Tokenized test split.
        model_name: Run name used for output paths and log messages.

    Returns:
        dict with the scalar metrics (``accuracy``, ``precision``, ``recall``,
        ``f1``, ``mcc``, ``roc_auc``, ``pr_auc``) plus ``confusion_matrix``,
        ``predicted_classes`` and the per-class ``probabilities``.
    """
    print(f"\n=== Evaluating {model_name} on Test Set ===")

    prediction_output = trainer.predict(tokenized_test_data)
    logits = prediction_output.predictions
    y_true = prediction_output.label_ids

    # Softmax over the class axis; column 1 is the positive ("Scam") class.
    class_probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    scam_scores = class_probs[:, 1]
    y_pred = np.argmax(logits, axis=1)

    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "mcc": matthews_corrcoef(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, scam_scores),
        "pr_auc": average_precision_score(y_true, scam_scores),
    }
    cm = confusion_matrix(y_true, y_pred)

    print("\nTest Metrics:")
    printed_names = (
        ("Accuracy", "accuracy"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1 Score", "f1"),
        ("Matthews Correlation Coefficient", "mcc"),
        ("ROC AUC", "roc_auc"),
        ("PR AUC", "pr_auc"),
    )
    for shown_name, score_key in printed_names:
        print(f"{shown_name}: {scores[score_key]:.4f}")

    print("\nClassification Report:")
    class_names = list(id2label.values())
    report = classification_report(y_true, y_pred, target_names=class_names)
    print(report)

    run_dir = Path(f"results/{model_name}")
    run_dir.mkdir(parents=True, exist_ok=True)

    # Single-row table so results from several runs can be concatenated later.
    metrics_row = {
        'Model': model_name,
        'Accuracy': scores["accuracy"],
        'Precision': scores["precision"],
        'Recall': scores["recall"],
        'F1': scores["f1"],
        'MCC': scores["mcc"],
        'ROC_AUC': scores["roc_auc"],
        'PR_AUC': scores["pr_auc"]
    }
    pd.DataFrame([metrics_row]).to_csv(run_dir / "test_metrics.csv", index=False)

    with open(run_dir / "classification_report.txt", "w") as report_file:
        report_file.write(report)

    plot_confusion_matrix(cm, class_names, model_name)

    fpr, tpr, _ = roc_curve(y_true, scam_scores)
    plot_roc_curve(fpr, tpr, scores["roc_auc"], model_name)

    precision_points, recall_points, _ = precision_recall_curve(y_true, scam_scores)
    plot_pr_curve(precision_points, recall_points, scores["pr_auc"], model_name)

    return {
        **scores,
        "confusion_matrix": cm,
        "predicted_classes": y_pred,
        "probabilities": class_probs
    }

def predict_examples(model, tokenizer, examples, model_name):
    """Classify each message in ``examples`` and print the predictions.

    Args:
        model: Sequence-classification model; switched to eval mode here.
        tokenizer: Tokenizer used to encode each message (max 128 tokens).
        examples: Iterable of message strings, classified one at a time.
        model_name: Name shown in the printed header.

    Returns:
        DataFrame with one row per message and the columns ``text``,
        ``predicted_class`` and ``confidence`` (probability of the predicted
        class).
    """
    model.eval()
    records = []

    for message in examples:
        encoded = tokenizer(
            message,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(device)

        with torch.no_grad():
            logits = model(**encoded).logits

        class_probs = torch.nn.functional.softmax(logits, dim=-1)
        top_class = torch.argmax(class_probs, dim=1).item()

        records.append({
            "text": message,
            "predicted_class": id2label[top_class],
            "confidence": class_probs[0][top_class].item()
        })

    results_df = pd.DataFrame(records)

    print(f"\n=== Example Predictions with {model_name} ===")
    for position, record in enumerate(records, start=1):
        print(f"\nExample {position}:")
        print(f"Text: {record['text']}")
        print(f"Prediction: {record['predicted_class']}")
        print(f"Confidence: {record['confidence']:.4f}")

    return results_df

def _build_lora_config(family):
    """Return the LoRA configuration used for ``family``.

    The adapters target the ``query``/``value`` modules for ``"roberta"`` and
    the ``q_lin``/``v_lin`` modules for any other family.
    """
    return LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query", "value"] if family == "roberta" else ["q_lin", "v_lin"]
    )


def main():
    """Run the full fine-tuning comparison pipeline.

    Steps:
        1. Load the CSV splits and tokenize them once per model family.
        2. For each family/approach pair (frozen encoder, full fine-tuning,
           LoRA): train, evaluate on the test split and collect the metrics.
        3. Save and plot a comparison of all runs, sorted by MCC.
        4. Reload the best run's saved model and classify example messages.
        5. If earlier result CSVs exist under ``results/metrics``, merge them
           with the LLM results into a final comparison table and plot.
    """
    # Load data
    dataset_dict = load_data()

    # Define model families
    model_families = {
        "roberta": {
            "model_name": "roberta-base",
            "tokenizer": AutoTokenizer.from_pretrained("roberta-base")
        },
        "distilbert": {
            "model_name": "distilbert-base-uncased",
            "tokenizer": AutoTokenizer.from_pretrained("distilbert-base-uncased")
        }
    }

    # Tokenize datasets once per model family (each family has its own tokenizer)
    tokenized_datasets = {}
    for family, config in model_families.items():
        print(f"\nTokenizing datasets for {family}...")
        tokenized_datasets[family] = tokenize_data(dataset_dict, config["tokenizer"], max_length=128)

    # Each entry is "<family>_<approach>"
    fine_tuning_techniques = [
        "roberta_frozen",
        "roberta_full",
        "roberta_lora",
        "distilbert_frozen",
        "distilbert_full",
        "distilbert_lora"
    ]

    all_results = {}

    # Process each fine-tuning technique
    for technique in fine_tuning_techniques:
        print(f"\n{'='*50}")
        print(f"Processing {technique}")
        print(f"{'='*50}")

        family, approach = technique.split("_")

        model_name = model_families[family]["model_name"]
        tokenizer = model_families[family]["tokenizer"]
        tokenized_data = tokenized_datasets[family]

        # Load a fresh base model for every run
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            id2label=id2label,
            label2id=label2id
        ).to(device)

        # Apply fine-tuning technique
        if approach == "frozen":
            # Train only parameters whose name contains "classifier"
            print("Freezing base model parameters...")
            for name, param in model.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False

        elif approach == "lora":
            print("Applying LoRA configuration...")
            model = get_peft_model(model, _build_lora_config(family))
            model.print_trainable_parameters()

        # Full fine-tuning doesn't require any parameter modifications

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=f"results/{technique}/checkpoints",
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            warmup_steps=100,
            weight_decay=0.01,
            logging_dir=f"results/{technique}/logs",
            logging_steps=50,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="mcc",
            greater_is_better=True,
            push_to_hub=False,
            dataloader_pin_memory=False,
            # Let the Trainer drop dataset columns the model's forward() does
            # not accept. The tokenized splits still carry the raw text
            # column(s); if they reach DataCollatorWithPadding it tries to turn
            # strings into tensors and batching fails.
            remove_unused_columns=True
        )

        # Train the model
        trainer = train_model(model, tokenizer, tokenized_data, training_args, technique)

        # Evaluate on test set
        test_results = evaluate_model(trainer, tokenized_data["test"], technique)

        # Store results
        all_results[technique] = test_results

        # Drop references and release cached GPU memory before the next run
        del model
        del trainer
        torch.cuda.empty_cache()

    # Compile comparison results (scalar metrics only)
    comparison_results = [
        {
            "Model": technique,
            "Accuracy": results["accuracy"],
            "Precision": results["precision"],
            "Recall": results["recall"],
            "F1": results["f1"],
            "MCC": results["mcc"],
            "ROC_AUC": results["roc_auc"],
            "PR_AUC": results["pr_auc"]
        }
        for technique, results in all_results.items()
    ]

    comparison_df = pd.DataFrame(comparison_results)
    comparison_df = comparison_df.sort_values("MCC", ascending=False)

    # Save comparison results
    comparison_df.to_csv("results/metrics/llm_comparison_results.csv", index=False)

    print("\n" + "="*60)
    print("FINAL COMPARISON OF ALL LLM APPROACHES")
    print("="*60)
    print(comparison_df.to_string(index=False))

    # Visualize comparison: one bar chart per metric
    metrics_to_plot = ["Accuracy", "Precision", "Recall", "F1", "MCC", "ROC_AUC", "PR_AUC"]

    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    axes = axes.flatten()

    for i, metric in enumerate(metrics_to_plot):
        sns.barplot(data=comparison_df, x="Model", y=metric, ax=axes[i])
        axes[i].set_title(f"{metric} Comparison")
        axes[i].tick_params(axis='x', rotation=45)
        axes[i].grid(True, alpha=0.3)

    # Hide the last subplot since we have 7 metrics and 8 subplots
    axes[-1].set_visible(False)

    plt.tight_layout()
    plt.savefig("results/visualizations/llm_comparison_all_metrics.png", dpi=300, bbox_inches='tight')
    plt.show()

    # Focus on key metrics
    key_metrics = ["F1", "MCC", "ROC_AUC", "PR_AUC"]

    plt.figure(figsize=(14, 8))

    melted_df = comparison_df.melt(
        id_vars=["Model"],
        value_vars=key_metrics,
        var_name="Metric",
        value_name="Score"
    )

    sns.barplot(data=melted_df, x="Model", y="Score", hue="Metric")
    plt.title("Key Metrics Comparison Across LLM Approaches")
    plt.xticks(rotation=45, ha='right')
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig("results/visualizations/llm_key_metrics_comparison.png", dpi=300, bbox_inches='tight')
    plt.show()

    # Best model analysis: comparison_df is sorted by MCC, so row 0 is the best run
    best_model = comparison_df.iloc[0]
    print(f"\nBEST PERFORMING MODEL: {best_model['Model']}")
    print(f"   MCC: {best_model['MCC']:.4f}")
    print(f"   F1 Score: {best_model['F1']:.4f}")
    print(f"   ROC AUC: {best_model['ROC_AUC']:.4f}")
    print(f"   PR AUC: {best_model['PR_AUC']:.4f}")

    # Test predictions on example messages
    example_messages = [
        "Congratulations! You've won a ugx100000 gift card. Click here to claim: www.example.com",
        "Your account has been suspended. Please verify your identity by sending your PIN to this number.",
        "Hi, just checking if we're still meeting for lunch tomorrow at 12?",
        "URGENT: Your payment of ugx55000 has been processed. If this was not you, call immediately.",
        "Your package will be delivered tomorrow between 10am and 2pm. No signature required.",
        "You have been selected to win ugx500000! Call now to claim your prize before it expires!",
        "Meeting postponed to next Wednesday at 3pm. Please confirm if you can attend."
    ]

    # Load the best model for predictions
    best_technique = best_model['Model']
    family, approach = best_technique.split("_")
    model_name = model_families[family]["model_name"]
    tokenizer = model_families[family]["tokenizer"]

    # Load the saved best model
    best_model_path = f"results/{best_technique}/final_model"
    if approach == "lora":
        # For LoRA models, rebuild the base model with the same LoRA
        # configuration and then load the saved adapter weights into it
        base_model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            id2label=id2label,
            label2id=label2id
        )
        model = get_peft_model(base_model, _build_lora_config(family))
        model.load_adapter(best_model_path, "default")
    else:
        model = AutoModelForSequenceClassification.from_pretrained(best_model_path)

    model.to(device)

    # Make predictions
    example_predictions = predict_examples(model, tokenizer, example_messages, best_technique)

    # Save example predictions
    example_predictions.to_csv(f"results/{best_technique}/example_predictions.csv", index=False)

    # Load and combine with previous results if available
    try:
        baseline_results = pd.read_csv("results/metrics/baseline_ml_results.csv")
        dl_results = pd.read_csv("results/metrics/all_models_comparison.csv")

        # Combine all results; concat keeps the union of the columns and fills
        # values missing from a given table with NaN
        final_comparison = pd.concat([baseline_results, dl_results, comparison_df], ignore_index=True)

        # Sort by MCC
        final_comparison = final_comparison.sort_values("MCC", ascending=False)

        # Save final comparison
        final_comparison.to_csv("results/metrics/final_all_models_comparison.csv", index=False)

        print("\n" + "="*80)
        print("FINAL COMPARISON OF ALL MODELS (ML + DL + LLM)")
        print("="*80)
        key_columns = ["Model", "MCC", "F1", "ROC_AUC", "PR_AUC"]
        available_columns = [col for col in key_columns if col in final_comparison.columns]
        print(final_comparison[available_columns].to_string(index=False))

        # Focus on models with complete data for key metrics
        complete_data = final_comparison.dropna(subset=["MCC", "F1"])

        if len(complete_data) > 0:
            # Create the figure only when there is something to draw
            plt.figure(figsize=(16, 10))

            key_metrics = ["F1", "MCC", "ROC_AUC", "PR_AUC"]
            available_metrics = [col for col in key_metrics if col in complete_data.columns]

            melted_final = complete_data.melt(
                id_vars=["Model"],
                value_vars=available_metrics,
                var_name="Metric",
                value_name="Score"
            )

            sns.barplot(data=melted_final, x="Model", y="Score", hue="Metric")
            plt.title("Final Performance Comparison: All Model Types")
            plt.xticks(rotation=45, ha='right')
            plt.ylim(0, 1)
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.savefig("results/visualizations/final_all_models_comparison.png", dpi=300, bbox_inches='tight')
            plt.show()

            # Print top 5 models (complete_data is already sorted by MCC)
            print("\n TOP 5 MODELS BY MCC:")
            top_5 = complete_data.head(5)
            for i, (_, row) in enumerate(top_5.iterrows(), 1):
                print(f"{i}. {row['Model']}: MCC = {row['MCC']:.4f}, F1 = {row['F1']:.4f}")

    except FileNotFoundError as e:
        # Earlier-stage result files are optional; fall back to LLM results only
        print(f"\nNote: Could not load previous results for comparison: {e}")
        print("Showing LLM results only.")

    print("\nLLM fine-tuning completed successfully!")
    print("Results saved to: results/")
    print(f"Best model: {best_technique}")

if __name__ == "__main__":
    main()