# In [None]:  (Jupyter cell marker left by the notebook export; commented out so the file parses as Python)
# SMS Scam Detection - Baseline Machine Learning Models
# =====================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, roc_auc_score, average_precision_score,
    confusion_matrix, classification_report, roc_curve, precision_recall_curve
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
import time
import joblib
from tqdm.notebook import tqdm
import optuna
import warnings
import os
import re

# Silence all Python warnings so the notebook output stays readable.
# NOTE: this is a blanket filter; remove it when debugging library behaviour.
warnings.filterwarnings('ignore')

# Set plotting style
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Set up project paths.
# The default is the Google Drive location used on Colab; set the
# SMS_SCAM_PROJECT_DIR environment variable to run the pipeline elsewhere.
project_dir = os.environ.get(
    'SMS_SCAM_PROJECT_DIR', '/content/drive/MyDrive/sms-scam-detection'
)
# All relative paths below are resolved against the project directory.
os.chdir(project_dir)

data_dir = "data/processed/"
model_dir = "models/baseline/"
results_dir = "results/"

# Create output directories (no-op when they already exist)
os.makedirs(model_dir, exist_ok=True)
os.makedirs(os.path.join(results_dir, "metrics"), exist_ok=True)
os.makedirs(os.path.join(results_dir, "visualizations"), exist_ok=True)

print(f"Working directory: {os.getcwd()}")

def _clean_text(text):
    """Normalize one raw message for vectorization.

    Non-string input (for example NaN for a missing message) becomes an
    empty string. Otherwise the text is lower-cased and stripped of URLs,
    e-mail-like tokens, digit runs of 10 or more characters, and non-ASCII
    characters, then whitespace is collapsed.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'\S+@\S+', '', text)                 # e-mail-like tokens
    text = re.sub(r'\b\d{10,}\b', '', text)             # long digit runs (presumably phone numbers)
    text = re.sub(r'[^\x00-\x7F]+', '', text)           # non-ASCII characters
    text = re.sub(r'\s+', ' ', text).strip()            # collapse whitespace
    return text


def load_and_prepare_data():
    """Load the SMS dataset and make sure it has a usable ``cleaned_text`` column.

    The processed file ``sms_dataset_explored.csv`` under ``data_dir`` is
    preferred. If it does not exist, the raw file
    ``data/raw/spam-fraud-sms-dataset.csv`` is loaded instead, its string
    labels are mapped to integers (``ham`` -> 0, ``spam`` -> 1), and
    ``cleaned_text`` is derived from the ``message`` column.

    Returns:
        pandas.DataFrame: the loaded data with a ``cleaned_text`` column that
        contains only strings (no NaN).

    Raises:
        FileNotFoundError: if neither the processed nor the raw file exists.
        ValueError: if the raw file contains string labels other than
            ``ham`` / ``spam``.
    """
    try:
        df = pd.read_csv(os.path.join(data_dir, "sms_dataset_explored.csv"))
        print(f"Loaded processed dataset with {len(df)} rows.")
    except FileNotFoundError:
        df = pd.read_csv(os.path.join("data/raw", "spam-fraud-sms-dataset.csv"))
        print(f"Loaded original dataset with {len(df)} rows. Performing basic preprocessing...")

        # Basic preprocessing: encode string labels as integers.
        if df['label'].dtype == object:
            label_map = {'ham': 0, 'spam': 1}
            mapped = df['label'].map(label_map)
            if mapped.isna().any():
                # Fail with a readable message instead of the opaque
                # NaN-to-int cast error that astype(int) would raise.
                unknown = sorted(df.loc[mapped.isna(), 'label'].astype(str).unique())
                raise ValueError(
                    f"Unexpected label values {unknown}; expected one of {sorted(label_map)}"
                )
            df['label'] = mapped.astype(int)

        # Basic text cleaning
        df['cleaned_text'] = df['message'].apply(_clean_text)

    # Ensure cleaned_text column exists
    if 'cleaned_text' not in df.columns:
        print("Adding cleaned_text column...")
        df['cleaned_text'] = df['message'].apply(_clean_text)
    else:
        # Messages that cleaned down to "" are read back from CSV as NaN,
        # which TfidfVectorizer rejects; restore them to empty strings.
        df['cleaned_text'] = df['cleaned_text'].fillna("")

    return df

def split_data(df, test_size=0.15, val_size=0.176, random_state=42):
    """Create stratified train / validation / test partitions and save them as CSV.

    Args:
        df: DataFrame with a ``label`` column used for stratification.
        test_size: fraction of ``df`` held out as the test set.
        val_size: fraction of the *remaining* rows used for validation
            (with the defaults, 0.176 of the remaining 85% is roughly 15%
            of the full dataset).
        random_state: seed passed to both ``train_test_split`` calls.

    Returns:
        tuple: ``(train_df, val_df, test_df)``.

    Side effects:
        Writes ``train.csv``, ``val.csv`` and ``test.csv`` into ``data_dir``.
    """
    # Step 1: hold out the test portion, preserving label proportions.
    remainder_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['label'],
    )

    # Step 2: carve the validation portion out of what is left.
    train_df, val_df = train_test_split(
        remainder_df,
        test_size=val_size,
        random_state=random_state,
        stratify=remainder_df['label'],
    )

    print("Data split into:")
    for title, part in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
        print(f"{title} set: {len(part)} samples")

    # Persist each partition next to the processed data.
    os.makedirs(data_dir, exist_ok=True)
    for filename, part in (("train.csv", train_df), ("val.csv", val_df), ("test.csv", test_df)):
        part.to_csv(os.path.join(data_dir, filename), index=False)

    return train_df, val_df, test_df

def extract_tfidf_features(train_texts, val_texts, test_texts, max_features=5000):
    """Vectorize the three text collections with a TF-IDF model fitted on the training texts.

    The vectorizer uses unigrams and bigrams, English stop-word removal,
    ``min_df=2`` and ``max_df=0.95``. It is fitted on ``train_texts`` only and
    then applied unchanged to the validation and test texts.

    Args:
        train_texts: iterable of training documents (used for fitting).
        val_texts: iterable of validation documents.
        test_texts: iterable of test documents.
        max_features: cap on the vocabulary size.

    Returns:
        tuple: ``(X_train, X_val, X_test, tfidf_vectorizer)``.
    """
    print(f"Extracting TF-IDF features with max_features={max_features}...")

    vectorizer_options = dict(
        max_features=max_features,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        stop_words='english',
    )
    tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

    # Fit on training data only; validation/test are merely transformed.
    X_train = tfidf_vectorizer.fit_transform(train_texts)
    X_val, X_test = (tfidf_vectorizer.transform(texts) for texts in (val_texts, test_texts))

    print(f"TF-IDF features extracted. Shape: X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}")

    sample_terms = tfidf_vectorizer.get_feature_names_out()[:10]
    print(f"Example features: {sample_terms}...")

    return X_train, X_val, X_test, tfidf_vectorizer

def get_baseline_models(random_seed=42):
    """Build the unfitted baseline estimators, keyed by model name.

    The scikit-learn estimators are configured with
    ``class_weight='balanced'``; the XGBoost classifier is configured
    without any class weighting.

    Args:
        random_seed: seed passed as ``random_state`` to every estimator.

    Returns:
        dict: maps ``'logistic_regression'``, ``'random_forest'``,
        ``'decision_tree'`` and ``'xgboost'`` to estimator instances.
    """
    # Settings shared by both tree-based scikit-learn estimators.
    tree_defaults = dict(
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight='balanced',
        random_state=random_seed,
    )

    logistic_regression = LogisticRegression(
        C=1.0,
        penalty='l2',
        class_weight='balanced',
        solver='liblinear',
        max_iter=1000,
        random_state=random_seed,
    )
    random_forest = RandomForestClassifier(n_estimators=100, **tree_defaults)
    decision_tree = DecisionTreeClassifier(**tree_defaults)
    xgboost_model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=random_seed,
        use_label_encoder=False,
        eval_metric='logloss',
    )

    return {
        'logistic_regression': logistic_regression,
        'random_forest': random_forest,
        'decision_tree': decision_tree,
        'xgboost': xgboost_model,
    }

def optimize_hyperparameters(model_class, X_train, y_train, X_val, y_val, model_name, n_trials=20):
    """Tune a model's hyperparameters with Optuna, maximizing validation MCC.

    Each trial instantiates ``model_class`` with sampled hyperparameters plus
    a set of fixed settings, fits it on the training data and scores its
    predictions on the validation data with the Matthews correlation
    coefficient.

    Args:
        model_class: estimator class to instantiate for each trial.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels used for scoring.
        model_name: one of ``'logistic_regression'``, ``'random_forest'``,
            ``'decision_tree'`` or ``'xgboost'``; selects the search space.
        n_trials: number of Optuna trials to run.

    Returns:
        dict: the best sampled hyperparameters merged with the fixed settings
        used in every trial, so ``model_class(**result)`` recreates exactly
        the configuration that achieved the best score.

    Raises:
        ValueError: if ``model_name`` has no defined search space.
    """
    print(f"Optimizing hyperparameters for {model_name}...")

    # Settings held constant across all trials. They are not part of
    # study.best_params, so they are merged back into the return value.
    fixed_params_by_model = {
        'logistic_regression': {'class_weight': 'balanced', 'max_iter': 1000, 'random_state': 42},
        'random_forest': {'class_weight': 'balanced', 'random_state': 42},
        'decision_tree': {'class_weight': 'balanced', 'random_state': 42},
        'xgboost': {'random_state': 42, 'use_label_encoder': False, 'eval_metric': 'logloss'},
    }
    if model_name not in fixed_params_by_model:
        # Previously this fell through and crashed on an unbound local.
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(fixed_params_by_model)}"
        )
    fixed_params = fixed_params_by_model[model_name]

    def suggest_params(trial):
        """Sample the tunable hyperparameters for ``model_name``."""
        if model_name == 'logistic_regression':
            return {
                'C': trial.suggest_float('C', 0.01, 10, log=True),
                'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
                'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
            }
        if model_name == 'random_forest':
            return {
                'n_estimators': trial.suggest_int('n_estimators', 25, 150),
                'max_depth': trial.suggest_int('max_depth', 5, 20),
                'min_samples_split': trial.suggest_int('min_samples_split', 5, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 10),
            }
        if model_name == 'decision_tree':
            return {
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'min_samples_split': trial.suggest_int('min_samples_split', 8, 25),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 4, 15),
                'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            }
        # model_name == 'xgboost' (validated above)
        return {
            'n_estimators': trial.suggest_int('n_estimators', 25, 150),
            'max_depth': trial.suggest_int('max_depth', 2, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'subsample': trial.suggest_float('subsample', 0.6, 0.9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
            'gamma': trial.suggest_float('gamma', 0, 2),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        }

    def objective(trial):
        """Fit one candidate configuration and return its validation MCC."""
        params = {**suggest_params(trial), **fixed_params}
        model = model_class(**params)
        model.fit(X_train, y_train)
        return matthews_corrcoef(y_val, model.predict(X_val))

    # Seed the sampler so repeated runs explore the same sequence of trials.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=n_trials)

    print(f"Best parameters: {study.best_params}")
    print(f"Best MCC score: {study.best_value:.4f}")

    return {**study.best_params, **fixed_params}

def train_and_evaluate_model(model, model_name, X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit *model*, score it on all three splits, plot its confusion matrix and save it.

    Args:
        model: unfitted estimator exposing ``fit`` and ``predict``.
        model_name: label used in log output, plot titles and file names.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        X_test, y_test: test features and labels.

    Returns:
        dict: timing, accuracy / F1 / MCC for every split, test-only
        precision, recall, ROC AUC, PR AUC and confusion matrix, the test
        predictions and scores, and the fitted model itself.

    Side effects:
        Saves a confusion-matrix PNG under ``results_dir/visualizations`` and
        the fitted model as a ``.pkl`` file under ``model_dir``.
    """
    print(f"\nTraining {model_name}...")

    fit_started = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - fit_started

    print(f"Training time: {train_time:.2f} seconds")

    # Hard predictions for every split.
    y_train_pred, y_val_pred, y_test_pred = (
        model.predict(features) for features in (X_train, X_val, X_test)
    )

    # Continuous scores for the ROC / PR metrics: prefer class-1
    # probabilities, then decision scores, else fall back to the labels.
    if hasattr(model, 'predict_proba'):
        y_test_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_test_prob = model.decision_function(X_test)
    else:
        y_test_prob = y_test_pred

    # Accuracy, F1 and MCC on each split, keyed like "train_acc", "val_f1", ...
    predictions_by_split = {
        'train': (y_train, y_train_pred),
        'val': (y_val, y_val_pred),
        'test': (y_test, y_test_pred),
    }
    scorers = (('acc', accuracy_score), ('f1', f1_score), ('mcc', matthews_corrcoef))
    split_scores = {
        f"{split}_{short}": scorer(y_true, y_pred)
        for short, scorer in scorers
        for split, (y_true, y_pred) in predictions_by_split.items()
    }

    # Metrics reported for the test split only.
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_roc_auc = roc_auc_score(y_test, y_test_prob)
    test_pr_auc = average_precision_score(y_test, y_test_prob)
    test_cm = confusion_matrix(y_test, y_test_pred)

    # Console summary.
    for heading, split in (("Training", 'train'), ("Validation", 'val'), ("Testing", 'test')):
        acc, f1, mcc = (split_scores[f"{split}_{short}"] for short in ('acc', 'f1', 'mcc'))
        print(f"{heading} - Accuracy: {acc:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")
    print(f"Testing - Precision: {test_precision:.4f}, Recall: {test_recall:.4f}")
    print(f"Testing - ROC AUC: {test_roc_auc:.4f}, PR AUC: {test_pr_auc:.4f}")
    print(f"Testing - Confusion Matrix:\n{test_cm}")

    # File-system friendly version of the model name, shared by both outputs.
    file_stem = model_name.lower().replace(' ', '_')

    # Confusion-matrix heatmap.
    class_names = ['Legitimate', 'Scam']
    plt.figure(figsize=(8, 6))
    sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'{model_name} - Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(os.path.join(results_dir, 'visualizations', f"{file_stem}_confusion_matrix.png"))
    plt.show()

    # Persist the fitted model.
    model_filename = os.path.join(model_dir, f"{file_stem}.pkl")
    joblib.dump(model, model_filename)
    print(f"Model saved to {model_filename}")

    results = {'model_name': model_name, 'training_time': train_time}
    results.update(split_scores)
    results.update({
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_roc_auc': test_roc_auc,
        'test_pr_auc': test_pr_auc,
        'test_confusion_matrix': test_cm,
        'y_test_pred': y_test_pred,
        'y_test_prob': y_test_prob,
        'model': model,
    })
    return results

def _plot_metric_comparison(comparison_df):
    """Draw and save a grouped bar chart of the headline test metrics per model."""
    metrics = ['Test F1', 'Test MCC', 'Test ROC AUC', 'Test PR AUC']
    melted_df = pd.melt(comparison_df, id_vars=['Model'], value_vars=metrics, var_name='Metric', value_name='Score')

    plt.figure(figsize=(14, 8))
    sns.barplot(x='Model', y='Score', hue='Metric', data=melted_df)
    plt.title('Model Performance Comparison')
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(results_dir, 'visualizations', "baseline_model_comparison.png"))
    plt.show()


def _plot_roc_curves(all_results, y_test):
    """Draw and save the test-set ROC curves of all models on one figure."""
    plt.figure(figsize=(10, 8))
    for model_name, results in all_results.items():
        fpr, tpr, _ = roc_curve(y_test, results['y_test_prob'])
        plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {results["test_roc_auc"]:.4f})')

    # Diagonal reference line (chance level).
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for All Models')
    plt.legend(loc='lower right')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(results_dir, 'visualizations', "combined_roc_curves.png"))
    plt.show()


def _plot_pr_curves(all_results, y_test):
    """Draw and save the test-set precision-recall curves of all models on one figure."""
    plt.figure(figsize=(10, 8))
    for model_name, results in all_results.items():
        precision, recall, _ = precision_recall_curve(y_test, results['y_test_prob'])
        plt.step(recall, precision, where='post', lw=2, label=f'{model_name} (AP = {results["test_pr_auc"]:.4f})')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves for All Models')
    plt.legend(loc='lower left')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(results_dir, 'visualizations', "combined_pr_curves.png"))
    plt.show()


def _report_best_params(best_params_dict):
    """Print each model's tuned hyperparameters and save them as one CSV table."""
    print("\n===== BEST HYPERPARAMETERS FOR EACH MODEL =====")
    for model_name, params in best_params_dict.items():
        print(f"\n{model_name.upper()} BEST PARAMETERS:")
        for param, value in params.items():
            formatted_value = f"{value:.6f}" if isinstance(value, float) else str(value)
            print(f"  {param}: {formatted_value}")

    # One row per model, one column per parameter seen in any model.
    # Sorting keeps the column order stable between runs (set order is not).
    all_param_names = sorted({name for params in best_params_dict.values() for name in params})
    params_rows = [
        {'Model': model_name, **{name: params.get(name) for name in all_param_names}}
        for model_name, params in best_params_dict.items()
    ]

    params_df = pd.DataFrame(params_rows)
    params_file = os.path.join(results_dir, 'metrics', 'best_hyperparameters.csv')
    params_df.to_csv(params_file, index=False)
    print(f"\nBest parameters saved to {params_file}")


def main():
    """Run the full baseline pipeline.

    Steps: load data, split it, extract TF-IDF features, oversample the
    training set with SMOTE, tune each baseline model with Optuna, train and
    evaluate the tuned models, then write the comparison table, plots and
    best hyperparameters under ``results_dir``.
    """
    # Load and prepare data
    df = load_and_prepare_data()

    # Check class distribution
    class_dist = df['label'].value_counts(normalize=True)
    print("\nClass distribution:")
    print(f"Class 0 (Legitimate): {class_dist.get(0, 0)*100:.2f}%")
    print(f"Class 1 (Spam/Scam): {class_dist.get(1, 0)*100:.2f}%")

    # Split data
    train_df, val_df, test_df = split_data(df)

    # Extract TF-IDF features
    X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer = extract_tfidf_features(
        train_df['cleaned_text'],
        val_df['cleaned_text'],
        test_df['cleaned_text'],
        max_features=5000
    )

    # Persist the fitted vectorizer: the saved models can only score new
    # messages that are transformed with this exact vocabulary.
    vectorizer_filename = os.path.join(model_dir, "tfidf_vectorizer.pkl")
    joblib.dump(tfidf_vectorizer, vectorizer_filename)
    print(f"TF-IDF vectorizer saved to {vectorizer_filename}")

    # Get labels
    y_train = train_df['label'].values
    y_val = val_df['label'].values
    y_test = test_df['label'].values

    # Handle class imbalance using SMOTE (training data only; validation and
    # test sets keep their original distribution)
    print("\nHandling class imbalance using SMOTE...")
    smote = SMOTE(random_state=RANDOM_SEED)
    X_train_tfidf_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

    unique, counts = np.unique(y_train_resampled, return_counts=True)
    print("Class distribution after SMOTE:")
    for label, count in zip(unique, counts):
        print(f"Class {label}: {count} samples ({count/len(y_train_resampled)*100:.2f}%)")

    # Get baseline models
    models = get_baseline_models(random_seed=RANDOM_SEED)

    # Optimize and train models
    optimized_models = {}
    best_params_dict = {}

    for model_name, model in models.items():
        # Optimize hyperparameters
        best_params = optimize_hyperparameters(
            type(model),
            X_train_tfidf_resampled,
            y_train_resampled,
            X_val_tfidf,
            y_val,
            model_name=model_name,
            n_trials=20
        )

        # Apply the tuned values on top of the (unfitted) baseline estimator
        # rather than building a new one from best_params alone. This keeps
        # the settings that were fixed during tuning (class_weight,
        # random_state, max_iter, eval_metric), so the final model matches
        # the configuration that was actually scored.
        optimized_models[model_name] = model.set_params(**best_params)
        best_params_dict[model_name] = best_params

    # Train and evaluate optimized models
    all_results = {}
    for model_name, model in optimized_models.items():
        all_results[model_name] = train_and_evaluate_model(
            model, model_name,
            X_train_tfidf_resampled, y_train_resampled,
            X_val_tfidf, y_val,
            X_test_tfidf, y_test
        )

    # Create comparison DataFrame (report column -> key in the results dict)
    column_to_key = {
        'Training Time (s)': 'training_time',
        'Train Accuracy': 'train_acc',
        'Val Accuracy': 'val_acc',
        'Test Accuracy': 'test_acc',
        'Train F1': 'train_f1',
        'Val F1': 'val_f1',
        'Test F1': 'test_f1',
        'Train MCC': 'train_mcc',
        'Val MCC': 'val_mcc',
        'Test MCC': 'test_mcc',
        'Test Precision': 'test_precision',
        'Test Recall': 'test_recall',
        'Test ROC AUC': 'test_roc_auc',
        'Test PR AUC': 'test_pr_auc',
    }
    comparison_data = [
        {'Model': model_name, **{column: results[key] for column, key in column_to_key.items()}}
        for model_name, results in all_results.items()
    ]

    # NOTE(review): models are ranked by their test-set MCC, so the reported
    # "best" test score is optimistically biased; consider ranking on
    # 'Val MCC' if this ranking is used for model selection.
    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.sort_values('Test MCC', ascending=False)

    # Save comparison results
    comparison_df.to_csv(os.path.join(results_dir, 'metrics', 'baseline_ml_results.csv'), index=False)

    print("\n===== FINAL MODEL COMPARISON =====")
    print(comparison_df)

    # Visualize key metrics and the combined ROC / precision-recall curves
    _plot_metric_comparison(comparison_df)
    _plot_roc_curves(all_results, y_test)
    _plot_pr_curves(all_results, y_test)

    # Identify best performing model (first row after sorting by Test MCC)
    best_model = comparison_df.iloc[0]
    print(f"\nBest performing model: {best_model['Model']}")
    print(f"Test MCC: {best_model['Test MCC']:.4f}")
    print(f"Test F1: {best_model['Test F1']:.4f}")
    print(f"Test ROC AUC: {best_model['Test ROC AUC']:.4f}")

    # Print and save best hyperparameters
    _report_best_params(best_params_dict)

    print("\nBaseline ML models training and evaluation completed successfully!")
    print(f"Results saved to: {results_dir}")

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
