In [None]:
# Train each model on the working-professional and student datasets and report its accuracy metrics.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import warnings
import shap
 

# Read the cleaned dataset and discard every row that contains a missing value.
df = pd.read_csv('../data/processed/v1/.csv').dropna()
print(df.columns)

 

# Fit the student preprocessing pipeline and transform the student features.
# NOTE(review): `pipeline_students` and `X_students` are not defined in the
# visible part of this file; presumably they are created elsewhere - confirm.
X_students_processed = pipeline_students.fit_transform(X_students)
 

# Define models to train (Logistic Regression, Random Forest, XGBoost, CatBoost).
#
# Each factory returns a fresh, unfitted estimator, so the two cohorts never
# share a model instance and both cohorts are guaranteed to use the same
# configuration for a given model family.
_MODEL_FACTORIES = {
    "Logistic Regression": lambda: LogisticRegression(),
    "Random Forest": lambda: RandomForestClassifier(),
    # `use_label_encoder` is deliberately not passed: it is deprecated in
    # XGBoost and was previously set for only one of the two cohorts.
    "XGBoost": lambda: XGBClassifier(eval_metric='logloss'),
    "CatBoost": lambda: CatBoostClassifier(verbose=0),
}

# Keys have the form "<model family> (<cohort>)"; the training loop relies on
# the cohort text inside the key to pick the matching data split. The cohort
# is the outer loop so all Working Professionals models come first.
models = {
    f"{family} ({cohort})": build_estimator()
    for cohort in ("Working Professionals", "Students")
    for family, build_estimator in _MODEL_FACTORIES.items()
}

def _evaluation_metrics(y_true, y_pred, y_score):
    """Return the validation metrics for one fitted classifier.

    The scikit-learn metric functions are called with their default
    arguments. `y_score` is the positive-class probability, or None when the
    model cannot produce probabilities; in that case ROC AUC is NaN so that
    every stored metric stays numeric and can be formatted with `:.4f`.
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "ROC AUC": roc_auc_score(y_true, y_score) if y_score is not None else float("nan"),
    }


def _save_confusion_matrix(model, y_true, y_pred, model_name, slug):
    """Plot the confusion matrix for `model` and save it as a PNG."""
    conf_matrix = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=model.classes_, yticklabels=model.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix for {model_name}')
    plt.savefig(f'../reports/figures/ash/confusion_matrix_{slug}.png')
    plt.close()


def _save_feature_importance(model, feature_names, model_name, slug):
    """Plot per-feature importance for `model` and save it as a PNG.

    Uses `feature_importances_` when the model exposes it, otherwise the
    first row of `coef_`. Models with neither attribute are reported and
    skipped.
    """
    if hasattr(model, "feature_importances_"):
        print(f"{model_name} supports feature_importances_")
        importance = model.feature_importances_
        x_label = 'Feature Importance'
    elif hasattr(model, "coef_"):
        print(f"{model_name} supports coef_")
        importance = model.coef_[0]
        x_label = 'Feature Importance (Coefficient Value)'
    else:
        print(f"{model_name} does not support feature importance")
        return

    # Coerce to an array so the fancy indexing below works for any sequence type.
    importance = np.asarray(importance)
    order = np.argsort(importance)

    # NOTE(review): labels are taken positionally from `feature_names`; this
    # assumes the fitted model saw exactly these columns in this order - confirm
    # that preprocessing does not add, drop or reorder columns.
    plt.figure(figsize=(10, 8))
    plt.barh(range(len(order)), importance[order], align='center')
    plt.yticks(range(len(order)), [feature_names[i] for i in order])
    plt.xlabel(x_label)
    plt.title(f'Feature Importance for {model_name}')
    plt.savefig(f'../reports/figures/ash/feature_importance_{slug}.png')
    plt.close()


# Train and evaluate each model for working professionals and students.
# NOTE(review): the train/test splits, `X_wp`, `X_students` and the
# `model_performance` dict are not defined in the visible part of this file;
# presumably they are created elsewhere - confirm.
for model_name, model in models.items():
    # The cohort is encoded in the model name; anything that is not a
    # "Working Professionals" model is trained on the student split.
    if "Working Professionals" in model_name:
        X_train, X_val, y_train, y_val = X_wp_train, X_wp_test, y_wp_train, y_wp_test
        feature_names = X_wp.columns
    else:
        X_train, X_val, y_train, y_val = X_students_train, X_students_test, y_students_train, y_students_test
        feature_names = X_students.columns

    # File-name-friendly form of the model name, shared by every artifact below.
    slug = model_name.replace(' ', '_').lower()

    print(f"\nTraining {model_name}...")

    # Cross-validation on the training data
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{model_name} - Cross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Train the model on the full training split
    model.fit(X_train, y_train)

    # Predict on the validation set; column 1 of predict_proba is used as the
    # score for ROC AUC.
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else None

    # Calculate and store the evaluation metrics
    metrics = _evaluation_metrics(y_val, y_pred, y_proba)
    model_performance[model_name] = metrics

    # Print model performance
    print(
        f"{model_name} - Accuracy: {metrics['Accuracy']:.4f}, "
        f"Precision: {metrics['Precision']:.4f}, "
        f"Recall: {metrics['Recall']:.4f}, "
        f"F1 Score: {metrics['F1 Score']:.4f}, "
        f"ROC AUC: {metrics['ROC AUC']:.4f}"
    )

    # Save the trained model for later evaluation
    with open(f"../models/{slug}_model.pkl", 'wb') as file:
        pickle.dump(model, file)

    # Save the confusion matrix and feature-importance figures
    _save_confusion_matrix(model, y_val, y_pred, model_name, slug)
    _save_feature_importance(model, feature_names, model_name, slug)

    # # SHAP explanations
    # print(f"Generating SHAP explanations for {model_name}...")

    # # Initialize SHAP explainer based on model type
    # if isinstance(model, (RandomForestClassifier, XGBClassifier, CatBoostClassifier)):
    #     explainer = shap.TreeExplainer(model, check_additivity=False)
    #     shap_values = explainer.shap_values(X_val)
    # else:
    #     explainer = shap.LinearExplainer(model, X_train, check_additivity=False)
    #     shap_values = explainer.shap_values(X_val)

    # # Plot SHAP summary plot
    # plt.figure()
    # shap.summary_plot(shap_values, X_val, feature_names=feature_names, show=False)
    # plt.title(f'SHAP Summary Plot for {model_name}')
    # plt.savefig(f'shap_summary_{model_name.replace(" ", "_").lower()}.png')
    # plt.close()


# Summarize and print model performance: one section per model, one line per metric.
print("\nModel Performance Summary:")
for model_name, metrics in model_performance.items():
    print(f"\n{model_name} Performance:")
    for metric, score in metrics.items():
        # A metric stored as a string placeholder (such as "N/A") cannot take
        # a float format spec - that raises ValueError - so print it verbatim
        # and round only numeric scores.
        shown = score if isinstance(score, str) else f"{score:.4f}"
        print(f"{metric}: {shown}")

print("\nAll models trained and saved. Model files are ready for testing on 'test.csv' when available.")

