# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import optuna

# Load the dataset; loan_id is used as the row index, so it is not a feature column
loan_original = pd.read_csv('../data/loan_approval_dataset.csv', index_col='loan_id')

# Strip stray whitespace from the column names and from every text cell.
# DataFrame.applymap is deprecated in recent pandas releases, so the
# element-wise strip is done per column with Series.map, which behaves the
# same way on every pandas version. Non-string cells are returned unchanged.
loan_original.columns = loan_original.columns.str.strip()
loan_original = loan_original.apply(
    lambda column: column.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
)

# Convert loan_status to binary (any label other than these two becomes NaN)
loan_original['loan_status'] = loan_original['loan_status'].map({'Approved': 1, 'Rejected': 0})

# Data Exploration
print(loan_original.head())
# info() writes its report to stdout itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
loan_original.info()
print(loan_original.describe(include='all'))

# Check for NaN values
print("NaN values:")
print(loan_original.isnull().sum())

# Analyze loan status distribution
print("\nLoan Status Distribution:")
print(loan_original['loan_status'].value_counts(normalize=True))

# Data Preparation and Feature Engineering
# Convert remaining categorical variables to numeric.
# Both spellings of the non-graduate label are accepted: a label that is
# missing from the mapping becomes NaN and would then be overwritten by the
# column median further down, silently erasing that category.
loan_original['education'] = loan_original['education'].map(
    {'Graduate': 1, 'Not Graduate': 0, 'NotGraduate': 0}
)
loan_original['self_employed'] = loan_original['self_employed'].map({'Yes': 1, 'No': 0})

# Log transform for highly skewed numeric columns.
# np.log1p yields NaN for inputs below -1 and -inf at exactly -1; both are
# handled by the inf/NaN clean-up at the end of this section.
numeric_columns = ['income_annum', 'loan_amount', 'residential_assets_value',
                   'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']
for col in numeric_columns:
    loan_original[f'{col}_log'] = np.log1p(loan_original[col])

# Feature engineering: ratios and aggregates derived from the raw columns.
# Divisions by zero produce +/-inf, which is cleaned up below.
loan_original['loan_to_income_ratio'] = loan_original['loan_amount'] / loan_original['income_annum']
loan_original['emi'] = loan_original['loan_amount'] / (loan_original['loan_term'] * 12)
loan_original['total_assets'] = (loan_original['residential_assets_value'] +
                                 loan_original['commercial_assets_value'] +
                                 loan_original['luxury_assets_value'] +
                                 loan_original['bank_asset_value'])
loan_original['total_assets_log'] = np.log1p(loan_original['total_assets'])
loan_original['loan_to_assets_ratio'] = loan_original['loan_amount'] / loan_original['total_assets']
loan_original['income_per_dependent'] = loan_original['income_annum'] / (loan_original['no_of_dependents'] + 1)
loan_original['balance_income'] = loan_original['income_annum'] - (loan_original['emi'] * 12)

# Replace infinity values with NaN and then fill NaN with each column's median.
# numeric_only=True keeps the fill working even if a text column is still
# present (DataFrame.median raises on non-numeric columns in recent pandas).
# NOTE(review): this also fills missing loan_status values and uses statistics
# from the full dataset (before the train/test split) - confirm that is intended.
loan_original = loan_original.replace([np.inf, -np.inf], np.nan)
loan_original = loan_original.fillna(loan_original.median(numeric_only=True))

# Model Exploration
# Every column except the target is used as a candidate feature.
X = loan_original.drop('loan_status', axis=1)
y = loan_original['loan_status']

# Hold out 20% for the final evaluation. The split is stratified on the target
# so train and test keep the same class balance, consistent with the
# StratifiedKFold used when scoring the tuning trials.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features: the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
def objective_logistic(trial):
    """Optuna objective for logistic regression with model-based feature selection.

    Samples the regularisation settings and a feature-selection threshold,
    fits a SelectFromModel on the scaled training data, and scores the same
    estimator configuration on the retained columns with evaluate_model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The score returned by evaluate_model, or 0 when the threshold removes
        every feature.
    """
    model_params = {
        'C': trial.suggest_float('C', 1e-5, 1e5, log=True),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 1000, 5000),
        'tol': trial.suggest_float('tol', 1e-5, 1e-3, log=True)
    }
    threshold = trial.suggest_float('feature_selection_threshold', 0.01, 0.5)

    # liblinear and saga shuffle the data internally; a fixed seed makes a
    # trial's selected features and score reproducible for given parameters.
    model = LogisticRegression(**model_params, random_state=42)
    selector = SelectFromModel(estimator=model, threshold=threshold)
    # NOTE(review): the selector is fitted on the whole training set before
    # cross-validation, so the CV folds are scored on features chosen with
    # their own validation rows; the resulting score may be optimistic.
    selector.fit(X_train_scaled, y_train)

    # Nothing survived the threshold: report the worst score instead of failing.
    if not selector.get_support().any():
        return 0

    X_selected = selector.transform(X_train_scaled)
    return evaluate_model(model, X_selected, y_train)

def objective_random_forest(trial):
    """Optuna objective for a random forest with model-based feature selection.

    Samples the forest hyperparameters and a feature-selection threshold,
    fits a SelectFromModel on the scaled training data, and scores the same
    estimator configuration on the retained columns with evaluate_model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The score returned by evaluate_model, or 0 when the threshold removes
        every feature.
    """
    model_params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20)
    }
    threshold = trial.suggest_float('feature_selection_threshold', 0.01, 0.5)

    # An unseeded forest gives different importances on every fit, so the
    # features kept by the threshold (and therefore the score) would change
    # between runs of the same parameters. A fixed seed removes that noise.
    model = RandomForestClassifier(**model_params, random_state=42)
    selector = SelectFromModel(estimator=model, threshold=threshold)
    # NOTE(review): the selector is fitted on the whole training set before
    # cross-validation, so the CV folds are scored on features chosen with
    # their own validation rows; the resulting score may be optimistic.
    selector.fit(X_train_scaled, y_train)

    # Nothing survived the threshold: report the worst score instead of failing.
    if not selector.get_support().any():
        return 0

    X_selected = selector.transform(X_train_scaled)
    return evaluate_model(model, X_selected, y_train)

def objective_xgboost(trial):
    """Optuna objective for XGBoost with model-based feature selection.

    Samples the booster hyperparameters and a feature-selection threshold,
    fits a SelectFromModel on the scaled training data, and scores the same
    estimator configuration on the retained columns with evaluate_model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The score returned by evaluate_model, or 0 when the threshold removes
        every feature.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # requested from the trial in the order they are listed here.
    search_space = dict(
        max_depth=trial.suggest_int('max_depth', 1, 9),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
    )
    importance_cutoff = trial.suggest_float('feature_selection_threshold', 0.01, 0.5)

    booster = XGBClassifier(**search_space)
    feature_filter = SelectFromModel(estimator=booster, threshold=importance_cutoff)
    feature_filter.fit(X_train_scaled, y_train)

    kept_mask = feature_filter.get_support()
    if kept_mask.sum() == 0:
        # The threshold discarded every column; there is nothing to score.
        return 0

    return evaluate_model(booster, feature_filter.transform(X_train_scaled), y_train)

def evaluate_model(model, X, y):
    """Return the mean 5-fold stratified cross-validated accuracy of ``model``.

    Args:
        model: Unfitted estimator handed to cross_val_score.
        X: 2-D feature matrix; only ``X.shape[1]`` is inspected here.
        y: Target labels aligned with the rows of ``X``.

    Returns:
        The mean accuracy over the folds, or 0 when ``X`` has no columns,
        when cross-validation raises ValueError, or when the mean is NaN
        (for example because some folds failed to fit).
    """
    import warnings

    if X.shape[1] == 0:
        return 0

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    except ValueError as exc:
        # Keep the best-effort "score 0" behaviour, but make the failure
        # visible instead of swallowing it silently.
        warnings.warn(
            f"Cross-validation failed; scoring this configuration as 0: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0

    mean_score = scores.mean()
    # A NaN mean is not a usable objective value; treat it like the other
    # failure paths in this function.
    if np.isnan(mean_score):
        return 0
    return mean_score
# Run separate studies
def _tune(objective, n_trials=30):
    """Create a maximising Optuna study, run ``objective`` on it and return it."""
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study


# One independent study per model family, run in the same order as before.
study_logistic = _tune(objective_logistic)
study_rf = _tune(objective_random_forest)
study_xgb = _tune(objective_xgboost)
def plot_feature_importances(model, feature_names, title):
    """Draw a bar chart of a fitted model's feature importances, largest first.

    Uses the absolute values of ``model.coef_[0]`` when the model exposes
    ``coef_``, otherwise ``model.feature_importances_``. Returns without
    plotting when the model has neither attribute.

    Args:
        model: Fitted estimator.
        feature_names: Indexable collection of names, positionally aligned
            with the model's importances.
        title: Title of the figure.
    """
    has_coefficients = hasattr(model, 'coef_')
    if not has_coefficients and not hasattr(model, 'feature_importances_'):
        return

    weights = np.abs(model.coef_[0]) if has_coefficients else model.feature_importances_

    # Positions of the features sorted by descending importance.
    order = np.argsort(weights)[::-1]
    bar_positions = range(len(weights))

    plt.figure(figsize=(12, 8))
    plt.title(title)
    plt.bar(bar_positions, weights[order])
    plt.xticks(bar_positions, [feature_names[pos] for pos in order], rotation=90)
    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(y_true, y_pred, title):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Title of the figure.
    """
    counts = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(10, 8))
    # The heatmap is drawn on the current axes of the figure created above;
    # the returned axes object is used for the title and axis labels.
    axes = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    axes.set_title(title)
    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')
    plt.show()

# Get best models and selected features
def _refit_best(study, estimator_cls, **fixed_params):
    """Rebuild the best configuration of a study and redo its feature selection.

    Args:
        study: Finished Optuna study whose best_params contain the estimator
            hyperparameters plus 'feature_selection_threshold'.
        estimator_cls: Estimator class to instantiate with the tuned parameters.
        **fixed_params: Extra constructor arguments that were not tuned
            (for example a random seed).

    Returns:
        Tuple of (tuned parameters without the threshold, unfitted estimator,
        fitted selector, selected training matrix, selected test matrix).
    """
    tuned_params = {k: v for k, v in study.best_params.items() if k != 'feature_selection_threshold'}
    estimator = estimator_cls(**tuned_params, **fixed_params)
    selector = SelectFromModel(estimator, threshold=study.best_params['feature_selection_threshold'])
    # The selector is fitted on the training rows only; the test rows are
    # merely reduced to the same columns.
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_test_selected = selector.transform(X_test_scaled)
    return tuned_params, estimator, selector, X_train_selected, X_test_selected


# Logistic regression and the forest are seeded so the retained features and
# the final metrics are reproducible from one run to the next.
(best_logistic_params, best_logistic, selector_logistic,
 X_train_logistic, X_test_logistic) = _refit_best(study_logistic, LogisticRegression, random_state=42)

(best_rf_params, best_rf, selector_rf,
 X_train_rf, X_test_rf) = _refit_best(study_rf, RandomForestClassifier, random_state=42)

(best_xgb_params, best_xgb, selector_xgb,
 X_train_xgb, X_test_xgb) = _refit_best(study_xgb, XGBClassifier)

# Train and evaluate best models
# Each entry bundles the estimator, its selected train/test matrices and the
# selector that produced them, keyed by the display name of the model.
models = dict([
    ('Logistic Regression', (best_logistic, X_train_logistic, X_test_logistic, selector_logistic)),
    ('Random Forest', (best_rf, X_train_rf, X_test_rf, selector_rf)),
    ('XGBoost', (best_xgb, X_train_xgb, X_test_xgb, selector_xgb)),
])

for name in models:
    model, X_train_selected, X_test_selected, selector = models[name]

    # Fit on the selected training columns, then predict the held-out rows.
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)

    print(f"\nBest {name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred, f'Confusion Matrix - {name}')
    # Only the columns kept by this model's selector have an importance value.
    plot_feature_importances(model, X.columns[selector.get_support()], f"Feature Importances - {name}")

# Visualization of optimization process
def plot_optimization_history(study, title):
    """Plot the objective value of every trial that produced a value.

    Trials without a value are left out, and each remaining point is drawn at
    its own trial number so gaps left by such trials stay visible. The best
    value of the study is marked with a dashed horizontal line.

    Args:
        study: Optuna study to visualise.
        title: Title of the figure.
    """
    # Pair every usable value with the number of the trial that produced it.
    completed = [(t.number, t.value) for t in study.trials if t.value is not None]
    if not completed:
        # study.best_value raises when no trial has completed, so there is
        # nothing meaningful to draw.
        print(f"No completed trials to plot for: {title}")
        return

    trial_numbers, values = zip(*completed)

    plt.figure(figsize=(10, 6))

    # Plot the optimization history
    plt.plot(trial_numbers, values, marker='o')

    # Plot the best value as a horizontal line
    best_value = study.best_value
    plt.axhline(y=best_value, color='r', linestyle='--', label='Best value')

    plt.xlabel('Trial number')
    plt.ylabel('Objective value')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def plot_overall_feature_importance(models, X):
    """Plot the feature importance averaged over all fitted models.

    For each model the importances (absolute ``coef_[0]`` or
    ``feature_importances_``) are normalised to sum to 1 and added to the
    positions of the features its selector kept. The totals are then averaged
    over the models that actually contributed.

    Args:
        models: Mapping of name -> (fitted model, _, _, fitted selector).
        X: DataFrame whose columns are the full, unselected feature set.
    """
    feature_names = X.columns
    overall_importance = np.zeros(X.shape[1])
    contributing = 0

    for model, _, _, selector in models.values():
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_[0])
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            continue

        total = np.sum(importances)
        if total == 0:
            # All-zero importances cannot be normalised (0/0 would put NaN
            # into every total); such a model carries no ranking information.
            continue

        # Scatter the normalised importances back to the full feature space.
        overall_importance[selector.get_support()] += importances / total
        contributing += 1

    if contributing == 0:
        print("No feature importances available to plot.")
        return

    # Average over the models that contributed, not over skipped ones.
    overall_importance = overall_importance / contributing

    # Sort features by importance
    indices = np.argsort(overall_importance)[::-1]

    # Plot
    plt.figure(figsize=(12, 8))
    plt.title("Overall Feature Importance Across All Models")
    plt.bar(range(len(overall_importance)), overall_importance[indices])
    plt.xticks(range(len(overall_importance)), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.show()
# Plot optimization history and overall feature importance
for study, label in (
    (study_logistic, "Logistic Regression"),
    (study_rf, "Random Forest"),
    (study_xgb, "XGBoost"),
):
    plot_optimization_history(study, f"Optimization History - {label}")

# One combined chart built from the fitted models and their selectors.
plot_overall_feature_importance(models, X)

def print_best_params_and_features(name, study, selector, X):
    """Print a study's best parameters and the features kept by its selector.

    Args:
        name: Display name of the model.
        study: Object exposing ``best_params`` as a mapping.
        selector: Fitted selector whose ``get_support()`` marks the kept columns.
        X: DataFrame whose columns are the full feature set.
    """
    print(f"\nBest parameters for {name}:")
    for entry in study.best_params.items():
        print("  {}: {}".format(*entry))

    # Resolve the kept column names before printing the second section.
    kept_columns = X.columns[selector.get_support()].tolist()
    print(f"\nBest features for {name}:")
    for column in kept_columns:
        print("  {}".format(column))

# Best params and features for each model
report_targets = [
    ('Logistic Regression', study_logistic, selector_logistic),
    ('Random Forest', study_rf, selector_rf),
    ('XGBoost', study_xgb, selector_xgb),
]
for name, study, selector in report_targets:
    print_best_params_and_features(name, study, selector, X)