In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, f1_score, precision_score, recall_score



In [None]:
# Read the loan-default CSV into a DataFrame and print its schema summary
# (column names, non-null counts and dtypes) for a quick sanity check.
csv_path = '/content/Loan_default.csv'
loan_data = pd.read_csv(csv_path)
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LoanID          255347 non-null  object 
 1   Age             255347 non-null  int64  
 2   Income          255347 non-null  int64  
 3   LoanAmount      255347 non-null  int64  
 4   CreditScore     255347 non-null  int64  
 5   MonthsEmployed  255347 non-null  int64  
 6   NumCreditLines  255347 non-null  int64  
 7   InterestRate    255347 non-null  float64
 8   LoanTerm        255347 non-null  int64  
 9   DTIRatio        255347 non-null  float64
 10  Education       255347 non-null  object 
 11  EmploymentType  255347 non-null  object 
 12  MaritalStatus   255347 non-null  object 
 13  HasMortgage     255347 non-null  object 
 14  HasDependents   255347 non-null  object 
 15  LoanPurpose     255347 non-null  object 
 16  HasCoSigner     255347 non-null  object 
 17  Default   

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Load the raw loan records and derive the modelling frame `loan_data`.
# The whole transformation is expressed as one method chain, so re-running the
# cell always starts from the file on disk and never mutates a frame in place.
# ---------------------------------------------------------------------------
raw_loans = pd.read_csv('/content/Loan_default.csv')

# String-valued columns that are replaced by integer codes.
categorical_columns = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

# One encoder instance is re-fitted per column; after this statement it only
# remembers the classes of the last column in `categorical_columns`.
label_encoder = LabelEncoder()
encoded_columns = {
    column_name: label_encoder.fit_transform(raw_loans[column_name])
    for column_name in categorical_columns
}

# Age buckets: (0, 30] -> young, (30, 50] -> middle-aged, (50, inf) -> senior
# (pd.cut uses right-closed intervals by default).
bins = [0, 30, 50, float('inf')]
labels = ['young', 'middle-aged', 'senior']

loan_data = (
    raw_loans
    # Overwrite the categorical columns with their integer codes (positions kept).
    .assign(**encoded_columns)
    # The identifier column is not used as a feature.
    .drop(columns=['LoanID'])
    # Bucket Age, then expand the bucket into one indicator column per label.
    .assign(Age_Group=lambda frame: pd.cut(frame['Age'], bins=bins, labels=labels))
    .pipe(pd.get_dummies, columns=['Age_Group'])
    # Interaction terms followed by squared terms, appended in this order.
    .assign(
        Income_CreditScore=lambda frame: frame['Income'] * frame['CreditScore'],
        LoanAmount_InterestRate=lambda frame: frame['LoanAmount'] * frame['InterestRate'],
        Age_squared=lambda frame: frame['Age'] ** 2,
        Income_squared=lambda frame: frame['Income'] ** 2,
    )
    # The source columns of the derived features are removed afterwards.
    .drop(columns=['Age', 'Income', 'CreditScore', 'LoanAmount', 'InterestRate'])
)

# Train/test split followed by feature scaling (Standard Scaling).
#
# The split happens BEFORE the scaler is fitted, and the scaler is fitted on
# the training features only. Fitting it on the full frame (as was done
# previously) lets the test rows influence the mean/variance used to transform
# the training rows, and also standardised the 'Default' target column itself.

# Split data into features and target
features_unscaled = loan_data.drop(columns=['Default'])
y = loan_data['Default']  # Assuming 'Default' is the target variable containing categorical labels (0 or 1)

# Perform train-test split (same test_size / random_state as before, so the
# same rows land in each partition)
train_unscaled, test_unscaled, y_train, y_test = train_test_split(
    features_unscaled, y, test_size=0.25, random_state=1
)

# Learn the per-column mean and standard deviation from the training rows only
scaler = StandardScaler()
scaler.fit(train_unscaled)


def _standardize(frame):
    """Apply the train-fitted ``scaler`` to ``frame``.

    Returns a DataFrame with the same column labels and row index as
    ``frame`` so that features stay aligned with ``y`` / ``y_train`` / ``y_test``.
    """
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)


X_train = _standardize(train_unscaled)
X_test = _standardize(test_unscaled)

# Kept so that any code relying on these names keeps working: `X` holds every
# row's features transformed with the train-fitted scaler, and
# `loan_data_scaled` is `X` plus the *unscaled* target, in `loan_data`'s
# column order.
X = _standardize(features_unscaled)
loan_data_scaled = X.assign(Default=y)[loan_data.columns]

# Define models with hyperparameters.
# Maps a display name (used as the row label in the metrics table) to an
# unfitted estimator. The estimators that draw random numbers while fitting
# (bootstrap / feature sub-sampling / tie-breaking) receive a fixed seed so
# that a fresh "Restart & Run All" reproduces the same scores.
MODEL_SEED = 1

models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1, random_state=MODEL_SEED),
    'Logistic Regression': LogisticRegression(C=1.0, max_iter=1000),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=MODEL_SEED),
    'XGBoost': XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, gamma=0, subsample=1, random_state=MODEL_SEED),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

def _positive_class_scores(model, features):
    """Return a continuous score per row for the positive class.

    ROC AUC is a ranking metric and needs scores, not hard 0/1 predictions.
    Uses ``predict_proba`` when the fitted estimator exposes it (second
    column, i.e. ``model.classes_[1]`` -- assumes the labels are 0/1 so that
    this is class 1), otherwise falls back to ``decision_function``.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(features)[:, 1]
    return model.decision_function(features)


# Initialize lists to store metrics (one entry per model, in `models` order)
metrics = {
    'Model': [],
    'Accuracy': [],
    'F1 Score': [],
    'Recall': [],
    'Precision': [],
    'AUC Score': []
}

# Train and evaluate models.
# NOTE(review): every score below is computed on the TRAINING rows, i.e. these
# are in-sample figures; X_test / y_test are not used in this cell.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_score_train = _positive_class_scores(model, X_train)

    # Threshold-based metrics use the hard predictions ...
    accuracy = accuracy_score(y_train, y_pred_train)
    f1 = f1_score(y_train, y_pred_train)
    recall = recall_score(y_train, y_pred_train)
    precision = precision_score(y_train, y_pred_train)
    # ... while AUC uses the continuous scores. Passing 0/1 predictions here
    # would reduce the ROC curve to a single operating point.
    auc = roc_auc_score(y_train, y_score_train)

    # Store metrics
    metrics['Model'].append(name)
    metrics['Accuracy'].append(accuracy)
    metrics['F1 Score'].append(f1)
    metrics['Recall'].append(recall)
    metrics['Precision'].append(precision)
    metrics['AUC Score'].append(auc)

# Tabulate the collected scores: one row per model, one column per metric.
metrics_df = pd.DataFrame(metrics).set_index('Model')

# Draw the table as an annotated heatmap on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(metrics_df, annot=True, cmap='coolwarm', fmt=".3f", linewidths=0.5, ax=ax)
ax.set_title('Training Metrics of Models')
ax.set_xlabel('Metrics')
ax.set_ylabel('Model')
plt.show()
