## Dependencies

In [2]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings
warnings.simplefilter('ignore')

In [3]:
# Debug aid: print the interpreter's module search path to confirm which
# Python environment this kernel is running in.
# NOTE(review): the saved output of this cell contains absolute local paths
# (including a user name) — consider clearing it before sharing the notebook.
import sys
print(sys.path)

['/Users/khoadangnguyen/.pyenv/versions/3.12.0/lib/python312.zip', '/Users/khoadangnguyen/.pyenv/versions/3.12.0/lib/python3.12', '/Users/khoadangnguyen/.pyenv/versions/3.12.0/lib/python3.12/lib-dynload', '', '/Users/khoadangnguyen/.pyenv/versions/3.12.0/lib/python3.12/site-packages']


## Data

In [4]:
# Read the three competition CSV files (paths are relative to the notebook's
# working directory) in one pass: training data, test data, sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/sample_submission.csv",
    )
)

In [5]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [6]:
# Column dtypes and non-null counts for the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

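There are no null values, so no imputation is needed; feature engineering is done in the next section with label encoders and scale adjustment as needed. Encoding the categorical columns as integers keeps the features interpretable. Note that encoding does not by itself address class imbalance — check the `loan_status` distribution separately (the stratified folds used in training preserve the class ratio in every split).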

In [7]:
# Column dtypes and non-null counts for the test data (same columns as train,
# minus the loan_status target).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  object 
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  object 
 6   loan_grade                  39098 non-null  object 
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  object 
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.6+ MB


## Feature engineering

In [8]:
# One LabelEncoder per categorical column. Each encoder keeps its own name so
# the mapping fitted here can be re-applied to the test set in the next cell.
encoder_home_ownership = LabelEncoder()
encoder_loan_intent = LabelEncoder()
encoder_loan_grade = LabelEncoder()
encoder_default_on_file = LabelEncoder()

# Fit every encoder on its training column and overwrite that column in place
# with the resulting integer codes.
for column_name, column_encoder in (
    ('person_home_ownership', encoder_home_ownership),
    ('loan_intent', encoder_loan_intent),
    ('loan_grade', encoder_loan_grade),
    ('cb_person_default_on_file', encoder_default_on_file),
):
    train[column_name] = column_encoder.fit_transform(train[column_name])

# Loan amount relative to income, scaled by 1,000,000 and truncated to an int.
loan_income_ratio = train["loan_amnt"] / train["person_income"]
train["loantoincome"] = (loan_income_ratio * 1000000).astype(int)

# The fitted encoders' inverse_transform can recover the original labels later
# if they are needed.

In [9]:
# Apply the encoders that were fitted on the training data to the test columns
# (transform only, no re-fitting), overwriting each column in place.
for column_name, column_encoder in (
    ('person_home_ownership', encoder_home_ownership),
    ('loan_intent', encoder_loan_intent),
    ('loan_grade', encoder_loan_grade),
    ('cb_person_default_on_file', encoder_default_on_file),
):
    test[column_name] = column_encoder.transform(test[column_name])

# Same derived feature as on train: loan amount relative to income, scaled by
# 1,000,000 and truncated to an int.
loan_income_ratio = test["loan_amnt"] / test["person_income"]
test["loantoincome"] = (loan_income_ratio * 1000000).astype(int)

# The fitted encoders' inverse_transform can recover the original labels later
# if they are needed.

## Training

In [10]:
# Every training column is a model input except the row identifier and the
# target itself.
non_feature_columns = ('id', 'loan_status')
feature_columns = [
    column for column in train.columns if column not in non_feature_columns
]

In [11]:
# Per-fold result holders filled by the cross-validation loop: AUC scores,
# accuracy scores, (fpr, tpr) ROC points and hard validation predictions.
aucs, accuracies, roc_curves, valid_preds = [], [], [], []

In [12]:
# Number of cross-validation folds.
k = 5

# Shuffled stratified splitter with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(
    n_splits=k,
    random_state=42,
    shuffle=True,
)

In [13]:
# Prepare features (X) and target (y); loan_status is the target column.
X = train[feature_columns]
y = train['loan_status']

# Reset the per-fold accumulators inside this cell. Without this, re-running
# the cell on its own appends a second set of fold results to the existing
# lists, which skews the reported averages and duplicates curves in the ROC
# plot further down.
aucs, accuracies, roc_curves, valid_preds = [], [], [], []

# Stratified K-Fold cross-validation: one fresh model per fold.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f'### Fold {fold + 1} Training ###')

    # Split the data into training and validation sets for this fold
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Initialize a Random Forest with a fixed seed so fold scores are reproducible
    model = RandomForestClassifier(random_state=42)

    # Train on this fold's training split only
    model.fit(X_train, y_train)

    # Probabilities of the positive class (used for AUC / ROC) and hard labels
    # (used for accuracy) on the held-out validation split
    y_valid_pred_proba = model.predict_proba(X_valid)[:, 1]
    y_valid_pred = model.predict(X_valid)

    # Score this fold
    auc = roc_auc_score(y_valid, y_valid_pred_proba)
    accuracy = accuracy_score(y_valid, y_valid_pred)

    # Store AUC and accuracy
    aucs.append(auc)
    accuracies.append(accuracy)

    # ROC curve points for the plot; the thresholds are not needed afterwards
    fpr, tpr, _ = roc_curve(y_valid, y_valid_pred_proba)
    roc_curves.append((fpr, tpr))

    # Store hard predictions for later analysis
    valid_preds.append(y_valid_pred)

    # Print results for this fold
    print(f'Fold {fold + 1} AUC: {auc:.4f}')
    print(f'Fold {fold + 1} Accuracy: {accuracy:.4f}')

# Summary of results across all folds
print(f'\nAverage AUC: {np.mean(aucs):.4f}')
print(f'Average Accuracy: {np.mean(accuracies):.4f}')

### Fold 1 Training ###
Fold 1 AUC: 0.9267
Fold 1 Accuracy: 0.9456
### Fold 2 Training ###
Fold 2 AUC: 0.9383
Fold 2 Accuracy: 0.9506
### Fold 3 Training ###
Fold 3 AUC: 0.9325
Fold 3 Accuracy: 0.9474
### Fold 4 Training ###
Fold 4 AUC: 0.9402
Fold 4 Accuracy: 0.9511
### Fold 5 Training ###
Fold 5 AUC: 0.9387
Fold 5 Accuracy: 0.9498

Average AUC: 0.9353
Average Accuracy: 0.9489


## Model performance

In [14]:
# One line trace per cross-validation fold, labelled with that fold's AUC.
fold_traces = [
    go.Scatter(
        x=fold_fpr,
        y=fold_tpr,
        mode='lines',
        name=f'Fold {fold_number} (AUC = {aucs[fold_number - 1]:.5f})',
    )
    for fold_number, (fold_fpr, fold_tpr) in enumerate(roc_curves, start=1)
]

fig = go.Figure(data=fold_traces)

# Title, axis labels and legend so the figure can be read on its own.
layout_options = {
    'title': 'ROC Curves for Each Fold',
    'xaxis_title': 'False Positive Rate',
    'yaxis_title': 'True Positive Rate',
    'showlegend': True,
}
fig.update_layout(**layout_options)

fig.show()

## Submission

In [15]:
# Fit a dedicated model on ALL training rows for the submission. The `model`
# variable left over from the cross-validation loop is only the last fold's
# model, which was trained on 4/5 of the data; relying on it is also fragile
# because it depends on that loop having been the last thing to assign it.
final_model = RandomForestClassifier(random_state=42)
final_model.fit(train[feature_columns], train['loan_status'])

X_test = test[feature_columns]

# Submit the probability of the positive class rather than hard 0/1 labels:
# the cross-validation above scores the model with ROC AUC, which ranks
# predictions and loses information when they are thresholded.
# NOTE(review): confirm the competition metric expects probabilities.
y_pred = final_model.predict_proba(X_test)[:, 1]

# The identifier column is named 'id' in the data, so the submission header
# uses the same spelling (the original 'ID' did not match).
# NOTE(review): confirm against the header of data/sample_submission.csv.
submission = pd.DataFrame({
    'id': test['id'],
    'loan_status': y_pred,  # Positive-class probabilities
})

submission.to_csv('submission.csv', index=False)
