In [1]:
# 1. Import Required Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# 2. Load Dataset

# Location of the loan CSV on the author's machine; adjust when running elsewhere.
DATA_PATH = r"C:\Users\dolly\Downloads\JPMC QR\Task 3 and 4_Loan_Data.csv"

df = pd.read_csv(DATA_PATH)
print(" Data Loaded Successfully")
print(f"Shape of data: {df.shape}")
# One call prints the heading and the first rows, each on its own line.
print("\nSample Rows:", df.head(), sep="\n")


# 3. Basic Cleaning

# Discard every row that has at least one missing value.
df = df.dropna()

# Locate the target: any column whose name contains "default" (case-insensitive)
# is a candidate, and the first candidate in column order is used.
possible_targets = [name for name in df.columns if 'default' in name.lower()]
if not possible_targets:
    raise ValueError("❌ No column found indicating 'Default'. Please check dataset column names.")
target_col = possible_targets[0]


# 4. Encode Categorical Variables

# Record identifiers carry no predictive signal and must not be fed to the
# models (the loaded data has a `customer_id` column). Names listed here that
# are absent from the data are simply skipped.
ID_COLUMNS = ["customer_id"]
present_ids = [col for col in ID_COLUMNS if col in df.columns]

# Pull the target out BEFORE one-hot encoding: encoding the whole frame would
# rename a non-numeric target column and break the lookup by `target_col`.
y = df[target_col]


# 5. Define Features and Target

# One-hot encode only the feature columns; drop_first avoids a redundant
# (perfectly collinear) dummy per categorical variable.
X = pd.get_dummies(df.drop(columns=[target_col, *present_ids]), drop_first=True)

# Keep `df` as the encoded frame (identifiers, encoded features, target) so
# any later code that still reads `df` sees the same layout as before.
df = pd.concat([df[present_ids], X, y], axis=1)


# 6. Train-Test Split

# Stratify on the target so the 80/20 split keeps the same class ratio in
# both parts; an unstratified split of a binary default flag can leave the
# test set with a different default rate than the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# 7. Feature Scaling

# Fit the scaler on the training rows only and reuse those statistics for
# the test rows, so nothing about the test set leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# 8. Train Models


# Logistic Regression: trained on the standardized features.
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Random Forest: trained on the unscaled features.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)


# 9. Model Performance

# Column 1 of predict_proba holds the probability of each model's second class
# (class 1 when the target is coded 0/1).
log_pred = log_model.predict_proba(X_test_scaled)[:, 1]
rf_pred = rf_model.predict_proba(X_test)[:, 1]

# AUC is scored on the probabilities, accuracy on the hard class predictions.
# Each model is evaluated on the same representation it was trained on.
log_auc = round(roc_auc_score(y_test, log_pred), 3)
rf_auc = round(roc_auc_score(y_test, rf_pred), 3)
log_acc = round(accuracy_score(y_test, log_model.predict(X_test_scaled)), 3)
rf_acc = round(accuracy_score(y_test, rf_model.predict(X_test)), 3)

print("\n--- Model Performance ---")
print("Logistic Regression AUC:", log_auc)
print("Random Forest AUC:", rf_auc)
print("\nAccuracy (Logistic):", log_acc)
print("Accuracy (Random Forest):", rf_acc)


# 10. Define Expected Loss Function

def predict_expected_loss(model, scaler, loan_features, recovery_rate=0.10,
                          exposure=None, feature_columns=None):
    """
    Predict the probability of default (PD) and the expected loss for one loan.

    Expected loss is computed as ``PD * (1 - recovery_rate) * exposure``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    scaler : fitted scaler exposing ``transform``. It is applied only when
        ``model`` is a ``LogisticRegression``; other models receive the
        unscaled features.
    loan_features : dict
        Feature name -> value for a single loan.
    recovery_rate : float, default 0.10
        Fraction of the exposure recovered after a default; must be in [0, 1].
    exposure : float, optional
        Amount at risk. When omitted it is read from
        ``loan_features["loan_amt_outstanding"]`` if that key is present;
        otherwise 1.0 is used, so the result is a loss per unit of exposure.
    feature_columns : sequence of str, optional
        Column layout the model was trained on. Defaults to the module-level
        ``X.columns``.

    Returns
    -------
    dict
        ``{"PD": ..., "Expected_Loss": ...}``, both plain floats rounded to
        4 decimals.

    Raises
    ------
    ValueError
        If ``recovery_rate`` is outside [0, 1].
    """
    # Validate first so a bad rate fails fast instead of yielding a negative
    # or inflated loss. The negated chained comparison also rejects NaN.
    if not 0.0 <= recovery_rate <= 1.0:
        raise ValueError(f"recovery_rate must be between 0 and 1, got {recovery_rate!r}")

    if feature_columns is None:
        feature_columns = X.columns
    if exposure is None:
        exposure = loan_features.get("loan_amt_outstanding", 1.0)

    # Encode the single row, then align it to the training layout. Columns the
    # row does not produce (e.g. dummies of other categories) are filled with 0.
    # NOTE(review): a numeric feature missing from loan_features is also
    # silently filled with 0 here -- callers should pass every feature.
    input_df = pd.get_dummies(pd.DataFrame([loan_features]))
    input_df = input_df.reindex(columns=feature_columns, fill_value=0)

    # Scale numeric features if using logistic regression
    if isinstance(model, LogisticRegression):
        model_input = scaler.transform(input_df)
    else:
        model_input = input_df

    # Row 0 is the single loan; column 1 is the model's second class.
    # float() keeps NumPy scalar types out of the returned dict.
    pd_value = float(model.predict_proba(model_input)[0, 1])
    expected_loss = pd_value * (1.0 - recovery_rate) * float(exposure)
    return {"PD": round(pd_value, 4), "Expected_Loss": round(expected_loss, 4)}


# 11. Sample Predictions

# Example 1: score the first feature row with the logistic regression model.
sample_loan_1 = X.iloc[0].to_dict()
result_1 = predict_expected_loss(model=log_model, scaler=scaler, loan_features=sample_loan_1)
print("\n Example 1 (Logistic Regression):", result_1)

# Example 2: score the second feature row with the random forest model.
sample_loan_2 = X.iloc[1].to_dict()
result_2 = predict_expected_loss(model=rf_model, scaler=scaler, loan_features=sample_loan_2)
print("\n Example 2 (Random Forest):", result_2)


 Data Loaded Successfully
Shape of data: (10000, 8)

Sample Rows:
   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  

--- Model Performance ---
Logistic Regression AUC: 1.0
Random Forest