<a href="https://colab.research.google.com/github/dhrithi21/Credit-Risk-Analysis-Fintech/blob/main/CREDIT__RISK_MODEL_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Location of the input CSV, relative to the notebook's working directory
DATA_PATH = 'credit risk analysis.csv'

# Read the file into the frame that every later cell works from
df = pd.read_csv(DATA_PATH)

# Show the leading rows as a quick sanity check of the load
df.head()


Unnamed: 0,home_ownership,annual_inc,emp_length,dti,loan_amnt,int_rate,term,loan_status,lti,default_flag,int_rate_clean,risk score,risk bucket,approval_decision,int_rev,expected_loss,net_profit
0,RENT,55000.0,10+ years,18.24,2500,13.56,36 months,Current,0.045455,0,0.1356,5,Low,Approve,339.0,660.0,-321.0
1,MORTGAGE,90000.0,10+ years,26.52,30000,18.94,60 months,Current,0.333333,0,0.1894,8,Medium,Approve,5682.0,2700.0,2982.0
2,MORTGAGE,59280.0,6 years,10.51,5000,17.97,36 months,Current,0.084345,0,0.1797,6,Low,Approve,898.5,711.36,187.14
3,MORTGAGE,92000.0,10+ years,16.74,4000,18.94,36 months,Current,0.043478,0,0.1894,6,Low,Approve,757.6,1104.0,-346.4
4,MORTGAGE,57250.0,10+ years,26.35,30000,16.14,60 months,Current,0.524017,0,0.1614,10,High,Reject,4842.0,4122.0,720.0


In [None]:
import numpy as np

# Target vector: the `default_flag` column
y = df['default_flag']

# Feature matrix: everything except the target, restricted to numeric dtypes
# (string columns are dropped here rather than encoded)
X = df.drop(columns=['default_flag']).select_dtypes(include=[np.number])

X.head()


Unnamed: 0,annual_inc,dti,loan_amnt,int_rate,lti,int_rate_clean,risk score,int_rev,expected_loss,net_profit
0,55000.0,18.24,2500,13.56,0.045455,0.1356,5,339.0,660.0,-321.0
1,90000.0,26.52,30000,18.94,0.333333,0.1894,8,5682.0,2700.0,2982.0
2,59280.0,10.51,5000,17.97,0.084345,0.1797,6,898.5,711.36,187.14
3,92000.0,16.74,4000,18.94,0.043478,0.1894,6,757.6,1104.0,-346.4
4,57250.0,26.35,30000,16.14,0.524017,0.1614,10,4842.0,4122.0,720.0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned between the two parts; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.impute import SimpleImputer

# Median imputer: the fill values are learned from the training split only,
# so no statistics from the held-out rows are used
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train)

# Apply the learned medians to both splits. Note that X_train / X_test are
# rebound to the transformer's output (plain arrays under scikit-learn's default output config).
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the imputed (unscaled) features.
# class_weight='balanced' reweights classes inversely to their frequency;
# max_iter is raised above the library default to give the solver room to converge.
logreg_params = {'class_weight': 'balanced', 'max_iter': 1000}
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; column 1 is the probability of
# the positive class (default_flag == 1)
test_class_probs = model.predict_proba(X_test)
y_pred_proba = test_class_probs[:, 1]

# Ranking quality on the held-out split
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

print("ROC-AUC Score:", auc)

ROC-AUC Score: 0.7208130905189115


In [None]:
# Predict probabilities for entire dataset.
#
# Select exactly the columns the imputer and model were fitted on (X.columns)
# instead of re-deriving "all numeric columns except the target" from df.
# This cell adds the numeric ML_PD column to df, so re-deriving the feature set
# on a second run would pass one extra column and make imputer.transform fail.
# Selecting by X.columns keeps the cell safe to re-run.
X_full = df[X.columns]
X_full = imputer.transform(X_full)

# NOTE(review): `model` must still be the estimator fitted on imputed, unscaled
# features. A later cell rebinds `model` to the pipeline's scaled-feature
# estimator, so re-running this cell after that one would score with mismatched inputs.
df['ML_PD'] = model.predict_proba(X_full)[:, 1]

df[['ML_PD']].head()

Unnamed: 0,ML_PD
0,0.469801
1,0.513938
2,0.66851
3,0.587912
4,0.32587


In [None]:
# Split the predicted probabilities into equal-frequency bins, one per label,
# ordered from the lowest predicted PD to the highest
bucket_labels = ['Low', 'Medium', 'High']
df['ML_Risk_Bucket'] = pd.qcut(df['ML_PD'], q=len(bucket_labels), labels=bucket_labels)

In [None]:
# Observed default rate within each ML risk bucket.
# ML_Risk_Bucket is categorical (produced by pd.qcut), and grouping on a
# categorical key without `observed=` raises a pandas FutureWarning about the
# default changing. observed=False pins the current behaviour (every category
# is listed) and silences the warning.
df.groupby('ML_Risk_Bucket', observed=False)['default_flag'].mean()

  df.groupby('ML_Risk_Bucket')['default_flag'].mean()


Unnamed: 0_level_0,default_flag
ML_Risk_Bucket,Unnamed: 1_level_1
Low,0.000563
Medium,0.000563
High,0.003


In [None]:
import pandas as pd
import numpy as np

# One row per feature: the fitted log-odds coefficient and its exponential,
# i.e. the multiplicative change in odds per one-unit increase of the feature
fitted_coefs = model.coef_[0]
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': fitted_coefs,
})
coefficients['Odds_Ratio'] = np.exp(coefficients['Coefficient'])

# Order rows from largest to smallest coefficient magnitude
magnitude_order = coefficients['Coefficient'].abs().sort_values(ascending=False).index
coefficients = coefficients.loc[magnitude_order]

print(coefficients)

          Feature  Coefficient  Odds_Ratio
3        int_rate     0.101589    1.106929
6      risk score    -0.073937    0.928730
1             dti    -0.050056    0.951176
5  int_rate_clean     0.001016    1.001016
4             lti    -0.000244    0.999756
7         int_rev     0.000184    1.000184
9      net_profit     0.000149    1.000149
2       loan_amnt    -0.000043    0.999957
8   expected_loss     0.000034    1.000034
0      annual_inc    -0.000003    0.999997


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Standardise the features before the logistic regression so that the fitted
# coefficients are on a common scale and can be compared across features
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(class_weight='balanced', max_iter=1000)),
]
pipeline = Pipeline(steps=pipeline_steps)

pipeline.fit(X_train, y_train)

In [None]:
# Pull the fitted logistic regression out of the pipeline.
# NOTE(review): this rebinds the notebook-level name `model`. From here on,
# `model` expects standardised inputs, so earlier cells that call
# model.predict_proba on unscaled features should not be re-run after this one.
model = pipeline.named_steps['model']

# Coefficients are now per standard deviation of each feature
scaled_coefs = model.coef_[0]
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': scaled_coefs,
})

# Odds ratio = exp(coefficient)
coefficients = coefficients.assign(Odds_Ratio=np.exp(coefficients['Coefficient']))

In [None]:
# Rank features by the absolute size of their coefficient, largest first
magnitude_order = coefficients['Coefficient'].abs().sort_values(ascending=False).index
coefficients = coefficients.loc[magnitude_order]

print(coefficients)

          Feature  Coefficient  Odds_Ratio
1             dti    -0.839202    0.432055
2       loan_amnt    -0.467550    0.626535
7         int_rev     0.304454    1.355884
0      annual_inc    -0.272485    0.761485
5  int_rate_clean     0.244706    1.277246
3        int_rate     0.244706    1.277246
9      net_profit     0.231072    1.259950
6      risk score    -0.152328    0.858707
8   expected_loss     0.095363    1.100058
4             lti    -0.038442    0.962287
