In [123]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

data = pd.read_csv('Part 2. loan_data_final.csv')
display(data)

Unnamed: 0.1,Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_to_income_ratio,loan_type,dependents_count,regional_unemployment_rate,borrower_risk_score,loan_status
0,0,22,female,Master,71948,0,RENT,PERSONAL,16.02,0.49,3,561,No,0.49,FLOATING,3,6.84,281.4,1
1,1,21,female,High School,12282,0,OWN,EDUCATION,11.14,0.08,2,504,Yes,0.08,FIXED,0,5.96,252.4,0
2,2,25,female,High School,12438,3,MORTGAGE,MEDICAL,12.87,0.44,3,635,No,0.44,FIXED,0,6.69,318.4,1
3,3,23,female,Bachelor,79753,0,RENT,MEDICAL,15.23,0.44,2,675,No,0.44,FIXED,3,7.63,338.1,1
4,4,24,male,Master,66135,1,RENT,MEDICAL,14.27,0.53,4,586,No,0.53,FLOATING,0,4.63,294.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,44995,27,male,Associate,47971,6,RENT,MEDICAL,15.66,0.31,3,645,No,0.31,FIXED,0,6.53,323.4,1
44996,44996,37,female,Associate,65800,17,RENT,HOMEIMPROVEMENT,14.07,0.14,11,621,No,0.14,FIXED,2,4.33,313.8,1
44997,44997,33,male,Associate,56942,7,RENT,DEBTCONSOLIDATION,10.02,0.05,10,668,No,0.05,FLOATING,0,6.66,337.0,1
44998,44998,29,male,Bachelor,33164,4,RENT,EDUCATION,13.23,0.36,6,604,No,0.36,FIXED,0,6.80,303.8,1


In [124]:
#drop first index column
data = data.iloc[:,1:] 

In [125]:
#check data types
data.dtypes

person_age                          int64
person_gender                      object
person_education                   object
person_income                       int64
person_emp_exp                      int64
person_home_ownership              object
loan_intent                        object
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length          int64
credit_score                        int64
previous_loan_defaults_on_file     object
loan_to_income_ratio              float64
loan_type                          object
dependents_count                    int64
regional_unemployment_rate        float64
borrower_risk_score               float64
loan_status                         int64
dtype: object

In [126]:
#print all unique values for possible categorical columns
object_cols = data.select_dtypes(include='object').columns
for col in object_cols:
    print(data[col].value_counts())

male      24841
female    20159
Name: person_gender, dtype: int64
Bachelor       13399
Associate      12028
High School    11972
Master          6980
Doctorate        621
Name: person_education, dtype: int64
RENT        23443
MORTGAGE    18489
OWN          2951
OTHER         117
Name: person_home_ownership, dtype: int64
EDUCATION            9153
MEDICAL              8548
VENTURE              7819
PERSONAL             7552
DEBTCONSOLIDATION    7145
HOMEIMPROVEMENT      4783
Name: loan_intent, dtype: int64
Yes    22858
No     22142
Name: previous_loan_defaults_on_file, dtype: int64
FIXED       26052
FLOATING    18948
Name: loan_type, dtype: int64


In [127]:
#convert them into categorical columns
data[object_cols] = data[object_cols].astype('category')

In [128]:
#check for missing data
print(data.isnull().sum().sum())

0


In [129]:
#check for duplicated rows
print(data.duplicated().sum())

0


In [130]:
#check for unusual values
data.describe()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_to_income_ratio,dependents_count,regional_unemployment_rate,borrower_risk_score,loan_status
count,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0
mean,27.764178,80319.05,5.410333,11.006606,0.139725,5.867489,632.608756,0.139726,1.052533,5.738823,317.963033,0.222222
std,6.045108,80422.5,6.063532,2.978808,0.087212,3.879702,50.435865,0.087212,1.122943,1.296174,25.443032,0.415744
min,20.0,8000.0,0.0,5.42,0.0,2.0,390.0,0.0,0.0,3.5,196.9,0.0
25%,24.0,47204.0,1.0,8.59,0.07,3.0,601.0,0.07,0.0,4.62,302.2,0.0
50%,26.0,67048.0,4.0,11.01,0.12,4.0,640.0,0.12,1.0,5.73,321.4,0.0
75%,30.0,95789.25,8.0,12.99,0.19,8.0,670.0,0.19,2.0,6.86,336.5,0.0
max,144.0,7200766.0,125.0,20.0,0.66,30.0,850.0,0.66,4.0,8.0,432.5,1.0


In [131]:
#since person_age > 100 are very rare, they are likely to be erroneous data points, so removing them from data
display(data[data['person_age'] > 100])
data = data[data['person_age'] < 100]

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_to_income_ratio,loan_type,dependents_count,regional_unemployment_rate,borrower_risk_score,loan_status
81,144,male,Bachelor,300616,125,RENT,VENTURE,13.57,0.02,3,789,No,0.02,FIXED,2,3.59,395.4,0
183,144,male,Associate,241424,121,MORTGAGE,EDUCATION,11.86,0.02,2,807,No,0.02,FIXED,2,7.27,404.1,0
575,123,female,High School,97140,101,RENT,EDUCATION,10.25,0.21,3,805,Yes,0.21,FLOATING,0,6.91,403.2,0
747,123,male,Bachelor,94723,100,RENT,VENTURE,11.01,0.21,4,714,Yes,0.21,FLOATING,0,7.39,358.0,0
32297,144,female,Associate,7200766,124,MORTGAGE,PERSONAL,12.73,0.0,25,850,No,0.0,FIXED,0,6.27,432.5,0
37930,116,male,Bachelor,5545545,93,MORTGAGE,VENTURE,12.15,0.0,24,708,No,0.0,FLOATING,0,3.8,361.2,0
38113,109,male,High School,5556399,85,MORTGAGE,VENTURE,12.58,0.0,22,792,No,0.0,FLOATING,2,7.8,402.6,0


In [132]:
#if loan_percent_income is 0, then they shouldn't be include in the dataset
#, which means they are likely to be erroneous data points
display(data[data['loan_percent_income'] == 0])
data = data[data['loan_percent_income'] != 0]

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_to_income_ratio,loan_type,dependents_count,regional_unemployment_rate,borrower_risk_score,loan_status
238,22,male,High School,220518,0,MORTGAGE,EDUCATION,11.01,0.0,2,551,Yes,0.0,FLOATING,4,6.15,275.9,0
16926,22,male,High School,220715,0,MORTGAGE,EDUCATION,11.01,0.0,2,577,No,0.0,FLOATING,0,4.43,289.1,0
17834,34,male,Bachelor,1138723,12,MORTGAGE,PERSONAL,9.99,0.0,7,609,Yes,0.0,FLOATING,3,5.4,306.4,0
17846,30,male,Master,605611,9,RENT,DEBTCONSOLIDATION,11.36,0.0,9,631,Yes,0.0,FLOATING,1,4.85,318.0,0
18917,35,female,Master,613103,10,RENT,PERSONAL,13.48,0.0,6,571,No,0.0,FLOATING,4,5.81,287.3,0
27877,30,female,Associate,627222,4,MORTGAGE,HOMEIMPROVEMENT,7.43,0.0,9,637,No,0.0,FIXED,2,3.64,321.2,0
29188,41,male,High School,241503,20,MORTGAGE,VENTURE,10.38,0.0,13,611,No,0.0,FLOATING,0,7.19,309.4,0
29527,36,female,Associate,447300,16,RENT,DEBTCONSOLIDATION,11.01,0.0,14,638,No,0.0,FLOATING,0,7.23,323.2,1
30049,42,male,High School,2448661,16,RENT,VENTURE,12.29,0.0,15,635,Yes,0.0,FIXED,1,4.77,321.8,0
31910,41,male,High School,533530,19,MORTGAGE,EDUCATION,15.31,0.0,11,667,Yes,0.0,FLOATING,1,7.87,336.6,0


In [133]:
#check for column pairs that are highly correlated
sorted_corr = data.corr().unstack().sort_values(ascending=False)
sorted_corr = sorted_corr[sorted_corr < 1]
sorted_corr = sorted_corr[sorted_corr.abs() > 0.5]
sorted_corr

loan_percent_income         loan_to_income_ratio          0.999999
loan_to_income_ratio        loan_percent_income           0.999999
credit_score                borrower_risk_score           0.998973
borrower_risk_score         credit_score                  0.998973
person_age                  person_emp_exp                0.952144
person_emp_exp              person_age                    0.952144
cb_person_cred_hist_length  person_age                    0.878475
person_age                  cb_person_cred_hist_length    0.878475
person_emp_exp              cb_person_cred_hist_length    0.839874
cb_person_cred_hist_length  person_emp_exp                0.839874
dtype: float64

In [134]:
#removing columns with correlation > .85 to avoid multicollinearity
data = data.drop(columns = ['loan_percent_income', 'borrower_risk_score', 'person_emp_exp', 'cb_person_cred_hist_length'])

In [135]:
data.columns

Index(['person_age', 'person_gender', 'person_education', 'person_income',
       'person_home_ownership', 'loan_intent', 'loan_int_rate', 'credit_score',
       'previous_loan_defaults_on_file', 'loan_to_income_ratio', 'loan_type',
       'dependents_count', 'regional_unemployment_rate', 'loan_status'],
      dtype='object')

In [136]:
#create age_group,loan_amount, credit_loan_ratio
data['loan_amount'] = data['loan_to_income_ratio'] * data['person_income']
data['age_group'] = pd.cut(data['person_age'], bins=[18, 25, 35, 50, 65, 100], 
                         labels=['18-24', '25-34', '35-49', '50-64', '65+'])
data['credit_loan_ratio'] = data['credit_score'] / data['loan_amount']
object_cols = list(object_cols) + ['age_group']

In [150]:
data_log = pd.get_dummies(data, columns=object_cols, drop_first=True)  # Drop the first to avoid multicollinearity

# Define your feature columns (exclude the target column 'target')
X = data_log.drop(columns=['loan_status'])

# Define your target column
y = data_log['loan_status']

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Normalize the features (important for Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)

# Train the model
logreg_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = logreg_model.predict(X_test_scaled)

# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.8962641761174116

Confusion Matrix:
 [[6554  440]
 [ 493 1507]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93      6994
           1       0.77      0.75      0.76      2000

    accuracy                           0.90      8994
   macro avg       0.85      0.85      0.85      8994
weighted avg       0.90      0.90      0.90      8994



In [151]:
# Get feature names (assuming 'X' is your feature matrix)
feature_names = X.columns  # if X is a DataFrame

# Get the model coefficients
coefficients = logreg_model.coef_[0]  # For binary classification, the coefficients are in the first row

# Combine the feature names and coefficients into a DataFrame for easy viewing
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

# Display the features sorted by their coefficient values
print(coef_df)

                               Feature  Coefficient
4                 loan_to_income_ratio     1.661959
2                        loan_int_rate     1.011004
8                    credit_loan_ratio     0.423389
16          person_home_ownership_RENT     0.332485
1                        person_income     0.267499
0                           person_age     0.039724
14         person_home_ownership_OTHER     0.023988
12        person_education_High School     0.023354
9                   person_gender_male     0.018510
23                  loan_type_FLOATING     0.017931
13             person_education_Master     0.015320
6           regional_unemployment_rate     0.011890
26                     age_group_50-64     0.008983
11          person_education_Doctorate     0.006530
18         loan_intent_HOMEIMPROVEMENT     0.004456
5                     dependents_count     0.001934
10           person_education_Bachelor    -0.001814
25                     age_group_35-49    -0.028734
24          

In [160]:
data[data['previous_loan_defaults_on_file'] == 'Yes']['loan_status'].mean()

0.0

In [161]:
data[data['previous_loan_defaults_on_file'] == 'No']['loan_status'].mean()

0.45189135445383466

# Conclusion

I'm very surprised by the huge negative coefficient assigned to "previous_loan_defaults_on_file_yes". The negative coefficient means that if this is true, then the probability of defaulting decreases significantly. Based on the data, it's saying that everyone with a history of defaulting at least one has not defaulted again. I think it's important to understand the meaning behind this designation as that doesn't intuitively make sense to me. 

Outside of that, it makes sense that higher loan_to_income_ratio and interest rate heavily increase the probability of defaulting while higher credit score and loan amount decreases the probability of defaulting. Loan amount makes sense because if they are approved for bigger amount, they are more likely to have good credit background and/or higher income.

While the overall model accuracy is 90%, the more imporant metric is recall, which is the accuracy on predicting defaults. At 75%, the number is not particularly impressive, but that's understandable as logistical is a simpler model that provides more interpretability and transparency. I believe with more sophisticated model, such as random forest or XGBoost, I can achieve higher accuracy. The reason behind picking this model because this serves a good first-step, baseline model to help us understand which features are useful. The interpretability is also very useful when explaining to non-technical stakeholders.

Some key decisions I made throughout the process were determining which columns to remove if any, checking data quality, and creating new features. I checked for missing values, duplicated rows, highly correlated columns, outliers/abnormal values. I also thought about using VIF to remove redundant columns, but I decided against it because some of the removal recommendations don't make logical sense to me. 