In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [21]:
df = pd.read_csv('/Users/amitmishra/Downloads/Datasets/credit-risk/cr_loan2.csv')

In [22]:
# Impute the null values with the median value for all employment lengths

df['person_emp_length'].fillna((df['person_emp_length'].median()), inplace = True)

In [23]:
print(df['loan_int_rate'].isnull().sum())

3116


In [24]:
indices = df[df['loan_int_rate'].isnull()].index
df_clean = df.drop(indices, inplace = False)

In [25]:
print(df_clean.isnull().sum())

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64


In [26]:
df['person_home_ownership'].value_counts()

person_home_ownership
RENT        16446
MORTGAGE    13444
OWN          2584
OTHER         107
Name: count, dtype: int64

### Calculating probability of default using Logistic Regression

In [28]:
log = LogisticRegression(solver= 'lbfgs')

In [29]:
df_num = df_clean.select_dtypes(exclude=['object'])
df_str = df_clean.select_dtypes(include = ['object'])

In [30]:
#Perform one-hot encoding on the str datatypes

df_str_onehot = pd.get_dummies(df_str)

In [31]:
df_prep = pd.concat([df_num,df_str_onehot], axis = 1)

In [32]:
df_prep.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,22,59000,123.0,35000,16.02,1,0.59,3,False,False,...,False,False,False,False,True,False,False,False,False,True
1,21,9600,5.0,1000,11.14,0,0.1,2,False,False,...,False,False,True,False,False,False,False,False,True,False
2,25,9600,1.0,5500,12.87,1,0.57,3,True,False,...,False,False,False,True,False,False,False,False,True,False
3,23,65500,4.0,35000,15.23,1,0.53,2,False,False,...,False,False,False,True,False,False,False,False,True,False
4,24,54400,8.0,35000,14.27,1,0.55,4,False,False,...,False,False,False,True,False,False,False,False,False,True


In [33]:
X = df_prep.drop(columns= ['loan_status'], axis = 1)
y = df_prep[['loan_status']]

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4, random_state=42)

In [35]:
model_log = log.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
print(model_log.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [37]:
print(model_log.intercept_)

[-0.02512991]


In [38]:
features_names = model_log.feature_names_in_
Coeff = model_log.coef_


In [39]:
for feature, coeff in zip(features_names, Coeff[0]):
    print(f"{feature}, Coefficient is: {coeff:.6f}")

person_age, Coefficient is: -0.125245
person_income, Coefficient is: -0.000036
person_emp_length, Coefficient is: -0.050751
loan_amnt, Coefficient is: 0.000106
loan_int_rate, Coefficient is: 0.217009
loan_percent_income, Coefficient is: 0.002794
cb_person_cred_hist_length, Coefficient is: 0.159864
person_home_ownership_MORTGAGE, Coefficient is: -0.038705
person_home_ownership_OTHER, Coefficient is: 0.000156
person_home_ownership_OWN, Coefficient is: -0.034567
person_home_ownership_RENT, Coefficient is: 0.047987
loan_intent_DEBTCONSOLIDATION, Coefficient is: 0.019266
loan_intent_EDUCATION, Coefficient is: -0.027568
loan_intent_HOMEIMPROVEMENT, Coefficient is: 0.015949
loan_intent_MEDICAL, Coefficient is: 0.008071
loan_intent_PERSONAL, Coefficient is: -0.009777
loan_intent_VENTURE, Coefficient is: -0.031071
loan_grade_A, Coefficient is: -0.038545
loan_grade_B, Coefficient is: -0.044801
loan_grade_C, Coefficient is: -0.031492
loan_grade_D, Coefficient is: 0.063163
loan_grade_E, Coefficien

#### Based on the coefficients, we can hypothesize that loan_int_rate and cb_person_cred_hist_length are important predictors of loan defaults

In [41]:
preds = model_log.predict_proba(X_test)

In [44]:
y_pred = model_log.predict(X_test)

In [45]:
print(accuracy_score(y_pred,y_test))

0.8221618869845579
