In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.over_sampling import SMOTE
import category_encoders as ce


## Pre-Processing 
- Read the feather file into a dataframe 
- Drop some columns that do we do not want as the independent variables (Xs)

In [62]:
# df = pd.read_csv('lending_club_clean_sample.csv')
# df = pd.read_feather('lending_club_smaller_sample.feather')
df = pd.read_feather('lending_club_clean.feather')

See which values we can classify as a default or cannot repay the loan 

In [63]:
print(df[['loan_status']].value_counts())

loan_status                                        
Fully Paid                                             1497783
Current                                                1031016
Charged Off                                             362548
Late (31-120 days)                                       16154
In Grace Period                                          10028
Late (16-30 days)                                         2719
Issued                                                    2062
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                    433
dtype: int64


In [64]:
# Convert description into a default or not default prediction 
default_description = ["Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off", "In Grace Period",
            "Late (16-30 days)", "Late (31-120 days)"]
df['default'] = df['loan_status'].apply(lambda status: 1 if status in default_description else 0).astype(int)

In [65]:
# X = df.drop(columns='default')
y = df['default']


df = df.drop(['id', 'default', 'revol_util' ,'sec_app_earliest_cr_line','loan_status', 'grade', 'sub_grade', 
              'emp_title', 'issue_d','url','title',
              'zip_code','earliest_cr_line','last_pymnt_d',
              'next_pymnt_d','last_credit_pull_d','hardship_type',
              'hardship_start_date','hardship_end_date',
              'payment_plan_start_date','hardship_loan_status','debt_settlement_flag'], axis=1)


In [66]:
# Properly turn variables that are categorical into the correct data type
df = pd.concat([df,pd.get_dummies(df["emp_length"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["addr_state"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["home_ownership"],drop_first=True)],axis=1)


In [67]:

df = pd.concat([df,pd.get_dummies(df["verification_status"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["pymnt_plan"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["purpose"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["initial_list_status"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["application_type"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["hardship_flag"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["hardship_reason"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["verification_status_joint"],drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["hardship_status"],drop_first=True)],axis=1)

# drop the categorical columns as well 
df = df.drop(['emp_length', 'addr_state', 'home_ownership', 
              'verification_status', 'pymnt_plan','purpose','initial_list_status',
              'application_type','hardship_flag',
              'hardship_reason','verification_status_joint','hardship_status'], axis=1)


In [75]:
# Split train test dataset 
X_train, X_OOS_test, y_train, y_OOS_test = train_test_split(df, y, test_size=0.20, random_state=66)

In [76]:
lasso = linear_model.Lasso(alpha=1)
lasso.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [78]:
# Print Lasso Coefficients that is not 0 
coefficients = dict(zip(X_train.columns, lasso.coef_))
print("Lasso Coefficients:")
lasso_picked_var = []
for feature, coef in coefficients.items():
    if coef != 0:
        lasso_picked_var.append(feature)
        print(f"{feature}: {coef}")

Lasso Coefficients:
loan_amnt: 3.215231192662825e-05
funded_amnt: 1.1160011603015962e-05
annual_inc: -1.898293847639652e-08
revol_bal: -1.3455352569186869e-07
out_prncp: -4.585172374295031e-05
total_pymnt: -1.8772799799746e-05
total_rec_prncp: -2.7344365070965145e-05
total_rec_int: 2.0314780725234308e-05
recoveries: 2.9901604379517438e-05
last_pymnt_amnt: -2.2220259399386754e-06
last_fico_range_high: -0.0007147247411385774
last_fico_range_low: -0.0003584969296763416
annual_inc_joint: 2.825307861618525e-08
tot_coll_amt: -1.1181006617147113e-08
tot_cur_bal: -4.4983602549408605e-08
total_bal_il: -1.6930576857913345e-07
max_bal_bc: -2.493930417500715e-07
total_rev_hi_lim: 7.77068882140871e-08
avg_cur_bal: 3.4168324582467854e-08
bc_open_to_buy: 4.2050300713578345e-07
tot_hi_cred_lim: 3.20341349690361e-08
total_bal_ex_mort: 6.62878615070509e-08
total_bc_limit: -3.765805148537659e-08
total_il_high_credit_limit: 1.984897358503877e-12
hardship_amount: 4.7275769945540806e-05
hardship_payoff_bala

In [79]:
# pick the X variables that LASSO tells us are significant
X_train = X_train[lasso_picked_var]
print(X_train.head())

         loan_amnt  funded_amnt  annual_inc  revol_bal     out_prncp  \
2273679       6000         6000     57000.0       5746      0.000000   
323906       29775        29775    132400.0      48904  18171.859375   
1292667      12000        12000       100.0       4824  10994.769531   
546901       17000        17000     70000.0      17798   4956.120117   
1000326      13500        13500     40000.0       8140   8557.540039   

          total_pymnt  total_rec_prncp  total_rec_int  recoveries  \
2273679   6788.683105      6000.000000     788.679993         0.0   
323906   29811.269531     11603.139648   18208.130859         0.0   
1292667   2983.649902      1005.229980    1978.420044         0.0   
546901   15679.269531     12043.879883    3577.629883         0.0   
1000326   5820.779785      4942.459961     878.320007         0.0   

         last_pymnt_amnt  ...  max_bal_bc  total_rev_hi_lim  avg_cur_bal  \
2273679      2743.520020  ...          -1             14200        12060   


## Oversampling 
Since we see that the data is highly imbalanced, we try to oversample the data set with SMOTE. 

In [63]:
from imblearn.over_sampling import SMOTE
# solve error by downgrading to install scikit-learn==1.2.2 
# ''' steps : 
# 1. install pip 
# 2. uninstall sci-kit 
# 3. uninstall imblearn
# 4. install sci-kit 1.2.2 
# 5. install imblearn 
# '''

In [84]:
# Over sample using SMOTE
# -- by inspecting the data, we see that the minority class is extremely class (fraud "Class" == 1)
sm = SMOTE(random_state=42)

# Check if any value exceeds the int threshold 
# threshold = 2147483647
# exceeds_threshold = (X_train > threshold) | (X_train < -threshold)
# # Print the result
# columns_exceeding_threshold = X_train.columns[exceeds_threshold.any()]
# print(columns_exceeding_threshold)


X_train_32 = X_train.astype('int32')
X_smote, y_smote = sm.fit_resample(X_train_32, y_train)

In [7]:
# -- Code to Inspect the data set -- 
# df_oversampled = X_smote
# df_oversampled['Outcome_Variable'] = y_smote
# df_oversampled
# fig, ax = plt.subplots(figsize=(10, 8))
# df_oversampled['Outcome_Variable'].value_counts().plot(kind='bar', ax=ax, fontsize=14)
# ax.set_title('Oversampled Dataset', fontsize=16)
# ax.set_ylabel('Observation counts', fontsize=14)
# ax.set_xlabel('Class', fontsize=14)
# plt.show()


## K-Fold Cross Validation 
We setup K-Fold cross validation for parameters fine-tuning in the models below as a 5 Fold. 

In [110]:
# Set up K-Fold Cross Validation 
n_splits = 5
shuffle = True
random_state = 809
cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
# plot = plot_cv_indices(cv, X_smote, y_smote, n_splits)

## Model 1 : Logistic Regression 

In [9]:
def plot_cv_indices(cv, X, y, n_splits, lw=10):
    '''
    This function plots the Cross validation indices.
    '''
    
    fig, ax = plt.subplots(figsize = (15,8))
    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=plt.cm.coolwarm,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes
    ax.scatter(range(len(X)), [ii + 1.5] * len(X), c=y, marker='_', lw=lw, cmap=plt.cm.Paired)

    # Formatting
    yticklabels = list(range(n_splits)) + ['Class']
    ax.set(yticks=np.arange(n_splits+1) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [130]:
# Model 1 : Logistic Model 
logistic_model = LogisticRegression(solver='lbfgs', max_iter=300)
for train_index, test_index in cv.split(X_smote):
    # change to loc to define the rows in the dataframe 
    X_cv_train, X_cv_test, y_cv_train, y_cv_test = X_smote.loc[train_index], X_smote.loc[test_index], y_smote[train_index], y_smote[test_index]
    logistic_model.fit(X_cv_train, y_cv_train)
    #Cross-Validation Prediction Error
    score = logistic_model.score(X_cv_test, y_cv_test)
    print(score)

In [121]:
logit = LogisticRegression(solver='lbfgs', max_iter=1000)
logit.fit(X_smote, y_smote)    
score_OOS = logit.score(X_OOS_test, y_OOS_test)
print ("Logistic Model score :" , score_OOS)
y_pred = logit.predict(X_OOS_test)
print("Logistic Model Recall : " , recall_score(y_OOS_test, y_pred))
print("Logistic Model Precision : ", precision_score(y_OOS_test,y_pred))
print("Probability if you only predict 0s ", 1-round(95/56867,3))

Logistic Model score : 0.9846037709350093
Logistic Model Recall :  0.8631578947368421
Logistic Model Precision :  0.08668076109936575
Probability if you only predict 0s  0.998




## Model 2 : Lasso 

In [15]:
# Model 2 : LASSO 

# Cross Validate the penalty term in lasso
cross_validate_result = {}  
iter_alpha = 0.01
for penalty_term in range(100): 
    print(penalty_term)
    accuracies = [] 
    lasso = linear_model.Lasso(alpha=iter_alpha)
    for train_index, test_index in cv.split(X_smote):
        # change to loc to define the rows in the dataframe 
        X_cv_train, X_cv_test, y_cv_train, y_cv_test = X_smote.loc[train_index], X_smote.loc[test_index], y_smote[train_index], y_smote[test_index]
        logistic_model.fit(X_cv_train, y_cv_train)
        #Cross-Validation Prediction Error
        score = logistic_model.score(X_cv_test, y_cv_test)
        accuracies.append(score)
    cross_validate_result[penalty_term] = (sum(accuracies)/len(accuracies))
    iter_alpha += 0.05
    print("Alpha : " + str(iter_alpha) + " " + str((sum(accuracies)/len(accuracies))))
print(cross_validate_result)
print(max(cross_validate_result, key=cross_validate_result.get))
    

### OLS vs Lasso vs Ridge regression

In [120]:
# OLS 
ols = linear_model.LinearRegression()
ols.fit(X_smote, y_smote)    
y_pred = ols.predict(X_OOS_test)
# turn the continous value into classification via simple >= 0.5 is 1 
y_pred_classification = [1 if x >= 0.5 else 0 for x in y_pred]
print("OLS Model Recall : " , recall_score(y_OOS_test, y_pred_classification))
print("OLS Model Precision : ", precision_score(y_OOS_test,y_pred_classification))
print("--")

# LASSO 
lasso = linear_model.Lasso(alpha=1)
lasso.fit(X_smote, y_smote)
y_pred = lasso.predict(X_OOS_test)
# turn the continous value into classification via simple >= 0.5 is 1 
y_pred_classification = [1 if x >= 0.5 else 0 for x in y_pred]
print("LASSO score : " , )
print("LASSO Model Recall : " , recall_score(y_OOS_test, y_pred_classification))
print("LASSO Model Precision : ", precision_score(y_OOS_test,y_pred_classification))
print("--")

# RIDGE
ridge = linear_model.Ridge(alpha=1)
ridge.fit(X_smote, y_smote)
y_pred = ridge.predict(X_OOS_test)
# turn the continous value into classification via simple >= 0.5 is 1 
y_pred_classification = [1 if x >= 0.5 else 0 for x in y_pred]
print("Ridge Model Recall : " , recall_score(y_OOS_test, y_pred_classification))
print("Ridge Model Precision : ", precision_score(y_OOS_test,y_pred_classification))

OLS Model Recall :  0.7684210526315789
OLS Model Precision :  0.08805790108564536
--
LASSO score : 
LASSO Model Recall :  0.7789473684210526
LASSO Model Precision :  0.09762532981530343
--
Ridge Model Recall :  0.7684210526315789
Ridge Model Precision :  0.08805790108564536


Looks like the curse of dimensionality. 

## Random Forest 

In [122]:

random_forest = RandomForestClassifier(n_estimators = 100, max_depth=10, random_state=0)
random_forest.fit(X_smote, y_smote)


In [123]:
y_pred = random_forest.predict(X_OOS_test)
print("RND Forest Model Recall : " , recall_score(y_OOS_test, y_pred))
print("RND Forest Precision : ", precision_score(y_OOS_test,y_pred))

RND Forest Model Recall :  0.8526315789473684
RND Forest Precision :  0.4879518072289157


## Random forest with LASSO selected variables 

In [101]:
print(X_smote.columns)
print(lasso.coef_)
print(lasso.sparse_coef_)

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')
[-0.          0.         -0.00326987  0.         -0.         -0.
 -0.          0.         -0.         -0.          0.         -0.
 -0.         -0.03410666 -0.         -0.         -0.         -0.
  0.          0.          0.         -0.         -0.         -0.
  0.         -0.          0.          0.          0.00014124]
  (0, 2)	-0.003269866219385262
  (0, 13)	-0.03410666159760446
  (0, 28)	0.00014124466427369918


In [94]:
# Random forest + LASSO 
X_train_rf_lasso = X_smote[["V3","V14","Amount"]]
rf_lasso = RandomForestClassifier(max_depth=2)
rf_lasso.fit(X_train_rf_lasso, y_smote)



  (0, 2)	-0.003269866219385262
  (0, 13)	-0.03410666159760446
  (0, 28)	0.00014124466427369918


In [95]:
X_test_rf_lasso_OOS = X_OOS_test[["V3","V14","Amount"]]
y_pred = clf.predict(X_test_rf_lasso_OOS)
print("RND Forest with LASSO Model Recall : " , recall_score(y_OOS_test, y_pred))
print("RND Forest with LASSO Precision : ", precision_score(y_OOS_test,y_pred))

RND Forest with LASSO Model Recall :  0.8526315789473684
RND Forest with LASSO Precision :  0.0472027972027972


## Using Param Grid to search for best value with K-Fold Cross Validation 

In [128]:

param_grid = [{'n_estimators': [200,225,250], 'max_depth' :[10,20,30]}]
random_forest_cv = RandomForestClassifier()
grid_cv = GridSearchCV(estimator=random_forest_cv, cv=cv, param_grid=param_grid, n_jobs = 3)
grid_cv.fit(X_train_rf_lasso, y_smote)

In [129]:
print(grid_cv.best_estimator_)


RandomForestClassifier(max_depth=30, n_estimators=250)
