# Notebook for machine learning models that predict loan 


In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Loan_data_ML.csv', index_col='member_id')

In [3]:
df.index

Int64Index([    1,     2,     3,     4,     5,     6,     7,     8,     9,
               10,
            ...
            42447, 42448, 42449, 42450, 42451, 42452, 42453, 42454, 42455,
            42456],
           dtype='int64', name='member_id', length=42453)

In [4]:
X = df.drop('loan_status_Charged Off', axis=1).values
y = df['loan_status_Charged Off'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=32, stratify=y)

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Logistic regression without tuning
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
# Logistic regression without tuning
from sklearn.model_selection import cross_val_score
x_val = cross_val_score(reg, X_train, y_train, cv=5)
print('CV Score on training data:', np.mean(x_val))
print('Score on training data:', reg.score(X_train, y_train))
print('Score on test set:', reg.score(X_test, y_test))

CV Score on training data: 0.9541001894893928
Score on training data: 0.9533600296126796
Score on test set: 0.9558731155778895


In [8]:
# Logistic regression without tuning
from sklearn.metrics import classification_report
pred_y = reg.predict(X_test)
print(classification_report(y_test, pred_y))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     10813
           1       1.00      0.71      0.83      1923

   micro avg       0.96      0.96      0.96     12736
   macro avg       0.97      0.86      0.90     12736
weighted avg       0.96      0.96      0.95     12736



In [9]:
# Optimal logisitc regression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
param_grid = {'penalty':['l1','l2'], 'C':np.logspace(-5,8,15)}
logregcv = GridSearchCV(reg, param_grid, cv=5)
logregcv.fit(X_train, y_train)
print('Best parameters:', logregcv.best_params_)
print('Score from the best parameters:', logregcv.best_score_)

Best parameters: {'C': 0.006105402296585327, 'penalty': 'l1'}
Score from the best parameters: 0.9559174883063566


In [10]:
best_logistic = logregcv.best_estimator_

In [11]:
# Optimal logisitc regression
y_pred = best_logistic.predict(X_test)
print('CV accuracy score of optimized model on the test data:', np.mean(cross_val_score(best_logistic, X_train, y_train, cv=5)))
print('Accuracy score of optimized model on the training data:', best_logistic.score(X_train, y_train))
print('Accuracy score of optimized model on the test data:', best_logistic.score(X_test, y_test))

CV accuracy score of optimized model on the test data: 0.9559171477475903
Accuracy score of optimized model on the training data: 0.9557155836726453
Accuracy score of optimized model on the test data: 0.9571293969849246


In [12]:
# Optimal logisitc regression
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98     10813
           1       0.98      0.73      0.84      1923

   micro avg       0.96      0.96      0.96     12736
   macro avg       0.96      0.87      0.91     12736
weighted avg       0.96      0.96      0.95     12736



In [12]:
# Logistic regression with SCALED features
from sklearn.preprocessing import scale
X_scaled = scale(X)

In [13]:
# Logistic regression with SCALED features
X_scaled_train, X_scaled_test, y_train2, y_test2 = train_test_split(X_scaled, y, test_size=0.3, random_state=32,\
                                                                    stratify=y)
reg.fit(X_scaled_train, y_train)
scaled_CV_score = cross_val_score(reg, X_scaled_train, y_train, cv=5)
print('CV score:',np.mean(scaled_CV_score))
print('Score using all training data:',reg.score(X_scaled_train, y_train))
print('Score using testing data:',reg.score(X_scaled_test, y_test))

CV score: 0.9527203809546319
Score using all training data: 0.9532254265235387
Score using testing data: 0.9518687185929648


In [15]:
print(classification_report(y_test2, reg.predict(X_scaled_test)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     10813
           1       0.93      0.74      0.82      1923

   micro avg       0.95      0.95      0.95     12736
   macro avg       0.94      0.86      0.90     12736
weighted avg       0.95      0.95      0.95     12736



In [58]:
# Tuning the Logistic regression with SCALED features
log_scaled_cv = GridSearchCV(reg, param_grid, cv=5)
log_scaled_cv.fit(X_scaled_train, y_train2)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.00000e-05, 8.48343e-05, 7.19686e-04, 6.10540e-03, 5.17947e-02,
       4.39397e-01, 3.72759e+00, 3.16228e+01, 2.68270e+02, 2.27585e+03,
       1.93070e+04, 1.63789e+05, 1.38950e+06, 1.17877e+07, 1.00000e+08])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [59]:
print('Best parameters:',log_scaled_cv.best_params_)
print('Score from best parameters:',log_scaled_cv.best_score_)

Best parameters: {'C': 31.622776601683793, 'penalty': 'l1'}
Score from best parameters: 0.9533263788403944


In [60]:
# Tuned Logistic regression with SCALED features
best_scaled_model = log_scaled_cv.best_estimator_
tuned_scaled_CV_score = cross_val_score(best_scaled_model, X_scaled_train, y_train2, cv=5)
print('Tuned: CV score:',np.mean(tuned_scaled_CV_score))
print('Tuned: Score using all training data:',best_scaled_model.score(X_scaled_train, y_train2))
print('Tuned: Score using testing data:',best_scaled_model.score(X_scaled_test, y_test2))

Tuned: CV score: 0.9507343364046459
Tuned: Score using all training data: 0.9539320927415285
Tuned: Score using testing data: 0.9522613065326633


In [61]:
# Tuned Logistic regression with SCALED features
print(classification_report(y_test2, best_scaled_model.predict(X_scaled_test)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     10813
           1       0.93      0.74      0.82      1923

   micro avg       0.95      0.95      0.95     12736
   macro avg       0.94      0.87      0.90     12736
weighted avg       0.95      0.95      0.95     12736



In [62]:
#Untuned support vector machine
from sklearn.svm import SVC
svm = SVC(random_state=32)
svm.fit(X_scaled_train, y_train2)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=32,
  shrinking=True, tol=0.001, verbose=False)

In [63]:
#Untuned support vector machine
cv_score_svm2 = cross_val_score(svm, X_scaled_train, y_train2, cv=4)
print('Untuned scaled CV score with training set', np.mean(cv_score_svm2))
print('Untuned scaled score with training set', svm.score(X_scaled_train, y_train))
print('Untuned scaled score with testing set', svm.score(X_scaled_test, y_test))

Untuned scaled CV score with training set 0.9340108477797987
Untuned scaled score with training set 0.9490863815324562
Untuned scaled score with testing set 0.9382066582914573


In [64]:
#Untuned support vector machine
y_pred_smv2 = svm.predict(X_scaled_test)
print(classification_report(y_test2, y_pred_smv2))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     10813
           1       0.99      0.60      0.74      1923

   micro avg       0.94      0.94      0.94     12736
   macro avg       0.96      0.80      0.85     12736
weighted avg       0.94      0.94      0.93     12736



In [65]:
#Tuned support vector machine
param_dist_svc = {'C':np.logspace(-2, 10, 13), 'gamma':np.logspace(-9, 3, 13)}
svm_cv = RandomizedSearchCV(svm, param_distributions=param_dist_svc, cv=5)
svm_cv.fit(X_scaled_train, y_train2)
best_svm = svm_cv.best_estimator_

In [66]:
#Tuned support vector machine
print("best parameters:", svm_cv.best_params_)
print('score from best parameters:', svm_cv.best_score_)

best parameters: {'gamma': 0.01, 'C': 1000.0}
score from best parameters: 0.9433320994716828


In [67]:
#Tuned support vector machine
cv_score_svm = cross_val_score(best_svm, X_scaled_train, y_train2, cv=4)
print('Scaled CV score with training set', np.mean(cv_score_svm))
print('Scaled score with training set', best_svm.score(X_scaled_train, y_train2))
print('Scaled score with testing set', best_svm.score(X_scaled_test, y_test2))

Scaled CV score with training set 0.9428610677264737
Scaled score with training set 0.9978800013460309
Scaled score with testing set 0.9420540201005025


In [68]:
#Tuned support vector machine
y_pred_smv2 = best_svm.predict(X_scaled_test)
print(classification_report(y_test2, y_pred_smv2))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     10813
           1       0.81      0.80      0.81      1923

   micro avg       0.94      0.94      0.94     12736
   macro avg       0.89      0.89      0.89     12736
weighted avg       0.94      0.94      0.94     12736



In [44]:
# Random forest without tuning
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [45]:
# Random forest without tuning
x_vali = cross_val_score(rf, X_train, y_train, cv=5)
print('Untuned Cross Validation score on training set:',x_vali.mean())
print('Untuned Score on training set:',rf.score(X_train, y_train))
print('Untuned Score on the test set:',rf.score(X_test, y_test))

Untuned Cross Validation score on training set: 0.9300401586416485
Untuned Score on training set: 0.9942457179392267
Untuned Score on the test set: 0.9296482412060302


In [46]:
# Random forest without tuning
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     10813
           1       0.98      0.55      0.70      1923

   micro avg       0.93      0.93      0.93     12736
   macro avg       0.95      0.77      0.83     12736
weighted avg       0.93      0.93      0.92     12736



In [47]:
# Optimal random forest classifier, tuning the RF
param_dist = {'n_estimators':[800, 900, 1000, 1100, 1200], 'max_features':['auto','log2',None], 'max_depth':[7,10,None]}
rfcv = RandomizedSearchCV(rf, param_distributions=param_dist, cv=4)

In [48]:
# Optimal random forest classifier
rfcv.fit(X_train, y_train)
print('Best parameters for random forest:', rfcv.best_params_)
print('Score of random forest with best params:', rfcv.best_score_)

Best parameters for random forest: {'n_estimators': 900, 'max_features': 'auto', 'max_depth': None}
Score of random forest with best params: 0.938250832856614


In [49]:
yrf_predict = rfcv.predict(X_test)

In [50]:
# Optimal random forest classifier
print(classification_report(y_test, yrf_predict))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     10813
           1       1.00      0.61      0.76      1923

   micro avg       0.94      0.94      0.94     12736
   macro avg       0.97      0.81      0.86     12736
weighted avg       0.95      0.94      0.94     12736



In [51]:
# Optimal random forest classifier
best_rforest = rfcv.best_estimator_

from sklearn.model_selection import cross_val_score
x_val = cross_val_score(best_rforest, X_train, y_train, cv=5)

In [52]:
# Optimal random forest classifier
print('Optinal Cross Validation score on training set:',x_val.mean())
print('Optimal Score on training set:',best_rforest.score(X_train, y_train))
print('Optiman Score on the test set:',best_rforest.score(X_test, y_test))

Optinal Cross Validation score on training set: 0.9390920217603366
Optimal Score on training set: 1.0
Optiman Score on the test set: 0.9415043969849246


In [None]:
# Optimal RF trained with scaled data for ensemble
#rf2 = RandomForestClassifier()
#rf2cv = RandomizedSearchCV(rf2, param_distribution=param_dist, cv=5)
#rf2cv.fit(X_scaled_train, y_train)

In [None]:
# Optimal RF trained with scaled data for ensemble
#best_scaled_rf = rf2cv.best_estimator_
#scaled_cv_score = cross_val_score(best_scaled_rf, X_scaled_train, y_train, cv=4)

In [None]:
# Optimal RF trained with scaled data for ensemble
#print('Optinal Cross Validation score on the SCALED training set:',scaled_cv_score.mean())
#print('Optimal Score on SCALED training set:',best_scaled_rf.score(X_scaled_train, y_train))
#print('Optiman Score on the SCALED test set:',best_scaled_rf.score(X_scaled_test, y_test))

In [None]:
# Optimal RF trained with scaled data for ensemble
#print(classification_report(y_test, best_scaled_rf.predict(X_scaled_test)))

In [None]:
# Optimal RF trained with scaled data for ensemble
#print(rf2cv.best_params_)
#print(rf2cv.best_score_)

In [None]:
# Ensemble model
#from sklearn.ensemble import VotingClassifier
#classifiers = [('Logistic Regression', best_scaled_model), ('Random Forest', best_scaled_rf), ('SVM', best_svm)]
#vc = VotingClassifier(estimators=classifiers)
#vc.fit(X_scaled_train, y_train)

In [None]:
# Ensemble model
#cv_ensemble = cross_val_score(vc, X_scaled_train, y_train, cv=4)

In [None]:
# Ensemble model
#print('CV score of ensemble:',np.mean(cv_ensemble))
#print('Score, ensemble, training set:',vc.score(X_scaled_train, y_train))
#print('Score, ensemble, testing set:',vc.score(X_scaled_test, y_test))


In [None]:
# Ensemble model
#y_pred_vc = vc.predict(X_scaled_test)
#print(classification_report(y_test, y_pred_vc))