# Gradient Boosting Model - LC Loan Data Default Preiciton

Grdient Boosting is a popular model in recent times for its powerful and robust structure.<br>It uses the gradient descent as its loss function.  

In [24]:
#GBM LC - Deep Borkar
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import pickle
from sklearn import ensemble


from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')

In [25]:
def save_csv(df, name= 'default_name' + str(random.randint(1,100))):
    file = '/' + name + '.csv'
    df.to_csv(file, index=False)
    
def save_model(model, ver):
    file = 'models/' + "GBM_" + str(model.n_estimators)+ "_"+str(ver) + '.sav'
    pickle.dump(model, open(file, 'wb'))

def load_model(name):
    name = 'models/' + name
    return pickle.load(open(name, 'rb'))

In [26]:
model_data = pd.read_csv('data/existing_customers.csv', index_col = 0)

In [27]:
model_data.head()

Unnamed: 0,total_pymnt,out_prncp,loan_amnt,installment,total_rec_late_fee,int_rate,term,dti,revol_util,annual_inc,revol_bal,bc_open_to_buy,tot_hi_cred_lim,mo_sin_old_rev_tl_op,bc_util,loan_status
39733,572.57,12688.29,13000,295.66,0.0,12.98,1,21.9,47.2,52000.0,16320,9280.0,140185.0,173.0,63.8,0
39734,750.81,9095.92,9450,382.31,0.0,26.31,0,15.06,23.5,42000.0,14791,23518.0,283111.0,209.0,38.3,0
39735,305.19,2363.11,2500,104.39,0.0,28.72,0,11.38,82.5,142000.0,4540,0.0,48167.0,275.0,100.9,0
39739,592.85,8559.71,9000,285.41,0.0,8.81,0,12.77,7.2,75900.0,2558,20301.0,68088.0,221.0,9.8,0
39741,839.58,7476.28,8000,284.87,0.0,16.91,0,39.43,41.2,42000.0,18572,12191.0,69683.0,99.0,55.5,0


In [28]:
Y = model_data['loan_status']
X = model_data.drop("loan_status", 1)

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3)

In [30]:
from sklearn.metrics import roc_auc_score

Initially I run a basic GBM model with only 3 trees which gives me a ROC-AUC score of 50%. <br> This tells me that I would need to get optimal hyperparameters for a better model.

In [31]:
# run a simple model
params = {'n_estimators': 3,'max_leaf_nodes':6,'learning_rate': 0.1, 'random_state':1}
model = ensemble.GradientBoostingClassifier(**params)
model.fit(X_train, y_train)

roc_auc_score(y_test, model.predict(X_test))

0.5

### Parameter Tuning

To find the optimal combination of hyper-parameters of the GBM model, I have created a grid search function to try different combinations and select the parameters that provide the model with the highest accuracy.

In [32]:
def grid_search(n_est, max_f, l_r, max_lnodes, min_splits, X, Y):
    results = pd.DataFrame(columns=["# Trees", "Max Features", "Learning Rate",
                                    "Max Leaf Nodes", "Min Split","AUC"])
    
    for n_estimators in tqdm(n_est):
            for max_features in max_f:
                    for learning_rate in l_r:
                        for max_leaf_nodes in max_lnodes:
                                for min_samples_split in min_splits:
            
                                    params = {'n_estimators': n_estimators,  
                                      'max_features':max_features,'max_leaf_nodes':max_leaf_nodes, 
                                      'learning_rate': learning_rate, 
                                      'min_samples_split': min_samples_split,
                                      'subsample':0.8, 'random_state':1}
                                    model = ensemble.GradientBoostingClassifier(**params)
                                    model.fit(X, Y)
                                    results.loc[len(results)]=[n_estimators, max_features, learning_rate,
                                                   max_leaf_nodes, min_samples_split,
                                                   roc_auc_score(y_test, model.predict(X_test))]
    return results

In [33]:
run_grids = input("Enter 1 to run Grid Search.\nEnter 2 to Ignore.\n")
if run_grids == '1':
    results1 = grid_search([30,50], ['auto'], [0.1, 0.5, 0.7], [6, 8, 12, 16, 20], [0.00001, 0.1, 0.2, 0.3])
    save_csv(results1, 'gs_results1')
else:
    print("Reading last processed Grid Search Results...", end='')
    results_gs = pd.read_csv('data/gs_results1.csv')
    print(".DONE.")

Enter 1 to run Grid Search.
Enter 2 to Ignore.
2
Reading last processed Grid Search Results....DONE.


#### Model with best tuned parameters

In [None]:
After getting the best parameters, I train the model using them and I get 94% ROC_AUC score.

In [36]:
# Best model from Grid Search Results
params =  params = {'n_estimators': 30, 'max_features': 'auto','max_leaf_nodes': 20,
                    'learning_rate': 0.7, 'min_samples_split': 0.00001,
                    'subsample':0.8, 'random_state':1, 'verbose': 1}
model_best = ensemble.GradientBoostingClassifier(**params)
model_best.fit(X_train, y_train)

roc_auc_score(y_test, model_best.predict(X_test))

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.4984           0.2276            2.29m
         2           0.4340           0.0643            2.47m
         3           0.3248           0.1082            2.43m
         4           0.2911           0.0337            2.25m
         5           0.2404           0.0512            2.23m
         6           0.2156           0.0248            2.09m
         7           0.1921           0.0237            1.95m
         8           0.1785           0.0128            1.83m
         9           0.1686           0.0098            1.72m
        10           0.1550           0.0140            1.65m
        20           0.1129           0.0056           48.52s
        30           0.0961           0.0007            0.00s


0.9468967796559089

In [37]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, model_best.predict(X_test))

array([[586605,    788],
       [  8260,  70508]], dtype=int64)

In [38]:
save_model(model_best, 1)

The best model is saved as it can be used for business analysis later.

In [39]:
model_best = load_model('GBM_30_1.sav')

### Data Preparation for Business Analysis

In [40]:
default_probs = model_best.predict_proba(X_test)[:,1]

In [41]:
business_GBM = pd.DataFrame({'Default_Prob': default_probs, 'Loan_amt' : X_test['loan_amnt'], 'term': X_test['term'] , 'target': y_test})

In [42]:
business_GBM.to_csv('business_analysis_data/business_GBM.csv')