# Modeling

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import sys

module_path = os.path.abspath(os.path.join( os.pardir, 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from modules import dataloading as dl

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import pickle 

target_csv = '../src/data/training_data.csv'

## Transformed DF

In [2]:
df = dl.clean_df(target_csv)

In [3]:
df.head()

Unnamed: 0,max_credit,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,pay_status_may,pay_status_apr,bill_sep,bill_aug,bill_jul,...,age_2,age_3,age_4,age_5,education_1,education_2,education_3,education_4,education_5,education_6
0,220000,0,0,0,0,0,0,222598,222168,217900,...,0,0,0,0,1,0,0,0,0,0
1,200000,-1,-1,-1,-1,-1,-1,326,326,326,...,0,0,0,0,0,0,1,0,0,0
2,180000,-2,-2,-2,-2,-2,-2,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,80000,0,0,0,0,0,0,51372,51872,47593,...,0,0,0,0,0,1,0,0,0,0
4,10000,0,0,0,0,0,0,8257,7995,4878,...,0,0,0,0,0,1,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22499 entries, 0 to 22499
Data columns (total 47 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   max_credit         22499 non-null  int64  
 1   pay_status_sep     22499 non-null  int64  
 2   pay_status_aug     22499 non-null  int64  
 3   pay_status_jul     22499 non-null  int64  
 4   pay_status_jun     22499 non-null  int64  
 5   pay_status_may     22499 non-null  int64  
 6   pay_status_apr     22499 non-null  int64  
 7   bill_sep           22499 non-null  int64  
 8   bill_aug           22499 non-null  int64  
 9   bill_jul           22499 non-null  int64  
 10  bill_jun           22499 non-null  int64  
 11  bill_may           22499 non-null  int64  
 12  bill_apr           22499 non-null  int64  
 13  payment_sep        22499 non-null  int64  
 14  payment_aug        22499 non-null  int64  
 15  payment_jul        22499 non-null  int64  
 16  payment_jun        224

In [5]:
df.describe()

Unnamed: 0,max_credit,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,pay_status_may,pay_status_apr,bill_sep,bill_aug,bill_jul,...,age_2,age_3,age_4,age_5,education_1,education_2,education_3,education_4,education_5,education_6
count,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,...,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0,22499.0
mean,167054.521534,-0.01409,-0.128806,-0.160185,-0.214587,-0.2597,-0.287613,51455.869416,49385.313214,47042.13,...,0.211609,0.078626,0.010978,0.000933,0.351971,0.467399,0.16503,0.004,0.009245,0.001867
std,129864.960118,1.123191,1.197954,1.19888,1.174824,1.139984,1.154249,74470.257248,71953.814345,69917.92,...,0.408458,0.26916,0.104203,0.030538,0.477596,0.498947,0.371215,0.063122,0.095707,0.043167
min,10000.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,-69777.0,-157264.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3506.0,2975.0,2611.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,140000.0,0.0,0.0,0.0,0.0,0.0,0.0,22387.0,21145.0,20053.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,240000.0,0.0,0.0,0.0,0.0,0.0,0.0,66988.5,63755.0,60161.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
max,1000000.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,983931.0,1664089.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Train-test split

In [6]:
# Create matrix of features
X = df.drop('default', axis = 1) # grabs everything else but 'Survived'

# Create target variable
y = df['default'] # y is the column we're trying to predict

# Create a list of the features being used in the 
feature_cols = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Model

In [7]:
# instantiate RFC
rfc = RandomForestClassifier(random_state = 1, n_estimators=100)

In [8]:
# Fit to train data
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [9]:
#use the fitted model to predict on the test data
rfc_preds = rfc.predict(X_test)

rfc_f1 = metrics.f1_score(y_test, rfc_preds)

# checking accuracy on the test data
print('Test F1 score: ', rfc_f1)

Test F1 score:  0.467005076142132


In [10]:
# grid search
param_grid = { 
    'n_estimators': [275, 300, 325],
    'max_depth': list(range(6,10)),
    'max_features': list(range(3,9))
}

#create a grid search object and fit it to the data

grid_tree=GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

grid_tree.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 16.0min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [6, 7, 8, 9],
                         'max_features': [3, 4, 5, 6, 7, 8],
                         'n_estimators': [200, 250, 300, 350, 400]},
             scoring='f1', verbose=1)

In [11]:
### Identify the best params 



# Single best score achieved across all params (min_samples_split)
print(grid_tree.best_score_)

# Dictionary containing the parameters (min_samples_split) used to generate that score
print(grid_tree.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid_tree.best_estimator_)
#Identify the best score during fitting with cross-validation

0.46627785647443654
{'max_depth': 9, 'max_features': 8, 'n_estimators': 350}
RandomForestClassifier(max_depth=9, max_features=8, n_estimators=350)


In [12]:
#Predict the response for test dataset
y_pred = grid_tree.best_estimator_.predict(X_test)

# Model F1, how often is the classifier correct?
print("F1:",metrics.f1_score(y_test, y_pred))

F1: 0.483937823834197


In [13]:
saved_model = pickle.dump(grid_tree.best_estimator_, open( "../src/model.p", "wb" ))