# Model Selection

## Import libraries and datasets

In [None]:
# Import libraries
import pandas as pd # For placing the data into dataframes for analysis
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Fix display settings for viewing output
pd.set_option('display.max_columns', 160)
pd.set_option('display.max_rows', 400)

In [None]:
# Import training datasets
X_train = pd.read_csv("../datasets/X_train.csv")
y_train = pd.read_csv("../datasets/y_train.csv")
X_val = pd.read_csv("../datasets/X_val.csv")
y_val = pd.read_csv("../datasets/y_val.csv")

In [None]:
# Check on imported X train
X_train.head()

In [None]:
X_train.dtypes

In [None]:
X_train.shape

In [None]:
# Check on imported y train
y_train.head()

In [None]:
y_train.dtypes

In [None]:
y_train = np.ravel(y_train)

In [None]:
y_train.shape

In [None]:
# Check on imported X validation set
X_val.head()

In [None]:
X_val.dtypes

In [None]:
X_val.shape

In [None]:
# Check on imported y validation set
y_val.head()

In [None]:
y_val.dtypes

In [None]:
y_val = np.ravel(y_val)

In [None]:
y_val.shape

### Scale features

In [None]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_val_sc = ss.transform(X_val)

### Baseline

In [None]:
blr 

### K Nearest Neighbours

In [None]:
# K Nearest Neighbours model hyperparameter tuning
knn = KNeighborsClassifier()# Instantiate K Nearest Neighbours model
params = {
    'n_neighbors': range(8,12),
    'weights':['uniform'],
    'leaf_size': range(28, 32),
     'algorithm': ['auto'] # Set params for gridsearch
    }
gs_knn = GridSearchCV(knn, param_grid = params, scoring='balanced_accuracy', verbose=1, return_train_score=True)
gs_knn.fit(X_train_sc, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# cross_val_score(knn, X_train_sc, y_train, cv=10).mean()

In [None]:
# knn.fit(X_val_sc, y_val)

In [None]:
# cross_val_score(knn, X_val_sc, y_val, cv=10).mean()

### AdaBoost

In [18]:
ada = AdaBoostClassifier()
pipe_params_ada = {
                'learning_rate': [0.9, 1.0],
                'n_estimators': [30, 50],
                'random_state': [42]
                }
# gs_ada = GridSearchCV(ada, param_grid=pipe_params_ada, scoring='roc_auc', cv=5, return_train_score= True, n_jobs = -1, verbose=1)
gs_ada = GridSearchCV(ada, param_grid=pipe_params_ada, scoring='balanced_accuracy', cv=5, return_train_score= True, verbose=1)
gs_ada.fit(X_train_sc, y_train)
print(gs_ada.best_score_)
gs_ada.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  8.4min finished


0.2161049280814465


{'learning_rate': 1.0, 'n_estimators': 50, 'random_state': 42}

In [None]:
ada = AdaBoostClassifier()
pipe_params_ada = {
                'learning_rate': [1.2, 1.6],
                'n_estimators': [60, 80],
                'random_state': [42]
                }
# gs_ada = GridSearchCV(ada, param_grid=pipe_params_ada, scoring='roc_auc', cv=5, return_train_score= True, n_jobs = -1, verbose=1)
gs_ada = GridSearchCV(ada, param_grid=pipe_params_ada, scoring='balanced_accuracy', cv=5, return_train_score= True, verbose=1)
gs_ada.fit(X_train_sc, y_train)
print(gs_ada.best_score_)
gs_ada.best_params_

### Gradient Boost

In [19]:
gbc = GradientBoostingClassifier()
pipe_params_gbc = {
                'learning_rate': [0.3,0.5],
                'max_depth': [2,3],
                'min_samples_leaf': [1,2],
                'n_estimators': [50,100],
                'random_state': [42],
                'max_features': ['auto']
                }
gs_gbc = GridSearchCV(gbc, param_grid=pipe_params_gbc, scoring='balanced_accuracy', cv=5, return_train_score= True, verbose=1)
gs_gbc.fit(X_train_sc, y_train)
print(gs_gbc.best_score_)
gs_gbc.best_params_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

### Bagging

In [None]:
bbc = BalancedBaggingClassifier()
pipe_params_bbc = {
                'n_estimators': range(10,20),
                'max_samples': [1,3,5],
                'max_features': [1,3,5],
                'random_state': [42]
                }
gs_bbc = GridSearchCV(bbc, param_grid=pipe_params_bbc, scoring='balanced_accuracy', cv=5, return_train_score= True, verbose=1)
gs_bbc.fit(X_train_sc, y_train)
print(gs_bbc.best_score_)
gs_bbc.best_params_

In [None]:
bc = BaggingClassifier()
pipe_params_bc = {
                'n_estimators': range(10,15),
                'max_samples': [3,7],
                'max_features': [3,7],
                'random_state': [42]
                }
gs_bc = GridSearchCV(bc, param_grid=pipe_params_bc, scoring='balanced_accuracy', cv=5, return_train_score= True, verbose=1)
gs_bc.fit(X_train_sc, y_train)
print(gs_bc.best_score_)
gs_bc.best_params_

### Random Forest

In [18]:
# Import rf_train dataset
rf_train = pd.read_csv("../datasets/rf_train.csv")

In [19]:
rf_train.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,emp_length,loan_status,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,mths_since_last_major_derog,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,bc_open_to_buy,bc_util,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,total_inc_consol,total_dti_consol,sub_grade_d,est_age_cr_line,fico_median,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,pymnt_plan_y,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,addr_state_AL,addr_state_AR,addr_state_AZ,addr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_IA,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,application_type_Joint App,disbursement_method_DirectPay,region_northeast,region_south,region_west
0,3600.0,36.0,13.99,123.03,3,10.0,1,0.0,1.0,30.0,0.0,7.0,0.0,2765.0,29.7,13.0,30.0,3.0,1.0,4.0,4.0,1506.0,37.2,148.0,128.0,3.0,3.0,1.0,4.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,55000.0,5.91,34,148,677.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,24700.0,36.0,11.99,820.28,3,10.0,1,1.0,4.0,6.0,0.0,22.0,0.0,21470.0,19.2,38.0,1.0,0.0,0.0,6.0,4.0,57830.0,27.1,113.0,192.0,2.0,2.0,4.0,2.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,65000.0,16.06,31,192,717.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,20000.0,60.0,10.78,432.66,2,10.0,1,0.0,0.0,0.0,0.0,6.0,0.0,7869.0,56.2,18.0,0.0,2.0,5.0,1.0,6.0,2737.0,55.9,125.0,184.0,14.0,14.0,5.0,101.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,134000.0,24.63,24,184,697.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,10400.0,60.0,22.45,289.91,6,3.0,1,1.0,3.0,12.0,0.0,12.0,0.0,21929.0,64.5,35.0,1.0,2.0,1.0,3.0,10.0,4567.0,77.5,128.0,210.0,4.0,4.0,6.0,4.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,104433.0,25.37,61,210,697.0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,11950.0,36.0,13.44,405.18,3,4.0,1,0.0,0.0,0.0,0.0,5.0,0.0,8822.0,68.4,6.0,0.0,0.0,0.0,0.0,0.0,844.0,91.0,338.0,54.0,32.0,32.0,0.0,36.0,2.0,3.0,2.0,2.0,2.0,4.0,4.0,3.0,5.0,100.0,100.0,0.0,0.0,16900.0,12798.0,9400.0,4000.0,34000.0,10.2,33,338,692.0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [20]:
rf_train.dtypes

loan_amnt                              float64
term                                   float64
int_rate                               float64
installment                            float64
grade                                    int64
emp_length                             float64
loan_status                              int64
delinq_2yrs                            float64
inq_last_6mths                         float64
mths_since_last_delinq                 float64
mths_since_last_record                 float64
open_acc                               float64
pub_rec                                float64
revol_bal                              float64
revol_util                             float64
total_acc                              float64
mths_since_last_major_derog            float64
inq_fi                                 float64
total_cu_tl                            float64
inq_last_12m                           float64
acc_open_past_24mths                   float64
bc_open_to_bu

In [21]:
rf_train.shape

(901852, 125)

In [22]:
X_rf_col = rf_train.drop(columns = 'loan_status').columns
X_rf = rf_train[X_rf_col]
y_rf = rf_train['loan_status']

In [23]:
feat_labels = X_rf.columns.tolist()
# feat_labels.index.tolist()
type(feat_labels)

list

In [24]:
# Train-validation split on the rf_train dataset
rf_X_train, rf_X_val, rf_y_train, rf_y_val = train_test_split(X_rf, y_rf, test_size=.2, stratify=y_rf, random_state=42)

In [25]:
# To scale data use for the model
ss = StandardScaler()
rf_X_train_sc = ss.fit_transform(rf_X_train)
rf_X_val_sc = ss.transform(rf_X_val)

In [26]:
# Create a random forest classifier
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)

# Train the classifier
rfc.fit(rf_X_train_sc, y_train)

# Print the name and gini importance of each feature
for feature in zip(feat_labels, rfc.feature_importances_):
    print(feature)

('loan_amnt', 0.023492706020756156)
('term', 0.015973879134550156)
('int_rate', 0.03708264001603949)
('installment', 0.027826386569094454)
('grade', 0.019633909565374447)
('emp_length', 0.013759597208474231)
('delinq_2yrs', 0.0058486320380014886)
('inq_last_6mths', 0.008169773290684934)
('mths_since_last_delinq', 0.015270763737893935)
('mths_since_last_record', 0.008332385348997624)
('open_acc', 0.015577151748882665)
('pub_rec', 0.004013000532307809)
('revol_bal', 0.026432212858311627)
('revol_util', 0.02610543225145921)
('total_acc', 0.020962102170684588)
('mths_since_last_major_derog', 0.01211708312406542)
('inq_fi', 0.006185016963523172)
('total_cu_tl', 0.006581331102772327)
('inq_last_12m', 0.009183531431357457)
('acc_open_past_24mths', 0.017670778285165455)
('bc_open_to_buy', 0.026549305863209034)
('bc_util', 0.02551480547013439)
('mo_sin_old_il_acct', 0.025886504591748925)
('mo_sin_old_rev_tl_op', 0.027191005881744886)
('mo_sin_rcnt_rev_tl_op', 0.019393512350309977)
('mo_sin_rcnt

In [27]:
# Create feature selector using random forest classifier
sfm = SelectFromModel(rfc)

# Train selector
sfm.fit(rf_X_train_sc, rf_y_train)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=1000, n_jobs=-1,
                                                 random_state=42))

In [28]:
# List out features which are important
sfm.get_support()
selected_feat= X_rf.columns[(sfm.get_support())]
print(len(selected_feat))
print(selected_feat)

44
Index(['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'emp_length',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'revol_bal', 'revol_util', 'total_acc',
       'mths_since_last_major_derog', 'inq_last_12m', 'acc_open_past_24mths',
       'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'num_actv_bc_tl', 'num_actv_rev_tl',
       'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
       'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'pct_tl_nvr_dlq',
       'percent_bc_gt_75', 'tot_hi_cred_lim', 'total_bal_ex_mort',
       'total_bc_limit', 'total_il_high_credit_limit', 'total_inc_consol',
       'total_dti_consol', 'sub_grade_d', 'est_age_cr_line', 'fico_median'],
      dtype='object')


In [29]:
# Create new X train and X val with selected features
fs_X_train = sfm.transform(rf_X_train_sc)
fs_X_val = sfm.transform(rf_X_val_sc)

In [30]:
# Create a new random forest classifier for the most important features
rf_fs = RandomForestClassifier()

# Train the new classifier on the new dataset containing the most important features
rf_fs.fit(fs_X_train, y_train)

RandomForestClassifier()

In [33]:
rf_params = {'min_samples_split': [3,5],
             'min_samples_leaf': range(15, 20, 1),
             'class_weight': ['balanced'],
             'n_estimators': [1000]}

gs_rf = GridSearchCV(rf_fs, param_grid=rf_params, scoring='accuracy', verbose=1)

In [None]:
gs_rf.fit(fs_X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
gs_rf.score(fs_X_train, y_train), gs_rf.score(fs_X_val, y_val)

In [None]:
gs_rf.best_params_

In [None]:
pd.Series(sel.estimator_.feature_importances_.ravel()).hist()

In [None]:
# Create new X train and X val with selected features
fs_X_train = rf_X_train[]
fs_X_val = rf_X_val[]

In [None]:
# To scale feature selected data use for the model
ss = StandardScaler()
fs_X_train_sc = ss.fit_transform(fs_X_train)
fs_X_val_sc = ss.transform(fs_X_val)