In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, roc_auc_score, recall_score,
    precision_score, f1_score, RocCurveDisplay)

from sklearn.model_selection import RandomizedSearchCV

In [2]:
df = pd.read_csv('../data/train_resampled', index_col=0)
df.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,current_pincode_id,state_id,employee_code_id,mobileno_avl_flag,...,delinquent_accts_in_last_six_months,no_of_inquiries,employment_self,employment_none,age_at_disbursal,average_acct_age_months,credit_history_length_months,cns_score_category,cns_score_unreliable,loan_default
0,50578,58400,89.55,67,22807,45,1441,6,1998,1,...,0,0,0,0,34,0,0,0,0,0
1,47145,65550,73.23,67,22807,45,1502,6,1998,1,...,1,0,1,0,33,23,23,9,0,1
2,53278,61360,89.63,67,22807,45,1497,6,1998,1,...,0,0,1,0,33,0,0,0,0,0
3,57513,66113,88.48,67,22807,45,1501,6,1998,1,...,0,1,1,0,25,8,15,12,0,1
4,52378,60300,88.39,67,22807,45,1495,6,1998,1,...,0,1,1,0,41,0,0,0,0,1


In [3]:
df.columns


Index(['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'current_pincode_id', 'state_id', 'employee_code_id',
       'mobileno_avl_flag', 'aadhar_flag', 'pan_flag', 'voterid_flag',
       'driving_flag', 'passport_flag', 'perform_cns_score', 'pri_no_of_accts',
       'pri_active_accts', 'pri_overdue_accts', 'pri_current_balance',
       'pri_sanctioned_amount', 'pri_disbursed_amount', 'sec_no_of_accts',
       'sec_active_accts', 'sec_overdue_accts', 'sec_current_balance',
       'sec_sanctioned_amount', 'sec_disbursed_amount', 'primary_instal_amt',
       'sec_instal_amt', 'new_accts_in_last_six_months',
       'delinquent_accts_in_last_six_months', 'no_of_inquiries',
       'employment_self', 'employment_none', 'age_at_disbursal',
       'average_acct_age_months', 'credit_history_length_months',
       'cns_score_category', 'cns_score_unreliable', 'loan_default'],
      dtype='object')

* exploring a new set of features

In [4]:
X = df[['disbursed_amount', 'ltv', 'pri_overdue_accts', 'pri_no_of_accts', 'no_of_inquiries', 'age_at_disbursal']]
y = df['loan_default']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Logistic Regression

In [6]:
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(solver = 'liblinear'))
])

pipe_params = {
    'lr__penalty': ['l1', 'l2', 'elasticnet', None],
    'lr__warm_start': [True, False]
}

gs_lr = GridSearchCV(lr_pipe,
                 param_grid=pipe_params,
                 n_jobs = -1)

gs_lr.fit(X_train, y_train)
print(gs_lr.best_score_)
gs_lr.best_params_

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/oseianom/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/oseianom/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/oseianom/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/oseianom/opt/anaconda3/lib/python3.

0.5753978963132044


{'lr__penalty': 'l1', 'lr__warm_start': True}

In [7]:
print(f' Training Accuracy Score: {gs_lr.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_lr.score(X_test, y_test)}')

 Training Accuracy Score: 0.5755111133835378
 Test Accuracy Score: 0.5752147427469542


In [8]:
print(f' Training Misclassification Rate: {1 - gs_lr.score(X_train, y_train)}')
print(f' Test Misclassification Rate: {1 - gs_lr.score(X_test, y_test)}')

 Training Misclassification Rate: 0.4244888866164622
 Test Misclassification Rate: 0.42478525725304583


In [9]:
predslr = gs_lr.best_estimator_.predict(X_test)

In [10]:
f1_score(y_test, predslr)

0.6035320223742472

## Random Forest

In [11]:
rf_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rf', RandomForestClassifier())
])

rf_params = {
    'rf__n_estimators' :[100, 150],
    'rf__max_depth': [1, 2, 3],
    'rf__min_samples_leaf': [1, 2],
    'rf__min_samples_split': [1.0, 2.0]
}

rs_rf = RandomizedSearchCV(rf_pipe, param_distributions= rf_params, cv=5, n_jobs = -1)
#gs_rf = GridSearchCV(rf_pipe, param_grid= rf_params, cv=5, n_jobs = -1)

rs_rf.fit(X_train, y_train)
print(rs_rf.best_score_)
rs_rf.best_params_

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/oseianom/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/oseianom/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/oseianom/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/Users/oseianom/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in

0.5004163410331767


{'rf__n_estimators': 150,
 'rf__min_samples_split': 1.0,
 'rf__min_samples_leaf': 2,
 'rf__max_depth': 2}

In [12]:
print(f' Training Accuracy Score: {rs_rf.score(X_train, y_train)}')
print(f' Test Accuracy Score: {rs_rf.score(X_test, y_test)}')

 Training Accuracy Score: 0.5004163410198164
 Test Accuracy Score: 0.498750986063634


In [13]:
print(f' Training Misclassification Score: {1 - rs_rf.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - rs_rf.score(X_test, y_test)}')

 Training Misclassification Score: 0.49958365898018364
 Testtest Misclassification Score: 0.501249013936366


In [14]:
preds_rf = rs_rf.predict(X_test)


In [15]:
f1_score(y_test, preds_rf)


0.0

### Bagging

In [16]:
bagg_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('bagg', BaggingClassifier())
])

bagg_pipe_params = {
    'bagg__n_estimators':[10, 15, 20],
    'bagg__max_samples': [1, 2, 3],
    'bagg__max_features': [1, 2, 3, 4]
}

gs_bagg = GridSearchCV(bagg_pipe,
                    param_grid = bagg_pipe_params, cv = 5,
                    n_jobs = -1)

gs_bagg.fit(X_train, y_train)
print(gs_bagg.best_score_)
print(gs_bagg.best_params_)

0.532291896709024
{'bagg__max_features': 3, 'bagg__max_samples': 3, 'bagg__n_estimators': 20}


In [17]:
print(f' Training Accuracy Score: {gs_bagg.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_bagg.score(X_test, y_test)}')

 Training Accuracy Score: 0.49378775373063466
 Test Accuracy Score: 0.49212244719081427


In [18]:
print(f' Training Misclassification Score: {1 - gs_bagg.score(X_train, y_train)}')
print(f' Test Misclassification Score: {1 - gs_bagg.score(X_test, y_test)}')

 Training Misclassification Score: 0.5062122462693653
 Test Misclassification Score: 0.5078775528091857


In [19]:
preds_bagg = gs_bagg.predict(X_test)


In [20]:
f1_score(y_test, preds_bagg)


0.5192836179987348

### AdaBoost

In [21]:
ada_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('ada', AdaBoostClassifier())
])

ada_pipe_params = {
    'ada__n_estimators':[50, 100, 150, 200],
    'ada__learning_rate': [1.0, 1.5, 2.0]
}

gs_ada = GridSearchCV(ada_pipe,
                    param_grid = ada_pipe_params, cv = 5,
                    n_jobs = -1)

gs_ada.fit(X_train, y_train)
print(gs_ada.best_score_)
print(gs_ada.best_params_)

0.5972010190752592
{'ada__learning_rate': 1.5, 'ada__n_estimators': 200}


In [22]:
print(f' Training Accuracy Score: {gs_ada.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_ada.score(X_test, y_test)}')

 Training Accuracy Score: 0.5991622050004748
 Test Accuracy Score: 0.5982448067315277


In [23]:
print(f' Training Misclassification Score: {1 - gs_ada.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - gs_ada.score(X_test, y_test)}')

 Training Misclassification Score: 0.4008377949995252
 Testtest Misclassification Score: 0.4017551932684723


In [24]:
preds_ada = gs_ada.predict(X_test)

In [25]:
f1_score(y_test, preds_ada)

0.6149063756944372

### Stacked

In [29]:
level1_estimators = [
    #('ss', StandardScaler()),
    #('nb', gs_nb.best_estimator_),
    ('ada', gs_ada.best_estimator_),
    ('rf', rs_rf.best_estimator_)
]

stacked_model = StackingClassifier(estimators=level1_estimators,
                                 final_estimator=LogisticRegression())

In [30]:
stacked_model.fit(X_train, y_train)

StackingClassifier(estimators=[('ada',
                                Pipeline(steps=[('ss', StandardScaler()),
                                                ('ada',
                                                 AdaBoostClassifier(learning_rate=1.5,
                                                                    n_estimators=200))])),
                               ('rf',
                                Pipeline(steps=[('ss', StandardScaler()),
                                                ('rf',
                                                 RandomForestClassifier(max_depth=2,
                                                                        min_samples_leaf=2,
                                                                        min_samples_split=1.0,
                                                                        n_estimators=150))]))],
                   final_estimator=LogisticRegression())

In [31]:
print(f' Training Accuracy Score: {stacked_model.score(X_train, y_train)}')
print(f' Test Accuracy Score: {stacked_model.score(X_test, y_test)}')

 Training Accuracy Score: 0.5937351632860263
 Test Accuracy Score: 0.5934240511876588


In [32]:
print(f' Training Misclassification Score: {1 - stacked_model.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - stacked_model.score(X_test, y_test)}')

 Training Misclassification Score: 0.40626483671397373
 Testtest Misclassification Score: 0.40657594881234116
