In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, roc_auc_score, recall_score,
    precision_score, f1_score, RocCurveDisplay)

from sklearn.model_selection import RandomizedSearchCV

In [2]:
df = pd.read_csv('../data/train_resampled.csv')
df.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,current_pincode_id,state_id,employee_code_id,aadhar_flag,...,no_of_inquiries,employment_self,employment_none,age_at_disbursal,average_acct_age_months,credit_history_length_months,cns_score_category,cns_score_unreliable,loan_default,disbursed_amount_treated
0,50578,58400,89.55,67,22807,45,1441,6,1998,1,...,0,0,0,34,0,0,0,0,0,50578.0
1,47145,65550,73.23,67,22807,45,1502,6,1998,1,...,0,1,0,33,23,23,9,0,1,47145.0
2,53278,61360,89.63,67,22807,45,1497,6,1998,1,...,0,1,0,33,0,0,0,0,0,53278.0
3,57513,66113,88.48,67,22807,45,1501,6,1998,1,...,1,1,0,25,8,15,12,0,1,57513.0
4,52378,60300,88.39,67,22807,45,1495,6,1998,1,...,1,1,0,41,0,0,0,0,1,52378.0


In [3]:
features = ['disbursed_amount_treated', 'pri_active_accts',  'pri_overdue_accts', 'sec_active_accts', 'sec_overdue_accts',
            'sec_current_balance', 'perform_cns_score', 'no_of_inquiries', 'age_at_disbursal', 'cns_score_unreliable',
            'credit_history_length_months', 'primary_instal_amt', 'sec_instal_amt']

* exploring a new set of features

In [4]:
X = df[features]
y = df['loan_default']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Logistic Regression

In [5]:
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(solver = 'liblinear'))
])

pipe_params = {
    'lr__penalty': ['l1', 'l2', 'elasticnet', None],
    'lr__warm_start': [True, False]
}

gs_lr = GridSearchCV(lr_pipe,
                 param_grid=pipe_params,
                 n_jobs = -1)

gs_lr.fit(X_train, y_train)
print(gs_lr.best_score_)
gs_lr.best_params_

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 457

0.5655554451184404


{'lr__penalty': 'l2', 'lr__warm_start': True}

In [6]:
print(f' Training Accuracy Score: {gs_lr.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_lr.score(X_test, y_test)}')

 Training Accuracy Score: 0.5656577092478836
 Test Accuracy Score: 0.5676877903409588


In [7]:
print(f' Training Misclassification Rate: {1 - gs_lr.score(X_train, y_train)}')
print(f' Test Misclassification Rate: {1 - gs_lr.score(X_test, y_test)}')

 Training Misclassification Rate: 0.4343422907521164
 Test Misclassification Rate: 0.43231220965904116


In [8]:
predslr = gs_lr.best_estimator_.predict(X_test)

In [9]:
f1_score(y_test, predslr)

0.5768942074674559

## Random Forest

In [10]:
rf_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rf', RandomForestClassifier())
])

rf_params = {
    'rf__n_estimators' :[100, 150],
    'rf__max_depth': [1, 2, 3],
    'rf__min_samples_leaf': [1, 2]
}

rs_rf = RandomizedSearchCV(rf_pipe, param_distributions= rf_params, cv=5, n_jobs = -1)
#gs_rf = GridSearchCV(rf_pipe, param_grid= rf_params, cv=5, n_jobs = -1)

rs_rf.fit(X_train, y_train)
print(rs_rf.best_score_)
rs_rf.best_params_

0.6087124766334185


{'rf__n_estimators': 100, 'rf__min_samples_leaf': 1, 'rf__max_depth': 3}

In [11]:
print(f' Training Accuracy Score: {rs_rf.score(X_train, y_train)}')
print(f' Test Accuracy Score: {rs_rf.score(X_test, y_test)}')

 Training Accuracy Score: 0.6082669257233012
 Test Accuracy Score: 0.6079191866070646


In [12]:
print(f' Training Misclassification Score: {1 - rs_rf.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - rs_rf.score(X_test, y_test)}')

 Training Misclassification Score: 0.3917330742766988
 Testtest Misclassification Score: 0.39208081339293543


In [13]:
preds_rf = rs_rf.predict(X_test)

In [14]:
f1_score(y_test, preds_rf)

0.6157496886140102

### Bagging

In [16]:
bagg_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('bagg', BaggingClassifier())
])

bagg_pipe_params = {
    'bagg__n_estimators':[10, 15, 20],
    'bagg__max_samples': [1, 2, 3],
    'bagg__max_features': [1, 2, 3, 4]
}

gs_bagg = GridSearchCV(bagg_pipe,
                    param_grid = bagg_pipe_params, cv = 5,
                    n_jobs = -1)

gs_bagg.fit(X_train, y_train)
print(gs_bagg.best_score_)
print(gs_bagg.best_params_)

0.5278764874824216
{'bagg__max_features': 4, 'bagg__max_samples': 3, 'bagg__n_estimators': 15}


In [17]:
print(f' Training Accuracy Score: {gs_bagg.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_bagg.score(X_test, y_test)}')

 Training Accuracy Score: 0.5314227906535093
 Test Accuracy Score: 0.5330988693136998


In [18]:
print(f' Training Misclassification Score: {1 - gs_bagg.score(X_train, y_train)}')
print(f' Test Misclassification Score: {1 - gs_bagg.score(X_test, y_test)}')

 Training Misclassification Score: 0.4685772093464907
 Test Misclassification Score: 0.46690113068630024


In [19]:
preds_bagg = gs_bagg.predict(X_test)


In [20]:
f1_score(y_test, preds_bagg)


0.5620201646471187

### AdaBoost

In [21]:
ada_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('ada', AdaBoostClassifier())
])

ada_pipe_params = {
    'ada__n_estimators':[50, 100, 150, 200],
    'ada__learning_rate': [1.0, 1.5, 2.0]
}

gs_ada = GridSearchCV(ada_pipe,
                    param_grid = ada_pipe_params, cv = 5,
                    n_jobs = -1)

gs_ada.fit(X_train, y_train)
print(gs_ada.best_score_)
print(gs_ada.best_params_)

0.6351209187343446
{'ada__learning_rate': 1.5, 'ada__n_estimators': 200}


In [22]:
print(f' Training Accuracy Score: {gs_ada.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_ada.score(X_test, y_test)}')

 Training Accuracy Score: 0.6380535692112164
 Test Accuracy Score: 0.6380927338066439


In [23]:
print(f' Training Misclassification Score: {1 - gs_ada.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - gs_ada.score(X_test, y_test)}')

 Training Misclassification Score: 0.36194643078878364
 Testtest Misclassification Score: 0.3619072661933561


In [24]:
preds_ada = gs_ada.predict(X_test)

In [25]:
f1_score(y_test, preds_ada)

0.650580743436224

### Stacked

In [26]:
level1_estimators = [
    #('ss', StandardScaler()),
    #('nb', gs_nb.best_estimator_),
    ('ada', gs_ada.best_estimator_),
    ('rf', rs_rf.best_estimator_)
]

stacked_model = StackingClassifier(estimators=level1_estimators,
                                 final_estimator=LogisticRegression())

In [None]:
stacked_model.fit(X_train, y_train)

In [None]:
print(f' Training Accuracy Score: {stacked_model.score(X_train, y_train)}')
print(f' Test Accuracy Score: {stacked_model.score(X_test, y_test)}')

In [None]:
print(f' Training Misclassification Score: {1 - stacked_model.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - stacked_model.score(X_test, y_test)}')