In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, roc_auc_score, recall_score,
    precision_score, f1_score, RocCurveDisplay)

In [2]:
df = pd.read_csv('../data/train_resampled.csv')

features = ['disbursed_amount_treated', 'pri_active_accts',  'pri_overdue_accts', 'sec_active_accts', 'sec_overdue_accts',
            'sec_current_balance', 'perform_cns_score', 'no_of_inquiries', 'age_at_disbursal', 'cns_score_unreliable',
            'credit_history_length_months', 'primary_instal_amt', 'sec_instal_amt', 'loan_default']

In [3]:
X = df[features]
y = df['loan_default']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

### Logistic Regression

In [4]:
pipe = Pipeline([
    ('lr', LogisticRegression(solver = 'liblinear'))
])

pipe_params = {
    'lr__penalty': ['l1', 'l2', 'elasticnet', None],
    'lr__warm_start': [True, False]
}

gs_lr = GridSearchCV(pipe,
                 param_grid=pipe_params,
                 n_jobs = -1)

gs_lr.fit(X_train, y_train)
print(gs_lr.best_score_)
gs_lr.best_params_

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 457

1.0


{'lr__penalty': 'l1', 'lr__warm_start': True}

* Accuracy

In [5]:
print(f' Training Accuracy Score: {gs_lr.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_lr.score(X_test, y_test)}')

 Training Accuracy Score: 1.0
 Test Accuracy Score: 1.0


* Misclassification

In [6]:
print(f' Training Misclassification Rate: {1 - gs_lr.score(X_train, y_train)}')
print(f' Test Misclassification Rate: {1 - gs_lr.score(X_test, y_test)}')

 Training Misclassification Rate: 0.0
 Test Misclassification Rate: 0.0


* Preds / F1

In [7]:
predslr = gs_lr.best_estimator_.predict(X_test)

In [8]:
f1_score(y_test, predslr)

1.0

### Random Forest

In [9]:
rf = RandomForestClassifier()

rf_params = {
    'n_estimators' :[100, 150,200],
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [1, 2]
}
gs_rf = GridSearchCV(rf, param_grid= rf_params, cv=5, n_jobs = -1)

gs_rf.fit(X_train, y_train)
print(gs_2.best_score_)
gs_rf.best_params_


KeyboardInterrupt: 

* Accuracy

In [None]:
print(f' Training Accuracy Score: {gs_rf.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_rf.score(X_test, y_test)}')

* Misclassification

In [None]:
print(f' Training Misclassification Score: {1 - gs_rf.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - gs_rf.score(X_test, y_test)}')

* Preds / F1

In [None]:
preds_rf = gs_rf.predict(X_test)


In [None]:
f1_score(y_test, preds_rf)


### Naive Bayes

In [None]:
nb_pipe = Pipeline([
    ('mnb', MultinomialNB())
])

nb_pipe_params = {
    'mnb__alpha':[1.0, 1.5, 2.0],
    'mnb__fit_prior': [True, False]
}

gs_nb = GridSearchCV(nb_pipe,
                    param_grid = nb_pipe_params, cv = 5,
                    n_jobs = -1)

gs_nb.fit(X_train, y_train)
print(gs_nb.best_score_)
print(gs_nb.best_params_)

* Accuracy

In [None]:
print(f' Training Accuracy Score: {gs_nb.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_nb.score(X_test, y_test)}')


* Misclassification

In [None]:
print(f' Training Misclassification Score: {1 - gs_nb.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - gs_nb.score(X_test, y_test)}')

* Preds / F1

In [None]:
preds_nb = gs_nb.predict(X_test)


In [None]:
f1_score(y_test, preds_nb)


### Bagging

In [None]:
bagg_pipe = Pipeline([
    ('bagg', DecisionTreeClassifier())
])

bagg_pipe_params = {
    'bagg__n_estimators':[10, 15, 20],
    'bagg__max_samples': [1, 2, 3],
    'bagg__max_features': [1, 2, 3, 4]
}

gs_bagg = GridSearchCV(bagg_pipe,
                    param_grid = bagg_pipe_params, cv = 5,
                    n_jobs = -1)

gs_bagg.fit(X_train, y_train)
print(gs_bagg.best_score_)
print(gs_bagg.best_params_)

* Accuracy

In [None]:
print(f' Training Accuracy Score: {gs_bagg.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_bagg.score(X_test, y_test)}')

* Missclassification

In [None]:
print(f' Training Misclassification Score: {1 - gs_bagg.score(X_train, y_train)}')
print(f' Test Misclassification Score: {1 - gs_bagg.score(X_test, y_test)}')

* Preds / F1 Score

In [None]:
preds_bagg = gs_bagg.predict(X_test)


In [None]:
f1_score(y_test, preds_bagg)


### AdaBoost

In [None]:
ada_pipe = Pipeline([
    ('ada', AdaBoostClassifier())
])

ada_pipe_params = {
    'ada__n_estimators':[50, 100, 150, 200],
    'ada__learning_rate': [1.0, 1.5, 2.0]
}

gs_ada = GridSearchCV(ada_pipe,
                    param_grid = ada_pipe_params, cv = 5,
                    n_jobs = -1)

gs_ada.fit(X_train, y_train)
print(gs_ada.best_score_)
print(gs_ada.best_params_)

* Accuracy

In [None]:
print(f' Training Accuracy Score: {gs_ada.score(X_train, y_train)}')
print(f' Test Accuracy Score: {gs_ada.score(X_test, y_test)}')

* Misclassification

In [None]:
print(f' Training Misclassification Score: {1 - gs_ada.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - gs_ada.score(X_test, y_test)}')

* Preds / F1

In [None]:
preds_ada = gs_ada.predict(X_test)

In [None]:
f1_score(y_test, preds_ada)

### Stacking

In [None]:
level1_estimators = [
    ('nb', gs_nb.best_estimator_),
    ('ada', gs_ada.best_estimator_),
    ('rf', gs_rf.best_estimator_)
]

stacked_model = StackingClassifier(estimators=level1_estimators,
                                 final_estimator=LogisticRegression())

In [None]:
stacked_model.fit(X_train, y_train)

* Accuracy

In [None]:
print(f' Training Accuracy Score: {stacked_model.score(X_train, y_train)}')
print(f' Test Accuracy Score: {stacked_model.score(X_test, y_test)}')

* Misclassification

In [None]:
print(f' Training Misclassification Score: {1 - stacked_model.score(X_train, y_train)}')
print(f' Testtest Misclassification Score: {1 - stacked_model.score(X_test, y_test)}')

* Preds / F1

In [None]:
preds_stacked_model = stacked_model.predict(X_test)

In [None]:
f1_score(y_test, preds_stacked_model)