# <a id='0'>Contents</a>

- <a href='#2'>Importing Packages</a>  
- <a href='#3'>Uploading Data</a>
- <a href='#4'>Creating Train, Validation, and Testing Sets</a>  
- <a href='#5'>Data Cleaning</a>  
- <a href='#5'>Exploratory Data Analysis</a>
- <a href='#6'>Feature Engineering</a>  

# Introduction

This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card customers in Taiwan from April 2005 to September 2005.


There are 25 variables:

- ID: ID of each client
- LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit
- SEX: Gender
    - 1=male
    - 2=female
- EDUCATION:
    - 1=graduate school
    - 2=university
    - 3=high school
    - 4=others
    - 5=unknown
    - 6=unknown
- MARRIAGE: Marital status
    - 1=married
    - 2=single
    - 3=others)
- AGE: Age in years
- PAY_0: Repayment status in September, 2005
    - -1=pay duly
    - 1=payment delay for one month
    - 2=payment delay for two months
    ....
    - 8=payment delay for eight months
    - 9=payment delay for nine months and above
- PAY_2: Repayment status in August, 2005 (scale same as above)
- PAY_3: Repayment status in July, 2005 (scale same as above)
- PAY_4: Repayment status in June, 2005 (scale same as above)
- PAY_5: Repayment status in May, 2005 (scale same as above)
- PAY_6: Repayment status in April, 2005 (scale same as above)
- BILL_AMT1: Amount of bill statement in September,2005 (NT dollar)
- BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
- BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
- BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
- BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
- BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
- PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
- PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
- PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
- PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
- PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
- PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
- default.payment.next.month: Default payment
    - 1=yes
    - 0=no

# Importing Packages

In [3]:
# Importing Packages
import numpy as np 
import pandas as pd
import re
import json
import requests
import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
%matplotlib inline
plt.style.use("fivethirtyeight")
import pickle

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, classification_report,balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

%reload_ext autoreload
%autoreload 2
from utils import *

In [4]:
df = pd.read_excel("data/default of credit card clients.xls")
new_header = df.iloc[0]
df = df[1:] 
df.columns = new_header
df = df.rename(columns={"default payment next month": "default"}) 

# Create Dataset Splits

In [5]:
X = df.drop(["default"], axis=1)
y = df["default"]
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)
X_tr, X_tt, y_tr, y_tt = train_test_split(X_train, y_train, train_size=0.875, random_state=42)
train = pd.concat([X_tr, y_tr], axis=1)
val = pd.concat([X_val, y_val], axis=1)
tr = train.drop(["ID"], axis=1)
val = val.drop(["ID"], axis=1)

# Data Cleaning

In [6]:
url = 'https://openexchangerates.org/api/latest.json?app_id=c51b1508fb4145259b1c2fade72a2c04'
response = requests.get(url)
data = response.json()
rate = data['rates']['TWD']

In [7]:
data = [tr, val]
for d in data:
    d.rename(columns={"PAY_0": "behind1", "PAY_2": "behind2", "PAY_3": "behind3", "PAY_4": "behind4", "PAY_5": "behind5", "PAY_6": "behind6", "BILL_AMT1": "billed1", "BILL_AMT2": "billed2", "BILL_AMT3": "billed3", "BILL_AMT4": "billed4", "BILL_AMT5": "billed5", "BILL_AMT6": "billed6", "PAY_AMT1": "paid1", "PAY_AMT2": "paid2", "PAY_AMT3": "paid3", "PAY_AMT4": "paid4", "PAY_AMT5": "paid5", "PAY_AMT6": "paid6", "SEX": "gender", "EDUCATION": "education", "MARRIAGE": "marriage", "AGE": "age", "LIMIT_BAL": "limit"}, inplace=True)
    d[['limit']] = d[['limit']]/rate
    d[['billed1', 'billed2', 'billed3', 'billed4', 'billed5', 'billed6']] = d[['billed1', 'billed2', 'billed3', 'billed4', 'billed5', 'billed6']].divide(rate, axis=1).astype(int)
    d[['paid1', 'paid2', 'paid3', 'paid4', 'paid5', 'paid6']] = d[['paid1', 'paid2', 'paid3', 'paid4', 'paid5', 'paid6']].divide(rate, axis=1).astype(int)
    d['limit'] = d['limit'].apply(lambda x: round(x, 2))
    d.replace({'marriage': {0:3}}, inplace=True)
    d.replace({'education': {5:4, 0:4, 6:4}}, inplace=True)

In [8]:
tr.head()

Unnamed: 0,limit,gender,education,marriage,age,behind1,behind2,behind3,behind4,behind5,behind6,billed1,billed2,billed3,billed4,billed5,billed6,paid1,paid2,paid3,paid4,paid5,paid6,default
6191,1791.5,2,2,1,44,0,0,0,0,0,0,1633,1501,1279,801,847,982,108,179,107,107,179,33,0
16054,5732.81,2,3,1,46,-1,-1,-1,0,-1,-1,892,83,173,147,143,30,83,173,35,143,30,942,0
19706,3583.01,2,2,1,47,-1,-1,-1,-1,-1,-2,238,238,0,224,-14,-14,238,0,224,0,0,0,1
23128,6091.12,2,2,1,29,0,0,0,0,0,0,2833,2242,2268,2289,1558,1576,80,89,92,60,68,75,0
28516,5374.51,2,1,2,33,-2,-2,-2,-2,-2,-2,874,961,1171,1198,996,81,967,1172,1199,996,81,6071,0


In [9]:
tr.describe()

Unnamed: 0,limit,education,marriage,billed1,billed2,billed3,billed4,billed5,billed6,paid1,paid2,paid3,paid4,paid5,paid6
count,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0
mean,5991.325979,1.84281,1.555333,1831.031952,1761.133667,1682.417905,1546.52819,1446.264762,1396.106143,204.954286,214.768381,188.736667,176.128,172.372143,184.361381
std,4642.188925,0.746378,0.522538,2632.407699,2550.197573,2493.955364,2303.286683,2185.746166,2138.160074,627.397874,898.821276,668.116436,602.046148,559.641402,632.686513
min,358.3,1.0,1.0,-5932.0,-2500.0,-5634.0,-6091.0,-2914.0,-7490.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1791.5,1.0,1.0,127.0,108.0,98.0,84.0,63.0,46.0,35.0,29.0,13.0,10.0,8.0,4.0
50%,5016.21,2.0,2.0,804.0,767.0,719.0,682.0,648.0,613.0,75.0,71.0,64.0,53.0,53.0,53.0
75%,8599.22,2.0,2.0,2391.25,2267.5,2138.25,1947.0,1801.0,1770.0,179.0,179.0,161.0,143.0,144.0,143.0
max,35830.09,4.0,3.0,34558.0,35254.0,59624.0,31945.0,33220.0,34456.0,31299.0,60347.0,32105.0,22250.0,14976.0,18887.0


** Observations: **

- No missing data
- There were anomalous values for education and marriage, and the anomalous values were reassigned under other.
- Did not reassign -2 and -1 to 0 for 'behind' features despite being anomalous because they were so many -2 and -1.  There must be so significance to those values.

# Exploratory Data Anlaysis

In [33]:
# organize features into categorical and continuous
categorical = tr[['gender', 'marriage', 'education', 'behind1', 'behind2', 'behind3', 'behind4', 'behind5', 'behind6']]
continuous = tr[['limit', 'age', 'billed1', 'billed2', 'billed3', 'billed4', 'billed5', 'billed6', 'paid1', 'paid2', 'paid3', 'paid4', 'paid5', 'paid6']]
cat_col = categorical.columns
cont_col = continuous.columns

In [34]:
# display distributions of all the continuous variables

# con_1 = pd.melt(tr, value_vars = cont_col)
# sns.set_theme(style="darkgrid", font='serif', context='talk')
# g = sns.FacetGrid(con_1, col='variable', col_wrap=3, sharex=False, sharey=False, height=4)
# g = g.map(sns.distplot, 'value', color='r')
# g.set_xticklabels(rotation=45)
# g.fig.subplots_adjust(top=0.9)
# g.fig.suptitle("Distributions of Continuous Features")
# g.fig.tight_layout()
# plt.savefig("../images/distplot.png")

<img src="images/distplot.png">

** Observations: **

- 

In [35]:
# Use bar graphs of the distribution of data for categorical variables

# cat_1 = pd.melt(tr, value_vars=cat_col)
# sns.set_theme(style="darkgrid", font='serif', context='talk')
# g = sns.FacetGrid(cat_1, col='variable', col_wrap=3, sharex=False, sharey=False, height=4)
# g = g.map(sns.countplot, 'value', color='dodgerblue')
# g.set_xticklabels()
# g.fig.subplots_adjust(top=0.9)
# g.fig.suptitle("Distributions of Categorical Features")
# g.fig.tight_layout()
# plt.savefig("../images/countplot.png")

<img src="images/countplot.png">

In [36]:
yes = tr.default.sum()
no = len(tr)-yes
perc_y = round(yes/len(tr)*100, 1)
perc_n = round(no/len(tr)*100, 1)

# plt.figure(figsize=(8,6))
# sns.set_theme(style="darkgrid", font='serif', context='talk')
# sns.countplot('default', data=tr)
# plt.title('Credit Card Baseline Default', size=16)
# plt.box(False);
# plt.savefig("../images/baseline.png")

<img src="images/baseline.png">

In [37]:
print("Number of Total Non-Defaulters: ", yes)
print("Number of Defaulters: ", no)
print("Percentage of Non-Defaulters: ", perc_y)
print("Percentage of Defaulters: ", perc_n)

pd.DataFrame
default = pd.DataFrame(data = {"Training Dataset": [yes, no, perc_y, perc_n]}, 
                       index = ["Number of Total Non-Defaulters: ", "Number of Defaulters: ", "Percentage of Non-Defaulters: ", "Percentage of Defaulters: "])
default

Number of Total Non-Defaulters:  4656
Number of Defaulters:  16344
Percentage of Non-Defaulters:  22.2
Percentage of Defaulters:  77.8


Unnamed: 0,Training Dataset
Number of Total Non-Defaulters:,4656.0
Number of Defaulters:,16344.0
Percentage of Non-Defaulters:,22.2
Percentage of Defaulters:,77.8


In [38]:
# subset = tr[['gender', 'education', 'marriage', 'behind1', 'behind2', 'behind3', 'behind4', 'behind5', 'behind6', 'default']]
# f, axes = plt.subplots(3, 3, figsize=(15, 12), facecolor='white')
# sns.set_theme(style="darkgrid", font='serif', context='paper')
# f.suptitle('Frequency of Categorical Variables', size=16)
# ax1 = sns.countplot(x="gender", hue="default", data=subset, ax=axes[0,0])
# ax2 = sns.countplot(x="education", hue="default", data=subset, ax=axes[0,1])
# ax3 = sns.countplot(x="marriage", hue="default", data=subset, ax=axes[0,2])
# ax4 = sns.countplot(x="behind1", hue="default", data=subset, ax=axes[1,0])
# ax5 = sns.countplot(x="behind2", hue="default", data=subset, ax=axes[1,1])
# ax6 = sns.countplot(x="behind3", hue="default", data=subset, ax=axes[1,2])
# ax7 = sns.countplot(x="behind4", hue="default", data=subset, ax=axes[2,0])
# ax8 = sns.countplot(x="behind5", hue="default", data=subset, ax=axes[2,1])
# ax9 = sns.countplot(x="behind6", hue="default", data=subset, ax=axes[2,2])
# plt.savefig("../images/default_freq_by_cat.png")

<img src="images/default_freq_by_cat.png">

** Observations: **

- `gender`, `education`, and `marriage` doesn't seem to change with each group in terms of proportions.  Behind seems to have some correlation with default.  That would make sense since being behind in payments would make it more likely that you would default next month. 

- There isn’t a very clear distinction between the distribution of `default` on any of the demographic data. However, you do see quite some distribution differences of target classes for monthly repayment status (`behind1`-`behind6`).



In [39]:
education = tr.groupby(['education', 'default']).size().unstack(1)
education
# education.plot(kind="bar", stacked=True)
# plt.title("Distribution Count of Educational Level and Default Status", size=14)
# plt.savefig("../data/stacked_bar2.png")

default,0,1
education,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6013,1424
2,7408,2341
3,2626,866
4,297,25


<img src="images/stacked_bar2.png">

** No clear relationship with education. The proportion doesn't seem to change with each group. **

In [40]:
marriage = tr.groupby(['marriage', 'default']).size().unstack(1)
marriage
# marriage.plot(kind="bar", stacked=True)
# plt.title("Distribution of Default Status for Marital Status", size=14)
# plt.savefig("../images/stacked_bar3.png")

default,0,1
marriage,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7354,2258
2,8778,2336
3,212,62


<img src="images/stacked_bar3.png">

# Feature Engineering

- `age_bin`: 1 = young adult, 2 = middle age, 3 = senior
- `gen-mar`: interaction between gender and marriage status
- `gen-age`: interaction between age and gender
- 'avail...`: fraction of estimated available balance based on what is billed per month

In [41]:
data = [tr, val]

# create features for demographic variables
for d in data:
    d['age_bin'] = 0
    d.loc[((d['age'] > 20) & (d['age'] < 30)) , 'age_bin'] = 1
    d.loc[((d['age'] >= 30) & (d['age'] < 60)) , 'age_bin'] = 2
    d.loc[((d['age'] >= 60) & (d['age'] < 81)) , 'age_bin'] = 3
    # create categories for single, married, divorced males and females
    d['gen-mar'] = d['gender'] + d['marriage']
    # create categories for young, middle age and senior males and females
    d['gen-age'] = d['gender'] + d['age_bin']

# feature for credit use percentage: fraction of estimated available balance based on what is billed per month
# (credit limit - monthly billed amount) / credit limit
for d in data:
    d['avail6'] = (d.limit - d.billed6) / d.limit
    d['avail5'] = (d.limit - d.billed5) / d.limit
    d['avail4'] = (d.limit - d.billed4) / d.limit
    d['avail3'] = (d.limit - d.billed3) / d.limit
    d['avail2'] = (d.limit - d.billed2) / d.limit
    d['avail1'] = (d.limit - d.billed1) / d.limit
    d['avg_av'] = (d.avail1 + d.avail2 + d.avail3 + d.avail4 + d.avail5 + d.avail6) / 6

# create a feature that indicates whether a client has had a delayed payment or not
def delayed_payment(d):
    if (d.behind1 > 0) or (d.behind2 > 0) or (d.behind3 > 0) or (d.behind4 > 0) or (d.behind5 > 0) or (d.behind6 > 0):
        return 1
    else:
        return 0
for d in data:
    d['delayed'] = d.apply(delayed_payment, axis=1)

# create feature for the total number of months with delayed payment status for a particular client
def total_months_with_delayed_payments(d):
    count = 0
    if (d.behind1 > 0):
        count += 1
    if (d.behind2 > 0):
        count += 1
    if (d.behind3 > 0):
        count += 1
    if (d.behind4 > 0):
        count += 1
    if (d.behind5 > 0):
        count += 1
    if (d.behind6 > 0):
        count += 1
    return count
for d in data:
    d['latemths'] = d.apply(total_months_with_delayed_payments, axis=1)

# the ratio of amount paid and amount billed
for d in data:
    d['pperb1'] = d.paid1 / d.billed2
    d['pperb2'] = d.paid2 / d.billed3
    d['pperb3'] = d.paid3 / d.billed4
    d['pperb4'] = d.paid4 / d.billed5
    d['pperb5'] = d.paid5 / d.billed6

# remove any infinity and NaN values
datasets = ['pperb1', 'pperb2', 'pperb3', 'pperb4', 'pperb5']
for data in datasets:
    tr.replace({data: {np.inf: 0, np.nan: 0}}, inplace=True)
    val.replace({data: {np.inf: 0, np.nan: 0}}, inplace=True)

In [None]:
# plt.style.use("fivethirtyeight")
# sns.set_theme(style="darkgrid", font='serif', context='paper')
# plt.figure(figsize = (20,16))
# plt.title('Pearson Correlation of Features', y = 1.05, size = 20)
# g = sns.heatmap(tr.corr(), cmap='RdBu', square=True, linecolor='white', linewidths=0.2)
# plt.savefig("../images/correlation_matrix_2.png")


<img src="images/correlation_matrix_2.png">

** This includes my engineered features.  Default seems to be correlated with two of my engineered features, delayed and latemnths.  Delayed is whether you have had a delayed payment durig the 6 month history or not. latemnths is the total nunber of months you were given a status of behind in payments.  Seems to be correlated with behind1 and limit.**

In [None]:
pickle_in = open("../data/training_features.pickle","rb")
train2 = pickle.load(pickle_in)
pickle_in = open("../data/validate_features.pickle","rb")
validate2 = pickle.load(pickle_in)

In [None]:
X_train2 = train2.drop(["default"], axis=1)
y_tr = train2["default"]
X_validate2 = validate2.drop(["default"], axis=1)
y_val = validate2["default"]

In [None]:
# # Grab indices of columns for creating dummy variables and create dataframe with dummy variables
dum_feat = X_train2[['gender', 'education', 'marriage', 'age_bin', 'gen-mar', 'gen-age']]
dum_index = dum_feat.columns
tr_dum = pd.get_dummies(data=dum_feat, columns=dum_index, drop_first=True, prefix=['sex', 'edu', 'mar', 'agebin', 'sexmar', 'sexage'])
cont_feat = X_train2.drop(['gender', 'education', 'marriage', 'age_bin', 'gen-mar', 'gen-age'], axis=1)
X_train2_dum = cont_feat.join(tr_dum)
X_train2_dum.head()

In [None]:
dum_feat2 = X_validate2[['gender', 'education', 'marriage', 'age_bin', 'gen-mar', 'gen-age']]
dum_index2 = dum_feat2.columns
val_dum = pd.get_dummies(data=dum_feat2, columns=dum_index2, drop_first=True, prefix=['sex', 'edu', 'mar', 'agebin', 'sexmar', 'sexage'])
cont_feat2 = X_validate2.drop(['gender', 'education', 'marriage', 'age_bin', 'gen-mar', 'gen-age'], axis=1)
X_validate2_dum = cont_feat2.join(val_dum)
X_validate2_dum.head()

In [None]:
scaler = StandardScaler().fit(X_train2_dum)
X_tr2_dum = scaler.transform(X_train2_dum)
X_val2_dum = scaler.transform(X_validate2_dum)

In [None]:
scaler2 = StandardScaler().fit(X_train2)
X_tr2 = scaler2.transform(X_train2)
X_val2 = scaler2.transform(X_validate2)

In [None]:
rfc2 = RandomForestClassifier().fit(X_tr2, y_tr)
y_pred_rfc_tr2 = rfc2.predict(X_tr2)
y_pred_rfc_val2 = rfc2.predict(X_val2)
get_metrics(X_tr2, y_tr, X_val2, y_val, y_pred_rfc_tr2, y_pred_rfc_val2, rfc2)

In [None]:
dtc2 = DecisionTreeClassifier().fit(X_tr2, y_tr)
y_pred_dtc_tr2 = dtc2.predict(X_tr2)
y_pred_dtc_val2 = dtc2.predict(X_val2)
get_metrics(X_tr2, y_tr, X_val2, y_val, y_pred_dtc_tr2, y_pred_dtc_val2, dtc2)

In [None]:
abc2 = AdaBoostClassifier().fit(X_tr2, y_tr)
y_pred_abc_tr2 = abc2.predict(X_tr2)
y_pred_abc_val2 = abc2.predict(X_val2)
get_metrics(X_tr2, y_tr, X_val2, y_val, y_pred_abc_tr2, y_pred_abc_val2, abc2)

# New Baseline Model

In [None]:
pickle_in = open("../data/training_model.pickle","rb")
train3 = pickle.load(pickle_in)
pickle_in = open("../data/validate_model.pickle","rb")
validate3 = pickle.load(pickle_in)

X_train3 = train3.drop(["default"], axis=1)
y_tr = train3["default"]
X_validate3 = validate3.drop(["default"], axis=1)
y_val = validate3["default"]

scaler3 = StandardScaler().fit(X_train3)
X_tr3 = scaler3.transform(X_train3)
X_val3 = scaler3.transform(X_validate3)

In [None]:
logreg3 = LogisticRegression(solver="liblinear", random_state=42).fit(X_tr3, y_tr)
y_pred_log_tr3 = logreg3.predict(X_tr3)
y_pred_log_val3 = logreg3.predict(X_val3)
get_metrics(X_tr3, y_tr, X_val3, y_val, y_pred_log_tr3, y_pred_log_val3, logreg3)

rfc3 = RandomForestClassifier().fit(X_tr3, y_tr)
y_pred_rfc_tr3 = rfc3.predict(X_tr3)
y_pred_rfc_val3 = rfc3.predict(X_val3)
get_metrics(X_tr3, y_tr, X_val3, y_val, y_pred_rfc_tr3, y_pred_rfc_val3, rfc3)

dtc3 = DecisionTreeClassifier().fit(X_tr3, y_tr)
y_pred_dtc_tr3 = dtc3.predict(X_tr3)
y_pred_dtc_val3 = dtc3.predict(X_val3)
get_metrics(X_tr3, y_tr, X_val3, y_val, y_pred_dtc_tr3, y_pred_dtc_val3, dtc3)

abc3 = AdaBoostClassifier().fit(X_tr3, y_tr)
y_pred_abc_tr3 = abc3.predict(X_tr3)
y_pred_abc_val3 = abc3.predict(X_val3)
get_metrics(X_tr3, y_tr, X_val3, y_val, y_pred_abc_tr3, y_pred_abc_val3, abc3)

gbc3 = GradientBoostingClassifier().fit(X_tr3, y_tr)
y_pred_gbc_tr3 = gbc3.predict(X_tr3)
y_pred_gbc_val3 = gbc3.predict(X_val3)
get_metrics(X_tr3, y_tr, X_val3, y_val, y_pred_gbc_tr3, y_pred_gbc_val3, gbc3)

xgb3 = XGBClassifier().fit(X_tr3, y_tr)
y_pred_xgb_tr3 = xgb3.predict(X_tr3)
y_pred_xgb_val3 = xgb3.predict(X_val3)
get_metrics(X_tr3, y_tr, X_val3, y_val, y_pred_xgb_tr3, y_pred_xgb_val3, xgb3)

In [None]:
# logreg = LogisticRegression()
# params = {'C': [0.001, 0.01, 0.1, 1, 10], 
#           'penalty': ['none', 'l1', 'l2', 'elasticnet'],
#           'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']}
# gslog = GridSearchCV(estimator = logreg,
#                      param_grid = params,
#                      scoring = 'average_precision',
#                      cv = 10,
#                      n_jobs = -1).fit(X_tr_dum, y_tr)
# y_pred_gslog_tr = gslog.predict(X_tr_dum)
# y_pred_gslog_val = gslog.predict(X_val_dum)
# print("Best: %f using %s" % (gslog.best_score_, gslog.best_params_))
# print("")
# get_metrics(X_tr_dum, y_tr, X_val_dum, y_val, y_pred_gslog_tr, y_pred_gslog_val, gslog)

# Best: 0.532578 using {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}

logb = LogisticRegression(C=1, penalty='l1', solver='saga').fit(X_tr, y_tr)
y_pred_logb_tr = logb.predict(X_tr)
y_pred_logb_val = logb.predict(X_val)
get_metrics(X_tr, y_tr, X_val, y_val, y_pred_logb_tr, y_pred_logb_val, logb)

In [None]:
# dtc = DecisionTreeClassifier()
# params = {'criterion': ['gini', 'entropy'],
#           'max_depth': [2, 4, 6, 8, 10],
#           'min_samples_leaf': [2, 4, 6, 8, 10], 
#           'min_samples_split': [2, 4, 6, 8, 10]}
# gsdtc = GridSearchCV(estimator = dtc,
#                      param_grid = params,
#                      scoring = 'average_precision',
#                      cv = 5,
#                      n_jobs = -1).fit(X_tr, y_tr)
# y_pred_gsdtc_tr = gsdtc.predict(X_tr)
# y_pred_gsdtc_val = gsdtc.predict(X_val)
# print("Best: %f using %s" % (gsdtc.best_score_, gsdtc.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gsdtc_tr, y_pred_gsdtc_val, gsdtc)

# # Best: 0.511668 using {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 10, 'min_samples_split': 8}

dtcb = DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_leaf=10, min_samples_split=8).fit(X_tr, y_tr)
y_pred_dtcb_tr = dtcb.predict(X_tr)
y_pred_dtcb_val = dtcb.predict(X_val)
get_metrics(X_tr, y_tr, X_val, y_val, y_pred_dtcb_tr, y_pred_dtcb_val, dtcb)

In [None]:
# rfc = RandomForestClassifier()
# params = {'n_estimators': [100, 200, 400, 600, 1000],
#           'criterion': ['entropy', 'gini'],
#           'max_depth': [5, 8, 15, 25, 30],
#           'min_samples_split': [2, 5, 10, 15, 100],
#           'min_samples_leaf': [1, 2, 5, 10]}
# gsrfc = GridSearchCV(estimator = rfc,
#                      param_grid = params,
#                      scoring = 'average_precision',
#                      cv = 5,
#                      n_jobs = -1).fit(X_tr, y_tr)
# y_pred_gsrfc_tr = gsrfc.predict(X_tr)
# y_pred_gsrfc_val = gsrfc.predict(X_val)
# print("Best: %f using %s" % (gsrfc.best_score_, gsrfc.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gsrfc_tr, y_pred_gsrfc_val, gsrfc)

# # Best: 0.558041 using {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}

rfcb = RandomForestClassifier(criterion='entropy', max_depth=8, min_samples_leaf=2, min_samples_split=5, n_estimators=1000).fit(X_tr, y_tr)
y_pred_rfcb_tr = rfcb.predict(X_tr)
y_pred_rfcb_val = rfcb.predict(X_val)
get_metrics(X_tr, y_tr, X_val, y_val, y_pred_rfcb_tr, y_pred_rfcb_val, rfcb)

In [None]:
# abc = AdaBoostClassifier()
# params = {'n_estimators': [10, 50, 100, 200],
#           'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5]}
# gsabc = GridSearchCV(estimator = abc,
#                      param_grid = params,
#                      n_jobs = -1,
#                      cv = 5,
#                      scoring = 'average_precision').fit(X_tr, y_tr)
# y_pred_gsabc_tr = gsabc.predict(X_tr)
# y_pred_gsabc_val = gsabc.predict(X_val)
# print("Best: %f using %s" % (gsabc.best_score_, gsabc.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gsabc_tr, y_pred_gsabc_val, gsabc)

# Best: 0.542080 using {'learning_rate': 0.2, 'n_estimators': 200}

abcb = AdaBoostClassifier(learning_rate=0.2, n_estimators=200).fit(X_tr, y_tr)
y_pred_abcb_tr = abcb.predict(X_tr)
y_pred_abcb_val = abcb.predict(X_val)
get_metrics(X_tr, y_tr, X_val, y_val, y_pred_abcb_tr, y_pred_abcb_val, abcb)

In [None]:
# gbc = GradientBoostingClassifier()
# params = {'n_estimators': [10, 100, 1000],
#           'learning_rate': [0.001, 0.01, 0.1],
#           'max_depth': [3, 7, 9]}
# gsgbc = GridSearchCV(estimator = gbc,
#                      param_grid = params, 
#                      n_jobs = -1, 
#                      cv = 5, 
#                      scoring = 'average_precision').fit(X_tr, y_tr)
# y_pred_gsgbc_tr = gsgbc.predict(X_tr)
# y_pred_gsgbc_val = gsgbc.predict(X_val)
# print("Best: %f using %s" % (gsgbc.best_score_, gsgbc.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gsgbc_tr, y_pred_gsgbc_tr, gsgbc)

# # Best: 0.554906 using {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}

gbcb = GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=1000).fit(X_tr, y_tr)
y_pred_gbcb_tr = gbcb.predict(X_tr)
y_pred_gbcb_val = gbcb.predict(X_val)
get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gbcb_tr, y_pred_gbcb_val, gbcb)

In [None]:
# xgb = XGBClassifier()
# params = {'n_estimators': [50, 100, 150, 200], 
#           'max_depth': [3, 5, 7, 10], 
#           'min_child_weight': [2, 3, 4, 5]}
# gsxgb = GridSearchCV(estimator = xgb,
#                      param_grid = params,
#                      scoring = 'average_precision',
#                      cv = 5,
#                      n_jobs = -1).fit(X_tr, y_tr)
# y_pred_gsxgb_tr = gsxgb.predict(X_tr)
# y_pred_gsxgb_val = gsxgb.predict(X_val)
# print("Best: %f using %s" % (gsxgb.best_score_, gsxgb.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gsxgb_tr, y_pred_gsxgb_val, gsxgb)

# Best: 0.550954 using {'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 50}

xgbb = XGBClassifier(max_depth=3, min_child_weight=3, n_estimators=50).fit(X_tr, y_tr)
y_pred_xgbb_tr = xgbb.predict(X_tr)
y_pred_xgbb_val = xgbb.predict(X_val)
get_metrics(X_tr, y_tr, X_val, y_val, y_pred_xgbb_tr, y_pred_xgbb_val, xgbb)

In [None]:
data = {'Accuracy': [accuracy(y_val, y_pred_logb_val), 
                     accuracy(y_val, y_pred_dtcb_val), 
                     accuracy(y_val, y_pred_rfcb_val), 
                     accuracy(y_val, y_pred_abcb_val), 
                     accuracy(y_val, y_pred_gbcb_val),
                     accuracy(y_val, y_pred_xgbb_val)],
        'F1 Score': [f1(y_val, y_pred_logb_val), 
                     f1(y_val, y_pred_dtcb_val), 
                     f1(y_val, y_pred_rfcb_val), 
                     f1(y_val, y_pred_abcb_val), 
                     f1(y_val, y_pred_gbcb_val),
                     f1(y_val, y_pred_xgbb_val)],
        'ROC AUC': [auc(X_val, y_val, logb),
                    auc(X_val, y_val, dtcb),
                    auc(X_val, y_val, rfcb),
                    auc(X_val, y_val, abcb),
                    auc(X_val, y_val, gbcb),
                    auc(X_val, y_val, xgbb)],
        'Recall': [recall(y_val, y_pred_logb_val), 
                   recall(y_val, y_pred_dtcb_val), 
                   recall(y_val, y_pred_rfcb_val), 
                   recall(y_val, y_pred_abcb_val),
                   recall(y_val, y_pred_gbcb_val),
                   recall(y_val, y_pred_xgbb_val)],
        'Precision': [precision(y_val, y_pred_logb_val), 
                      precision(y_val, y_pred_dtcb_val), 
                      precision(y_val, y_pred_rfcb_val), 
                      precision(y_val, y_pred_abcb_val),
                      precision(y_val, y_pred_gbcb_val),
                      precision(y_val, y_pred_xgbb_val)],
        'PR AUC': [aps(X_val, y_val, logb),
                   aps(X_val, y_val, dtcb),
                   aps(X_val, y_val, rfcb),
                   aps(X_val, y_val, abcb),
                   aps(X_val, y_val, gbcb),
                   aps(X_val, y_val, xgbb)]}
scores3 = pd.DataFrame(data=data, index = ['Logistic with GridSearchCV', 
                                          'Random Forest with GridSearchCV', 
                                          'Decision Tree with GridSearchCV', 
                                          'AdaBoost with GridSearchCV', 
                                          'Gradient Boosting with GridSearchCV',
                                          'XGBoost with GridSearchCV'])