In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, RocCurveDisplay
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
credit = pd.read_csv('../data/train_cleaned.csv')
credit.head()

Unnamed: 0,ID,is_delinquent,length_of_credit,number_of_delinquent_months,average_delinquency_rate,3mo_delinquency,6mo_delinquency,12mo_delinquency,FLAG_OWN_CAR,FLAG_OWN_REALTY,...,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED
0,5008804,0,16,2,0.125,0,0,0,Y,Y,...,Civil marriage,Rented apartment,1,1,0,0,missing,2.0,32,12
1,5008805,0,15,2,0.133333,0,0,0,Y,Y,...,Civil marriage,Rented apartment,1,1,0,0,missing,2.0,32,12
2,5008806,0,30,7,0.233333,0,0,1,Y,Y,...,Married,House / apartment,1,0,0,0,Security staff,2.0,58,3
3,5008808,1,5,2,0.4,1,1,1,N,Y,...,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52,8
4,5008809,0,5,0,0.0,0,0,0,N,Y,...,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52,8


In [5]:
credit.columns

Index(['ID', 'is_delinquent', 'length_of_credit',
       'number_of_delinquent_months', 'average_delinquency_rate',
       '3mo_delinquency', '6mo_delinquency', '12mo_delinquency',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'AGE',
       'YEARS_EMPLOYED'],
      dtype='object')

In [11]:
credit.columns = credit.columns.str.lower()
credit.columns

Index(['id', 'is_delinquent', 'length_of_credit',
       'number_of_delinquent_months', 'average_delinquency_rate',
       '3mo_delinquency', '6mo_delinquency', '12mo_delinquency',
       'flag_own_car', 'flag_own_realty', 'cnt_children', 'amt_income_total',
       'name_income_type', 'name_education_type', 'name_family_status',
       'name_housing_type', 'flag_mobil', 'flag_work_phone', 'flag_phone',
       'flag_email', 'occupation_type', 'cnt_fam_members', 'age',
       'years_employed'],
      dtype='object')

In [14]:
X = credit.drop(columns=[
    'id', 'is_delinquent', 'number_of_delinquent_months', 'average_delinquency_rate', 
    '3mo_delinquency', '6mo_delinquency', '12mo_delinquency'])

y = credit['is_delinquent']

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
X_train.dtypes

length_of_credit         int64
flag_own_car            object
flag_own_realty         object
cnt_children             int64
amt_income_total       float64
name_income_type        object
name_education_type     object
name_family_status      object
name_housing_type       object
flag_mobil               int64
flag_work_phone          int64
flag_phone               int64
flag_email               int64
occupation_type         object
cnt_fam_members        float64
age                      int64
years_employed           int64
dtype: object

In [20]:
cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

In [25]:
X = credit.drop(columns=[
    'id', 'is_delinquent', 'number_of_delinquent_months', 'average_delinquency_rate', 
    '3mo_delinquency', '6mo_delinquency', '12mo_delinquency'])

y = credit['is_delinquent']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

sc = StandardScaler()

oh = OneHotEncoder(
    categories='auto', 
    drop='first', 
    sparse_output=False, 
    dtype='int',
    handle_unknown = 'ignore'
)

ct = ColumnTransformer(
    transformers=[
        ('oh', oh, cat_cols),
        ('sc', sc, num_cols)
    ]
)

X_train_ct = ct.fit_transform(X_train)
X_val_ct = ct.transform(X_val)

In [27]:
gb = GradientBoostingClassifier()
gb.fit(X_train_ct, y_train)

print(gb.score(X_train_ct, y_train))
print(gb.score(X_val_ct, y_val))

0.778815196394076
0.7633612363168062


In [52]:
pgrids = {
    'learning_rate': [.15, .175, 0.2],
    'n_estimators': [285, 290, 295],
    'max_depth': [8, 9, 10]
}

In [53]:
%%time
gb = GradientBoostingClassifier()
gs = GridSearchCV(gb, param_grid=pgrids, cv=5)
gs.fit(X_train_ct, y_train)

CPU times: user 5min 26s, sys: 1.68 s, total: 5min 28s
Wall time: 5min 33s


In [54]:
print(gs.score(X_train_ct, y_train))
print(gs.score(X_val_ct, y_val))

0.9760141661300709
0.7908886027044431


In [102]:
gs.best_params_

{'n_estimators': 290}

In [None]:
%%time
tree = DecisionTreeClassifier(random_state=123)#, max_depth=1
ada = AdaBoostClassifier(estimator=tree, random_state=42)

pgrids_ada = {
    'learning_rate': [2.25, 2.5, 2.7],
    'n_estimators': [250, 300, 350],
    'estimator__max_depth': [71, 81, 91, 101, None],
    # 'estimator__min_samples_leaf': [7, 8, 9],
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}


gs_ada = GridSearchCV(ada, param_grid=pgrids_ada, cv=5)
gs_ada.fit(X_train_ct, y_train)

print(gs_ada.score(X_train_ct, y_train))
print(gs_ada.score(X_val_ct, y_val))

In [None]:
gs_ada.best_params_