In [11]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as imb_pipeline
from category_encoders.target_encoder import TargetEncoder

import lightgbm as lgb
import xgboost as xgb
import optuna
from optuna.integration import LightGBMPruningCallback

import warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None

%matplotlib inline

# Read Data

In [2]:
# Load the training frame from a local pickle (presumably the feature-engineered set, per the file name).
# NOTE(review): unpickling can execute arbitrary code — only read pickles from a trusted source.
train_data = pd.read_pickle('./data/train_file_eng.pkl')

In [3]:
# Display the first row as a quick sanity check of the loaded columns and values.
train_data.head(1)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y,y_encoded,quarter
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,0,nonexistent,no,0,q4


In [4]:
# List the column names available in the loaded frame.
train_data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'previous',
       'poutcome', 'y', 'y_encoded', 'quarter'],
      dtype='object')

In [5]:
# Columns treated as numeric by the preprocessing steps.
num_features = ['age', 'duration', 'campaign', 'previous']

# Every model input, in the column order used when slicing train_data.
feature_set = [
    'age', 'job', 'marital', 'education', 'default',
    'housing', 'loan', 'contact', 'month', 'day_of_week',
    'duration', 'campaign', 'previous', 'poutcome', 'quarter',
]

# Categorical inputs: every feature that is not numeric, kept in feature_set order.
cat_features = [name for name in feature_set if name not in num_features]

In [6]:
# Convert every categorical feature to pandas' category dtype with a single astype call.
train_data = train_data.astype({col_name: 'category' for col_name in cat_features})

# Train Test Split

In [7]:
# Hold out 20% of the rows as a test partition; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on the target — consider `stratify=` if class balance matters here.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[feature_set],
    train_data['y_encoded'],
    test_size=0.2,
    random_state=24,
)

In [8]:
# Give all four partitions a fresh 0..n-1 index (old row labels are discarded),
# so that positional and label-based lookups on them agree afterwards.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test)
)

In [9]:
# Confirm the (rows, columns) size of the training and held-out partitions.
X_train.shape, X_test.shape

((26328, 15), (6582, 15))

# Hyperparameter Tuning with Manual Grid Search (Stratified 5-Fold CV)

## Logistic Regression

In [15]:
# Manual grid search for logistic regression.
# Each (penalty, C) pair is scored with 5-fold stratified CV on the training split:
# the training part of every fold is class-balanced with RandomOverSampler, a
# scaling + one-hot pipeline is fitted on that balanced data, and the ROC AUC on the
# untouched validation part is averaged over the folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
# The seeded splitter yields the same partitions on every call, so build them once
# instead of re-splitting for each grid point.
cv_folds = list(cv.split(X_train, y_train))

records = []
for pen in ['l1', 'l2', 'elasticnet', 'none']:
    # NOTE(review): newer scikit-learn releases expect None instead of the string 'none' —
    # confirm against the installed version. C has no effect without a penalty, so the
    # four 'none' rows are expected to score the same.
    for c in [1, 2, 5, 10]:

        scores = []
        for train_idx, test_idx in cv_folds:
            # Positional selection on both X and y, so this does not rely on the index being 0..n-1
            X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
            y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

            # Over Sampling (training part only, so the validation part keeps its class ratio)
            sampler = RandomOverSampler(sampling_strategy='auto', random_state=24)
            X_train_fold2, y_train_fold2 = sampler.fit_resample(X_train_fold, y_train_fold)

            # Pipeline
            # handle_unknown='ignore' keeps scoring from failing when a validation fold
            # contains a category that is missing from its training fold.
            cat_transformer = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

            num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

            preprocessor = ColumnTransformer(
                transformers=[
                    ('num_trans', num_transformer, num_features),
                    ('cat_trans', cat_transformer, cat_features),
                ],
                remainder='drop',
            )

            # NOTE(review): warnings are silenced at the top of the notebook, so a saga
            # ConvergenceWarning (default max_iter) would go unnoticed — verify convergence.
            model = LogisticRegression(penalty=pen, C=c, solver='saga', random_state=24)
            if pen == 'elasticnet':
                # Elastic net needs an explicit L1/L2 mixing ratio
                model.set_params(l1_ratio=0.5)

            clf = Pipeline(steps=[("preprocessor", preprocessor),
                                  ("classifier", model)])

            # Fit Pipeline
            clf.fit(X_train_fold2, y_train_fold2)

            # Predict: keep the probability of the positive class (second column)
            y_preds_fold = clf.predict_proba(X_test_fold)[:, 1]
            fold_score = roc_auc_score(y_test_fold, y_preds_fold)
            scores.append(fold_score)

        records.append((pen, c, np.mean(scores)))

# Build the results table once rather than growing it row by row
results_df = pd.DataFrame(records, columns=['penalty', 'C', 'score'])

In [25]:
# Show the five (penalty, C) combinations with the highest mean cross-validated ROC AUC.
results_df.sort_values('score', ascending=False).head()

Unnamed: 0,penalty,C,score
0,l1,1,0.91494
1,l1,2,0.914933
8,elasticnet,1,0.914932
9,elasticnet,2,0.914927
4,l2,1,0.914926


## KNN

In [26]:
# Manual grid search for k-nearest neighbours.
# Each (n_neighbors, weights) pair is scored with 5-fold stratified CV on the training
# split: the training part of every fold is class-balanced with RandomOverSampler, a
# min-max scaling + one-hot pipeline is fitted on that balanced data, and the ROC AUC
# on the untouched validation part is averaged over the folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
# The seeded splitter yields the same partitions on every call, so build them once
# instead of re-splitting for each grid point.
cv_folds = list(cv.split(X_train, y_train))

records = []
for n in [5, 10, 15, 25, 50, 100]:
    for w in ['uniform', 'distance']:

        scores = []
        for train_idx, test_idx in cv_folds:
            # Positional selection on both X and y, so this does not rely on the index being 0..n-1
            X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
            y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

            # Over Sampling (training part only, so the validation part keeps its class ratio)
            sampler = RandomOverSampler(sampling_strategy='auto', random_state=24)
            X_train_fold2, y_train_fold2 = sampler.fit_resample(X_train_fold, y_train_fold)

            # Pipeline
            # handle_unknown='ignore' keeps scoring from failing when a validation fold
            # contains a category that is missing from its training fold.
            cat_transformer = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

            num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

            preprocessor = ColumnTransformer(
                transformers=[
                    ('num_trans', num_transformer, num_features),
                    ('cat_trans', cat_transformer, cat_features),
                ],
                remainder='drop',
            )

            model = KNeighborsClassifier(n_neighbors=n, weights=w)

            clf = Pipeline(steps=[("preprocessor", preprocessor),
                                  ("classifier", model)])

            # Fit Pipeline
            clf.fit(X_train_fold2, y_train_fold2)

            # Predict: keep the probability of the positive class (second column)
            y_preds_fold = clf.predict_proba(X_test_fold)[:, 1]
            fold_score = roc_auc_score(y_test_fold, y_preds_fold)
            scores.append(fold_score)

        records.append((n, w, np.mean(scores)))

# Build the results table once rather than growing it row by row
results_df2 = pd.DataFrame(records, columns=['n', 'w', 'score'])

In [27]:
# Show the five (n_neighbors, weights) combinations with the highest mean cross-validated ROC AUC.
results_df2.sort_values('score', ascending=False).head()

Unnamed: 0,n,w,score
10,100,uniform,0.833726
8,50,uniform,0.821971
11,100,distance,0.817164
6,25,uniform,0.801594
9,50,distance,0.800958


## Random Forest Classifier

In [28]:
# Manual grid search for the random forest.
# Each (n_estimators, max_depth) pair is scored with 5-fold stratified CV on the
# training split: the training part of every fold is class-balanced with
# RandomOverSampler, categorical columns are target-encoded (numeric columns pass
# through unchanged), and the ROC AUC on the untouched validation part is averaged
# over the folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
# The seeded splitter yields the same partitions on every call, so build them once
# instead of re-splitting for each grid point.
cv_folds = list(cv.split(X_train, y_train))

records = []
for num_tree_item in [100, 200, 300, 400, 500, 750]:
    for max_depth_item in [2, 4, 6, 10]:

        scores = []
        for train_idx, test_idx in cv_folds:
            # Positional selection on both X and y, so this does not rely on the index being 0..n-1
            X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
            y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

            # Over Sampling (training part only, so the validation part keeps its class ratio)
            sampler = RandomOverSampler(sampling_strategy='auto', random_state=24)
            X_train_fold2, y_train_fold2 = sampler.fit_resample(X_train_fold, y_train_fold)

            # Pipeline
            cat_transformer = Pipeline(steps=[('encoder', TargetEncoder(min_samples_leaf=100, smoothing=5))])

            # Numeric columns are forwarded untouched; 'passthrough' replaces the identity transformer
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num_trans', 'passthrough', num_features),
                    ('cat_trans', cat_transformer, cat_features),
                ],
                remainder='drop',
            )

            model = RandomForestClassifier(n_estimators=num_tree_item, max_depth=max_depth_item, random_state=24)

            clf = Pipeline(steps=[("preprocessor", preprocessor),
                                  ("classifier", model)])

            # Fit Pipeline
            clf.fit(X_train_fold2, y_train_fold2)

            # Predict: keep the probability of the positive class (second column)
            y_preds_fold = clf.predict_proba(X_test_fold)[:, 1]
            fold_score = roc_auc_score(y_test_fold, y_preds_fold)
            scores.append(fold_score)

        records.append((num_tree_item, max_depth_item, np.mean(scores)))

# Build the table in one go: appending all-numeric rows one at a time coerced the
# integer hyperparameters to floats (e.g. 750.0), whereas this keeps them as integers.
results_df3 = pd.DataFrame(records, columns=['num_tree', 'max_depth', 'score'])

In [29]:
# Show the five (num_tree, max_depth) combinations with the highest mean cross-validated ROC AUC.
results_df3.sort_values('score', ascending=False).head()

Unnamed: 0,num_tree,max_depth,score
23,750.0,10.0,0.927118
19,500.0,10.0,0.927079
15,400.0,10.0,0.927039
11,300.0,10.0,0.926967
7,200.0,10.0,0.926908
