Author: Daniel Alvarez

<alvarez.da@gmail.com>

### Installations

In [25]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.3.3-py3-none-macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.0 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.3.3


In [39]:
!pip install lightgbm



In [29]:
!pip install boruta

Collecting boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 1.8 MB/s eta 0:00:011
Installing collected packages: boruta
Successfully installed boruta-0.3


In [2]:
## Import packages.

# General libraries.
import os, platform, sys
import random
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
#from pandas.plotting import autocorrelation_plot

#Standardize variables
from sklearn.preprocessing import StandardScaler

# Import joblib for data persistance
import joblib

# import parquet for persistence
import pyarrow.parquet as pq

# SK-learn libraries for learning
from sklearn.model_selection import train_test_split

# sklearn libraries
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.ensemble import GradientBoostingRegressor as GB
from sklearn.ensemble import ExtraTreesRegressor as ET
from sklearn.ensemble import AdaBoostRegressor as AB
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance

#import lightgbm
#from lightgbm import LGBMClassifier
#import xgboost as xgb
#from xgboost.sklearn import XGBClassifier
from sklearn.impute import SimpleImputer, KNNImputer
#from sklearn.impute import IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, average_precision_score, confusion_matrix, plot_confusion_matrix, classification_report, roc_curve, auc, f1_score, make_scorer, roc_auc_score
from sklearn.decomposition import PCA

from scipy.stats import randint, uniform
from scipy.interpolate import interp1d

# apply Boruta method for dimensionality reduction
from boruta import BorutaPy

# Set display of images in the notebook
%matplotlib notebook

In [22]:
# check versions
print(os.name)
print(f'Platform system: {platform.system()}. Release version: {platform.release()}')
print(sys.version)

print(f'Numpy version: {np.__version__}')
print(f'Pandas version: {pd.__version__}')
print(f'Seaborn version: {sns.__version__}')
print(f'Joblib version: {joblib.__version__}')

posix
Platform system: Darwin. Release version: 18.6.0
3.6.10 |Anaconda, Inc.| (default, Mar 23 2020, 17:45:12) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Numpy version: 1.14.2
Pandas version: 0.25.3
Seaborn version: 0.10.0
Joblib version: 0.14.1


In [10]:
# Set random seeds for this project.
# Python's `random` module and NumPy keep separate global generators, so both are
# seeded; seeding only `random` leaves NumPy-based randomness unreproducible.
random_seed = 224
random.seed(random_seed)
np.random.seed(random_seed)

### Ingestion

In [None]:
# Read the raw data from a comma-separated file with pandas' python parsing engine.
# No dtype or date-parsing options are passed here, so column types are inferred by pandas.
# NOTE(review): the path "  .csv" looks like a placeholder — set the real file path before running.
df = pd.read_csv("  .csv", sep=',', engine='python')

In [None]:
# Normalize column names: replace every run of whitespace with "_" and lower-case the result.
# The pattern is a raw string (a plain '\s' is an invalid escape sequence) and regex=True is
# passed explicitly so the pattern is never interpreted as a literal string.
df.columns = df.columns.str.replace(r'\s+', '_', regex=True).str.lower()

### Exploratory Data Analysis

In [None]:
# Inspect the dataframe: dimensions, then column dtypes and non-null counts
print(df.shape)
# info() writes its report itself and returns None, so wrapping it in print() only adds a stray "None"
df.info()

In [None]:
# first 5 rows
df.head(5)

In [None]:
# last 5 rows
df.tail(5)

#### Data quality checks

In [None]:
### analyze null values
def nullvalues(d):
    '''Print a table of null values per column.

    For every column of `d` one row is printed with the column name, the number of
    null entries and that number as a percentage of the total row count.
    Nothing is returned.
    '''
    header_template = "{:60s}|{:18s}|{:10s}"
    row_template = "{:60s}|{:14d}\t|{:6f}"

    print(header_template.format("Feature","Null values","Null Values as a Percent of Total"))
    print("="*100)

    n_rows = len(d)
    for col in d.columns:
        # count the nulls once and derive the percentage from that count
        missing = d[col].isnull().sum(axis=0)
        missing_pct = missing/n_rows*100
        print(row_template.format(str(col), missing, missing_pct))

In [None]:
nullvalues(d=df)

In [None]:
### analyze cardinality
def cardinality(data):
    '''Print the number of distinct non-null values per column.

    For every column of `data` one row is printed with the column name, the count of
    distinct values (null values are not counted) and that count as a percentage of
    the total number of rows. Nothing is returned.
    '''
    print("{:35s}\t| {:10s}\t| {:10s}".format("Feature","Distinct Values","Distinct Values as a Percent of Total"))
    print("="*100)

    total_rows = len(data)
    for col in data.columns:
        # nunique() ignores nulls directly; forward-filling before np.unique still
        # counted a leading null and failed on object columns mixing text and NaN.
        unique_values = int(data[col].nunique(dropna=True))
        # Scale to a true percentage (as the header says) and guard the empty frame.
        unique_values_pct = unique_values/total_rows*100 if total_rows else 0.0
        print("{:35s}\t| {:10d}\t\t| {:6f}".format(str(col),unique_values,unique_values_pct))

In [None]:
cardinality(data=df)

In [None]:
### analyze duplicates
def rowduplication(data):
    '''Report how many fully duplicated rows `data` contains.

    The first occurrence of each row is kept; every later identical row counts as a
    duplicate. Prints the shape of the de-duplicated frame and the duplicate count
    computed two ways (size difference and number of flagged rows).
    '''
    # True for every row that repeats an earlier row across all columns
    repeated_mask = data.duplicated(subset=None, keep='first')
    unique_rows = data.loc[~repeated_mask]
    repeated_rows = data.loc[repeated_mask]

    print('Shape of de-duplicated dataset', unique_rows.shape)
    print('Number of duplicates:', len(data) - len(unique_rows))
    print('Confirm number of duplicates:', len(repeated_rows))

In [None]:
rowduplication(data=df)

In [None]:
def duplicationanalyzer(data):
    '''Print, for each column of `data`, how many rows hold a repeated value.

    A row counts as a duplicate for a column when its value occurs more than once in
    that column (every occurrence is counted, including the first). For each column
    the name, the number of such rows and that number as a percentage of all rows
    (two decimals) are printed. Nothing is returned.
    '''
    n_rows = len(data)

    for var in data:
        # keep=False flags every member of a repeated group, not only the later ones
        n_duplicates = int(data[var].duplicated(keep=False).sum())
        print(var)
        print('Number of duplicates: ', n_duplicates)

        # scaled by 100 so the figure matches its "Percentage" label; an empty frame reports 0
        share = n_duplicates/n_rows*100 if n_rows else 0.0
        percentage = "{0:.2f}".format(share)
        print('Percentage of duplicates: ', percentage)

In [None]:
duplicationanalyzer(data=df)

In [None]:
# define functions for summary statistics on categorical and numeric variables

def catvardistribution(data, var, title):
    '''Print the value counts of column `var` (nulls included) and show a count plot titled `title`.'''
    counts = data[var].value_counts(dropna=False)
    print(counts)

    sns.set(style='darkgrid')
    axis = sns.countplot(x=data[var], data=data)
    # rotate the category labels so long names do not overlap
    tick_labels = axis.get_xticklabels()
    axis.set_xticklabels(tick_labels, rotation=90)
    axis.set_title(title)
    plt.show()

def summarystats(data, var, titleplot, n_bins):
    '''Print summary statistics for column `var` and show its histogram and boxplot.

    data: dataframe holding the column
    var: name of the numeric column to summarize
    titleplot: title used for both figures
    n_bins: number of histogram bins
    '''
    print(data[var].unique())
    print(data[var].describe())

    # Histogram on its own figure; nulls are dropped because the bin range cannot be
    # derived from NaN values.
    _, ax_hist = plt.subplots()
    ax_hist.hist(data[var].dropna(), bins=n_bins)
    ax_hist.set_title(titleplot, loc='center', pad=None)
    plt.show()

    # Boxplot on a second, explicit figure so it is never drawn over the histogram
    # when the previous figure is still the active one.
    sns.set(style='darkgrid')
    _, ax_box = plt.subplots()
    sns.boxplot(x=data[var], ax=ax_box)
    ax_box.set_title(titleplot)
    plt.show()

def sidebysideboxplots(data, xvar, yvar, xtitle, ytitle):
    '''Draw one boxplot of `yvar` per category of `xvar`, labelling the axes `xtitle` and `ytitle`.'''
    axis = sns.boxplot(data=data, x=xvar, y=yvar)
    axis.set(xlabel=xtitle, ylabel=ytitle)

def corrmap(data,figx, figy):
    '''Show an annotated heatmap of the pairwise correlations of `data`.

    figx, figy: width and height of the figure
    '''
    # correlations rounded to three decimals for readable annotations
    corr_matrix = data.corr().round(3)
    labels = corr_matrix.columns

    plt.subplots(figsize=(figx, figy))
    sns.heatmap(corr_matrix, xticklabels=labels, yticklabels=labels, annot=True)

    # Move the bottom limit by +0.5 and the top limit by -0.5.
    # NOTE(review): presumably compensates for heatmap rows being cut in half under
    # some matplotlib versions — confirm it is still needed.
    bottom, top = plt.ylim()
    plt.ylim(bottom + 0.5, top - 0.5)
    plt.show()
    
def binary_means(data, feature_group, y_col):
    '''Print the mean of each feature in a feature group, split by the values of `y_col`.

    data: dataframe holding the features and the `y_col` column
    feature_group: key into the module-level `feature_dict`, which maps group names
        to lists of column names
    y_col: column whose values define the groups
    '''
    cols = feature_dict[feature_group]
    for col in cols:
        print(col)
        # group the frame that was passed in, not the notebook-level `df`
        print(data.groupby(y_col)[col].mean())
        print('')

In [None]:
# save dictionary of feature lists (group name -> list of column names; read by binary_means)
# NOTE(review): `xxx` and `ids` are not defined anywhere in the visible notebook — they look like
# placeholders to be replaced with real lists of column names before this cell is run.
feature_dict = {'xxx':xxx, 'ids':ids}

### Feature Engineering

In [None]:
# save engineered features into a list
feat_eng = ['xxx','xxy']
feature_dict['feat_eng'] = feat_eng

#### Evaluate features

In [None]:
binary_means(df,'feat_eng','label')

### Modeling and Evaluation

#### Evaluation metrics functions

In [1]:
# percentage of null values among the model features (ten worst columns)
def check_null_percent(data_for_modeling, X_cols):
    '''Return the ten `X_cols` columns with the most nulls, as a percentage of all rows.

    The result is a Series indexed by column name, sorted from most to fewest nulls.
    '''
    null_counts = data_for_modeling[X_cols].isnull().sum()
    worst_ten = null_counts.sort_values(ascending=False).head(10)
    n_rows = data_for_modeling.shape[0]
    return (worst_ten/n_rows)*100

# Prediction errors
def prediction(actual,pred):
    '''Print the MAE, MSE and RMSE of `pred` against `actual`.

    actual: true target values
    pred: predicted values, aligned with `actual`
    '''
    mae = mean_absolute_error(actual,pred)
    mse = mean_squared_error(actual,pred)
    print("Mean Absolute Error: %s" %mae)
    print("Mean Squared Error: %s" %mse)
    # np.sqrt is used because a bare `sqrt` is never imported in this notebook
    print("Root Mean Squared Error: %s" %np.sqrt(mse))

NameError: name 'mean_absolute_error' is not defined

In [None]:
# Apply randomized search cross-validation for a given estimator
def model_pipeline(imputer, scaler, estimator, params, scoring_metric, X_train, y_train, X_test, y_test, multi=False,
                   n_iter=100, cv=5, random_state=None):
    '''Tune an imputer -> scaler -> estimator pipeline with RandomizedSearchCV and report its errors.

    imputer, scaler, estimator: the three pipeline steps, named 'imputer', 'scaler' and
        'estimator' (so search parameters are written as 'estimator__<param>')
    params: parameter distributions handed to RandomizedSearchCV
    scoring_metric: scoring used to rank the sampled candidates
    X_train, y_train: data the search is fitted on
    X_test, y_test: held-out data used only for the printed error report
    multi: accepted for backward compatibility; not used by this function
    n_iter: number of parameter settings sampled (default 100)
    cv: number of cross-validation folds (default 5)
    random_state: seed for the parameter sampling; None leaves it unseeded

    Returns the fitted RandomizedSearchCV object (refit on the full training data).
    '''
    pipeline = Pipeline([('imputer', imputer),
                         ('scaler', scaler),
                         ('estimator', estimator)])
    print(pipeline)
    search = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=n_iter,
                                scoring=scoring_metric, cv=cv, refit=True, verbose=1, n_jobs=-1,
                                random_state=random_state)
    search.fit(X_train, y_train)

    # Both arguments are passed by keyword: a positional argument after a keyword
    # argument is a syntax error.
    print('training set')
    prediction(actual=y_train, pred=search.predict(X_train))

    print('test set')
    prediction(actual=y_test, pred=search.predict(X_test))

    print('best parameters')
    print(pd.Series(search.best_params_))

    return(search)

In [38]:
def feat_importance(search, X_cols, model_type):
    '''Return the features of the best fitted estimator ranked by importance.

    search: fitted search object whose `best_estimator_` exposes an 'estimator' step
    X_cols: feature names, in the order the estimator was trained on
    model_type: 'linear' ranks by absolute coefficient (columns 'coef', 'abs_coef');
        'tree' ranks by `feature_importances_` (column 'importance')

    Returns a DataFrame indexed by feature name, most important first. Any other
    `model_type` yields an empty DataFrame.
    '''
    if model_type not in ('linear', 'tree'):
        return pd.DataFrame()

    fitted = search.best_estimator_['estimator']

    if model_type == 'linear':
        # Regressors fitted on a 1-D target store coef_ as a flat vector, while
        # classifiers store one row per class; atleast_2d makes "first row" valid for both.
        coefs = np.atleast_2d(fitted.coef_)[0]
        ranking = pd.DataFrame([coefs], columns=X_cols).T
        ranking.columns = ['coef']
        ranking['abs_coef'] = np.abs(ranking['coef'])
        return ranking.sort_values('abs_coef', ascending=False)

    ranking = pd.DataFrame([fitted.feature_importances_], columns=X_cols).T
    ranking.columns = ['importance']
    return ranking.sort_values('importance', ascending=False)

#### Split train and validation sets

In [None]:
# name of the target column
y_col = 'label'
# keep only the rows whose target is present
data_for_modeling = df[df[y_col].notnull()]

In [None]:
# temporal train and test
# df_train = data_for_modeling[data_for_modeling['date']<=datetime.date(yyyy,mm,dd)]
# df_dev = data_for_modeling[data_for_modeling['date']>datetime.date(yyyy,mm,dd)]

In [None]:
# random split by dates
def ts_daily_train_dev_split(df, date_feature, test_size, random_state=None):
    """Split a dataframe into train and dev sets by whole days.

    The unique values of `date_feature` are split at random, and every row follows
    its day, so all rows sharing a date end up in exactly one of the two sets.

    param df: original dataframe
    param date_feature: name of the column holding each row's date (day)
    param test_size: share of the unique days assigned to the dev set, in [0, 1]
    param random_state: random state to replicate the split

    return df_train: dataframe with the training set
    return df_dev: dataframe with the development (dev) set
    """
    # one entry per distinct day
    unique_days = pd.Series(df[date_feature].unique())

    # randomly assign days (not rows) to each side
    train_days, dev_days = train_test_split(unique_days, test_size=test_size, random_state=random_state)

    # rows follow the side their day was assigned to
    in_train = df[date_feature].isin(train_days)
    in_dev = df[date_feature].isin(dev_days)
    return df.loc[in_train], df.loc[in_dev]

In [None]:
test_frac = 0.2
# Capture the returned frames (otherwise the split is computed and thrown away) and
# seed it with the project seed so the same days land in each set on every run.
# The random split in the following cell assigns the same names — run only the
# splitting strategy you want to use.
df_train, df_dev = ts_daily_train_dev_split(df=data_for_modeling, date_feature='date',
                                            test_size=test_frac, random_state=random_seed)

In [None]:
# random split general - not by dates
# The split is seeded with random_seed and stratified on the label column only.
# NOTE(review): stratification presumably requires a discrete label with at least two
# rows per class — confirm this holds, since the models fitted below are regressors.
test_frac = 0.2
df_train, df_dev = train_test_split(data_for_modeling, test_size=test_frac, 
                               random_state=random_seed, stratify=data_for_modeling[[y_col]]) 

In [None]:
# create the X train and dev datasets (every column except the target)
# y_col is used instead of a hard-coded 'label' so the target name is defined in one place
X_train = df_train.drop(y_col, axis=1)
X_dev = df_dev.drop(y_col, axis=1)

# create the target variable
y_train = df_train[y_col]
y_dev = df_dev[y_col]


#### Apply Boruta methods for feature selection

In [None]:
# remove unwanted features and non-numeric (object) columns
# NOTE(review): 'xxx', 'xxy', 'xxz' look like placeholders for real column names.
unwanted_features = {'xxx', 'xxy', 'xxz'}

# Build a new list instead of calling remove() on X_cols while looping over it:
# removing during iteration skips the element that follows each removed one, so
# consecutive object columns were silently kept.
X_cols = [x for x in X_train.columns
          if x not in unwanted_features and X_train[x].dtypes != 'object']

In [None]:
# Median-impute missing values, then standardize.
# Both transformers are fitted on the training features and applied to them in one step.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
scaler = StandardScaler()

Ximp = imputer.fit_transform(df_train[X_cols])
Xscaled = scaler.fit_transform(Ximp)

In [None]:
# instantiate random forest
# The regressor is imported under the alias `RF`; the bare name RandomForestRegressor
# is not in scope and raised a NameError.
forest = RF(n_jobs = -1, max_depth = 5)

# fit boruta on the imputed and scaled training features
boruta_selector = BorutaPy(forest, n_estimators = 'auto', random_state = 0)
boruta_selector.fit(np.array(Xscaled), np.array(df_train[y_col]))

In [None]:
# print every feature whose Boruta rank is 20 or better, as "<rank> <feature>"
boruta_ranking = boruta_selector.ranking_
for rank, feature in zip(boruta_ranking, X_cols):
    if rank <= 20:
        print (rank, feature)

In [None]:
orig_X_cols = X_cols

In [None]:
# store results
boruta_ranking = boruta_selector.ranking_
selected_features = np.array(X_cols)[boruta_ranking <= 2]
print(selected_features)

## Models

In [None]:
# assign scoring metric 
scoring_metric = 'neg_mean_squared_error'
# scoring_metric = 'neg_root_mean_squared_error'
# scoring_metric = 'r2'

### KNN

In [None]:
# def Gridsearch_knn(param_grid, cv, train_data, train_target):
#     ''' Fit a KNN regression model and find the optimal value for k '''
#     # GridSearchCV method call to extract parameter values from KNN estimator
#     reg = GridSearchCV(KNeighborsRegressor(), k_range, cv=cv, iid=False, n_jobs=-1)
    
#     # Fit on the train set
#     reg_fit = reg.fit(train_data, train_target)

#     return reg_fit.best_params_['n_neighbors']
     
# # define the parameter values that should be searched
# # Create the k parameter grid to search over in the GridSearchCV method call
# # single key-value pair for param_grid
# k_range = {'n_neighbors': list(range(1,50,1))}
    
# # Tune the hyperparameters to find the optimal value for k in the KNN regression
# best_k = Gridsearch_knn(param_grid= k_range, cv = 5, train_data = X_train, train_target = y_train)

In [None]:
# # calculate predicted values on dev data
# knn = KNeighborsRegressor(n_neighbors=best_k, weights='distance')
# knn.fit(X_train, y_train) 
# knn_predictions = np.round(knn.predict(X_dev))

In [None]:
# # evaluate predictions
# prediction(actual=y_dev, pred=knn_predictions)

In [None]:
# KNN regression
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
scaler = StandardScaler()
# KNeighborsRegressor accepts no random_state argument, so none is passed
estimator = KNN()
# each key is a complete quoted string in the '<step>__<parameter>' form
params = {'estimator__n_neighbors': list(range(1,50,1)),
          'estimator__weights': ['uniform', 'distance'],
          'estimator__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

knn = model_pipeline(imputer, scaler, estimator, params, scoring_metric, 
                    df_train[selected_features], df_train[y_col], df_dev[selected_features], df_dev[y_col])

In [None]:
# Assess feature importance
# A fitted KNN regressor exposes neither `coef_` nor `feature_importances_`, so
# feat_importance() cannot rank its features. Permutation importance is model-agnostic:
# it measures how much the dev-set score drops when one feature's values are shuffled.
knn_permutation = permutation_importance(knn, df_dev[selected_features], df_dev[y_col],
                                         scoring=scoring_metric, n_repeats=10,
                                         random_state=random_seed, n_jobs=-1)
pd.DataFrame({'importance': knn_permutation.importances_mean,
              'importance_std': knn_permutation.importances_std},
             index=selected_features).sort_values('importance', ascending=False)

### Linear Regression

In [None]:
# Linear regression (ordinary least squares)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
scaler = StandardScaler()
# LinearRegression accepts no random_state argument, so none is passed
estimator = LinearRegression()
# Search parameters must use the '<step>__<parameter>' form; 'n_features_to_select'
# is not a parameter of any step in this pipeline.
params = {'estimator__fit_intercept': [True, False]}

lr = model_pipeline(imputer, scaler, estimator, params, scoring_metric, 
                    df_train[selected_features], df_train[y_col], df_dev[selected_features], df_dev[y_col])

In [None]:
# Assess feature importance (coefficients ranked by absolute value)
# model_type is passed by keyword: a positional argument may not follow a keyword argument
feat_importance(lr, X_cols=selected_features, model_type='linear')

### LASSO regression

In [None]:
# def Gridsearch_lasso(param_grid, cv, train_data, train_target):
#     ''' Fit a lasso regression model and find the optimal value for alpha '''
#     # GridSearchCV method call to extract parameter values from Ridge Regression estimator
#     reg = GridSearchCV(Lasso(max_iter=1000, tol=0.001), alpha_range, cv=cv, iid=False, n_jobs=-1)
    
#     # Fit on the train set
#     reg_fit = reg.fit(train_data, train_target)

#     return reg_fit.best_params_['alpha']
    
# # define the parameter values that should be searched
# # Create the alpha parameter grid to search over in the GridSearchCV method call
# # single key-value pair for param_grid
# alpha_range = {'alpha': list(range(1,1000,3))}
    
# # Tune the hyperparameters to find the optimal value for alpha in the LASSO regression
# best_alpha = Gridsearch_lasso(param_grid= alpha_range, cv = 5, train_data = X_train, train_target = y_train)

In [None]:
# # calculate predicted values on dev data
# lassolr = Lasso(alpha=best_alpha, tol=0.001).fit(X_train, y_train)
# lassolr_predictions = np.round(lassolr.predict(X_dev))

In [None]:
# # evaluate predictions
# prediction(actual=y_dev, pred=lassolr_predictions)

In [None]:
# LASSO regression: randomized search over alpha and the coefficient selection order
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
scaler = StandardScaler()
estimator = Lasso(random_state=random_seed)
params = {
    'estimator__alpha': list(range(1,1000,3)),
    'estimator__selection': ['cyclic', 'random'],
}

# fit on the training set, report errors on the dev set
lasso = model_pipeline(
    imputer, scaler, estimator, params, scoring_metric,
    df_train[selected_features], df_train[y_col],
    df_dev[selected_features], df_dev[y_col],
)

In [None]:
# Assess feature importance (coefficients ranked by absolute value)
# model_type is passed by keyword: a positional argument may not follow a keyword argument
feat_importance(lasso, X_cols=selected_features, model_type='linear')

### Ridge Regression

In [None]:
# def Gridsearch_ridge(param_grid, cv,train_data, train_target):
#     ''' Fit a Ridge regression model and find the optimal value for alpha '''
#     # GridSearchCV method call to extract parameter values from Ridge Regression estimator
#     reg = GridSearchCV(Ridge(), alpha_range, cv=cv, iid=False, n_jobs=-1)
    
#     # Fit on the train set
#     reg_fit = reg.fit(train_data, train_target)

#     return reg_fit.best_params_['alpha']
     
# alpha_range = {'alpha': list(range(0,1000,3))}
    
# # Tune the hyperparameters to find the optimal value for alpha in the Ridge regression
# best_alpha = Gridsearch_ridge(param_grid= alpha_range, cv = 5,train_data = X_train, train_target=y_train)

In [None]:
# calculate predicted values on dev data
# ridgelr = Ridge(alpha=best_alpha).fit(X_train, y_train)
# ridgelr_predictions = np.round(ridgelr.predict(X_dev))

In [None]:
# evaluate predictions
# prediction(actual=y_dev, pred=ridgelr_predictions)

In [None]:
# Ridge regression: randomized search over alpha and the solver
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
scaler = StandardScaler()
estimator = Ridge(random_state=random_seed)
params = {
    'estimator__alpha': list(range(0,1000,3)),
    'estimator__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}

# fit on the training set, report errors on the dev set
ridge = model_pipeline(
    imputer, scaler, estimator, params, scoring_metric,
    df_train[selected_features], df_train[y_col],
    df_dev[selected_features], df_dev[y_col],
)

In [None]:
# Assess feature importance (coefficients ranked by absolute value)
# model_type is passed by keyword: a positional argument may not follow a keyword argument
feat_importance(ridge, X_cols=selected_features, model_type='linear')

### Random Forest

In [None]:
# Random Forest regression: randomized search over tree size, split criterion and forest size
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
scaler = StandardScaler()
estimator = RF(bootstrap=True, random_state=random_seed)
params = {
    'estimator__max_leaf_nodes': randint(30,120),
    'estimator__max_depth': randint(30, 100),
    'estimator__max_features': ['auto','sqrt','log2'],
    'estimator__min_samples_leaf': randint(15,50),
    'estimator__criterion': ['mse','mae'],
    'estimator__n_estimators': randint(30,150),
}

# fit on the training set, report errors on the dev set
rf = model_pipeline(
    imputer, scaler, estimator, params, scoring_metric,
    df_train[selected_features], df_train[y_col],
    df_dev[selected_features], df_dev[y_col],
)

In [None]:
# Assess feature importance (impurity-based importances of the best forest)
# model_type is passed by keyword: a positional argument may not follow a keyword argument
feat_importance(rf, X_cols=selected_features, model_type='tree')

### Gradient Boosting Regression

In [None]:
# Gradient Boosting regression: randomized search over tree size, criterion, ensemble size,
# learning rate and loss (subsample is left at its default)
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
scaler = StandardScaler()
estimator = GB(random_state=random_seed, learning_rate=0.1)
params = {
    'estimator__max_leaf_nodes': randint(30,120),
    'estimator__max_depth': randint(30, 100),
    'estimator__min_samples_leaf': randint(15,50),
    'estimator__criterion': ['friedman_mse', 'mse', 'mae'],
    'estimator__n_estimators': randint(30,150),
    'estimator__max_features': ['auto','sqrt','log2'],
    'estimator__learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    'estimator__loss': ['ls', 'lad', 'huber', 'quantile'],
}

# fit on the training set, report errors on the dev set
gb = model_pipeline(
    imputer, scaler, estimator, params, scoring_metric,
    df_train[selected_features], df_train[y_col],
    df_dev[selected_features], df_dev[y_col],
)

In [None]:
# Assess feature importance (impurity-based importances of the best booster)
# model_type is passed by keyword: a positional argument may not follow a keyword argument
feat_importance(gb, X_cols=selected_features, model_type='tree')

### Extra Trees Regression

In [None]:
# Extra Trees regression: randomized search over tree size, split criterion and ensemble size
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
scaler = StandardScaler()
estimator = ET(random_state=random_seed)
params = {
    'estimator__max_leaf_nodes': randint(30,120),
    'estimator__max_depth': randint(30, 100),
    'estimator__min_samples_leaf': randint(15,50),
    'estimator__criterion': ['mse', 'mae'],
    'estimator__n_estimators': randint(30,150),
    'estimator__max_features': ['auto','sqrt','log2'],
}

# fit on the training set, report errors on the dev set
et = model_pipeline(
    imputer, scaler, estimator, params, scoring_metric,
    df_train[selected_features], df_train[y_col],
    df_dev[selected_features], df_dev[y_col],
)

In [None]:
# Assess feature importance (impurity-based importances of the best ensemble)
# model_type is passed by keyword: a positional argument may not follow a keyword argument
feat_importance(et, X_cols=selected_features, model_type='tree')

### Ada Boost Regression

In [None]:
# Ada Boost regression: randomized search over ensemble size, learning rate and loss
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
scaler = StandardScaler()
estimator = AB(random_state=random_seed)
params = {
    'estimator__n_estimators': randint(30,150),
    'estimator__learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    'estimator__loss': ['linear', 'square', 'exponential'],
}

# fit on the training set, report errors on the dev set
ab = model_pipeline(
    imputer, scaler, estimator, params, scoring_metric,
    df_train[selected_features], df_train[y_col],
    df_dev[selected_features], df_dev[y_col],
)

In [None]:
# Assess feature importance (feature importances of the best boosted ensemble)
# model_type is passed by keyword: a positional argument may not follow a keyword argument
feat_importance(ab, X_cols=selected_features, model_type='tree')

### Multi-Layer Perceptron (Neural Network)

In [None]:
# MLPClassifier
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
scaler = StandardScaler()
classifier = MLPClassifier(random_state=random_seed)
# model_pipeline() names the final pipeline step 'estimator', so every search parameter
# needs the 'estimator__' prefix; a 'classifier__' prefix matches no step.
# NOTE(review): the alpha grid was 1e4/1e3/1e2; assumed the exponents were missing a
# minus sign, since alpha is a regularization strength — confirm the intended range.
params = {'estimator__hidden_layer_sizes': randint(3, 100), 
          'estimator__activation': ['identity','logistic','tanh','relu'],
          'estimator__alpha': [1e-4,1e-3,1e-2], 
          'estimator__learning_rate': ['constant', 'invscaling','adaptive'], 
          'estimator__max_iter': randint(150,250), 
          'estimator__early_stopping': [True, False]}
#scoring_metric = 'roc_auc'
mlp = model_pipeline(imputer, scaler, classifier, params, scoring_metric, 
                    df_train[selected_features], df_train[y_col], df_dev[selected_features], df_dev[y_col])

In [None]:
# (title, normalize) pairs: raw counts first, then counts normalized over the true labels
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]

# one confusion-matrix plot per setting, evaluated on the dev set
for title, normalize in titles_options:
    disp = plot_confusion_matrix(mlp, df_dev[selected_features], df_dev[y_col],
                                 display_labels=["Category 1", "Category 2"],  # spelling fixed ("Categrory")
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)

    disp.ax_.set_title(title)
plt.show()