In [1]:
from optbinning import OptimalBinning

from os import listdir
from os.path import isfile, join
from datetime import datetime, timedelta
import time
import sys

import pandas as pd
import numpy as np
import random

from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import chi2

from optbinning import OptimalBinning
from catboost import CatBoostClassifier, Pool, cv

import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from bayes_opt import BayesianOptimization
import catboost
import pickle
import csv
import scipy as sc

## Create a dataset

In [41]:
def create_df(seed):
    np.random.seed(seed)
    x1=np.random.normal(0,1,1000)
    x2=np.random.normal(0,1,1000)
    x3=np.random.normal(0,1,1000)
    x4=np.random.normal(0,1,1000)/5

    z=1 + 2*x1 + x2 + 2*x2*x3 +x4
    pr=1/(1+np.exp(-z))
    y=sc.stats.binom.rvs(1, pr, size=1000)
    y1= z+x4
    df = pd.DataFrame(data={'y':y, 'y1': y1,'x1':x1, 'x2':x2, 'x3':x3, 'x4':x4})
    return [df,z]

In [42]:
df = create_df(30)[0]

In [43]:
df['x5'] = 'B'
df.loc[df.x1>0.1,'x5'] = 'B'
df.loc[df.x1>0.4,'x5'] = 'C'
df.loc[df.x1>0.6,'x5'] = 'D'

In [44]:
#data = df['data'].copy()
#data.columns = df['feature_names'].copy()
#df_view = pd.DataFrame(data, columns = list(df['feature_names']))
#df_view['target'] = df['target'].copy()

In [45]:
df_view = df.copy()

In [46]:
df_view.shape

(1000, 7)

In [47]:
df_types = df_view.dtypes
df_types.value_counts()

float64    5
object     1
int64      1
dtype: int64

In [48]:
categorical_cols = df_types[df_types=='object']
categorical_cols = list(categorical_cols.index)
categorical_cols = [x for x in categorical_cols if x!='uuid']
categorical_cols

['x5']

In [49]:
numerical_cols = df_types[df_types!='object']
numerical_cols = list(numerical_cols.index)
numerical_cols = [x for x in numerical_cols if x!='default']
to_cat = [x for x in numerical_cols if 'status' in x] 
categorical_cols = categorical_cols + to_cat
#numerical_cols = list(set(numerical_cols)-set(ordinal_cols))
numerical_cols = list(set(numerical_cols)-set(categorical_cols))

In [50]:
#numerical_cols.pop('y')
pos = numerical_cols.index("y")
numerical_cols.pop(pos)
numerical_cols

['x4', 'x3', 'x2', 'x1', 'y1']

In [51]:
df_view= df_view[df_view.y.isnull()==False]
random.seed(1)
df_view['random'] = np.random.randint(1, df_view.shape[0], df_view.shape[0])/df_view.shape[0]
porc_train=0.6
porc_test=0.2

df_view['flag_train'] = 0 #train
df_view['flag_train'] = ((df_view['random'] > porc_train)).astype('int') #eval
df_view.loc[df_view['random'] > (1-porc_test),'flag_train']=2 #test

df_view.flag_train.value_counts()

0    613
2    199
1    188
Name: flag_train, dtype: int64

In [52]:
#print(categorical_cols)
predictors = numerical_cols + categorical_cols 
end_num = len(numerical_cols)
end_cat = len(predictors)
predictors

['x4', 'x3', 'x2', 'x1', 'y1', 'x5']

In [53]:
l = predictors.copy()
l.extend(['y', 'flag_train'])
#print(l)
for x in categorical_cols:
    df_view[x] = df_view[x].astype(str)
df_view2 = df_view[l].copy()

cat_features = [df_view2.columns.get_loc(c) for c in categorical_cols  if c in df_view2]
cat_features

[5]

In [54]:

X_train = df_view2[(df_view2['flag_train'] ==0)]
y_train = X_train.pop('y')

X_val = df_view2[(df_view2['flag_train'] ==1)]
y_val = X_val.pop('y')

X_test = df_view2[(df_view2['flag_train'] ==2)]
y_test = X_test.pop('y')

x_train = X_train[predictors].values

__Save dataset for test__

In [56]:
df_view2[(df_view2['flag_train'] == 2)].to_csv('X_test.csv', index = False)


In [None]:
def catboost_classifier(depth, l2_leaf_reg, num_boost_round, subsample):
    params = {
        "loss_function": "Logloss",
        "eval_metric" : "AUC", 
        "depth" : int(depth),
        "min_data_in_leaf": 100,
        "l2_leaf_reg" : int(l2_leaf_reg),
        "learning_rate" : 0.01,
        "random_state" : 42,
        "logging_level" : "Silent",
        "thread_count": 24,
        "num_boost_round": int(num_boost_round),
        "subsample": float(subsample)
    }
    train_data = catboost.Pool(data=x_train, 
                               label=y_train, 
                               cat_features=cat_features)
    cv_result = catboost.cv(
                       train_data,
                       params,
                       early_stopping_rounds=20,
                       stratified=True,
                       nfold=3)
    return cv_result['test-AUC-mean'].iloc[-1]

catboostBO = BayesianOptimization(catboost_classifier, {
                                                'depth': (3,  10),
                                                'l2_leaf_reg': (2, 9),
                                                'num_boost_round': (100, 1000),
                                                'subsample':(0.2,.9)
                                                })


In [None]:
model = CatBoostClassifier(thread_count=5, 
                           max_depth=10, 
                           loss_function='Logloss' ,
                           verbose=100,
                           eval_metric= "AUC",#"CrossEntropy",
                           #early_stopping_rounds=20,
                           random_seed=42,
                           learning_rate=0.01,
                           l2_leaf_reg=5.9,
                           min_data_in_leaf=100,
                           iterations=900,
                           subsample=0.4)

model.fit(x_train,  y_train,  eval_set = (X_val[predictors].values, y_val),   cat_features = cat_features, plot=True)

In [None]:
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

filename = "predictors.csv"
with open(filename, 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(predictors)

filename = "to_cat.csv"
with open(filename, 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(categorical_cols)   