# CHURN PROJECT

In [26]:
import pandas as pd
import numpy as np
import warnings
import os
import sys
import re
from sklearn.utils import shuffle
from dateutil.relativedelta import relativedelta
from datetime import datetime
import string
import pickle
import math
from scipy.stats import binom
import perceptron

warnings.filterwarnings('ignore')

## 1. Read input datasets

In [3]:
# Read Train Dataset (feeds the preprocessing pipeline that produces df_ready)
df_start = pd.read_csv('data/p01_bank_data/bank_data_train.csv')

# Read Final Test Dataset (preprocessed alongside the train frame into df_ready_f)
df_final = pd.read_csv('data/p01_bank_data/bank_data_test.csv')

## 2. PreProcessing Functions

### Add new calculated trend features

In [4]:
import inspect

def add_trend_features(df):
    """Insert a ``<prefix>TREND`` ratio column for every 1M/3M feature pair.

    For each prefix the new column equals ``1M / ((3M - 1M) / 2)`` and is
    inserted at position 1 of ``df``. The frame is modified in place; after
    all columns are added, +/-inf anywhere in the frame is replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``<prefix>1M`` and ``<prefix>3M`` for every prefix
        listed below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    started = datetime.now()
    f_name = inspect.getframeinfo(inspect.currentframe()).function
    print("Start " + f_name + " " + str(started))

    short_sfx, long_sfx, trend_sfx = '1M', '3M', 'TREND'
    prefixes = (
        'CNT_TRAN_ATM_TENDENCY',
        'CNT_TRAN_AUT_TENDENCY',
        'CNT_TRAN_CLO_TENDENCY',
        'CNT_TRAN_MED_TENDENCY',
        'CNT_TRAN_SUP_TENDENCY',
        'REST_DYNAMIC_CC_',
        'REST_DYNAMIC_CUR_',
        'REST_DYNAMIC_FDEP_',
        'REST_DYNAMIC_IL_',
        'REST_DYNAMIC_PAYM_',
        'SUM_TRAN_ATM_TENDENCY',
        'SUM_TRAN_AUT_TENDENCY',
        'SUM_TRAN_CLO_TENDENCY',
        'SUM_TRAN_MED_TENDENCY',
        'SUM_TRAN_SUP_TENDENCY',
        'TURNOVER_DYNAMIC_CC_',
        'TURNOVER_DYNAMIC_CUR_',
        'TURNOVER_DYNAMIC_IL_',
        'TURNOVER_DYNAMIC_PAYM_',
    )
    for prefix in prefixes:
        short_term = df[prefix + short_sfx]
        long_term = df[prefix + long_sfx]
        # Division by zero (3M == 1M) yields inf/NaN here; inf is cleaned below.
        trend = short_term / ((long_term - short_term) / 2)
        df.insert(1, prefix + trend_sfx, trend)

    print("Time of " + f_name + " = "+ str(datetime.now() - started))
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df


### Replace NULL values by datatype

In [5]:
# Fill values used to replace nulls, keyed by column dtype name
dic_null = {
    'int64': -1,
    'float64': -1.0,
    'object': 'N/A',
}

# Build a dictionary that lists the columns belonging to each dtype
def create_dic_type_list(df_in):
    """Group the column names of ``df_in`` by dtype.

    Returns a dict whose keys are the dtype objects found in ``df_in.dtypes``
    and whose values are lists of column names, in column order.
    """
    columns_by_dtype = {}
    dtypes = df_in.dtypes
    for column_name, column_dtype in zip(dtypes.index, dtypes.values):
        columns_by_dtype.setdefault(column_dtype, []).append(column_name)
    return columns_by_dtype

# Replace missing values with the per-dtype fill values from the dictionary
def replace_null(df_in, dic_null, dic_start_types):
    """Fill nulls in ``df_in`` column group by column group.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to fill; it is modified in place.
    dic_null : dict
        Fill values under the keys ``'int64'``, ``'float64'`` and ``'object'``.
    dic_start_types : dict
        Maps a dtype to the list of columns of that dtype.

    Returns
    -------
    pandas.DataFrame
        ``df_in`` itself.
    """
    started = datetime.now()
    f_name = inspect.getframeinfo(inspect.currentframe()).function
    print("Start " + f_name + " " + str(started))
    for dtype_key, columns in dic_start_types.items():
        # Any dtype other than int64/float64 falls back to the 'object' value.
        fallback = dic_null['object']
        if dtype_key == 'int64':
            fill_value = dic_null['int64']
        elif dtype_key == 'float64':
            fill_value = dic_null['float64']
        else:
            fill_value = fallback
        df_in[columns] = df_in[columns].fillna(value=fill_value)
    print("Time of " + f_name + " = "+ str(datetime.now() - started))
    return df_in

### Categorical feature encoding with LabelEncoder

In [6]:
from sklearn.preprocessing import LabelEncoder
import re
import string
import bisect 



# Collapse repeated whitespace
def remove_mult_spaces(text):
    """Return ``text`` with every whitespace run reduced to one space and the ends trimmed."""
    words = text.split()
    return " ".join(words)

# Strip punctuation marks
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character replaced by a space."""
    marks = string.punctuation
    to_space = str.maketrans(marks, ' ' * len(marks))
    return text.translate(to_space)

def replace_label_encoder(df, df1):
    """Normalise and label-encode every non-numeric column of ``df`` and ``df1``.

    Text is lower-cased, punctuation is replaced by spaces and whitespace is
    collapsed. An encoder is then fitted per column on the categories of
    ``df`` plus a reserved ``'other'`` category; values of ``df1`` that were
    not seen in ``df`` are mapped to ``'other'``.

    The encoder is fitted (including ``'other'``) *before* either frame is
    transformed, so a given category always gets the same integer code in
    both frames. Previously ``df`` was encoded first and ``'other'`` was
    inserted afterwards, which shifted the codes used for ``df1`` for every
    category sorting after ``'other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; defines the known categories. Modified in place.
    df1 : pandas.DataFrame
        Second frame encoded with the same mapping. Modified in place.

    Returns
    -------
    tuple
        ``(df, df1)`` - the same objects that were passed in.
    """
    tm1 = datetime.now()
    f_name = inspect.getframeinfo(inspect.currentframe()).function
    print("Start " + f_name + " " + str(tm1))
    categorical_features = list(df.select_dtypes(exclude=[np.number]).columns)

    def _normalise(series):
        # lower-case -> punctuation to spaces -> collapse whitespace
        return series.str.lower().apply(remove_punctuation).apply(remove_mult_spaces)

    for col in categorical_features:
        df[col] = _normalise(df[col])
        df1[col] = _normalise(df1[col])

        # A set gives O(1) membership tests; the ndarray lookup was O(n_classes) per row.
        known = set(df[col].unique())
        df1[col] = df1[col].map(lambda s: s if s in known else 'other')

        # The set union also avoids a duplicated 'other' class when the
        # training data already contains that literal value.
        encoder = LabelEncoder()
        encoder.fit(sorted(known | {'other'}))
        df[col] = encoder.transform(df[col])
        df1[col] = encoder.transform(df1[col])

    print("Time of " + f_name + " = "+ str(datetime.now() - tm1))
    return df, df1

### DataFrame MinMaxScaler

In [7]:
from sklearn.preprocessing import MinMaxScaler


def dataframe_scaler(df, df1):
    """Min-max scale the feature columns of ``df`` and apply the same scaling to ``df1``.

    Feature columns are all columns of ``df`` except the first and the last.
    Scaling parameters are learned from ``df`` only. Both frames are modified
    in place.

    Scaled values are written back as plain arrays, i.e. by row position.
    The earlier ``df[col] = pd.DataFrame(values)`` aligned on a fresh
    0..n-1 index and therefore produced NaN / misaligned values for any
    frame whose index was not the default RangeIndex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the scaler is fitted on.
    df1 : pandas.DataFrame
        Frame transformed with the fitted scaler; must contain the same
        feature columns.

    Returns
    -------
    tuple
        ``(df, df1)`` - the same objects that were passed in.
    """
    tm1 = datetime.now()
    f_name = inspect.getframeinfo(inspect.currentframe()).function
    print("Start " + f_name + " " + str(tm1))

    feature_cols = list(df.columns)[1:-1]
    if feature_cols:
        # MinMaxScaler works per feature, so one fit over all columns is
        # equivalent to refitting it column by column.
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(df[feature_cols].to_numpy())
        scaled1 = scaler.transform(df1[feature_cols].to_numpy())
        for pos, col in enumerate(feature_cols):
            df[col] = scaled[:, pos]
            df1[col] = scaled1[:, pos]

    print("Time of " + f_name + " = "+ str(datetime.now() - tm1))
    return (df, df1)

### Feature selection

In [8]:
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.utils import shuffle

def feature_selection(df, df1):
    """Drop the features to which a Lasso fit assigns a zero coefficient.

    Features are all columns of ``df`` except the first and the last; the
    last column is the target. The frame is shuffled (fixed seed), a
    ``LassoCV`` run supplies the alpha grid, and a plain ``Lasso`` is fitted
    with the last alpha of that grid. Columns whose coefficient is exactly 0
    are dropped from both frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame used for the fit.
    df1 : pandas.DataFrame
        Second frame from which the same columns are dropped.

    Returns
    -------
    tuple
        ``(df, df1)`` as new frames without the dropped columns.
    """
    tm0 = datetime.now()
    f_name = inspect.getframeinfo(inspect.currentframe()).function
    print("Start " + f_name + " " + str(tm0))

    # Determine the alpha hyperparameter for the Lasso
    tm1 = datetime.now()
    print("   Start calc Alpha value " + str(tm1))
    use_columns = df.columns[1:-1]
    target_column = df.columns[-1]
    df_lasso = shuffle(df, random_state=42)
    X = df_lasso[use_columns].to_numpy()
    # 1-D target: avoids the column-vector conversion inside scikit-learn.
    y = df_lasso[target_column].to_numpy()
    clf = LassoCV(cv=5, random_state=42)
    clf.fit(X, y)
    # NOTE(review): this takes the LAST alpha of the search grid, not the
    # cross-validated optimum (clf.alpha_). Kept as-is because changing it
    # changes which features survive - confirm whether this is intended.
    alpha = clf.alphas_[-1]
    print("   Time of calc Alpha value = "+ str(datetime.now() - tm1))

    # Lasso model used to pick the significant features
    tm1 = datetime.now()
    print("   Start: Lasso " + str(tm1))
    lso = Lasso(alpha=alpha)
    lso.fit(X, y)
    print("   Time of Lasso = "+ str(datetime.now() - tm1))

    # Features with a zero coefficient carry no weight in the model.
    lst_drop = sorted(
        name for name, coef in zip(use_columns, lso.coef_) if abs(coef) == 0
    )

    df = df.drop(columns=lst_drop)
    df1 = df1.drop(columns=lst_drop)

    print("Time of " + f_name + " = "+ str(datetime.now() - tm0))
    return df, df1

### Anomaly detection

In [9]:
from sklearn.ensemble import IsolationForest
from numpy import quantile, where, random

def anomaly_detection(df):
    """Remove the rows that an Isolation Forest scores as most anomalous.

    The forest is fitted on all columns except the first and the last.
    Rows whose score is at or below the 0.001 quantile of all scores are
    dropped.

    ``score_samples`` yields row *positions*; they are translated to index
    labels before dropping. Passing positions straight to ``DataFrame.drop``
    (which is label-based) removed the wrong rows, or raised, whenever the
    index was not the default 0..n-1 range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged rows.
    """
    tm1 = datetime.now()
    f_name = inspect.getframeinfo(inspect.currentframe()).function
    print("Start " + f_name + " " + str(tm1))
    x = df.iloc[:, 1:-1].to_numpy()
    # No random_state is set, so the flagged rows can differ between runs.
    iforest = IsolationForest(n_estimators=1000)
    iforest.fit(x)
    scores = iforest.score_samples(x)
    thresh = np.quantile(scores, 0.001)
    positions = np.flatnonzero(scores <= thresh)
    df_anomaly_del = df.drop(index=df.index[positions])
    print("Time of " + f_name + " = "+ str(datetime.now() - tm1))
    return df_anomaly_del

## 3. Execute Preprocessing

In [15]:
# add_trend_features inserts the *TREND columns into the frame it receives and
# returns that same frame, so df_trend / df_trend_f are df_start / df_final.
df_trend = add_trend_features(df_start)
df_trend_f = add_trend_features(df_final)

Start add_trend_features 2022-03-29 13:09:27.208466
Time of add_trend_features = 0:00:06.515208
Start add_trend_features 2022-03-29 13:09:34.512999
Time of add_trend_features = 0:00:01.777491


In [16]:
# Group columns by dtype, then fill nulls with the per-dtype value from dic_null.
# replace_null writes into the frame it is given and returns it.
dic_start_types = create_dic_type_list(df_trend)
df_no_null = replace_null(df_trend, dic_null, dic_start_types)

# Same treatment for the final-test frame, using its own dtype grouping.
dic_start_types_f = create_dic_type_list(df_trend_f)
df_no_null_f = replace_null(df_trend_f, dic_null, dic_start_types_f)

Start replace_null 2022-03-29 13:09:36.606571
Time of replace_null = 0:00:01.013497
Start replace_null 2022-03-29 13:09:37.620756
Time of replace_null = 0:00:00.288369


In [17]:
# Encoders are fitted on the train frame; the final-test frame reuses them.
df_label, df_label_f  = replace_label_encoder(df_no_null, df_no_null_f)

Start replace_label_encoder 2022-03-29 13:09:37.918338
Time of replace_label_encoder = 0:01:02.647138


In [18]:
# The scaler is fitted on the train frame only and then applied to both frames.
df_scaled, df_scaled_f = dataframe_scaler(df_label, df_label_f )

Start dataframe_scaler 2022-03-29 13:10:40.571450
Time of dataframe_scaler = 0:00:00.835214


In [19]:
# Columns with a zero Lasso coefficient are dropped from both frames.
df_features_selection, df_features_selection_f = feature_selection(df_scaled, df_scaled_f)

Start feature_selection 2022-03-29 13:10:41.411420
   Start calc Alpha value 2022-03-29 13:10:41.416907
   Time of calc Alpha value = 0:01:13.350830
   Start: Lasso 2022-03-29 13:11:54.767867
   Time of Lasso = 0:00:56.005590
Time of feature_selection = 0:02:09.652027


In [22]:
# Outlier rows are removed from the training frame only; the final-test frame
# is kept whole and merely copied.
df_ready = anomaly_detection(df_features_selection)
df_ready_f = df_features_selection_f.copy()

Start anomaly_detection 2022-03-29 13:17:17.470338
Time of anomaly_detection = 0:07:33.789324


In [None]:
# Load the cached preprocessed frames; this rebinds df_ready / df_ready_f and
# so replaces whatever the cells above produced in this session.
# NOTE(review): the commented-out save calls write to the working directory,
# while the loads read from 'data/' - confirm the files were moved there.
# df_ready.to_pickle('df_ready.pkl') 
df_ready = pd.read_pickle('data/df_ready.pkl')
# df_ready_f.to_pickle('df_ready_f.pkl') 
df_ready_f = pd.read_pickle('data/df_ready_f.pkl')

In [23]:
# Show up to 100 rows / 100 columns and never truncate cell contents.
# Fully qualified option names are used because short names such as
# "max_rows" can match more than one pandas option, and None (not -1) is
# the supported "no limit" value for display.max_colwidth.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", None)

## 4. Stratified Split Data

In [12]:
from sklearn.model_selection import StratifiedShuffleSplit

def create_train_test(df, not_neural=True, col_list=None):
    """Make one stratified 80/20 train/test split of ``df`` as NumPy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is the target. When ``not_neural`` is False
        the frame must expose the target as ``df.TARGET``.
    not_neural : bool, default True
        True  -> ``y`` is the last column, returned as 1-D arrays.
        False -> a complementary ``TARGET_ADD = 1 - TARGET`` column is
        appended and ``y`` holds the last two columns (2-D arrays).
    col_list : list of str, optional
        Feature columns to use. When omitted or empty, every column except
        the first and the last is used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # None replaces the former mutable ``[]`` default; an empty list passed
    # by a caller still selects the default feature columns.
    if col_list is None or len(col_list) == 0:
        use_columns = list(df.columns[1:-1])
    else:
        use_columns = col_list
    target_column = list(df.columns[df.shape[1] - 1 : df.shape[1]])
    X = df[use_columns].to_numpy()
    if not_neural:
        y = df[target_column].to_numpy()
    else:
        df = df.assign(TARGET_ADD = 1 - df.TARGET)
        target_column = list(df.columns[df.shape[1] - 2 : df.shape[1]])
        y = df[target_column].to_numpy()

    # n_splits=1, so the generator yields exactly one (train, test) pair.
    sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state=0)
    train_index, test_index = next(sss.split(X, y))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    if not_neural:
        # (n, 1) -> (n,), the shape the scikit-learn classifiers expect
        y_train = y_train.reshape((y_train.shape[0],))
        y_test = y_test.reshape((y_test.shape[0],))
    return X_train, X_test, y_train, y_test

def create_final_input(df):
    """Return every column of ``df`` except the first and the last as a NumPy array."""
    feature_names = df.columns[1:-1].tolist()
    return df.loc[:, feature_names].to_numpy()

In [13]:
# Stratified 80/20 split of the training frame with 1-D targets (not_neural=True).
X_train, X_test, y_train, y_test = create_train_test(df_ready, True)
# Feature matrix of the final-test frame (first and last columns excluded).
X_Zero = create_final_input(df_ready_f)

NameError: name 'df_ready' is not defined

## 5. Functions for models

### Score functions

In [29]:
from sklearn.metrics import roc_auc_score, accuracy_score
def my_round(y_in):
    """Round every value of ``y_in`` to the nearest integer and return a list of ints.

    Values are visited by iteration rather than by ``y_in[i]`` so that any
    iterable works, including a pandas Series whose index is not 0..n-1
    (where ``y_in[i]`` is a label lookup and fails or picks the wrong row).
    Rounding uses the built-in ``round`` (ties go to the even integer).
    """
    return [int(round(value)) for value in y_in]

def get_scores(y_true, y_pred):
    """Return ``(accuracy, roc_auc)`` for the given targets and predicted scores.

    ROC AUC is computed on the raw scores; accuracy on both inputs rounded
    to 0/1 with ``my_round``.

    Both values are returned because every caller in this notebook unpacks
    ``acc, auc = get_scores(...)``; returning only the AUC made those calls
    fail with an unpacking error.
    """
    roc = roc_auc_score(y_true, y_pred)
    acc = accuracy_score(my_round(y_true), my_round(y_pred))
    return acc, roc

In [None]:
from sklearn.model_selection import GridSearchCV

def GridSearchCV_best_param(model, param_grig, X_train, y_train):
    """Run a 5-fold grid search and return the best parameter combination.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    param_grig : dict
        Parameter grid to search. (The misspelt name is kept so existing
        keyword callers keep working.)
    X_train, y_train : array-like
        Data the search is fitted on.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    tm0 = datetime.now()
    f_name = inspect.getframeinfo(inspect.currentframe()).function
    print("Start " + f_name + " " + str(tm0))
    # Use the grid that was passed in. The body used to read a *global*
    # ``param_grid``, silently ignoring this argument.
    grid_model = GridSearchCV(estimator=model, param_grid=param_grig, cv=5)
    grid_model.fit(X_train, y_train)
    best_param = grid_model.best_params_
    print("Time of " + f_name + " = "+ str(datetime.now() - tm0))
    return best_param


### Show results

In [46]:
# Display of the modelling results
list_rez = []

def show_result_table(list_for_result=None):
    """Return the collected model results as a DataFrame.

    Parameters
    ----------
    list_for_result : list of list, optional
        Rows of ``[library, algorithm, hyperparameters, accuracy, auc]``.
        When omitted, the module-level ``list_rez`` is read at call time,
        so a later ``list_rez = [...]`` rebinding is honoured. (A default of
        ``list_rez`` in the signature is bound once, at definition time.)

    Returns
    -------
    pandas.DataFrame
        One row per result. The former ``df.style.set_properties(...)``
        call was dropped: its result was discarded, so it had no effect.
    """
    if list_for_result is None:
        list_for_result = list_rez
    list_rez_columns = ['Library   ', 'Algorithms', 'Hyperparameters', 'Accuracy', 'AUC']
    return pd.DataFrame(list_for_result, columns=list_rez_columns)

## 6. Models

### 6.1. Baseline: Naive classifier

In [None]:
# Majority-class baseline: always predict the more frequent training label.
train_one = np.sum(y_train)
train_zero = len(y_train) - train_one
naive_rez = 1.0 if train_one > train_zero else 0.0
# Constant 1-D prediction vector, one entry per test sample.
y_pred_naive = np.full(len(y_test), naive_rez)
acc, auc = get_scores(y_test, y_pred_naive)
list_rez.append(['My_own_code', 'Naive classifier', 'No', acc, auc ])
show_result_table()

### 6.2. Baseline: RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc=RandomForestClassifier(random_state=42,criterion='gini')
# Every entry holds a single value, so the "search" evaluates exactly one
# configuration (with 5-fold CV inside GridSearchCV_best_param).
# NOTE(review): n_jobs is a parallelism setting, not a model hyperparameter;
# 100 workers is passed through to the final model as well - confirm intended.
param_grid = {
    'n_jobs': [100],
    'n_estimators': [300],
    'max_features': ['sqrt'],
    'max_depth' : [6]
}
# The search only sees the first 20000 training rows.
best_param = GridSearchCV_best_param(rfc, param_grid, X_train[:20000], y_train[:20000])
print("The best params for RandomForestClassifier:")
print(best_param)
# Refit a fresh model with the chosen parameters on the full training split.
rfc_best = RandomForestClassifier(random_state=42,criterion='gini')
rfc_best.set_params(**best_param)
rfc_best.fit(X_train, y_train)
y_pred_rfc = rfc_best.predict_proba(X_test)
# Column 1 of predict_proba is used as the positive-class score.
acc, auc = get_scores(y_test, y_pred_rfc[:, 1])
list_rez.append(['sklearn.ensemble', 'RandomForestClassifier', best_param, acc, auc ])
show_result_table() 

### 6.3. MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

MLP_clf = MLPClassifier(random_state=42)
# Single-valued grid: one configuration is evaluated with 5-fold CV.
param_grid = {
    'solver': ['lbfgs'], 
    'max_iter': [100], 
    'alpha': [0.0001], 
    'hidden_layer_sizes': [100],
    'activation': ['logistic' ]}

# The search only sees the first 20000 training rows.
best_param = GridSearchCV_best_param(MLP_clf, param_grid, X_train[:20000], y_train[:20000])
print("The best params for MLPClassifier:")
print(best_param)
# Refit a fresh model with the chosen parameters on the full training split.
MLP_best_clf = MLPClassifier(random_state=42)
MLP_best_clf.set_params(**best_param)
MLP_best_clf.fit(X_train, y_train)
y_pred_mlp = MLP_best_clf.predict_proba(X_test)
# Column 1 of predict_proba is used as the positive-class score.
acc, auc = get_scores(y_test, y_pred_mlp[:, 1])
list_rez.append(['sklearn.neural_network', 'MLPClassifier', best_param, acc, auc ])
show_result_table() 

### 6.4. Keras

In [50]:
import tensorflow as tf 
import keras
from keras import layers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras import backend as K



In [52]:
X_train, X_test, y_train, y_test = create_train_test(df_ready, True)

In [53]:
# n_all = y_train.shape[0]
# n_one = y_train.sum()
# class_weight = {0: n_one/n_all, 1: (n_all - n_one)/n_all}
# class_weight

In [54]:
# Import the function itself: importing the ``class_weight`` module and then
# rebinding that name to a dict made this cell fail when it was run twice.
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights for the training labels. Keyword arguments are used
# because recent scikit-learn releases no longer accept them positionally.
class_weight_arr = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train)
# Keyed by position in the sorted class array (0, 1, ...).
class_weight = dict(enumerate(class_weight_arr))
class_weight

{0: 0.5442965888827105, 1: 6.143775430698641}

In [58]:
# Number of input features = number of columns of the training matrix.
input_nodes = X_train.shape[1]

# Initial learning rate; the training loop further down changes it later.
learning_rate = 0.005

# Callback: multiply the learning rate by 0.1 when val_loss has not improved
# by at least min_delta for 10 epochs.
reduceLROnPlateau = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=10,
    verbose=0,
    mode="auto",
    min_delta=0.00001,
    cooldown=0,
    min_lr=0,
)

# Callback: stop when val_loss has not improved for 20 epochs and restore
# the best weights seen.
early_stopping=EarlyStopping(monitor='val_loss',
                             mode='min',
                             patience=20,
                             restore_best_weights=True,
                             min_delta=0.00001
                            )
# Network: Dense(50, relu, no bias) -> Dense(20, relu) -> Dropout(0.25)
# -> Dense(1, sigmoid).
keras_clf = keras.Sequential([
  layers.Dense(50, 
               activation='relu', use_bias=False,
#                kernel_regularizer = keras.regularizers.l2(l=0.0001),
               input_shape=(input_nodes,)), 
#   layers.BatchNormalization(),
    
    
  layers.Dense(20,
#                kernel_regularizer = keras.regularizers.l2(l=0.0001),
               activation='relu', use_bias=True),
#   layers.BatchNormalization(),
  layers.Dropout(0.25),
    
  layers.Dense(1, activation='sigmoid')])


# keras_clf.compile(
#     optimizer=keras.optimizers.SGD(
#     learning_rate=learning_rate, momentum=0.9, nesterov=True

# NOTE(review): this schedule is created but never used - the optimizer below
# receives the plain float ``learning_rate``, not ``lr_schedule``.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)



# Compile with Nadam, binary cross-entropy on probabilities
# (from_logits=False matches the sigmoid output layer) and AUC as the metric.
keras_clf.compile(
    optimizer=tf.optimizers.Nadam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07        
        

# keras_clf.compile(
#     optimizer=keras.optimizers.Adam(
#                                 beta_1=0.9,
#                                 beta_2=0.999,
#                                 learning_rate=learning_rate,
#                                 epsilon=1e-08
        
                                    ),
    loss=keras.losses.BinaryCrossentropy(from_logits=False),
    metrics = [keras.metrics.AUC()])

In [59]:
keras_clf.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 50)                4700      
                                                                 
 dense_10 (Dense)            (None, 20)                1020      
                                                                 
 dropout_3 (Dropout)         (None, 20)                0         
                                                                 
 dense_11 (Dense)            (None, 1)                 21        
                                                                 
Total params: 5,741
Trainable params: 5,741
Non-trainable params: 0
_________________________________________________________________


In [60]:
# Keep the training data in its current order under separate names: the
# training loop below rebinds X_train / y_train to shuffled copies.
y_train_000 = y_train.reshape(-1)
y_test_000 = y_test.reshape(-1)
X_train_000 = X_train.copy()


# Best test-set AUC seen so far; updated by the training loop.
best_auc = 0



In [61]:
# tf.keras.backend.floatx()
# tf.keras.backend.set_floatx('float64')
# tf.keras.backend.floatx()



In [62]:
# w_file = 'weights_keros.txt'
# keras_clf.load_weights(w_file)
# learning_rate = 6.88982775356993e-06
# K.set_value(keras_clf.optimizer.learning_rate, learning_rate)
# K.set_floatx('float64')
#best_auc = 0.8080009905157872

In [63]:
# learning_rate = 3e-5

In [64]:
# Manual training loop: one Keras epoch per iteration, with a hand-rolled
# learning-rate schedule and checkpointing of the best model.
# n_epochs is a countdown until the next learning-rate change (50 iterations
# at first, 10 after every change).
n_epochs = 50
for i in range(10000):
    # Reshuffle features and labels with the same permutation every iteration.
    shuffled_indices = np.random.permutation(len(y_train))
    X_train = X_train[shuffled_indices]
    y_train = y_train[shuffled_indices]
    # NOTE(review): fit() runs a single epoch per call, so the patience-based
    # callbacks presumably never accumulate enough epochs to trigger - verify.
    history = keras_clf.fit(X_train, 
                            y_train,
                            class_weight=class_weight,
                            batch_size=100,
                            epochs=1,
                            validation_split=0.1,
                            callbacks=[early_stopping,
                                       reduceLROnPlateau
                                      ]
                            ,use_multiprocessing=True
                           )
    # auc1: AUC on the training data in its original (unshuffled) order.
    y_pred_keras = keras_clf.predict(X_train_000)
    y_pred_keras = y_pred_keras.reshape(-1)
    auc1 = roc_auc_score(y_train_000, y_pred_keras)
    
    # auc2: AUC on the test split.
    y_pred_keras = keras_clf.predict(X_test)
    y_pred_keras = y_pred_keras.reshape(-1)
    auc2 = roc_auc_score(y_test_000, y_pred_keras)
    # NOTE(review): the checkpoint is selected on the TEST split, so the
    # reported test AUC is optimistically biased; a validation split would
    # avoid that.
    if auc2 > best_auc:
        best_auc = auc2
        # Append a log line and overwrite the saved weights with the new best.
        with open('log_keros.txt', 'a') as the_file:
            the_file.write(f'i = {i}, learning_rate = {learning_rate}, auc1 = {auc1}, auc2 = {auc2} \n')
            w_file = 'weights_keros.txt'
            keras_clf.save_weights(w_file, overwrite=True)
    print (i, learning_rate, auc1, auc2)
    n_epochs -= 1
    if n_epochs <= 0:
        # Countdown expired: raise the learning rate by 5e-5 and restart it.
        n_epochs = 10
        learning_rate = learning_rate + 5e-5
        # w_file is rebound here although the load below is commented out, so
        # a later save_weights(w_file) writes to this file name.
        w_file = 'weights_keros8277.txt'
        # keras_clf.load_weights(w_file)
        # NOTE(review): if learning_rate still has its initial value 0.005,
        # the first increase already exceeds 5e-3 and the loop stops after
        # the first 50 iterations.
        if learning_rate > 5e-3:
            break
        K.set_value(keras_clf.optimizer.learning_rate, learning_rate)

0 0.005 0.7454044823327981 0.7399606437206401
1 0.005 0.7595609446299565 0.7536639380059897
2 0.005 0.7715870690650828 0.7690122944613083
3 0.005 0.7680740874267769 0.7651606887534524
4 0.005 0.7769441264899176 0.7702672300966378
5 0.005 0.7762762652544021 0.7677485324818599
6 0.005 0.7849165221719331 0.7774529537666666
7 0.005 0.7823158022212313 0.7755852783708097
8 0.005 0.7848548021211067 0.7784869612416547
9 0.005 0.7882625467597276 0.7812065003449697
10 0.005 0.7864295000022601 0.7759546808566058
11 0.005 0.7946285674547849 0.7854120986208393
12 0.005 0.7959366341054979 0.7861874545098695
13 0.005 0.7880043044537036 0.7768755485957287
14 0.005 0.7903398940388868 0.7825984202567602
15 0.005 0.7993596799636415 0.7878810489577377
16 0.005 0.7957382706078393 0.7874113591784907
17 0.005 0.8001003034758333 0.7895050365214918
18 0.005 0.8025891643780877 0.7927456499442018
19 0.005 0.8038678142257338 0.7917096055184273
20 0.005 0.8032359305890303 0.7927711012060831
21 0.005 0.804953651028

KeyboardInterrupt: 

In [None]:
# Saves the CURRENT weights (not necessarily the best ones) to whatever name
# w_file held last: 'weights_keros.txt', or 'weights_keros8277.txt' if the
# learning-rate branch of the training loop ran.
keras_clf.save_weights(w_file, overwrite=True)

In [None]:
best_auc

In [None]:
y_pred_keras = keras_clf.predict(X_train)
y_train = y_train.reshape(-1)
y_pred_keras = y_pred_keras.reshape(-1)
auc = roc_auc_score(y_train, y_pred_keras)
acc = accuracy_score(my_round(y_train), my_round(y_pred_keras))
auc, acc 

In [None]:
y_pred_keras = keras_clf.predict(X_test)
y_test = y_test.reshape(-1)
y_pred_keras = y_pred_keras.reshape(-1)
auc = roc_auc_score(y_test, y_pred_keras)
acc = accuracy_score(my_round(y_test), my_round(y_pred_keras))

In [None]:
auc, acc 

In [None]:
from keras import backend as K

# NOTE(review): the first assignment is immediately overwritten by the
# second, so the effective learning rate is always 5e-5.
learning_rate = learning_rate * 0.9
learning_rate = 5e-5
K.set_value(keras_clf.optimizer.learning_rate, learning_rate)

# Restore the weights stored in the 'weights_keros8277.txt' checkpoint.
w_file = 'weights_keros8277.txt'
keras_clf.load_weights(w_file)




In [None]:
y_Zero = keras_clf.predict(X_Zero)
y_Zero


In [None]:
# Score the test split and derive hard 0/1 labels with a 0.5 threshold.
y_pred_keras = keras_clf.predict(X_test).reshape(-1)
y_test = y_test.reshape(-1)
y_pred = [1 if score > 0.5 else 0 for score in y_pred_keras]
# Predicted vs. actual number of positives (only shown if this is the last expression).
sum(y_pred), sum(y_test)

auc = roc_auc_score(y_test, y_pred_keras)
acc = accuracy_score(my_round(y_test), my_round(y_pred_keras))

auc, acc

In [None]:
# Free-text description of the Keras run that is stored in the results table.
# NOTE(review): these strings do not match the model compiled above, which
# uses a Nadam optimizer and BinaryCrossentropy(from_logits=False) - update
# them if they are meant to describe that model.
params = {
    'optimizer': 'Adam()',
    'loss': 'BinaryCrossentropy(from_logits=True)',
    'learning_rate': '1.8739346279424403e-05'
}
list_rez.append(['keras.layers', 'KERAS', params, acc, auc ])


In [None]:
show_result_table(list_rez) 

In [None]:
# Hard-coded snapshot of earlier results. Rebinding list_rez here discards
# whatever rows the cells above accumulated in this session.
# Row layout: [library, algorithm, hyperparameters, accuracy, AUC].
list_rez= [['My_own_code', 'Naive classifier', 'No', 0.918638240308876, 0.5],
 ['sklearn.ensemble',
  'RandomForestClassifier',
  {'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 300, 'n_jobs': 100},
  0.918638240308876,
  0.7966546545329787],
 ['sklearn.neural_network',
  'MLPClassifier',
  {'activation': 'logistic',
   'alpha': 0.0001,
   'hidden_layer_sizes': 100,
   'max_iter': 100,
   'solver': 'lbfgs'},
  0.9189341524934125,
  0.7246480914724593],
 ['keras.layers',
  'KERAS',
  {'optimizer': 'Adam()',
   'loss': 'BinaryCrossentropy(from_logits=True)',
   'learning_rate': '1.8739346279424403e-05'},
  0.7111051615539618,
  0.8245074490998133],
 ['tensorflow',
  'tf.estimator.LinearClassifier',
  {'num_epochs': '10', 'n_batch': '128', 'steps': '10000'},
  0.9185819,
  0.7152796]]

In [None]:
import pickle

# with open("list_rez.pkl", "wb") as fp:
#     pickle.dump(list_rez, fp)
    
# with open("list_rez.pkl", "rb") as fp:  
#     list_rez = pickle.load(fp)

## 6.5 Tensorflow

In [None]:
# Rebind ``tf`` to the TF1 compatibility API and switch off TF2 behaviour.
# NOTE(review): earlier cells that use ``tf`` (e.g. the Keras optimizer) would
# see this v1 module if they are re-run after this cell - confirm that is fine.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [None]:
# Feature names: every column except the first and the last.
FEATURES = df_ready.columns[1:-1].tolist()
# Target name: the last column.
LABEL = df_ready.columns[-1]

In [None]:
def get_input_fn77(data_set, num_epochs=None, n_batch = 128, shuffle=True):
    """Build a TF1 estimator input function from a pandas frame.

    The columns named in the module-level ``FEATURES`` become the feature
    frame and the ``LABEL`` column becomes the target series; both are
    rebuilt from raw values, so the original index is not carried over.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame containing the ``FEATURES`` and ``LABEL`` columns.
    num_epochs : int or None
        Forwarded to ``pandas_input_fn``.
    n_batch : int
        Batch size.
    shuffle : bool
        Forwarded to ``pandas_input_fn``.
    """
    feature_frame = pd.DataFrame({name: data_set[name].values for name in FEATURES})
    label_series = pd.Series(data_set[LABEL].values)
    return tf.estimator.inputs.pandas_input_fn(
        x=feature_frame,
        y=label_series,
        batch_size=n_batch,
        num_epochs=num_epochs,
        shuffle=shuffle,
    )

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

def create_train_val_test(df):
    """Split ``df`` into stratified train, test and validation frames.

    A first stratified 80/20 split separates the test rows. The remaining
    80 % is split 80/20 again into train and validation rows.

    The second split yields positions *within the train+validation subset*,
    so they are applied to that subset. They were previously applied to the
    full frame, which selected the wrong rows and let train/validation rows
    overlap with the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is the target; the first column is not used
        as a feature.

    Returns
    -------
    tuple
        ``(df_train, df_test, df_val)`` - note the order.
    """
    use_columns = list(df.columns[1:-1])
    target_column = list(df.columns[df.shape[1] - 1 : df.shape[1]])
    X = df[use_columns].to_numpy()
    y = df[target_column].to_numpy()

    # Outer split: hold out the test rows.
    sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state=0)
    train_val_index, test_index = next(sss.split(X, y))
    df_train_val = df.iloc[train_val_index]
    df_test = df.iloc[test_index].copy()

    # Inner split: carve the validation rows out of the remaining rows.
    X_0 = df_train_val[use_columns].to_numpy()
    y_0 = df_train_val[target_column].to_numpy()
    sss_val = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state=0)
    train_index0, val_index = next(sss_val.split(X_0, y_0))
    df_train = df_train_val.iloc[train_index0].copy()
    df_val = df_train_val.iloc[val_index].copy()
    return df_train, df_test, df_val

df_train, df_test, df_val = create_train_val_test(df_ready)

In [None]:
my_features = [tf.feature_column.numeric_column(k) for k in FEATURES]

In [None]:
model = tf.estimator.LinearClassifier(
    n_classes = 2,
    feature_columns=my_features)

In [None]:
# Train the linear classifier on df_train in its stored order (shuffle=False),
# batch size 128.
# NOTE(review): both num_epochs=5 and steps=10000 are given; presumably
# training ends at whichever limit is reached first - confirm.
model.train(input_fn=get_input_fn77(df_train, 
                                      num_epochs=5,
                                      n_batch = 128,
                                      shuffle=False),
                                      steps=10000)

In [None]:
model.evaluate(input_fn=get_input_fn77(df_val, 
                                      num_epochs=5,
                                      n_batch = 128,
                                      shuffle=False),
                                      steps=10000)

In [None]:
test_res = model.evaluate(input_fn=get_input_fn77(df_test, 
                                      num_epochs=5,
                                      n_batch = 128,
                                      shuffle=False),
                                      steps=10000)

In [None]:
test_res['accuracy'], test_res['auc']

In [None]:
# Second hard-coded snapshot of the results; it replaces the current list_rez.
# It differs from the earlier snapshot only in the Keras row (parameters and
# scores). Row layout: [library, algorithm, hyperparameters, accuracy, AUC].
list_rez = [['My_own_code', 'Naive classifier', 'No', 0.918638240308876, 0.5],
 ['sklearn.ensemble',
  'RandomForestClassifier',
  {'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 300, 'n_jobs': 100},
  0.918638240308876,
  0.7966546545329787],
 ['sklearn.neural_network',
  'MLPClassifier',
  {'activation': 'logistic',
   'alpha': 0.0001,
   'hidden_layer_sizes': 100,
   'max_iter': 100,
   'solver': 'lbfgs'},
  0.9189341524934125,
  0.7246480914724593],
 ['keras.layers',
  'KERAS',
  {'optimizer': 'keras.optimizers.Adam()',
   'loss': 'keras.losses.BinaryCrossentropy(from_logits=True)',
   'learning_rate': '0.0001'},
  0.7802781574534643,
  0.7651976526323951],
 ['tensorflow',
  'tf.estimator.LinearClassifier',
  {'num_epochs': '10', 'n_batch': '128', 'steps': '10000'},
  0.9185819,
  0.7152796]]

In [None]:
# Free-text description of the LinearClassifier run for the results table.
# NOTE(review): 'num_epochs' says '10' while the train/evaluate cells above
# pass num_epochs=5 - confirm which value is correct.
params = {
      'num_epochs': '10',
      'n_batch': '128',
      'steps': '10000'}

In [None]:
list_rez.append(['tensorflow', 'tf.estimator.LinearClassifier', params, test_res['accuracy'], test_res['auc']])
# show_result_table() 

In [None]:
show_result_table(list_rez)

## 6.6 MLP by Numpy

In [24]:
X_train, X_test, y_train, y_test = create_train_test(df_ready, True)

In [1]:
# Train the hand-written network from the local ``perceptron`` module:
# 2 hidden layers, 50 neurons, learning rate 0.01, 10 epochs, debug output on.
# NOTE(review): no class weighting is passed here; the labels are heavily
# imbalanced according to the value counts printed further down.
my_net = perceptron.Net(n_hidden_layers = 2, debug=True, n_neurons=50, lr=0.01, n_epochs=10)
my_net.fit(X_train, y_train)

In [30]:
my_pred = my_net.predict(X_test)
auc = roc_auc_score(y_test, my_pred)
acc = accuracy_score(y_test, my_pred)

In [40]:
acc, auc

(0.918610058196063, 0.5)

In [66]:
pd.DataFrame(y_train).value_counts()

0    260765
1    23102 
dtype: int64

In [41]:
# Free-text description of the perceptron run for the results table.
# NOTE(review): 'n_epochs' says '2' while the fitting cell above constructs
# the network with n_epochs=10 - confirm which run these scores belong to.
params = {  'n_hidden_layers': '2',
            'n_neurons': '50',
            'lr': '0.01',
            'n_epochs': '2'}

In [47]:
# list_rez = []
list_rez.append(['perceptron', 'my_net', params, acc, auc])


In [48]:
show_result_table(list_rez)

Unnamed: 0,Library,Algorithms,Hyperparameters,Accuracy,AUC
0,perceptron,my_net,"{'n_hidden_layers': '2', 'n_neurons': '50', 'lr': '0.01', 'n_epochs': '2'}",0.91861,0.5
