# Ensembles of classifiers

In this case, if we want to use a keras NN in our Voting Ensemble, we cannot use the native sklearn function. We need to build the ensemble by hand.

In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy
import sklearn
# plt.style.use('fivethirtyeight')
sns.set_style("whitegrid")
sns.set_context("notebook")
DATA_PATH = '../data/'

VAL_SPLITS = 4

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Seed value
# Apparently you may use different seed values at each stage
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Configure a new global `tensorflow` session
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

  return f(*args, **kwds)
Using TensorFlow backend.


In [4]:
from plot_utils import plot_confusion_matrix
from cv_utils import run_cv_f1
from cv_utils import plot_cv_roc
from cv_utils import plot_cv_roc_prc

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
# Experimental: Based on LightGMB https://github.com/Microsoft/LightGBM
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
# Pipelines
from sklearn.pipeline import Pipeline
import xgboost as xgb

from sklearn.metrics import f1_score, accuracy_score, precision_score
from sklearn_utils import FeatureSelectorDic

For this part of the project, we will only work with the training set, that we will split again into train and validation to perform the hyperparameter tuning.

We will save the test set for the final part, when we have already tuned our hyperparameters.

In [7]:
df = pd.read_csv(os.path.join(DATA_PATH,'df_train.csv'))
df.drop(columns= df.columns[0:2],inplace=True)

idx_to_feat = dict(enumerate([feat for feat in df.columns if feat is not 'Class']))
feat_to_idx = {feat : idx for idx,feat in idx_to_feat.items()}

cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)

X = df.drop(columns='Class').to_numpy()
y = df['Class'].to_numpy()
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V24,V25,V26,V27,V28,Class,TimeScaled,TimeSin,TimeCos,AmountBC
0,-0.829392,1.118573,0.926038,1.163686,0.009824,0.527347,0.17337,0.723997,-0.638939,-0.162923,...,-0.298908,-0.060301,-0.217935,0.291312,0.120779,0,0.460069,-0.480989,0.876727,3.195062
1,-2.814527,1.613321,0.654307,0.581821,0.399491,0.73004,0.456233,-2.464347,0.654797,2.248682,...,-0.329526,-0.307374,-0.440007,-2.135657,0.011041,0,0.266395,-0.204567,-0.978853,3.125269
2,2.105028,-0.7004,-1.338043,-0.596395,-0.395217,-0.75505,-0.276951,-0.291562,-0.965418,1.107179,...,-0.278137,-0.040685,0.789267,-0.066054,-0.069956,0,0.762303,-0.153992,-0.988072,3.421235
3,2.205839,-1.023897,-1.270137,-0.950174,-0.868712,-0.975492,-0.475464,-0.280564,0.503713,0.448173,...,-0.041177,0.089158,1.105794,-0.066285,-0.079881,0,0.87974,-0.998227,0.059524,1.072145
4,2.02709,-0.778666,-1.552755,-0.558679,0.020939,-0.026071,-0.20781,-0.124288,-0.635953,0.817757,...,0.033477,-0.157992,-0.606327,-0.003931,-0.039868,0,0.821649,-0.783558,-0.621319,3.97149


## Ensemble by hand (Hard voting)

In [11]:
def hard_vote_predict(estimators, X, weights=None):
    """
    Combine a dictionary of estimators to create a hard voting ensemble.
    Parameters
    ----------
    estimators : dict
        Dictionary with name (str): model entries with predict method.
        If the method predict returns probabilities, then the name should
        end with 'prob'.
    X : np.array
        Input.
    weights : list, tuple or np.array, default=None
        List of weights for each estimator. If None, then it is uniform.
    """
    if weights is None:
        weights = np.ones(len(estimators))
    else:
        assert len(weights) == len(
            estimators), 'Number of estimators should be the same as number of weights'
        weights = np.array(weights)
    weights = weights.reshape((-1, 1))
    y_preds = []
    for name, clf in estimators.items():
        y_pred = clf.predict(X)
        if name.endswith('prob'):
            y_pred = (1 * (y_pred > 0.5)).reshape((-1))
        y_preds.append(y_pred)

    y_preds = np.array(y_preds)
    y_final = 1 * (np.mean(weights * y_preds, axis=0) > 0.5)
    return y_final

In [9]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LeakyReLU

def create_clf(input_dim):
    clf1 = Sequential([
        Dense(8, input_shape=(input_dim,)),
        LeakyReLU(),
        Dense(4),
        LeakyReLU(),
        Dense(1, activation='sigmoid')
    ], name='clf')
    return clf1

In [31]:
INPUT_DIM = X.shape[1]

clf1 = create_clf(INPUT_DIM)
clf1.compile(optimizer='adam',
              loss='binary_crossentropy')
# clf2 = RandomForestClassifier(n_estimators=100,
#                               max_depth=6,
#                               random_state=0,n_jobs=-1, max_features=6)
clf2 = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf3 = xgb.sklearn.XGBClassifier(n_jobs=-1,max_depth=5, random_state=0)
# clf3 = LogisticRegression(n_jobs=-1)
sklearn_clfs = [clf2,clf3]
clfs = [clf1]+sklearn_clfs

In [32]:
metrics = []
accuracy = []
precision = []
metrics_train = []
accuracy_train = []
precision_train = []

for i, (idx_t, idx_v) in enumerate(cv.split(X,y)):
    X_train = X[idx_t]
    y_train = y[idx_t]
    X_val = X[idx_v]
    y_val = y[idx_v]
    #Devuelve cuatro vectrores de dos elementos, el primero con los indices de train y el segundo con 
    #los de validacion 
    
    clf1.fit(X_train,y_train,batch_size=512,epochs=50,verbose=0)
    for clf_ in sklearn_clfs:
        clf_.fit(X_train,y_train)
    
    estimators = dict(zip(['nn_prob','rf','knn'],clfs))
    y_pred = hard_vote_predict(estimators,X_val)


    acc_va = accuracy_score(y_val, y_pred)
    pre_va = precision_score(y_val, y_pred)
#     error_va = mean_squared_error(y_val, y_pred)
    f1_va = f1_score(y_val, y_pred)
    #print('Recall:', acc)
    #print('Precision:', pre)
    #print('Error cuadratico medio:', error)
    
    y_pred_train = hard_vote_predict(estimators,X_train)

    acc_train = accuracy_score(y_train, y_pred_train)
    pre_train = precision_score(y_train, y_pred_train)
#     error_train = mean_squared_error(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train)
    
    metrics.append(f1_va)
    accuracy.append(acc_va)
    precision.append(pre_va)
    
    metrics_train.append(f1_train)
    accuracy_train.append(acc_train)
    precision_train.append(pre_train)
    print('Fold {} has ended!'.format(i+1))
metric_mean = np.mean(metrics)
metric_std = np.std(metrics, ddof = 1)
print('Metric value validation(va): {:.2f} +- {:.2f}'.format(metric_mean,metric_std))
#print('Mean validation: recall {:.4f} precision {:.4f}'.format(np.mean(accuracy), np.mean(precision)))


metric_train_mean = np.mean(metrics_train)
metric_train_std = np.std(metrics_train, ddof = 1)
print('Metric value train: {:.2f} +- {:.2f}'.format(metric_train_mean,metric_train_std))

Fold 1 has ended!
Fold 2 has ended!
Fold 3 has ended!
Fold 4 has ended!
Metric value validation(va): 0.82 +- 0.05
Metric value train: 0.88 +- 0.00
