# Oversampling

In [34]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, EditedNearestNeighbours,RepeatedEditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline

In this case, if we want to use a keras NN in our Voting Ensemble, we cannot use the native sklearn function. We need to build the ensemble by hand.

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy
import sklearn

# plt.style.use('fivethirtyeight')
sns.set_style("whitegrid")
sns.set_context("notebook")
DATA_PATH = '../data/'

VAL_SPLITS = 4

In [4]:
# Seed value
# Apparently you may use different seed values at each stage
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Configure a new global `tensorflow` session
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

Using TensorFlow backend.


In [5]:
from plot_utils import plot_confusion_matrix
from cv_utils import run_cv_f1
from cv_utils import plot_cv_roc
from cv_utils import plot_cv_roc_prc

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
# Experimental: Based on LightGMB https://github.com/Microsoft/LightGBM
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
# Pipelines
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, accuracy_score, precision_score

For this part of the project, we will only work with the training set, that we will split again into train and validation to perform the hyperparameter tuning.

We will save the test set for the final part, when we have already tuned our hyperparameters.

In [7]:
df = pd.read_csv(os.path.join(DATA_PATH,'df_train.csv'))
df.drop(columns= df.columns[0:2],inplace=True)
df.head()
idx_to_feat = dict(enumerate([feat for feat in df.columns if feat is not 'Class']))
feat_to_idx = {feat : idx for idx,feat in idx_to_feat.items()}

## Feature selector Transformer

Different algorithms are trained on different features, so we need to take this into account when preparing an ensemble. This can be done through a Pipeline.

In [19]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

In [20]:
def select_features(X,list_names,feat_to_idx):
    list_idx = [feat_to_idx[feat] for feat in list_names]
    return X[:,list_idx]

list_features = ['V1','V2']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})

### Example of use

In [58]:
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()
print('X.shape:',X.shape)

X.shape: (241167, 32)


## Comparison of sampling methods

In [18]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

scaler = StandardScaler()
over_sampler = SMOTE(random_state=0, n_jobs=-1)
clf_ = xgb.sklearn.XGBClassifier(n_jobs=-1,verbosity=0, 
                                 max_depth=5, learning_rate=0.1, 
                                 random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])
clf = make_pipeline(scaler,over_sampler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 0.54 ± 0.03
F1 value (Val): 0.43 ± 0.01


In [25]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

list_features = ['V9','V14','V16']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()
# over_sampler = SMOTEENN(random_state=0)
clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,scaler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 1.00 ± 0.00
F1 value (Val): 0.82 ± 0.06


In [24]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

list_features = ['V9','V14','V16']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()
over_sampler = SMOTEENN(random_state=0)
clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,scaler,over_sampler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 0.30 ± 0.01
F1 value (Val): 0.22 ± 0.01


In [26]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

list_features = ['V9','V14','V16']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()
over_sampler = SMOTE(random_state=0, n_jobs=-1)
clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,scaler,over_sampler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 1.00 ± 0.00
F1 value (Val): 0.31 ± 0.01


In [29]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

list_features = ['V9','V14','V16']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()
over_sampler = SMOTETomek(random_state=0)
clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,scaler,over_sampler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 0.96 ± 0.01
F1 value (Val): 0.30 ± 0.02


In [33]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()
list_features = ['V3','V4','V12','V14','V16','V17']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()
over_sampler = NearMiss(random_state=0, n_jobs=-1)
clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,scaler,over_sampler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 0.00 ± 0.00
F1 value (Val): 0.00 ± 0.00


In [32]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

list_features = ['V9','V14','V16']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()
over_sampler = EditedNearestNeighbours(random_state=0, n_jobs=-1)
clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,scaler,over_sampler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 0.96 ± 0.01
F1 value (Val): 0.82 ± 0.05


In [35]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

list_features = ['V9','V14','V16']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,scaler,enn,renn,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 0.95 ± 0.00
F1 value (Val): 0.81 ± 0.04


In [36]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

list_features = ['V9','V14','V16']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,enn,renn,scaler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 0.95 ± 0.01
F1 value (Val): 0.80 ± 0.04


In [37]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

list_features = ['V9','V14','V16']
feat_select = FunctionTransformer(select_features,validate=True,
                    kw_args={'list_names':list_features,
                             'feat_to_idx':feat_to_idx})
scaler = StandardScaler()
over_sampler = EditedNearestNeighbours(random_state=0, n_jobs=-1)
clf_ = ExtraTreesClassifier(n_estimators=50,n_jobs=-1,random_state=0)
# clf = Pipeline([('scaler',scaler), ('passthrough',over_sampler),('clf_',clf_)])

clf = make_pipeline(feat_select,scaler,clf_)

scores = cross_validate(clf,X,y,cv=cv,scoring='f1',n_jobs=-1, return_train_score=True)
print('F1 value (Train): {:.2f} ± {:.2f}'.format(
                np.mean(scores['train_score']),
                np.std(scores['train_score'], ddof=1)
            ))
print('F1 value (Val): {:.2f} ± {:.2f}'.format(
                np.mean(scores['test_score']),
                np.std(scores['test_score'], ddof=1)
            ))

F1 value (Train): 1.00 ± 0.00
F1 value (Val): 0.82 ± 0.06


## Ensemble by hand (Hard voting)

In [28]:
def hard_vote_predict(estimators, X, weights = None):
    """
    Combine a dictionary of estimators to create a hard voting ensemble.
    """
    if weights is None:
        weights = np.ones(len(estimators))
    else:
        weights = np.array(weights)
    weights = weights.reshape((-1,1))
    y_preds = []    
    for name,clf in estimators.items():
        y_pred = clf.predict(X)
        if name is 'nn':
            y_pred = (1*(y_pred>0.5)).reshape((-1))
        y_preds.append(y_pred)

    y_preds = np.array(y_preds)
    y_final = 1*(np.mean(weights*y_preds,axis=0) > 0.5)
    return y_final  

In [29]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LeakyReLU

def create_clf(input_dim):
    clf1 = Sequential([
        Dense(8, input_shape=(input_dim,)),
        LeakyReLU(),
        Dense(4),
        LeakyReLU(),
        Dense(1, activation='sigmoid')
    ], name='clf')
    return clf1

In [31]:
cv = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
clf = LogisticRegression(solver='sag',random_state=0,n_jobs=-1)

# In case we want to select a subset of features
df_ = df[['Class','V3','V4','V12','V14','V16','V17']]
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

INPUT_DIM = X.shape[1]

clf1 = create_clf(INPUT_DIM)
clf1.compile(optimizer='adam',
              loss='binary_crossentropy')
# clf2 = RandomForestClassifier(n_estimators=100,
#                               max_depth=6,
#                               random_state=0,n_jobs=-1, max_features=6)
clf2 = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf3 = xgb.sklearn.XGBClassifier(n_jobs=-1,max_depth=5, random_state=0)
# clf3 = LogisticRegression(n_jobs=-1)
sklearn_clfs = [clf2,clf3]
clfs = [clf1]+sklearn_clfs

In [32]:
metrics = []
accuracy = []
precision = []
metrics_train = []
accuracy_train = []
precision_train = []

for i, (idx_t, idx_v) in enumerate(cv.split(X,y)):
    X_train = X[idx_t]
    y_train = y[idx_t]
    X_val = X[idx_v]
    y_val = y[idx_v]
    #Devuelve cuatro vectrores de dos elementos, el primero con los indices de train y el segundo con 
    #los de validacion 
    
    clf1.fit(X_train,y_train,batch_size=512,epochs=50,verbose=0)
    for clf_ in sklearn_clfs:
        clf_.fit(X_train,y_train)
    
    estimators = dict(zip(['nn','rf','knn'],clfs))
    y_pred = hard_vote_predict(estimators,X_val)


    acc_va = accuracy_score(y_val, y_pred)
    pre_va = precision_score(y_val, y_pred)
#     error_va = mean_squared_error(y_val, y_pred)
    f1_va = f1_score(y_val, y_pred)
    #print('Recall:', acc)
    #print('Precision:', pre)
    #print('Error cuadratico medio:', error)
    
    y_pred_train = hard_vote_predict(estimators,X_train)

    acc_train = accuracy_score(y_train, y_pred_train)
    pre_train = precision_score(y_train, y_pred_train)
#     error_train = mean_squared_error(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train)
    
    metrics.append(f1_va)
    accuracy.append(acc_va)
    precision.append(pre_va)
    
    metrics_train.append(f1_train)
    accuracy_train.append(acc_train)
    precision_train.append(pre_train)
    print('Fold {} has ended!'.format(i+1))
metric_mean = np.mean(metrics)
metric_std = np.std(metrics, ddof = 1)
print('Metric value validation(va): {:.2f} +- {:.2f}'.format(metric_mean,metric_std))
#print('Mean validation: recall {:.4f} precision {:.4f}'.format(np.mean(accuracy), np.mean(precision)))


metric_train_mean = np.mean(metrics_train)
metric_train_std = np.std(metrics_train, ddof = 1)
print('Metric value train: {:.2f} +- {:.2f}'.format(metric_train_mean,metric_train_std))

Fold 1 has ended!
Fold 2 has ended!
Fold 3 has ended!
Fold 4 has ended!
Metric value validation(va): 0.82 +- 0.05
Metric value train: 0.88 +- 0.00
