# Autoencoder (Semi-supervised)

In [1]:
# Re-import edited modules automatically: mode 2 reloads all modules before
# each cell executes, so changes to the local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Reproducibility set-up: every pseudo-random source used below is seeded
# with the same value. (A different seed could be used at each stage.)
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable to a fixed value.
# NOTE: assigning it here only affects child processes started from this
# kernel; hash randomization of the running interpreter is decided at
# start-up, so export the variable before launching Jupyter if the hash
# order of str/bytes objects matters.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Seed Python's built-in pseudo-random generator.
import random
random.seed(seed_value)

# 3. Seed numpy's global pseudo-random generator.
import numpy as np
np.random.seed(seed_value)

# 4. Seed tensorflow's graph-level pseudo-random generator (TF 1.x API).
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Create a new global tensorflow session and register it with Keras.
# Both thread pools are limited to one thread, presumably so that op
# scheduling cannot introduce run-to-run differences.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

Using TensorFlow backend.


In [3]:
import keras
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
# plt.style.use('fivethirtyeight')
# Global seaborn look for every figure in the notebook.
sns.set_style("whitegrid")
sns.set_context("notebook")


# Directory (relative to the notebook) that holds the input CSV files.
DATA_PATH = '../data/'
# Number of splits passed to the cross-validation splitter.
VAL_SPLITS = 4

In [4]:
from plot_utils import plot_confusion_matrix
from cv_utils import run_cv_f1
from cv_utils import plot_cv_roc
from cv_utils import plot_cv_roc_prc
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

For this part of the project, we will work only with the training set, which we will split again into training and validation subsets in order to tune the hyperparameters.

We will keep the test set aside for the final part, once the hyperparameters have been tuned.

In [5]:
# Load the training split and discard its first two columns (by position),
# without mutating the freshly read frame in place.
train_csv = os.path.join(DATA_PATH, 'df_train.csv')
df = pd.read_csv(train_csv)
df = df.drop(columns=df.columns[:2])
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V24,V25,V26,V27,V28,Class,TimeScaled,TimeSin,TimeCos,AmountBC
0,-0.829392,1.118573,0.926038,1.163686,0.009824,0.527347,0.17337,0.723997,-0.638939,-0.162923,...,-0.298908,-0.060301,-0.217935,0.291312,0.120779,0,0.460069,-0.480989,0.876727,3.195062
1,-2.814527,1.613321,0.654307,0.581821,0.399491,0.73004,0.456233,-2.464347,0.654797,2.248682,...,-0.329526,-0.307374,-0.440007,-2.135657,0.011041,0,0.266395,-0.204567,-0.978853,3.125269
2,2.105028,-0.7004,-1.338043,-0.596395,-0.395217,-0.75505,-0.276951,-0.291562,-0.965418,1.107179,...,-0.278137,-0.040685,0.789267,-0.066054,-0.069956,0,0.762303,-0.153992,-0.988072,3.421235
3,2.205839,-1.023897,-1.270137,-0.950174,-0.868712,-0.975492,-0.475464,-0.280564,0.503713,0.448173,...,-0.041177,0.089158,1.105794,-0.066285,-0.079881,0,0.87974,-0.998227,0.059524,1.072145
4,2.02709,-0.778666,-1.552755,-0.558679,0.020939,-0.026071,-0.20781,-0.124288,-0.635953,0.817757,...,0.033477,-0.157992,-0.606327,-0.003931,-0.039868,0,0.821649,-0.783558,-0.621319,3.97149


## Preprocessing the data

Although we always use cross-validation with `VAL_SPLITS` folds (4 in general), here we are going to use only a single split in order to explore how the autoencoder works and to build intuition.

## Defining the model

In [12]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LeakyReLU


def create_clf(input_dim):
    """Build the uncompiled feed-forward classifier named 'clf'.

    The network stacks Dense layers of 40, 16, 16 and 8 units, each followed
    by a LeakyReLU activation, and ends with a single sigmoid unit.

    Parameters
    ----------
    input_dim : int
        Number of input features; used as the input shape of the first layer.

    Returns
    -------
    A Keras ``Sequential`` model (not compiled).
    """
    hidden_units = (40, 16, 16, 8)

    clf = Sequential(name='clf')
    for depth, units in enumerate(hidden_units):
        if depth == 0:
            # Only the first layer needs to be told the input shape.
            clf.add(Dense(units, input_shape=(input_dim,)))
        else:
            clf.add(Dense(units))
        clf.add(LeakyReLU())
    clf.add(Dense(1, activation='sigmoid'))
    return clf

## Training with CV

In [13]:
# Optionally restrict the model to a subset of the features:
# df_ = df[['Class','V9','V14','V16','V2','V3','V17']]
df_ = df

# Label vector and feature matrix ('Class' is the target column).
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

# Number of input features, and the size of our encoded representations.
INPUT_DIM = X.shape[1]
ENCODED_DIM = 2

In [24]:
from sklearn.metrics import f1_score

def run_cv_f1_nn(create_clf, cv, X, y, calculate_on_train=True, verbose=True, save_models = False):
    """Cross-validate a Keras binary classifier and collect per-fold F1 scores.

    For every split yielded by ``cv.split(X, y)`` a new model is built with
    ``create_clf``, compiled with the Adam optimizer and binary cross-entropy
    loss, trained for 50 epochs with a batch size of 512, and scored with F1
    on the validation part (and optionally on the training part) after
    thresholding its predictions at 0.5.

    Parameters
    ----------
    create_clf : callable
        Called as ``create_clf(input_dim)`` with ``input_dim = X.shape[1]``;
        must return an uncompiled Keras model.
    cv : splitter object
        Must provide ``split(X, y)``; ``get_n_splits()`` is additionally
        called when ``verbose`` is true.
    X : array
        Feature matrix; must support ``X.shape[1]`` and indexing with the
        index arrays produced by ``cv``.
    y : array
        Labels, indexable with the same index arrays.
    calculate_on_train : bool, default True
        Also compute the F1 score on the training part of every fold.
    verbose : bool, default True
        Print progress after each fold and a mean ± sample-std summary.
    save_models : bool, default False
        Save each fold's trained model to
        ``models_nn_clf/clf_nn_fold_<k>.h5`` (the directory is created if
        it does not exist yet).

    Returns
    -------
    list of float, or tuple of two lists
        ``metrics`` (validation F1 per fold) when ``calculate_on_train`` is
        false, otherwise ``(metrics, metrics_train)``.
    """

    def _f1_at_threshold(clf, features, labels):
        # Binarise the model output at 0.5 and flatten it (a single-unit
        # output layer yields an (n, 1) column) so f1_score gets 1-D labels.
        predicted = 1 * (clf.predict(features) > 0.5)
        return f1_score(labels, np.ravel(predicted))

    def _report(template, values):
        # The sample standard deviation is undefined for a single fold;
        # report nan directly instead of triggering numpy's ddof warning.
        spread = np.std(values, ddof=1) if len(values) > 1 else float('nan')
        print(template.format(np.mean(values), spread))

    # Two lists hold the per-fold metrics for validation and train.
    metrics = []
    metrics_train = []

    # Derive the network width from the data actually passed in, rather than
    # from a notebook-level global that may be stale or refer to other data.
    input_dim = X.shape[1]

    if save_models:
        # Keras does not create missing parent directories when saving.
        os.makedirs('models_nn_clf', exist_ok=True)

    # Loop over the different validation folds
    for i, (idx_t, idx_v) in enumerate(cv.split(X, y)):
        X_train, y_train = X[idx_t], y[idx_t]
        X_val, y_val = X[idx_v], y[idx_v]

        # A fresh model per fold so no weights leak between folds.
        clf = create_clf(input_dim)
        clf.compile(optimizer='adam',
              loss='binary_crossentropy')
        clf.fit(X_train, y_train,batch_size=512, epochs=50,shuffle=True,verbose=0)

        metrics.append(_f1_at_threshold(clf, X_val, y_val))
        if calculate_on_train:
            metrics_train.append(_f1_at_threshold(clf, X_train, y_train))
        if verbose:
            print('{}-fold / {} completed!'.format(i + 1,
                                                   cv.get_n_splits()))
        if save_models:
            # Save the models into files for future use
            clf.save('models_nn_clf/clf_nn_fold_'+str(i+1)+'.h5')

    if verbose:
        if calculate_on_train:
            _report('F1 value (Train): {:.2f} ± {:.2f}', metrics_train)
        _report('F1 value (Val): {:.2f} ± {:.2f}', metrics)

    if calculate_on_train:
        return metrics, metrics_train
    return metrics

In [17]:
# VAL_SPLITS stratified shuffle splits, each holding out 15% for validation.
cv = StratifiedShuffleSplit(
    n_splits=VAL_SPLITS,
    test_size=0.15,
    random_state=0,
)
run_cv_f1_nn(create_clf, cv, X, y)

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
F1 value (Train): 0.93 ± 0.02
F1 value (Val): 0.80 ± 0.07


([0.7454545454545455,
  0.7317073170731706,
  0.8793103448275861,
  0.8363636363636363],
 [0.9590288315629741,
  0.9165446559297219,
  0.9183359013867489,
  0.9256965944272445])

In [18]:
def create_clf(input_dim):
    """Build the uncompiled 32-16-8 classifier named 'clf'.

    Every hidden Dense layer is followed by a LeakyReLU activation; the
    output is a single sigmoid unit. ``input_dim`` is the number of input
    features and sets the input shape of the first layer.
    """
    # Assemble the layer stack first, then hand it to Sequential in one go.
    stack = [Dense(32, input_shape=(input_dim,)), LeakyReLU()]
    for units in (16, 8):
        stack.append(Dense(units))
        stack.append(LeakyReLU())
    stack.append(Dense(1, activation='sigmoid'))
    return Sequential(stack, name='clf')

# Same splitter settings as before so the architectures see identical folds.
cv = StratifiedShuffleSplit(
    n_splits=VAL_SPLITS,
    test_size=0.15,
    random_state=0,
)
run_cv_f1_nn(create_clf, cv, X, y)

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
F1 value (Train): 0.91 ± 0.03
F1 value (Val): 0.79 ± 0.07


([0.7166666666666667,
  0.7592592592592592,
  0.8518518518518519,
  0.8468468468468469],
 [0.9219219219219219,
  0.9337442218798152,
  0.8709677419354839,
  0.9073783359497646])

In [25]:
def create_clf(input_dim):
    """Build the uncompiled 8-4 classifier named 'clf'.

    Two hidden Dense layers (8 and 4 units), each followed by a LeakyReLU,
    feed a single sigmoid output unit. ``input_dim`` is the number of input
    features and sets the input shape of the first layer.
    """
    clf = Sequential(name='clf')
    # Hidden block 1
    clf.add(Dense(8, input_shape=(input_dim,)))
    clf.add(LeakyReLU())
    # Hidden block 2
    clf.add(Dense(4))
    clf.add(LeakyReLU())
    # Output
    clf.add(Dense(1, activation='sigmoid'))
    return clf

# Same splitter settings as before; this run also writes each fold's trained
# model under models_nn_clf/.
cv = StratifiedShuffleSplit(
    n_splits=VAL_SPLITS,
    test_size=0.15,
    random_state=0,
)
run_cv_f1_nn(create_clf, cv, X, y, save_models=True)

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
F1 value (Train): 0.84 ± 0.03
F1 value (Val): 0.81 ± 0.05


([0.8, 0.743362831858407, 0.8596491228070176, 0.8363636363636363],
 [0.8717156105100464,
  0.8502269288956128,
  0.8112324492979719,
  0.8291873963515755])

In [20]:
def create_clf(input_dim):
    """Build the uncompiled 7-5 classifier named 'clf'.

    Two hidden Dense layers (7 and 5 units), each followed by a LeakyReLU,
    feed a single sigmoid output unit. ``input_dim`` is the number of input
    features and sets the input shape of the first layer.
    """
    first_hidden = Dense(7, input_shape=(input_dim,))
    first_activation = LeakyReLU()
    second_hidden = Dense(5)
    second_activation = LeakyReLU()
    output = Dense(1, activation='sigmoid')

    layers = [
        first_hidden,
        first_activation,
        second_hidden,
        second_activation,
        output,
    ]
    return Sequential(layers, name='clf')

# Same splitter settings as before so the architectures see identical folds.
cv = StratifiedShuffleSplit(
    n_splits=VAL_SPLITS,
    test_size=0.15,
    random_state=0,
)
run_cv_f1_nn(create_clf, cv, X, y)

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
F1 value (Train): 0.84 ± 0.02
F1 value (Val): 0.78 ± 0.06


([0.7272727272727272,
  0.7289719626168223,
  0.8360655737704918,
  0.8363636363636363],
 [0.8515497553017944,
  0.8055987558320373,
  0.8522550544323484,
  0.848780487804878])