<a href="https://colab.research.google.com/github/ctshiz/DEEP_LEARNING_STUDIES/blob/main/Extreme_Rare_Event_Classification_using_Autoencoders_in_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# All credit goes to Chitta Ranjan, author of the original article:
# https://towardsdatascience.com/extreme-rare-event-classification-using-autoencoders-in-keras-a565b386f098

In [3]:
#import the desired libraries
import pandas as pd
import numpy as np
from pylab import rcParams

import tensorflow as tf
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve, recall_score, classification_report, auc, roc_curve, precision_recall_fscore_support, f1_score

from numpy.random import seed
# Fix the global NumPy and TensorFlow seeds so repeated runs start from the same state.
seed(1)
# tf.random.set_seed is used here in place of the older `from tensorflow import set_random_seed`.
tf.random.set_seed(2)

# Fraction held out by each train_test_split call, and the random_state passed to it.
DATA_SPLIT_PCT = 0.2
SEED = 123

# Class names and default matplotlib figure size (width, height).
LABELS = ['Normal', 'Break']
rcParams['figure.figsize'] = (8, 6)

In [4]:
# Load the raw dataset; the path is relative, so the CSV must sit in the working directory.
df = pd.read_csv(filepath_or_buffer="processminer-rare-event-mts - data.csv")

In [20]:
# Class balance: share of rows labelled 1 (positive) and 0 (negative) in column 'y'.
tot = len(df)
pos = int((df['y'] == 1).sum())
neg = int((df['y'] == 0).sum())
print("Percentual of positive labels are :", (pos/tot)*100)
print("Percentual of negative labels are :", (neg/tot)*100)

Percentual of positive labels are : 0.6739863028590064
Percentual of negative labels are : 99.326013697141


In [23]:
def sign(x):
  '''Return -1 when x is negative and 1 otherwise (zero counts as positive).'''
  return -1 if x < 0 else 1

def curve_shift(df, shift_by):
  '''
    Shift the positive labels of a binary-labelled dataframe.

    With k = abs(shift_by), for every row n whose label is 1:
    - if shift_by is negative, rows (n-k) .. (n-1) are labelled 1;
    - if shift_by is positive, rows (n+1) .. (n+k) are labelled 1;
    - row n itself is removed from the result.
    For example, with shift_by = -2 the two rows just before each
    positive row become positive and the positive row is dropped.

    The input dataframe is not modified; a new dataframe is returned.

    Inputs:
    df       A pandas dataframe with a binary labeled column.
             This labeled column should be named as 'y'.
    shift_by An integer denoting the number of rows to shift.

    Output
    df       A new dataframe with the binary labels shifted by shift_by
             and the originally positive rows removed. 'y' is the
             first column of the result.
    '''
  labelcol = 'y'
  tmpcol = labelcol + 'tmp'
  step = sign(shift_by)

  # Spread every positive label over abs(shift_by) neighbouring rows by
  # repeatedly adding a one-row-shifted copy of the running vector.
  vector = df[labelcol].copy()
  for _ in range(abs(shift_by)):
    vector = vector + vector.shift(step).fillna(0)

  # Work on a copy: inserting the temporary column into the caller's frame
  # would leave it behind there and make a second call fail on the
  # already-existing column.
  shifted = df.copy()
  shifted.insert(loc=0, column=tmpcol, value=vector)
  # remove the rows that were originally labelled 1
  shifted = shifted.drop(shifted[shifted[labelcol] == 1].index)
  # replace the original label column with the shifted one
  shifted = shifted.drop(labelcol, axis=1)
  shifted = shifted.rename(columns={tmpcol: labelcol})
  # overlapping shifts can sum to more than 1, so make the label binary again
  shifted.loc[shifted[labelcol] > 0, labelcol] = 1
  return shifted

In [24]:
# Drop the time column and the categorical columns x28 and x61.
df = df.drop(columns=['time', 'x28', 'x61'])

In [27]:
# Hold out DATA_SPLIT_PCT of the rows for testing, then the same fraction of the
# remaining rows for validation.
df_train, df_test = train_test_split(df, test_size=DATA_SPLIT_PCT, random_state=SEED)
df_train, df_valid = train_test_split(df_train, test_size=DATA_SPLIT_PCT, random_state=SEED)

# For each subset, separate the rows by class (suffix _0: y == 0, _1: y == 1) and
# build the matching feature-only frames (suffix _x) with the label removed.
# Each mask is built from the subset's own 'y' column, so the selection does not
# rely on index alignment with the full dataframe.
df_train_0 = df_train.loc[df_train['y'] == 0]
df_train_1 = df_train.loc[df_train['y'] == 1]
df_train_0_x = df_train_0.drop(['y'], axis=1)
df_train_1_x = df_train_1.drop(['y'], axis=1)

df_valid_0 = df_valid.loc[df_valid['y'] == 0]
df_valid_1 = df_valid.loc[df_valid['y'] == 1]
df_valid_0_x = df_valid_0.drop(['y'], axis=1)
df_valid_1_x = df_valid_1.drop(['y'], axis=1)

df_test_0 = df_test.loc[df_test['y'] == 0]
df_test_1 = df_test.loc[df_test['y'] == 1]
df_test_0_x = df_test_0.drop(['y'], axis=1)
df_test_1_x = df_test_1.drop(['y'], axis=1)

In [28]:
# Standardization
# The scaler is fitted on the y == 0 training features only; every other
# subset is transformed with those same fitted statistics.
scaler = StandardScaler()
scaler.fit(df_train_0_x)

# y == 0 rows of each subset
df_train_0_x_rescaled = scaler.transform(df_train_0_x)
df_valid_0_x_rescaled = scaler.transform(df_valid_0_x)
df_test_0_x_rescaled = scaler.transform(df_test_0_x)

# full validation and test sets (both classes), label column removed
df_valid_x_rescaled = scaler.transform(df_valid.drop(columns=['y']))
df_test_x_rescaled = scaler.transform(df_test.drop(columns=['y']))

In [29]:
# Autoencoder classifier: initial settings
nb_epoch, batch_size = 200, 128
learning_rate = 1e-3

# Number of predictor variables = number of columns in the rescaled training matrix.
input_dim = df_train_0_x_rescaled.shape[1]
# The hidden size is half of the encoding size.
encoding_dim = 32
hidden_dim = encoding_dim // 2

