<a href="https://colab.research.google.com/github/gideonler/Data-Science-in-Business/blob/main/iForest_and_Autoencoders_exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library imports

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, Dropout
from keras.models import Model, Sequential
import tensorflow as tf

# Anomaly transformation function

In [None]:
# Assigning values of 1 to anomalies and values of 0 to non-anomalous data
def def_anomaly(df):
    """Turn one row's Isolation Forest label into a binary anomaly flag.

    Args:
        df: A single row, i.e. anything indexable by ``'y_pred'`` (for
            example the Series that ``DataFrame.apply(..., axis=1)`` passes).

    Returns:
        1 when ``df['y_pred']`` equals -1, otherwise 0.
    """
    return 1 if df['y_pred'] == -1 else 0

# Data import

Source: https://towardsdatascience.com/adrepository-anomaly-detection-datasets-with-real-anomalies-2ee218f76292

In [None]:
# Load the dataset from a CSV file located next to the notebook.
# NOTE(review): the preview printed further down shows a leading 'Unnamed: 0'
# header. If that is a real column (an index saved into the CSV) rather than
# the frame's own index, it is passed to the models as a feature — confirm,
# and read with index_col=0 if so.
data = pd.read_csv('./bank.csv')

In [None]:
# Preview the first rows to sanity-check that the file was parsed as expected.
data.head()

Unnamed: 0,age,job=housemaid,job=services,job=admin.,job=blue-collar,job=technician,job=retired,job=management,job=unemployed,job=self-employed,...,previous,poutcome=nonexistent,poutcome=failure,poutcome=success,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,class
0,0.209877,0,0,0,0,0,0,0,0,0,...,0.0,1,0,0,1.0,0.882307,0.376569,0.98073,1.0,0
1,0.296296,0,0,1,0,0,0,0,0,0,...,0.0,1,0,0,1.0,0.484412,0.615063,0.981183,1.0,0
2,0.246914,1,0,0,0,0,0,0,0,0,...,0.0,1,0,0,0.9375,0.698753,0.60251,0.957379,0.859735,0
3,0.160494,0,1,0,0,0,0,0,0,0,...,0.142857,0,1,0,0.333333,0.26968,0.192469,0.150759,0.512287,0
4,0.530864,0,0,0,1,0,0,0,0,0,...,0.0,1,0,0,0.333333,0.340608,0.154812,0.17479,0.512287,1


In [None]:
# (rows, columns) of the loaded frame; the column count includes the 'class' label.
data.shape

(41188, 63)

# iForest

In [None]:
# training iForest model with the same hyper-parameters as in the original iForest paper
# we train model on the data without labels
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=256, random_state=rng, n_estimators=100)
# Build the feature matrix once and reuse it for fit and predict.
# Besides the 'class' label, also exclude the 'y_pred' / 'prediction' columns
# that a later cell writes into `data`: without this, re-running this cell
# after that one would feed the model its own previous predictions.
# errors='ignore' keeps the first (fresh-kernel) run working, when those
# columns do not exist yet.
iforest_features = data.drop(columns=['class', 'y_pred', 'prediction'], errors='ignore')
clf.fit(iforest_features)
# predicting whether the observation is anomalous: y_pred = -1 for anomalies and y_pred = 1 for non-anomalous data
y_pred = clf.predict(iforest_features)

In [None]:
# transforming the iForest labels to 1s for anomalies and 0s for non-anomalies
data['y_pred'] = y_pred
# Vectorised form of applying def_anomaly to every row: -1 -> 1, anything
# else -> 0. This avoids a Python-level call per row (apply with axis=1
# materialises each row as a Series), while producing the same 0/1 integers.
data['prediction'] = (data['y_pred'] == -1).astype('int64')

In [None]:
# displaying confusion matrix
# rows are the true 'class' values, columns are the predicted flags
# (for the predictions, 1 = anomaly and 0 = non-anomalous)
confusion_matrix(data['class'], data['prediction'])

array([[34293,  2255],
       [ 3355,  1285]], dtype=int64)

In [None]:
# displaying AUC
# ROC AUC is a ranking metric, so it must be fed a continuous anomaly score.
# Feeding it the thresholded 0/1 flags reduces the ROC curve to a single
# operating point and under-reports the model's ranking quality.
# score_samples is "lower = more abnormal", so negate it to make higher values
# point at the positive (anomaly = 1) class.
# The derived 'y_pred' / 'prediction' columns are dropped so the columns match
# the ones the forest was fitted on.
iforest_scores = -clf.score_samples(
    data.drop(columns=['class', 'y_pred', 'prediction'], errors='ignore')
)
fpr, tpr, _ = metrics.roc_curve(data['class'], iforest_scores)
metrics.auc(fpr, tpr)

0.6076199862816211

In [None]:
# displaying precision and recall
# per-class precision / recall / F1 of the 0/1 flags against the 'class' label;
# the row labelled '1' corresponds to the anomaly flag
print(classification_report(data['class'], data['prediction'], target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.91      0.94      0.92     36548
           1       0.36      0.28      0.31      4640

    accuracy                           0.86     41188
   macro avg       0.64      0.61      0.62     41188
weighted avg       0.85      0.86      0.86     41188



# Deep Autoencoders

Source: https://www.analyticsvidhya.com/blog/2021/05/anomaly-detection-using-autoencoders-a-walk-through-in-python/

In [None]:
# making target and features dataframes
# By this point `data` also holds the Isolation Forest outputs ('y_pred' and
# 'prediction') added in the iForest section. They are another model's
# predictions, not input features, so they must not reach the autoencoder.
# errors='ignore' lets this cell also run when the iForest section was skipped.
features = data.drop(columns=['class', 'y_pred', 'prediction'], errors='ignore')
target = data['class']

In [None]:
# Rescale every feature column to the [0, 1] range.
# The scaler is fitted on the full feature frame and then applied to that same
# frame; `features` itself is left untouched.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(features)
x_train_scaled = min_max_scaler.transform(features)

In [None]:
# Deep autoencoder made of six Dense layers with widths
# 31 -> 15 -> 7 -> 15 -> 31 -> output_units.
# The first and the last Dense layers use a sigmoid activation, the four in
# between use relu. A Dropout(0.1) follows every Dense layer except the
# 7-unit bottleneck and the output layer.
class AutoEncoder(Model):
    """Symmetric dense autoencoder with a 7-unit bottleneck.

    Attributes:
        encoder: Sequential stack compressing the input down to 7 units.
        decoder: Sequential stack expanding the 7-unit code back to
            ``output_units`` values.
    """

    def __init__(self, output_units):
        """Create the encoder and decoder stacks.

        Args:
            output_units: Number of input features; also used as the width of
                the final reconstruction layer.
        """
        super().__init__()

        encoder_layers = [
            Dense(31, input_dim=output_units, activation='sigmoid'),
            Dropout(0.1),
            Dense(15, activation='relu'),
            Dropout(0.1),
            Dense(7, activation='relu'),
        ]
        self.encoder = Sequential(encoder_layers)

        decoder_layers = [
            Dense(15, activation='relu'),
            Dropout(0.1),
            Dense(31, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ]
        self.decoder = Sequential(decoder_layers)

    def call(self, inputs):
        """Encode ``inputs`` to the bottleneck and decode them back."""
        return self.decoder(self.encoder(inputs))

In [None]:
def _reconstruction_errors(model, x_scaled):
    """Return one reconstruction error (MSLE) per input row as a NumPy array.

    Shared by find_threshold and get_predictions so both score rows in exactly
    the same way.
    """
    reconstructions = model.predict(x_scaled)
    # provides losses of individual instances
    # (argument order kept as in the original code so the numbers do not change)
    errors = tf.keras.losses.msle(reconstructions, x_scaled)
    # Convert once to a plain array so callers do not depend on tensor methods.
    return np.asarray(errors)


# defining function for finding threshold for an anomaly
# threshold is calculated as the mean of reconstruction error + std of reconstruction errors
def find_threshold(model, x_train_scaled):
    """Compute the anomaly threshold from the model's reconstruction errors.

    Args:
        model: Object exposing ``predict``; used to reconstruct the input.
        x_train_scaled: Data passed to ``model.predict`` and compared against
            the reconstruction.

    Returns:
        Mean of the per-row reconstruction errors plus their standard deviation.
    """
    errors = _reconstruction_errors(model, x_train_scaled)
    # threshold for anomaly scores
    return np.mean(errors) + np.std(errors)


# defining function for predicting with the model and calculating reconstruction errors that serve as anomaly scores
def get_predictions(model, x_train_scaled, threshold):
    """Flag rows whose reconstruction error exceeds ``threshold``.

    Args:
        model: Object exposing ``predict``; used to reconstruct the input.
        x_train_scaled: Data passed to ``model.predict`` and compared against
            the reconstruction.
        threshold: Errors strictly greater than this value count as anomalies.

    Returns:
        pandas Series of floats, one per row: 1.0 = anomaly, 0.0 = normal.
    """
    errors = _reconstruction_errors(model, x_train_scaled)
    # 1 = anomaly, 0 = normal
    return pd.Series(errors > threshold).astype(float)

In [None]:
# training the model and getting predictions
# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialisation, dropout and batch shuffling are repeatable
# from run to run (the Isolation Forest above is seeded with 42 as well).
tf.keras.utils.set_random_seed(42)

model = AutoEncoder(output_units=x_train_scaled.shape[1])

model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# The autoencoder learns to reproduce its own input, so the scaled features
# are passed as both the inputs and the targets.
history = model.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=20,
    batch_size=512,
)

threshold = find_threshold(model, x_train_scaled)
print(f"Threshold: {threshold}")

predictions = get_predictions(model, x_train_scaled, threshold)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Threshold: 0.037522189389156975


In [None]:
# displaying confusion matrix
# rows are the true 'class' values, columns are the autoencoder flags
# (for the predictions, 1 = anomaly and 0 = normal)
confusion_matrix(target, predictions)

array([[32286,  4262],
       [ 3490,  1150]], dtype=int64)

In [None]:
# displaying AUC
# ROC AUC is a ranking metric, so it must be fed a continuous anomaly score.
# `predictions` only holds thresholded 0/1 flags, which reduce the ROC curve to
# a single operating point. The per-row reconstruction error (higher = more
# anomalous) is recomputed here and used as the score instead.
reconstruction_errors = np.asarray(
    tf.keras.losses.msle(model.predict(x_train_scaled), x_train_scaled)
)
fpr, tpr, _ = metrics.roc_curve(target, reconstruction_errors)
metrics.auc(fpr, tpr)

0.5656155296954783

In [None]:
# displaying precision and recall
# per-class precision / recall / F1 of the autoencoder flags against the
# 'class' label; the row labelled '1' corresponds to the anomaly flag
print(classification_report(target, predictions, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89     36548
           1       0.21      0.25      0.23      4640

    accuracy                           0.81     41188
   macro avg       0.56      0.57      0.56     41188
weighted avg       0.82      0.81      0.82     41188

