# Anomaly detection on SherlockML

Data taken from: https://www.kaggle.com/c/anomaly-detection-challenges

Requirements:
- XGBoost (apply the `install-xgboost` environment or run `pip install xgboost`)
- Keras (`pip install keras`)

In [None]:
import pandas as pd
import numpy as np
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt

import sys
# Put the project's module directory at the front of the import search path so
# that the local helper module imported below can be found.
sys.path.insert(0, '/project/anomaly-detection/modules/')

# NOTE(review): the wildcard import hides which names come from the helper
# module. read_training_data, filter_numerical_only and plot_confusion_matrix
# are used later with no other visible definition, so presumably they are
# provided here - confirm against anomaly_detection_tools.
from anomaly_detection_tools import *

# Initialise plotly's offline notebook mode so that iplot() can render inline.
# NOTE(review): connected=True should load plotly.js from a CDN instead of
# embedding it in the notebook - confirm against the plotly documentation.
init_notebook_mode(connected=True)

## Data ingestion

In [None]:
# Load the training set through the project helper.
# NOTE(review): presumably returns a pandas DataFrame that includes a 'label'
# column (later cells call .head() on it and select 'label') - confirm.
data_df = read_training_data()

In [None]:
# Preview the first rows to sanity-check the loaded data.
data_df.head()

In [None]:
# NOTE(review): judging by its name, the helper keeps only the numeric columns;
# 'label' must survive the filter because it is dropped/selected further down.
numerical_data_df = filter_numerical_only(data_df)

In [None]:
# Preview the filtered frame to see which columns remain.
numerical_data_df.head()

## Dimensionality reduction

To see whether an unsupervised learning approach is feasible, let's apply dimensionality reduction (PCA) to the numerical features and check whether the two classes separate visually.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Two components so the projection can be drawn as a 2-D scatter plot below.
pca = PCA(n_components=2)

In [None]:
# Fit the PCA on the features only: the target column must not leak into the
# projection.
pca_features = numerical_data_df.drop(columns=['label'])
X_red = pca.fit_transform(pca_features)

In [None]:
# Should be (number of rows, 2), since the PCA was built with n_components=2.
X_red.shape

In [None]:
# Labels as a column vector of shape (n_rows, 1); the plotting cells below
# index it as Y[:, 0].
Y = np.array(numerical_data_df['label'])[:, np.newaxis]

In [None]:
# Shapes of the projected points for label 0 and label 1 respectively; the
# first entry of each shape is the number of samples in that class.
X_red[Y[:,0]==0].shape, X_red[Y[:,0]==1].shape

In [None]:
# Split the 2-D projection by class once, rather than re-evaluating the boolean
# mask for every axis of every trace.
class_labels = Y[:, 0]
points_class0 = X_red[class_labels == 0]
points_class1 = X_red[class_labels == 1]

# One marker-only trace per class: first component on x, second on y.
trace0 = go.Scatter(
    x=points_class0[:, 0],
    y=points_class0[:, 1],
    mode='markers',
    name='class 0',
)
trace1 = go.Scatter(
    x=points_class1[:, 0],
    y=points_class1[:, 1],
    mode='markers',
    name='class 1',
)

# Assemble both traces into a single figure and render it inline.
data = [trace0, trace1]
fig = go.Figure(data=data)
iplot(fig)

## Training an XGBoost classifier

Transform categorical features into numerical features using dummy variables.

In [None]:
# One-hot encode the categorical columns of the full data set; with no
# `columns` argument pandas leaves numeric columns as they are.
data_df_dummies = pd.get_dummies(data_df)

# Shape after encoding, to see how many columns the dummy variables added.
data_df_dummies.shape

Hold-out validation: split the dataset into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every encoded column except the target.
encoded_features = data_df_dummies.drop(columns=['label'])
X = np.array(encoded_features)

# Target vector (1-D). This rebinds Y, which previously held the column-vector
# labels used for the PCA plot.
Y = np.array(data_df_dummies['label'])

In [None]:
# Seed NumPy's global random number generator. The train/test split below is
# called without random_state, so scikit-learn draws from this global state.
seed = 42
np.random.seed(seed)

In [None]:
# Hold out 33% of the rows for testing. Passing random_state explicitly keeps
# the split reproducible even when this cell is re-run or cells are executed
# out of order, instead of depending on the current state of NumPy's global RNG.
# NOTE(review): if the labels are heavily imbalanced, consider stratify=Y too.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=seed
)

Train an XGBoost classifier.

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted tree classifier with the library's default hyperparameters
# (no arguments are passed).
xgbc = XGBClassifier()

In [None]:
# Fit on the training split only; the test split is kept back for evaluation.
xgbc.fit(X_train, Y_train)

## Evaluate the XGBoost classifier on the test data

Compute the confusion matrix on the test data.

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Predict on the held-out rows, tabulate true vs. predicted labels, and hand
# the matrix to the project's plotting helper with class names 0 and 1.
xgb_test_predictions = xgbc.predict(X_test)
xgb_confusion = confusion_matrix(Y_test, xgb_test_predictions)
plot_confusion_matrix(xgb_confusion, [0,1], normalize=True)

## Train a neural network classifier

In [None]:
from keras.models import Sequential
from keras.layers import Dense

Define the graph.

In [None]:
# Fully connected binary classifier:
# 10 (tanh) -> 50 (sigmoid) -> 10 (tanh) -> 1 (sigmoid).
model = Sequential()

# Only the first layer declares the input size: one input per column of X.
model.add(Dense(10, input_shape=(X.shape[1],), activation='tanh'))

# Later layers infer their input size from the layer before them, so the
# redundant (and misleading) input_shape argument is not repeated here.
model.add(Dense(50, activation='sigmoid'))
model.add(Dense(10, activation='tanh'))

# Single sigmoid unit: a value in (0, 1) for the binary label.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Re-seed NumPy's global random number generator before training.
# NOTE(review): this seeds NumPy only and runs after the layers were added to
# the model; whether it influences weight initialisation or batch shuffling
# depends on the Keras backend - confirm if exact reproducibility matters.
seed = 42
np.random.seed(seed)

In [None]:
# Configure training: Adam optimiser minimising binary cross-entropy, with
# accuracy reported alongside the loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer architecture.
model.summary()

Rescale the data.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardises each feature (zero mean, unit variance) using statistics learned
# when it is fitted.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Train the model on the scaled data.

In [None]:
# Targets as a column vector to match the network's single output unit.
train_targets = Y_train.reshape((-1, 1))

# 300 passes over the scaled training data, with progress output suppressed.
model.fit(X_train_scaled, train_targets, epochs=300, verbose=0)

## Evaluate the neural network on the test data

Compute the confusion matrix on the test data.

In [None]:
# Sequential.predict_classes() has been removed from current Keras/TensorFlow
# releases. For a single sigmoid output unit the equivalent is to threshold the
# predicted probability at 0.5.
nn_test_probabilities = model.predict(X_test_scaled)

# Flatten from (n_rows, 1) to (n_rows,) so the predictions match Y_test's shape.
nn_test_predictions = (nn_test_probabilities > 0.5).astype('int32').ravel()

plot_confusion_matrix(confusion_matrix(Y_test, nn_test_predictions), [0,1], normalize=True)