# Setup

In [None]:
%%capture
! pip install autoembedder
! pip install ipywidgets==8.0.2
! pip install plotly==5.11.0
! pip install scikit-learn==1.1.3

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from autoembedder import Autoembedder, dataloader, fit

import plotly.offline as py
import plotly.graph_objs as go

In [None]:
# Enable plotly's offline notebook mode so the figures produced by `py.iplot`
# render inside the notebook.
# NOTE(review): `connected=True` presumably loads plotly.js from a CDN instead
# of embedding it in the notebook — confirm against the plotly documentation.
py.init_notebook_mode(connected=True)

## Set `data_path`

A good dataset to test the autoencoder for outlier detection is the [`Credit Card Fraud Detection`](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud) dataset from [Kaggle](https://www.kaggle.com/). To use it in this notebook, download the dataset and set the `data_path` variable to the path of the downloaded CSV file.

In [None]:
# Placeholder: replace with the location of the downloaded dataset file.
# The value is passed unchanged to `pd.read_csv` further down the notebook.
data_path = "path/to/your/data"  # Path to your data

# Functions

### `plot_scatter`

In [None]:
def plot_scatter(X, y):
    """Embed ``X`` in two dimensions with t-SNE and plot it coloured by class.

    Args:
        X: Feature matrix handed to ``TSNE.fit_transform``.
        y: Labels aligned row-wise with ``X``. Rows where ``y == 0`` are drawn
            as "Non Fraud (0)", rows where ``y == 1`` as "Fraud (1)".

    Returns:
        None. The figure is displayed through ``py.iplot``.
    """
    tsne = TSNE(n_components=2, random_state=42, learning_rate="auto", init="random")
    embedded = tsne.fit_transform(X)

    # One scatter trace per class, in the order non-fraud then fraud.
    legend_names = {0: "Non Fraud (0)", 1: "Fraud (1)"}
    traces = []
    for label, legend_name in legend_names.items():
        mask = y == label
        traces.append(
            go.Scatter(
                x=embedded[mask, 0],
                y=embedded[mask, 1],
                mode="markers",
                showlegend=True,
                name=legend_name,
            )
        )

    fig = go.Figure(data=traces, layout=dict(title="TSNE"))
    py.iplot(fig)

# Data overview

### Read CSV and convert `Time` from seconds to hour of day

In [None]:
# Load the dataset and rewrite `Time` in one chained expression:
# dividing by 3600 and taking the result modulo 24 maps the raw value onto
# the range [0, 24) (hour of day, assuming `Time` is given in seconds).
df = pd.read_csv(data_path).assign(
    Time=lambda frame: frame["Time"] / 3600 % 24
)

### Get sample DataFrame

In [None]:
# Build a small frame for the t-SNE overview: keep every row with
# `Class == 1` and draw 3000 rows with `Class == 0`, then shuffle the result.
# Both `sample` calls are seeded so the sample (and therefore the plot) is
# reproducible across kernel restarts.
df = (
    pd.concat(
        [
            df.loc[df["Class"] == 1],
            df.loc[df["Class"] == 0].sample(3000, random_state=42),
        ]
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# `pop` removes the label column from `df` and returns it, so `df` holds only
# features afterwards. Re-running this cell therefore requires re-running the
# cell that reads the CSV first.
y = df.pop("Class")

### Plot data

In [None]:
# Visualise the sampled raw features (labels in `y`) as a 2-D t-SNE scatter plot.
plot_scatter(df, y)

# Autoembedder

In [None]:
# Reload the full dataset (the `df` used for the overview above was sampled
# and had its `Class` column removed) and hold out 20% of the rows for testing.
# `random_state` is fixed so the split is reproducible between runs.
df = pd.read_csv(data_path)
X_train_df, X_test_df, y_train, y_test = train_test_split(
    df.drop("Class", axis=1), df["Class"], test_size=0.2, random_state=42
)

### Scale and prepare data

In [None]:
# Fit the scaler on the training features only and reuse it for the test
# features so no information from the test set leaks into the scaling.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_df)
X_test = scaler.transform(X_test_df)

# The scaler returns plain arrays; wrap them in DataFrames with the original
# feature names again.
train_df = pd.DataFrame(X_train, columns=X_train_df.columns)
test_df = pd.DataFrame(X_test, columns=X_test_df.columns)

# Evaluation frame: scaled test features with the label appended as the last
# column. The column names are built from the feature names plus "Class"
# (instead of reusing `df.columns`) so the labels are named correctly even if
# "Class" is not the last column of the CSV.
eval_df = pd.DataFrame(
    np.concatenate((X_test, y_test.to_numpy()[:, None]), axis=1),
    columns=[*X_test_df.columns, "Class"],
)

### Training parameters

Read more about the parameters [here](https://github.com/chrislemke/autoembedder#parameters).

In [None]:
# Settings shared by `dataloader`, `Autoembedder` and `fit` below.
# NOTE(review): the exact meaning of each key is defined by the autoembedder
# package — see the parameter documentation linked above.
parameters = dict(
    hidden_layers=[[20, 15], [15, 10]],
    epochs=10,
    verbose=1,
)

### Create data loaders

In [None]:
# Wrap the scaled train and test frames in data loaders, both configured from
# the shared `parameters` dict.
train_dl = dataloader(train_df, parameters)
test_dl = dataloader(test_df, parameters)

### Create instance of the autoencoder model

In [None]:
# Derive the number of continuous input features from the training frame
# instead of hard-coding it, so the model stays in sync with the data if the
# feature set changes. `embedding_sizes` is empty here.
# NOTE(review): an empty `embedding_sizes` presumably means no categorical
# columns are embedded — confirm against the autoembedder documentation.
model = Autoembedder(
    parameters, num_cont_features=train_df.shape[1], embedding_sizes=[]
)

### Fit autoencoder

In [None]:
# Train `model` using the training data loader.
# NOTE(review): `test_dl` is presumably used for validation during training —
# confirm against the autoembedder documentation.
fit(parameters, model, train_dl, test_dl)

### Predict test data

In [None]:
# Split the scaled test rows by label and drop the label column so only the
# feature columns are fed to the encoder.
X_non_fraud = eval_df.query("Class == 0").drop("Class", axis=1).to_numpy()
X_fraud = eval_df.query("Class == 1").drop("Class", axis=1).to_numpy()

# Encode both groups without tracking gradients and with the model in
# evaluation mode.
with torch.no_grad():
    model.eval()
    non_fraud_encoded = model.encoder(torch.from_numpy(X_non_fraud))
    fraud_encoded = model.encoder(torch.from_numpy(X_fraud))

# Plot at most 8000 non-fraud rows next to all fraud rows. The label vector is
# sized from the rows actually kept, so features and labels stay aligned even
# when the test set contains fewer than 8000 non-fraud rows.
non_fraud_sample = non_fraud_encoded[:8000]

encoded_X = np.append(non_fraud_sample, fraud_encoded, axis=0)
encoded_y = np.append(np.zeros(len(non_fraud_sample)), np.ones(len(fraud_encoded)))

### Plot results from test data

In [None]:
# Visualise the encoder output for the test data (labels in `encoded_y`) as a
# 2-D t-SNE scatter plot.
plot_scatter(encoded_X, encoded_y)