# Setup

In [None]:
%%capture
! pip install autoembedder
! pip install ipywidgets==8.0.2
! pip install plotly==5.11.0
! pip install scikit-learn==1.1.3

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from autoembedder import Autoembedder, dataloader, fit
import plotly.offline as py
import plotly.graph_objs as go
import plotly.io as pio

In [None]:
# Make plotly's "none" template the default for every figure in this notebook.
pio.templates.default = "none"
# Initialise plotly's offline notebook mode so `py.iplot` can render inline.
py.init_notebook_mode(connected=True)

## Set `data_path`

A good dataset to test the autoencoder for outlier detection is the [`Credit Card Fraud Detection`](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud) dataset from [Kaggle](https://www.kaggle.com/). To use it in the notebook please download the dataset and set the `data_path` variable to the path of the downloaded data. 

In [None]:
# Location of the downloaded CSV; replace the placeholder before running the
# cells below, which all read from this path.
data_path = "path/to/your/data.csv"  # Path to your data

# Functions

### `plot_scatter`

In [None]:
def plot_scatter(X, y, title=None):
    """Project ``X`` to two dimensions with t-SNE and scatter-plot it by class.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix handed to ``TSNE.fit_transform``.
    y : array-like of shape (n_samples,)
        Class labels. Rows labelled ``0`` are drawn as "Non Fraud (0)" and
        rows labelled ``1`` as "Fraud (1)"; any other label is not drawn.
    title : str, optional
        Figure title. Omitted when ``None`` (the default).

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    # Coerce to an ndarray so that `labels == value` is always an element-wise
    # mask; with a plain Python list it would collapse to a single bool and
    # silently select nothing.
    labels = np.asarray(y).ravel()
    if len(X) != len(labels):
        # Fail before the (slow) t-SNE fit instead of after it.
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(labels)}"
        )

    embedding = TSNE(
        n_components=2, random_state=42, learning_rate="auto", init="random"
    ).fit_transform(X)

    traces = [
        go.Scatter(
            x=embedding[labels == value, 0],
            y=embedding[labels == value, 1],
            mode="markers",
            showlegend=True,
            name=name,
        )
        for value, name in ((0, "Non Fraud (0)"), (1, "Fraud (1)"))
    ]

    fig = go.Figure(data=traces)
    # Label the axes so the figure can be read on its own.
    fig.update_layout(
        xaxis_title="t-SNE component 1", yaxis_title="t-SNE component 2"
    )
    if title is not None:
        fig.update_layout(title=title)

    py.iplot(fig)

# Data overview

### Read CSV and scale `Time` column

In [None]:
# Load the CSV and, in the same chain, replace `Time` with its value divided by
# 3600 and wrapped modulo 24 (an hour-of-day figure if the raw column holds
# seconds - TODO confirm against the dataset description).
df = pd.read_csv(data_path).assign(Time=lambda frame: frame["Time"] / 3600 % 24)

### Get sample DataFrame

In [None]:
df = (
    pd.concat([df.loc[df["Class"] == 1], df.loc[df["Class"] == 0].sample(5000)])
    .sample(frac=1)
    .reset_index(drop=True)
)
# `pop` removes `Class` from `df` in place, so re-running this cell requires
# re-running the cell that loads `df` first.
y = df.pop("Class")

### Plot data

In [None]:
# t-SNE overview of the sampled features before any autoencoding, coloured by class.
plot_scatter(df, y)

# Autoembedder

### Read data and split by target

In [None]:
# Re-read the full dataset from disk for the modelling part.
df = pd.read_csv(data_path)
# 80/20 split; seeded so the same rows land in train and test on every run.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# The model is trained on `Class == 0` rows only; the test split keeps both
# classes, with the labels held separately in `y_test`.
X_train_df = train_df.query("Class==0").drop("Class", axis=1)
X_test_df = test_df.drop("Class", axis=1)
y_test = test_df["Class"]

### Scale and prepare data

Some usual preprocessing steps are applied to the data.

In [None]:
# Fit the scaler on the training features only and reuse it for the test
# features, so no test-set statistics leak into training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_df)
X_test = scaler.transform(X_test_df)

# Wrap the scaled arrays back into DataFrames with the original column names.
X_train_df = pd.DataFrame(X_train, columns=X_train_df.columns)
X_test_df = pd.DataFrame(X_test, columns=X_test_df.columns)

# Evaluation frame = scaled test features followed by the label column.
# The column names are built from the feature names plus "Class" rather than
# taken from `df.columns`, which would mislabel the columns whenever `Class`
# is not the last column of the CSV.
eval_df = pd.DataFrame(
    np.concatenate((X_test, y_test.to_numpy()[:, None]), axis=1),
    columns=[*X_test_df.columns, "Class"],
)

### Create data loaders

First, we create two [`dataloaders`](https://chrislemke.github.io/autoembedder/autoembedder.data/#autoembedder.data.Dataset.__init__): one for the training data and one for the validation data. As `source`, they accept either a path to a Parquet file, a path to a folder of Parquet files, or a [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)/[Dask](https://docs.dask.org/en/stable/dataframe.html) DataFrame.

In [None]:
# Wrap the scaled DataFrames in autoembedder data loaders: the training
# features for fitting and the test features for validation.
train_dl = dataloader(X_train_df)
test_dl = dataloader(X_test_df)

### Training parameters

Now, we need to set the parameters. They are going to be used for handling the data and training the model. In this example, only parameters for the training are set. [Here](https://chrislemke.github.io/autoembedder/#parameters) you find a list of all possible parameters. This should do it:

In [None]:
# Settings passed to both the `Autoembedder` constructor and `fit`.
# See the autoembedder parameter list for the meaning of each key.
parameters = dict(
    hidden_layers=[[25, 20], [20, 10]],
    epochs=10,
    lr=1e-4,
    verbose=1,
    target="Class",
)

### Create instance of the autoencoder model

Then, we need to initialize the [autoembedder](https://chrislemke.github.io/autoembedder/autoembedder.model/#autoembedder.model.Autoembedder). In this example, we are not using any categorical features. So we can skip the `embedding_sizes` argument.

In [None]:
# No categorical features are used, so the number of continuous features is
# simply the number of columns in the training frame.
model = Autoembedder(parameters, num_cont_features=X_train_df.shape[1])

### Fit autoencoder

Everything is set up. Now we can [fit](https://chrislemke.github.io/autoembedder/autoembedder.learner/#autoembedder.learner.fit) the model. Since we are also passing data for [evaluation](https://chrislemke.github.io/autoembedder/autoembedder.evaluator/#autoembedder.evaluator.loss_delta) (`eval_df`), after the model is fitted it will be evaluated. 
`mean loss delta` and `median loss delta` express the differences between the mean/median loss of the `Class` `0` and `Class` `1` samples. A higher value indicates that the model is able to distinguish between the two classes.

In [None]:
# Train on `train_dl`, validate on `test_dl`, and hand over `eval_df`
# (features plus `Class`) for the evaluation that runs after fitting.
fit(parameters, model, train_dl, test_dl, eval_df)

### Predict test data

We use the trained encoder from the model to encode the evaluation data.

In [None]:
# Split the evaluation rows by label and drop the label column before encoding.
X_non_fraud = eval_df.query("Class == 0").drop("Class", axis=1).to_numpy()
X_fraud = eval_df.query("Class == 1").drop("Class", axis=1).to_numpy()

# Inference only: switch the model to eval mode and skip gradient tracking.
with torch.no_grad():
    model.eval()
    non_fraud_encoded = model.encoder(torch.from_numpy(X_non_fraud))
    fraud_encoded = model.encoder(torch.from_numpy(X_fraud))

# Plot at most 5000 non-fraud points next to all fraud points. The label
# vector is sized from the actual slice, so features and labels stay aligned
# even when fewer than 5000 non-fraud rows are available.
non_fraud_subset = non_fraud_encoded[:5000]
encoded_X = np.append(non_fraud_subset, fraud_encoded, axis=0)
encoded_y = np.append(
    np.zeros(len(non_fraud_subset)), np.ones(len(fraud_encoded))
)

### Plot results from test data

To get an overview of how the autoembedder performed, we plot the data once more.

In [None]:
# Same t-SNE scatter as in the data overview, now on the encoder output.
plot_scatter(encoded_X, encoded_y)