In [None]:
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
import plotly.express as px
import torch
from autoembedder import Autoembedder, dataloader, fit
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Location of the credit-card CSV. Set the CREDITCARD_CSV environment variable
# to point at your own copy; the original author's local path is kept as the
# fallback so existing runs behave exactly as before.
data_path = os.environ.get(
    "CREDITCARD_CSV", "/Users/ch.lemke/Downloads/creditcard.csv"
)

In [None]:
# Load the raw table and fold "Time" into a 0-24 range in one expression:
# divide by 3600, then wrap modulo 24 (presumably seconds -> hour of day;
# confirm against the dataset description).
df = pd.read_csv(data_path).assign(
    Time=lambda frame: (frame["Time"] / 3600) % 24
)

In [None]:
# Build a small visualisation subset: every row with Class == 1 plus 3000
# randomly drawn rows with Class == 0, shuffled together.
# Both random draws are seeded (same seed as the t-SNE call) so that
# Restart & Run All reproduces the same subset and therefore the same plot.
df = (
    pd.concat(
        [
            df.loc[df["Class"] == 1],
            df.loc[df["Class"] == 0].sample(3000, random_state=42),
        ]
    )
    .sample(frac=1, random_state=42)  # shuffle so the classes are interleaved
    .reset_index(drop=True)
)

In [None]:
# Project the raw features to 2-D with t-SNE and colour the points by class.
# The "Class" label is dropped before embedding: feeding it in as a feature
# would let the projection separate the classes using the answer itself.
X = TSNE(
    n_components=2, random_state=42, learning_rate="auto", init="random"
).fit_transform(df.drop("Class", axis=1))
fig = px.scatter(
    x=X[:, 0],
    y=X[:, 1],
    color=df["Class"],
    title="t-SNE of the raw features",
)
fig.show()

In [None]:
# Reload the full, untouched table for model training.
# NOTE(review): this reload discards the earlier "Time" hour-of-day conversion;
# the model therefore sees the raw "Time" values - confirm that is intended.
df = pd.read_csv(data_path)

# 80/20 split. The split is seeded for reproducibility and stratified on the
# target so that train and test keep the same Class proportions.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    df.drop("Class", axis=1),
    df["Class"],
    test_size=0.2,
    random_state=42,
    stratify=df["Class"],
)

In [None]:
# Scale every feature with MinMaxScaler; the scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_df)
X_test = scaler.transform(X_test_df)

# Wrap the scaled arrays back into DataFrames with the original feature names.
train_df = pd.DataFrame(X_train, columns=X_train_df.columns)
test_df = pd.DataFrame(X_test, columns=X_test_df.columns)

# Evaluation frame = scaled test features with the true label appended as the
# last column. The column names are derived from the feature frame plus the
# target name, so the labelling stays correct regardless of where "Class"
# sits in the source file (the previous `df.columns` relied on it being last).
eval_df = pd.DataFrame(
    np.concatenate((X_test, y_test.to_numpy()[:, None]), axis=1),
    columns=[*X_test_df.columns, "Class"],
)

In [None]:
# Configuration dictionary handed to autoembedder's dataloader / model / fit.
# On/off switches are written as 0/1 integers and unused paths are left as None.
# Key order is kept exactly as before.
parameters = dict(
    # data handling
    batch_size=32,
    target="Class",
    drop_cat_columns=1,
    pin_memory=0,
    num_workers=0,
    drop_last=1,
    # network layout and device
    hidden_layers=[
        [16, 8],
        [8, 4],
        [4, 2],
    ],
    layer_bias=1,
    use_mps=0,
    # optimisation settings
    epochs=30,
    lr=1e-4,
    weight_decay=0,
    amsgrad=0,
    xavier_init=0,
    l1_lambda=0,
    # logging, evaluation input and checkpoints
    tensorboard_log_path=None,
    eval_input_path=None,
    n_save_checkpoints=0,
    model_save_path=None,
    load_checkpoint_path=None,
    verbose=1,
)

In [None]:
# Wrap each scaled pandas frame in a single-partition dask DataFrame and build
# one autoembedder data loader per split (train first, then test).
train_dl, test_dl = (
    dataloader(dd.from_pandas(split_df, npartitions=1), parameters)
    for split_df in (train_df, test_df)
)

In [None]:
# Build the model from the config. The second argument was a hard-coded 30;
# it is now taken from the number of columns in the scaled training frame so
# it follows the data (presumably this is the count of continuous input
# features, and the empty list means no categorical embeddings - confirm
# against the autoembedder documentation).
model = Autoembedder(parameters, train_df.shape[1], [])

In [None]:
# Hand the config, the model and the train / test loaders to autoembedder's
# `fit` (presumably this runs the training loop for parameters["epochs"]
# epochs - confirm against the autoembedder documentation).
fit(parameters, model, train_dl, test_dl)

In [None]:
# Split the evaluation frame by label and keep only the feature columns,
# one row per transaction in both cases. (The fraud matrix was previously
# transposed, which handed the encoder a features-by-rows matrix.)
X_nonfraud = eval_df.query("Class == 0").drop("Class", axis=1).to_numpy()
X_fraud = eval_df.query("Class == 1").drop("Class", axis=1).to_numpy()

# Encode both groups with the model's encoder, without tracking gradients.
with torch.no_grad():
    model.eval()
    # Cast the float64 numpy data to the dtype / device of the model's own
    # weights so the call works whether the model holds float32 or float64.
    reference = next(model.parameters())
    non_fraud_encoded = model.encoder(
        torch.from_numpy(X_nonfraud).to(reference)
    ).cpu()
    fraud_encoded = model.encoder(torch.from_numpy(X_fraud).to(reference)).cpu()

# Assemble a labelled sample: at most `nrows` encoded non-fraud rows (label 0)
# followed by every encoded fraud row (label 1). The label vector is sized
# from the rows actually taken, so X and y always have the same length.
nrows = 3000
non_fraud_sample = non_fraud_encoded[:nrows].numpy()
sample_encoded_X = np.append(non_fraud_sample, fraud_encoded.numpy(), axis=0)
sample_encoded_y = np.append(
    np.zeros(len(non_fraud_sample)), np.ones(len(fraud_encoded))
)