In [1]:
%load_ext autoreload
%autoreload 2
from new_aeb_gplvm import *
import warnings
import json
import random
import os
import pandas as pd
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
# import the necessary package
from tqdm import trange
from utils.data_generator import DataGenerator
from utils.myutils import Utils
import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm import trange

# Shared helper instances reused by every iteration of the experiment loop:
# - `datagenerator`: the loop assigns its `.dataset` attribute and then calls
#   `.generator(...)` to obtain the X_train / X_test / y_train / y_test arrays.
# - `utils`: the loop calls `utils.metric(y_true=..., y_score=...)` and reads the
#   "aucroc" and "aucpr" entries of the result.
datagenerator = DataGenerator()  # data generator
utils = Utils()  # utils function

2023-12-05 20:33:04.415558: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-05 20:33:04.483995: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Raw directory listing of the benchmark folder (path relative to the notebook's
# working directory). Entries are file names including their extension; the
# ".npz" part is stripped further down by fix_name().
dataset_list = os.listdir("datasets/Classical")


def fix_name(name):
    """Turn a benchmark file name into a dataset name by dropping a trailing ``.npz``.

    Only the extension at the very end of ``name`` is removed. A ``.npz``
    substring that appears elsewhere in the name is preserved, and names that
    do not end in ``.npz`` are returned unchanged.

    Parameters
    ----------
    name : str
        File name as returned by ``os.listdir``.

    Returns
    -------
    str
        ``name`` without its trailing ``.npz`` extension.
    """
    suffix = ".npz"
    if name.endswith(suffix):
        return name[: -len(suffix)]
    return name


# Dataset names used as the row index of the result tables.
# - os.listdir() may return entries that are not .npz archives (hidden files,
#   sub-directories, ...); those are skipped.
# - os.listdir() returns entries in arbitrary order; sorting makes the row order
#   of the result tables deterministic across machines and runs.
datasets = sorted(fix_name(entry) for entry in dataset_list if entry.endswith(".npz"))

In [3]:
# Hyperparameter table for the GPLVM model, loaded from a previous experiment's
# JSON export. get_hypers() filters it on its `dataset` column and reads the
# kernel / batch_size / learning_rate / latent_dim / layers / n_inducing /
# n_epochs columns.
df_hyper = pd.read_json("experiments/complete/gplvm/000_gplvm_normal_03_best.json")

In [4]:
def get_hypers(dataset):
    """Return the stored GPLVM hyperparameters for one dataset.

    Looks up the rows of the module-level ``df_hyper`` table whose ``dataset``
    column equals ``dataset`` and returns the first match.

    Parameters
    ----------
    dataset : str
        Dataset name to look up.

    Returns
    -------
    dict
        Mapping with the keys ``kernel``, ``batch_size``, ``learning_rate``,
        ``latent_dim``, ``layers``, ``n_inducing`` and ``n_epochs``.

    Raises
    ------
    IndexError
        If ``df_hyper`` has no row for ``dataset``. The exception type is the
        one the plain ``[0]`` lookup used to raise; the message now names the
        missing dataset.
    """
    wanted_columns = [
        "kernel",
        "batch_size",
        "learning_rate",
        "latent_dim",
        "layers",
        "n_inducing",
        "n_epochs",
    ]
    matches = df_hyper.loc[df_hyper.dataset == dataset, wanted_columns]
    if matches.empty:
        raise IndexError(f"no hyperparameters found for dataset {dataset!r}")
    # If several rows match, the first one wins (same as before).
    return matches.to_dict(orient="records")[0]

In [5]:
from baseline.PyOD import PYOD

# Detector name -> class used to build it. All baselines go through the same
# PYOD wrapper class (the experiment loop passes the key as `model_name`);
# only the GPLVM entry maps to a different class.
model_dict = {
    **{baseline: PYOD for baseline in ("IForest", "KNN", "CBLOF", "PCA", "ECOD")},
    "GPLVM": AD_GPLVM,
}

In [None]:
# Benchmark every detector in `model_dict` on every dataset, for each training
# mode and each noise level, and write one AUC-ROC and one AUC-PR file per
# (mode, noise_ratio) configuration.

# seed for reproducible results
seed = 42

# Create the output folder up front so a missing directory is detected before
# any model is trained rather than after the first full sweep.
results_dir = "experiments/complete/adbench/irrelevant_features"
os.makedirs(results_dir, exist_ok=True)

# One row per dataset, one column per detector. Every cell is rewritten during
# each configuration, so after the loops the frames hold the last configuration.
df_AUCROC = pd.DataFrame(data=None, index=datasets, columns=model_dict.keys())
df_AUCPR = pd.DataFrame(data=None, index=datasets, columns=model_dict.keys())

for mode in ["normal", "contaminated"]:
    for noise_ratio in [1, 5, 10, 25, 50]:
        for dataset in datasets:
            # import the dataset
            datagenerator.dataset = dataset  # specify the dataset name
            data = datagenerator.generator(
                la=1.0,
                realistic_synthetic_mode=None,
                noise_type="irrelevant_features",
                noise_ratio=noise_ratio / 100,  # loop values are scaled down by 100
                stdscale=True,
                minmax=False,
            )
            Y_train, Y_test = data["X_train"], data["X_test"]
            lb_train, lb_test = data["y_train"], data["y_test"]

            if mode == "normal":
                # "normal" mode: fit only on training samples labelled 0.
                idx_n = np.where(lb_train == 0)[0]
                Y_train = Y_train[idx_n]
                lb_train = lb_train[idx_n]

            for name, clf in model_dict.items():
                try:
                    if name != "GPLVM":
                        model = clf(seed=seed, model_name=name)
                        model = model.fit(X_train=Y_train, y_train=lb_train)
                        score = model.predict_score(Y_test)
                    else:
                        hp = get_hypers(dataset)
                        # AD_GPLVM is not handed `seed`, so seed the global RNGs
                        # it may draw from to keep repeated runs comparable.
                        torch.manual_seed(seed)
                        np.random.seed(seed)
                        model = AD_GPLVM(
                            latent_dim=hp["latent_dim"],
                            n_inducing=hp["n_inducing"],
                            n_epochs=hp["n_epochs"],
                            # "layers" is stored as a comma-separated string
                            nn_layers=tuple(map(int, hp["layers"].split(","))),
                            lr=hp["learning_rate"],
                            batch_size=hp["batch_size"],
                            kernel=hp["kernel"],
                        )
                        model.fit(torch.tensor(Y_train, dtype=torch.float32))
                        score = model.predict_score(
                            torch.tensor(Y_test, dtype=torch.float32)
                        )

                    result = utils.metric(y_true=lb_test, y_score=score)

                    # save results
                    df_AUCROC.loc[dataset, name] = result["aucroc"]
                    df_AUCPR.loc[dataset, name] = result["aucpr"]

                except Exception as error:
                    # Best effort: a failing detector must not abort the sweep.
                    # The failure is logged with its full context and stored as 0.0.
                    print(
                        f"An exception occurred ({name} on {dataset}, "
                        f"mode={mode}, noise_ratio={noise_ratio}):",
                        error,
                    )
                    df_AUCROC.loc[dataset, name] = 0.0
                    df_AUCPR.loc[dataset, name] = 0.0

        # orient="records" does not write the index, so the dataset names are
        # moved into a regular "dataset" column first; otherwise the saved rows
        # could not be matched back to their datasets.
        for metric_name, frame in (("aucroc", df_AUCROC), ("aucpr", df_AUCPR)):
            frame.reset_index().rename(columns={"index": "dataset"}).to_json(
                f"{results_dir}/ad_vs_gp_{metric_name}_{mode}_{noise_ratio}.json",
                orient="records",
            )

subsampling for dataset 01_ALOI...
subsampling for dataset 03_backdoor...
generating duplicate samples for dataset 04_breastw...
subsampling for dataset 05_campaign...
subsampling for dataset 08_celeba...


In [None]:
# df_AUCROC.to_json("experiments/complete/002_adbench_vs_gplvm_aucroc_contaminated.json", orient = "records")
# df_AUCROC.to_csv("experiments/complete/002_adbench_vs_gplvm_aucroc_contaminated.csv")
# df_AUCPR.to_json("experiments/complete/002_adbench_vs_gplvm_aucpr_contaminated.json", orient = "records")
# df_AUCPR.to_csv("experiments/complete/002_adbench_vs_gplvm_aucpr_contaminated.csv")

In [None]:
import seaborn as sns

# Spread of AUC-ROC over the datasets, one box per detector. `df_AUCROC` holds
# the values of the last configuration processed by the experiment loop.
ax = sns.boxplot(data=df_AUCROC)
# Label the figure so it can be read on its own; the trailing ";" suppresses
# the cell's text output.
ax.set(title="AUC-ROC per model across datasets", xlabel="model", ylabel="AUC-ROC");

In [None]:
# Spread of AUC-PR over the datasets, one box per detector. `df_AUCPR` holds
# the values of the last configuration processed by the experiment loop.
ax = sns.boxplot(data=df_AUCPR)
# Label the figure so it can be read on its own; the trailing ";" suppresses
# the cell's text output.
ax.set(title="AUC-PR per model across datasets", xlabel="model", ylabel="AUC-PR");

In [None]:
# Show the AUC-ROC table (rows: datasets, columns: detectors). The experiment
# loop rewrites every cell for each configuration, so these are the values of
# the last (mode, noise_ratio) pair it processed.
df_AUCROC

In [None]:
# df_AUCROC.reset_index().rename(columns={"index": "dataset"}).to_json(
#    "experiments/complete/002_adbench_auc_roc_normal_results.json", orient="records"
# )

In [None]:
# df_AUCPR.reset_index().rename(columns={"index": "dataset"}).to_json(
#    "experiments/complete/002_adbench_auc_pr_normal_results.json", orient="records"
# )