In [1]:
# from transformers import BertTokenizer, BertModel
import torch
import json
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc, average_precision_score
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler

from pyod.models.knn import KNN  # K-Nearest Neighbors
from pyod.models.ocsvm import OCSVM  # One-Class SVM
from pyod.models.iforest import IForest  # Isolation Forest
from pyod.models.lof import LOF  # Local Outlier Factor
from pyod.models.ecod import ECOD  # Empirical Cumulative Outlier Detection
from pyod.models.inne import INNE  # Isolation-based Nearest Neighbor Ensemble
from pyod.models.lunar import LUNAR

In [None]:
import os

# Where the pre-computed embeddings live and which dataset / encoders to use.
directory = "embeddings"
data_name = 'olid'
model_name = ['bert', 'minilm', 'openai_ada', 'openai_small', 'openai_large','llama', 'stella', 'qwen']

# One .npy file per encoder, laid out as <directory>/<data_name>/<encoder>_<data_name>.npy
file_list = [
    os.path.join(directory, data_name, f"{encoder}_{data_name}.npy")
    for encoder in model_name
]

# Quick sanity check: length of the first axis of the fourth file ('openai_small').
len(np.load(file_list[3]))

In [None]:
# Path to the .npz archive from which the ground-truth labels are read.
data_dir = 'data/olid.npz'

# NOTE(review): allow_pickle=True permits unpickling object arrays, which can
# execute arbitrary code on load — only use with trusted files, and confirm
# the flag is actually required for this archive.
loaded = np.load(data_dir, allow_pickle=True)
# Label vector; the evaluation cell treats its non-zero entries as outliers.
y = loaded['label']

In [13]:
# Detectors under comparison, each constructed with its default arguments.
# The order of this table fixes each detector's slot in the results array.
_detector_factories = (
    ("KNN", KNN),
    ("OCSVM", OCSVM),
    ("iForest", IForest),
    ("LOF", LOF),
    ("ECOD", ECOD),
    ("iNNE", INNE),
    ("LUNAR", LUNAR),
)

# name -> detector instance
classifiers = {name: factory() for name, factory in _detector_factories}

# name -> position of that detector's score in the per-dataset score array
# (dicts preserve insertion order, so this enumerates 0..6 in table order).
classifiers_indices = {name: position for position, name in enumerate(classifiers)}

In [None]:
# Layout of the results table: four dataset descriptors followed by one
# ROC-AUC column per detector. Scores are placed by position (through
# `classifiers_indices`), not by label, so the "INNE" label below lines up
# with the "iNNE" key of `classifiers`.
df_columns = [
    "Data",
    "# Samples",
    "# Dimensions",
    "Outlier Perc",
    "KNN",
    "OCSVM",
    "iForest",
    "LOF",
    "ECOD",
    "INNE",
    "LUNAR",
]

n_ite = 1  # not used in this cell; kept in case other cells reference it
n_repeats = 5  # number of fit/score repetitions averaged per detector
n_meta_columns = 4  # "Data", "# Samples", "# Dimensions", "Outlier Perc"
n_classifiers = len(df_columns) - n_meta_columns
if n_classifiers != len(classifiers):
    raise ValueError(
        f"df_columns provides {n_classifiers} detector columns but "
        f"`classifiers` holds {len(classifiers)} detectors"
    )

# The labels are shared by every embedding file, so the outlier ratio is
# computed once instead of once per file.
outliers_fraction = np.count_nonzero(y) / len(y)
outliers_percentage = round(outliers_fraction * 100, ndigits=4)

# Rows are collected here and turned into a DataFrame once at the end;
# concatenating inside the loop is quadratic and gave every row index 0.
roc_rows = []

for mat_file in tqdm(file_list):
    print("\n... Processing", mat_file, "...")

    # Standardise every embedding dimension to zero mean / unit variance.
    X = np.load(mat_file)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Fail before any (slow) model fitting if labels and embeddings disagree.
    if X.shape[0] != len(y):
        raise ValueError(
            f"{mat_file}: {X.shape[0]} embeddings but {len(y)} labels"
        )

    roc_mat = np.zeros(n_classifiers)

    for clf_name, clf in classifiers.items():
        roc_scores = []
        for _ in range(n_repeats):
            if clf_name == "ECOD":
                # A fresh ECOD is built for every repetition rather than
                # refitting the shared instance — presumably because refitting
                # carries state over between fits; TODO confirm.
                clf = ECOD()
            clf.fit(X)
            # Score the same data the detector was fitted on (unsupervised
            # setting); the labels are only used for evaluation.
            test_scores = clf.decision_function(X)
            roc_scores.append(roc_auc_score(y, test_scores))
        mean_roc = round(np.mean(roc_scores), ndigits=4)
        roc_mat[classifiers_indices[clf_name]] = mean_roc

    # mat_file[:-4] strips the ".npy" extension from the path.
    roc_rows.append(
        [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage, *roc_mat]
    )

# One row per embedding file, with a clean 0..n-1 index.
roc_df = pd.DataFrame(roc_rows, columns=df_columns)

In [15]:
# Show the ROC-AUC table (one row per embedding file) built by the evaluation cell.
roc_df

Unnamed: 0,Data,# Samples,# Dimensions,Outlier Perc,KNN,OCSVM,iForest,LOF,ECOD,INNE,LUNAR
0,embeddings\olid\bert_olid,641,768,3.2761,0.5137,0.4866,0.4783,0.4967,0.4933,0.474,0.5208
0,embeddings\olid\minilm_olid,641,384,3.2761,0.5063,0.452,0.4531,0.5442,0.4208,0.4824,0.4877
0,embeddings\olid\openai_ada_olid,641,1536,3.2761,0.5243,0.5048,0.4891,0.5368,0.4986,0.5102,0.5298
0,embeddings\olid\openai_small_olid,641,1536,3.2761,0.5587,0.5547,0.558,0.5556,0.5295,0.5382,0.5602
0,embeddings\olid\openai_large_olid,641,3072,3.2761,0.5497,0.4971,0.5068,0.5141,0.4967,0.517,0.5242
0,embeddings\olid\llama_olid,641,2048,3.2761,0.4081,0.4057,0.4082,0.3961,0.3998,0.3917,0.4117
0,embeddings\olid\stella_olid,641,1024,3.2761,0.5016,0.4439,0.4325,0.5259,0.4395,0.4532,0.5114
0,embeddings\olid\qwen_olid,641,1536,3.2761,0.4602,0.4882,0.4869,0.4515,0.4773,0.4693,0.4691
