# Imports

In [164]:
import numpy as np
from fcf import fair_cut_forest, score_point
from sklearn.datasets import fetch_openml
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import time
import eif_old as iso
from pyod.models.lof import LOF
import rrcf
import pandas as pd
from pyod.models.iforest import IForest


# Models

## FCF

In [165]:
def test_fcf(X, y):
    n_trees = 200
    sample_size = 256
    n_splits = 2

    roc_scores = []
    pr_scores = []
    times = []

    for _ in range(10):
        t1 = time.time()
        fcf_trees, fcf_q = fair_cut_forest(X, p=n_splits, t=n_trees, s=sample_size)
        fcf_scores = np.array([score_point(x, fcf_trees, fcf_q) for x in X])
        t2 = time.time()

        roc_auc = roc_auc_score(y, fcf_scores)
        precision, recall, _ = precision_recall_curve(y, fcf_scores)
        pr_auc = auc(recall, precision)

        print("ROC-AUC:", roc_auc)
        print("PR-AUC:", pr_auc)
        print("Time:", t2 - t1)
        print()

        roc_scores.append(roc_auc)
        pr_scores.append(pr_auc)
        times.append(t2 - t1)

    print("Average ROC-AUC:", np.mean(roc_scores))
    print("Average PR-AUC:", np.mean(pr_scores))
    print("Average Time:", np.mean(times))

## IF

In [166]:
def test_if(X, y, contamination):
    n_trees = 100
    sample_size = 256

    roc_scores = []
    pr_scores = []
    times = []

    for i in range(10):
        t1 = time.time()
        iforest = IForest(
            n_estimators=n_trees,
            max_samples=sample_size,
            contamination=contamination,
            random_state=i
        )

        iforest.fit(X)
        iforest_scores = iforest.decision_function(X)
        t2 = time.time()

        roc_auc = roc_auc_score(y, iforest_scores)
        precision, recall, _ = precision_recall_curve(y, iforest_scores)
        pr_auc = auc(recall, precision)

        print(f"ROC-AUC: {roc_auc:.3f}")
        print(f"PR-AUC:  {pr_auc:.3f}")
        print(f"Time:    {t2 - t1:.3f}")
        print()

        roc_scores.append(roc_auc)
        pr_scores.append(pr_auc)
        times.append(t2 - t1)

    print("Average ROC-AUC:", np.mean(roc_scores))
    print("Average PR-AUC:", np.mean(pr_scores))
    print("Average Time:", np.mean(times))
    print()

## EIF

In [167]:
def test_eif(X, y):
    roc_scores = []
    pr_scores = []
    times = []

    for _ in range(10):
        t1 = time.time()

        eif  = iso.iForest(X, ntrees=100, sample_size=256, ExtensionLevel=1)
        eif_scores = eif.compute_paths(X_in=X)

        t2 = time.time()
        times.append(t2 - t1)

        roc_auc = roc_auc_score(y, eif_scores)
        precision, recall, _ = precision_recall_curve(y, eif_scores)
        pr_auc = auc(recall, precision)

        print("ROC-AUC:", roc_auc)
        print("PR-AUC:", pr_auc)
        print("Time: ", t2 - t1)
        print()


        roc_scores.append(roc_auc)
        pr_scores.append(pr_auc)

    print("Average ROC-AUC:", np.mean(roc_scores))
    print("Average PR-AUC:", np.mean(pr_scores))
    print("Average Time:", np.mean(times))
    print()

## RRCF

In [168]:
def test_rrcf(X, y):
    n = len(y)
    num_trees = 200 
    tree_size = 256

    for _ in range(10):
        t1 = time.time()
        forest = []
        ixs = [np.random.choice(n, size=tree_size, replace=False) for _ in range(num_trees)]

        for ix in ixs:
            tree = rrcf.RCTree(X[ix], index_labels=ix)  
            forest.append(tree)

        avg_codisp = pd.Series(0.0, index=np.arange(n))
        index = np.zeros(n)

        for tree in forest:
            codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
            avg_codisp[codisp.index] += codisp
            np.add.at(index, codisp.index.values, 1)

        index[index == 0] = 1
        avg_codisp /= index

        avg_codisp = avg_codisp.fillna(0)

        t2 = time.time()

        roc_auc = roc_auc_score(y, -avg_codisp)
        precision, recall, _ = precision_recall_curve(y, -avg_codisp)
        pr_auc = auc(recall, precision)

        print("ROC-AUC:", roc_auc)
        print("PR-AUC:", pr_auc)
        print("Time:", t2 - t1)
        print()

    print("Average ROC-AUC:", np.mean(roc_auc))
    print("Average PR-AUC:", np.mean(pr_auc))
    print("Average Time:", np.mean(t2 - t1))

## LOF

In [169]:
def test_lof(X, y, contamination):
    t1 = time.time()

    lof_model = LOF(n_neighbors=20, contamination=contamination)
    lof_model.fit(X)
    y_scores = lof_model.decision_function(X)

    t2 = time.time()

    roc_auc = roc_auc_score(y, y_scores)
    precision, recall, _ = precision_recall_curve(y, y_scores)
    pr_auc = auc(recall, precision)

    print("ROC-AUC:", roc_auc)
    print("PR-AUC:", pr_auc)
    print("Time:", t2 - t1)

# Test SpamBase Dataset

In [195]:
X, y = fetch_openml('spambase', version=1, as_frame=False, return_X_y=True)
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = y.astype(int)

## FCF

In [171]:
test_fcf(X, y)

ROC-AUC: 0.7088267739528243
PR-AUC: 0.5334411219101873
Time: 110.90148067474365

ROC-AUC: 0.6960665281274012
PR-AUC: 0.5148976604261465
Time: 101.60204744338989

ROC-AUC: 0.7070606752918702
PR-AUC: 0.5301848980295599
Time: 105.56783866882324

ROC-AUC: 0.7138901374656652
PR-AUC: 0.5391268420393855
Time: 105.78251147270203

ROC-AUC: 0.6995398686831358
PR-AUC: 0.5236208960466724
Time: 105.21367907524109

ROC-AUC: 0.7001403066170436
PR-AUC: 0.5241058447408925
Time: 108.60266995429993

ROC-AUC: 0.7010566916285301
PR-AUC: 0.5220258099337711
Time: 103.46378755569458

ROC-AUC: 0.696482877923747
PR-AUC: 0.5197261576150267
Time: 107.60030031204224

ROC-AUC: 0.6954897318188977
PR-AUC: 0.5190956917387207
Time: 105.37427091598511

ROC-AUC: 0.7054002220532247
PR-AUC: 0.5274743811978346
Time: 106.62130403518677

Average ROC-AUC: 0.702395381356234
Average PR-AUC: 0.5253699303678198
Average Time: 106.07298901081086


## IF

In [172]:
test_if(X, y, 0.39)

ROC-AUC: 0.658
PR-AUC:  0.496
Time:    0.228

ROC-AUC: 0.642
PR-AUC:  0.483
Time:    0.224

ROC-AUC: 0.612
PR-AUC:  0.466
Time:    0.233

ROC-AUC: 0.614
PR-AUC:  0.459
Time:    0.211

ROC-AUC: 0.670
PR-AUC:  0.536
Time:    0.194

ROC-AUC: 0.636
PR-AUC:  0.473
Time:    0.196

ROC-AUC: 0.589
PR-AUC:  0.434
Time:    0.207

ROC-AUC: 0.646
PR-AUC:  0.494
Time:    0.194

ROC-AUC: 0.625
PR-AUC:  0.480
Time:    0.207

ROC-AUC: 0.624
PR-AUC:  0.465
Time:    0.215

Average ROC-AUC: 0.6315994657586173
Average PR-AUC: 0.4785391725486243
Average Time: 0.21086931228637695



## EIF

In [173]:
test_eif(X, y)

ROC-AUC: 0.5766756274032355
PR-AUC: 0.42702192073245726
Time:  11.028805494308472

ROC-AUC: 0.5936231117364547
PR-AUC: 0.4458067171693249
Time:  8.715203046798706

ROC-AUC: 0.6028214845595457
PR-AUC: 0.4587066730964612
Time:  8.604262351989746

ROC-AUC: 0.6568196494154682
PR-AUC: 0.5068189171743378
Time:  8.546306610107422

ROC-AUC: 0.6003593922737189
PR-AUC: 0.4498263064086005
Time:  8.704461097717285

ROC-AUC: 0.5770459798949243
PR-AUC: 0.42319615525795085
Time:  8.611206769943237

ROC-AUC: 0.6162891986062717
PR-AUC: 0.46955944511896797
Time:  8.73010778427124

ROC-AUC: 0.5905946096302727
PR-AUC: 0.4482110360109939
Time:  8.450064182281494

ROC-AUC: 0.6144875880477438
PR-AUC: 0.4620903231251577
Time:  8.633589029312134

ROC-AUC: 0.6221908011721498
PR-AUC: 0.4691994486977541
Time:  8.50170373916626

Average ROC-AUC: 0.6050907442739786
Average PR-AUC: 0.45604369427920066
Average Time: 8.8525710105896



## RRCF

In [174]:
test_rrcf(X, y)

ROC-AUC: 0.5210265055264031
PR-AUC: 0.3877797705713233
Time: 5.337709665298462

ROC-AUC: 0.5376415233199411
PR-AUC: 0.39957185643278
Time: 6.102735280990601

ROC-AUC: 0.53471975474435
PR-AUC: 0.3953004877614904
Time: 6.219537019729614

ROC-AUC: 0.5310065357718565
PR-AUC: 0.392022084429285
Time: 6.359196901321411

ROC-AUC: 0.5306410303079702
PR-AUC: 0.395173977799273
Time: 4.47104811668396

ROC-AUC: 0.5215668996669202
PR-AUC: 0.3846429546836371
Time: 4.593787908554077

ROC-AUC: 0.5266782388631128
PR-AUC: 0.38795270430003315
Time: 4.3847057819366455

ROC-AUC: 0.5273398086986938
PR-AUC: 0.3896383002816982
Time: 5.46354866027832

ROC-AUC: 0.5307920795213273
PR-AUC: 0.3943711880489241
Time: 4.451530694961548

ROC-AUC: 0.5255317684094073
PR-AUC: 0.3890488903197213
Time: 4.43478798866272

Average ROC-AUC: 0.5255317684094073
Average PR-AUC: 0.3890488903197213
Average Time: 4.43478798866272


## LOF

In [175]:
test_lof(X, y, 0.39)

ROC-AUC: 0.4564684476295462
PR-AUC: 0.3542794752963152
Time: 0.338702917098999


# Test Pima dataset

In [192]:
import pandas as pd

df = pd.read_csv('diabetes.csv')

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

## IF

In [177]:
test_if(X, y, contamination=0.35)

ROC-AUC: 0.660
PR-AUC:  0.487
Time:    0.232

ROC-AUC: 0.663
PR-AUC:  0.492
Time:    0.170

ROC-AUC: 0.669
PR-AUC:  0.497
Time:    0.185

ROC-AUC: 0.677
PR-AUC:  0.506
Time:    0.187

ROC-AUC: 0.683
PR-AUC:  0.509
Time:    0.202

ROC-AUC: 0.673
PR-AUC:  0.501
Time:    0.169

ROC-AUC: 0.666
PR-AUC:  0.488
Time:    0.181

ROC-AUC: 0.678
PR-AUC:  0.510
Time:    0.180

ROC-AUC: 0.664
PR-AUC:  0.487
Time:    0.174

ROC-AUC: 0.674
PR-AUC:  0.499
Time:    0.182

Average ROC-AUC: 0.6706753731343285
Average PR-AUC: 0.497518788590697
Average Time: 0.18622210025787353



## FCF

In [178]:
test_fcf(X, y)

ROC-AUC: 0.7328358208955223
PR-AUC: 0.534329981610817
Time: 43.373560190200806

ROC-AUC: 0.7329440298507462
PR-AUC: 0.5382715435145773
Time: 42.05085062980652

ROC-AUC: 0.7322537313432835
PR-AUC: 0.5468481422784844
Time: 41.047685623168945

ROC-AUC: 0.7342238805970149
PR-AUC: 0.5407186608932844
Time: 41.09322166442871

ROC-AUC: 0.7316417910447761
PR-AUC: 0.532643438183313
Time: 43.40599179267883

ROC-AUC: 0.7295671641791045
PR-AUC: 0.5301682201347875
Time: 45.53294801712036

ROC-AUC: 0.7366007462686568
PR-AUC: 0.5406080962804329
Time: 41.37129521369934

ROC-AUC: 0.7401529850746268
PR-AUC: 0.5454580430615369
Time: 43.78364586830139

ROC-AUC: 0.7267126865671641
PR-AUC: 0.5165459596654212
Time: 45.54153084754944

ROC-AUC: 0.7452126865671642
PR-AUC: 0.5514238954248767
Time: 45.101874589920044

Average ROC-AUC: 0.7342145522388059
Average PR-AUC: 0.5377015981047532
Average Time: 43.23026044368744


## EIF

In [179]:
test_eif(X, y)

ROC-AUC: 0.6787910447761195
PR-AUC: 0.5107622358608976
Time:  1.93174147605896

ROC-AUC: 0.6738432835820896
PR-AUC: 0.5096751592597213
Time:  2.2912609577178955

ROC-AUC: 0.6522985074626867
PR-AUC: 0.490425101662713
Time:  2.0517563819885254

ROC-AUC: 0.6798880597014925
PR-AUC: 0.5128533372266384
Time:  1.7375619411468506

ROC-AUC: 0.675276119402985
PR-AUC: 0.5147193112592702
Time:  1.757962703704834

ROC-AUC: 0.6741119402985074
PR-AUC: 0.514451721445454
Time:  1.6211018562316895

ROC-AUC: 0.6642313432835821
PR-AUC: 0.4992541925054931
Time:  1.8359591960906982

ROC-AUC: 0.6559328358208956
PR-AUC: 0.49203524121718517
Time:  2.6260716915130615

ROC-AUC: 0.685955223880597
PR-AUC: 0.5183486896099676
Time:  2.1797196865081787

ROC-AUC: 0.668820895522388
PR-AUC: 0.5002936518841063
Time:  1.7510058879852295

Average ROC-AUC: 0.6709149253731345
Average PR-AUC: 0.5062818641931447
Average Time: 1.9784141778945923



## RRCF

In [180]:
test_rrcf(X, y) 

ROC-AUC: 0.40688805970149255
PR-AUC: 0.3056428926494168
Time: 3.9223380088806152

ROC-AUC: 0.4061865671641791
PR-AUC: 0.29879388763576686
Time: 3.699434518814087

ROC-AUC: 0.412544776119403
PR-AUC: 0.305804810269101
Time: 3.685863733291626

ROC-AUC: 0.4076492537313433
PR-AUC: 0.305332103304889
Time: 4.162519216537476

ROC-AUC: 0.40964179104477616
PR-AUC: 0.2996170851308317
Time: 3.638122320175171

ROC-AUC: 0.40953731343283584
PR-AUC: 0.2973656602271676
Time: 3.8645873069763184

ROC-AUC: 0.4231940298507463
PR-AUC: 0.30611538376471287
Time: 4.677422523498535

ROC-AUC: 0.40917164179104476
PR-AUC: 0.30224911199000626
Time: 3.2932207584381104

ROC-AUC: 0.4101641791044776
PR-AUC: 0.2974353376296419
Time: 3.4457943439483643

ROC-AUC: 0.40113432835820895
PR-AUC: 0.29902612223220426
Time: 3.4000234603881836

Average ROC-AUC: 0.40113432835820895
Average PR-AUC: 0.29902612223220426
Average Time: 3.4000234603881836


## LOF

In [181]:
test_lof(X, y, contamination=0.35)

ROC-AUC: 0.5384477611940299
PR-AUC: 0.36659301128046895
Time: 0.011255979537963867


# Test Annthyroid Dataset

In [205]:
train_data = pd.read_csv('ann-train.data', header=None, sep='\s+')
test_data = pd.read_csv('ann-test.data', header=None, sep='\s+')

data = pd.concat([train_data, test_data], ignore_index=True)

X = data.iloc[:, :-1].values  
y = data.iloc[:, -1].values   

scaler = StandardScaler()
X = scaler.fit_transform(X)

y = (y != 3).astype(int) 

# FCF

In [207]:
test_fcf(X, y)

ROC-AUC: 0.7139277129960186
PR-AUC: 0.15456617403484446
Time: 71.86426734924316

ROC-AUC: 0.6931580798529292
PR-AUC: 0.13046991657825352
Time: 73.19705557823181

ROC-AUC: 0.7162089804486067
PR-AUC: 0.15508584953243934
Time: 72.56828808784485

ROC-AUC: 0.703747481489722
PR-AUC: 0.13243800826249794
Time: 74.90538454055786

ROC-AUC: 0.6993540646199452
PR-AUC: 0.13735962019068465
Time: 79.72801852226257

ROC-AUC: 0.7145498257690939
PR-AUC: 0.15113772947306808
Time: 75.73452663421631

ROC-AUC: 0.7068865594424611
PR-AUC: 0.1467530153458406
Time: 78.5564193725586

ROC-AUC: 0.7081017090473092
PR-AUC: 0.14434731528832184
Time: 75.110830783844

ROC-AUC: 0.7213135639406638
PR-AUC: 0.15765258584164732
Time: 78.28871393203735

ROC-AUC: 0.7149502871635479
PR-AUC: 0.14856786057578134
Time: 79.5807671546936

Average ROC-AUC: 0.7092198264770297
Average PR-AUC: 0.1458378075123379
Average Time: 75.95342719554901


# IF

In [208]:
test_if(X, y, 0.07)

ROC-AUC: 0.620
PR-AUC:  0.118
Time:    0.266

ROC-AUC: 0.614
PR-AUC:  0.110
Time:    0.247

ROC-AUC: 0.614
PR-AUC:  0.108
Time:    0.225

ROC-AUC: 0.636
PR-AUC:  0.113
Time:    0.219

ROC-AUC: 0.612
PR-AUC:  0.111
Time:    0.223

ROC-AUC: 0.618
PR-AUC:  0.108
Time:    0.226

ROC-AUC: 0.630
PR-AUC:  0.122
Time:    0.236

ROC-AUC: 0.613
PR-AUC:  0.107
Time:    0.233

ROC-AUC: 0.617
PR-AUC:  0.113
Time:    0.233

ROC-AUC: 0.641
PR-AUC:  0.135
Time:    0.247

Average ROC-AUC: 0.6215819194278979
Average PR-AUC: 0.11451444523161836
Average Time: 0.23555619716644288



# EIF

In [209]:
test_eif(X, y)

ROC-AUC: 0.6105471502206401
PR-AUC: 0.10959405607038106
Time:  14.34892201423645

ROC-AUC: 0.5980128911767582
PR-AUC: 0.09355101463752119
Time:  12.152156352996826

ROC-AUC: 0.6174113197836637
PR-AUC: 0.11519611949321304
Time:  11.464537143707275

ROC-AUC: 0.6408107102845115
PR-AUC: 0.11678682279309094
Time:  11.44958782196045

ROC-AUC: 0.5926178291986501
PR-AUC: 0.10165551837097689
Time:  11.502840042114258

ROC-AUC: 0.6433351200288568
PR-AUC: 0.11421164802236221
Time:  11.80776572227478

ROC-AUC: 0.6084597223767321
PR-AUC: 0.09805353470843275
Time:  11.618387937545776

ROC-AUC: 0.6194509900428246
PR-AUC: 0.11446084435629016
Time:  11.553335905075073

ROC-AUC: 0.6237802151001617
PR-AUC: 0.11378982965888898
Time:  11.681152582168579

ROC-AUC: 0.6001926035300159
PR-AUC: 0.10046102248928478
Time:  12.960887670516968

Average ROC-AUC: 0.6154618551742815
Average PR-AUC: 0.1077760410600442
Average Time: 12.053957319259643



# RRCF

In [210]:
test_rrcf(X, y)

ROC-AUC: 0.344286394931628
PR-AUC: 0.052853257923627245
Time: 3.8869826793670654

ROC-AUC: 0.3452859330876908
PR-AUC: 0.05136232567593285
Time: 3.984043836593628

ROC-AUC: 0.3485562039350002
PR-AUC: 0.05424271072140012
Time: 4.637409687042236

ROC-AUC: 0.33663225873149116
PR-AUC: 0.0506274842633191
Time: 3.783252000808716

ROC-AUC: 0.33922914763386447
PR-AUC: 0.0510047718390825
Time: 4.2120137214660645

ROC-AUC: 0.3311880345337904
PR-AUC: 0.050271921742194456
Time: 3.6009325981140137

ROC-AUC: 0.33434804154572756
PR-AUC: 0.050793854521779064
Time: 4.3384435176849365

ROC-AUC: 0.33770933273102594
PR-AUC: 0.05078742806789633
Time: 3.741025686264038

ROC-AUC: 0.339834545252278
PR-AUC: 0.052131595447450496
Time: 3.776258707046509

ROC-AUC: 0.3492992557682734
PR-AUC: 0.05186045791586816
Time: 4.518470764160156

Average ROC-AUC: 0.3492992557682734
Average PR-AUC: 0.05186045791586816
Average Time: 4.518470764160156


# LOF

In [211]:
test_lof(X, y, 0.07)

ROC-AUC: 0.6723361942935866
PR-AUC: 0.1271064200560557
Time: 0.4491424560546875
