### Running CapyMOA models with the paper's datasets

In [1]:
import os
import gzip
import pandas as pd
import numpy as np
from capymoa.stream import stream_from_file
from capymoa.anomaly import HalfSpaceTrees, OnlineIsolationForest, Autoencoder
from capymoa.evaluation import AnomalyDetectionEvaluator
from sklearn.metrics import average_precision_score, roc_auc_score, roc_curve, auc
from scipy.stats import sem
import time

In [9]:

# Define datasets, their corresponding run counts, and models
datasets = {
    "abalone": 10,
    "annthyroid": 10,
    "magicgamma": 4,
    "kdd_ftp": 10,
    "mammography": 10,
    "thyroid": 10,
    "mnist": 10,
    "musk": 5,
    "satellite": 5,
    "satimages": 5,
    "spambase": 5,
    "shuttle_odds": 2
}

models = {
    "HalfSpaceTrees": HalfSpaceTrees,
    "Autoencoder": Autoencoder,
    "OnlineIsolationForest": OnlineIsolationForest
}

# Define dataset path
#local path Cristian
#dataset_path = r"C:\Users\aleja\OneDrive - Universidad Nacional de Colombia\Documentos\Institut Polytechnique de Paris\courses\P1\Data Streaming\project\actual code\datasets\forStefan\data\public"

#remote server path Cristian
# In Jupyter Notebook or interactive environments, use the current working directory
current_dir = os.getcwd()

# Navigate to the public folder relative to the current working directory
dataset_path = os.path.abspath(os.path.join(current_dir, '../../../../../public'))

print("Dataset path:", dataset_path)


Dataset path: /home/infres/cchavez-23/public


In [None]:
# Results storage
all_results = []

# Main loop
for dataset_name, n_runs in datasets.items():
    print(f"Dataset: {dataset_name} (Runs: {n_runs})")

    input_path = os.path.join(dataset_path, f"{dataset_name}.gz")
    output_path = os.path.join(dataset_path, f"{dataset_name}.csv")

    # Unzip the dataset if needed
    if not os.path.exists(output_path):
        with gzip.open(input_path, 'rt') as gz_file:
            df = pd.read_csv(gz_file)
            df.to_csv(output_path, index=False)
            print(f"CSV saved to: {output_path}")

    stream = stream_from_file(output_path, dataset_name=dataset_name)
    schema = stream.get_schema()

    # Load labels for metrics
    df = pd.read_csv(output_path)
    labels = df['label'].to_numpy(dtype='float32')

    for model_name, ModelClass in models.items():
        print(f"Running model: {model_name}")
        ap_scores = []
        auc_scores = []
        auc_paper_scores = []
        auc_capymoa_scores = []
        execution_times = []
        model_results = []

        for run in range(n_runs):
            learner = ModelClass(schema)
            evaluator = AnomalyDetectionEvaluator(schema)

            stream.restart()  # Restart stream for each run
            anomaly_scores = []
            start_time = time.time()

            while stream.has_more_instances():
                instance = stream.next_instance()
                proba = learner.score_instance(instance)
                #We do 1-proba because for capyMOA models 1 means normal and 0 means anomaly, inverse as in streamrhf
                anomaly_scores.append(1-proba)
                evaluator.update(instance.y_index, proba)
                learner.train(instance)

            # Get AUC from evaluator
            auc_score_capymoa = evaluator.auc()

            #####################################
            anomaly_scores = np.array(anomaly_scores)
            ap_score = average_precision_score(labels, anomaly_scores)
            auc_score = roc_auc_score(labels, anomaly_scores)
            fpr, tpr, thresholds = roc_curve(labels, anomaly_scores)
            auc_paper = auc(fpr, tpr)
            #####################################

            execution_time = time.time() - start_time

            print(f"Run {run + 1}: AP = {ap_score:.4f}, AUC = {auc_score:.4f}, Time = {execution_time:.2f}s")

            # Save run results
            run_result = {
                'Dataset': dataset_name,
                'Model': model_name,
                'Run': run + 1,
                'AP': ap_score,
                'AUC_capymoa': auc_score_capymoa,
                'AUC (sklearn)': auc_score,
                'AUC (paper)': auc_paper,
                'Execution Time (s)': execution_time
            }
            model_results.append(run_result)
            all_results.append(run_result)

        # Save checkpoint after each model
        results_df = pd.DataFrame(all_results)
        results_df.to_csv("all_run_results_checkpoint.csv", index=False)
        print(f"Checkpoint saved for model {model_name}")

        # Summarize results for the model
        ap_scores = np.array([res['AP'] for res in model_results])
        auc_scores = np.array([res['AUC (sklearn)'] for res in model_results])
        auc_paper_scores = np.array([res['AUC (paper)'] for res in model_results])
        auc_capymoa_scores = np.array([res['AUC_capymoa'] for res in model_results])
        execution_times = np.array([res['Execution Time (s)'] for res in model_results])

        mean_ap = np.mean(ap_scores)
        mean_auc = np.mean(auc_scores)
        mean_auc_paper = np.mean(auc_paper_scores)
        mean_auc_capymoa = np.mean(auc_capymoa_scores)
        mean_time = np.mean(execution_times)
        ap_sem = sem(ap_scores)
        auc_sem = sem(auc_scores)
        auc_paper_sem = sem(auc_paper_scores)
        auc_capymoa_sem = sem(auc_capymoa_scores)
        time_sem = sem(execution_times)
        confidence_level = 1.96
        ap_ci = confidence_level * ap_sem
        auc_ci = confidence_level * auc_sem
        auc_paper_ci = confidence_level * auc_paper_sem
        auc_capymoa_ci = confidence_level * auc_capymoa_sem
        time_ci = confidence_level * time_sem

        print(f"Summary for {model_name}:")
        print(f"AP: {mean_ap:.4f} ± {ap_ci:.4f} (95% CI)")
        print(f"AUC (sklearn): {mean_auc:.4f} ± {auc_ci:.4f} (95% CI)")
        print(f"AUC (paper): {mean_auc_paper:.4f} ± {auc_paper_ci:.4f} (95% CI)")
        print(f"AUC (CapyMOA): {mean_auc_capymoa:.4f} ± {auc_capymoa_ci:.4f} (95% CI)")
        print(f"Time: {mean_time:.2f} ± {time_ci:.2f} seconds (95% CI)")

        # Save summary
        summary = {
            'Dataset': dataset_name,
            'Model': model_name,
            'Metric': ['AP', 'AUC (sklearn)', 'AUC (paper)', 'AUC (CapyMOA)', 'Execution Time'],
            'Mean': [mean_ap, mean_auc, mean_auc_paper, mean_auc_capymoa, mean_time],
            'CI (95%)': [ap_ci, auc_ci, auc_paper_ci, auc_capymoa_ci, time_ci]
        }
        summary_df = pd.DataFrame(summary)

        # Create a folder for each model in the current working directory if it doesn't exist
        model_folder = os.path.join(os.getcwd(), model_name)
        os.makedirs(model_folder, exist_ok=True)
        
        # Save the summary in the respective model's folder
        summary_df.to_csv(os.path.join(model_folder, f"{dataset_name}_summary.csv"), index=False)



Dataset: abalone (Runs: 10)
Running model: HalfSpaceTrees




Run 1: AP = 0.5313, AUC = 0.9628, Time = 0.25s
Run 2: AP = 0.5313, AUC = 0.9628, Time = 0.30s
Run 3: AP = 0.5313, AUC = 0.9628, Time = 0.09s
Run 4: AP = 0.5313, AUC = 0.9628, Time = 0.19s
Run 5: AP = 0.5313, AUC = 0.9628, Time = 0.20s
Run 6: AP = 0.5313, AUC = 0.9628, Time = 0.20s
Run 7: AP = 0.5313, AUC = 0.9628, Time = 0.20s
Run 8: AP = 0.5313, AUC = 0.9628, Time = 0.18s
Run 9: AP = 0.5313, AUC = 0.9628, Time = 0.22s
Run 10: AP = 0.5313, AUC = 0.9628, Time = 0.06s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.5313 ± 0.0000 (95% CI)
AUC (sklearn): 0.9628 ± 0.0000 (95% CI)
AUC (paper): 0.9628 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.9628 ± 0.0000 (95% CI)
Time: 0.19 ± 0.04 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.1163, AUC = 0.8546, Time = 0.71s
Run 2: AP = 0.1163, AUC = 0.8546, Time = 0.58s
Run 3: AP = 0.1163, AUC = 0.8546, Time = 0.59s
Run 4: AP = 0.1163, AUC = 0.8546, Time = 0.59s
Run 5: AP = 0.1163, AUC = 0.8546, Time = 0.59s
Run 6: AP = 0.1



Run 1: AP = 0.5767, AUC = 0.9738, Time = 0.23s
Run 2: AP = 0.5767, AUC = 0.9738, Time = 0.23s
Run 3: AP = 0.5767, AUC = 0.9738, Time = 0.22s
Run 4: AP = 0.5767, AUC = 0.9738, Time = 0.22s
Run 5: AP = 0.5767, AUC = 0.9738, Time = 0.38s
Run 6: AP = 0.5767, AUC = 0.9738, Time = 0.20s
Run 7: AP = 0.5767, AUC = 0.9738, Time = 0.21s
Run 8: AP = 0.5767, AUC = 0.9738, Time = 0.20s
Run 9: AP = 0.5767, AUC = 0.9738, Time = 0.20s
Run 10: AP = 0.5767, AUC = 0.9738, Time = 0.21s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.5767 ± 0.0000 (95% CI)
AUC (sklearn): 0.9738 ± 0.0000 (95% CI)
AUC (paper): 0.9738 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.9738 ± 0.0000 (95% CI)
Time: 0.23 ± 0.03 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.0856, AUC = 0.5341, Time = 2.17s
Run 2: AP = 0.0856, AUC = 0.5341, Time = 2.18s
Run 3: AP = 0.0856, AUC = 0.5341, Time = 2.17s
Run 4: AP = 0.0856, AUC = 0.5341, Time = 2.17s
Run 5: AP = 0.0856, AUC = 0.5341, Time = 2.17s
Run 6: AP = 0.0



Running model: HalfSpaceTrees
Run 1: AP = 0.2723, AUC = 0.3824, Time = 0.57s
Run 2: AP = 0.2723, AUC = 0.3824, Time = 0.79s
Run 3: AP = 0.2723, AUC = 0.3824, Time = 0.56s
Run 4: AP = 0.2723, AUC = 0.3824, Time = 0.56s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.2723 ± 0.0000 (95% CI)
AUC (sklearn): 0.3824 ± 0.0000 (95% CI)
AUC (paper): 0.3824 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.3824 ± 0.0000 (95% CI)
Time: 0.62 ± 0.11 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.3516, AUC = 0.5000, Time = 5.81s
Run 2: AP = 0.3516, AUC = 0.5000, Time = 5.81s
Run 3: AP = 0.3516, AUC = 0.5000, Time = 5.79s
Run 4: AP = 0.3516, AUC = 0.5000, Time = 5.81s
Checkpoint saved for model Autoencoder
Summary for Autoencoder:
AP: 0.3516 ± 0.0000 (95% CI)
AUC (sklearn): 0.5000 ± 0.0000 (95% CI)
AUC (paper): 0.5000 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.5000 ± 0.0000 (95% CI)
Time: 5.80 ± 0.01 seconds (95% CI)
Running model: OnlineIsolationForest
Run 1: AP = 0.2868, AUC = 0.3956



Run 1: AP = 0.4392, AUC = 0.6622, Time = 0.16s
Run 2: AP = 0.4392, AUC = 0.6622, Time = 0.16s
Run 3: AP = 0.4392, AUC = 0.6622, Time = 0.18s
Run 4: AP = 0.4392, AUC = 0.6622, Time = 0.17s
Run 5: AP = 0.4392, AUC = 0.6622, Time = 0.16s
Run 6: AP = 0.4392, AUC = 0.6622, Time = 0.16s
Run 7: AP = 0.4392, AUC = 0.6622, Time = 0.17s
Run 8: AP = 0.4392, AUC = 0.6622, Time = 0.17s
Run 9: AP = 0.4392, AUC = 0.6622, Time = 0.17s
Run 10: AP = 0.4392, AUC = 0.6622, Time = 0.17s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.4392 ± 0.0000 (95% CI)
AUC (sklearn): 0.6622 ± 0.0000 (95% CI)
AUC (paper): 0.6622 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.6622 ± 0.0000 (95% CI)
Time: 0.17 ± 0.00 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.1574, AUC = 0.1241, Time = 1.56s
Run 2: AP = 0.1574, AUC = 0.1241, Time = 1.56s
Run 3: AP = 0.1574, AUC = 0.1241, Time = 1.56s
Run 4: AP = 0.1574, AUC = 0.1241, Time = 1.56s
Run 5: AP = 0.1574, AUC = 0.1241, Time = 1.55s
Run 6: AP = 0.1



Run 1: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Run 2: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Run 3: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Run 4: AP = 0.1835, AUC = 0.9119, Time = 0.45s
Run 5: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Run 6: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Run 7: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Run 8: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Run 9: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Run 10: AP = 0.1835, AUC = 0.9119, Time = 0.33s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.1835 ± 0.0000 (95% CI)
AUC (sklearn): 0.9119 ± 0.0000 (95% CI)
AUC (paper): 0.9119 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.9119 ± 0.0000 (95% CI)
Time: 0.34 ± 0.02 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.1572, AUC = 0.7655, Time = 3.38s
Run 2: AP = 0.1572, AUC = 0.7655, Time = 3.37s
Run 3: AP = 0.1572, AUC = 0.7655, Time = 3.51s
Run 4: AP = 0.1572, AUC = 0.7655, Time = 3.42s
Run 5: AP = 0.1572, AUC = 0.7655, Time = 3.38s
Run 6: AP = 0.1



Run 2: AP = 0.5592, AUC = 0.9789, Time = 0.15s
Run 3: AP = 0.5592, AUC = 0.9789, Time = 0.12s
Run 4: AP = 0.5592, AUC = 0.9789, Time = 0.12s
Run 5: AP = 0.5592, AUC = 0.9789, Time = 0.12s
Run 6: AP = 0.5592, AUC = 0.9789, Time = 0.12s
Run 7: AP = 0.5592, AUC = 0.9789, Time = 0.12s
Run 8: AP = 0.5592, AUC = 0.9789, Time = 0.12s
Run 9: AP = 0.5592, AUC = 0.9789, Time = 0.12s
Run 10: AP = 0.5592, AUC = 0.9789, Time = 0.25s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.5592 ± 0.0000 (95% CI)
AUC (sklearn): 0.9789 ± 0.0000 (95% CI)
AUC (paper): 0.9789 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.9789 ± 0.0000 (95% CI)
Time: 0.14 ± 0.02 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.1035, AUC = 0.8232, Time = 1.14s
Run 2: AP = 0.1035, AUC = 0.8232, Time = 1.14s
Run 3: AP = 0.1035, AUC = 0.8232, Time = 1.14s
Run 4: AP = 0.1035, AUC = 0.8232, Time = 1.15s
Run 5: AP = 0.1035, AUC = 0.8232, Time = 1.14s
Run 6: AP = 0.1035, AUC = 0.8232, Time = 1.14s
Run 7: AP = 0.1



Running model: HalfSpaceTrees
Run 1: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 2: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 3: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 4: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 5: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 6: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 7: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 8: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 9: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Run 10: AP = 0.1168, AUC = 0.5583, Time = 0.24s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.1168 ± 0.0000 (95% CI)
AUC (sklearn): 0.5583 ± 0.0000 (95% CI)
AUC (paper): 0.5583 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.5583 ± 0.0000 (95% CI)
Time: 0.24 ± 0.00 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.0921, AUC = 0.5000, Time = 2.82s
Run 2: AP = 0.0921, AUC = 0.5000, Time = 2.83s
Run 3: AP = 0.0921, AUC = 0.5000, Time = 2.83s
Run 4: AP = 0.0921, AUC = 0.5000, Time = 2.81s
Run 5: AP = 0.0921, AUC = 0.5000



Running model: HalfSpaceTrees
Run 1: AP = 0.0207, AUC = 0.0258, Time = 0.11s
Run 2: AP = 0.0207, AUC = 0.0258, Time = 0.11s
Run 3: AP = 0.0207, AUC = 0.0258, Time = 0.11s
Run 4: AP = 0.0207, AUC = 0.0258, Time = 0.11s
Run 5: AP = 0.0207, AUC = 0.0258, Time = 0.24s
Checkpoint saved for model HalfSpaceTrees
Summary for HalfSpaceTrees:
AP: 0.0207 ± 0.0000 (95% CI)
AUC (sklearn): 0.0258 ± 0.0000 (95% CI)
AUC (paper): 0.0258 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.0258 ± 0.0000 (95% CI)
Time: 0.14 ± 0.05 seconds (95% CI)
Running model: Autoencoder
Run 1: AP = 0.0317, AUC = 0.5000, Time = 1.29s
Run 2: AP = 0.0317, AUC = 0.5000, Time = 1.29s
Run 3: AP = 0.0317, AUC = 0.5000, Time = 1.29s
Run 4: AP = 0.0317, AUC = 0.5000, Time = 1.30s
Run 5: AP = 0.0317, AUC = 0.5000, Time = 1.29s
Checkpoint saved for model Autoencoder
Summary for Autoencoder:
AP: 0.0317 ± 0.0000 (95% CI)
AUC (sklearn): 0.5000 ± 0.0000 (95% CI)
AUC (paper): 0.5000 ± 0.0000 (95% CI)
AUC (CapyMOA): 0.5000 ± 0.0000 (95% CI)
Time: 1.29

In [None]:
# Save all results to a final CSV
results_df = pd.DataFrame(all_results)
results_df.to_csv("all_run_results.csv", index=False)
print("Final results saved to 'all_run_results.csv'")
