In [1]:
from capymoa.datasets import ElectricityTiny
from capymoa.anomaly import HalfSpaceTrees, TreeBasedUnsupervised, Autoencoder, StreamRHF
from capymoa.evaluation import AnomalyDetectionEvaluator
from capymoa.stream import stream_from_file
import pandas as pd
import gzip
import os

In [2]:
from capymoa.anomaly import StreamRHFParallel

### Convert dataset from paper to .csv file

In [3]:
# Variables
path_to_dataset = r"C:\Users\aleja\OneDrive - Universidad Nacional de Colombia\Documentos\Institut Polytechnique de Paris\courses\P1\Data Streaming\project\actual code\datasets\forStefan\data\public"
dataset_name = "abalone"

# Construct input and output paths
input_path = os.path.join(path_to_dataset, f"{dataset_name}.gz")
output_path = os.path.join(path_to_dataset, f"{dataset_name}.csv")

# Read the gzipped file and save it as a CSV
with gzip.open(input_path, 'rt') as gz_file:
    df = pd.read_csv(gz_file)
    df.to_csv(output_path, index=False)

print(f"CSV saved to: {output_path}")

CSV saved to: C:\Users\aleja\OneDrive - Universidad Nacional de Colombia\Documentos\Institut Polytechnique de Paris\courses\P1\Data Streaming\project\actual code\datasets\forStefan\data\public\abalone.csv


### Create stream from .csv file

In [10]:
stream = stream_from_file(output_path, dataset_name="Abalone")
schema = stream.get_schema()
instance = stream.next_instance()
actual_value = instance.x
label = instance.y_index
print(actual_value)
print(label)
print(schema.get_num_classes())
print(schema.get_num_attributes())
stream.restart()

[0.53   0.42   0.135  0.677  0.2565 0.1415 0.21  ]
0
2
7




In [13]:
import time
schema = stream.get_schema()
learner = StreamRHF(schema, num_trees=100, max_height=10)
evaluator = AnomalyDetectionEvaluator(schema)
start_time = time.time()
processed_instances = 0
#while stream.has_more_instances():
while processed_instances < 1000:
    instance = stream.next_instance()
    proba = learner.score_instance(instance)
    if proba > 1:
        print('probability bigger than 1 for instance ', instance.x)
        print(proba)
    #print(proba)
    #score – The predicted scores. Should be in the range [0, 1].
    evaluator.update(instance.y_index, proba)
    learner.train(instance)
    processed_instances += 1
    # Print progress and elapsed time every 100 instances
    if processed_instances % 100 == 0:
        elapsed_time = time.time() - start_time
        print(f"Processed {processed_instances} instances. Time spent: {elapsed_time:.2f} seconds.")
        start_time = time.time()  # Reset start time for the next 100 instances
auc = evaluator.auc()
print(f"AUC: {auc:.2f} with StreamRHF")
stream.restart()


our StreamRHF initialized
20
Processed 100 instances. Time spent: 109.63 seconds.
Processed 200 instances. Time spent: 130.64 seconds.
Processed 300 instances. Time spent: 129.74 seconds.
Processed 400 instances. Time spent: 132.18 seconds.
Processed 500 instances. Time spent: 121.18 seconds.
Processed 600 instances. Time spent: 106.81 seconds.
Processed 700 instances. Time spent: 104.51 seconds.


KeyboardInterrupt: 

In [11]:
schema = stream.get_schema()
learner = Autoencoder(schema)
evaluator = AnomalyDetectionEvaluator(schema)
while stream.has_more_instances():
    instance = stream.next_instance()
    #print('x ' + str(instance.x))
    #print("type: ", type(instance.x))
    #print('label ' + str(instance.y_index))
    #print("type: ", type(instance.y_index))
    proba = learner.score_instance(instance)
    #print(proba)
    #score – The predicted scores. Should be in the range [0, 1].
    evaluator.update(instance.y_index, proba)
    learner.train(instance)
auc = evaluator.auc()
print(f"AUC: {auc:.2f} with Autoencoder")
stream.restart()

AUC: 0.85 with Autoencoder


In [9]:
stream = ElectricityTiny()
stream.restart()
schema = stream.get_schema()
learner = Autoencoder(schema)
evaluator = AnomalyDetectionEvaluator(schema)
#while stream.has_more_instances():
while processed_instances < 1000:    
    instance = stream.next_instance()
    proba = learner.score_instance(instance)
    if proba > 1:
        print('probability bigger than 1 for instance ', instance.x)
        print(proba)
    #score – The predicted scores. Should be in the range [0, 1].
    evaluator.update(instance.y_index, proba)
    learner.train(instance)
auc = evaluator.auc()
print(f"AUC: {auc:.2f}")

java.lang.NullPointerException: java.lang.NullPointerException: Cannot invoke "moa.evaluation.BasicAUCImbalancedPerformanceEvaluator$Estimator.getAUC()" because "this.aucEstimator" is null

In [7]:
stream = ElectricityTiny()
schema = stream.get_schema()
learner = StreamRHF(schema)
evaluator = AnomalyDetectionEvaluator(schema)
start_time = time.time()
processed_instances = 0
#while stream.has_more_instances():
while processed_instances < 100:
    instance = stream.next_instance()
    proba = learner.score_instance(instance)
    if proba > 1:
        print('probability bigger than 1 for instance ', instance.x)
        print(proba)
    #score – The predicted scores. Should be in the range [0, 1].
    evaluator.update(instance.y_index, proba)
    learner.train(instance)
    processed_instances += 1
    # Print progress and elapsed time every 100 instances
    if processed_instances % 100 == 0:
        elapsed_time = time.time() - start_time
        print(f"Processed {processed_instances} instances. Time spent: {elapsed_time:.2f} seconds.")
        start_time = time.time()  # Reset start time for the next 100 instances
auc = evaluator.auc()
print(f"AUC: {auc:.2f} with StreamRHF")
stream.restart()

our StreamRHF initialized
probability bigger than 1 for instance  [0.170213 0.041161 0.13493  0.003467 0.422915 0.414912]
1.5532632506624904
probability bigger than 1 for instance  [0.191489 0.041161 0.140583 0.003467 0.422915 0.414912]
1.979999999929308
probability bigger than 1 for instance  [0.212766 0.044374 0.168997 0.003467 0.422915 0.414912]
1.2929100772106497
probability bigger than 1 for instance  [0.361702 0.040711 0.493306 0.003467 0.422915 0.414912]
1.0452681454222659
probability bigger than 1 for instance  [0.382979 0.041041 0.53258  0.003467 0.422915 0.414912]
1.2386234337800872
probability bigger than 1 for instance  [0.404255 0.041161 0.546415 0.003467 0.422915 0.414912]
1.4093322616094195
Processed 100 instances. Time spent: 17.63 seconds.
AUC: 0.74 with StreamRHF


In [8]:
print("ElectricityTiny type: ", type(stream))
print(schema)

ElectricityTiny type:  <class 'capymoa.datasets._datasets.ElectricityTiny'>
@relation electricity

@attribute period numeric
@attribute nswprice numeric
@attribute nswdemand numeric
@attribute vicprice numeric
@attribute vicdemand numeric
@attribute transfer numeric
@attribute class {0,1}

@data


In [12]:
# Define the function to evaluate a given learner on the stream
def evaluate_anomaly_learner(stream, learner, label):
    schema = stream.get_schema()
    evaluator = AnomalyDetectionEvaluator(schema)
    while stream.has_more_instances():
        instance = stream.next_instance()
        proba = learner.score_instance(instance)
        evaluator.update(instance.y_index, proba)
        learner.train(instance)
    auc = evaluator.auc()
    print(f"{label}: AUC = {auc:.2f}")
    return {
        "learner": label,
        "auc": auc,
        "evaluator": evaluator,
    }

# Stream setup
stream = Electricity()

# Define configurations of TreeBasedUnsupervised to evaluate
learners = [
    {"label": "Default Configuration", "params": {}},
    {"label": "More Trees", "params": {"num_trees": 100}},
    {"label": "Deeper Trees", "params": {"max_height": 1000}},
    {"label": "Smaller Window", "params": {"window_size": 50}},
]

# Evaluate all configurations
results = []
for config in learners:
    stream = Electricity()  # Reset the stream for each learner
    learner = TreeBasedUnsupervised(schema=stream.get_schema(), **config["params"])
    result = evaluate_anomaly_learner(stream, learner, config["label"])
    results.append(result)

# Print summary of all results
print("\nSummary of AUC Scores:")
for result in results:
    print(f"{result['learner']}: {result['auc']:.2f}")

# Optional: Access metrics for further analysis or visualization
for result in results:
    print(f"\nMetrics for {result['learner']}:")
    print(result["evaluator"].metrics())


Default Configuration: AUC = 0.50
More Trees: AUC = 0.50
Deeper Trees: AUC = 0.50
Smaller Window: AUC = 0.50

Summary of AUC Scores:
Default Configuration: 0.50
More Trees: 0.50
Deeper Trees: 0.50
Smaller Window: 0.50

Metrics for Default Configuration:
[45312.0, 0.5, -0.5, 0.5754546257062146, 0.0, 1.3554608306908562, 0.0, 1.0, -0.0002079758747985337]

Metrics for More Trees:
[45312.0, 0.5, -0.5, 0.5754546257062146, 0.0, 1.3554608306908562, 0.0, 1.0, -0.0002079758747985337]

Metrics for Deeper Trees:
[45312.0, 0.5, -0.5, 0.5754546257062146, 0.0, 1.3554608306908562, 0.0, 1.0, -0.0002079758747985337]

Metrics for Smaller Window:
[45312.0, 0.5, -0.5, 0.5754546257062146, 0.0, 1.3554608306908562, 0.0, 1.0, -0.0002079758747985337]
