# Streaming use case

Requirements:
- Instances can only be accessed as they arrive (the full dataset is never available at once)
- Limited memory
- Fast processing time
- Approximation algorithms (e.g. sketches)

Streaming library for anomaly detection: https://pysad.readthedocs.io/en/latest/api.html#module-pysad.core

Paper: https://arxiv.org/pdf/2009.02572

In [1]:
import sys
sys.path.append('../src')
import evaluation_utils, data_utils

from pysad.models import xStream, LODA
from pysad.utils import ArrayStreamer
from pysad.transform.postprocessing import RunningAveragePostprocessor
from pysad.transform.preprocessing import InstanceUnitNormScaler
from pysad.transform.ensemble import AverageScoreEnsembler
from tqdm import tqdm
import numpy as np
import warnings

# Re-create the `np.float` / `np.int` aliases as the explicit 64-bit types.
# NOTE(review): presumably required because a dependency still refers to these
# aliases, which recent NumPy releases no longer provide -- confirm.
np.float, np.int = np.float64, np.int64

# Suppress DeprecationWarning messages from here on.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)


In [2]:
# Load the dataset; the result is unpacked into two names.
# NOTE(review): presumably (features, labels) -- confirm against data_utils.get_data.
dataset_path = '../data/6_cardio.npz'
X_all, y_all = data_utils.get_data(dataset_path)

In [3]:

# Streamer that replays the in-memory arrays one instance at a time, unshuffled.
iterator = ArrayStreamer(shuffle=False)

# NOTE(review): `preprocessor` and `postprocessor` are instantiated here but are
# never applied anywhere in the loop below -- confirm whether they were meant to
# be part of the pipeline.
preprocessor = InstanceUnitNormScaler()
postprocessor = RunningAveragePostprocessor(window_size=5)

# Detectors whose per-instance scores are combined by the ensembler.
models = [xStream(), LODA()]
ensembler = AverageScoreEnsembler()

# One ensembled score is collected per streamed instance.
y_pred = []
for X, _label in tqdm(iterator.iter(X_all, y_all)):
    per_model = np.empty(len(models), dtype=np.float64)

    # Each detector is first updated with the instance and then scores that
    # same instance.
    for idx, detector in enumerate(models):
        detector.fit_partial(X)
        per_model[idx] = detector.score_partial(X)

    # Update the ensembler with the per-model scores and keep its combined score.
    combined = ensembler.fit_transform_partial(per_model)
    y_pred.append(combined)

1831it [04:05,  7.46it/s]


# Evaluation

In [None]:
# Flatten the collected scores into a 1-D array before evaluation.
flat_scores = np.array(y_pred).reshape(-1)

# Evaluate the scores against `y_all` with point adjustment enabled and print
# whatever the project's evaluation helper returns.
evaluation_report = evaluation_utils.run_evaluation(
    y_all,
    flat_scores,
    do_point_adjustment=True,
)
print(evaluation_report)