In [1]:
import warnings
warnings.filterwarnings('ignore')

import time
import math
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets

from micoes import ClustreamExplainer
from micoes import DenstreamExplainer

from micoes.explainer.common import map_feature_scores

from micoes.microclusters.clustream import CluStream
from micoes.microclusters.clustream import update_clu_microcluster
from micoes.microclusters.denstream import DenStream
from micoes.microclusters.denstream import update_den_microcluster

import logging
# Configure the root logger: show INFO and above, printing only the message text.
logging.basicConfig(format='%(message)s', level=logging.INFO)


# NOTE(review): two matplotlib backend magics are issued back to back; presumably
# the later `inline` selection is the one that ends up active - confirm and keep
# only the backend that is actually wanted.
%matplotlib notebook
%matplotlib inline

In [2]:
def gen_blob_dataset(n_samples=1500, random_state=8):
    """Generate the synthetic blob dataset used by the demos in this notebook.

    Parameters
    ----------
    n_samples : int, optional
        Number of points to generate; forwarded to ``datasets.make_blobs``.
        Defaults to 1500.
    random_state : int, optional
        Seed forwarded to ``datasets.make_blobs``. Defaults to 8.

    Returns
    -------
    tuple
        ``(data, labels)`` as returned by ``datasets.make_blobs``.
    """
    # The global NumPy RNG is seeded in addition to `random_state`.
    # NOTE(review): presumably kept for downstream code that draws from the
    # global generator - confirm before removing.
    np.random.seed(0)
    data, labels = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
    return data, labels

In [3]:
def test_clustream_explainer():
    """Demo: explain two synthetic points with ``ClustreamExplainer``.

    Steps, all on the dataset returned by ``gen_blob_dataset``:

    1. Take the first ``floor(0.1 * n) + 1`` points, log summary statistics
       for them and use them to initialise a ``CluStream`` model.
    2. Feed the remaining points through ``update_clu_microcluster``.
    3. Build a ``ClustreamExplainer`` over the resulting micro-clusters, with
       ``max_distance`` set to twice the overall standard deviation of the
       initial points.
    4. Explain two points derived from the per-feature maximum (scaled by 3)
       and minimum (scaled by -3) of the data, and log the per-feature scores.

    Returns nothing; every result is emitted through ``logging``.
    """
    data, _ = gen_blob_dataset()
    alpha = 3
    n_attributes = data.shape[1]
    n_microclusters = 12
    tau = 60 * 0.01

    # Split the stream: the leading slice initialises the model, the rest is
    # replayed through the online update below.
    initial = math.floor(0.1 * data.shape[0])
    initial_points = data[:initial + 1]
    overall_std = np.std(initial_points)
    logging.info(f'initial_points average {np.mean(initial_points, axis=0)}')
    logging.info(f'initial_points max {np.max(initial_points, axis=0)}')
    logging.info(f'initial_points std by column {np.std(initial_points, axis=0)}')
    logging.info(f'initial_points std {overall_std}')

    clu = CluStream(n_microclusters, n_attributes, alpha, tau)
    clu.initialize(initial_points)

    update_clu_microcluster(clu, data[initial + 1:])

    max_distance = overall_std * 2
    logging.info(f'max_distance {max_distance}')

    explainer = ClustreamExplainer(clu.microclusters, max_distance, n_features=n_attributes,
                                   regularization='l1', regularization_param=1,
                                   intercept_scaling=1, simple_feature_contribution=True)

    # One label per attribute ('A1', 'A2', ...) so the mapping keeps working
    # if the dataset's dimensionality changes.
    feature_names = tuple(f'A{i + 1}' for i in range(n_attributes))

    outlier = np.max(data, axis=0) * 3
    logging.info(f'outlier is {outlier}')
    feature_scores = explainer.explain_outlier(outlier, round_flag=True, prior_knowledge=1)
    feature_scores_map = map_feature_scores(feature_names, feature_scores)
    # NOTE(review): the literal 'f' before the mapping looks like a typo; the
    # text is left unchanged so it still matches the recorded cell output.
    logging.info(f'feature_scores f{feature_scores_map}')

    outlier = np.min(data, axis=0) * -3
    logging.info(f'outlier is {outlier}')
    feature_scores = explainer.explain_outlier(outlier, round_flag=False, prior_knowledge=1)
    feature_scores_map = map_feature_scores(feature_names, feature_scores)
    logging.info(f'feature_scores map{feature_scores_map}')


test_clustream_explainer()

initial_points average [4.58133511 1.30865514]
initial_points max [10.40777751 11.14310633]
initial_points std by column [5.54342896 7.1671134 ]
initial_points std 6.612576515391461
max_distance 13.225153030782922
outlier is [31.58716726 36.81522639]
feature_scores f{'A1': 0.04908885838137147, 'A2': 0.0}
outlier is [23.77037111 40.77084067]
feature_scores map{'A1': 0.3381123101250965, 'A2': 0.22012857062188063}


In [4]:
def _log_microcluster_counts(den, stage):
    """Log the sizes of ``den``'s o-, p- and c-micro-cluster lists under ``stage``."""
    logging.info(f'{stage}:\n OMC is {len(den.o_microclusters)}\n'
                 f' PMC is {len(den.p_microclusters)}\n CMC is {len(den.c_microclusters)}')


def test_denstream_microcluster():
    """Demo: build DenStream micro-clusters and explain two synthetic points.

    Steps, all on the dataset returned by ``gen_blob_dataset``:

    1. Create a ``DenStream`` model with ``mu`` set to
       ``n_points / (n_distinct_labels * n_attributes)`` and ``eta`` set to the
       overall standard deviation of the data.
    2. Initialise it with the first ``floor(0.1 * n) + 1`` points, then feed
       the remaining points through ``update_den_microcluster``.
    3. Call ``check_microclusters_status(0.001)``, logging the micro-cluster
       counts before and after.
    4. Build a ``DenstreamExplainer`` from the p- and c-micro-clusters and
       explain two points derived from the per-feature maximum (scaled by 3)
       and minimum (scaled by -3) of the data.

    Returns nothing; every result is emitted through ``logging``.
    """
    data, labels = gen_blob_dataset()
    n_attributes = data.shape[1]
    mu = data.shape[0] / (len(np.unique(labels)) * n_attributes)
    logging.info(f'mu is {mu}')
    den = DenStream(lamda=1, mu=mu, beta=0.01, eta=np.std(data), n_attributes=n_attributes)

    # Split the stream: the leading slice initialises the model, the rest is
    # replayed through the online update below.
    initial = math.floor(0.1 * data.shape[0])
    initial_points = data[:initial + 1]
    den.initialize(initial_points)
    _log_microcluster_counts(den, 'Initialization')
    logging.info('Online update')

    update_den_microcluster(den, data[initial + 1:])
    _log_microcluster_counts(den, 'After online update')

    for mc in den.c_microclusters:
        logging.info(f'{mc.n_tpoints}')
    den.check_microclusters_status(0.001)
    _log_microcluster_counts(den, 'After some period of time')

    max_distance = np.std(initial_points) * 2
    logging.info(f'max_distance {max_distance}')

    explainer = DenstreamExplainer(den.p_microclusters, den.c_microclusters,
                                   max_distance, n_features=n_attributes,
                                   regularization='l1', regularization_param=1,
                                   intercept_scaling=1, simple_feature_contribution=True)

    # One label per attribute ('A1', 'A2', ...) so the mapping keeps working
    # if the dataset's dimensionality changes.
    feature_names = tuple(f'A{i + 1}' for i in range(n_attributes))

    # (point to explain, round_flag) pairs, processed in the listed order.
    cases = (
        (np.max(data, axis=0) * 3, True),
        (np.min(data, axis=0) * -3, False),
    )
    for outlier, round_flag in cases:
        logging.info(f'outlier is {outlier}')
        feature_scores = explainer.explain_outlier(outlier, round_flag=round_flag, prior_knowledge=1)
        feature_scores_map = map_feature_scores(feature_names, feature_scores)
        # NOTE(review): the literal 'f' before the mapping looks like a typo;
        # the text is left unchanged so it still matches the recorded output.
        logging.info(f'feature_scores f{feature_scores_map}')


test_denstream_microcluster()

mu is 250.0
Initialization:
 OMC is 2
 PMC is 0
 CMC is 0
Online update
After online update:
 OMC is 2
 PMC is 3
 CMC is 3
250.99573923853978
250.9960348101234
250.9959792839316
After some period of time:
 OMC is 0
 PMC is 3
 CMC is 3
max_distance 13.225153030782922
outlier is [31.58716726 36.81522639]
feature_scores f{'A1': 0.048988586065161176, 'A2': 0.0}
outlier is [23.77037111 40.77084067]
feature_scores f{'A1': 0.2983207954861661, 'A2': 0.19638531698126108}
