In [2]:
import warnings
warnings.filterwarnings('ignore')

import time
import math
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from micoes.experiment.generate_explainer_report import build_temporal_df_microcluster_explainer
from micoes.experiment.generate_explainer_report import build_df_summary_temporal_microcluster_explainer

from micoes.experiment.run_evaluation import FeatureScoresEvaluation
from micoes.experiment.run_evaluation import run_evaluation_on_count_based_window

import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)


%matplotlib notebook
%matplotlib inline


In [3]:
def run_wine_evaluation_on_count_based_window():
    """Evaluate the 'clu' and 'den' explainers on the wine stream.

    Runs ``run_evaluation_on_count_based_window`` on ``../data/odds/wine.csv``
    with the ``wine_60`` ground-truth pickle, then builds one summary per
    explainer type for the single ``'wine'`` stream and attaches the matched
    percentages reported by the evaluation.

    Returns:
        tuple: ``(df_clu_summary, df_den_summary)``.
    """
    csv_path = '../data/odds/wine.csv'
    # The CSV is loaded here only for its column count (see feature_counts).
    wine_frame = pd.read_csv(csv_path)
    gt_path = '../groundtruth/pickles/wine_60.pickle'

    evaluation = run_evaluation_on_count_based_window(
        csv_path, gt_path,
        window_size=60, init_percentage=10,
        alpha=3, precise=0.01, lamda=1)
    detection, clu_results, den_results, matched = evaluation

    # One column is excluded from the feature count -- presumably the label
    # column (TODO confirm against the dataset layout).
    feature_counts = (wine_frame.shape[1] - 1,)

    def summarize(explainer_results, explainer_type, matched_column, matched_key):
        # Build the temporal frame, summarise it, then add the matched columns.
        temporal = build_temporal_df_microcluster_explainer(
            detection, explainer_results, explainer_type=explainer_type)
        summary = build_df_summary_temporal_microcluster_explainer(
            stream_names=('wine',),
            dfs=(temporal,),
            explainer_type=explainer_type,
            n_features=feature_counts)
        summary[matched_column] = matched[matched_key]
        summary['coin_matched'] = matched['coin_matched_percentage']
        return summary

    clu_summary = summarize(clu_results, 'clu', 'clu_matched', 'clu_matched_percentage')
    den_summary = summarize(den_results, 'den', 'den_matched', 'den_matched_percentage')
    return clu_summary, den_summary

In [4]:
def run_evaluation_on_a_stream(filepath, groundtruth_path, window_size=60,
                               init_percentage=10, alpha=3, precise=0.01, lamda=1):
    """Run the count-based-window evaluation on one stream.

    The evaluation settings used to be hard-coded; they are now keyword
    parameters whose defaults are the previous constants, so existing
    two-argument calls behave exactly as before.

    Args:
        filepath: Path of the stream CSV, passed to
            ``run_evaluation_on_count_based_window``.
        groundtruth_path: Path of the ground-truth pickle, passed to
            ``run_evaluation_on_count_based_window``.
        window_size, init_percentage, alpha, precise, lamda: Forwarded
            unchanged as keyword arguments to
            ``run_evaluation_on_count_based_window``; see that function for
            their meaning.

    Returns:
        tuple: ``(df_clu, df_den, matched_result)`` -- the temporal frames
        built for the 'clu' and 'den' explainers, plus the matched-result
        object returned by the evaluation.
    """
    detection_results, cluexplainer_results, denexplainer_results, matched_result = (
        run_evaluation_on_count_based_window(
            filepath, groundtruth_path,
            window_size=window_size, init_percentage=init_percentage,
            alpha=alpha, precise=precise, lamda=lamda))

    # Both explainers are reported against the same detection results.
    df_clu = build_temporal_df_microcluster_explainer(detection_results, cluexplainer_results, explainer_type='clu')
    df_den = build_temporal_df_microcluster_explainer(detection_results, denexplainer_results, explainer_type='den')
    return df_clu, df_den, matched_result

def run_all_streams_window_60(streams=('wine', 'mammography', 'shuttle', 'smtp', 'http'),
                              folder='../data/odds/',
                              gt_folder='../groundtruth/pickles/'):
    """Evaluate several streams and summarise both explainers.

    For every stream name the CSV ``{folder}{stream}.csv`` is evaluated with
    ``run_evaluation_on_a_stream`` against ``{gt_folder}{stream}_60.pickle``.
    The per-stream results are then combined into one summary frame per
    explainer type ('clu' and 'den'), each extended with the matched
    percentages reported by the evaluation.

    Args:
        streams: Iterable of stream names. Defaults to the five streams that
            were previously hard-coded.
        folder: Directory prefix of the stream CSVs. It is concatenated
            directly with the file name, so it must end with a path separator.
        gt_folder: Directory prefix of the ground-truth pickles; same
            trailing-separator requirement as ``folder``.

    Returns:
        tuple: ``(df_clu_summary, df_den_summary)``.
    """
    # Materialise once: the names are iterated here and also handed to the
    # summary builder, which previously always received a tuple.
    streams = tuple(streams)

    dfs_clu, dfs_den = [], []
    matched_clu, matched_den, matched_coin = [], [], []
    n_features = []
    for stream in streams:
        filepath = f'{folder}{stream}.csv'
        groundtruth_path = f'{gt_folder}{stream}_60.pickle'

        # Only the column count is needed, so read just the header row rather
        # than parsing the whole file.
        n_columns = pd.read_csv(filepath, nrows=0).shape[1]

        df_clu, df_den, matched_result = run_evaluation_on_a_stream(filepath, groundtruth_path)
        dfs_clu.append(df_clu)
        dfs_den.append(df_den)
        matched_clu.append(matched_result['clu_matched_percentage'])
        matched_den.append(matched_result['den_matched_percentage'])
        matched_coin.append(matched_result['coin_matched_percentage'])
        # One column is excluded from the feature count -- presumably the
        # label column (TODO confirm against the dataset layout).
        n_features.append(n_columns - 1)

    df_clu_summary = build_df_summary_temporal_microcluster_explainer(
        stream_names=streams,
        dfs=dfs_clu,
        explainer_type='clu',
        n_features=n_features)
    df_clu_summary['clu_matched'] = matched_clu
    df_clu_summary['coin_matched'] = matched_coin

    df_den_summary = build_df_summary_temporal_microcluster_explainer(
        stream_names=streams,
        dfs=dfs_den,
        explainer_type='den',
        n_features=n_features)
    df_den_summary['den_matched'] = matched_den
    df_den_summary['coin_matched'] = matched_coin
    return df_clu_summary, df_den_summary

In [5]:
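# Full evaluation over all five streams. Presumably left commented out because the
# run is slow; the cached pickles are loaded further down instead. Uncomment to regenerate.
# df_clu_summary, df_den_summary = run_all_streams_window_60()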

In [6]:
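# Uncomment (after re-running the evaluation above) to overwrite the cached summaries
# that the next cell loads.
# df_clu_summary.to_pickle('evaluation_results/clu_5streams.pickle')
# df_den_summary.to_pickle('evaluation_results/den_5streams.pickle')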

In [7]:
# Load the previously saved five-stream summaries instead of recomputing them.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by this project.
df_clu_summary = pd.read_pickle('evaluation_results/clu_5streams.pickle')
df_den_summary = pd.read_pickle('evaluation_results/den_5streams.pickle')

In [8]:
# Render floats with two decimals in DataFrame output. This is a global pandas
# option, so it also applies to every DataFrame displayed by later cells.
pd.set_option('display.float_format','{:.2f}'.format)
# Last expression of the cell: displays the 'clu' explainer summary table.
df_clu_summary

Unnamed: 0,stream_name,n_features,total_data,total_outliers,total_detection_duration,total_microcluster_duration,total_explanation_duration,total_clu_coin_explainer_duration,total_coin_duration,ratio_microcluster_explainer_over_coin,clu_matched,coin_matched
0,wine,13,129,16,0.0,0.02,0.1,0.11,0.57,0.2,0.62,0.0
1,mammography,6,11183,1714,0.22,2.02,8.84,10.87,75.01,0.14,0.8,0.75
2,shuttle,9,49097,11020,0.99,11.95,100.21,112.17,493.46,0.23,0.48,0.56
3,smtp,3,95156,13561,1.6,13.58,129.09,142.67,523.0,0.27,0.83,0.88
4,http,3,567498,103323,10.59,135.66,1150.92,1286.6,4392.27,0.29,0.99,0.72


In [8]:
# Display the 'den' explainer summary table. Floats show two decimals because
# of the pandas display option set in the preceding cell.
df_den_summary

Unnamed: 0,stream_name,n_features,total_data,total_outliers,total_detection_duration,total_microcluster_duration,total_explanation_duration,total_den_coin_explainer_duration,total_coin_duration,ratio_microcluster_explainer_over_coin,den_matched,coin_matched
0,wine,13,129,16,0.0,0.01,0.1,0.11,0.57,0.19,0.62,0.0
1,mammography,6,11183,1714,0.22,6.44,10.27,16.7,75.01,0.22,0.67,0.75
2,shuttle,9,49097,11020,0.99,3.04,74.35,77.39,493.46,0.16,0.7,0.56
3,smtp,3,95156,13561,1.6,4.35,63.62,67.97,523.0,0.13,1.0,0.88
4,http,3,567498,103323,10.59,27.43,562.67,590.13,4392.27,0.13,0.99,0.72
