In [1]:
import pandas as pd
import os
import pickle

In [2]:
def exos_running_time_df(results):
    """
    Build a per-window table of running times from a results dictionary.

    Inputs
    ------
    results : dict
        must contain an 'output' entry mapping window id -> window record.
        Each window record holds an 'est_time' entry; every other key of the
        record is treated as a stream id whose value is a dict with
        'temporal_neighbor_time' and 'out_attrs_time'.

    Returns
    -------
    pandas.DataFrame
        one row per window with columns 'windows', 'est_times', 'neigh_times'
        and 'out_attrs_times'; the last two are the maximum over all streams
        of that window.

    `results` is not modified, so the function can be called repeatedly on the
    same object. A window without any stream record raises ValueError (max of
    an empty list).
    """
    output = results['output']
    # materialise the ids so the column holds a plain list, not a dict view
    windows = list(output.keys())
    est_times = list()
    neigh_times = list()
    out_attrs_times = list()
    for window in windows:
        window_record = output[window]
        est_times.append(window_record['est_time'])
        # 'est_time' lives next to the per-stream records: skip it instead of
        # deleting it, which would corrupt the caller's results
        stream_records = [record for key, record in window_record.items()
                          if key != 'est_time']
        neigh_times.append(max([record['temporal_neighbor_time'] for record in stream_records]))
        out_attrs_times.append(max([record['out_attrs_time'] for record in stream_records]))
    df_dict = {'windows': windows,
               'est_times': est_times,
               'neigh_times': neigh_times,
               'out_attrs_times': out_attrs_times}
    return pd.DataFrame.from_dict(df_dict)

In [3]:
def unpickled_results(filename):
    """
    Load and return the object stored in a pickle file.

    Inputs
    ------
    filename : str
        path of the pickle file to read

    Returns
    -------
    the unpickled object

    NOTE: pickle.load can execute arbitrary code while loading; only use this
    on files that come from a trusted source.
    """
    # the context manager closes the file even when unpickling fails
    with open(filename, 'rb') as exos_file:
        return pickle.load(exos_file)

In [4]:
def match_attribute_based_on_contribution_score(outlying_attributes, ground_truth):
    """
    Check whether the top-ranked outlying attribute is part of the ground truth.

    Inputs
    ------
    outlying_attributes: dict
        key-value pair, key=attribute's name, value=attribute's contribution score
        outlying attributes are ordered by their contribution scores from the highest to the lowest
    ground_truth : list
        list of outlying attributes

    Returns
    -------
    bool
        True when the first key of `outlying_attributes` is in `ground_truth`,
        False otherwise. An empty (or None) `outlying_attributes` has no
        top-ranked attribute and therefore yields False.
    """
    if not outlying_attributes:
        return False
    # the dict is assumed to be ordered by score, so its first key is the top attribute
    high_score_attr = next(iter(outlying_attributes))
    return high_score_attr in ground_truth

In [5]:
def compute_performance(gt_folder, gt_filename, result_folder, result_filename,
                     n_streams, window_size, score_precision = 0.1):
    """
    Count the detected outliers whose top-ranked outlying attribute appears in the ground truth.

    Inputs
    ------
    gt_folder, gt_filename : str
        ground truth of stream i is read from '{gt_folder}/{i}_{gt_filename}';
        it must provide the columns 'label' and 'outlying_attributes'
    result_folder, result_filename : str
        location of the pickled results
    n_streams : int
        number of streams, stream ids are 0 .. n_streams-1
    window_size : int
        number of ground truth rows covered by one window
    score_precision : float
        accepted for the caller's convenience, not used by this function

    Returns
    -------
    (matched, n_outliers, simulator_time)
        matched        : list holding one True per outlier whose top attribute matched
        n_outliers     : total number of outlier indices seen
        simulator_time : results['simulator_time']
    """
    results = unpickled_results(f'{result_folder}/{result_filename}')
    output = results['output']
    window_ids = tuple(output.keys())  # (window_0, window_1, ...)
    n_outliers = 0
    hits = []
    for stream in range(n_streams):
        truth = pd.read_pickle(f'{gt_folder}/{stream}_{gt_filename}')
        truth = truth[['label', 'outlying_attributes']]
        for position, window_id in enumerate(window_ids):
            stream_result = output[window_id][stream]
            outlier_indices = stream_result['outlier_indices']
            if outlier_indices is None:
                continue
            # NOTE(review): indexed a second time by the stream id -- presumably
            # 'outlier_indices' maps stream id -> positions; confirm against the producer
            outlier_indices = outlier_indices[stream]
            # rows of this window, re-indexed from 0 so positions are window-local
            window_truth = truth.iloc[position*window_size:(position+1)*window_size].reset_index(drop=True)
            n_outliers += len(outlier_indices)
            outlier_truth = window_truth.iloc[outlier_indices].reset_index(drop=True)
            predicted = stream_result['out_attrs']
            for row_no, row in outlier_truth.iterrows():
                hits.append(match_attribute_based_on_contribution_score(predicted[row_no],
                                                                        row['outlying_attributes']))
    # keep only the successful matches
    matched = [True] * hits.count(True)
    return matched, n_outliers, results['simulator_time']

In [6]:
def recap_performance_info(rel_path =  'pickles/nstreams', 
                           n_streams=(5,10,15),
                           bname = 'k1_100K_Case1'):
    """
    Average the stored accuracy and running time for each number of streams.

    Inputs
    ------
    rel_path : str
        folder, relative to the current working directory, that contains one
        sub-folder per number of streams
    n_streams : iterable of int
        numbers of streams to summarise; for each value n the file
        '{rel_path}/{n}/performance_{n}_{bname}.pkl' is read and must provide
        the columns 'accuracy' and 'running_time'
    bname : str
        base name shared by the performance files

    Returns
    -------
    pandas.DataFrame
        columns 'nstreams', 'avg_accuracy' and 'avg_running_time', one row per
        entry of n_streams; the frame is also pickled to
        '{rel_path}/avg_performance_{bname}.pkl'
    """
    base_dir = os.getcwd()
    summary_rows = []
    for count in n_streams:
        source = os.path.join(base_dir, f'{rel_path}/{count}/performance_{count}_{bname}.pkl')
        perf = pd.read_pickle(source)
        accuracy_mean = perf['accuracy'].mean()
        running_time_mean = perf['running_time'].mean()
        summary_rows.append((count, accuracy_mean, running_time_mean))
    summary = pd.DataFrame({'nstreams' : [row[0] for row in summary_rows],
                            'avg_accuracy' : [row[1] for row in summary_rows],
                            'avg_running_time' : [row[2] for row in summary_rows]})
    summary.to_pickle(f'{rel_path}/avg_performance_{bname}.pkl')
    return summary
# Summarise the stored performance files for 5 to 30 streams; the returned
# frame is the cell's displayed output.
recap_performance_info(
    rel_path='pickles/nstreams',
    n_streams=(5, 10, 15, 20, 25, 30),
    bname='k1_100K_Case1',
)

Unnamed: 0,nstreams,avg_accuracy,avg_running_time
0,5,73.442,36.022518
1,10,76.211,57.036936
2,15,79.982667,77.151869
3,20,81.844,108.403863
4,25,81.9476,132.502229
5,30,82.998667,161.924292


In [7]:
def get_confusion_matrix(out_attrs, ground_truth, total_attributes):
    """
    Compare the reported outlying attributes with the ground truth.

    Inputs
    ------
    out_attrs : iterable
        reported outlying attributes (for a dict, its keys are used)
    ground_truth : iterable
        true outlying attributes
    total_attributes : int
        total number of attributes, used to derive the true negatives

    Returns
    -------
    dict
        counts under the keys 'TP', 'FP', 'FN' and 'TN'
    """
    # build each set once: cheaper, and correct for one-shot iterables
    # (generators / iterators) that can only be consumed a single time
    predicted = set(out_attrs)
    actual = set(ground_truth)
    TP = len(predicted & actual)
    FP = len(predicted - actual)
    FN = len(actual - predicted)
    # every attribute that is neither reported nor in the ground truth
    TN = total_attributes - (TP + FP + FN)
    confusion_matrix = {'TP' : TP,
                        'FP' : FP,
                        'FN' : FN,
                        'TN' : TN}
    return confusion_matrix

def compute_precision(confusion_matrix):
    """
    Precision = TP / (TP + FP).

    Inputs
    ------
    confusion_matrix : dict
        must provide the keys 'TP' and 'FP'

    Returns
    -------
    float
        the precision, or 0.0 when nothing was reported (TP + FP == 0), which
        mirrors the zero guard of compute_f1_score instead of raising
        ZeroDivisionError
    """
    reported = confusion_matrix['TP'] + confusion_matrix['FP']
    if reported == 0:
        return 0.0
    precision = confusion_matrix['TP'] / reported
    return precision

def compute_recall(confusion_matrix):
    """
    Recall = TP / (TP + FN).

    Inputs
    ------
    confusion_matrix : dict
        must provide the keys 'TP' and 'FN'

    Returns
    -------
    float
        the recall, or 0.0 when the ground truth is empty (TP + FN == 0),
        which mirrors the zero guard of compute_f1_score instead of raising
        ZeroDivisionError
    """
    relevant = confusion_matrix['TP'] + confusion_matrix['FN']
    if relevant == 0:
        return 0.0
    recall = confusion_matrix['TP'] / relevant
    return recall

def compute_f1_score(precision, recall):
    """
    Harmonic mean of precision and recall.

    Returns 0 when precision + recall is 0, so the division is never
    attempted with a zero denominator.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0
    return (2 * precision * recall ) / denominator

In [8]:
def compute_performance_v2(gt_folder, gt_filename, result_folder, result_filename,
                           n_streams, window_size, non_data_attr=2):
    """
    Compute precision, recall and F1 score for every detected outlier of every stream.

    Inputs
    ------
    gt_folder, gt_filename : str
        ground truth of stream i is read from '{gt_folder}/{i}_{gt_filename}';
        it must provide the columns 'label' and 'outlying_attributes'
    result_folder, result_filename : str
        location of the pickled results
    n_streams : int
        number of streams, stream ids are 0 .. n_streams-1
    window_size : int
        number of ground truth rows covered by one window
    non_data_attr : int
        number of ground truth columns that are not data attributes; it is
        subtracted from the column count to get the total number of attributes

    Returns
    -------
    (n_outliers, accuracies, simulator_time)
        n_outliers     : total number of outlier indices seen
        accuracies     : dict, stream id -> {'precision': list, 'recall': list, 'f1_score': list}
        simulator_time : results['simulator_time']
    """
    results = unpickled_results(f'{result_folder}/{result_filename}')
    output = results['output']
    window_ids = tuple(output.keys())  # (window_0, window_1, ...)
    n_outliers = 0
    accuracies = {}
    for stream in range(n_streams):
        precisions, recalls, f1_scores = [], [], []
        truth = pd.read_pickle(f'{gt_folder}/{stream}_{gt_filename}')
        # count the attributes before the frame is narrowed to two columns
        n_attributes = truth.shape[1] - non_data_attr
        truth = truth[['label', 'outlying_attributes']]
        for position, window_id in enumerate(window_ids):
            stream_result = output[window_id][stream]
            outlier_indices = stream_result['outlier_indices']
            if outlier_indices is None:
                continue
            # NOTE(review): indexed a second time by the stream id -- presumably
            # 'outlier_indices' maps stream id -> positions; confirm against the producer
            outlier_indices = outlier_indices[stream]
            # rows of this window, re-indexed from 0 so positions are window-local
            window_truth = truth.iloc[position*window_size:(position+1)*window_size].reset_index(drop=True)
            n_outliers += len(outlier_indices)
            outlier_truth = window_truth.iloc[outlier_indices].reset_index(drop=True)
            predicted = stream_result['out_attrs']
            for row_no, row in outlier_truth.iterrows():
                matrix = get_confusion_matrix(predicted[row_no],
                                              row['outlying_attributes'],
                                              n_attributes)
                precision = compute_precision(matrix)
                recall = compute_recall(matrix)
                f1_score = compute_f1_score(precision, recall)
                precisions.append(precision)
                recalls.append(recall)
                f1_scores.append(f1_score)
        accuracies[stream] = {'precision' : precisions,
                              'recall' : recalls,
                              'f1_score' : f1_scores}
    return n_outliers, accuracies, results['simulator_time']

In [9]:
def aggregate_performance(gt_folder, gt_filename, result_folder, result_filename,
                          performance_folder,
                          n_streams, window_size, non_data_attr=2):
    """
    Stack the per-stream metrics of compute_performance_v2 into one table and store it.

    Inputs
    ------
    gt_folder, gt_filename, result_folder, result_filename,
    n_streams, window_size, non_data_attr
        forwarded unchanged to compute_performance_v2
    performance_folder : str
        folder the aggregated table is pickled to, under the name result_filename

    Returns
    -------
    (df, simulation_time)
        df              : pandas.DataFrame with the columns 'precision', 'recall'
                          and 'f1_score', streams stacked in order 0 .. n_streams-1
        simulation_time : third value returned by compute_performance_v2

    The table is written to disk only when its number of rows equals the
    number of outliers reported by compute_performance_v2; otherwise nothing
    is saved.
    """
    n_outliers, accuracies, simulation_time = compute_performance_v2(gt_folder, 
                                                                     gt_filename, 
                                                                     result_folder, 
                                                                     result_filename,
                                                                     n_streams, 
                                                                     window_size, 
                                                                     non_data_attr)
    # DataFrame.append was removed in pandas 2.0; one concat over all streams
    # gives the same table without re-copying the frame for every stream
    frames = [pd.DataFrame(accuracies[i]) for i in range(n_streams)]
    if frames:
        df = pd.concat(frames, ignore_index = True)
    else:
        # no stream at all: return an empty table with the expected columns
        df = pd.DataFrame(columns=['precision', 'recall', 'f1_score'])
    ### for sanity checking
    if n_outliers == df.shape[0]:
        df.to_pickle(f"{performance_folder}/{result_filename}")
    return df, simulation_time