# Comparative Analysis
In this notebook, we compare ClaSS with different competitors regarding scalability.

In [1]:
import matplotlib

# Use font type 42 (TrueType, per matplotlib's rcParams documentation) for
# both the PDF and the PostScript backends.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

In [2]:
# Standard library
import gc
import sys

# Third-party
import daproli as dp
import matplotlib.pyplot as plt
import numpy as np
import Orange
import pandas as pd
import seaborn as sns
from scipy import stats
from tqdm import tqdm

# Display every row of a DataFrame instead of truncating long frames.
pd.set_option('display.max_rows', None)

# Apply seaborn's default theme and its interpretation of matplotlib's
# single-letter color shorthands.
sns.set_theme()
sns.set_color_codes()

# Add the parent directory to the import path (presumably so that the
# project-local `src` package can be imported by later cells).
sys.path.append("../")

In [3]:
from src.profile_visualization import plot_profile, plot_profile_with_ts
from src.utils import load_combined_dataset, load_benchmark_dataset

Let's choose a score for evaluation (F1 or Covering).

In [4]:
# Identifier of the evaluation score selected for this notebook.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
eval_score = "covering_score"

In [5]:
# Load the combined benchmark, record the length of every time series and
# then discard the raw series themselves, which take too much space.
df_comb = (
    load_combined_dataset()
    .assign(ts_len=lambda frame: frame["time_series"].map(len))
    .drop(columns=["time_series"])
)
df_comb.head()

Unnamed: 0,name,window_size,change_points,ts_len
0,Adiac,10,"[572, 1012, 1232]",1408
1,ArrowHead,10,[753],1506
2,Beef,50,[705],1410
3,BeetleFly,10,[1280],2560
4,BirdChicken,20,[1280],2560


Let's load the ClaSS runtimes for every benchmark collection and convert them into throughput.

In [6]:
# Benchmark collections whose Flink experiment results are loaded.
datasets = ["UTSA", "TSSB", "PAMAP", "mHealth", "MIT-BIH-VE", "MIT-BIH-Arr", "SleepDB", "WESAD"]
name, file_name = "ClaSS", "FlinkClaSS.csv"

# Read the `dataset` and `runtime` columns of each collection's result file
# and tag every row with the collection it came from. No converter is passed:
# the previous one was keyed on "runtimes", a column that is never read, so it
# had no effect (and it relied on `eval`).
runtime_frames = []

for collection in tqdm(datasets):
    runs = pd.read_csv(
        f"../experiments/flink_{collection}/{file_name}.gz",
        usecols=["dataset", "runtime"],
        compression="gzip",
    )
    runtime_frames.append(
        pd.DataFrame(
            {
                "dataset": runs["dataset"],
                # holds the raw runtime for now; converted to throughput below
                "throughput": runs["runtime"],
                "name": collection,
            }
        )
    )

df = (
    pd.concat(runtime_frames)
    .sort_values(by="dataset")
    .reset_index(drop=True)
)

# Look up each time series length by dataset name. Dividing the two frames
# directly would pair rows purely by index position, which is only correct
# if `df_comb` happens to be in exactly the same order as the sorted results.
ts_len_by_dataset = df_comb.set_index("name")["ts_len"]
assert ts_len_by_dataset.index.is_unique, "df_comb contains duplicate dataset names"

ts_len = df["dataset"].map(ts_len_by_dataset)
missing_datasets = df.loc[ts_len.isna(), "dataset"].unique().tolist()
assert not missing_datasets, f"no time series length found for: {missing_datasets}"

# Throughput = number of data points divided by the measured runtime.
df["throughput"] = ts_len / df["throughput"]

df

100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 13.64it/s]


Unnamed: 0,dataset,throughput,name
0,Adiac,1010.623053,TSSB
1,ArrowHead,993.916846,TSSB
2,Beef,818.364938,TSSB
3,BeetleFly,776.401981,TSSB
4,BirdChicken,697.32607,TSSB
5,CBF,666.921652,TSSB
6,Cane,244.513919,UTSA
7,Car,1124.939081,TSSB
8,Chinatown,524.169386,TSSB
9,ChlorineConcentration,1104.866666,TSSB


Let's calculate the mean/median/std/max throughput.

In [7]:
# Mean throughput over all datasets, rounded to a whole number.
np.round(df["throughput"].mean(), 0)

538.0

In [8]:
# Median throughput over all datasets, rounded to a whole number.
np.round(df["throughput"].median(), 0)

472.0

In [9]:
# Standard deviation of the throughput over all datasets, rounded to a whole number.
np.round(df["throughput"].std(), 0)

177.0

In [10]:
# Highest throughput over all datasets, rounded to a whole number.
np.round(df["throughput"].max(), 0)

1296.0