In [None]:
import sys 
sys.path.insert(1, "../")
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import ast

import dask.dataframe as dd
from dask.dataframe import from_pandas

from workloads.util import use_results, use_dataset

%load_ext autoreload
%autoreload 2

In [None]:
experiment = "stl-yahoo-A4-keys-100-interval-10000-events-200000-queries-200000"

results_dir = use_results(experiment)
dataset_dir = use_dataset(experiment, redownload=True)
print(results_dir)
print(dataset_dir)

In [None]:
os.listdir(dataset_dir)

In [None]:
sorted(os.listdir(results_dir))

In [None]:
# Load the query, event and oracle-feature tables for the selected experiment.
# (Disabled dask variant, kept for reference:)
#queries_df = from_pandas(pd.read_csv(f"{dataset_dir}/queries.csv").head(100000), npartitions=1000)
queries_df = pd.read_csv(f"{dataset_dir}/queries.csv")
events_df = pd.read_csv(f"{dataset_dir}/events.csv")
# Oracle rows with any missing value are discarded up front.
oracle_df = pd.read_csv(f"{dataset_dir}/oracle_features_672.csv").dropna()
# `seasonality` is read from the CSV as text; ast.literal_eval turns each cell
# back into the Python literal it encodes.
oracle_df["seasonality"] = oracle_df['seasonality'].dropna().apply(ast.literal_eval)

In [None]:
oracle_df[oracle_df["key_id"] == 1]

In [None]:
queries_df[queries_df["key_id"] == 1]

In [None]:
# Parse the `seasonality` column into Python objects.
# The data-loading cell earlier in the notebook already runs ast.literal_eval
# on this column, and literal_eval raises ValueError when handed an
# already-parsed (non-string) value. Only parse cells that are still strings
# so this cell is safe to run after the loading cell and safe to re-run.
oracle_df["seasonality"] = oracle_df["seasonality"].dropna().apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [None]:
from workloads.util import join_queries_features

In [None]:
results_df = pd.read_csv(f"{results_dir}/results_workers_4_lifo_window_128_slide_24.csv")

In [None]:
joined_df = join_queries_features(queries_df, results_df).set_index("query_id")

In [None]:
queries_df.join(joined_df, on="query_id", lsuffix='_query', rsuffix='_feature')

In [None]:
joined_df

In [None]:
queries_df.sort_values(by=["key_id", "timestamp_ms"]).iloc[237]

In [None]:
queries_df.iloc[237]

## Feature Results for Queries 

In [None]:
result_files = [
 'results_workers_1_lifo_window_672_slide_12.csv',
 'results_workers_1_lifo_window_672_slide_192.csv',
 'results_workers_1_lifo_window_672_slide_24.csv',
 'results_workers_1_lifo_window_672_slide_384.csv',
 'results_workers_1_lifo_window_672_slide_48.csv',
 'results_workers_1_lifo_window_672_slide_672.csv',
 'results_workers_1_lifo_window_672_slide_96.csv',
]
# result_files = os.listdir(results_dir)

In [None]:
results = {}

In [None]:
# For each raw result file: report how many rows it holds, attach its features
# to the queries, and write the joined table next to it as `query_<file>`.
for result_file in result_files:
    results_path = f"{results_dir}/{result_file}"
    results_df = pd.read_csv(results_path)
    n_updates = len(results_df.index)
    print(result_file, "updates:", n_updates)

    joined_df = join_queries_features(queries_df, results_df)
    joined_df = joined_df.set_index("query_id")
    joined_df.to_csv(f"{results_dir}/query_{result_file}")

## Oracle Feature Results for Queries 

In [None]:
#joined_df = join_queries_features(queries_df, oracle_df).set_index("query_id")

In [None]:
results["oracle"] = queries_df.merge(oracle_df, on=["timestamp_ms", "key_id", "value"]) #, lsuffix='_query', rsuffix='_oracle')

In [None]:
results["oracle"][results["oracle"]["key_id"] == 1]

In [None]:
# `predict` is imported here because the notebook's own import of it sits in a
# later cell; without this line a fresh Restart & Run All fails with NameError.
from workloads.stl.stl_util import predict

# Run the predictor on every complete oracle row. The feature timestamp and the
# query timestamp passed to `predict` are the same column for the oracle.
oracle_predictions = results["oracle"].dropna().apply(lambda x: predict(
        int(x["value"]), 
        int(x["trend"]), 
        x["seasonality"], 
        int(x["timestamp_ms"]), 
        int(x["timestamp_ms"]), 
        interval=5
    ), axis=1)

In [None]:
results["oracle"]["prediction"] = oracle_predictions

# Evaluate Approximated Features 

In [None]:
from workloads.stl.stl_util import predict, predict_seasonality

### Run predictions 

In [None]:
# For every result file: reload the query/feature join written to disk, pair it
# with the queries, and store the predictions under results[key].
for key in result_files:

    if key == "oracle": continue

    joined_df = pd.read_csv(f"{results_dir}/query_{key}")
    # `seasonality` comes back from the CSV as text; parse it into Python objects.
    joined_df["seasonality"] = joined_df['seasonality'].apply(ast.literal_eval)
    print(joined_df.timestamp_ms.mean())
    df = queries_df.merge(joined_df, on=["query_id", "key_id"], suffixes=('_query', '_feature'))

    results[key] = df
    print(key)
    # Gap between when the query was issued and when the feature it was served was computed.
    staleness = df["timestamp_ms_query"] - df["timestamp_ms_feature"]
    print("staleness", staleness.mean())

    # Only complete rows are predicted; the assignment aligns on the index, so
    # rows dropped here end up with NaN in the new column.
    predictions = df.dropna().apply(lambda x: predict(
        int(x["value"]), 
        int(x["trend"]), 
        x["seasonality"], 
        int(x["timestamp_ms_query"]), 
        int(x["timestamp_ms_feature"]), 
        interval=5
    ), axis=1)
    results[key]["prediction"] = predictions
    # The original line printed `residuals`, which is not defined before this
    # cell and raised NameError on a fresh run.
    # NOTE(review): the predict() output is reported here on the assumption that
    # it is the residual, mirroring the later cell that names predict() output
    # `residuals` — confirm.
    print("residual", predictions.abs().mean())

    results[key]["prediction_seasonality"] = df.dropna().apply(lambda x: predict_seasonality(
        x["seasonality"], 
        int(x["timestamp_ms_query"]), 
        int(x["timestamp_ms_feature"]), 
        interval=5
    ), axis=1)
    print()

In [None]:
from sktime.performance_metrics.forecasting import mean_squared_scaled_error
from tqdm import tqdm 

In [None]:
# losses[result_file][key_id] -> mean squared scaled error of that
# configuration's predictions for that key, with the oracle's predictions as
# the reference (y_true) and the `value` column as y_train.
losses = {}

# The oracle table is the same for every configuration, so index it once.
oracle_by_query = results["oracle"].set_index("query_id")

for key, key_results in results.items():

    if key == "oracle":
        continue
    print(key)
    losses[key] = {}

    # Overlapping oracle columns get the `_oracle` suffix (e.g. `prediction_oracle`).
    joined_df = key_results.join(oracle_by_query, on="query_id", rsuffix='_oracle')

    for key_id in tqdm(range(1, 101, 1)):

        key_df = joined_df[joined_df["key_id"] == key_id].dropna(
            subset=["prediction", "prediction_oracle", "value"]
        )

        # A key with no usable rows is reported and gets no entry in losses[key].
        if len(key_df.index) == 0:
            print(key, key_id, "no results")
            continue

        losses[key][key_id] = mean_squared_scaled_error(
            y_true=key_df["prediction_oracle"],
            y_pred=key_df["prediction"],
            y_train=key_df["value"],
        )

In [None]:
for key in losses.keys(): 
    print(key, len(list(losses[key].values())))
    print(sum(list(losses[key].values())))
    print(list(losses[key].keys()))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import OrderedDict

L = 5
plt.rcParams["figure.figsize"] = (10,5)

# Loss values of the three configurations, in the insertion order of each
# per-key loss dict.
loss_files = (
    "results_workers_1_lifo_window_672_slide_24.csv",
    "results_workers_1_lifo_window_672_slide_48.csv",
    "results_workers_1_lifo_window_672_slide_96.csv",
)
heights_a, heights_b, heights_c = [list(losses[name].values()) for name in loss_files]

# Only the first 29 entries are drawn (x positions 1..29).
position = range(1, 30)
colors = ['C0', 'C1', 'C2']
labels = ["slide=24", "slide=48", "slide=96"]

plt.figure()

for x, heights_at_x in zip(position, zip(heights_a, heights_b, heights_c)):
    # Smallest bar first: it receives the highest zorder and therefore stays
    # visible in front of the taller bars drawn at the same x.
    ranked = sorted(zip(heights_at_x, colors, labels))
    for depth, (height, color, label) in enumerate(ranked):
        plt.bar(x, height, color=color, zorder=-depth, label=label)

# Every bar carries a label, so collapse the legend to one entry per label.
legend_handles, legend_labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(legend_labels, legend_handles))
plt.legend(by_label.values(), by_label.keys())
plt.xlabel("Key")
plt.ylabel("MASE Loss")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import OrderedDict

plt.rcParams["figure.figsize"] = (15,3)

# Per-key losses ({key_id: loss}) of the two configurations being compared.
losses_a = losses["results_workers_1_lifo_window_672_slide_192.csv"]
losses_b = losses["results_workers_1_lifo_window_672_slide_672.csv"]

colors = ['blue', 'red']
labels = ["slide=192", "slide=672"]

plt.figure()

# Pair the two configurations by key id rather than by list position, and do
# not depend on a `heights_c` left behind by another cell: a key missing from
# one configuration would otherwise shift every later bar onto the wrong key.
for key_id in sorted(losses_a.keys() & losses_b.keys()):
    # Smallest bar first: it gets the highest zorder and stays visible in front.
    ranked = sorted(zip([losses_a[key_id], losses_b[key_id]], colors, labels))
    for depth, (height, color, label) in enumerate(ranked):
        plt.bar(key_id, height, color=color, zorder=-depth, label=label)

# Every bar carries a label, so collapse the legend to one entry per label.
legend_handles, legend_labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(legend_labels, legend_handles))
plt.legend(by_label.values(), by_label.keys())
plt.xlabel("Key")
# NOTE(review): the axis says MASE, but `losses` is filled with
# mean_squared_scaled_error — confirm which metric is intended.
plt.ylabel("MASE Loss")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import OrderedDict

plt.rcParams["figure.figsize"] = (15,3)

# Per-key losses ({key_id: loss}) of the two configurations being compared.
losses_a = losses["results_workers_1_lifo_window_672_slide_48.csv"]
losses_b = losses["results_workers_1_lifo_window_672_slide_192.csv"]

colors = ['blue', 'red']
labels = ["slide=48", "slide=192"]

plt.figure()

# Pair the two configurations by key id rather than by list position, and do
# not depend on a `heights_c` left behind by another cell: a key missing from
# one configuration would otherwise shift every later bar onto the wrong key.
for key_id in sorted(losses_a.keys() & losses_b.keys()):
    # Smallest bar first: it gets the highest zorder and stays visible in front.
    ranked = sorted(zip([losses_a[key_id], losses_b[key_id]], colors, labels))
    for depth, (height, color, label) in enumerate(ranked):
        plt.bar(key_id, height, color=color, zorder=-depth, label=label)

# Every bar carries a label, so collapse the legend to one entry per label.
legend_handles, legend_labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(legend_labels, legend_handles))
plt.legend(by_label.values(), by_label.keys())
plt.xlabel("Key")
# NOTE(review): the axis says MASE, but `losses` is filled with
# mean_squared_scaled_error — confirm which metric is intended.
plt.ylabel("MASE Loss")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import OrderedDict

plt.rcParams["figure.figsize"] = (15,3)

# Per-key losses ({key_id: loss}) of the two configurations being compared.
losses_a = losses["results_workers_1_lifo_window_672_slide_24.csv"]
losses_b = losses["results_workers_1_lifo_window_672_slide_48.csv"]

colors = ['blue', 'red']
labels = ["slide=24", "slide=48"]

plt.figure()

# Pair the two configurations by key id rather than by list position, and do
# not depend on a `heights_c` left behind by another cell: a key missing from
# one configuration would otherwise shift every later bar onto the wrong key.
for key_id in sorted(losses_a.keys() & losses_b.keys()):
    # Smallest bar first: it gets the highest zorder and stays visible in front.
    ranked = sorted(zip([losses_a[key_id], losses_b[key_id]], colors, labels))
    for depth, (height, color, label) in enumerate(ranked):
        plt.bar(key_id, height, color=color, zorder=-depth, label=label)

# Every bar carries a label, so collapse the legend to one entry per label.
legend_handles, legend_labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(legend_labels, legend_handles))
plt.legend(by_label.values(), by_label.keys())
plt.xlabel("Key")
# NOTE(review): the axis says MASE, but `losses` is filled with
# mean_squared_scaled_error — confirm which metric is intended.
plt.ylabel("MASE Loss")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import OrderedDict

plt.rcParams["figure.figsize"] = (15,5)

# Per-key losses ({key_id: loss}) of the two configurations being compared.
losses_a = losses["results_workers_1_lifo_window_672_slide_24.csv"]
losses_b = losses["results_workers_1_lifo_window_672_slide_192.csv"]

colors = ['blue', 'red']
labels = ["slide=24", "slide=192"]

plt.figure()

# Pair the two configurations by key id rather than by list position, and do
# not depend on a `heights_c` left behind by another cell: a key missing from
# one configuration would otherwise shift every later bar onto the wrong key.
for key_id in sorted(losses_a.keys() & losses_b.keys()):
    # Smallest bar first: it gets the highest zorder and stays visible in front.
    ranked = sorted(zip([losses_a[key_id], losses_b[key_id]], colors, labels))
    for depth, (height, color, label) in enumerate(ranked):
        plt.bar(key_id, height, color=color, zorder=-depth, label=label)

# Every bar carries a label, so collapse the legend to one entry per label.
legend_handles, legend_labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(legend_labels, legend_handles))
plt.legend(by_label.values(), by_label.keys())
plt.xlabel("Key")
# NOTE(review): the axis says MASE, but `losses` is filled with
# mean_squared_scaled_error — confirm which metric is intended.
plt.ylabel("MASE Loss")

plt.show()

In [None]:
# plot key=1, 6, 20

In [None]:
# Overlaid bar chart of per-key loss for three slide sizes of the 2-worker runs.
# NOTE(review): the `results_workers_2_*` keys read below are not in the
# `result_files` list defined in this notebook (it only lists
# `results_workers_1_*`), so `losses` will not contain them unless that list is
# changed — confirm which runs this cell is meant to plot.
import matplotlib.pyplot as plt
import numpy as np
from collections import OrderedDict

L = 5
plt.rcParams["figure.figsize"] = (10,5)

# Loss values per configuration, in the insertion order of each per-key dict.
heights_a = list(losses["results_workers_2_lifo_window_672_slide_96.csv"].values())
heights_b = list(losses["results_workers_2_lifo_window_672_slide_192.csv"].values())
heights_c =  list(losses["results_workers_2_lifo_window_672_slide_384.csv"].values())

#position = np.arange(L)
# Only the first 29 entries are drawn (x positions 1..29).
position = range(1, 30, 1)
colors = ['C0', 'C1', 'C2']
labels = ["slide=96", "slide=192", "slide=384"]

plt.figure()

for x, ha, hb, hc in zip(position, heights_a, heights_b, heights_c):
    # Sorted ascending by height: the smallest bar gets zorder 0 and the taller
    # ones get negative zorder, so the shorter bars stay visible in front.
    for i, (h, c, l) in enumerate(sorted(zip([ha, hb, hc], colors, labels))):
        plt.bar(x, h, color=c, zorder=-i, label=l)

# Every bar carries a label, so collapse the legend to one entry per label.
handles, labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys())
plt.xlabel("Key")
plt.ylabel("MASE Loss")

plt.show()

In [None]:
# Apply `predict` row by row, passing the same timestamp column for both
# timestamp arguments, and print the mean absolute value of the result.
# NOTE(review): `query_oracle_df` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — confirm which frame (and
# which `value_query` column) it is meant to read.
residuals = query_oracle_df.apply(lambda x: predict(
        int(x["value_query"]), 
        int(x["trend"]), 
        x["seasonality"], 
        int(x["timestamp_ms"]), 
        int(x["timestamp_ms"]), 
        interval=5
), axis=1)
print(residuals.abs().mean())


In [None]:
results["results_workers_4_lifo_window_128_slide_24.csv"]

## Per-key Evaluation 

In [None]:
results["results_workers_4_lifo_window_128_slide_128.csv"]

In [None]:
# Oracle rows for key 1. results["oracle"] is built by merging on `key_id`
# without suffixes, so the column is `key_id`, not `key_id_query`.
results["oracle"][results["oracle"]["key_id"] == 1]

In [None]:
d = results["results_workers_4_lifo_window_672_slide_96.csv"].join(results["oracle"].set_index("query_id"), on="query_id", rsuffix="_oracle")

In [None]:
d[d["key_id_query"] == 1]

In [None]:
results["results_workers_4_lifo_window_672_slide_96.csv"][results["results_workers_4_lifo_window_672_slide_96.csv"]["key_id_query"] == 1]

In [None]:
# Oracle rows for key 1 in time order. results["oracle"] is built by merging on
# `key_id` and `timestamp_ms` without suffixes, so the `_oracle`-suffixed
# column names do not exist on this frame.
results["oracle"][results["oracle"]["key_id"] == 1].sort_values(by="timestamp_ms")

In [None]:
results["results_workers_4_lifo_window_672_slide_96.csv"][results["results_workers_4_lifo_window_672_slide_96.csv"]["key_id_feature"] == 1]

In [None]:
results["results_workers_4_lifo_window_672_slide_192.csv"][results["results_workers_4_lifo_window_672_slide_192.csv"]["key_id_feature"] == 1]

In [None]:
results.keys()

In [None]:
results['results_workers_2_lifo_window_672_slide_192.csv'][results['results_workers_2_lifo_window_672_slide_192.csv']["key_id_feature"]== 40]

In [None]:
results['results_workers_2_lifo_window_672_slide_672.csv'][results['results_workers_2_lifo_window_672_slide_672.csv']["key_id_feature"]== 40]

In [None]:
df = pd.read_csv(f'{results_dir}/results_workers_2_lifo_window_672_slide_672.csv')

In [None]:
df[df["key_id"] == 40]

In [None]:
joined_df = join_queries_features(queries_df, df).set_index("query_id")

In [None]:
joined_df[joined_df["key_id"] == 40]

In [None]:
query_results_df = queries_df.merge(joined_df, on=["query_id", "key_id"], suffixes=('_query', '_feature'))

In [None]:
query_results_df[query_results_df["key_id"] == 40]

In [None]:
queries_df