In [1]:
import sys 
sys.path.insert(1, "../")
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import ast

import dask.dataframe as dd
from dask.dataframe import from_pandas

from workloads.util import use_results, use_dataset, join_queries_features

%load_ext autoreload
%autoreload 2

In [2]:
# Identifier of the experiment whose dataset and results are analysed below.
experiment = "stl-yahoo-A4-keys-100-interval-10000-events-200000-queries-200000"

# Resolve the directories for this experiment. The dataset helper is asked to
# re-download on every run (``redownload=True``).
results_dir = use_results(experiment)
dataset_dir = use_dataset(experiment, redownload=True)

# One path per line: results first, then dataset.
print(results_dir, dataset_dir, sep="\n")

/data/wooders/ralf-vldb//datasets/stl-yahoo-A4-keys-100-interval-10000-events-200000-queries-200000
Downloading from aws: vldb
/data/wooders/ralf-vldb//results/stl-yahoo-A4-keys-100-interval-10000-events-200000-queries-200000
/data/wooders/ralf-vldb//datasets/stl-yahoo-A4-keys-100-interval-10000-events-200000-queries-200000


In [3]:
# Raw event stream for the experiment.
events_df = pd.read_csv(f"{dataset_dir}/events.csv")

# Oracle features: drop every row containing a missing value, then turn the
# string-encoded ``seasonality`` column into Python objects.
oracle_df = (
    pd.read_csv(f"{dataset_dir}/oracle_features_672.csv")
    .dropna()
    .assign(seasonality=lambda frame: frame["seasonality"].apply(ast.literal_eval))
)

In [4]:
# Queries are indexed by ``query_id``; ``drop=False`` keeps it as a column too.
queries_df = (
    pd.read_csv(f"{dataset_dir}/queries.csv")
    .set_index("query_id", drop=False)
)

## Check Single Result 

In [5]:
def join_data(result_df, timestamp_df, queries_df, oracle_df):
    """Join the queries with the oracle features and with the computed features.

    Args:
        result_df: Computed features; passed unchanged to
            ``join_queries_features`` together with the merged queries.
        timestamp_df: Frame with a ``timestamp_ms`` column. It is merged onto
            both the queries and the oracle features.
        queries_df: Queries with ``query_id``, ``timestamp_ms``, ``key_id`` and
            ``value`` columns.
        oracle_df: Oracle features with ``timestamp_ms``, ``key_id`` and
            ``value`` columns.

    Returns:
        A ``(joined_oracle_df, joined_df)`` tuple. Both frames are indexed by
        ``query_id`` and keep ``query_id`` as a regular column.
    """
    # ``merge`` defaults to an inner join, so queries / oracle rows whose
    # ``timestamp_ms`` is absent from ``timestamp_df`` are dropped here.
    queries_df = queries_df.merge(timestamp_df, on="timestamp_ms").set_index("query_id", drop=False)
    oracle_df = oracle_df.merge(timestamp_df, on="timestamp_ms")

    # Oracle features are matched to a query on timestamp, key and value.
    joined_oracle_df = queries_df.merge(
        oracle_df, on=["timestamp_ms", "key_id", "value"]
    ).set_index("query_id", drop=False)
    joined_df = join_queries_features(queries_df, result_df).set_index("query_id", drop=False)

    return joined_oracle_df, joined_df

In [6]:
# Result file of the run under inspection and its companion timestamp file.
result_file = "results_workers_4_lifo_window_672_slide_48.csv"
timestamp_file = "results_workers_4_lifo_window_672_slide_48_timestamps.csv"

result_df = pd.read_csv(f"{results_dir}/{result_file}")
# Both columns are stored as string literals; missing entries are skipped by
# ``dropna`` and therefore stay missing after the assignment.
for column in ("trend", "seasonality"):
    result_df[column] = result_df[column].dropna().apply(ast.literal_eval)

timestamp_df = pd.read_csv(f"{results_dir}/{timestamp_file}")

In [7]:
# Build the oracle-joined and feature-joined views of the queries.
oracle_features_df, features_df = join_data(
    result_df=result_df,
    timestamp_df=timestamp_df,
    queries_df=queries_df,
    oracle_df=oracle_df,
)

KeyError: 'timestamp'

In [None]:
from workloads.stl.stl_util import predict, predict_seasonality
from tqdm import tqdm 

In [None]:
def evaluate(features_df, oracle_features_df, queries_df, interval=5):
    """Compute a prediction per query from the computed and the oracle features.

    For every ``query_id`` in ``features_df.index`` the matching rows of
    ``oracle_features_df`` and ``queries_df`` are looked up with ``.loc`` and
    ``predict`` is called once with the computed features and once with the
    oracle features.

    Args:
        features_df: Computed features indexed by ``query_id``. Each row needs
            ``trend`` (indexable; only its last element is used),
            ``seasonality`` and ``timestamp_ms``.
        oracle_features_df: Oracle features indexed by ``query_id``. Each row
            needs ``trend``, ``seasonality``, ``timestamp_ms`` and ``value``.
        queries_df: Queries indexed by ``query_id`` with ``value`` and
            ``timestamp_ms``.
        interval: Forwarded to ``predict`` as its ``interval`` argument.
            Defaults to 5.

    Returns:
        The ``features_df`` object that was passed in, modified in place with
        three new columns: ``predictions``, ``oracle_predictions`` and
        ``value``.

    Raises:
        AssertionError: If an oracle row does not carry the same
            ``timestamp_ms`` and ``value`` as its query.
    """
    predictions = []
    oracle_predictions = []
    values = []

    for query_id in tqdm(features_df.index):
        of = oracle_features_df.loc[query_id]
        f = features_df.loc[query_id]
        q = queries_df.loc[query_id]

        # Validate the oracle row before it is used for a prediction.
        assert of.timestamp_ms == q.timestamp_ms, (
            f"oracle timestamp does not match query {query_id}"
        )
        assert of.value == q.value, f"oracle value does not match query {query_id}"

        # Computed features: the last element of the trend is used, and the
        # inputs are coerced to plain Python numbers.
        feature_prediction = predict(
            float(q.value),
            float(f.trend[-1]),
            f.seasonality,
            int(q.timestamp_ms),
            int(f.timestamp_ms),
            interval=interval,
        )

        # Oracle features are passed through as stored.
        oracle_prediction = predict(
            q.value,
            of.trend,
            of.seasonality,
            q.timestamp_ms,
            of.timestamp_ms,
            interval=interval,
        )

        predictions.append(feature_prediction)
        oracle_predictions.append(oracle_prediction)
        values.append(q.value)

    # The caller's frame is extended in place and returned.
    features_df["predictions"] = predictions
    features_df["oracle_predictions"] = oracle_predictions
    features_df["value"] = values
    return features_df
    

In [None]:
# Run the per-query evaluation; the result carries both prediction columns.
prediction_results_df = evaluate(
    features_df=features_df,
    oracle_features_df=oracle_features_df,
    queries_df=queries_df,
)

In [None]:
# Display the frame returned by ``evaluate`` for visual inspection.
prediction_results_df

In [None]:
from sktime.performance_metrics.forecasting import mean_squared_scaled_error

In [None]:
def _scaled_error_for_key(group):
    """Mean squared scaled error for the rows of a single ``key_id``."""
    # NOTE(review): ``y_true`` receives the feature-based predictions and
    # ``y_pred`` the oracle predictions; confirm this orientation is intended.
    return mean_squared_scaled_error(
        y_true=group.predictions.to_numpy(),
        y_pred=group.oracle_predictions.to_numpy(),
        y_train=group.value.to_numpy(),
    )


# One loss value per key.
key_loss = prediction_results_df.groupby("key_id").apply(_scaled_error_for_key)

In [None]:
# Total of the per-key losses computed above.
key_loss.sum() 

In [None]:
# Number of rows (evaluated queries) in the prediction results.
len(prediction_results_df.index)

## Evaluate and Plot Multiple Results 