In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

import seaborn as sns
sns.set_theme(style="whitegrid", palette="muted")

In [2]:
import duckdb

In [3]:
# Open the DuckDB file read-only so the analysis cannot modify the source data.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
DB_PATH = "/home/ubuntu/azure_long_series_with_ts_array.duckdb"
conn = duckdb.connect(DB_PATH, read_only=True)
# Turn on DuckDB's progress bar; kept as the cell's last expression so the
# connection repr is still displayed.
conn.execute("PRAGMA enable_progress_bar")

<duckdb.DuckDBPyConnection at 0x7fe7496e8ab0>

In [193]:
# Pull a 1000-row sample of `readings`, adding `num_windows`, which the query
# derives from the first and last elements of the `timestamp` list as
# (last - first + 1) / 288 - 2.
# NOTE(review): the SAMPLE clause carries no seed, so the sampled rows are
# presumably different on every run; confirm whether reproducibility matters.
SAMPLE_QUERY = """
SELECT *, (timestamp[-1] - timestamp[1]+1)/288-2 as num_windows
  FROM readings
  USING SAMPLE 1000
"""
df = conn.execute(SAMPLE_QUERY).df()



In [194]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.forecasting.stl import STLForecast
import warnings

def fit(arr, steps=7775):
    """Fit an STL + ARIMA(1, 1, 0) model to one window and forecast ahead.

    Parameters
    ----------
    arr : array-like
        Observations for a single window. Gaps (NaN) are filled with
        ``pandas.Series.interpolate()`` before fitting.
    steps : int, default 7775
        Number of future points to forecast. The default equals
        8639 - 864, i.e. the notebook's TOTAL_LENGTH - WINDOW_SIZE, so the
        forecast from the first window reaches the end of the series.

    Returns
    -------
    numpy.ndarray
        The ``steps`` forecast values.
    """
    with warnings.catch_warnings():
        # Silence every warning raised while fitting (e.g. ML-fit
        # convergence chatter); the filter is restored on exit.
        warnings.filterwarnings("ignore")
        # NOTE(review): interpolate() with default arguments presumably
        # leaves leading NaNs untouched; confirm windows never start with NaN.
        series = pd.Series(arr).interpolate()
        model = STLForecast(
            series,
            ARIMA,
            model_kwargs=dict(order=(1, 1, 0), trend="t"),
            period=12 * 24,  # 5 min timestamp interval, period of one day
        )
        return model.fit().forecast(steps).values  # plain array, no index

In [195]:
# Sliding-window configuration, all expressed in samples.
WINDOW_SIZE = 864  # samples per window (3 x SLIDE_SIZE)
SLIDE_SIZE = 288  # step between consecutive window starts (12 * 24)
TOTAL_LENGTH = 8639  # length of the full per-key prediction array
FINAL_IDX = 8638  # TOTAL_LENGTH - 1
DEBUG_MODE = True  # when True, window() asserts every window is full-length


def window(arr, window_size=None, slide_size=None):
    """Cut ``arr`` into overlapping, full-length sliding windows.

    Parameters
    ----------
    arr : sequence
        Anything supporting ``len()`` and slicing (list, numpy array, ...).
    window_size : int, optional
        Samples per window. Defaults to the module-level ``WINDOW_SIZE``,
        looked up at call time.
    slide_size : int, optional
        Step between consecutive window starts. Defaults to the module-level
        ``SLIDE_SIZE``, looked up at call time.

    Returns
    -------
    dict
        Maps each window's start index to ``arr[start:start + window_size]``,
        in ascending start order. Only complete windows are produced; an
        input shorter than ``window_size`` yields an empty dict.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    if slide_size is None:
        slide_size = SLIDE_SIZE

    # Number of complete windows. For the default sizes (864 / 288) this is
    # exactly the former ``len(arr) // SLIDE_SIZE - 2``, but it remains
    # correct when the window is not three slides long.
    num_windows = max((len(arr) - window_size) // slide_size + 1, 0)

    start_idx_to_window = {
        start: arr[start : start + window_size]
        for start in range(0, num_windows * slide_size, slide_size)
    }
    if DEBUG_MODE:
        # Sanity check: no truncated window may slip through.
        for w in start_idx_to_window.values():
            assert len(w) == window_size
    return start_idx_to_window


In [196]:
# Back-of-the-envelope cost of the fitting run: the total window count scaled
# by 0.8 (presumably seconds per window fit; confirm against a timing run).
0.8 * df["num_windows"].sum()  # total runtime s

524.275

In [197]:
from multiprocessing import Pool
from tqdm import tqdm

def map_row(row: pd.Series):
    """Fit and forecast every sliding window of one row's ``avg_cpu`` series.

    Returns a list with one record per window, in window order. Each record
    holds the row's ``int_id``, the window's start index, the window values
    and the forecast produced by ``fit`` for that window.
    """
    records = []
    for start_idx, window_arr in window(row["avg_cpu"]).items():
        records.append(
            {
                "int_id": row["int_id"],
                "start_idx": start_idx,
                "window_arr": window_arr,
                "forecast_arr": fit(window_arr),
            }
        )
    return records


# Fan the per-row fitting out over a worker pool. imap yields results in
# input order, so results[i] belongs to the i-th row of df, and tqdm shows
# progress as rows complete.
with Pool() as pool:
    rows = [row for _, row in df.iterrows()]
    forecasts_iter = pool.imap(map_row, rows)
    results = list(tqdm(forecasts_iter, total=len(df)))

100%|██████████| 1000/1000 [08:00<00:00,  2.08it/s]


In [198]:
import itertools
# Flatten the per-row lists of window records into one frame, one row per
# (int_id, start_idx) window.
fdf = pd.DataFrame([record for row_records in results for record in row_records])

In [199]:
# For every key, stitch its window forecasts into one full-length prediction
# and record the per-timestamp squared error against the observed series.
mse_rows = []
for int_id, s in fdf.groupby("int_id"):
    # Positions never covered by a forecast stay at 0.
    # NOTE(review): that includes everything before the first window ends,
    # and those zeros still enter the squared error below; confirm intended.
    pred_arr = np.zeros(TOTAL_LENGTH, dtype="float32")

    # Rows are visited in frame order (presumably ascending start_idx), and
    # each forecast overwrites the tail from its own window end onwards, so
    # later windows replace what earlier ones wrote there.
    for start_idx, forecast_arr in zip(s["start_idx"], s["forecast_arr"]):
        pred_start_idx = start_idx + WINDOW_SIZE
        remaining_pred_size = TOTAL_LENGTH - pred_start_idx
        pred_arr[pred_start_idx:] = forecast_arr[:remaining_pred_size]

    # Index where this key's first forecast begins.
    # NOTE(review): not used further in the visible cells.
    pred_start_idx = s["start_idx"].iloc[0] + WINDOW_SIZE

    # Observed series for this key, with missing values replaced by 0.
    observed = df.loc[df["int_id"] == int_id, "ts_array"].squeeze()
    ground_truth = np.nan_to_num(np.array(observed).astype("float32"))
    assert pred_arr.shape == ground_truth.shape, (pred_arr.shape, ground_truth.shape)

    mse_rows.append(np.square(pred_arr - ground_truth))

# One row per key, one column per timestamp.
mse_mat = np.array(mse_rows)

In [205]:
# Plain averages of the squared errors: across keys (axis 0, one value per
# timestamp) and across timestamps (axis 1, one value per key).
mse_per_timestamp = np.mean(mse_mat, axis=0)
mse_per_key = np.mean(mse_mat, axis=1)

# "Weighted" variants divide by the number of non-zero squared errors rather
# than by the full count, so exact-zero entries do not dilute the average.
# NOTE(review): a row or column with no non-zero entry divides by zero here.
mse_per_timestamp_weighted = np.sum(mse_mat, axis=0) / np.count_nonzero(mse_mat, axis=0)
mse_per_key_weighted = np.sum(mse_mat, axis=1) / np.count_nonzero(mse_mat, axis=1)

# Overall summary, displayed as the cell output.
(
    mse_per_timestamp.mean(),
    mse_per_key.mean(),
    mse_per_key_weighted.mean(),
    mse_per_timestamp_weighted.mean(),
)

(665.9685, 665.9689, 712.7763670643623, 670.7146598730616)

In [206]:
# Spot check: overlay 12-sample rolling means of the prediction and the ground truth for the last key processed
# pd.Series(pred_arr).rolling(12).mean().plot(label="pred")
# pd.Series(ground_truth).rolling(12).mean().plot(label="true")
# plt.legend()