# Trying out ARIMA

## Possible improvements
- Investigate how the seasonalities change over the course of higher-order seasons (e.g. the weekly pattern over the year); maybe train ARIMA only on the most recent part of the data?
  (How does ARIMA actually estimate its parameters during fitting?)
- Differencing
- Also include daily seasonality

In [None]:
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima
from green_city.modelling import fit_tri
from green_city.utils import metrics_dict, index2datetime

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
## MLFLOW ##
import mlflow
from green_city.mlflow_config import get_mlflow_config

# Point MLflow at the tracking server and experiment named in the project config.
flow_conf = get_mlflow_config()
tracking_uri = flow_conf["TRACKING_URI"]
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(flow_conf["EXPERIMENT_NAME"]);

In [None]:
## DB CONNECTION ##
from sqlalchemy import create_engine
from decouple import Config, RepositoryEnv

# Credentials are read from an env-style file one directory above the notebook.
config = Config(RepositoryEnv("../.db_credentials"))

# One entry per placeholder of the connection URL template below.
db_connection_credentials = dict(
    database=config('POSTGRES_DB'),
    user=config('POSTGRES_USER'),
    password=config('POSTGRES_PASSWORD'),
    host=config('POSTGRES_HOST'),
    port=config('POSTGRES_PORT'),
)
DB_STRING = "postgresql://{user}:{password}@{host}:{port}/{database}".format_map(db_connection_credentials)
db = create_engine(DB_STRING)

In [None]:
# Parameters shared by every MLflow run logged from this notebook.
global_params = dict(
    feature="net_load_kWh",
    building_nr=5,
    resolution="H",
    pred_steps=24,
)

In [None]:
# Load data for building 5
building_nr = 5
df = (pd.read_csv(f"../data/preprocessed/Building_{building_nr}.csv")
      # Name the unit explicitly: pandas >= 2.0 rejects a cast to the unit-less
      # 'datetime64' dtype, while 'datetime64[ns]' works on old and new versions.
      .astype({'datetime': 'datetime64[ns]'})[['datetime', 'net_load_kWh']]
      .rename(columns={'net_load_kWh': 'actual'})
)

# Remove yearly season
# fit_tri is a project helper; its second return value is used as the fitted
# yearly curve. NOTE(review): the argument 4 is presumably the order of the
# fit -- confirm against green_city.modelling.
_, approximation_result = fit_tri(df.actual, 4)
df["season_Y"] = approximation_result

# Remove trend
# This is not perfect, but a good placeholder: a Hamming-weighted centred
# rolling mean over 24*28*3 rows, minus the yearly season, smoothed again with
# a centred rolling mean over 24*600 rows. min_periods=1 keeps the edges NaN-free.
trend = (df.actual.rolling(24*28*3, win_type='hamming', center=True, min_periods=1).mean() - df.season_Y).rolling(24*600, center=True, min_periods=1).mean()
df["trend"] = trend
#df.season_Y.plot()
# Residual after removing the yearly season and the trend.
df["detrended_Y"] = df.actual - df.season_Y - df.trend

In [None]:
# Remove weekly season
hours_per_week = 24 * 7
df['weekday'] = df['datetime'].dt.weekday
# Centred 24-row rolling mean of the detrended series.
df['daily_rolling'] = df['detrended_Y'].rolling(24, center=True, min_periods=1).mean()
nr_weeks = len(df) // hours_per_week
# Shift the series back by 0, 1, 2, ... whole weeks and add the copies up, so
# that the first week of the sum holds the total over all complete weeks.
stacked_weeks = sum(df['daily_rolling'].shift(-week * hours_per_week) for week in range(nr_weeks))
avg_week = (stacked_weeks / nr_weeks)[:hours_per_week]
# From three average weeks, do smoothing and select the middle one, so the
# rolling window never runs off either end of the week.
three_weeks = pd.concat([avg_week] * 3, ignore_index=True)
one_week_season = (
    three_weeks.rolling(24, center=True).mean()[hours_per_week:2 * hours_per_week]
    .reset_index(drop=True)
)
# Repeat the one-week pattern along the whole frame by row position.
df['season_W'] = df.index.map(lambda pos: one_week_season[pos % hours_per_week])
df['deseasoned_W'] = df.actual - df.trend - df.season_Y - df.season_W
df['trend_seasons'] = df.trend + df.season_Y + df.season_W

## [It would be interesting to see how the weekly seasonality changes over time]

In [None]:
# Difference the residual once by hand, then let auto_arima pick the orders.
df['diff1'] = df['deseasoned_W'].diff()
# The first row of diff1 is the NaN produced by diff(), so skip it.
aar = auto_arima(df['diff1'].iloc[1:])


In [None]:
# Show the model that auto_arima selected in the previous cell.
display(aar)
# Output recorded from an earlier run:
#ARIMA(order=(4, 0, 3), scoring_args={}, suppress_warnings=True,
#      with_intercept=False)

In [None]:
# Pipeline so far: remove trend and seasons, difference once, auto_arima.
# Refit the selected model on the differenced series (without its leading NaN)
# and print the fit summary.
fitted_model = aar.fit(df['diff1'].iloc[1:])
print(fitted_model.summary())

In [None]:
# Autocorrelation of the differenced residual.
# diff() leaves a NaN in the first row and plot_acf does not skip missing
# values by default, so drop it before plotting.
plot_acf(df.diff1.dropna(), lags=20);
#plot_acf(df.deseasoned_W.diff(), lags=20);
#plot_pacf(df['deseasoned_W'], lags=50);

In [None]:

# Same autocorrelation plot, leaving out the first two rows of diff1.
plot_acf(df['diff1'].iloc[2:], lags=20);
#plot_acf(df.deseasoned_W.diff(), lags=20);
#plot_pacf(df['deseasoned_W'], lags=50);

In [None]:
from pmdarima.arima import ARIMA
#(order,
# seasonal_order=(0, 0, 0, 0),
# start_params=None,
# method='lbfgs',
# maxiter=50,
# suppress_warnings=False, out_of_sample_size=0, scoring='mse', scoring_args=None, trend=None, with_intercept=True, **sarimax_kwargs)

In [None]:
# The first 365*24*3 rows are used for training, everything after for testing.
train_rows = 365 * 24 * 3
train_data = df['deseasoned_W'].iloc[:train_rows]
test_data = df['deseasoned_W'].iloc[train_rows:]

# Search configuration used by get_auto_arima. According to the note at the
# end of this cell, an earlier run with these settings selected order=(5, 1, 3).
arima_params = {
    "start_p": 5,
    "start_q": 3,
    "d": 1,
    "max_p": 5,
    "max_q": 3,
    "seasonal": False,
    "with_intercept": False,
}


def get_auto_arima(X_train, **overrides):
    """Run auto_arima on ``X_train`` and return the selected model.

    The search is configured by the module-level ``arima_params``; keyword
    arguments passed here override individual entries of that configuration.

    Args:
        X_train: the series to fit on.
        **overrides: extra or replacement keyword arguments for auto_arima.

    Returns:
        Whatever auto_arima returns for the merged configuration.
    """
    return auto_arima(X_train, **{**arima_params, **overrides})
#2.12

#arima_model = auto_arima(train_data, **arima_params)
#returns order=(5, 1, 3), scoring_args={}, suppress_warnings=True, with_intercept=False

In [None]:
# Row positions used as forecast origins: each one is the last observed row,
# and the 24 rows after it are predicted and logged.
pred_indices = [32135, 33311, 26478, 33357, 30387, 30794, 31800, 28783]

# True: fit only on the 14*24 rows before the origin (plus the origin itself).
# False: fit on the history from the start of the series.
SMALL_WINDOW = True

for index in pred_indices:
    # .loc slicing includes both end labels, so these are rows index+1 .. index+24.
    actual = df.loc[index+1:index+24, "actual"].to_numpy()
    # Trend and seasonal components that are added back onto the ARIMA forecast.
    correction = df.loc[index+1:index+24, "trend_seasons"].to_numpy()

    #if True:
    # One MLflow run per forecast origin; every run gets the same run name.
    with mlflow.start_run(run_name=f"decompose_and_ARIMA_5_1_3_s") as r:
        # Shared notebook parameters plus the details of this particular run.
        params = global_params.copy()
        params.update({
            "index": index,
            "datetime": index2datetime(index),
            "pred_steps": 24,
            "model": "ARIMA",
            # The two seasonalities removed before fitting: 365*24 and 7*24 rows.
            "seasonalities": [365*24, 7*24],
        })
        
        # ARIMA is the pmdarima class here (imported in an earlier cell).
        arima_model = ARIMA((5,1,3), with_intercept=False, maxiter=200)
        if SMALL_WINDOW:
            fitted_model = arima_model.fit(df.deseasoned_W[index-(14*24):index+1])
        else:
            # NOTE(review): the model is fitted on train_data and then fitted
            # again on everything up to `index`; confirm whether the first fit
            # has any effect on the second one.
            fitted_model = arima_model.fit(train_data)
            #print(fitted_model.summary())
            fitted_model.fit(df.deseasoned_W[:index+1])
        # Forecast the residual 24 steps ahead and add trend + seasons back, so
        # `prediction` is directly comparable with `actual`.
        prediction = fitted_model.predict(24) + correction
        
        #print(actual)
        #print(prediction)
        
        # metrics_dict is a project helper; its result is used as a mapping with
        # the keys "mae", "mse" and "r2_score".
        metrics = metrics_dict(actual, prediction, ["mae", "mse", "r2_score"])
        print("mae: {mae}, mse: {mse}, r2: {r2_score}".format(**metrics))
        
        mlflow.log_metrics(metrics)
        mlflow.log_params(params)
        
        ## save result to database ##
        # Needs to be a table with ["id", "prediction", "run_id"] as columns
        # (which can include the index column).
        # The index holds the row numbers of the 24 predicted rows.
        forecasts = pd.DataFrame({'prediction': prediction}, index=list(range(index+1, index+25))).assign(run_id = r.info.run_id)
        forecasts.index.name = "id"
        #display(forecasts)

        # Append to the "forecast" table; the index is written as the "id" column.
        forecasts.to_sql("forecast", con=db, if_exists="append")

In [None]:


# Plot the forecast from the last loop iteration against the measured values.
# `prediction` already has trend + seasonalities added back in the loop, so it
# must not be corrected a second time; it is only given the row labels of the
# forecast window so that both curves share the same x axis.
forecast_rows = slice(index + 1, index + 25)
corrected_prediction = pd.Series(np.asarray(prediction), index=df.index[forecast_rows])
plt.plot(corrected_prediction)
plt.plot(df.actual.iloc[forecast_rows])

print(mean_squared_error(df.actual.iloc[forecast_rows], corrected_prediction))

## Checking how the weekly trend is changing

In [None]:
def normalize_cols(cols):
    """Min-max scale ``cols``: subtract the minimum, divide by (max - min)."""
    lowest = np.min(cols)
    span = np.max(cols) - lowest
    return (cols - lowest) / span

def stdize_cols(cols, scale=True):
    """Centre ``cols`` on its mean and optionally divide by its standard deviation.

    With ``scale=False`` the centred values are divided by 1, i.e. only the
    mean is removed.
    """
    centred = cols - np.mean(cols)
    divisor = np.std(cols) if scale else 1
    return centred / divisor

def stdize_cols_without_scaling(cols):
    """Subtract the mean from ``cols`` without rescaling (stdize_cols with scale=False)."""
    return stdize_cols(cols, scale=False)

def col_for_each_weekday(s):
    """Spread one row over seven weekday slots.

    Returns a length-7 float array that is NaN everywhere except at position
    ``int(s.weekday)``, which holds ``s.detrended_Y``.
    """
    slots = np.full(7, np.nan)
    slots[int(s.weekday)] = s.detrended_Y
    return slots

In [None]:
# Running day number of every row (24 rows per day).
df["day_tot"] = df.index // 24
# One column per weekday: a_0 .. a_6.
weekday_cols = [f"a_{i}" for i in range(7)]
# Daily means, restricted to day labels 5..1453 (1449 days, matching the
# range(1449) below) and re-indexed from 0 with a running week number.
df_ww = (
    df.groupby("day_tot").mean()
    .loc[5:1453]
    .astype({'weekday': 'int'})
    [["detrended_Y", "weekday"]]
    .assign(d_tot=range(1449))
    .set_index("d_tot")
    .assign(w_tot=lambda frame: frame.index // 7)
)
# Put each day's value into the column of its weekday and interpolate the
# gaps, so every weekday column becomes a continuous curve.
df_ww[weekday_cols] = df_ww.apply(col_for_each_weekday, axis=1, result_type='expand').interpolate()
df_ww = df_ww.dropna()
# Smooth each weekday curve with a 100-row rolling mean.
df_ww[weekday_cols] = df_ww[weekday_cols].rolling(100).mean()

# Subtract the row mean, leaving each weekday's offset from that row's average.
df_ww[weekday_cols] = df_ww[weekday_cols].apply(stdize_cols_without_scaling, axis=1)

plt.rcParams['figure.figsize'] = [25, 8]
df_ww[weekday_cols].plot();

## TODO: Could we get the same insight with a simple box plot?