In [None]:
!pip install pmdarima==1.8.0

In [None]:
import pandas as pd
import numpy as np
import pmdarima as pmd
import pickle
import matplotlib.pyplot as plt
import time
from datetime import datetime, timedelta
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from numpy/pandas.
warnings.filterwarnings("ignore")

# Raise the display limits so large frames are rendered without truncation.
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_columns", 100)

def show_ts(ts, forecast=None, forecast2 = None, title="Forecast Plot"):
    """Plot an observed series together with up to two forecast overlays.

    Parameters
    ----------
    ts : object exposing a pandas-style ``plot(label=..., figsize=...)``
        The observed series; its ``plot`` call creates the axes that the
        overlays are drawn on.
    forecast : optional
        First overlay, drawn with the legend label ``"Forecast"``.
    forecast2 : optional
        Second overlay, drawn with the legend label ``"Forecast 2"`` so that it
        can be told apart from the first one in the legend.
    title : str
        Title placed above the axes.

    Returns
    -------
    None
        The figure is displayed through ``plt.show()``.
    """
    ax = ts.plot(label="Observed", figsize=(10, 3))

    # Each overlay gets its own label; a shared label would make the legend ambiguous.
    overlays = ((forecast, "Forecast"), (forecast2, "Forecast 2"))
    drew_overlay = False
    for series, label in overlays:
        if series is not None:
            series.plot(ax=ax, label=label)
            drew_overlay = True

    # As before, a legend is only shown when at least one forecast was plotted;
    # it is now built once instead of once per overlay.
    if drew_overlay:
        ax.legend()

    ax.set_xlabel('Date')
    ax.set_ylabel('Messages/Second')
    ax.set_title(title)
    plt.show()
    
def opt_arima(ts,m):
    """Fit one model per differencing setting and return the one with the lowest AIC.

    The four ``(d, D)`` pairs ``(0,0), (0,1), (1,0), (1,1)`` are tried in that
    order; each is handed to ``fit_arima`` together with ``ts`` and ``m``.
    A candidate replaces the current winner only if its ``aic()`` is strictly
    smaller, so ties keep the earlier candidate and a NaN AIC is never selected.

    Parameters
    ----------
    ts : passed through unchanged to ``fit_arima``.
    m : passed through unchanged to ``fit_arima``.

    Returns
    -------
    The winning fitted model, or ``None`` if no candidate had an AIC below
    ``np.inf``.
    """
    winner = None
    lowest_aic = np.inf
    for d_order, seasonal_d_order in [(0,0),(0,1),(1,0),(1,1)]:
        # Progress output: the (d, D) pair currently being fitted.
        print((d_order, seasonal_d_order))
        candidate = fit_arima(ts, m, d_order, seasonal_d_order)
        candidate_aic = candidate.aic()
        if candidate_aic < lowest_aic:
            winner, lowest_aic = candidate, candidate_aic
    return winner

def fit_arima(ts, m, d , D):
    """Run a stepwise ``pmdarima.auto_arima`` search with fixed differencing orders.

    Parameters
    ----------
    ts : object with a ``t`` attribute
        The series to model is read from ``ts.t`` (e.g. a DataFrame column ``t``).
    m : int
        Seasonal period passed to ``auto_arima``.
    d : int
        Non-seasonal differencing order (fixed, not searched).
    D : int
        Seasonal differencing order (fixed, not searched).

    Returns
    -------
    The model returned by ``pmd.auto_arima``.

    Notes
    -----
    The search starts every AR/MA order at 0 and allows up to 5, uses the
    ``"nm"`` (Nelder-Mead) solver and ignores candidate fits that error out.
    """
    return pmd.auto_arima(ts.t, d=d, start_p=0, start_q=0,
                             max_p=5, max_q=5, m=m,
                             D=D, start_P=0, start_Q=0,
                             max_P=5, max_Q=5,
                             # The iteration cap is spelled `maxiter` in pmdarima;
                             # the previous `max_iter` keyword was not a recognised
                             # parameter, so the cap of 30 never took effect.
                             method = "nm", maxiter=30,
                             error_action='ignore',
                             suppress_warnings=True,
                             stepwise=True, trace=0)

# Table of per-dataset timing results. The experiment loop in this file fills its
# "SARIMA_opt" and "SARIMA_tune" columns for the rows matching a given
# `dataset` / `sampling_rate` pair.
# NOTE(review): the visible part of this file never writes durations_df back to
# disk — confirm that a later cell saves it.
durations_df = pd.read_csv("results/durations.csv")

In [None]:
# ---- Experiment configuration -------------------------------------------------
data_names = ["avazu","IoT","wiki_de","wiki_en","horton","retailrocket","taxi", "alibaba", "google"]

# The next three lists are parallel: position k of each belongs to the same run.
sampling_rates = ["1h","15min"]
multipliers = [1,4]            # multiplier*24 is used as the seasonal period m
forecast_horizons = [12,4]     # number of steps predicted after every update

train_test_split = 0.8
train_window = 200             # only the most recent training rows are used for fitting

for data_name in data_names:
    for sampling_rate, multiplier, fh in zip(sampling_rates, multipliers, forecast_horizons):
        print()
        print()
        print(data_name, sampling_rate)

        # ---- Load and prepare the series ----------------------------------------
        df = pd.read_csv("../data/"+data_name+"_"+sampling_rate+".csv", index_col=0, parse_dates=True)
        df = df.rename(columns={"messages": "t"}).dropna()
        # The `np.int` alias was removed from NumPy; the builtin `int` is equivalent.
        df = df.astype(int)

        split_at = int(len(df)*train_test_split)
        train = df.iloc[:split_at].iloc[-train_window:]
        test = df.iloc[split_at:]

        print("Train shape:", train.shape)
        print("Test shape:", test.shape)

        # ---- Model selection (timed) --------------------------------------------
        start_time = time.time()
        model = opt_arima(train, multiplier*24)
        end_time = time.time()
        training_duration = end_time-start_time

        # NOTE(review): durations_df is only updated in memory in the visible part
        # of this file — confirm it is written back to disk elsewhere.
        duration_row = (durations_df.dataset == data_name) & (durations_df.sampling_rate == sampling_rate)
        durations_df.loc[duration_row, "SARIMA_opt"] = training_duration

        # ---- Load existing results, or start a fresh frame from the test data ---
        results_path = "results/"+ data_name + "_" + sampling_rate + "_results.csv"
        try:
            results_df = pd.read_csv(results_path, index_col=0, parse_dates=True)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable results file yet: begin with the observed test values only.
            results_df = test.t.to_frame()

        # ---- Rolling forecast -----------------------------------------------------
        # After every new observation the model is updated and asked for the next
        # `fh` steps. Each timestamp therefore receives up to `fh` overlapping
        # forecasts; they are summed here and averaged afterwards. A float NumPy
        # buffer is used instead of chained `.iloc` writes into an int column, which
        # depended on pandas' item cache to reach the frame at all.
        n_rows = len(results_df)
        forecast_sum = np.zeros(n_rows, dtype=float)

        first_horizon = min(fh, n_rows)
        if first_horizon > 0:
            forecast_sum[:first_horizon] = np.asarray(model.predict(first_horizon), dtype=float)

        start_time = time.time()
        for step in range(1, n_rows):
            # Feed the observation that has just become known, as a one-element
            # series (update expects an array-like, not a bare scalar).
            model.update(test["t"].iloc[[step - 1]])

            # Clip the horizon at the end of the frame instead of relying on a
            # shape-mismatch ValueError.
            horizon = min(fh, n_rows - step)
            forecast_sum[step:step + horizon] += np.asarray(model.predict(horizon), dtype=float)

            done = step + 1
            if done % 100 == 0:
                print(done,"/",len(test))
                print((datetime.now()).strftime("%d.%m.%Y %H:%M:%S"))
        end_time = time.time()

        # Average update+predict time per row (guarded against an empty frame).
        tuning_duration = (end_time - start_time) / max(n_rows, 1)
        durations_df.loc[duration_row, "SARIMA_tune"] = tuning_duration

        # Row j (0-based) is covered by min(j + 1, fh) forecasts: fewer than `fh`
        # only at the very beginning of the test period.
        great_divider = np.minimum(np.arange(1, n_rows + 1), fh)
        results_df["SARIMA"] = forecast_sum / great_divider

        show_ts(results_df.t, results_df.SARIMA)

        results_df.to_csv(results_path)