In [1]:
# grid search sarima hyperparameters for daily female dataset
import pandas as pd
import numpy as np

import joblib
import multiprocessing as mp


from math import sqrt
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

In [2]:
# one-step sarima forecast
def sarima_forecast(history, config):
    order, sorder, trend = config
    # define model
    model = SARIMAX(history, order=order, seasonal_order=sorder, trend=trend, enforce_stationarity=False, enforce_invertibility=False)
    # fit model
    model_fit = model.fit(disp=False)
    # make one step forecast
    yhat = model_fit.predict(len(history),len(history))
    return yhat[0]

In [3]:
# root mean squared error or rmse
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

In [4]:
# split a univariate dataset into train/test sets
def train_test_split(val_1_data, n_test):
    return val_1_data[:-n_test], val_1_data[-n_test:]

In [5]:
# walk-forward validation for univariate data
def walk_forward_validation(val_1_data, n_test, cfg):
    predictions = list()
    # split dataset
    print("hellooooo")
    train, test = train_test_split(val_1_data, n_test)
    print("Train dataset = ", train)
    print("Test dataset = ", test)
    # seed history with training dataset
    history = [x for x in train]
    # step over each time-step in the test set
    for i in range(len(test)):
        # fit model and make forecast for history
        yhat = sarima_forecast(history, cfg)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        history.append(test[i])
    # estimate prediction error
    error = measure_rmse(test, predictions)
  #  error2 = measure_mape(test, n_test)
    return error

In [6]:
# score a model, return None on failure
def score_model(val_1_data, n_test, cfg, debug=False):
    result = None
    # convert config to a key
    key = str(cfg)
    # show all warnings and fail on exception if debugging
    if debug:
        result = walk_forward_validation(val_1_data, n_test, cfg)
    else:
        # one failure during model validation suggests an unstable config
        # never show warnings when grid searching, too noisy
        with catch_warnings():
            filterwarnings("ignore")
            result = walk_forward_validation(val_1_data, n_test, cfg)
    # check for an interesting result
    if result is not None:
        print(' > Model[%s] %.3f' % (key, result))
    return (key, result)

In [7]:
# grid search configs
def grid_search(val_1_data, cfg_list, n_test, parallel=True):
    scores = None
    if parallel:
        # execute configs in parallel
        executor = joblib.Parallel(n_jobs=mp.cpu_count(), backend='multiprocessing')
        print("executer = ", executor)
        tasks = (delayed(score_model)(val_1_data, n_test, cfg) for cfg in cfg_list)
        print("tasks = ", tasks)
        scores = executor(tasks)
        print("Scores in if clause = ", scores)
    else:
        scores = [score_model(val_1_data, n_test, cfg) for cfg in cfg_list]
        print("Scores in else case = ", scores)

    # remove empty results
    scores = [r for r in scores if r[1] != None]
    # sort configs by error, asc
    scores.sort(key=lambda tup: tup[1])
    return scores

In [8]:
# create a set of sarima configs to try
def sarima_configs(seasonal=[0]):
    models = list()
    # define config lists
    p_params = [0, 1, 2]
    d_params = [0, 1]
    q_params = [0, 1, 2]
    t_params = ['n','c','t','ct']
    P_params = [0, 1, 2]
    D_params = [0, 1]
    Q_params = [0, 1, 2]
    m_params = seasonal
    # create config instances
    for p in p_params:
        for d in d_params:
            for q in q_params:
                for t in t_params:
                    for P in P_params:
                        for D in D_params:
                            for Q in Q_params:
                                for m in m_params:
                                    cfg = [(p,d,q), (P,D,Q,m), t]
                                    models.append(cfg)
    return models

In [10]:
if __name__ == '__main__':
    # load dataset

    test_data = pd.read_csv("../../data/test.csv", delimiter=";", decimal=",")
    test_data.head(10).append(test_data.tail(10))

    val_1_data = test_data["Val_1"]
    val_2_data = test_data["Val_2"]
    val_3_data = test_data["Val_3"]

    print("Values shape = ", val_1_data.shape)
    # data split
    print("***********************************")
    print("data values : ", val_1_data.values)
    print("***********************************")
    n_test = 4
    # model configs
    cfg_list = sarima_configs()
    print(cfg_list)
    # grid search
    scores = grid_search(val_1_data, cfg_list, n_test)
    print('done')
    # list top 3 configs
    print('scoresasd', scores)
    for cfg, error in scores[:3]:
        print("hello")
        print(cfg, error)


Values shape =  (43,)
***********************************
data values :  [26.95  46.662 30.47  24.794 35.849 42.119 48.004 39.413 26.873 22.352
 69.058 72.248 58.003 44.154 25.982 23.903 25.443 46.387 32.538 36.102
 19.723 26.015 73.854 96.448 81.774 57.981 41.921 36.036 44.583 57.673
 40.964 56.144 27.973 31.779 70.345 71.456 86.064 87.318 43.208 41.481
 50.798 67.705 63.734]
***********************************
[[(0, 0, 0), (0, 0, 0, 0), 'n'], [(0, 0, 0), (0, 0, 1, 0), 'n'], [(0, 0, 0), (0, 0, 2, 0), 'n'], [(0, 0, 0), (0, 1, 0, 0), 'n'], [(0, 0, 0), (0, 1, 1, 0), 'n'], [(0, 0, 0), (0, 1, 2, 0), 'n'], [(0, 0, 0), (1, 0, 0, 0), 'n'], [(0, 0, 0), (1, 0, 1, 0), 'n'], [(0, 0, 0), (1, 0, 2, 0), 'n'], [(0, 0, 0), (1, 1, 0, 0), 'n'], [(0, 0, 0), (1, 1, 1, 0), 'n'], [(0, 0, 0), (1, 1, 2, 0), 'n'], [(0, 0, 0), (2, 0, 0, 0), 'n'], [(0, 0, 0), (2, 0, 1, 0), 'n'], [(0, 0, 0), (2, 0, 2, 0), 'n'], [(0, 0, 0), (2, 1, 0, 0), 'n'], [(0, 0, 0), (2, 1, 1, 0), 'n'], [(0, 0, 0), (2, 1, 2, 0), 'n'], [(0, 0,

hellooooo
hellooooo
hellooooo
hellooooo
Train dataset =  0     26.950
1     46.662
2     30.470
3     24.794
4     35.849
5     42.119
6     48.004
7     39.413
8     26.873
9     22.352
10    69.058
11    72.248
12    58.003
13    44.154
14    25.982
15    23.903
16    25.443
17    46.387
18    32.538
19    36.102
20    19.723
21    26.015
22    73.854
23    96.448
24    81.774
25    57.981
26    41.921
27    36.036
28    44.583
29    57.673
30    40.964
31    56.144
32    27.973
33    31.779
34    70.345
35    71.456
36    86.064
37    87.318
38    43.208
Name: Val_1, dtype: float64Train dataset =  0     26.950
1     46.662
2     30.470
3     24.794
4     35.849
5     42.119
6     48.004
7     39.413
8     26.873
9     22.352
10    69.058
11    72.248
12    58.003
13    44.154
14    25.982
15    23.903
16    25.443
17    46.387
18    32.538
19    36.102
20    19.723
21    26.015
22    73.854
23    96.448
24    81.774
25    57.981
26    41.921
27    36.036
28    44.583
29    57.673
30

KeyError: 0