This notebook tries out sklearn's `TimeSeriesSplit`, along with other types of splitting techniques, to further improve my model's robustness, capabilities, and usability.

### Using sklearn TimeSeriesSplit

In [95]:
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import math
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import auto_arima
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy.stats import ttest_ind
%run -i 'mod5_functions.py'
import numpy as np

In [9]:
# Read the interest data and convert the `date` column to datetimes in a single step.
daty = pd.read_csv('remastered.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [33]:
# Preview the first rows of the loaded frame as a sanity check on the columns and the parsed dates.
daty.head()

Unnamed: 0,date,interest
0,2004-01-01,29.333333
1,2004-02-01,35.333333
2,2004-03-01,46.0
3,2004-04-01,65.0
4,2004-05-01,39.0


In [56]:
# Cross-validate a baseline ARIMA(0,1,0) on the interest series with TimeSeriesSplit.
# Each fold fits on the training window, forecasts the length of the test window,
# and stores the test RMSE in `rmse_cv`.
# from matplotlib import pyplot
# %matplotlib inline
X = daty['interest']
rmse_cv = []  # one test-set RMSE per fold
splits = TimeSeriesSplit(n_splits=5)
# pyplot.figure(1)
index = 1  # fold counter; only referenced by the commented-out plotting code below
# Split and slice the series `X` defined in this cell. The earlier version read from
# `y`, which no cell in this notebook assigns, so it failed with NameError on a
# fresh kernel (Restart & Run All).
for train_index, test_index in splits.split(X):
    train = X[train_index]
    test = X[test_index]
    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))
    # rename_column is not defined in this notebook (presumably provided by
    # mod5_functions.py via %run -i) — TODO confirm it leaves the column named 'interest'.
    train = rename_column(train.to_frame(), 0, 'interest')
    test = rename_column(test.to_frame(), 0, 'interest')
    # Attach each fold's dates (the assignment aligns on the integer row labels),
    # then make the dates the index so the model sees a date-indexed series.
    train = train.assign(
        date=daty['date'].iloc[train.index[0]:train.index[-1] + 1]
    ).set_index('date')
    test = test.assign(
        date=daty['date'].iloc[test.index[0]:test.index[-1] + 1]
    ).set_index('date')
    model = ARIMA(train, order=(0,1,0))
    fit = model.fit()
    forecast = fit.forecast(len(test))
    # NOTE(review): `forecast[0]` assumes the legacy statsmodels.tsa.arima_model.ARIMA
    # API, where forecast() returns a tuple whose first item is the point forecasts.
    # That class is unavailable in newer statsmodels releases — confirm the pinned version.
    rmse = np.sqrt(mean_squared_error(test['interest'], forecast[0]))
    rmse_cv.append(rmse)
#     print(chart)
#     pyplot.subplot(310 + index)
#     pyplot.plot(train)
#     pyplot.plot([None for i in train] + [x for x in test])
    index += 1
# pyplot.show()

Observations: 69
Training Observations: 37
Testing Observations: 32
Observations: 101
Training Observations: 69
Testing Observations: 32
Observations: 133
Training Observations: 101
Testing Observations: 32
Observations: 165
Training Observations: 133
Testing Observations: 32
Observations: 197
Training Observations: 165
Testing Observations: 32




In [60]:
# Average the per-fold test RMSE values stored in `rmse_cv`.
# `rmse_cv` is reset by every CV cell, so this reflects whichever CV loop ran last.
np.mean(rmse_cv)

7.263015135073324

In [62]:
# Cross-validate SARIMAX order=(0,1,0), seasonal_order=(0,1,0,12), trend='t'
# with TimeSeriesSplit, storing each fold's test RMSE in `rmse_cv`.
X = daty['interest']
rmse_cv = []  # one test-set RMSE per fold
splits = TimeSeriesSplit(n_splits=5)
index = 1  # fold counter; incremented per fold but not otherwise used in this cell
# Split and slice the series `X` defined in this cell. The earlier version read from
# `y`, which no cell in this notebook assigns, so it failed with NameError on a
# fresh kernel (Restart & Run All).
for train_index, test_index in splits.split(X):
    train = X[train_index]
    test = X[test_index]
    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))
    # rename_column is not defined in this notebook (presumably provided by
    # mod5_functions.py via %run -i) — TODO confirm it leaves the column named 'interest'.
    train = rename_column(train.to_frame(), 0, 'interest')
    test = rename_column(test.to_frame(), 0, 'interest')
    # Attach each fold's dates (the assignment aligns on the integer row labels),
    # then index by date so predict() can be addressed with the test dates.
    train = train.assign(
        date=daty['date'].iloc[train.index[0]:train.index[-1] + 1]
    ).set_index('date')
    test = test.assign(
        date=daty['date'].iloc[test.index[0]:test.index[-1] + 1]
    ).set_index('date')
    model = SARIMAX(train, order=(0,1,0), seasonal_order=(0,1,0,12), trend='t')
    fit = model.fit()
    # Out-of-sample prediction covering the first through last test date.
    forecast = fit.predict(start=test.index[0], end=test.index[-1])
    rmse = np.sqrt(mean_squared_error(test['interest'], forecast))
    rmse_cv.append(rmse)
    index += 1

Observations: 69
Training Observations: 37
Testing Observations: 32
Observations: 101
Training Observations: 69
Testing Observations: 32
Observations: 133
Training Observations: 101
Testing Observations: 32
Observations: 165
Training Observations: 133
Testing Observations: 32




Observations: 197
Training Observations: 165
Testing Observations: 32




In [64]:
# Average the per-fold test RMSE values stored in `rmse_cv`.
# `rmse_cv` is reset by every CV cell, so this reflects whichever CV loop ran last.
np.mean(rmse_cv)

24.67438969528687

In [73]:
# TimeSeriesSplit cross-validation of SARIMAX order=(0,1,1), seasonal_order=(0,1,1,12)
# with a 't' trend term; the test RMSE of every fold is collected in `rmse_cv`.
X = daty['interest']
splits = TimeSeriesSplit(n_splits=5)
rmse_cv = []
index = 1  # running fold number
for train_index, test_index in splits.split(X):
    train, test = X[train_index], X[test_index]
    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))

    # Turn each fold into a one-column frame (helper comes from outside this cell).
    train = rename_column(train.to_frame(), 0, 'interest')
    test = rename_column(test.to_frame(), 0, 'interest')

    # Add the matching dates (aligned on the integer row labels) and index by them.
    train = train.assign(
        date=daty['date'].iloc[train.index[0]:train.index[-1] + 1]
    ).set_index('date')
    test = test.assign(
        date=daty['date'].iloc[test.index[0]:test.index[-1] + 1]
    ).set_index('date')

    model = SARIMAX(train, trend='t', order=(0, 1, 1), seasonal_order=(0, 1, 1, 12))
    fit = model.fit()
    # Predict from the first to the last test date and score against the held-out values.
    forecast = fit.predict(start=test.index[0], end=test.index[-1])
    rmse = np.sqrt(mean_squared_error(test['interest'], forecast))
    rmse_cv.append(rmse)
    index += 1

Observations: 69
Training Observations: 37
Testing Observations: 32
Observations: 101




Training Observations: 69
Testing Observations: 32


  warn('Non-invertible starting seasonal moving average'


Observations: 133
Training Observations: 101
Testing Observations: 32




Observations: 165
Training Observations: 133
Testing Observations: 32




Observations: 197
Training Observations: 165
Testing Observations: 32




In [74]:
# Average the per-fold test RMSE values stored in `rmse_cv`.
# `rmse_cv` is reset by every CV cell, so this reflects whichever CV loop ran last.
np.mean(rmse_cv)

17.23603182631321

In [79]:
# Inspect the series being cross-validated. This cell used to display `y`, a name
# that no cell in this notebook assigns (leftover kernel state), so it raised
# NameError on a fresh kernel. `X` is daty['interest'], which matches the captured output.
X

0      29.333333
1      35.333333
2      46.000000
3      65.000000
4      39.000000
         ...    
192    39.666667
193    51.666667
194    46.833333
195    71.833333
196    74.166667
Name: interest, Length: 197, dtype: float64

In [80]:
# TimeSeriesSplit cross-validation of SARIMAX order=(0,1,1), seasonal_order=(2,0,2,12)
# with a 't' trend term; the test RMSE of every fold is collected in `rmse_cv`.
# Uses the series `X` assigned by an earlier cell.
splits = TimeSeriesSplit(n_splits=5)
rmse_cv = []
index = 1  # running fold number
for train_index, test_index in splits.split(X):
    train, test = X[train_index], X[test_index]
    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))

    # Turn each fold into a one-column frame (helper comes from outside this cell).
    train = rename_column(train.to_frame(), 0, 'interest')
    test = rename_column(test.to_frame(), 0, 'interest')

    # Add the matching dates (aligned on the integer row labels) and index by them.
    train = train.assign(
        date=daty['date'].iloc[train.index[0]:train.index[-1] + 1]
    ).set_index('date')
    test = test.assign(
        date=daty['date'].iloc[test.index[0]:test.index[-1] + 1]
    ).set_index('date')

    model = SARIMAX(train, trend='t', order=(0, 1, 1), seasonal_order=(2, 0, 2, 12))
    fit = model.fit()
    # Predict from the first to the last test date and score against the held-out values.
    forecast = fit.predict(start=test.index[0], end=test.index[-1])
    rmse = np.sqrt(mean_squared_error(test['interest'], forecast))
    rmse_cv.append(rmse)
    index += 1

Observations: 69
Training Observations: 37
Testing Observations: 32




Observations: 101
Training Observations: 69
Testing Observations: 32




Observations: 133
Training Observations: 101
Testing Observations: 32




Observations: 165
Training Observations: 133
Testing Observations: 32




Observations: 197
Training Observations: 165
Testing Observations: 32




In [81]:
# Average the per-fold test RMSE values stored in `rmse_cv`.
# `rmse_cv` is reset by every CV cell, so this reflects whichever CV loop ran last.
np.mean(rmse_cv)

10.22028815177892

In [11]:
# Stand-alone 5-split TimeSeriesSplit instance, configured like the `splits` objects in the CV loops.
tscv = TimeSeriesSplit(n_splits=5)

In [16]:
# split() hands back a generator (see the output); the index pairs are only produced
# once it is iterated, e.g. in a for loop or via list(...).
tscv.split(X)

<generator object TimeSeriesSplit.split at 0x11adbb350>


In [96]:
# cross_val_ts is not defined in this notebook — presumably it comes from
# mod5_functions.py (loaded with %run -i). The arguments look like
# (data, n_splits, order, seasonal_order) — TODO confirm against the helper's signature.
# Its output reports a cross-validated RMSE and AIC.
cross_val_ts(daty, 5, (1,0,1), (0,1,1,12))

Observations: 69
Training Observations: 37
Testing Observations: 32
Observations: 101
Training Observations: 69
Testing Observations: 32




Observations: 133
Training Observations: 101
Testing Observations: 32




Observations: 165
Training Observations: 133
Testing Observations: 32




Observations: 197
Training Observations: 165
Testing Observations: 32


Cross validated RMSE on test data is:  8.23455665407131

 Cross validated AIC for this model is:  602.02508464752
