In [None]:
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm
import math

In [None]:
# Load the dataset; the first CSV column holds dates written as YYYY/MM/DD.
time_df = pd.read_csv('dc.csv')
date_col = time_df.columns[0]
time_df[date_col] = pd.to_datetime(time_df[date_col], format="%Y/%m/%d")

# Use the parsed dates as the index and order the rows chronologically.
time_df = time_df.set_index(date_col).sort_index()
time_df.info()

## Stationarity of data

In [None]:
from statsmodels.tsa.stattools import adfuller

def timeSeriesStationaryInfo(series, window):
    """Plot rolling statistics of ``series`` and summarise its Dickey-Fuller test.

    Parameters
    ----------
    series : pandas.Series
        The time series to inspect.
    window : int
        Window passed to ``series.rolling`` for the moving mean and
        moving standard deviation.

    Returns
    -------
    pandas.Series
        Test statistic, p-value, number of lags used, number of observations
        used, and one ``Critical Value (<level>)`` entry per critical level
        reported by ``adfuller``.
    """
    rolling = series.rolling(window)

    # Plot the raw series against its rolling mean and rolling std.
    plt.figure(figsize=(20, 10))
    plt.plot(series, label='Original')
    plt.plot(rolling.mean(), label='Moving Average')
    plt.plot(rolling.std(), label='Moving Standard Deviation')
    plt.title('Checking stationary of time series data by comparing original data with moving average and standard deviation')
    plt.legend(loc='best')
    plt.show()

    # Dickey-Fuller test; the lag length is selected automatically by AIC.
    print('Results from Dickey-Fuller Test')
    adf_result = adfuller(series, autolag='AIC')
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    summary = dict(zip(labels, adf_result[:4]))
    # The fifth element of the result maps each critical level to its value.
    summary.update({f'Critical Value ({level})': crit for level, crit in adf_result[4].items()})
    return pd.Series(summary)

In [None]:
# Inspect stationarity of the volume column with a 60-row rolling window.
vol_series = time_df.loc[:, 'volume']
timeSeriesStationaryInfo(series=vol_series, window=60)

## Seasonal Decomposition

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Additive decomposition of volume into trend, seasonal and residual parts.
decompose_vol = seasonal_decompose(time_df[['volume']], model='additive')
seasonal_vol = decompose_vol.seasonal

# Distinct values of the seasonal component and how many there are.
period_val = np.unique(seasonal_vol)
n_period = period_val.size
print(n_period)

pylab.rcParams['figure.figsize'] = (20, 9)
decompose_vol.plot()
plt.show()

## Feature Correlation

In [None]:
# Pairwise correlation of all columns, computed once and reused below.
corr_matrix = time_df.corr()
# Hide the upper triangle (diagonal included) so each pair is shown only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, square=True, linewidth=5)

## ACF

In [None]:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Autocorrelation of the volume series for lags up to 100.
plot_acf(x=time_df['volume'], lags=100)
plt.show()

## Model Building

In [None]:
import warnings
from statsmodels.tools.sm_exceptions import ValueWarning

# Suppress every warning category for all cells that run after this one.
warnings.simplefilter('ignore')

In [None]:
def time_to_supervised(data, n_in=1, seasonal_n_in=0, period=0, n_out=1, dropnan=True):
    """Reframe a time series as a supervised-learning table of shifted columns.

    Every column of ``data`` is repeated once per requested shift, and the
    copies are laid side by side in this order: plain lags (oldest first),
    seasonal lags (oldest first), then the current/future steps.

    Parameters
    ----------
    data : DataFrame, Series, list or array-like
        Anything accepted by ``pd.DataFrame``. Column labels are taken from
        the resulting frame, so inputs without string column names (lists,
        unnamed Series, arrays) work as well.
    n_in : int
        Number of consecutive lags ``t-n_in ... t-1`` to include.
    seasonal_n_in : int
        Number of seasonal lags ``t-seasonal_n_in*period ... t-period``.
        Only used when ``period > 0``.
    period : int
        Seasonal period, in rows. ``0`` disables seasonal lags.
    n_out : int
        Number of forward steps ``t, t+1, ... t+n_out-1`` to include.
    dropnan : bool
        Drop rows that contain NaN (the rows where a shift runs off the data).

    Returns
    -------
    pandas.DataFrame
        Columns named ``"<col> (t-k)"``, ``"<col> (t)"`` or ``"<col> (t+k)"``.
        The original index is discarded; rows are numbered from 0.
    """
    df = pd.DataFrame(data)
    cols = []
    insert_names = []

    def _add_shifted(offset, suffix):
        # Append one shifted copy of the frame plus a label for each column.
        cols.append(df.shift(offset))
        insert_names.extend(f"{col_name}{suffix}" for col_name in df.columns)

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        _add_shifted(i, f" (t-{i})")

    # input sequence for seasonal lags (t-n*period, ... t-period)
    if period > 0:
        for i in range(seasonal_n_in, 0, -1):
            _add_shifted(i * period, f" (t-{i * period})")

    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        _add_shifted(-i, f" (t+{i})" if i > 0 else " (t)")

    # put it all together
    agg = pd.concat(cols, axis=1)
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()

    return pd.DataFrame(agg.values, columns=insert_names)

### XGBOOST

In [None]:
class XGBOOST_Time_Series:
    """Recursive one-step forecaster built on ``XGBRegressor`` and lagged features.

    The last column of the training frame is treated as the target: its
    current-step column is removed from the features, and during prediction
    each forecast is written back into that column so later steps can use it
    as a lag.
    """

    def __init__(self, n_estimators=1000, max_depth=10, eta=0.1, subsample=1, colsample_bytree=1):
        self.xgb_model = XGBRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            eta=eta,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
        )

    def fit(self, target_data, data, n_lag=1, seasonal_n_lag=0, period=0, n_forecast=1):
        """Build the lagged feature table from ``data`` and fit the regressor.

        ``target_data`` is aligned by taking its last ``len(features)`` rows.
        """
        self.train_data = data
        self.n_lag = n_lag
        self.seasonal_n_lag = seasonal_n_lag
        self.period = period
        self.n_forecast = n_forecast

        current_target = self.train_data.columns[-1] + " (t)"
        supervised = time_to_supervised(self.train_data, self.n_lag, self.seasonal_n_lag, self.period, self.n_forecast)
        self.new_df = supervised.drop(columns=[current_target])

        n_rows = len(self.new_df)
        self.xgb_model.fit(self.new_df, target_data.iloc[len(target_data) - n_rows:])

    def predict(self, pred_data):
        """Forecast one row at a time for every row of ``pred_data``.

        ``pred_data`` is stacked under the training frame; columns it lacks
        (the target) start as NaN and are filled with each prediction in turn.
        Returns the list of predictions in row order.
        """
        self.pred_data = pd.concat([self.train_data, pred_data], axis=0)
        self.prediction_arr = []

        target_col = self.train_data.columns[-1]
        first_new_row = len(self.pred_data) - len(pred_data)

        for step in range(len(pred_data)):
            row_pos = first_new_row + step
            # Rebuild the lag table on everything up to and including this row.
            self.pred_fmt_data = time_to_supervised(
                self.pred_data.iloc[:row_pos + 1],
                self.n_lag, self.seasonal_n_lag, self.period, self.n_forecast,
                dropnan=False,
            )
            self.predicting_data = self.pred_fmt_data.drop(columns=[target_col + " (t)"]).dropna()
            # Only the newest feature row is scored.
            self.pred = self.xgb_model.predict(self.predicting_data.iloc[-1:])
            latest = self.pred[-1]
            self.prediction_arr.append(latest.tolist())
            # Feed the forecast back so it becomes a lag for the next step.
            self.pred_data.at[self.pred_data.index[row_pos], target_col] = latest
        return self.prediction_arr

In [None]:
new_time_df = time_df[['open_USD', 'volume']]

# Target is volume; the training features keep volume too so it can be lagged.
y = new_time_df[['volume']]
X = new_time_df

# Four expanding-window folds, each tested on the following 100 rows.
tscv = TimeSeriesSplit(n_splits=4, test_size=100)
folds = list(tscv.split(X))

y_train_arr = [y.iloc[train_index] for train_index, _ in folds]
X_train_arr = [X.iloc[train_index] for train_index, _ in folds]

y_test_arr = [y.iloc[test_index] for _, test_index in folds]
# Test features exclude volume: it is what the models have to forecast.
X_test_arr = [X.drop(columns=['volume']).iloc[test_index] for _, test_index in folds]

In [None]:
scores = []

# Candidate orders: plain lags 1..9.
# To search seasonal lags as well, use [n_lag, s_n_lag] pairs and pass
# seasonal_n_lag=order[1], period=7 to fit().
order_arr = [[n_lag] for n_lag in range(1, 10)]

for order in tqdm(order_arr):
    rmse_arr = []
    # Score on the first three folds only.
    for fold in range(3):
        xgbts_model = XGBOOST_Time_Series(n_estimators=1000)
        xgbts_model.fit(y_train_arr[fold], X_train_arr[fold], n_lag=order[0])
        pred = xgbts_model.predict(X_test_arr[fold])
        fold_rmse = mean_squared_error(pred, y_test_arr[fold]) ** 0.5
        rmse_arr.append(fold_rmse)

    scores.append([order[0], np.mean(rmse_arr)])

scores_df = pd.DataFrame(scores, columns=['n lag', 'rmse'])
# Best (lowest mean RMSE) configuration.
scores_df.sort_values(by='rmse', ascending=True).head(1)

In [None]:
# Evaluate on fold index 3 (alternative setting tried: n_lag=6 without seasonal lags).
v = 3
xgbts_model = XGBOOST_Time_Series(n_estimators=1000)
xgbts_model.fit(y_train_arr[v], X_train_arr[v], n_lag=3, seasonal_n_lag=2, period=7)
pred = xgbts_model.predict(X_test_arr[v])

actual = y_test_arr[v].values
rmse = mean_squared_error(pred, actual) ** 0.5
print("RMSE: ", rmse)

plt.plot(pred, label='predicted')
plt.plot(actual, label='actual')
plt.legend()
plt.show()

- RMSE: 51327.08772845146
    - n_lags = 3
    - seasonal_n_lags = 2
    - period = 7
    - test size = 100
    - v = 3
    - has seasonality, high variance (most closely matching that of the actual data)
- RMSE: 56851.1820771072
    - n_lags = 6
    - test size = 100
    - v = 3
    - has some seasonality, very low variance
- RMSE: 47565.09111563736
    - no lags
    - test size = 100
    - flat

In [None]:
# Lag-free baseline: predict volume from the other columns of the same row,
# training on everything except the last 100 rows and testing on those 100.
baseline_features = time_df.drop(columns=['volume'])
baseline_target = time_df['volume']
baseline_X_train, baseline_X_test = baseline_features.iloc[:-100], baseline_features.iloc[-100:]
baseline_y_train, baseline_y_test = baseline_target.iloc[:-100], baseline_target.iloc[-100:]

xgb_model = XGBRegressor(n_estimators=1000, max_depth=10, eta=0.1, subsample=1, colsample_bytree=1)
xgb_model.fit(baseline_X_train, baseline_y_train)
pred = xgb_model.predict(baseline_X_test)
print(len(pred))
print("RMSE: ", mean_squared_error(baseline_y_test, pred) ** 0.5)

plt.plot(pred, label='predicted')
# Plot the same hold-out slice that was scored, rather than relying on
# y_test_arr left over from another cell.
plt.plot(baseline_y_test.values, label='actual')
plt.legend()
plt.show()

### KNN

In [None]:
class KNN_Time_Series:
    """Recursive one-step forecaster built on ``KNeighborsRegressor`` and lagged features.

    The last column of the training frame is treated as the target: its
    current-step column is removed from the features, and during prediction
    each forecast is written back into that column so later steps can use it
    as a lag.
    """

    def __init__(self, n_neighbors):
        self.knn_model = KNeighborsRegressor(n_neighbors=n_neighbors)

    def fit(self, target_data, data, n_lag=1, seasonal_n_lag=0, period=0, n_forecast=1):
        """Build the lagged feature table from ``data`` and fit the regressor.

        ``target_data`` is aligned by taking its last ``len(features)`` rows.
        """
        self.train_data = data
        self.n_lag = n_lag
        self.seasonal_n_lag = seasonal_n_lag
        self.period = period
        self.n_forecast = n_forecast

        current_target = self.train_data.columns[-1] + " (t)"
        supervised = time_to_supervised(self.train_data, self.n_lag, self.seasonal_n_lag, self.period, self.n_forecast)
        self.new_df = supervised.drop(columns=[current_target])

        n_rows = len(self.new_df)
        self.knn_model.fit(self.new_df, target_data.iloc[len(target_data) - n_rows:])

    def predict(self, pred_data):
        """Forecast one row at a time for every row of ``pred_data``.

        ``pred_data`` is stacked under the training frame; columns it lacks
        (the target) start as NaN and are filled with each prediction in turn.
        Returns the list of predictions in row order.
        """
        self.pred_data = pd.concat([self.train_data, pred_data], axis=0)
        self.prediction_arr = []

        target_col = self.train_data.columns[-1]
        first_new_row = len(self.pred_data) - len(pred_data)

        for step in range(len(pred_data)):
            row_pos = first_new_row + step
            # Rebuild the lag table on everything up to and including this row.
            self.pred_fmt_data = time_to_supervised(
                self.pred_data.iloc[:row_pos + 1],
                self.n_lag, self.seasonal_n_lag, self.period, self.n_forecast,
                dropnan=False,
            )
            self.predicting_data = self.pred_fmt_data.drop(columns=[target_col + " (t)"]).dropna()
            # Only the newest feature row is scored.
            self.pred = self.knn_model.predict(self.predicting_data.iloc[-1:])
            latest = self.pred[-1]
            self.prediction_arr.append(latest.tolist())
            # Feed the forecast back so it becomes a lag for the next step.
            self.pred_data.at[self.pred_data.index[row_pos], target_col] = latest
        return self.prediction_arr

In [None]:
new_time_df = time_df[['open_USD', 'volume']]

# Target is volume; the training features keep volume too so it can be lagged.
y = new_time_df[['volume']]
X = new_time_df

# Four expanding-window folds, each tested on the following 100 rows.
tscv = TimeSeriesSplit(n_splits=4, test_size=100)
folds = list(tscv.split(X))

y_train_arr = [y.iloc[train_index] for train_index, _ in folds]
X_train_arr = [X.iloc[train_index] for train_index, _ in folds]

y_test_arr = [y.iloc[test_index] for _, test_index in folds]
# Test features exclude volume: it is what the models have to forecast.
X_test_arr = [X.drop(columns=['volume']).iloc[test_index] for _, test_index in folds]

In [None]:
scores = []

# Candidate orders: plain lags 1..9.
# To search seasonal lags as well, use [n_lag, seasonal_n_lag] pairs and pass
# seasonal_n_lag=order[1], period=7 to fit().
order_arr = [[n_lag] for n_lag in range(1, 10)]

for k in tqdm(range(2, 15), desc='k loop'):
    for order in tqdm(order_arr, desc='lag loop'):
        rmse_arr = []
        # Score on the first three folds only.
        for fold in range(3):
            knnts_model = KNN_Time_Series(n_neighbors=k)
            knnts_model.fit(y_train_arr[fold], X_train_arr[fold], n_lag=order[0])
            pred = knnts_model.predict(X_test_arr[fold])
            # An earlier experiment added the tiled seasonal component
            # (period_val) onto pred before scoring; it is not applied here.
            fold_rmse = mean_squared_error(pred, y_test_arr[fold]) ** 0.5
            rmse_arr.append(fold_rmse)

        scores.append([k, order[0], np.mean(rmse_arr)])

scores_df = pd.DataFrame(scores, columns=['k', 'n_lag', 'rmse'])
# Best (lowest mean RMSE) configuration.
scores_df.sort_values(by='rmse', ascending=True).head(1)

In [None]:
# Evaluate on fold index 3 (alternative tried: n_lag=1, seasonal_n_lag=2, period=7).
v = 3
knnts_model = KNN_Time_Series(n_neighbors=13)
knnts_model.fit(y_train_arr[v], X_train_arr[v], n_lag=1)
pred = knnts_model.predict(X_test_arr[v])

actual = y_test_arr[v].values
rmse = mean_squared_error(pred, actual) ** 0.5
print("RMSE: ", rmse)

plt.plot(pred, label='predicted')
plt.plot(actual, label='actual')
plt.legend()
plt.show()

- RMSE: 45001.89115279937
    - k = 13
    - n_lags = 1
    - no seasonal
    - test size = 100
    - v = 3
    - very frequent seasonality, quite flat
- RMSE: 60406.69353189595
    - k = 5
    - n_lags = 1
    - seasonal_n_lags = 2
    - test size = 100
    - v = 3
    - has a little seasonality, quite flat
- RMSE: 52201.216485396195
    - k = 5
    - n_lag = 9
    - test size = 100
    - v = 3
    - has pretty good seasonality

In [None]:
# Lag-free baseline: predict volume from the other columns of the same row,
# training on everything except the last 100 rows and testing on those 100.
knn_features = time_df.drop(columns=['volume'])
knn_target = time_df['volume']
knn_X_train, knn_X_test = knn_features.iloc[:-100], knn_features.iloc[-100:]
knn_y_train, knn_y_test = knn_target.iloc[:-100], knn_target.iloc[-100:]

scores = []
for k in range(2, 40):
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(knn_X_train, knn_y_train)
    pred = knn_model.predict(knn_X_test)
    scores.append([k, mean_squared_error(knn_y_test, pred) ** 0.5])
scores_df = pd.DataFrame(scores, columns=['k', 'rmse'])

# NOTE: k is chosen on the same 100 rows that are reported below, so the
# printed RMSE is an optimistic estimate.
best_k = int(scores_df.sort_values(by='rmse', ascending=True).iloc[0, 0])

knn_model = KNeighborsRegressor(n_neighbors=best_k)
knn_model.fit(knn_X_train, knn_y_train)
pred = knn_model.predict(knn_X_test)

print("k:", best_k)
print("RMSE: ", mean_squared_error(knn_y_test, pred) ** 0.5)
plt.plot(pred, label='predicted')
# Plot the same hold-out slice that was scored, rather than relying on
# y_test_arr left over from another cell.
plt.plot(knn_y_test.values, label='actual')
plt.legend()
plt.show()

### SVR

In [None]:
class SVR_Time_Series:
    """Recursive one-step forecaster built on ``SVR`` and lagged features.

    The last column of the training frame is treated as the target: its
    current-step column is removed from the features, and during prediction
    each forecast is written back into that column so later steps can use it
    as a lag.
    """

    def __init__(self):
        self.svr_model = SVR()

    def fit(self, target_data, data, n_lag=1, seasonal_n_lag=0, period=0, n_forecast=1):
        """Build the lagged feature table from ``data`` and fit the regressor.

        ``target_data`` is aligned by taking its last ``len(features)`` rows.
        """
        self.train_data = data
        self.n_lag = n_lag
        self.seasonal_n_lag = seasonal_n_lag
        self.period = period
        self.n_forecast = n_forecast

        current_target = self.train_data.columns[-1] + " (t)"
        supervised = time_to_supervised(self.train_data, self.n_lag, self.seasonal_n_lag, self.period, self.n_forecast)
        self.new_df = supervised.drop(columns=[current_target])

        n_rows = len(self.new_df)
        self.svr_model.fit(self.new_df, target_data.iloc[len(target_data) - n_rows:])

    def predict(self, pred_data):
        """Forecast one row at a time for every row of ``pred_data``.

        ``pred_data`` is stacked under the training frame; columns it lacks
        (the target) start as NaN and are filled with each prediction in turn.
        Returns the list of predictions in row order.
        """
        self.pred_data = pd.concat([self.train_data, pred_data], axis=0)
        self.prediction_arr = []

        target_col = self.train_data.columns[-1]
        first_new_row = len(self.pred_data) - len(pred_data)

        for step in range(len(pred_data)):
            row_pos = first_new_row + step
            # Rebuild the lag table on everything up to and including this row.
            self.pred_fmt_data = time_to_supervised(
                self.pred_data.iloc[:row_pos + 1],
                self.n_lag, self.seasonal_n_lag, self.period, self.n_forecast,
                dropnan=False,
            )
            self.predicting_data = self.pred_fmt_data.drop(columns=[target_col + " (t)"]).dropna()
            # Only the newest feature row is scored.
            self.pred = self.svr_model.predict(self.predicting_data.iloc[-1:])
            latest = self.pred[-1]
            self.prediction_arr.append(latest.tolist())
            # Feed the forecast back so it becomes a lag for the next step.
            self.pred_data.at[self.pred_data.index[row_pos], target_col] = latest
        return self.prediction_arr

In [None]:
new_time_df = time_df[['open_USD', 'volume']]

# Target is volume; the training features keep volume too so it can be lagged.
y = new_time_df[['volume']]
X = new_time_df

# Four expanding-window folds, each tested on the following 100 rows.
tscv = TimeSeriesSplit(n_splits=4, test_size=100)
folds = list(tscv.split(X))

y_train_arr = [y.iloc[train_index] for train_index, _ in folds]
X_train_arr = [X.iloc[train_index] for train_index, _ in folds]

y_test_arr = [y.iloc[test_index] for _, test_index in folds]
# Test features exclude volume: it is what the models have to forecast.
X_test_arr = [X.drop(columns=['volume']).iloc[test_index] for _, test_index in folds]

In [None]:
scores = []

# Candidate orders: every (plain lag, seasonal lag) pair with both in 1..9.
# For a plain-lag-only search, use [[n_lag] for n_lag in range(1, 10)] and
# drop the seasonal arguments and column below.
order_arr = [
    [n_lag, seasonal_n_lag]
    for n_lag in range(1, 10)
    for seasonal_n_lag in range(1, 10)
]

for order in tqdm(order_arr, desc='lag loop'):
    rmse_arr = []
    # Score on the first three folds only.
    for fold in range(3):
        svrts_model = SVR_Time_Series()
        svrts_model.fit(y_train_arr[fold], X_train_arr[fold], n_lag=order[0], seasonal_n_lag=order[1], period=7)
        pred = svrts_model.predict(X_test_arr[fold])
        fold_rmse = mean_squared_error(pred, y_test_arr[fold]) ** 0.5
        rmse_arr.append(fold_rmse)

    scores.append([order[0], order[1], np.mean(rmse_arr)])

scores_df = pd.DataFrame(scores, columns=['n_lag', 'seasonal_n_lag', 'rmse'])
# Best (lowest mean RMSE) configuration.
scores_df.sort_values(by='rmse', ascending=True).head(1)

In [None]:
# Evaluate on fold index 3 (alternative tried: n_lag=9 without seasonal lags).
v = 3
svrts_model = SVR_Time_Series()
svrts_model.fit(y_train_arr[v], X_train_arr[v], n_lag=1, seasonal_n_lag=9, period=7)
pred = svrts_model.predict(X_test_arr[v])

actual = y_test_arr[v].values
rmse = mean_squared_error(pred, actual) ** 0.5
print("RMSE: ", rmse)

plt.plot(pred, label='predicted')
plt.plot(actual, label='actual')
plt.legend()
plt.show()

- RMSE: 61131.72429583171
    - n_lag = 1
    - seasonal_n_lag = 9
    - test size = 100
    - v = 3
    - straight line
- RMSE: 62289.679460216226
    - n_lag = 9
    - test size = 100
    - v = 3
    - straight line

### Linear Regression

In [None]:
class LinearRegression_Time_Series:
    """Recursive one-step forecaster built on ``LinearRegression`` and lagged features.

    The last column of the training frame is treated as the target: its
    current-step column is removed from the features, and during prediction
    each forecast is written back into that column so later steps can use it
    as a lag.
    """

    def __init__(self):
        self.lr_model = LinearRegression()

    def fit(self, target_data, data, n_lag=1, seasonal_n_lag=0, period=0, n_forecast=1):
        """Build the lagged feature table from ``data`` and fit the regressor.

        ``target_data`` is aligned by taking its last ``len(features)`` rows.
        """
        self.train_data = data
        self.n_lag = n_lag
        self.seasonal_n_lag = seasonal_n_lag
        self.period = period
        self.n_forecast = n_forecast

        current_target = self.train_data.columns[-1] + " (t)"
        supervised = time_to_supervised(self.train_data, self.n_lag, self.seasonal_n_lag, self.period, self.n_forecast)
        self.new_df = supervised.drop(columns=[current_target])

        n_rows = len(self.new_df)
        self.lr_model.fit(self.new_df, target_data.iloc[len(target_data) - n_rows:])

    def predict(self, pred_data):
        """Forecast one row at a time for every row of ``pred_data``.

        ``pred_data`` is stacked under the training frame; columns it lacks
        (the target) start as NaN and are filled with each prediction in turn.
        Returns the list of predictions in row order.
        """
        self.pred_data = pd.concat([self.train_data, pred_data], axis=0)
        self.prediction_arr = []

        target_col = self.train_data.columns[-1]
        first_new_row = len(self.pred_data) - len(pred_data)

        for step in range(len(pred_data)):
            row_pos = first_new_row + step
            # Rebuild the lag table on everything up to and including this row.
            self.pred_fmt_data = time_to_supervised(
                self.pred_data.iloc[:row_pos + 1],
                self.n_lag, self.seasonal_n_lag, self.period, self.n_forecast,
                dropnan=False,
            )
            self.predicting_data = self.pred_fmt_data.drop(columns=[target_col + " (t)"]).dropna()
            # Only the newest feature row is scored.
            self.pred = self.lr_model.predict(self.predicting_data.iloc[-1:])
            latest = self.pred[-1]
            self.prediction_arr.append(latest.tolist())
            # Feed the forecast back so it becomes a lag for the next step.
            self.pred_data.at[self.pred_data.index[row_pos], target_col] = latest
        return self.prediction_arr

In [None]:
new_time_df = time_df[['open_USD', 'volume']]

# Target is volume; the training features keep volume too so it can be lagged.
y = new_time_df[['volume']]
X = new_time_df

# Four expanding-window folds, each tested on the following 100 rows.
tscv = TimeSeriesSplit(n_splits=4, test_size=100)
folds = list(tscv.split(X))

y_train_arr = [y.iloc[train_index] for train_index, _ in folds]
X_train_arr = [X.iloc[train_index] for train_index, _ in folds]

y_test_arr = [y.iloc[test_index] for _, test_index in folds]
# Test features exclude volume: it is what the models have to forecast.
X_test_arr = [X.drop(columns=['volume']).iloc[test_index] for _, test_index in folds]

In [None]:
scores = []

# Candidate orders: plain lags 1..9.
# To search seasonal lags as well, use [n_lag, s_n_lag] pairs and pass
# seasonal_n_lag=order[1], period=7 to fit().
order_arr = [[n_lag] for n_lag in range(1, 10)]

for order in tqdm(order_arr):
    rmse_arr = []
    # Score on the first three folds only.
    for fold in range(3):
        lrts_model = LinearRegression_Time_Series()
        lrts_model.fit(y_train_arr[fold], X_train_arr[fold], n_lag=order[0])
        pred = lrts_model.predict(X_test_arr[fold])
        fold_rmse = mean_squared_error(pred, y_test_arr[fold]) ** 0.5
        rmse_arr.append(fold_rmse)

    scores.append([order[0], np.mean(rmse_arr)])

scores_df = pd.DataFrame(scores, columns=['n lag', 'rmse'])
# Best (lowest mean RMSE) configuration.
scores_df.sort_values(by='rmse', ascending=True).head(1)

In [None]:
# Evaluate on fold index 3 (alternative tried: n_lag=3 without seasonal lags).
v = 3
lrts_model = LinearRegression_Time_Series()
lrts_model.fit(y_train_arr[v], X_train_arr[v], n_lag=9, seasonal_n_lag=5, period=7)
pred = lrts_model.predict(X_test_arr[v])

actual = y_test_arr[v].values
rmse = mean_squared_error(pred, actual) ** 0.5
print("RMSE: ", rmse)

plt.plot(pred, label='predicted')
plt.plot(actual, label='actual')
plt.legend()
plt.show()

- RMSE: 61824.225984872195
    - n_lag = 9
    - seasonal_n_lag = 5
    - test size = 100
    - v = 3
    - straight line
- RMSE: 62434.37767652163
    - n_lag = 3
    - test size = 100
    - v = 3
    - straight line

### ARIMA

In [None]:
def arima_tuning(train_df, test_df, v_range, max_order, max_seasonal_order=None, period=0):
    """Grid-search ARIMA orders over several folds, recording mean AIC and RMSE.

    For each fold, the last column of the frame is used as the endogenous
    series and all preceding columns as exogenous regressors.

    Parameters
    ----------
    train_df, test_df : sequence of DataFrame
        Per-fold training and test frames, indexed by fold number.
    v_range : int
        Number of folds to evaluate (folds ``0 .. v_range-1``).
    max_order : sequence of 3 ints
        Exclusive upper bounds for (p, d, q). p and q start at 1, d at 0.
    max_seasonal_order : sequence of 3 ints, optional
        Exclusive upper bounds for seasonal (P, D, Q), each starting at 0.
        When empty or ``None``, no seasonal component is searched.
    period : int
        Seasonal period appended to every seasonal order.

    Returns
    -------
    pandas.DataFrame
        One row per order with columns ``p, d, q[, P, D, Q], aic, rmse``.
        ``aic`` and ``rmse`` are means over the folds that fitted
        successfully, or NaN when no fold did.
    """
    # Truthiness instead of `== []` so None, [] and () all mean "non-seasonal".
    seasonal = bool(max_seasonal_order)

    nonseasonal_orders = [
        [p, d, q]
        for p in range(1, max_order[0])
        for d in range(max_order[1])
        for q in range(1, max_order[2])
    ]
    if seasonal:
        seasonal_orders = [
            [P, D, Q, period]
            for P in range(max_seasonal_order[0])
            for D in range(max_seasonal_order[1])
            for Q in range(max_seasonal_order[2])
        ]
        order_arr = [[o, s] for o in nonseasonal_orders for s in seasonal_orders]
    else:
        order_arr = [[o] for o in nonseasonal_orders]

    scores = []
    for order in tqdm(order_arr):
        rmse_arr = []
        aic_arr = []
        for v in range(v_range):
            exog_df = train_df[v].iloc[:, :-1]
            endog_df = train_df[v].iloc[:, -1]

            exog_test = test_df[v].iloc[:, :-1]
            endog_test = test_df[v].iloc[:, -1]

            n = len(test_df[v].index)
            seasonal_kwargs = {'seasonal_order': order[1]} if seasonal else {}

            try:
                arima_model = ARIMA(endog=endog_df, exog=exog_df, order=order[0], **seasonal_kwargs)
                arima_result = arima_model.fit()
                fc = arima_result.forecast(steps=n, exog=exog_test)
                fold_rmse = mean_squared_error(endog_test, fc) ** 0.5
                fold_aic = arima_result.aic
            except Exception:
                # Best effort: skip folds where this order cannot be fitted or
                # forecast. KeyboardInterrupt is deliberately not caught so
                # a long search can still be stopped.
                continue
            rmse_arr.append(fold_rmse)
            aic_arr.append(fold_aic)

        # Avoid np.mean([]) when every fold failed for this order.
        mean_aic = np.mean(aic_arr) if aic_arr else np.nan
        mean_rmse = np.mean(rmse_arr) if rmse_arr else np.nan

        row = list(order[0])
        if seasonal:
            row += list(order[1][:3])
        scores.append(row + [mean_aic, mean_rmse])

    columns = ['p', 'd', 'q']
    if seasonal:
        columns += ['P', 'D', 'Q']
    return pd.DataFrame(scores, columns=columns + ['aic', 'rmse'])

In [None]:
# Four expanding-window folds over the full frame, each tested on 100 rows.
# To restrict the regressors, split time_df[['open_USD', 'volume']] instead.
tss = TimeSeriesSplit(n_splits=4, test_size=100)
splits = list(tss.split(time_df))

train_df = [time_df.iloc[train_i] for train_i, _ in splits]
test_df = [time_df.iloc[test_i] for _, test_i in splits]

In [None]:
# Search ARIMA orders with weekly (period 7) seasonal terms on the first three folds.
scores_df = arima_tuning(
    train_df=train_df,
    test_df=test_df,
    v_range=3,
    max_order=(3, 3, 3),
    max_seasonal_order=(2, 2, 2),
    period=7,
)

In [None]:
# Show the best order according to each criterion (lower is better for both).
for metric in ('rmse', 'aic'):
    print(f"{metric} : \n", scores_df.sort_values(by=metric, ascending=True).head(1))

In [None]:
# Evaluate on fold index 3: the last column is the series to forecast,
# the remaining columns are exogenous regressors.
v = 3
fold_train, fold_test = train_df[v], test_df[v]

exog_df, endog_df = fold_train.iloc[:, :-1], fold_train.iloc[:, -1]
exog_test, endog_test = fold_test.iloc[:, :-1], fold_test.iloc[:, -1]

n = len(fold_test.index)

# Alternative tried: order=(1,0,2), seasonal_order=(0,1,1,7).
arima_model = ARIMA(endog=endog_df, exog=exog_df, order=(1,0,1), seasonal_order=(1,0,1,7))
arima_result = arima_model.fit()
fc = arima_result.forecast(steps=n, exog=exog_test)

rmse = mean_squared_error(fc, endog_test) ** 0.5
print('RMSE: ', rmse)

plt.figure(figsize=(14, 10))
plt.plot(fc, label='prediction')
plt.plot(endog_test, label='actual')
plt.legend()
plt.show()

### RMSE 74743.56986169254 ## RMSE metric | order = (1,0,1) | seasonal order = (1,1,1,7) | test size = 100
### RMSE 385057918.3019191 ## AIC metric | order = (1,2,2) | seasonal order = (1,0,1,7) | test size = 100
### RMSE 48113.173750691385 ## AIC metric | order = (1,2,2) | seasonal order = (0,0,1,7) | test size = 20
### RMSE 44961.00223276705 ## AIC metric | order = (1,0,1) | seasonal order = (0,1,1,7) | test size = 20

- RMSE: 74743.56986169254
    - RMSE metric
    - order = (1,0,1)
    - seasonal_order = (1,1,1,7)
    - test size = 100
- RMSE: 385057918.3019191
    - AIC metric
    - order = (1,2,2)
    - seasonal_order = (1,0,1,7)
    - test size = 100
- RMSE: 98495.86596233559
    - RMSE metric
    - order = (1,1,2)
    - seasonal_order = (0,1,1,7)
    - test size = 60
- RMSE: 126240.64247175687
    - AIC metric
    - order = (1,0,1)
    - seasonal_order = (0,1,1,7)
    - test size = 60
- RMSE: 42198.0083154924
    -  RMSE metric
    - order = (2,0,1)
    - test size = 60
    - removed other correlated variables
- RMSE: 41501.67554212661
    - AIC metric
    - order = (2,0,2)
    - seasonal_order = (0,1,1,7)
    - test size = 60
    - removed other correlated variables
- RMSE: 61971.889367297226
    - AIC metric
    - order = (1,0,2)
    - seasonal_order = (0,1,1,7)
    - test size = 100
    - removed other correlated variables
- RMSE: 42140.29575194003
    - RMSE metric
    - order = (1,0,1)
    - seasonal_order = (1,0,1,7)
    - test size = 100
    - removed other correlated variables

## References
Dataset: https://www.kaggle.com/datasets/szrlee/stock-time-series-20050101-to-20171231?select=AABA_2006-01-01_to_2018-01-01.csv