# Modeling

In [1]:
import pandas as pd

In [2]:
daily = pd.read_csv('data/daily.csv', index_col=0)
monthly = pd.read_csv('data/monthly.csv', index_col=0)
quarterly = pd.read_csv('data/trimestral.csv', index_col=0)

daily.index = pd.to_datetime(daily.index)
monthly.index = pd.to_datetime(monthly.index)
quarterly.index = pd.to_datetime(quarterly.index)

In [3]:
weekly = daily.resample('W').sum()

In [4]:
daily['Day'] = daily.index.day
daily['Month'] = daily.index.month

weekly['Month'] = weekly.index.month

monthly['Month'] = monthly.index.month

quarterly['Quarter'] = quarterly.index.quarter

### Train-test split

In [5]:
train_size = 0.8

def get_train_test(data, train_size):
    size = len(data)
    train = data.iloc[:int(len(data) * train_size), :]
    test = data.iloc[int(len(data) * train_size):, :]

    return train, test

In [6]:
daily_train, daily_test = get_train_test(daily, train_size)
weekly_train, weekly_test = get_train_test(weekly, train_size)
monthly_train, monthly_test = get_train_test(monthly, train_size)
quarterly_train, quarterly_test = get_train_test(quarterly, train_size)

In [35]:
# Split check
print(daily_train.iloc[0].name, '-', daily_train.iloc[-1].name, '-->', daily_test.iloc[0].name, '-', daily_test.iloc[-1].name)
print(weekly_train.iloc[0].name, '-', weekly_train.iloc[-1].name, '-->', weekly_test.iloc[0].name, '-', weekly_test.iloc[-1].name)
print(monthly_train.iloc[0].name, '-', monthly_train.iloc[-1].name, '-->', monthly_test.iloc[0].name, '-', monthly_test.iloc[-1].name)
print(quarterly_train.iloc[0].name, '-', quarterly_train.iloc[-1].name, '-->', quarterly_test.iloc[0].name, '-', quarterly_test.iloc[-1].name)

2016-01-01 00:00:00 - 2021-08-06 00:00:00 --> 2021-08-07 00:00:00 - 2022-12-31 00:00:00
2016-01-03 00:00:00 - 2021-08-01 00:00:00 --> 2021-08-08 00:00:00 - 2023-01-01 00:00:00
2016-01-31 00:00:00 - 2021-07-31 00:00:00 --> 2021-08-31 00:00:00 - 2022-12-31 00:00:00
2016-01-31 00:00:00 - 2021-07-31 00:00:00 --> 2021-10-31 00:00:00 - 2023-01-31 00:00:00


In [32]:
weekly_train.iloc[0].name
monthly_train.iloc[0].name
weekly_train.iloc[0].name

Timestamp('2016-01-03 00:00:00', freq='W-SUN')

## SARIMAX

In [8]:
import pmdarima as pmd

In [277]:
smax_wly_foot = pmd.auto_arima(
    weekly['Football'], X=weekly['Month'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=3, max_d=1, max_q=3,
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2,
    max_order=6, m=52, trace=True)

smax_mth_foot = pmd.auto_arima(
    monthly['Football'], X=monthly['Month'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=5, max_d=2, max_q=5, 
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2, 
    max_order=12, m=12, trace=True)

smax_qtr_foot = pmd.auto_arima(
    quarterly['Football'], X=quarterly['Quarter'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=5, max_d=2, max_q=5, 
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2, 
    max_order=12, m=4, trace=True)

Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[52] intercept   : AIC=8084.734, Time=0.08 sec
 ARIMA(1,1,0)(1,0,0)[52] intercept   : AIC=7962.192, Time=1.20 sec
 ARIMA(0,1,1)(0,0,1)[52] intercept   : AIC=7900.429, Time=1.53 sec
 ARIMA(0,1,0)(0,0,0)[52]             : AIC=9872.196, Time=0.02 sec
 ARIMA(0,1,1)(0,0,0)[52] intercept   : AIC=7963.459, Time=0.04 sec
 ARIMA(0,1,1)(1,0,1)[52] intercept   : AIC=7888.884, Time=2.11 sec
 ARIMA(0,1,1)(1,0,0)[52] intercept   : AIC=7887.099, Time=1.60 sec
 ARIMA(0,1,1)(2,0,0)[52] intercept   : AIC=7888.903, Time=10.04 sec
 ARIMA(0,1,1)(2,0,1)[52] intercept   : AIC=7890.853, Time=21.10 sec
 ARIMA(0,1,0)(1,0,0)[52] intercept   : AIC=8005.869, Time=1.14 sec
 ARIMA(1,1,1)(1,0,0)[52] intercept   : AIC=7873.722, Time=2.46 sec
 ARIMA(1,1,1)(0,0,0)[52] intercept   : AIC=7961.236, Time=0.06 sec
 ARIMA(1,1,1)(2,0,0)[52] intercept   : AIC=7875.665, Time=15.63 sec
 ARIMA(1,1,1)(1,0,1)[52] intercept   : AIC=7875.686, Time=3.73 sec
 ARIMA(1,1,1)(0,

In [278]:
def print_params(mod):
    order = mod.get_params()['order']
    sorder = mod.get_params()['seasonal_order']
    print(f'SARIMAX {order} {sorder[:3]}-{sorder[-1]}')

print_params(smax_wly_foot)
print_params(smax_mth_foot)
print_params(smax_qtr_foot)

SARIMAX (3, 1, 1) (1, 0, 0)-52
SARIMAX (1, 0, 0) (0, 0, 0)-12
SARIMAX (0, 0, 0) (1, 0, 0)-4


In [279]:
smax_wly_casu = pmd.auto_arima(
    weekly['Casual'], X=weekly['Month'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=3, max_d=1, max_q=3,
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2,
    max_order=6, m=52, trace=True)

smax_mth_casu = pmd.auto_arima(
    monthly['Casual'], X=monthly['Month'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=5, max_d=2, max_q=5, 
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2, 
    max_order=12, m=12, stepwise=True, trace=True)

smax_qtr_casu = pmd.auto_arima(
    quarterly['Casual'], X=quarterly['Quarter'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=5, max_d=2, max_q=5, 
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2, 
    max_order=12, m=4, stepwise=True, trace=True)

Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[52] intercept   : AIC=7776.094, Time=0.02 sec
 ARIMA(1,1,0)(1,0,0)[52] intercept   : AIC=7666.714, Time=1.38 sec
 ARIMA(0,1,1)(0,0,1)[52] intercept   : AIC=7570.793, Time=8.74 sec
 ARIMA(0,1,0)(0,0,0)[52]             : AIC=9563.558, Time=0.02 sec
 ARIMA(0,1,1)(0,0,0)[52] intercept   : AIC=7600.270, Time=0.13 sec
 ARIMA(0,1,1)(1,0,1)[52] intercept   : AIC=7573.965, Time=7.08 sec
 ARIMA(0,1,1)(0,0,2)[52] intercept   : AIC=7578.742, Time=10.90 sec
 ARIMA(0,1,1)(1,0,0)[52] intercept   : AIC=7573.123, Time=4.70 sec
 ARIMA(0,1,1)(1,0,2)[52] intercept   : AIC=7580.510, Time=19.27 sec
 ARIMA(0,1,0)(0,0,1)[52] intercept   : AIC=7734.887, Time=1.32 sec
 ARIMA(1,1,1)(0,0,1)[52] intercept   : AIC=7574.006, Time=7.49 sec
 ARIMA(0,1,2)(0,0,1)[52] intercept   : AIC=7579.229, Time=2.40 sec
 ARIMA(1,1,0)(0,0,1)[52] intercept   : AIC=7668.030, Time=1.37 sec
 ARIMA(1,1,2)(0,0,1)[52] intercept   : AIC=7571.354, Time=9.28 sec
 ARIMA(0,1,1)(0,0

In [280]:
print_params(smax_wly_casu)
print_params(smax_mth_casu)
print_params(smax_qtr_casu)

SARIMAX (0, 1, 1) (0, 0, 1)-52
SARIMAX (1, 1, 1) (0, 0, 0)-12
SARIMAX (0, 1, 0) (0, 0, 0)-4


In [281]:
smax_wly_fish = pmd.auto_arima(
    weekly['Fishing'], X=weekly['Month'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=3, max_d=1, max_q=3,
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2,
    max_order=6, m=52, trace=True)

smax_mth_fish = pmd.auto_arima(
    monthly['Fishing'], X=monthly['Month'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=5, max_d=2, max_q=5, 
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2, 
    max_order=12, m=12, stepwise=True, trace=True)

smax_qtr_fish = pmd.auto_arima(
    quarterly['Fishing'], X=quarterly['Quarter'].values.reshape(-1, 1), 
    start_p=0, d=None, start_q=0, max_p=5, max_d=2, max_q=5, 
    start_P=0, D=None, start_Q=0, max_P=2, max_D=1, max_Q=2, 
    max_order=12, m=4, stepwise=True, trace=True)

Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[52] intercept   : AIC=7552.351, Time=0.02 sec
 ARIMA(1,1,0)(1,0,0)[52] intercept   : AIC=7477.379, Time=1.55 sec
 ARIMA(0,1,1)(0,0,1)[52] intercept   : AIC=7394.881, Time=1.91 sec
 ARIMA(0,1,0)(0,0,0)[52]             : AIC=9339.815, Time=0.02 sec
 ARIMA(0,1,1)(0,0,0)[52] intercept   : AIC=7394.302, Time=0.11 sec
 ARIMA(0,1,1)(1,0,0)[52] intercept   : AIC=7394.629, Time=1.97 sec
 ARIMA(0,1,1)(1,0,1)[52] intercept   : AIC=inf, Time=9.56 sec
 ARIMA(1,1,1)(0,0,0)[52] intercept   : AIC=7393.653, Time=0.14 sec
 ARIMA(1,1,1)(1,0,0)[52] intercept   : AIC=7393.424, Time=5.89 sec
 ARIMA(1,1,1)(2,0,0)[52] intercept   : AIC=7390.666, Time=33.80 sec
 ARIMA(1,1,1)(2,0,1)[52] intercept   : AIC=7393.484, Time=27.24 sec
 ARIMA(1,1,1)(1,0,1)[52] intercept   : AIC=inf, Time=12.16 sec
 ARIMA(0,1,1)(2,0,0)[52] intercept   : AIC=7391.239, Time=9.72 sec
 ARIMA(1,1,0)(2,0,0)[52] intercept   : AIC=7472.802, Time=7.55 sec
 ARIMA(2,1,1)(2,0,0)[52] i

In [282]:
print_params(smax_wly_fish)
print_params(smax_mth_fish)
print_params(smax_qtr_fish)

SARIMAX (3, 1, 2) (2, 0, 0)-52
SARIMAX (0, 1, 2) (0, 0, 0)-12
SARIMAX (0, 0, 0) (1, 0, 0)-4


In [9]:
# PARAMETERS
params = {
    'Football':{
        'weekly': [(3, 1, 1), (1, 0, 0, 52)],
        'monthly': [(1, 0, 0), (0, 0, 0, 12)],
        'quarterly': [(0, 0, 0), (1, 0, 0, 4)]
    },
    'Casual':{
        'weekly': [(0, 1, 1), (0, 0, 1, 52)],
        'monthly': [(1, 1, 1), (0, 0, 1, 12)],
        'quarterly': [(0, 1, 0), (0, 0, 0, 4)]
    },
    'Fishing':{
        'weekly': [(3, 1, 2), (2, 0, 0, 52)],
        'monthly': [(0, 1, 2), (0, 0, 0, 12)],
        'quarterly': [(0, 0, 0), (1, 0, 0, 4)]
    }
}

# DATASETS
datasets = {
    'Train': {
        'weekly': weekly_train,
        'monthly': monthly_train,
        'quarterly': quarterly_train
    },
    'Test': {
        'weekly': weekly_test,
        'monthly': monthly_test,
        'quarterly': quarterly_test
    }
}

In [10]:
import statsmodels.api as sm

def sarimax_predict(sector, time_frame, refit_period=None):
    assert (sector in ('Football', 'Casual', 'Fishing')) & (time_frame in ('weekly', 'monthly', 'quarterly'))
    print("Starting SARIMAX fit...")

    if time_frame == 'quarterly':
        exog_tf = 'Quarter'
    else:
        exog_tf = 'Month'

    if not refit_period:
        refit_period = len(datasets['Test'][time_frame])

    print(f"Number of fits: {len(datasets['Test'][time_frame]) // refit_period}")

    predictions = []

    for i in range(len(datasets['Test'][time_frame]) // refit_period):

        endog = pd.concat(
            [datasets['Train'][time_frame][sector], datasets['Test'][time_frame][sector].iloc[:i*refit_period]],
            axis=0
        )

        exog = pd.concat(
            [datasets['Train'][time_frame][exog_tf], datasets['Test'][time_frame][exog_tf].iloc[:i*refit_period]],
            axis=0
        )

        mod = sm.tsa.statespace.SARIMAX(
            endog=endog, 
            exog=exog,
            order=params[sector][time_frame][0],
            seasonal_order=params[sector][time_frame][1])
        
        fit_ = mod.fit()

        y_hat = fit_.get_prediction(
            start=datasets['Test'][time_frame].index[i*refit_period], 
            end=datasets['Test'][time_frame].index[(i+1)*refit_period - 1], 
            exog=datasets['Test'][time_frame][exog_tf][i*refit_period:(i+1)*refit_period])
        
        predictions.append(pd.concat([
            y_hat.predicted_mean,
            y_hat.conf_int()
        ], axis=1))
        
    return pd.concat(predictions, axis=0)

In [None]:
foot_wly_pred = sarimax_predict('Football', 'weekly', refit_period=4)
foot_mth_pred = sarimax_predict('Football', 'monthly', refit_period=1)
foot_qtr_pred = sarimax_predict('Football', 'quarterly', refit_period=1)

In [None]:
casu_wly_pred = sarimax_predict('Casual', 'weekly', refit_period=4)
casu_mth_pred = sarimax_predict('Casual', 'monthly', refit_period=1)
casu_qtr_pred = sarimax_predict('Casual', 'quarterly', refit_period=1)

In [None]:
fish_wly_pred = sarimax_predict('Fishing', 'weekly', refit_period=4)
fish_mth_pred = sarimax_predict('Fishing', 'monthly', refit_period=1)
fish_qtr_pred = sarimax_predict('Fishing', 'quarterly', refit_period=1)

In [14]:
foot_wly_pred.to_csv('data/predictions/SARIMAX/foot_wly.csv')
foot_mth_pred.to_csv('data/predictions/SARIMAX/foot_mth.csv')
foot_qtr_pred.to_csv('data/predictions/SARIMAX/foot_qtr.csv')

casu_wly_pred.to_csv('data/predictions/SARIMAX/casu_wly.csv')
casu_mth_pred.to_csv('data/predictions/SARIMAX/casu_mth.csv')
casu_qtr_pred.to_csv('data/predictions/SARIMAX/casu_qtr.csv')

fish_wly_pred.to_csv('data/predictions/SARIMAX/fish_wly.csv')
fish_mth_pred.to_csv('data/predictions/SARIMAX/fish_mth.csv')
fish_qtr_pred.to_csv('data/predictions/SARIMAX/fish_qtr.csv')

In [15]:
# PREDICTIONS
predictions = {
    'Football':{
        'weekly': foot_wly_pred,
        'monthly': foot_mth_pred,
        'quarterly': foot_qtr_pred
    },
    'Casual':{
        'weekly': casu_wly_pred,
        'monthly': casu_mth_pred,
        'quarterly': casu_qtr_pred
    },
    'Fishing':{
        'weekly': fish_wly_pred,
        'monthly': fish_mth_pred,
        'quarterly': fish_qtr_pred
    }
}

In [16]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import numpy as np

def get_metrics(sector, time_frame):

    cut_size = len(predictions[sector][time_frame])

    mape = np.round_(mean_absolute_percentage_error(
        datasets['Test'][time_frame][sector].iloc[:cut_size], 
        predictions[sector][time_frame].iloc[:, 0])* 100., decimals=2)
    
    rmse = np.round_(np.sqrt(mean_squared_error(
        datasets['Test'][time_frame][sector].iloc[:cut_size], 
        predictions[sector][time_frame].iloc[:, 0])), decimals=2)
    
    print(f"SECTOR: {sector}  \t TIME FRAME: {time_frame}")
    print(f"RMSE: {rmse} \t\t MAPE: {mape}")

In [17]:
get_metrics('Football', 'weekly')
get_metrics('Football', 'monthly')
get_metrics('Football', 'quarterly')

SECTOR: Football  	 TIME FRAME: weekly
RMSE: 8618.93 		 MAPE: 39.34
SECTOR: Football  	 TIME FRAME: monthly
RMSE: 41226.97 		 MAPE: 35.45
SECTOR: Football  	 TIME FRAME: quarterly
RMSE: 91787.82 		 MAPE: 24.09


In [20]:
get_metrics('Casual', 'weekly')
get_metrics('Casual', 'monthly')
get_metrics('Casual', 'quarterly')

SECTOR: Casual  	 TIME FRAME: weekly
RMSE: 5297.43 		 MAPE: 42.96
SECTOR: Casual  	 TIME FRAME: monthly
RMSE: 14907.47 		 MAPE: 26.08
SECTOR: Casual  	 TIME FRAME: quarterly
RMSE: 29596.15 		 MAPE: 16.22


In [21]:
get_metrics('Fishing', 'weekly')
get_metrics('Fishing', 'monthly')
get_metrics('Fishing', 'quarterly')

SECTOR: Fishing  	 TIME FRAME: weekly
RMSE: 5424.31 		 MAPE: 22.0
SECTOR: Fishing  	 TIME FRAME: monthly
RMSE: 12423.57 		 MAPE: 11.34
SECTOR: Fishing  	 TIME FRAME: quarterly
RMSE: 69861.36 		 MAPE: 28.41


## XGBoost

In [211]:
# Dataset preparation

def prepare_dataset(sector, time_frame, n, k, dropna=True):

    if time_frame == 'quarterly':
        exog_tf = 'Quarter'
    else:
        exog_tf = 'Month'

    train = datasets['Train'][time_frame][[sector, exog_tf]].copy()
    test = datasets['Test'][time_frame][[sector, exog_tf]].copy()

    train_size = len(train)

    all_df = pd.concat([train, test], axis=0)

    for i in range(n // k):
        all_df[f"t-{(i+1)*k}"] = all_df[sector].shift((i+1)*k)

    train = all_df.iloc[:train_size].copy()
    test = all_df.iloc[train_size:].copy()

    if dropna:
        train.dropna(inplace=True)
    
    return train, test

In [212]:
foot_mth_train, foot_mth_test = prepare_dataset('Football', 'monthly', 12, 3)

In [264]:
def xgboost_predict(sector, time_frame, n, k, refit_period=None):
    assert (sector in ('Football', 'Casual', 'Fishing')) & (time_frame in ('weekly', 'monthly', 'quarterly'))
    print("Starting XGBoost fit...")

    train, test = prepare_dataset(sector, time_frame, n, k)

    if not refit_period:
        refit_period = len(test)

    print(f"Number of fits: {len(test) // refit_period}")
    predictions = []

    for i in range(len(test) // refit_period):

        train_df = pd.concat([train, test.iloc[:i*refit_period]], axis=0)
        test_df = test.iloc[i*refit_period:(i+1)*refit_period]
        
        X_train = train_df.iloc[:, 1:]
        y_train = train_df.iloc[:, 0]

        X_test = test_df.iloc[:, 1:]

        mod = XGBRegressor()
        
        mod.fit(X_train, y_train)

        y_hat = mod.predict(X_test)
        
        predictions.append(y_hat)
        
    return pd.Series(np.concatenate(predictions), index=test.index[:len(predictions)*refit_period])

In [265]:
foot_wly_pred = xgboost_predict('Football', 'weekly', 52, 6, refit_period=8)
foot_mth_pred = xgboost_predict('Football', 'monthly', 12, 2, refit_period=1)
foot_qtr_pred = xgboost_predict('Football', 'quarterly', 4, 1, refit_period=1)

Starting XGBoost fit...
Number of fits: 9
Starting XGBoost fit...
Number of fits: 17
Starting XGBoost fit...
Number of fits: 6


In [266]:
casu_wly_pred = xgboost_predict('Casual', 'weekly', 52, 6, refit_period=8)
casu_mth_pred = xgboost_predict('Casual', 'monthly', 12, 2, refit_period=1)
casu_qtr_pred = xgboost_predict('Casual', 'quarterly', 4, 1, refit_period=1)

Starting XGBoost fit...
Number of fits: 9
Starting XGBoost fit...
Number of fits: 17
Starting XGBoost fit...
Number of fits: 6


In [267]:
fish_wly_pred = xgboost_predict('Fishing', 'weekly', 52, 6, refit_period=8)
fish_mth_pred = xgboost_predict('Fishing', 'monthly', 12, 2, refit_period=1)
fish_qtr_pred = xgboost_predict('Fishing', 'quarterly', 4, 1, refit_period=1)

Starting XGBoost fit...
Number of fits: 9
Starting XGBoost fit...
Number of fits: 17
Starting XGBoost fit...
Number of fits: 6


In [268]:
foot_wly_pred.to_csv('data/predictions/XGBOOST/foot_wly.csv')
foot_mth_pred.to_csv('data/predictions/XGBOOST/foot_mth.csv')
foot_qtr_pred.to_csv('data/predictions/XGBOOST/foot_qtr.csv')

casu_wly_pred.to_csv('data/predictions/XGBOOST/casu_wly.csv')
casu_mth_pred.to_csv('data/predictions/XGBOOST/casu_mth.csv')
casu_qtr_pred.to_csv('data/predictions/XGBOOST/casu_qtr.csv')

fish_wly_pred.to_csv('data/predictions/XGBOOST/fish_wly.csv')
fish_mth_pred.to_csv('data/predictions/XGBOOST/fish_mth.csv')
fish_qtr_pred.to_csv('data/predictions/XGBOOST/fish_qtr.csv')

In [257]:
# PREDICTIONS
predictions = {
    'Football':{
        'weekly': foot_wly_pred,
        'monthly': foot_mth_pred,
        'quarterly': foot_qtr_pred
    },
    'Casual':{
        'weekly': casu_wly_pred,
        'monthly': casu_mth_pred,
        'quarterly': casu_qtr_pred
    },
    'Fishing':{
        'weekly': fish_wly_pred,
        'monthly': fish_mth_pred,
        'quarterly': fish_qtr_pred
    }
}

In [258]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import numpy as np

def get_metrics(sector, time_frame):

    cut_size = len(predictions[sector][time_frame])

    mape = np.round_(mean_absolute_percentage_error(
        datasets['Test'][time_frame][sector].iloc[:cut_size], 
        predictions[sector][time_frame])* 100., decimals=2)
    
    rmse = np.round_(np.sqrt(mean_squared_error(
        datasets['Test'][time_frame][sector].iloc[:cut_size], 
        predictions[sector][time_frame])), decimals=2)
    
    print(f"SECTOR: {sector}  \t TIME FRAME: {time_frame}")
    print(f"RMSE: {rmse} \t\t MAPE: {mape}")

In [259]:
get_metrics('Football', 'weekly')
get_metrics('Football', 'monthly')
get_metrics('Football', 'quarterly')

SECTOR: Football  	 TIME FRAME: weekly
RMSE: 10410.21 		 MAPE: 50.38
SECTOR: Football  	 TIME FRAME: monthly
RMSE: 34862.39 		 MAPE: 33.68
SECTOR: Football  	 TIME FRAME: quarterly
RMSE: 36367.99 		 MAPE: 14.02


In [260]:
get_metrics('Casual', 'weekly')
get_metrics('Casual', 'monthly')
get_metrics('Casual', 'quarterly')

SECTOR: Casual  	 TIME FRAME: weekly
RMSE: 7312.21 		 MAPE: 72.5
SECTOR: Casual  	 TIME FRAME: monthly
RMSE: 24229.22 		 MAPE: 42.2
SECTOR: Casual  	 TIME FRAME: quarterly
RMSE: 44139.34 		 MAPE: 19.56


In [261]:
get_metrics('Fishing', 'weekly')
get_metrics('Fishing', 'monthly')
get_metrics('Fishing', 'quarterly')

SECTOR: Fishing  	 TIME FRAME: weekly
RMSE: 8088.3 		 MAPE: 37.31
SECTOR: Fishing  	 TIME FRAME: monthly
RMSE: 21653.69 		 MAPE: 21.4
SECTOR: Fishing  	 TIME FRAME: quarterly
RMSE: 64648.51 		 MAPE: 23.07


## LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from sklearn.preprocessing import StandardScaler

def get_model(n_layers, hidden_size, input_shape):
    model = Sequential()

    if n_layers > 3:
        model.add(LSTM(units=hidden_size, input_shape=(input_shape, 1), return_sequences=True))

        for i in range(n_layers - 2):
            model.add(LSTM(units=hidden_size, return_sequences=True))
            model.add(Dropout(rate=0.2))

        model.add(LSTM(units=hidden_size))

    elif n_layers == 2:
        model.add(LSTM(units=hidden_size, input_shape=(input_shape, 1), return_sequences=True))
        model.add(Dropout(rate=0.2))
        model.add(LSTM(units=hidden_size))

    else:
        model.add(LSTM(units=hidden_size, input_shape=(input_shape, 1)))
        model.add(Dropout(rate=0.2))

    model.add(Dense(units=1))

    # Compile the model
    model.compile(optimizer='adam', loss='mse')

    return model

def lstm_predict(sector, time_frame, n, k, n_layers=1, hidden_size=50, refit_period=None):
    assert (sector in ('Football', 'Casual', 'Fishing')) & (time_frame in ('weekly', 'monthly', 'quarterly'))
    print("Starting LSTM fit...")

    train, test = prepare_dataset(sector, time_frame, n, k)

    if not refit_period:
        refit_period = len(test)

    print(f"Number of fits: {len(test) // refit_period}")
    predictions = []

    for i in range(len(test) // refit_period):

        norm = StandardScaler()

        train_df = pd.concat([train, test.iloc[:i*refit_period]], axis=0)
        test_df = test.iloc[i*refit_period:(i+1)*refit_period]

        norm_train = norm.fit_transform(train_df)
        norm_test = norm.transform(test_df)

        X_train = norm_train[:, 1:]
        y_train = norm_train[:, 0]

        X_test = norm_test[:, 1:]

        mod = get_model(n_layers, hidden_size, n // k + 1)

        mod.fit(X_train, y_train, epochs=50)

        y_hat = mod.predict(X_test)

        predictions.append(norm.inverse_transform(np.c_[y_hat, np.zeros((y_hat.shape[0], n // k + 1))])[:, 0])

    return pd.Series(np.concatenate(predictions), index=test.index[:len(predictions)*refit_period])

In [None]:
lstm_predict('Football', 'monthly', 12, 2, n_layers=1)

Starting LSTM fit...
Number of fits: 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


data
2021-08-31    110047.490889
2021-09-30    121605.471553
2021-10-31    113047.285297
2021-11-30    116914.676411
2021-12-31    116952.173324
2022-01-31     69319.272521
2022-02-28     67769.651344
2022-03-31     80655.497919
2022-04-30     78665.310165
2022-05-31     94013.120040
2022-06-30     87904.274682
2022-07-31    102932.197820
2022-08-31     93703.884085
2022-09-30    119110.237637
2022-10-31    114525.089157
2022-11-30    134014.390859
2022-12-31    120017.929475
dtype: float64

In [None]:
foot_wly_pred = lstm_predict('Football', 'weekly', 52, 6, refit_period=8, n_layers=1)
foot_mth_pred = lstm_predict('Football', 'monthly', 12, 2, refit_period=1, n_layers=1)
foot_qtr_pred = lstm_predict('Football', 'quarterly', 4, 1, refit_period=1, n_layers=1)

Starting LSTM fit...
Number of fits: 9
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 3



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

In [None]:
casu_wly_pred = lstm_predict('Casual', 'weekly', 52, 6, refit_period=8, n_layers=1)
casu_mth_pred = lstm_predict('Casual', 'monthly', 12, 2, refit_period=1, n_layers=1)
casu_qtr_pred = lstm_predict('Casual', 'quarterly', 4, 1, refit_period=1, n_layers=1)

Starting LSTM fit...
Number of fits: 9
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 3

In [None]:
fish_wly_pred = lstm_predict('Fishing', 'weekly', 52, 6, refit_period=8, n_layers=1)
fish_mth_pred = lstm_predict('Fishing', 'monthly', 12, 2, refit_period=1, n_layers=1)
fish_qtr_pred = lstm_predict('Fishing', 'quarterly', 4, 1, refit_period=1, n_layers=1)

Starting LSTM fit...
Number of fits: 9
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 3

In [None]:
foot_wly_pred.to_csv('data/predictions/LSTM/foot_wly.csv')
foot_mth_pred.to_csv('data/predictions/LSTM/foot_mth.csv')
foot_qtr_pred.to_csv('data/predictions/LSTM/foot_qtr.csv')

casu_wly_pred.to_csv('data/predictions/LSTM/casu_wly.csv')
casu_mth_pred.to_csv('data/predictions/LSTM/casu_mth.csv')
casu_qtr_pred.to_csv('data/predictions/LSTM/casu_qtr.csv')

fish_wly_pred.to_csv('data/predictions/LSTM/fish_wly.csv')
fish_mth_pred.to_csv('data/predictions/LSTM/fish_mth.csv')
fish_qtr_pred.to_csv('data/predictions/LSTM/fish_qtr.csv')

In [None]:
# PREDICTIONS
predictions = {
    'Football':{
        'weekly': foot_wly_pred,
        'monthly': foot_mth_pred,
        'quarterly': foot_qtr_pred
    },
    'Casual':{
        'weekly': casu_wly_pred,
        'monthly': casu_mth_pred,
        'quarterly': casu_qtr_pred
    },
    'Fishing':{
        'weekly': fish_wly_pred,
        'monthly': fish_mth_pred,
        'quarterly': fish_qtr_pred
    }
}

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import numpy as np

def get_metrics(sector, time_frame):

    cut_size = len(predictions[sector][time_frame])

    mape = np.round_(mean_absolute_percentage_error(
        datasets['Test'][time_frame][sector].iloc[:cut_size],
        predictions[sector][time_frame])* 100., decimals=2)

    rmse = np.round_(np.sqrt(mean_squared_error(
        datasets['Test'][time_frame][sector].iloc[:cut_size],
        predictions[sector][time_frame])), decimals=2)

    print(f"SECTOR: {sector}  \t TIME FRAME: {time_frame}")
    print(f"RMSE: {rmse} \t\t MAPE: {mape}")

In [None]:
get_metrics('Football', 'weekly')
get_metrics('Football', 'monthly')
get_metrics('Football', 'quarterly')

SECTOR: Football  	 TIME FRAME: weekly
RMSE: 9034.07 		 MAPE: 51.15
SECTOR: Football  	 TIME FRAME: monthly
RMSE: 24495.09 		 MAPE: 24.85
SECTOR: Football  	 TIME FRAME: quarterly
RMSE: 46842.97 		 MAPE: 18.15


In [None]:
get_metrics('Casual', 'weekly')
get_metrics('Casual', 'monthly')
get_metrics('Casual', 'quarterly')

SECTOR: Casual  	 TIME FRAME: weekly
RMSE: 8602.92 		 MAPE: 87.66
SECTOR: Casual  	 TIME FRAME: monthly
RMSE: 20692.25 		 MAPE: 44.42
SECTOR: Casual  	 TIME FRAME: quarterly
RMSE: 65928.52 		 MAPE: 47.76


In [None]:
get_metrics('Fishing', 'weekly')
get_metrics('Fishing', 'monthly')
get_metrics('Fishing', 'quarterly')

SECTOR: Fishing  	 TIME FRAME: weekly
RMSE: 7392.63 		 MAPE: 36.1
SECTOR: Fishing  	 TIME FRAME: monthly
RMSE: 23010.95 		 MAPE: 25.28
SECTOR: Fishing  	 TIME FRAME: quarterly
RMSE: 87086.89 		 MAPE: 36.67
