Implementation of ARIMA using the `ibex_ARIMA` class from `src.arima`

In [33]:
# Standard Imports
import importlib
import os
import numpy as np
import pandas as pd
import sys

# visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from tqdm.auto import tqdm

# Src modules
sys.path.append(os.path.dirname(os.getcwd())) # Add the parent directory to the Python path so we can import src modules
import src
from src.arima import ibex_ARIMA
from src.data_setup import get_data, train_val_split, create_day_of_week, RAW_PATH, RESULTS_PATH, SUBMISSION_PATH
from src.model_evaluation import model_eval_pipeline, eval_hypothesis_test, transform_daily_sales_predictions
from src.visualisation import plot_heatmap, plot_sales_by, plot_time_series_preds, plot_rolling_average_stdev

In [34]:
# Load the prepared frames, then join the store table onto the sales rows by store number.
train, test, stores, transactions = get_data()
train = train.merge(stores, on='store_nbr')

# Sanity checks on the merged frame: no missing values anywhere and no repeated rows.
assert not train.isnull().any().any()
assert not train.duplicated().any()
print('no null or duplicate values in the training data')

loading pickled dataframes...
no null or duplicate values in the training data


In [35]:
# Split the merged data into a training part and a hold-out validation part.
# NOTE: this rebinds `train` to the training split only; the full data is reloaded later for the test run.
train, validation = train_val_split(train)
# validation stays at row level here; it is summed to daily totals only when handed to arima.evaluate.
# Show both shapes to confirm the sizes of the two parts.
train.shape, validation.shape

((2949210, 25), (51678, 25))

In [36]:
# Build the project's ARIMA wrapper around the training split.
arima = ibex_ARIMA(train)

In [37]:
# Candidate non-seasonal orders to compare; uncomment entries to widen the search.
orders = [
    (1, 1, 1),
    # (1, 1, 0),
    # (1, 0, 1),
    # (1, 0, 0),
    # (0, 1, 1),
    # (0, 1, 0),
]
# The two scores returned by arima.fit, keyed by order.
# Presumably error metrics on sales and on differenced sales (lower = better) - TODO confirm in src.arima.
sale_perf = {}
diff_perf = {}
for order in tqdm(orders):
    # The seasonal order reuses the same three terms with a period of 7 appended.
    s, d = arima.fit(order, (order + (7,)), plot=False)
    sale_perf[order] = s
    diff_perf[order] = d

# Select the order with the smallest differenced-sales score and report the score
# of that order (not of whichever order the loop happened to finish on).
best_order = min(diff_perf, key=diff_perf.get)
print(best_order, diff_perf[best_order])

100%|██████████| 1/1 [00:01<00:00,  1.75s/it]

(1, 1, 1) 110072.3539587817





In [38]:
# Re-select the best order, this time ranking by the sales-level score, and print
# both scores for the selected order (indexing by best_order, not the stale loop variable).
best_order = min(sale_perf, key=sale_perf.get)
print(best_order, diff_perf[best_order], sale_perf[best_order])

(1, 1, 1) 110072.3539587817 452699.6567539397


In [39]:
# Run the fitted model against the hold-out period; the row-level validation frame is
# first collapsed to one total-sales value per date.
validation_results = arima.evaluate(validation=validation.groupby('date')['sales'].sum().reset_index())

In [40]:
# Discard the differenced-sales prediction column and hand the remaining daily
# predictions to the project helper, which expands them using the training data.
# NOTE: not idempotent - re-running this cell fails because the column is already gone.
validation_results = transform_daily_sales_predictions(
    validation_results.drop(columns=['pred_diff_sales']),
    train,
)
validation_results.head()

Unnamed: 0,date,sales,day_of_week,pred_sales,store_nbr,family,pct_sales,transformed_sales
0,2017-07-18,730133.6875,2,739021.190895,1,AUTOMOTIVE,6.798028e-06,5.023887
1,2017-07-18,730133.6875,2,739021.190895,1,BABY CARE,0.0,0.0
2,2017-07-18,730133.6875,2,739021.190895,1,BEAUTY,4.529532e-06,3.34742
3,2017-07-18,730133.6875,2,739021.190895,1,BEVERAGES,0.002892669,2137.74392
4,2017-07-18,730133.6875,2,739021.190895,1,BOOKS,2.611756e-07,0.193014


In [41]:
# Compare actual row-level validation sales with the transformed predictions.
# NOTE(review): this presumably pairs the two series row by row, which assumes
# validation_results has the same row order (and length) as validation - confirm.
model_eval_pipeline(validation['sales'], validation_results['transformed_sales'])

{'mae': 336.5845219207806,
 'mse': 1118342.995234885,
 'rmse': 1057.5173734908024,
 'rmsle': 1.1717345176616767,
 'r2': 0.2998583776179432}

# Test Data

In [42]:
# Reload everything so `train` is the full dataset again (it was rebound to the
# training split earlier), then join the store table onto it by store number.
train, test, stores, transactions = get_data()
train = train.merge(stores, on='store_nbr')

# Same sanity checks as before: no missing values and no repeated rows.
assert not train.isnull().any().any()
assert not train.duplicated().any()
print('no null or duplicate values in the training data')

loading pickled dataframes...
no null or duplicate values in the training data


In [43]:
# Peek at the first rows of the reloaded, store-merged training frame.
train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,year,month,week,day,quarter,...,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,city,state,type,cluster
0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,2013,1,1,1,1,...,0,1,0,1,0,0,Quito,Pichincha,D,13
1,2013-01-01,1,BABY CARE,0.0,0.0,2013,1,1,1,1,...,0,1,0,1,0,0,Quito,Pichincha,D,13
2,2013-01-01,1,BEAUTY,0.0,0.0,2013,1,1,1,1,...,0,1,0,1,0,0,Quito,Pichincha,D,13
3,2013-01-01,1,BEVERAGES,0.0,0.0,2013,1,1,1,1,...,0,1,0,1,0,0,Quito,Pichincha,D,13
4,2013-01-01,1,BOOKS,0.0,0.0,2013,1,1,1,1,...,0,1,0,1,0,0,Quito,Pichincha,D,13


In [44]:
# Peek at the first rows of the test frame (the rows to be predicted).
test.head()

Unnamed: 0_level_0,date,store_nbr,family,onpromotion,year,month,week,day,quarter,day_of_week,day_of_month,week_of_month,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3000888,2017-08-16,1,AUTOMOTIVE,0.0,2017,8,33,228,3,3,16,3,0,3,0,0,0,0,0,2
3000889,2017-08-16,1,BABY CARE,0.0,2017,8,33,228,3,3,16,3,0,3,0,0,0,0,0,2
3000890,2017-08-16,1,BEAUTY,2.0,2017,8,33,228,3,3,16,3,0,3,0,0,0,0,0,2
3000891,2017-08-16,1,BEVERAGES,20.0,2017,8,33,228,3,3,16,3,0,3,0,0,0,0,0,2
3000892,2017-08-16,1,BOOKS,0.0,2017,8,33,228,3,3,16,3,0,3,0,0,0,0,0,2


In [45]:
# Refit the model on the full training data for the final test-set forecast.
arima = ibex_ARIMA(train)
# NOTE(review): the order is hard-coded rather than taken from best_order selected on
# the validation run; keep the two in sync if the candidate list is widened.
order = (1, 1, 1)
# Seasonal order = same terms with a period of 7 appended; the returned pair of scores is displayed.
arima.fit(order, (order + (7,)), plot=False)


(454997.5613370491, 109663.75662187938)

In [46]:
# Forecast one value per distinct date that appears in the test frame.
test_results = arima.predict(test.date.unique())

In [47]:
# Display the per-date forecasts returned by arima.predict.
test_results

Unnamed: 0,date,pred_diff_sales,day_of_week,pred_sales
0,2017-08-16,2286.527511,3,764948.5
1,2017-08-17,-125599.582172,4,639348.9
2,2017-08-18,139723.027224,5,779071.9
3,2017-08-19,141624.892053,6,920696.8
4,2017-08-20,88112.681797,7,1008809.0
5,2017-08-21,-231079.619102,1,777729.9
6,2017-08-22,-35006.335616,2,742723.5
7,2017-08-23,12537.030229,3,755260.6
8,2017-08-24,-126235.230042,4,629025.3
9,2017-08-25,142332.589045,5,771357.9


In [48]:
# Discard the differenced-sales prediction column and hand the remaining daily
# predictions to the project helper, which expands them using the training data.
# NOTE: not idempotent - re-running this cell fails because the column is already gone.
test_results = transform_daily_sales_predictions(
    test_results.drop(columns=['pred_diff_sales']),
    train,
)
test_results.head()

Unnamed: 0,date,day_of_week,pred_sales,store_nbr,family,pct_sales,transformed_sales
0,2017-08-16,3,764948.465011,1,AUTOMOTIVE,5.977018e-06,4.572111
1,2017-08-16,3,764948.465011,1,BABY CARE,0.0,0.0
2,2017-08-16,3,764948.465011,1,BEAUTY,4.417796e-06,3.379386
3,2017-08-16,3,764948.465011,1,BEVERAGES,0.003114104,2382.128769
4,2017-08-16,3,764948.465011,1,BOOKS,2.809409e-07,0.214905


In [49]:
# Compare shapes: the row counts should match so each test row has one prediction.
test.shape, test_results.shape

((28512, 20), (28512, 7))

In [50]:
# Load the sample submission file to use as the template for the output.
submission = pd.read_csv(SUBMISSION_PATH / 'sample_submission.csv')

In [51]:
# Fill the template's sales column with the per-row predictions.
# NOTE(review): pandas aligns this assignment on index labels, so it assumes test_results
# and submission share the same index and the same row order as the test ids - confirm.
submission['sales'] = test_results.transformed_sales

In [52]:
# The submission must contain exactly one row per test row.
assert len(submission) == 28512
# The row count alone cannot reveal a bad fill: an index-aligned assignment that
# misses rows leaves NaN behind, so check that every row has a sales value.
assert submission['sales'].notna().all(), 'submission contains missing sales predictions'

In [53]:
# Write the final submission file, omitting the DataFrame index column.
submission.to_csv(SUBMISSION_PATH / 'arima_submission.csv', index=False)