In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from prophet import Prophet

from utils import *


sns.set_style('whitegrid')
pd.set_option('display.max_rows', None)

In [2]:
data = pd.read_parquet('data/n_forecast_preprocessed.parquet', engine='pyarrow', dtype_backend='numpy_nullable')

In [3]:
data.head()

Unnamed: 0,sales_date,store_hashed,n_transactions,store_format,zipcode_region,region,holiday_saint_nicholas,holiday_first_christmas,holiday_liberation_day,holiday_good_friday,...,holiday_valentines_day,autumn_school_holiday,summer_school_holiday,spring_school_holiday,christmas_school_holiday,may_school_holiday,easter_school_holiday,datetime_store_open,datetime_store_closed,working_hours
0,2019-01-02,b9d487a30398d42ecff55c228ed5652b,6570,19,68,south,0,0,0,0,...,0,0,0,0,1,0,0,2019-01-02 08:00:00,2019-01-02 20:00:00,12.0
1,2019-01-02,c3e0c62ee91db8dc7382bde7419bb573,1773,6,30,middle,0,0,0,0,...,0,0,0,0,1,0,0,2019-01-02 08:00:00,2019-01-02 20:00:00,12.0
2,2019-01-02,90db9da4fc5414ab55a9fe495d555c06,8783,19,10,north,0,0,0,0,...,0,0,0,0,1,0,0,2019-01-02 08:00:00,2019-01-02 22:00:00,14.0
3,2019-01-02,07042ac7d03d3b9911a00da43ce0079a,5391,19,10,north,0,0,0,0,...,0,0,0,0,1,0,0,2019-01-02 08:00:00,2019-01-02 22:00:00,14.0
4,2019-01-02,b60c5ab647a27045b462934977ccad9a,2789,6,96,north,0,0,0,0,...,0,0,0,0,1,0,0,2019-01-02 08:00:00,2019-01-02 20:00:00,12.0


In [5]:
data.set_index('sales_date', inplace=True)
data.sort_index(inplace=True)

In [6]:
data.head()

Unnamed: 0_level_0,store_hashed,n_transactions,store_format,zipcode_region,region,holiday_saint_nicholas,holiday_first_christmas,holiday_liberation_day,holiday_good_friday,holiday_new_years_day,...,holiday_valentines_day,autumn_school_holiday,summer_school_holiday,spring_school_holiday,christmas_school_holiday,may_school_holiday,easter_school_holiday,datetime_store_open,datetime_store_closed,working_hours
sales_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,a58149d355f02887dfbe55ebb2b64ba3,1142,19,56,south,0,0,0,0,1,...,0,0,0,0,1,0,0,2019-01-01 12:00:00,2019-01-01 18:00:00,6.0
2019-01-01,bb57db42f77807a9c5823bd8c2d9aaef,3761,19,10,north,0,0,0,0,1,...,0,0,0,0,1,0,0,2019-01-01 11:00:00,2019-01-01 22:00:00,11.0
2019-01-01,3a09a524440d44d7f19870070a5ad42f,1681,16,10,north,0,0,0,0,1,...,0,0,0,0,1,0,0,2021-01-19 08:30:00,2021-01-19 18:00:00,9.5
2019-01-01,28e209b61a52482a0ae1cb9f5959c792,3124,19,24,middle,0,0,0,0,1,...,0,0,0,0,1,0,0,2019-01-01 12:00:00,2019-01-01 18:00:00,6.0
2019-01-01,1cecc7a77928ca8133fa24680a88d2f9,2534,19,10,north,0,0,0,0,1,...,0,0,0,0,1,0,0,2019-01-01 10:00:00,2019-01-01 20:00:00,10.0


In [7]:
# Plot all the RAW time series
# plt.figure(figsize=(16, 6))
# plt.plot(data['n_transactions'])
# plt.show()

In [8]:
data['store_hashed'].nunique()

595

In [9]:
# There are 595 stores in the dataset. 
# Let's take a look at the sales of one of them.

# store = data[data['store_hashed'] == 'b60c5ab647a27045b462934977ccad9a']
# store.head(10)

# Check stationarity for each store

In [18]:
stationary_stores, non_stationary_stores = test_stores_stationarity(data, plot=False)

Stationary stores: 498
Non stationary stores: 97


# Make every store time series stationary

In [19]:
def make_stationary(data, non_stationary_stores):
    stationary_data = data.copy()
    
    for store in non_stationary_stores:
        store_data = data[data['store_hashed'] == store]
        diff = store_data['n_transactions'].diff().dropna()
        
        # Re-run Dickey-Fuller test
        dftest = adfuller(diff, autolag='AIC', maxlag=12)
        
        if dftest[1] < 0.05:
            # print(f'Store {store} is now stationary')
            stationary_data.loc[store_data.index, 'n_transactions'] = diff
        else:
            print(f'Store {store} is still non-stationary')
            
    return stationary_data

In [20]:
stationary_data = make_stationary(data, non_stationary_stores)

Store 3a09a524440d44d7f19870070a5ad42f is now stationary
Store e520f70ac3930490458892665cda6620 is now stationary
Store 99f42c473afe0eb4bd047ae133b851fc is now stationary
Store fd2ae8ec902471d8956fca3486031013 is now stationary
Store e6385d39ec9394f2f3a354d9d2b88eec is now stationary
Store 50905d7b2216bfeccb5b41016357176b is now stationary
Store 85d6e9c8255c0364fb67b5ac8a25eea3 is now stationary
Store 487129304eca93e3646dd0c7dd441bf5 is now stationary
Store b1bc40d056bad6ec6949d9bb6fee5e84 is now stationary
Store 677fa4059ee76333f9bb9a7920aef719 is now stationary
Store 069654d5ce089c13f642d19f09a3d1c0 is now stationary
Store fdc0eb412a84fa549afe68373d9087e9 is now stationary
Store ff1ced3097ccf17c1e67506cdad9ac95 is now stationary
Store 1a3650aedfdd3a21444047ed2d89458f is now stationary
Store 98fb202278940504d75b5a97b1476be4 is now stationary
Store 94f192dee566b018e0acf31e1f99a2d9 is now stationary
Store b448d8292fd27ae25bbc2e09ad43ff88 is now stationary
Store c41dd99a69df04044aa4e33ec

In [21]:
stationary_data = stationary_data.dropna()

In [22]:
# test stationarity again
stationary_stores, non_stationary_stores = test_stores_stationarity(stationary_data, plot=False)

Stationary stores: 595
Non stationary stores: 0


In [23]:
len(stationary_data)

708983