In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.stats.diagnostic import acorr_ljungbox
import matplotlib.pyplot as plt
import warnings

In [2]:
df = pd.read_csv("../../data/raw/train_test.csv")

In [3]:
df.week_start = pd.to_datetime(df.week_start)

In [4]:
df.head()

Unnamed: 0,store_id,week_start,area_hier_desc,division_hier_desc,store_geography,carpark,soc_name,total_items,idle_hours,total_on_till_hours,total_transaction_count
0,124,2020-05-17,NORTHERN IRELAND REGION,IRELAND,NORTHERN IRELAND,1.0,SIMPLY FOOD,66943,56.6575,162.705556,7275
1,130,2020-01-26,MANCHESTER,NORTH,LANCASHIRE & CUMBRIA,0.0,LARGE - HIGH STREET,67949,64.993056,187.373333,9460
2,135,2019-11-17,MERSEYSIDE,NORTH,"MERSEYSIDE, NORTH WALES & IOM",1.0,MEDIUM - RETAIL PARK,45570,36.532222,118.333333,5812
3,136,2020-09-27,HOME COUNTIES SOUTH,SOUTH,HOME COUNTIES SOUTH,1.0,MAJOR - RETAIL PARK,131658,80.551944,326.306944,9168
4,176,2021-05-23,"NORTH WALES, CHESHIRE & IOM",CENTRAL,"MERSEYSIDE, NORTH WALES & IOM",1.0,MEDIUM - RETAIL PARK,56307,57.786111,149.586111,5671


## Finding closed stores

Some stores which are closed will have the most recent data and we can use that to filter that out:

In [5]:
stores_open = df[(df.week_start >= pd.to_datetime("2021-11-07")) ].store_id.unique().tolist()

In [6]:
stores_closed = list(set(df.store_id.unique()) - set(stores_open))

In [7]:
len(stores_closed)

37

In [8]:
df_result = pd.read_csv("../../data/raw/result.csv")
df_result.week_start = pd.to_datetime(df_result.week_start, dayfirst=False)

In [9]:
df_result.week_start.max()

Timestamp('2022-02-06 00:00:00')

In [10]:
predict_stores = df_result.store_id.unique().tolist()

In [11]:
df_result_pivot = df_result.pivot_table(
    values="week_start",
    index="store_id",
    aggfunc={
        "week_start": "size"
    }
)

In [12]:
df_result_pivot.week_start.unique()

array([13, 12,  8,  4,  1, 11])

The data for some stores are missing in this case, we go for something like interpolation to impute the missing values:

In [20]:
warnings.filterwarnings("ignore")
df_out = pd.DataFrame()
date = pd.date_range("2019-11-17", periods=104, freq="W")
df_date = pd.DataFrame(date).rename(columns={0: "week_start"})
for store in predict_stores:
    if store not in stores_closed:
        try:
            print(f"Predicting for store: {store}")
            test = df[df['store_id'] == store][["week_start", "total_on_till_hours"]].set_index("week_start").sort_index()
            model = ARIMA(np.log(test), order=(1, 1, 1), freq="7D")
            results = model.fit()
            predictions = np.exp(results.forecast(13))
            preds = pd.DataFrame(predictions).rename(columns={"predicted_mean": "predicted_total_on_till_hours"})
            preds["store_id"] = store
        except ValueError:
            test.reset_index(inplace=True)
            first_date = test.iloc[0]["week_start"]
            test_new = df_date.merge(test, on="week_start", how="left")
            test_final = test_new.copy()[test_new.week_start >= first_date]
            test_final["total_on_till_hours"].interpolate(method="linear", inplace=True)
            test_final.set_index("week_start", inplace=True)
            model = ARIMA(np.log(test_final), order=(1, 1, 1), freq="7D")
            results = model.fit()
            predictions = np.exp(results.forecast(13))
            preds = pd.DataFrame(predictions).rename(columns={"predicted_mean": "predicted_total_on_till_hours"})
            preds["store_id"] = store

            
            
    df_out = pd.concat([df_out, preds])


Predicting for store: 660
Predicting for store: 2114
Predicting for store: 798
Predicting for store: 314
Predicting for store: 1860
Predicting for store: 5278
Predicting for store: 1579
Predicting for store: 108
Predicting for store: 369
Predicting for store: 1889
Predicting for store: 356
Predicting for store: 397
Predicting for store: 1147
Predicting for store: 1289
Predicting for store: 1312
Predicting for store: 785
Predicting for store: 848
Predicting for store: 1961
Predicting for store: 2642
Predicting for store: 3308
Predicting for store: 782
Predicting for store: 2097
Predicting for store: 7773
Predicting for store: 3036
Predicting for store: 4828
Predicting for store: 5359
Predicting for store: 2071
Predicting for store: 1630
Predicting for store: 176
Predicting for store: 1454
Predicting for store: 1740
Predicting for store: 955
Predicting for store: 779
Predicting for store: 244
Predicting for store: 4556
Predicting for store: 7472
Predicting for store: 8989
Predicting for 

In [109]:
# store = 1029
# test = df[df['store_id'] == store][["week_start", "total_on_till_hours"]].set_index("week_start").sort_index()
# model = ARIMA(np.log(test), order=(1, 1, 1), freq="7D")
# results = model.fit()
# predictions = np.exp(results.forecast(13))

In [120]:
df_out_final = df_out.copy().reset_index().rename(columns={"index": "week_start"})

In [122]:
df_pre_final = df_result[["week_start", "store_id"]].merge(df_out_final, on=["week_start", "store_id"], how="left")

## Check that closed stores have not been predicted for

In [125]:
df_pre_final[~df_pre_final.predicted_total_on_till_hours.isna()].store_id.nunique()

563

In [126]:
set(df_pre_final[df_pre_final.predicted_total_on_till_hours.isna()].store_id.unique()) - set(stores_closed)

set()

Closed stores are given 0 as output

In [127]:
df_final = df_pre_final.copy().fillna(0)

In [None]:
df_final.to_csv("../../data/processed/final_results.csv", index=False)

## Seasonal models

In [132]:
test_final

Unnamed: 0_level_0,total_on_till_hours
week_start,Unnamed: 1_level_1
2019-11-17,65.207778
2019-11-24,72.526667
2019-12-01,69.032222
2019-12-08,74.675833
2019-12-15,46.109444
...,...
2021-10-10,0.527778
2021-10-17,0.368611
2021-10-24,9.299722
2021-10-31,0.467778


In [17]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [18]:
model = SARIMAX(test_final, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
results = model.fit()
np.exp(results.forecast(13))

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.81666D+00    |proj g|=  5.72521D-02

At iterate    5    f=  2.80588D+00    |proj g|=  2.71814D-03

At iterate   10    f=  2.80576D+00    |proj g|=  4.85620D-06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5     10     13      1     0     0   4.856D-06   2.806D+00
  F =   2.8057632690684069     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


 This problem is unconstrained.


2021-11-14      10.473954
2021-11-21     236.929715
2021-11-28      39.437467
2021-12-05      22.323677
2021-12-12      12.717770
2021-12-19      13.782203
2021-12-26      14.078755
2022-01-02      14.382456
2022-01-09      13.061282
2022-01-16    1508.398718
2022-01-23      13.458806
2022-01-30      27.017478
2022-02-06      37.475190
Freq: W-SUN, Name: predicted_mean, dtype: float64

In [19]:
print(acorr_ljungbox(results.resid, lags=[10], return_df=True))

      lb_stat  lb_pvalue
10  10.892737   0.365936


In [23]:
store = 660
test = df[df['store_id'] == store][["week_start", "total_on_till_hours"]].set_index("week_start").sort_index()
model = ARIMA(np.log(test), order=(1, 1, 1), freq="7D")
results = model.fit()
predictions = np.exp(results.forecast(13))
preds = pd.DataFrame(predictions).rename(columns={"predicted_mean": "predicted_total_on_till_hours"})
preds["store_id"] = store
preds

Unnamed: 0,predicted_total_on_till_hours,store_id
2021-11-14,142.385142,660
2021-11-21,142.203735,660
2021-11-28,142.082595,660
2021-12-05,142.001671,660
2021-12-12,141.947601,660
2021-12-19,141.911466,660
2021-12-26,141.887316,660
2022-01-02,141.871175,660
2022-01-09,141.860385,660
2022-01-16,141.853173,660


In [24]:
test

Unnamed: 0_level_0,total_on_till_hours
week_start,Unnamed: 1_level_1
2019-11-17,171.436111
2019-11-24,185.754722
2019-12-01,198.127222
2019-12-08,242.268333
2019-12-15,230.485833
...,...
2021-10-10,132.208333
2021-10-17,140.271667
2021-10-24,135.443889
2021-10-31,138.338611
