In [1]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from tqdm.notebook import tqdm as tqdm

import holidays
from datetime import date

import pywt


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pmdarima.arima import auto_arima

In [2]:
"""
Aux functions
"""

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched. The frame is modified
    in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be shrunk.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its numeric columns downcast.

    Notes
    -----
    Float columns are narrowed on their min/max alone, so a cast to
    float16/float32 can round the stored values.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            # An empty column yields NaN bounds, matches nothing and is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or NaN/inf bounds: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

'\nAux functions\n'

In [3]:
"""
Reading dataframes

"""

# Prefix prepended to every CSV file name ('' resolves against the working directory).
INPUT_DIR_PATH = ''
# Size of one hold-out window; presumably the forecast horizon in days.
DAYS_PRED = 28
# NOTE(review): 1947 equals the column count reported for the sales frame
# (6 id columns + 1941 day columns), not the number of days - confirm intent.
DATASET_SIZE = 1947
# Split boundaries, stepping back from the end one DAYS_PRED window at a time.
TS_LAST = DATASET_SIZE
VL_LAST = TS_LAST - DAYS_PRED
TR_LAST = VL_LAST - DAYS_PRED

def read_data():
    """Load the four CSV inputs located under ``INPUT_DIR_PATH``.

    The price and calendar tables are passed through ``reduce_mem_usage``;
    missing calendar entries are then replaced with the string 'unknown'.
    The sales and submission tables are returned exactly as read.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sell_prices_df, calendar_df, sales_df, submission_df)``.
    """
    def _load(file_name):
        # Every input sits directly under the configured directory prefix.
        return pd.read_csv(INPUT_DIR_PATH + file_name)

    sell_prices_df = reduce_mem_usage(_load('sell_prices.csv'))
    print('Sell prices has {} rows and {} columns'.format(*sell_prices_df.shape))

    calendar_df = reduce_mem_usage(_load('calendar.csv')).fillna('unknown')
    print('Calendar has {} rows and {} columns'.format(*calendar_df.shape))

    sales_df = _load('sales_train_evaluation.csv')
    print('Sales train validation has {} rows and {} columns'.format(*sales_df.shape))

    submission_df = _load('sample_submission.csv')
    return sell_prices_df, calendar_df, sales_df, submission_df
    
prices_df, calendar_df, sales_df, submission_df = read_data()

# Day columns are named d_1, d_2, ... (the first recorded row is d_1), so the
# range starts at 1; starting at 0 produced a non-existent "d_0" label.
num_cols = [f"d_{day}" for day in range(1, TR_LAST + 1)]
cat_cols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']

# Wide -> long: one row per (series id, day) with the unit count in "sales".
df = pd.melt(sales_df,
                  id_vars = cat_cols,
                  value_vars = [col for col in sales_df.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

# Attach calendar attributes by day label, then the weekly price by store/item/week.
# Both merges use the default inner join, so rows without a match are dropped.
df = df.merge(calendar_df, on= "d", copy = False)
df = df.merge(prices_df, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
del sales_df, calendar_df, prices_df
# gc.collect()

# Untouched copy of the merged long frame, kept before later cells reshape `df`.
df_raw = df.copy()

'\nReading dataframes\n\n'

Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Sales train validation has 30490 rows and 1947 columns


In [9]:
df.shape
# Promote the calendar date to a DatetimeIndex. The guard makes the cell safe to
# re-run: once 'date' has moved into the index it is no longer a column, and the
# unguarded version raised KeyError on a second execution.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
    df = df.set_index('date')
df.head()

(46881677, 22)

Unnamed: 0_level_0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,wm_yr_wk,weekday,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-29,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,CA_1,HOBBIES,CA,d_1,12,11101,Saturday,...,1,2011,unknown,unknown,unknown,unknown,0,0,0,0.459961
2011-01-30,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,CA_1,HOBBIES,CA,d_2,15,11101,Sunday,...,1,2011,unknown,unknown,unknown,unknown,0,0,0,0.459961
2011-01-31,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,CA_1,HOBBIES,CA,d_3,0,11101,Monday,...,1,2011,unknown,unknown,unknown,unknown,0,0,0,0.459961
2011-02-01,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,CA_1,HOBBIES,CA,d_4,0,11101,Tuesday,...,2,2011,unknown,unknown,unknown,unknown,1,1,0,0.459961
2011-02-02,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,CA_1,HOBBIES,CA,d_5,0,11101,Wednesday,...,2,2011,unknown,unknown,unknown,unknown,1,0,1,0.459961


In [12]:
# Keep only the rows of one series id; every later cell that reads `df`
# therefore works on this single series.
df = df.loc[df['id'].eq('HOBBIES_1_008_CA_1_evaluation')]
df.shape

(1941, 21)

In [13]:
# Target series: the 'sales' column. A column selected from `df` already carries
# the frame's index, so no explicit index assignment is needed.
y = df['sales']
y.head(2)

date
2011-01-29    12
2011-01-30    15
Name: sales, dtype: int64

In [14]:
# Chronological split: the first 70% of rows train the model, the rest test it.
train_size = int(len(df) * 0.7)
test_size = len(df) - train_size

# One-hot encode the non-numeric columns. dtype=float keeps the exogenous matrix
# purely numeric; with the bool default of recent pandas, mixing dummies with the
# numeric columns yields an object array.
# NOTE(review): X still contains identifier-like columns such as 'd' (a distinct
# value on every row shown), each of which becomes its own dummy column -
# consider dropping them before fitting.
X = pd.get_dummies(df.drop(columns=['sales']), dtype=float)

# Drop incomplete rows with ONE shared mask so features and target stay aligned;
# calling dropna() on X and y separately could remove different rows from each.
complete_rows = X.notna().all(axis=1).to_numpy() & y.notna().to_numpy()
train_rows, test_rows = complete_rows[:train_size], complete_rows[train_size:]

train_X, train_y = X.iloc[:train_size][train_rows], y.iloc[:train_size][train_rows]
test_X, test_y = X.iloc[train_size:][test_rows], y.iloc[train_size:][test_rows]

In [None]:
# step_wise=auto_arima(train_y, 
#  exogenous= train_X,
#  start_p=1, start_q=1, 
#  max_p=7, max_q=7, 
#  d=1, max_d=7,
#  trace=True, 
#  error_action='ignore', 
#  suppress_warnings=True, 
#  stepwise=True)

In [None]:
# step_wise.summary()


In [17]:
# SARIMAX specification: order=(6, 1, 0) and seasonal_order=(0, 1, 1, 7), i.e. a
# seasonal period of 7 observations, no trend term, with train_X as exogenous
# regressors. Stationarity and invertibility constraints are both switched off.
model = sm.tsa.statespace.SARIMAX(
    endog=train_y,
    exog=train_X,
    order=(6, 1, 0),
    seasonal_order=(0, 1, 1, 7),
    trend='n',
    enforce_stationarity=False,
    enforce_invertibility=False,
)



In [18]:
# Fit with the default optimizer settings. The recorded run ended with a
# "Maximum Likelihood optimization failed to converge" warning, so treat the
# fitted parameters with caution.
results = model.fit()

  warn("Maximum Likelihood optimization failed to converge. "


In [None]:
# Build the comparison frame from objects defined in earlier cells. The previous
# version read `predictions` and `act`, neither of which is created anywhere
# above, so the cell could not run on a fresh kernel.
# Forecast one step per test row, supplying the matching exogenous regressors.
forecast_values = results.forecast(steps=len(test_X), exog=test_X)

# 'forecast' holds the model output and 'sales' the observed values, both on the
# test-period date index.
predictions = pd.DataFrame({'forecast': np.asarray(forecast_values)}, index=test_X.index)
predictions['sales'] = test_y

In [None]:
# Observed sales (blue) against the forecast (red) over the test period.
# The colour names use plain ASCII quotes; the typographic quotes that were here
# are not valid Python string delimiters.
predictions['sales'].plot(figsize=(20,8), legend=True, color='blue')
predictions['forecast'].plot(legend=True, color='red', figsize=(20,8))

In [22]:
# NOTE(review): `end_train` is not defined in any cell visible above; it must be
# set before this cell runs.
end_train

# Series.unique() returns the distinct values; `unique_values()` is not a pandas method.
# NOTE(review): the loop variable is named item_store but the filter is on
# 'item_id' - confirm whether 'id' (one series per item and store) was intended.
item_store_list = df['item_id'].unique()

# One fitted result per item; previously each iteration overwrote the last fit.
results_by_item = {}

for item_store in item_store_list:
    aux = df[df['item_id'] == item_store]
    # Encode string columns so the exogenous matrix is numeric. Encoding before
    # the split gives the train and test parts the same columns.
    features = pd.get_dummies(aux.drop(columns=['sales']), dtype=float)
    X_train = features[:end_train]
    y_train = aux['sales'][:end_train]
    X_test = features[end_train:]
    y_test = aux['sales'][end_train:]


    model= sm.tsa.statespace.SARIMAX(endog=y_train,
                                 exog=X_train,
                                 enforce_invertibility=False,
                                 enforce_stationarity=False,
                                 trend='n', order=(6,1,0), seasonal_order=(0,1,1,7))
    results= model.fit()
    results_by_item[item_store] = results

In [None]:
# NOTE(review): `sarimax_predictor` is not defined in the visible cells, so the
# meaning of each positional argument is unconfirmed. Presumably they are the
# series, the ARIMA order, the seasonal order, the horizon, the start offset and
# a plot title - verify against the helper's definition.
start = 0
result = sarimax_predictor(
    df['sales'],
    [1, 1, 0],
    [1, 1, 0, 24],
    7 * 4,
    start,
    'Weekly forecast - Foods',
)