In [7]:
# ***********************  Forecasting Store Sales via the Linear regression Technique *******************
import pandas as pd
import numpy as np
import os
import gc
import warnings
import seaborn as sns
from warnings import simplefilter
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
simplefilter('ignore')
#
#  Maths Models 
#
import statistics as stats
import statsmodels.api as sm
#
# Data Visualization 
#
import matplotlib.pylab as plt
plt.show()
import plotly.express as px                    # For charts, plots, maps and diagrams
import plotly
# import seabon as sns
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = (12,8)
#
# Local Modules                                  # Coded from Scratch by me
#
import Stationarity_test
import eval_errors
#

# ***************** Seasonal Plot nad Periodogram *****************************
#



# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'

# annotations: https://stackoverflow.com/a/49238256/5769929
def seasonal_plot(X, y, period, freq, ax=None):
    if ax is None:
        _, ax = plt.subplots()
    palette = sns.color_palette("husl", n_colors=X[period].nunique(),)
    ax = sns.lineplot(
        x=freq,
        y=y,
        hue=period,
        data=X,
        ci=False,
        ax=ax,
        palette=palette,
        legend=False,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")
    for line, name in zip(ax.lines, X[period].unique()):
        y_ = line.get_ydata()[-1]
        ax.annotate(
            name,
            xy=(1, y_),
            xytext=(6, 0),
            color=line.get_color(),
            xycoords=ax.get_yaxis_transform(),
            textcoords="offset points",
            size=14,
            va="center",
        )
    return ax

from scipy.signal import periodogram
def plot_periodogram(ts, detrend='linear', ax=None):
    fs = pd.Timedelta("1Y") / pd.Timedelta("1D")       #sampling frequency 
    freqencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(freqencies, spectrum, color="purple")
    ax.set_xscale("log")
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(["Annual (1)","Semiannual (2)","Quarterly (4)","Bimonthly (6)",
                       "Monthly (12)","Biweekly (26)","Weekly (52)","Semiweekly (104)"],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance [a.u]")
    ax.set_title("Periodogram")
    return ax


In [6]:
fs = pd.Timedelta("1Y") / pd.Timedelta("1D")
fs

365.2425

In [2]:
# *************** Read Data ****************
#
%matplotlib
store_sales = pd.read_csv('train.csv', usecols=['store_nbr', 'family', 'date', 'sales'],
                          dtype = {'store_nbr': 'int8',
                                  'family'    : 'category',
                                  'sales'     : 'float32'},
                          parse_dates=['date'],
                          infer_datetime_format=True,)

store_sales['date'] = store_sales.date.dt.to_period('D')

store_nber = 6   # 1 - 54
DF_store   = store_sales.loc[store_sales.store_nbr == store_nber]
DF_store   = DF_store.groupby(['date']).sales.sum().rename("Tot_sales").to_frame()
DF_store.plot(label = 'Total_sales', legend = True, title = 'store_nbr: {}'.format(str(store_nber)))


Using matplotlib backend: QtAgg


<AxesSubplot:title={'center':'store_nbr: 6'}, xlabel='date'>

In [3]:
# *********************** ERRORS ************************
#
from sklearn.metrics import mean_squared_error as mse
from statsmodels.tools.eval_measures import rmse
#
# Mean Absolute Percentage Error: MAPE
#
def MAPE(df1, df2):
    err = np.mean(np.abs((df1[2:] - df2[2:])/df1[2:]))*100
    print('\nMAPE: {}'.format(str(err)))
    
#
# Mean Squared Error: mse
#
def MSE(df1, df2):
    err = mse(df1, df2)
    print('\n MSE: {}'.format(str(err)))
#
# Root Mean Squared Error: rmse
#
def RMSE(df1, df2):
    err = rmse(df1, df2)
    print('\n MSE: {}'.format(str(err)))



In [3]:
# ************ Seasonal Plots ****************
#
%matplotlib
XX = DF_store.copy()
XX['week'] = XX.index.week
XX['day_of_week'] = XX.index.dayofweek
XX['year']  = XX.index.year
X = XX.loc[XX.year == 2017]
y = X['Tot_sales']
px.line(X, x = 'day_of_week', y = 'Tot_sales', color = 'week')
#
#  Periodogram 
#
%matplotlib
X_period = X['Tot_sales']
plot_periodogram(X_period)
X_period


Using matplotlib backend: QtAgg
Using matplotlib backend: QtAgg


date
2017-01-01        0.000000
2017-01-02    33420.296875
2017-01-03    24664.802734
2017-01-04    23453.037109
2017-01-05    15548.572266
                  ...     
2017-08-11    15775.921875
2017-08-12    14502.507812
2017-08-13    19261.919922
2017-08-14    14206.605469
2017-08-15    13849.208984
Freq: D, Name: Tot_sales, Length: 227, dtype: float32

In [5]:
# ************ Periodogram ************
#
%matplotlib
X_period = X['Tot_sales']
plot_periodogram(X_period)
X_period


Using matplotlib backend: QtAgg


date
2017-01-01        0.000000
2017-01-02    33420.296875
2017-01-03    24664.802734
2017-01-04    23453.037109
2017-01-05    15548.572266
                  ...     
2017-08-11    15775.921875
2017-08-12    14502.507812
2017-08-13    19261.919922
2017-08-14    14206.605469
2017-08-15    13849.208984
Freq: D, Name: Tot_sales, Length: 227, dtype: float32

In [4]:
# ****************  Features Creation ************
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
fourier = CalendarFourier(freq = 'M', order = 4)          # For Monthly Seasonality
dp = DeterministicProcess(
    index = y.index,
    constant=True,
    order=1,
    seasonal = True,
    additional_terms =[fourier],
    drop=True,
)
xx = dp.in_sample()
dp.index
xx

Unnamed: 0_level_0,const,trend,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)","sin(2,freq=M)","cos(2,freq=M)","sin(3,freq=M)","cos(3,freq=M)","sin(4,freq=M)","cos(4,freq=M)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2017-01-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
2017-01-02,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.201299,0.979530,0.394356,0.918958,0.571268,0.820763,0.724793,0.688967
2017-01-03,1.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.394356,0.918958,0.724793,0.688967,0.937752,0.347305,0.998717,-0.050649
2017-01-04,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.571268,0.820763,0.937752,0.347305,0.968077,-0.250653,0.651372,-0.758758
2017-01-05,1.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.724793,0.688967,0.998717,-0.050649,0.651372,-0.758758,-0.101168,-0.994869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,223.0,0.0,0.0,0.0,0.0,1.0,0.0,0.897805,-0.440394,-0.790776,-0.612106,-0.201299,0.979530,0.968077,-0.250653
2017-08-12,1.0,224.0,0.0,0.0,0.0,0.0,0.0,1.0,0.790776,-0.612106,-0.968077,-0.250653,0.394356,0.918958,0.485302,-0.874347
2017-08-13,1.0,225.0,0.0,0.0,0.0,0.0,0.0,0.0,0.651372,-0.758758,-0.988468,0.151428,0.848644,0.528964,-0.299363,-0.954139
2017-08-14,1.0,226.0,1.0,0.0,0.0,0.0,0.0,0.0,0.485302,-0.874347,-0.848644,0.528964,0.998717,-0.050649,-0.897805,-0.440394


In [5]:
# **************** Seasonality Features ************
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
#
fourier = CalendarFourier(freq = 'M', order = 4)  # from monthly seasonality
dp = DeterministicProcess(
    index = y.index,
    constant=True,
    order=1,
    seasonal = True,
    additional_terms =[fourier],
    drop=True,
)
xx = dp.in_sample()
dp.index
xx

Unnamed: 0_level_0,const,trend,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)","sin(2,freq=M)","cos(2,freq=M)","sin(3,freq=M)","cos(3,freq=M)","sin(4,freq=M)","cos(4,freq=M)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2017-01-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
2017-01-02,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.201299,0.979530,0.394356,0.918958,0.571268,0.820763,0.724793,0.688967
2017-01-03,1.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.394356,0.918958,0.724793,0.688967,0.937752,0.347305,0.998717,-0.050649
2017-01-04,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.571268,0.820763,0.937752,0.347305,0.968077,-0.250653,0.651372,-0.758758
2017-01-05,1.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.724793,0.688967,0.998717,-0.050649,0.651372,-0.758758,-0.101168,-0.994869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,223.0,0.0,0.0,0.0,0.0,1.0,0.0,0.897805,-0.440394,-0.790776,-0.612106,-0.201299,0.979530,0.968077,-0.250653
2017-08-12,1.0,224.0,0.0,0.0,0.0,0.0,0.0,1.0,0.790776,-0.612106,-0.968077,-0.250653,0.394356,0.918958,0.485302,-0.874347
2017-08-13,1.0,225.0,0.0,0.0,0.0,0.0,0.0,0.0,0.651372,-0.758758,-0.988468,0.151428,0.848644,0.528964,-0.299363,-0.954139
2017-08-14,1.0,226.0,1.0,0.0,0.0,0.0,0.0,0.0,0.485302,-0.874347,-0.848644,0.528964,0.998717,-0.050649,-0.897805,-0.440394


In [6]:
# ***************** Priliminary Fitting *******************
import numpy as np
%matplotlib
model  = LinearRegression().fit(xx, y)
y_pred = pd.Series(model.predict(xx), index=y.index, name = 'fitted')
eval_errors.MAPE(y, y_pred)
#
# forecasting
#
Nsteps   = 30
x_fore   = dp.out_of_sample(steps = Nsteps)
forecast = pd.Series(model.predict(x_fore), index = x_fore.index, name = 'forecast',)
#
# Plotting
#
fig, axes = plt.subplots()
y.plot(legend = True)
y_pred.plot(legend = True)
forecast.plot(legend = True)
axes.set_ylabel('total sales')
axes.set_title('Priliminary Fitting: store_{}'.format(str(store_nber)))


Using matplotlib backend: QtAgg

MAPE: 7.4471241721186505


Text(0.5, 1.0, 'Priliminary Fitting: store_6')

In [24]:
holidays_events = pd.read_csv("holidays_events.csv")
holidays_events

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


In [7]:
# ************************ Fitting with Holidays Features ********************
#
# National and regional holidays in the training set
from sklearn.preprocessing import OneHotEncoder

holidays_events = pd.read_csv("holidays_events.csv",
                              dtype={'type': 'category',
                                     'locale': 'category',
                                     'locale_name': 'category',
                                     'description': 'category',
                                     'transferred': 'bool'},
                              parse_dates=['date'],
                              infer_datetime_format=True,
                              )

holidays_events = holidays_events.set_index('date').to_period('D')

holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .loc['2017':'2017-11-13', ['description']]
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
)
#
# Holiday Features
#
encode = OneHotEncoder(sparse=False)
encode_df = pd.DataFrame(encode.fit_transform(holidays),
                        index= holidays.index,
                        columns  = holidays.description.unique())
# OR Using Pandas
# encode_df = pd.get_dummies(holidays)

x2  = xx.join(encode_df,on = 'date').fillna(0.0)
x2
# Apply the Model
model_hldays = LinearRegression().fit(x2, y)
yy_pred = pd.Series(model_hldays.predict(x2), index = x2.index, name = 'fitted_with_hldays')
eval_errors.MAPE(y, yy_pred)

fig, ax = plt.subplots()
y.plot(legend = True)
yy_pred.plot(legend = True)
axes.set_ylabel('total sales')
ax.set_title('Fitting with Holidays: Store_nbr {}'.format(str(store_nber)))


MAPE: 6.811042280588856


Text(0.5, 1.0, 'Fitting with Holidays: Store_nbr 6')

In [12]:
holidays
encode_df

Unnamed: 0_level_0,Primer dia del ano,Traslado Primer dia del ano,Carnaval,Provincializacion de Cotopaxi,Viernes Santo,Dia del Trabajo,Dia de la Madre-1,Dia de la Madre,Batalla de Pichincha,Traslado Batalla de Pichincha,Provincializacion de Imbabura,Primer Grito de Independencia,Traslado Primer Grito de Independencia,Independencia de Guayaquil,Dia de Difuntos,Independencia de Cuenca,Provincializacion de Santo Domingo,Provincializacion Santa Elena
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2017-02-27,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-02-28,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-04-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-04-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2017-05-01,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-13,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-14,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-24,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
(7.447 - 6.811)/(7.447)

0.08540351819524643

In [10]:
#
# Out of Sample Forecasting
#
Nsteps  = 30   # 1 month 
x2_fore = dp.out_of_sample(steps = Nsteps)
x2_fore = x2_fore.join(encode_df).fillna(0.0)
x2_fore[['Primer dia del ano', 'Carnaval']].tail(50)    # Checking that Carnaval falls on '2017-11-02'
#
# Apply the Model
#
y_forecast = pd.Series(model_hldays.predict(x2_fore), index = x2_fore.index, name = 'forecest_with_holidays')
#
# Plotting
#
fig, ax = plt.subplots()
y.plot(legend = True, axes = ax)
yy_pred.plot(label = 'fitted_with_holidays', legend = True)
ax.set_ylabel('total sales')
ax.set_title('Forecast with Holidays: Store_nbr {}'.format(str(store_nber)))
y_forecast.plot(legend = True)

<AxesSubplot:title={'center':'Forecast with Holidays: Store_nbr 6'}, xlabel='date', ylabel='total sales'>

In [12]:
# *********** Forcating for ALl Stores *********** 
%matplotlib
df  = store_sales.loc[store_sales.store_nbr == 1].groupby(['date']).sales.sum().rename('store_1').to_frame()
df                   
for jj in range(1, 55):
    df['store_{}'.format(jj)] = store_sales.loc[store_sales.store_nbr == jj].groupby(['date']).sales.sum().rename('store_{}'.format(jj)).to_frame()
df['year'] = df.index.year
# df.to_csv('Per_store_DF')

year = 2017
df = df.loc[df.year == year]
y_store = df.copy()
y_store = y_store.drop("year", axis = 1)
y_store
#
# Features
#
fourier = CalendarFourier(freq = 'M', order = 4)
dp = DeterministicProcess(
    index = y_store.index,
    constant=True,
    order=1,
    seasonal = True,
    additional_terms =[fourier],
    drop=True,
)
x_store = dp.in_sample()
x_store = x_store.join(encode_df).fillna(0.0)
x_store
#
# Apply the Model
#
model_store = LinearRegression().fit(x_store, y_store)
y_store_pred = pd.DataFrame(model_store.predict(x_store), index = x_store.index, columns=y_store.columns)
#
# Out of Sample prediction
#
Nsteps  = 30
x_store_fore = dp.out_of_sample(steps = Nsteps)
x_store_fore = x_store_fore.join(encode_df).fillna(0.0)
# Apply the Model
y_store_forecast = pd.DataFrame(model_store.predict(x2_fore), index = x2_fore.index, columns = y_store.columns)
#
# Plotting
#
store_nber = 'store_50'
fig, ax = plt.subplots()
y_store[store_nber].plot(label = 'Total_sales', legend = True, axes = ax)
y_store_pred[store_nber].plot(label = 'fitted_with_holidays', legend = True)
y_store_forecast[store_nber].plot(label = 'Forecast', legend = True)
ax.set_title('Fitting with Holidays: {}'.format(store_nber))
#Errors
errors = pd.DataFrame(columns = ['store_nber', 'MAPE'])
for ii in range(1,55):
    df1 = y_store['store_{}'.format(str(ii))]
    df2 = y_store_pred['store_{}'.format(str(ii))]
    err = np.mean(np.abs((df1[2:] - df2[2:])/df1[2:]))*100
    errors.loc[ii] = [str(ii), err]
# display(errors)
df

Using matplotlib backend: QtAgg


In [63]:
# ************** Store 52 (with zero sales untill April) ************
#
# df= df[df['ColName'] != 0]
store = 'store_52'
dff   = x_store.join(df[[store]]).fillna(0.0)
dff   = dff[dff[store] != 0]
x_52  = dff.drop(store, axis =1)
y_52  = dff[store]

# model training
model_52 = LinearRegression().fit(x_52, y_52)
y_52_pred = pd.Series(model_52.predict(x_52), index = x_52.index, name ='fitted')

# Out of Sample
Nsteps  = 30
x_52_fore = dp.out_of_sample(steps = Nsteps)
x_52_fore = x_52_fore.join(encode_df).fillna(0.0)
y_52_forecast = pd.Series(model_52.predict(x_52_fore), index = x_52_fore.index, name = 'forecast')

# Plotting
fig, ax = plt.subplots()
y_52.plot(axes = ax, label = 'Total _sales', legend = True)
y_52_pred.plot(axes = ax, label = 'fitted', legend = True)
y_52_forecast.plot(axes = ax, label = 'forecast', legend = True)
ax.set_ylabel('total sales')
ax.legend()

# Error
mape = np.mean(np.abs((y_52 - y_52_pred)/y_52))*100
print('MAPE: {}'.format(str(mape)))

MAPE: 6.886149328428727


In [34]:
import statsmodels.api as sm
%matplotlib
fig, ax = plt.subplots(1,2)
# store_nber = 'store_1'
sm.graphics.tsa.plot_acf(y_store[store_nber],  title = "ACF: {}".format(store_nber), ax =ax[0])
ax[0].set_xlabel('lag')
ax[0].set_ylabel('ACF:{}'.format(store_nber))

sm.graphics.tsa.plot_pacf(y_store[store_nber], title = "PACF: {}".format(store_nber), ax = ax[1])
ax[1].set_xlabel('lag')
ax[1].set_ylabel('ACF:{}'.format(store_nber))

df

Using matplotlib backend: QtAgg


Unnamed: 0_level_0,store_1,store_2,store_3,store_4,store_5,store_6,store_7,store_8,store_9,store_10,...,store_46,store_47,store_48,store_49,store_50,store_51,store_52,store_53,store_54,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2017
2017-01-02,5624.025879,23867.083984,62625.625000,23386.333984,13024.175781,33420.296875,27170.101562,36087.695312,38880.531250,10540.671875,...,72375.148438,74065.632812,62877.964844,63407.343750,43368.656250,27594.099609,0.000000,17306.611328,13112.303711,2017
2017-01-03,14932.340820,18242.900391,48968.941406,18079.419922,12096.441406,24664.802734,28773.128906,26262.029297,29011.273438,9617.971680,...,36651.390625,50223.832031,33423.105469,50321.156250,26479.121094,29489.021484,0.000000,13771.746094,11559.368164,2017
2017-01-04,16356.818359,18941.468750,48243.972656,17405.386719,12533.030273,23453.037109,31922.193359,30255.765625,21194.234375,6367.875000,...,33098.042969,47774.371094,27152.925781,48343.402344,23672.505859,33134.511719,0.000000,13442.102539,8455.379883,2017
2017-01-05,12789.186523,14372.650391,36147.976562,13270.617188,9360.265625,15548.572266,21061.583984,22199.845703,18888.748047,6192.646973,...,26519.957031,35960.875000,24248.152344,33285.636719,19211.662109,22408.039062,0.000000,10646.760742,7173.889160,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,5403.742188,13081.770508,32885.761719,10819.542969,9510.871094,15775.921875,15090.052734,18440.070312,15569.746094,5932.179199,...,32020.673828,38072.656250,26365.390625,32977.570312,21339.242188,26070.017578,30742.705078,12033.407227,8513.833984,2017
2017-08-12,7543.623047,12382.416016,31903.476562,10622.299805,8157.069824,14502.507812,14668.772461,19548.548828,16901.507812,6322.808105,...,28639.052734,34385.410156,24749.851562,30722.046875,19413.957031,19532.964844,22525.365234,12316.158203,9139.677734,2017
2017-08-13,3902.781982,13766.822266,34845.121094,13065.903320,10044.821289,19261.919922,14871.926758,21646.660156,18128.804688,6773.541992,...,32935.398438,38953.742188,29164.994141,32984.484375,21221.232422,25713.283203,24232.105469,12750.175781,14246.828125,2017
2017-08-14,10433.313477,12211.460938,32843.703125,10911.889648,9011.575195,14206.605469,19957.599609,19140.136719,15978.708984,6236.741211,...,23917.605469,31814.267578,20146.347656,31229.005859,18468.113281,18932.966797,18840.373047,8530.741211,11882.994141,2017


In [29]:
# ************* Trend **************
# Plot a 1 year MA
store_nber = 'store_6'
DF = df[[store_nber]]
DF         

#     window=365,       # 365-day window
#     center=True,      # puts the average at the center of the window
#     min_periods=183,  # choose about half the window size
#     .mean()           # compute the mean (could also do median, std, m

MA_365 = DF[store_nber].rolling (window = 365, center = True, min_periods = 183).mean()

fig, ax = plt.subplots()
DF[store_nber].plot(ax= ax, label = 'Total_sales')
ax.plot(MA_365, linewidth = 3, label = 'Moving Average_365')
ax.set_ylabel('Sales')
ax.set_title(store_nber)
ax.legend()
#
dp = DeterministicProcess( index = DF.index,   # dates from the training data
                           constant = True,    # dummy feature for the bias (y_intercept)
                           order=1,            # the time dummy (trend) as we saw above
                           drop=True,          # drop terms if necessary to avoid collinearity 
                         )

#
# Train the Model
#
X = dp.in_sample()       # the feature
y = DF[store_nber]   # The target
model = LinearRegression(fit_intercept = False)
model.fit(X, y)
y_pred = pd.Series(model.predict(X), index = X.index).to_frame("Trend_predicted ")
y_pred 

fig, ax = plt.subplots()
DF[store_nber].plot(ax= ax, label = 'Total sales')
y_pred.plot(ax = ax, label = 'predicted_trend')
ax.set_ylabel('Sales')
ax.set_title(store_nber)
ax.legend()
#
# Out of Sample forecasting
#
Nsteps = 30       # months forecast
X1 = dp.out_of_sample(steps = Nsteps)
forecast = pd.Series(model.predict(X1), index = X1.index).to_frame('Trend_forecast')
list_forecast = list(forecast['Trend_forecast'])
y_forecast = pd.DataFrame(list_forecast, index = X1.index, columns = forecast.columns)
# Plot
y_forecast['Trend_forecast'].plot(ax = ax, label = 'Trend_forecast')
ax.legend()



<matplotlib.legend.Legend at 0x1c2e225bca0>

In [17]:
# ************ Forecasting For All Product families **********
#
family = []
for i, product in enumerate(store_sales.family.unique()):
    family.append(product)
family[2]
df  = store_sales.loc[store_sales.family == family[0]].groupby(['date']).sales.sum().rename(family[0]).to_frame()

for jj in range(1, 33):
    df[family[jj]] = store_sales.loc[store_sales.family == family[jj]].groupby(['date']).sales.sum().rename(family[jj]).to_frame()
df['year'] = df.index.year
y_family = df.copy()
y_family = y_family.loc[y_family.year == year]
y_family = y_family.drop('year', axis = 1)
#
# Features
#
fourier = CalendarFourier(freq = 'M', order = 4)
dp = DeterministicProcess(
    index = y_family.index,
    constant=True,
    order=1,
    seasonal = True,
    additional_terms =[fourier],
    drop=True,
)
x_family = dp.in_sample()
x_family = x_family.join(encode_df).fillna(0.0)

#
# Apply the Model
#
model_family = LinearRegression().fit(x_family, y_family)
y_family_pred = pd.DataFrame(model_family.predict(x_family), index = x_family.index, columns=y_family.columns)
#
# Out of Sample prediction
#
Nsteps  = 30
x_family_fore = dp.out_of_sample(steps = Nsteps)
x_family_fore = x_family_fore.join(encode_df).fillna(0.0)
#
# Apply the Model
#
y_family_forecast = pd.DataFrame(model_family.predict(x_family_fore), index = x_family_fore.index, columns = y_family.columns)
#
# Plotting
#
fmly = family[2]   # 0 - 32  :33 different products

fig, ax = plt.subplots()
y_family[fmly].plot(label = 'Total_sales', legend = True, axes = ax)
y_family_pred[fmly].plot(label = 'fitted_with_holidays', legend = True)
y_family_forecast[fmly].plot(label = 'Forecast', legend = True)
ax.set_ylabel('total sales')
ax.set_title('Fitting with Holidays: {}'.format(fmly))
#Errors
eval_errors.MAPE(y_family[fmly],y_family_pred[fmly])


MAPE: 15.237878061805985


15.237878061805985

In [24]:
# ********* Forecast per store per Family product ***********
store_sales_sorted = store_sales.copy()
store_sales_sorted = store_sales_sorted.set_index(['store_nbr', 'family', 'date']).sort_index()
y = store_sales_sorted.unstack(['store_nbr', 'family']).loc["2017"]
#
# Features
#
fourier = CalendarFourier(freq = 'M', order = 4)
dp = DeterministicProcess(
    index = y.index,
    constant=True,
    order=1,
    seasonal = True,
    additional_terms =[fourier],
    drop=True,
)
X = dp.in_sample()
X = X.join(encode_df).fillna(0.0)
#
# Apply the Model
#
model_fmly_store = LinearRegression().fit(X, y)
y_pred = pd.DataFrame(model_fmly_store.predict(X), index = X.index, columns=y.columns)
#
# Out of Sample prediction
#
Nsteps  = 30
X_fore = dp.out_of_sample(steps = Nsteps)
X_fore = X_fore.join(encode_df).fillna(0.0)
#
# Apply the Model
#
y_fore = pd.DataFrame(model_fmly_store.predict(X_fore), index = X_fore.index, columns = y.columns)
# Plotting
STORE_NBR = '1'  # 1 - 54
FAMILY = 'AUTOMOTIVE'
ax = y.loc(axis=1)['sales', STORE_NBR, FAMILY].plot(**plot_params)
ax = y_pred.loc(axis=1)['sales', STORE_NBR, FAMILY].plot(ax=ax)
ax = y_fore.loc(axis=1)['sales', STORE_NBR, FAMILY].plot(ax=ax)
ax.set_title(f'{FAMILY} Sales at Store {STORE_NBR}')
#Errors
eval_errors(y_family[fmly],y_family_pred[fmly])

In [28]:
y_fore.loc(axis =1)['sales', '1','AUTOMOTIVE']

KeyError: ('sales', '1', 'AUTOMOTIVE')