# Store Item Demand Forecasting Challenge

## Prophet

<a href="https://www.kaggle.com/c/demand-forecasting-kernels-only">Link to competition on Kaggle.</a>

<a href="https://facebook.github.io/prophet/">Prophet</a> is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.

Prophet is open source software released by Facebook’s Core Data Science team. It is available for download on CRAN and PyPI.

In [None]:
# Basic packages
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # generating random numbers
import datetime # manipulating date formats
from fbprophet import Prophet

# Viz
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # for prettier plots


# TIME SERIES
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs


# settings
import warnings
warnings.filterwarnings("ignore")

Acc_Start = time.time()

In [None]:
# Read every competition table from the Kaggle input directory.
sales = pd.read_csv("../input/sales_train.csv")

# settings: keep library warnings out of the notebook output
import warnings
warnings.filterwarnings("ignore")

item_cat = pd.read_csv("../input/item_categories.csv")
item = pd.read_csv("../input/items.csv")
sub = pd.read_csv("../input/sample_submission.csv")
shops = pd.read_csv("../input/shops.csv")
test = pd.read_csv("../input/test.csv")

In [None]:
# Parse the day-first "dd.mm.YYYY" strings with one vectorized call. This is
# much faster than a per-row strptime lambda and does not depend on which
# object the name `datetime` is currently bound to.
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')
# check -- DataFrame.info() prints its report itself and returns None, so it
# is called directly instead of being wrapped in print().
sales.info()
# Both figures below are measured from the notebook start timestamp.
Block_End=time.time()-Acc_Start
print("Execute block use " + "%0.2f" % round(Block_End,2) +"secs\n")
print("Accumulate use " + "%0.2f" % round(Block_End,2) +"secs")

In [None]:
# Attach item metadata to every sales row, then index the frame by date.
Block_Start = time.time()

sales = sales.merge(item, how='left', on=['item_id'])
sales.drop(columns=['item_name'], inplace=True)
sales.index = sales.date

# Timing report: this cell alone, then the running total.
Block_End = time.time() - Block_Start
Acc_End = time.time() - Acc_Start
print(f"Execute block use {round(Block_End, 2):0.2f}secs\n")
print(f"Accumulate use {round(Acc_End, 2):0.2f}secs")

In [None]:
# Expose the date index as 'ds' and the daily item count as 'y'.
sales['ds'] = sales.index
sales['y'] = sales['item_cnt_day']


In [None]:
# Fit one Prophet model on the whole sales frame (yearly seasonality on,
# seasonality prior scale 0.1) and predict over a frame extended by 213 periods.
# NOTE(review): `sales` holds one row per transaction, so each date can appear
# many times in 'ds' -- confirm that fitting on un-aggregated rows is intended.
m = Prophet(yearly_seasonality = True, seasonality_prior_scale=0.1)
m.fit(sales)
future = m.make_future_dataframe(periods=213)
forecast = m.predict(future)

In [None]:
m.plot_components(forecast)



# Viz
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # for prettier plots


from fbprophet import Prophet

plt.rcParams['figure.figsize'] = (16, 9)
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Load Data

Path="../input/"
#os.listdir(f'{Path}')


from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# TIME SERIES
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

# Basic packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.max_columns = 99
import random as rd # generating random numbers
import datetime # manipulating date formats
import time
import os
import shutil
%matplotlib inline
from datetime import datetime, date


# settings
import warnings
warnings.filterwarnings("ignore")


# Data preparation: read each competition CSV into a dict of DataFrames keyed
# by a short table name. The sample submission is deliberately not loaded.
data = {
    key: pd.read_csv(f'{Path}{csv_name}')
    for key, csv_name in (
        ('item_cat', 'item_categories.csv'),
        ('items', 'items.csv'),
        ('sales_train', 'sales_train.csv'),
        ('shops', 'shops.csv'),
        ('test', 'test.csv'),
    )
}



In [None]:
#Block_End=time.time()-Block_Start
# Refresh the running total before printing it; with this line commented out
# the print below reported the value left over from an earlier cell.
Acc_End=time.time()-Acc_Start
#print("Execute block use " + "%0.2f" % round(Block_End,2) +"secs\n")
print("Accumulate use " + "%0.2f" % round(Acc_End,2) +"secs")

In [None]:
Block_Start=time.time()
# The raw dates are day-first ("dd.mm.YYYY"). Without an explicit format pandas
# reads ambiguous values such as "02.01.2013" month-first, silently swapping
# day and month for part of the data.
data['sales_train']['date'] = pd.to_datetime(data['sales_train']['date'], format='%d.%m.%Y')
# Calendar features derived from the parsed date.
data['sales_train']['dow'] = data['sales_train']['date'].dt.dayofweek
data['sales_train']['year'] = data['sales_train']['date'].dt.year
data['sales_train']['month'] = data['sales_train']['date'].dt.month

# Bring in the item metadata; the free-text item name is not used afterwards.
data['sales_train'] = pd.merge(data['sales_train'], data['items'] , how='left', on=['item_id'])
data['sales_train'].drop(['item_name'], axis=1, inplace=True)

Block_End=time.time()-Block_Start
Acc_End=time.time()-Acc_Start
print("Execute block use " + "%0.2f" % round(Block_End,2) +"secs\n")
print("Accumulate use " + "%0.2f" % round(Acc_End,2) +"secs")

# Preview every loaded table. The loop variable must not be named `pd`: that
# would rebind the pandas alias to a dict key (a str) and break every later
# `pd.` call in the notebook.
for table_name in data:
    print(data[table_name].head())

In [None]:
Block_Start = time.time()

# Label encoder for categorical variables: re-code both id columns as
# consecutive integers. The same encoder object is re-fitted for each column.
# NOTE(review): item_id is re-coded here while data['test'] keeps its raw ids;
# confirm that later comparisons between the two item_id columns still hold.
lbl = LabelEncoder()
for _col in ('item_id', 'item_category_id'):
    data['sales_train'][_col] = lbl.fit_transform(data['sales_train'][_col])

Block_End = time.time() - Block_Start
Acc_End = time.time() - Acc_Start
print(f"Execute block use {round(Block_End, 2):0.2f}secs\n")
print(f"Accumulate use {round(Acc_End, 2):0.2f}secs")

In [None]:
data['sales_train'].head(20)

In [None]:
cols = data['sales_train'].columns.tolist()

In [None]:
cols

In [None]:
cols=['date',
     'date_block_num',
     'year',
     'month',
     'dow',
     'shop_id',
     'item_category_id',
     'item_id',
     'item_price',
     'item_cnt_day',
    ]

In [None]:
data['sales_train']=data['sales_train'][cols]

In [None]:
# Keep the per-date grouping under its own name. Overwriting
# data['sales_train'] with the GroupBy object breaks the later cells that
# treat it as a DataFrame (.item_id.unique(), .groupby([...])).
sales_by_date = data['sales_train'].groupby("date")

In [None]:
data['sales_train'].head()

In [None]:
# Give the test rows the same item metadata as the training rows.
Block_Start = time.time()
data['test'] = data['test'].merge(data['items'], how='left', on=['item_id'])
data['test'].drop(columns=['item_name'], inplace=True)
data['test'].head(20)

Block_End = time.time() - Block_Start
Acc_End = time.time() - Acc_Start
print(f"Execute block use {round(Block_End, 2):0.2f}secs\n")
print(f"Accumulate use {round(Acc_End, 2):0.2f}secs")

In [None]:
Block_Start=time.time()
# Split the test set by whether each item was seen in the training data.
train_items = data['sales_train'].item_id.unique()
# Compute the membership mask once and reuse it.
in_train = data['test'].item_id.isin(train_items)
# test_old holds the items that ARE covered by train (it was previously
# assigned the complement here, contradicting its later definition).
test_old = data['test'][in_train]
test_items_not_in_train = data['test'][~in_train].item_id.unique()
print('%d items in test data not found in train data' % len(test_items_not_in_train))
print("\n")
Block_End=time.time()-Block_Start
Acc_End=time.time()-Acc_Start
print("Execute block use " + "%0.2f" % round(Block_End,2) +"secs\n")
print("Accumulate use " + "%0.2f" % round(Acc_End,2) +"secs")

In [None]:
# Partition the test set: rows whose item never appears in train (test_new)
# versus rows whose item does (test_old).
seen_in_train = data['test'].item_id.isin(train_items)
test_new = data['test'][~seen_in_train]
test_old = data['test'][seen_in_train]

y=len(data['test'])
x=len(test_old)+len(test_new)
# A bare `x==y` in the middle of a cell is evaluated and thrown away; assert
# so the sanity check actually runs. True means the test set was split
# successfully with no row lost or duplicated.
assert x == y, "test_old and test_new must together cover every test row"


# Keep only the rows that have a non-null date.
# NOTE(review): `train` is not assigned anywhere earlier in the visible
# source -- confirm where it is expected to come from.
train = train[train['date'].notnull()]

#helper function to set my predict range

def Predict_Range(pd_data,begin,end) :
    """Repeat ``pd_data`` once for every calendar day in ``[begin, end]``.

    Parameters
    ----------
    pd_data : pandas.DataFrame
        Rows to replicate. The frame itself is left untouched.
    begin, end : str
        First and last day of the range (both inclusive), written as
        ``"%m.%d.%Y"``, e.g. ``'11.1.2015'``.

    Returns
    -------
    lt_date : list of pandas.Timestamp
        One entry per day in the range.
    pd_filled : pandas.DataFrame
        ``len(lt_date)`` stacked copies of ``pd_data`` (original index labels
        are kept, so they repeat), each carrying a ``'date'`` column with that
        day formatted as ``'%m/%d/%Y'``. Empty when the range has no days.

    Raises
    ------
    ValueError
        If ``begin`` or ``end`` does not match the expected format.
    """
    # Local import: at module level this file binds the name `datetime` to the
    # module in one place and to the class in another, so
    # `datetime.datetime.strptime` is not reliable here.
    from datetime import datetime as _datetime

    date_format = "%m.%d.%Y"
    dt_begin = _datetime.strptime(begin, date_format)
    dt_end = _datetime.strptime(end, date_format)
    periods_in_days = (dt_end - dt_begin).days + 1

    lt_date = pd.date_range(dt_begin, periods=periods_in_days, freq='D').tolist()

    # Build one dated copy per day and concatenate once; growing a frame with
    # concat inside the loop is quadratic, and `assign` never writes into the
    # caller's data.
    dated_copies = [pd_data.assign(date=day.strftime('%m/%d/%Y')) for day in lt_date]
    pd_filled = pd.concat(dated_copies) if dated_copies else pd.DataFrame()

    return lt_date,pd_filled
 



lt_date1,test=Predict_Range(test,'11.1.2015','11.30.2015')    #m.d.y


print (test[test.ID==200])

In [None]:
#Block_Start=time.time()
import pandas_profiling
pandas_profiling.ProfileReport(data['sales_train'])
#Block_End=time.time()-Block_Start



In [None]:
# Report the total time elapsed since the notebook's start timestamp.
Acc_End = time.time() - Acc_Start
print(f"Accumulate use {round(Acc_End, 2):0.2f}secs")

**Stationarity Testing**


In [None]:
# Total items sold per month block across all shops and items.
ts=data['sales_train'].groupby(["date_block_num"])["item_cnt_day"].sum()
# astype returns a new Series; assign it back, otherwise the cast is discarded.
ts = ts.astype('float')
plt.figure(figsize=(16,8))
plt.title('Total Sales of the company')
plt.xlabel('Time')
plt.ylabel('Sales')
plt.plot(ts);

In [None]:
# Overlay the 12-period rolling mean and rolling standard deviation of ts.
plt.figure(figsize=(16, 6))
rolling_12 = ts.rolling(window=12, center=False)
plt.plot(rolling_12.mean(), label='Rolling Mean')
plt.plot(rolling_12.std(), label='Rolling sd')
plt.legend();

In [None]:
import statsmodels.api as sm
# multiplicative
# Decompose the raw values of ts with a 12-step seasonal cycle and plot the
# components.
# NOTE(review): newer statsmodels releases name this argument `period`
# instead of `freq` -- confirm the installed version still accepts `freq`.
res = sm.tsa.seasonal_decompose(ts.values,freq=12,model="multiplicative")
#plt.figure(figsize=(16,12))
fig = res.plot()
#fig.show()

In [None]:
# Additive model
# Same decomposition as the multiplicative one, with the components summed
# rather than multiplied.
# NOTE(review): newer statsmodels releases name this argument `period`
# instead of `freq` -- confirm the installed version still accepts `freq`.
res = sm.tsa.seasonal_decompose(ts.values,freq=12,model="additive")
#plt.figure(figsize=(16,12))
fig = res.plot()
#fig.show()

In [None]:
# Stationarity tests
def test_stationarity(timeseries):
    """Run ``adfuller`` on ``timeseries`` and print a labelled summary.

    The printout lists the test statistic, p-value, number of lags used and
    number of observations used, followed by one 'Critical Value (<key>)'
    entry per item of the critical-value mapping returned by ``adfuller``.
    Nothing is returned.
    """
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')

    headline_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    summary = dict(zip(headline_labels, adf_result[0:4]))
    # Element 4 of the result maps each significance level to its critical value.
    summary.update(
        ('Critical Value (%s)' % level, critical)
        for level, critical in adf_result[4].items()
    )
    print (pd.Series(summary))

test_stationarity(ts)

In [None]:
# to remove trend
from pandas import Series as Series
# create a differenced series
def difference(dataset, interval=1):
    """Return ``dataset[i] - dataset[i - interval]`` for each valid ``i`` as a Series.

    ``dataset`` is accessed with ``[]`` using integer keys from ``interval``
    up to ``len(dataset) - 1``; the result has ``len(dataset) - interval``
    entries and a fresh default index.
    """
    deltas = [
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ]
    return Series(deltas)

# invert differenced forecast
def inverse_difference(last_ob, value):
    """Undo one differencing step: return ``value + last_ob``."""
    restored = value + last_ob
    return restored



In [None]:
# Plot the monthly series next to its lag-1 and lag-12 differences.
ts=data['sales_train'].groupby(["date_block_num"])["item_cnt_day"].sum()
# astype returns a new Series; assign it back, otherwise the cast is discarded.
ts = ts.astype('float')
plt.figure(figsize=(16,16))
plt.subplot(311)
plt.title('Original')
plt.xlabel('Time')
plt.ylabel('Sales')
plt.plot(ts)
plt.subplot(312)
plt.title('After De-trend')
plt.xlabel('Time')
plt.ylabel('Sales')
# Lag-1 difference.
new_ts=difference(ts)
plt.plot(new_ts)
plt.plot()

plt.subplot(313)
plt.title('After De-seasonalization')
plt.xlabel('Time')
plt.ylabel('Sales')
new_ts=difference(ts,12)       # assuming the seasonality is 12 months long
plt.plot(new_ts)
plt.plot()

In [None]:
test_stationarity(new_ts)

Clean Above

## Prophet

We will focus on **test_old** first, i.e. the test rows whose items also appear in the training set.

In [None]:
test_old.head()

In [None]:
# Draft of a per-item Prophet loop.
# NOTE(review): this cell mixes names from a different dataset (air_store_id,
# visit_date, visitors, store_id, number_of_stores, df_sub) with this
# notebook's names; none of those are defined in the visible source. Treat it
# as non-runnable until the points flagged below are resolved.
import logging
logging.getLogger('fbprophet.forecaster').propagate = False

#pd_sub = pd.read_csv('../input/sample_submission.csv')
#df_sub['store_id'] = df_sub['id'].apply(lambda x:x[:-11])

# NOTE(review): pd_test_old is read before any visible assignment; the frame
# built earlier is named test_old -- confirm which one is meant.
pd_test_old = pd_test_old.set_index('ID')

number_of_items = pd_test_old['item_id'].nunique()
date_range = pd.date_range(start=pd.to_datetime('2015-01-01'),
                           end=pd.to_datetime('2015-10-31'))
# NOTE(review): '2015-11-31' is not a calendar date (November has 30 days),
# and the subtraction is start minus end, which would give a negative horizon.
forecast_days = (pd.to_datetime('2015-10-31')-pd.to_datetime('2015-11-31')).days

# NOTE(review): ['item_id'] is a list literal, which has no .unique(); a
# DataFrame column was presumably intended.
for cnt, item_id in enumerate(['item_id'].unique()):
    # number_of_stores and store_id are not defined in the visible source.
    print('Predicting %d of %d.'%(cnt, number_of_stores), end='\r')
    # NOTE(review): this rebinds `data`, the dict of loaded tables created
    # earlier, to a DataFrame.
    data = train[train['air_store_id'] == store_id]
    data = data[['visit_date', 'visitors']].set_index('visit_date')
    # Ensure we have full range of dates.
    data = data.reindex(date_range).fillna(0).reset_index()
    data.columns = ['ds', 'y']
    
    m = Prophet()
    #m = Prophet(yearly_seasonality=True, mcmc_samples=300)
    #m.add_seasonality(name='weekly', period=7, fourier_order=3)
    m.fit(data)
    future = m.make_future_dataframe(forecast_days)
    forecast = m.predict(future)
    # Reshape the forecast into (id, visitors) rows keyed by "<store_id>_<YYYY-MM-DD>".
    forecast = forecast[['ds', 'yhat']]
    forecast.columns = ['id', 'visitors']
    forecast['id'] = forecast['id'].apply(lambda x:'%s_%s'%(store_id, x.strftime('%Y-%m-%d')))
    forecast = forecast.set_index('id')
    # df_sub is only referenced in commented-out code above.
    df_sub.update(forecast)
print('\n\nDone.')

In [None]:
proph_results = test_old.reset_index()
proph_results['item_cnt_day'] = 0

In [None]:
test_old.drop(['item_category_id'], axis=1, inplace=True)
test_old.head()

In [None]:
cols = test_old.columns.tolist()

In [None]:
cols

In [None]:
cols=['date','ID', 'shop_id', 'item_id']


In [None]:
test_old=test_old[cols]


In [None]:
test_old.head()

In [None]:
test_old['date'] = pd.to_datetime(test_old['date'], format="%m/%d/%Y")

In [None]:
test_old.set_index('date', inplace=True)

In [None]:
train.drop(['date_block_num','item_price'], axis=1, inplace=True)

In [None]:
train.head()

In [None]:
train.dropna(axis=1, how='all') 

In [None]:
test_old.dropna(axis=1, how='all') 

train_copy=train.copy()

In [None]:
train['item_cnt_day'].describe()

In [None]:
test_old.head()

In [None]:
tic = time.time()

# Forecast only the (shop, item) pairs that actually occur in proph_results.
# The full shop x item cross product also visits pairs with no rows to fill,
# so fitting a model for them is wasted work at best.
shop_item_pairs = proph_results[['shop_id', 'item_id']].drop_duplicates()
# One forecast step per distinct date in the test index.
horizon = len(test_old.index.unique())

# NOTE(review): forecasts are written to a 'sales' column, while the
# placeholder column created for proph_results is 'item_cnt_day' -- confirm
# which column the submission should carry.
for s, i in shop_item_pairs.itertuples(index=False, name=None):
    proph_train = train.loc[(train['shop_id'] == s) & (train['item_id'] == i)].reset_index()
    proph_train.rename(columns={'date': 'ds', 'item_cnt_day': 'y'}, inplace=True)

    # Prophet refuses to fit on fewer than two non-NaN observations; leave
    # such pairs unforecast instead of aborting the whole run.
    if proph_train['y'].notna().sum() < 2:
        continue

    m = Prophet()
    m.fit(proph_train[['ds', 'y']])
    future = m.make_future_dataframe(periods=horizon, include_history=False)
    fcst = m.predict(future)

    proph_results.loc[(proph_results['shop_id'] == s) & (proph_results['item_id'] == i), 'sales'] = fcst['yhat'].values

    toc = time.time()
    if i % 10 == 0:
        print("Completed store {} item {}. Cumulative time: {:.1f}s".format(s, i, toc-tic))

In [None]:
# Drop helper columns that are not part of the output. errors='ignore' keeps
# the cell from failing when some of these names are absent from the frame.
proph_results.drop(['date', 'store', 'item'], axis=1, inplace=True, errors='ignore')
proph_results.head()
# Clip only the forecast column to [0, 20]. Clipping the whole frame would
# also squash identifier columns such as ID, shop_id and item_id into that
# range.
proph_results['sales'] = proph_results['sales'].clip(lower=0., upper=20.)

In [None]:
proph_results.to_csv('proph_results.csv', index=False)