In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import gc

In [None]:
#importing all input data to Dataframe

In [None]:
# Load every input CSV from the input_data/ directory into its own DataFrame.
# NOTE(review): train.csv is described further down as "quite large"; if memory
# becomes a problem, consider passing dtype/parse_dates hints here — TODO confirm.
df_holevents = pd.read_csv('input_data/holidays_events.csv')
df_items     = pd.read_csv('input_data/items.csv')
df_oil       = pd.read_csv('input_data/oil.csv')
df_stores    = pd.read_csv('input_data/stores.csv')
df_test      = pd.read_csv('input_data/test.csv')
df_train     = pd.read_csv('input_data/train.csv')
df_trans     = pd.read_csv('input_data/transactions.csv')

# Exploring data

In [None]:
print(df_train.shape)
df_train.head()

In [None]:
print(df_test.shape)
df_test.head()

In [None]:
print(df_stores.shape)
df_stores.head()

In [None]:
print(df_items.shape)
df_items.head()

In [None]:
print(df_oil.shape)
df_oil.head()

In [None]:
print(df_trans.shape)
df_trans.head()

# Train Data

In [None]:
df_train.info()

In [None]:
#Now, let's get a sense of the time range for which the data was collected.

In [None]:
# convert date to datetime
df_train["date"] =  pd.to_datetime(df_train["date"])

In [None]:
df_train.head()

In [None]:
df_train["date"].dt.year.value_counts(sort = False).plot.bar()
plt.xlabel('Year')
plt.ylabel("Collected Data")


They have collected data from 2013 to 2017. There is an increase in the number of observations for each year except 2017 but this is probably because it is not yet over. Note that the training set is quite large.

In [None]:
df_train_2016 = df_train[df_train["date"].dt.year == 2016]

Let's take a look at how the data is distributed by month.

In [None]:
df_train_2016["date"].dt.month.value_counts(sort = False).plot.bar()

The observations are almost uniformly distributed by month. The maximum occurs in December and the minimum in February. How about by day of the month?

In [None]:
df_train_2016["date"].dt.day.value_counts(sort = False).plot.bar()

Again, the observations are almost uniformly distributed by day.

# Stores

In [None]:
# How many stores?

In [None]:
df_stores.head()

In [None]:
df_stores['store_nbr'].unique()

In [None]:
df_stores['state'].unique()

# Items

In [None]:
df_train_2016["item_nbr"].unique().shape[0]

There were 3886 different types of items either sold or returned during 2016.

In [None]:
# Count the distinct items that appear for each store in the 2016 training rows.
# Store numbers are taken to run from 1 to 54, as in the original loop.
stores = np.arange(1, 55)
# One grouped pass over the frame instead of 54 boolean scans. Stores with no
# 2016 rows are reindexed in with a count of 0 so they still get a bar.
items_store = (
    df_train_2016.groupby("store_nbr")["item_nbr"]
    .nunique()
    .reindex(stores, fill_value=0)
    .values.astype(float)
)
# seaborn >= 0.12 only accepts x and y as keyword arguments.
sns.barplot(x=stores, y=items_store)

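As expected, this is very similar to the last bar plot because it measures the variety of items in each store. Interestingly, store 52 has 0 unique items. This is because store 52 has no rows in the 2016 training data: the loop assumes store numbers 1 to 54, so any store without 2016 sales shows up with a count of 0.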

In [None]:
#Item sales is our target variable
df_train_2016["unit_sales"].describe()

It is probably a good thing that the mean and median for unit sales are positive, otherwise the company would be losing money. Surprisingly, on one day, 4673 items were returned. I wonder if this corresponds to some sort of outbreak or health concern for a particular product. On the other hand, on another day, 89440 items were purchased. Perhaps this was before some sort of natural disaster (e.g. a hurricane).

Now, let's find out how many items were purchased by coupon clippers.

In [None]:
df_train_2016["onpromotion"].value_counts()

In [None]:
# Share of the 2016 rows that are flagged as on promotion, in percent.
# Computed from the data instead of pasted counts, and divided by ALL rows
# (promoted + non-promoted) so the result is a share rather than a ratio.
# The explicit comparison keeps this working if the column has object dtype.
(df_train_2016["onpromotion"] == True).mean() * 100

About 10% of the 2016 rows were sold on promotion (equivalently, roughly 11 promoted rows for every 100 non-promoted ones).

Missing data and Outliers

In [None]:
df_train_2016.isnull().sum()

Yay! There is no missing data in the training set. How about outliers in the target variable?

In [None]:
unit_sales = df_train_2016["unit_sales"].values
gc.collect()

In [None]:
plt.scatter(x = range(unit_sales.shape[0]), y = np.sort(unit_sales))

In [None]:
df_train = df_train.set_index('date')

In [None]:
df_train = df_train[['unit_sales']]

In [None]:
df_train = df_train.to_period(freq='M')
df_train.head()

In [None]:
df_train = df_train.groupby(['date']).sum()

In [None]:
df_train.head()

In [None]:
#Plotting the time Series for the training dataset
df_train.plot()

Problem description:
    In this competition, we will be predicting the unit sales for thousands of items sold at different Favorita stores located in Ecuador. The training data includes dates, store and item information, whether that item was being promoted, as well as the unit sales. Additional files include supplementary information that may be useful in building your models.

The dataset provides the number of monthly sales of items from January 2013 to August 2017.

The values are a count of millions of sales and there are 56 monthly observations.

In [None]:
df_train.plot(kind = "hist", bins = 30)

Transformation - Log
Transformations such as logarithms can help to stabilize the variance of a time series.

In [None]:
df_train['sales_unit_Log'] = np.log(df_train.unit_sales)
df_train.head()

In [None]:
df_train['sales_unit_Log'].plot(kind = "hist", bins = 30)

In [None]:
df_train['sales_unit_Log'].plot()

Basic Time Series Model
We will build a time-series forecasting model to get a forecast for monthly unit sales. Let us start with the three most basic models -
Mean Constant Model
Linear Trend Model
Random Walk Model

Mean Model
This very simple forecasting model will be called the "mean model"

In [None]:
model_mean_pred = df_train.sales_unit_Log.mean()

In [None]:
# Store the mean-model prediction, mapped back from log space with exp(), as a
# constant column next to the actual unit sales
df_train["salesMean"] = np.exp(model_mean_pred)

In [None]:
df_train.head()

In [None]:
df_train.plot(kind="line", y = ["unit_sales", "salesMean"])

In [None]:
def RMSE(predicted, actual):
    """Return the root-mean-squared error between two pandas objects.

    The inputs are subtracted element-wise (pandas aligns them on their index),
    squared, and averaged over the non-missing entries: ``sum()`` skips NaN and
    ``count()`` counts only non-NaN values.
    """
    squared_error = (predicted - actual) ** 2
    return np.sqrt(squared_error.sum() / squared_error.count())

In [None]:
model_mean_RMSE = RMSE(df_train.salesMean, df_train.unit_sales)
model_mean_RMSE

In [None]:
# Save this in a dataframe
dfBangResults = pd.DataFrame(columns = ["Model", "Forecast", "RMSE"])
dfBangResults.head()

In [None]:
dfBangResults.loc[0,"Model"] = "Mean"
dfBangResults.loc[0,"Forecast"] = np.exp(model_mean_pred)
dfBangResults.loc[0,"RMSE"] = model_mean_RMSE
dfBangResults.head()


Linear Trend Model
Let us start by plotting a linear trend model between sales_unit_Log and time.
However to do linear regression, we need a numeric indicator for time period - Let us create that.

In [None]:
df_train.head()

In [None]:
df_train['date'] = df_train.index.to_timestamp()

In [None]:
df_train.head()

In [None]:
# Convert date in datetimedelta figure starting from zero
df_train["timeIndex"] = df_train.date - df_train.date.min()

In [None]:
df_train.head()

In [None]:
df_train.dtypes

In [None]:
# Convert the elapsed time since the first observation into (fractional) months.
# np.timedelta64(1, 'M') is rejected by recent pandas/numpy because a month is not
# a fixed-length unit, so divide by the average Gregorian month (365.2425 / 12
# days) instead; the next cell rounds the result to whole months.
df_train["timeIndex"] = df_train["timeIndex"] / pd.Timedelta(days=365.2425 / 12)

In [None]:
df_train.timeIndex.head()

In [None]:
# Round the number to 0
df_train["timeIndex"] = df_train["timeIndex"].round(0).astype(int)

In [None]:
df_train.tail()

In [None]:
## Fit a linear trend with ordinary least squares
# statsmodels imports; adfuller is not used in this cell but by the stationarity
# checks further down the notebook
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller

# Regress the log of monthly unit sales on the integer month counter
model_linear = smf.ols('sales_unit_Log ~ timeIndex', data = df_train).fit()

In [None]:
model_linear.summary()

In [None]:
## Parameters for y = mx + c equation
model_linear.params

In [None]:
# Intercept of the fitted line (first fitted parameter). Selected by position with
# .iloc because params is label-indexed, and integer keys on such a Series are
# deprecated in recent pandas.
c = model_linear.params.iloc[0]
c

In [None]:
# Slope of the fitted line (second fitted parameter). Selected by position with
# .iloc because params is label-indexed, and integer keys on such a Series are
# deprecated in recent pandas.
m = model_linear.params.iloc[1]
m

In [None]:
model_linear_pred = model_linear.predict()

In [None]:
model_linear_pred

In [None]:
# Plot the prediction line
df_train.plot(kind="line", x="timeIndex", y = "sales_unit_Log")
plt.plot(df_train.timeIndex,model_linear_pred, '-')

In [None]:
model_linear.resid.plot(kind = "bar")

In [None]:
df_train["salesLinear"] = np.exp(model_linear_pred)

In [None]:
df_train.head()

In [None]:
# Root Mean Squared Error (RMSE)
model_linear_RMSE = RMSE(df_train.salesLinear, df_train.unit_sales)
model_linear_RMSE

In [None]:
# Manual one-step-ahead forecast (in log space) from y = m*x + c.
# Evaluate the line at the first month after the training window instead of a
# hard-coded index, so the forecast follows the actual length of the series.
next_time_index = df_train.timeIndex.max() + 1
model_linear_forecast_manual = m * next_time_index + c
model_linear_forecast_manual

In [None]:
dfBangResults.loc[1,"Model"] = "Linear"
dfBangResults.loc[1,"Forecast"] = np.exp(model_linear_forecast_manual)
dfBangResults.loc[1,"RMSE"] = model_linear_RMSE
dfBangResults.head()

In [None]:
df_train.plot(kind="line", x="timeIndex", y = ["unit_sales", "salesMean", "salesLinear"])

Random Walk Model
When faced with a time series that shows irregular growth, the best strategy may not be to try to directly predict the level of the series at each period (i.e., the quantity Yt). Instead, it may be better to try to predict the change that occurs from one period to the next (i.e., the quantity Yt - Yt-1). That is, it may be better to look at the first difference of the series, to see if a predictable pattern can be found there. For purposes of one-period-ahead forecasting, it is just as good to predict the next change as to predict the next level of the series, since the predicted change can be added to the current level to yield a predicted level. The simplest case of such a model is one that always predicts that the next change will be zero, as if the series is equally likely to go up or down in the next period regardless of what it has done in the past.
Random Walk Model $$ \hat{Y_t} = Y_{t-1} + \epsilon \\$$
Random Walk Model with drift $$ \hat{Y_t} = Y_{t-1} + c + \epsilon \\$$

In [None]:
df_train["priceModLogShift1"] = df_train.sales_unit_Log.shift()

In [None]:
df_train.head()

In [None]:
df_train.plot(kind= "scatter", y = "sales_unit_Log", x = "priceModLogShift1", s = 50)

In [None]:
# Lets plot the one-month difference curve
df_train["priceModLogDiff"] = df_train.sales_unit_Log - df_train.priceModLogShift1

In [None]:
df_train.priceModLogDiff.plot()

In [None]:
df_train["priceRandom"] = np.exp(df_train.priceModLogShift1)
df_train.head()

In [None]:
df_train.plot(kind="line", x="timeIndex", y = ["unit_sales","priceRandom"])

In [None]:
# Root Mean Squared Error (RMSE)
model_random_RMSE = RMSE(df_train.priceRandom, df_train.unit_sales)
model_random_RMSE

In [None]:
dfBangResults.loc[2,"Model"] = "Random"
# A random walk forecasts the next period as the last observed value, so take the
# final row of the log series itself (by position, via .iloc) rather than the
# last row of its one-step-lagged copy, and map it back with exp().
dfBangResults.loc[2,"Forecast"] = np.exp(df_train.sales_unit_Log.iloc[-1])
dfBangResults.loc[2,"RMSE"] = model_random_RMSE
dfBangResults.head()

Advanced Model
Most of the time series models work on the assumption that the time series is stationary. Intuitively, we can see that if a time series has a particular behaviour over time, there is a very high probability that it will follow the same in the future. Also, the theories related to stationary series are more mature and easier to implement as compared to non-stationary series
Statistical stationarity: A stationary time series is one whose statistical properties such as mean, variance, autocorrelation, etc. are all constant over time. Most statistical forecasting methods are based on the assumption that the time series can be rendered approximately stationary (i.e., "stationarized") through the use of mathematical transformations. A stationarized series is relatively easy to predict: you simply predict that its statistical properties will be the same in the future as they have been in the past!
There are three basic criteria for a series to be classified as a stationary series:
The mean of the series should not be a function of time rather should be a constant.
The variance of the series should not be a function of time. This property is known as homoscedasticity.
The covariance of the i th term and the (i + m) th term should not be a function of time.

How do we check for Stationarity in a series?
Plotting Rolling Statistics: We can plot the moving average or moving variance and see if it varies with time. By moving average/variance I mean that at any instant ‘t’, we’ll take the average/variance of the last year, i.e. last 12 months. But again this is more of a visual technique.
Dickey-Fuller Test: This is one of the statistical tests for checking stationarity. Here the null hypothesis is that the time series is non-stationary. The test results comprise a Test Statistic and some Critical Values for different confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary.

In [None]:
def adf(ts, window=12):
    """Plot rolling statistics of a series and run the augmented Dickey-Fuller test.

    Parameters
    ----------
    ts : pandas.Series
        Series to check for stationarity. It is passed to ``adfuller`` as is.
    window : int, default 12
        Number of observations in the rolling mean / rolling standard deviation.

    Returns
    -------
    pandas.Series
        Test statistic, p-value, number of lags used and number of observations
        used, followed by one ``'Critical Value (<level>)'`` entry per critical
        value reported by ``adfuller``.
    """
    # Rolling statistics. pd.rolling_mean / pd.rolling_std were removed from
    # pandas; Series.rolling(...) is the replacement.
    rolmean = ts.rolling(window=window).mean()
    rolstd = ts.rolling(window=window).std()

    # Plot the series together with its rolling statistics
    plt.plot(ts, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Run the test; the first four entries of the result are scalars and the
    # fifth is a dict of critical values keyed by significance level.
    adftest = adfuller(ts, autolag='AIC')
    adfoutput = pd.Series(adftest[0:4], index=['Test Statistic','p-value','# of Lags Used',
                                              'Number of Observations Used'])
    for key, value in adftest[4].items():
        adfoutput['Critical Value (%s)' % key] = value
    return adfoutput

In [None]:
# Smooth the log series with a 12-month moving average.
# pd.rolling_mean was removed from pandas; Series.rolling(...).mean() is the
# replacement and yields NaN for the first 11 rows, as before.
df_train['priceModLogMA12'] = df_train.sales_unit_Log.rolling(window=12).mean()

In [None]:
df_train.plot(kind ="line", y=["priceModLogMA12", "sales_unit_Log"])

In [None]:
df_train["priceMA12"] = np.exp(df_train.priceModLogMA12)
df_train.tail()

In [None]:
model_MA12_forecast = df_train.sales_unit_Log.tail(12).mean()

In [None]:
# Root Mean Squared Error (RMSE)
model_MA12_RMSE = RMSE(df_train.priceMA12, df_train.unit_sales)
model_MA12_RMSE

In [None]:
dfBangResults.loc[3,"Model"] = "Moving Average 12"
dfBangResults.loc[3,"Forecast"] = np.exp(model_MA12_forecast)
dfBangResults.loc[3,"RMSE"] = model_MA12_RMSE
dfBangResults.head()

In [None]:
df_train.plot(kind="line", x="timeIndex", y = ["unit_sales", "salesMean", "salesLinear",
                                             "priceRandom", "priceMA12"])


Eliminating Trend and Seasonality
Differencing – taking the difference with a particular time lag
Decomposition – modeling both trend and seasonality and removing them from the model.
Differencing
One of the most common methods of dealing with both trend and seasonality is differencing. In this technique, we take the difference of the observation at a particular instant with that at the previous instant. This mostly works well in improving stationarity. We have already done first order difference earlier

In [None]:
df_train.priceModLogDiff.plot()

In [None]:
# Test the first-differenced log series for stationarity.
# dropna() returns a new NaN-free Series, so the column inside df_train is not
# modified in place.
ts = df_train.priceModLogDiff.dropna()
adfuller(ts)


Time Series Decomposition
We can also decompose the time series into trend and seasonality

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Switch the monthly PeriodIndex to timestamps for the decomposition below.
# Index.to_datetime() no longer exists in pandas; PeriodIndex.to_timestamp() is
# the supported conversion. The guard makes re-running this cell harmless.
if isinstance(df_train.index, pd.PeriodIndex):
    df_train.index = df_train.index.to_timestamp()
df_train.head()

In [None]:
decomposition = seasonal_decompose(df_train.sales_unit_Log, model = "additive")
decomposition.plot()

In [None]:
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

In [None]:
df_train["priceDecomp"] = np.exp(trend + seasonal)

In [None]:
# Root Mean Squared Error (RMSE)
model_Decomp_RMSE = RMSE(df_train.priceDecomp, df_train.unit_sales)
model_Decomp_RMSE

In [None]:
df_train.plot(kind="line", x="timeIndex", y = ["unit_sales", "salesMean", "salesLinear", "priceRandom",
                                             "priceMA12",  "priceDecomp"])

In [None]:
df_train.plot(kind="line", x="timeIndex", y = ["unit_sales",
                                              "priceDecomp"])

In [None]:
# Test the decomposition residual for stationarity.
# dropna() returns a new NaN-free Series, leaving decomposition.resid untouched.
ts = decomposition.resid.dropna()
adfuller(ts)

In [None]:
# Log series and its NaN-free first difference, used by the ACF/PACF and ARIMA cells.
ts = df_train.sales_unit_Log
# Non-inplace dropna: the column inside df_train keeps its leading NaN.
ts_diff = df_train.priceModLogDiff.dropna()

In [None]:
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf

In [None]:
lag_acf = acf(ts_diff, nlags=20)
lag_acf

In [None]:
ACF = pd.Series(lag_acf)

In [None]:
ACF.plot(kind = "bar")

In [None]:
lag_pacf = pacf(ts_diff, nlags=20, method='ols')

In [None]:
PACF = pd.Series(lag_pacf)
PACF.plot(kind = "bar")

Running the ARIMA Model

In [None]:
from statsmodels.tsa.arima_model import ARIMA

In [None]:
ts_diff.head()

In [None]:
# Running the ARIMA Model(1,0,1)
model_AR1MA = ARIMA(ts_diff, order=(1,0,1))

In [None]:
results_ARIMA = model_AR1MA.fit(disp = -1)

In [None]:
results_ARIMA.fittedvalues.head()

In [None]:
ts_diff.plot()
results_ARIMA.fittedvalues.plot()

In [None]:
ts_diff.sum()

In [None]:
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_diff.tail()

In [None]:
predictions_ARIMA_diff.sum()

In [None]:
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_cumsum.tail()

In [None]:
# First observation of the log series, looked up by position.
# Series.ix was removed in pandas 1.0; .iloc is the positional accessor.
ts.iloc[0]

In [None]:
# Rebuild the log-level series: start every period at the first observed log value,
# then add the cumulative sum of the fitted differences (aligned on the index;
# periods without a fitted difference contribute 0).
# Series.ix was removed in pandas 1.0; .iloc[0] is the positional equivalent.
predictions_ARIMA_log = pd.Series(ts.iloc[0], index=ts.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.tail()

In [None]:
df_train['priceARIMA'] = np.exp(predictions_ARIMA_log)

In [None]:
df_train.plot(kind="line", x="timeIndex", y = ["unit_sales", "priceARIMA"])