# 1. Loading Libraries

In [1]:
import os

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.font_manager import FontProperties
from pandas import datetime
from sklearn import preprocessing
%matplotlib inline

# 2. Loading Data

In [2]:
# %% Data file locations (relative to the notebook's working directory)
filename_train, filename_test = "train.csv", "test.csv"
filename_feature, filename_store = "features.csv", "stores.csv"

In [4]:
# %% Load data
# Read every CSV through the filename_* variables from the path-setup cell so the
# file locations are defined in exactly one place instead of being repeated here.
df_store = pd.read_csv(filename_store)
df_feature = pd.read_csv(filename_feature)
df_train = pd.read_csv(filename_train)
df_test = pd.read_csv(filename_test)

# 3. Explore Data

### 3.1 Store Data

In [5]:
# Preview the first and last five rows, then the shape and per-column null counts.
# DataFrame.append was removed in pandas 2.0, so the preview is built with pd.concat.
print(pd.concat([df_store.head(), df_store.tail()]),"\n")
print("Structure of Store:\n",df_store.shape, "\n")
print("Number of missing values:\n",df_store.isnull().sum().sort_values(ascending=False),"\n")

    Store Type    Size
0       1    A  151315
1       2    A  202307
2       3    B   37392
3       4    A  205863
4       5    B   34875
40     41    A  196321
41     42    C   39690
42     43    C   41062
43     44    C   39910
44     45    B  118221 

Structure of Store:
 (45, 3) 

Number of missing values:
 Size     0
Type     0
Store    0
dtype: int64 



### 3.2 Feature Data

In [6]:
# Preview the first and last five rows, then shape, summary statistics and null counts.
# DataFrame.append was removed in pandas 2.0, so the preview is built with pd.concat.
print(pd.concat([df_feature.head(), df_feature.tail()]),"\n")
print("Structure of Feature: ",df_feature.shape,"\n")
print("Summary Statistic:\n",df_feature.describe(),"\n")
print("Count of missing values:\n",df_feature.isnull().sum().sort_values(ascending=False),"\n")

      Store        Date  Temperature  Fuel_Price  MarkDown1  MarkDown2  \
0         1  05-02-2010        42.31       2.572        NaN        NaN   
1         1  12-02-2010        38.51       2.548        NaN        NaN   
2         1  19-02-2010        39.93       2.514        NaN        NaN   
3         1  26-02-2010        46.63       2.561        NaN        NaN   
4         1  05-03-2010        46.50       2.625        NaN        NaN   
8185     45  28-06-2013        76.05       3.639    4842.29     975.03   
8186     45  05-07-2013        77.50       3.614    9090.48    2268.58   
8187     45  12-07-2013        79.37       3.614    3789.94    1827.31   
8188     45  19-07-2013        82.84       3.737    2961.49    1047.07   
8189     45  26-07-2013        76.06       3.804     212.02     851.73   

      MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment  IsHoliday  
0           NaN        NaN        NaN  211.096358         8.106      False  
1           NaN        NaN     

In [7]:
# %% Summary Table
# Percentage of missing values per column, shown next to each column's dtype.
feature_percent_missing = df_feature.isnull().sum().mul(100).div(len(df_feature))
feature_data_type = df_feature.dtypes

feature_summary = pd.DataFrame(
    dict(
        Percent_missing=feature_percent_missing.round(2),
        Datatypes=feature_data_type,
    )
)

feature_summary

Unnamed: 0,Percent_missing,Datatypes
Store,0.0,int64
Date,0.0,object
Temperature,0.0,float64
Fuel_Price,0.0,float64
MarkDown1,50.77,float64
MarkDown2,64.33,float64
MarkDown3,55.89,float64
MarkDown4,57.7,float64
MarkDown5,50.55,float64
CPI,7.14,float64


### 3.3 Train Data

In [8]:
# Preview the first and last five rows, then the shape and summary statistics.
# DataFrame.append was removed in pandas 2.0, so the preview is built with pd.concat.
print(pd.concat([df_train.head(), df_train.tail()]),"\n")
print("Structure of train:\n",df_train.shape,"\n")
print("Summary Statistic:\n",df_train.describe(),"\n")

        Store  Dept        Date  Weekly_Sales  IsHoliday
0           1     1  05-02-2010      24924.50      False
1           1     1  12-02-2010      46039.49       True
2           1     1  19-02-2010      41595.55      False
3           1     1  26-02-2010      19403.54      False
4           1     1  05-03-2010      21827.90      False
421565     45    98  28-09-2012        508.37      False
421566     45    98  05-10-2012        628.10      False
421567     45    98  12-10-2012       1061.02      False
421568     45    98  19-10-2012        760.01      False
421569     45    98  26-10-2012       1076.80      False 

Structure of train:
 (421570, 5) 

Summary Statistic:
                Store           Dept   Weekly_Sales
count  421570.000000  421570.000000  421570.000000
mean       22.200546      44.260317   15981.258123
std        12.785297      30.492054   22711.183519
min         1.000000       1.000000   -4988.940000
25%        11.000000      18.000000    2079.650000
50%       

In [9]:
# %% Summary Table
# Percentage of missing values per column, shown next to each column's dtype.
train_percent_missing = df_train.isnull().sum().mul(100).div(len(df_train))
train_datatype = df_train.dtypes

train_summary = pd.DataFrame(
    dict(
        Percent_Missing=train_percent_missing.round(2),
        Datatypes=train_datatype,
    )
)

train_summary

Unnamed: 0,Percent_Missing,Datatypes
Store,0.0,int64
Dept,0.0,int64
Date,0.0,object
Weekly_Sales,0.0,float64
IsHoliday,0.0,bool


### 3.4 Test Data

In [10]:
# Preview the first and last five rows, then the shape and summary statistics.
# DataFrame.append was removed in pandas 2.0, so the preview is built with pd.concat.
print(pd.concat([df_test.head(), df_test.tail()]),"\n")
print("Structure of test:\n",df_test.shape,"\n")
print("Summary Statistic:\n",df_test.describe(),"\n")

        Store  Dept        Date  IsHoliday
0           1     1  02-11-2012      False
1           1     1  09-11-2012      False
2           1     1  16-11-2012      False
3           1     1  23-11-2012       True
4           1     1  30-11-2012      False
115059     45    98  28-06-2013      False
115060     45    98  05-07-2013      False
115061     45    98  12-07-2013      False
115062     45    98  19-07-2013      False
115063     45    98  26-07-2013      False 

Structure of test:
 (115064, 4) 

Summary Statistic:
                Store           Dept
count  115064.000000  115064.000000
mean       22.238207      44.339524
std        12.809930      30.656410
min         1.000000       1.000000
25%        11.000000      18.000000
50%        22.000000      37.000000
75%        33.000000      74.000000
max        45.000000      99.000000 



In [11]:
# Summary table: each column's dtype next to its percentage of missing values.
test_percent_missing = df_test.isnull().sum().mul(100).div(len(df_test))
test_datatypes = df_test.dtypes

test_summary = pd.DataFrame(
    dict(
        Datatypes=test_datatypes,
        Percent_Missing=test_percent_missing.round(2),
    )
)

test_summary

Unnamed: 0,Datatypes,Percent_Missing
Store,int64,0.0
Dept,int64,0.0
Date,object,0.0
IsHoliday,bool,0.0


In [13]:
# Parse the day-month-year 'Date' strings into datetime64 in every table that has
# a Date column, so the tables can later be joined and compared on it.
for frame in (df_feature, df_train, df_test):
    frame['Date'] = pd.to_datetime(frame['Date'], format="%d-%m-%Y")

# 4. Joining Tables

In [14]:
# Attach each store's Type and Size to every train/test row (left join on Store).
combined_train = df_train.merge(df_store, how="left", on="Store")
combined_test = df_test.merge(df_store, how="left", on="Store")

# Show the first rows and the resulting shape of both merged tables.
for merged in (combined_train, combined_test):
    print(merged.head(),"\n", merged.shape,"\n")

   Store  Dept       Date  Weekly_Sales  IsHoliday Type    Size
0      1     1 2010-02-05      24924.50      False    A  151315
1      1     1 2010-02-12      46039.49       True    A  151315
2      1     1 2010-02-19      41595.55      False    A  151315
3      1     1 2010-02-26      19403.54      False    A  151315
4      1     1 2010-03-05      21827.90      False    A  151315 
 (421570, 7) 

   Store  Dept       Date  IsHoliday Type    Size
0      1     1 2012-11-02      False    A  151315
1      1     1 2012-11-09      False    A  151315
2      1     1 2012-11-16      False    A  151315
3      1     1 2012-11-23       True    A  151315
4      1     1 2012-11-30      False    A  151315 
 (115064, 6) 



In [15]:
# Inner-join the per-store weekly features on (Store, Date). Both sides carry an
# IsHoliday column, so pandas suffixes them as IsHoliday_x / IsHoliday_y.
join_keys = ["Store","Date"]
combined_train = combined_train.merge(df_feature, how="inner", on=join_keys)
combined_test = combined_test.merge(df_feature, how="inner", on=join_keys)

print(combined_train.head())
print(combined_test.head())

   Store  Dept       Date  Weekly_Sales  IsHoliday_x Type    Size  \
0      1     1 2010-02-05      24924.50        False    A  151315   
1      1     2 2010-02-05      50605.27        False    A  151315   
2      1     3 2010-02-05      13740.12        False    A  151315   
3      1     4 2010-02-05      39954.04        False    A  151315   
4      1     5 2010-02-05      32229.38        False    A  151315   

   Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  \
0        42.31       2.572        NaN        NaN        NaN        NaN   
1        42.31       2.572        NaN        NaN        NaN        NaN   
2        42.31       2.572        NaN        NaN        NaN        NaN   
3        42.31       2.572        NaN        NaN        NaN        NaN   
4        42.31       2.572        NaN        NaN        NaN        NaN   

   MarkDown5         CPI  Unemployment  IsHoliday_y  
0        NaN  211.096358         8.106        False  
1        NaN  211.096358        

In [16]:
# Remove the duplicate holiday flag that came in from the feature table;
# IsHoliday_x (from the train/test tables) is the copy that is kept.
duplicate_flag = ["IsHoliday_y"]
combined_train = combined_train.drop(columns=duplicate_flag)
combined_test = combined_test.drop(columns=duplicate_flag)

print(combined_train.head())
print(combined_test.head())

   Store  Dept       Date  Weekly_Sales  IsHoliday_x Type    Size  \
0      1     1 2010-02-05      24924.50        False    A  151315   
1      1     2 2010-02-05      50605.27        False    A  151315   
2      1     3 2010-02-05      13740.12        False    A  151315   
3      1     4 2010-02-05      39954.04        False    A  151315   
4      1     5 2010-02-05      32229.38        False    A  151315   

   Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  \
0        42.31       2.572        NaN        NaN        NaN        NaN   
1        42.31       2.572        NaN        NaN        NaN        NaN   
2        42.31       2.572        NaN        NaN        NaN        NaN   
3        42.31       2.572        NaN        NaN        NaN        NaN   
4        42.31       2.572        NaN        NaN        NaN        NaN   

   MarkDown5         CPI  Unemployment  
0        NaN  211.096358         8.106  
1        NaN  211.096358         8.106  
2        NaN  211

In [17]:
# Summary statistics for the numeric columns of the merged training table
combined_train.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
count,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,150681.0,111248.0,137091.0,134967.0,151432.0,421570.0,421570.0
mean,22.200546,44.260317,15981.258123,136727.915739,60.090059,3.361027,7246.420196,3334.628621,1439.421384,3383.168256,4628.975079,171.201947,7.960289
std,12.785297,30.492054,22711.183519,60980.583328,18.447931,0.458515,8291.221345,9475.357325,9623.07829,6292.384031,5962.887455,39.159276,1.863296
min,1.0,1.0,-4988.94,34875.0,-2.06,2.472,0.27,-265.76,-29.1,0.22,135.16,126.064,3.879
25%,11.0,18.0,2079.65,93638.0,46.68,2.933,2240.27,41.6,5.08,504.22,1878.44,132.022667,6.891
50%,22.0,37.0,7612.03,140167.0,62.09,3.452,5347.45,192.0,24.6,1481.31,3359.45,182.31878,7.866
75%,33.0,74.0,20205.8525,202505.0,74.28,3.738,9210.9,1926.94,103.99,3595.04,5563.8,212.416993,8.572
max,45.0,99.0,693099.36,219622.0,100.14,4.468,88646.76,104519.54,141630.61,67474.85,108519.28,227.232807,14.313


**NOTE:** Weekly_Sales, Markdown2 & Markdown3 have negative values in the combined_train set, which need to be replaced by 0

In [18]:
# Summary statistics for the numeric columns of the merged test table
combined_test.describe()

Unnamed: 0,Store,Dept,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
count,115064.0,115064.0,115064.0,115064.0,115064.0,114915.0,86437.0,105235.0,102176.0,115064.0,76902.0,76902.0
mean,22.238207,44.339524,136497.688921,53.941804,3.581546,7689.216439,3734.051729,2403.088666,3356.219071,3922.681189,176.961347,6.868733
std,12.80993,30.65641,61106.926438,18.724153,0.239442,10698.760716,8323.495014,13767.939313,7570.501545,19445.150745,41.239967,1.583427
min,1.0,1.0,34875.0,-7.29,2.872,-2781.45,-35.74,-179.26,0.22,-185.17,131.236226,3.684
25%,11.0,18.0,93638.0,39.82,3.431,1966.46,180.35,15.1,155.46,1309.3,138.402033,5.771
50%,22.0,37.0,140167.0,54.47,3.606,4842.29,742.59,78.26,840.94,2390.43,192.304445,6.806
75%,33.0,74.0,202505.0,67.35,3.766,9439.14,2735.67,272.58,3096.92,4227.27,223.244532,8.036
max,45.0,99.0,219622.0,101.95,4.125,103184.98,71074.17,149483.31,65344.64,771448.1,228.976456,10.199


**NOTE:** Markdown1, Markdown2, Markdown3 & Markdown5 have negative values in the combined_test set, which need to be replaced by 0

# 5. Data pre-processing

### 5.1 Replace missing Values by 0

In [19]:
# Per-column count of missing values: test table first, then the training table.
for merged in (combined_test, combined_train):
    print(merged.isnull().sum())

Store               0
Dept                0
Date                0
IsHoliday_x         0
Type                0
Size                0
Temperature         0
Fuel_Price          0
MarkDown1         149
MarkDown2       28627
MarkDown3        9829
MarkDown4       12888
MarkDown5           0
CPI             38162
Unemployment    38162
dtype: int64
Store                0
Dept                 0
Date                 0
Weekly_Sales         0
IsHoliday_x          0
Type                 0
Size                 0
Temperature          0
Fuel_Price           0
MarkDown1       270889
MarkDown2       310322
MarkDown3       284479
MarkDown4       286603
MarkDown5       270138
CPI                  0
Unemployment         0
dtype: int64


In [20]:
# Zero-fill every missing value; fillna returns new frames, so combined_train and
# combined_test keep their original NaNs.
# NOTE(review): in the test table this also zero-fills CPI and Unemployment
# (38162 missing rows each) — confirm that 0 is an acceptable stand-in there.
processed_train, processed_test = (
    merged.fillna(0) for merged in (combined_train, combined_test)
)

### 5.2 Replace negative values (MarkDown columns, plus Weekly_Sales in train) by 0 in both processed_train and processed_test

In [21]:
# %% Processed_train
# Overwrite negative entries with 0.0 in the three training columns that the
# describe() output showed to contain negative values.
for col in ('Weekly_Sales', 'MarkDown2', 'MarkDown3'):
    processed_train.loc[processed_train[col] < 0.0, col] = 0.0
processed_train.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
count,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0
mean,22.200546,44.260317,15981.46725,136727.915739,60.090059,3.361027,2590.074819,880.070274,468.092929,1083.132268,1662.772385,171.201947,7.960289
std,12.785297,30.492054,22711.032446,60980.583328,18.447931,0.458515,6052.385934,5084.520381,5528.872994,3894.529945,4207.629321,39.159276,1.863296
min,1.0,1.0,0.0,34875.0,-2.06,2.472,0.0,0.0,0.0,0.0,0.0,126.064,3.879
25%,11.0,18.0,2079.65,93638.0,46.68,2.933,0.0,0.0,0.0,0.0,0.0,132.022667,6.891
50%,22.0,37.0,7612.03,140167.0,62.09,3.452,0.0,0.0,0.0,0.0,0.0,182.31878,7.866
75%,33.0,74.0,20205.8525,202505.0,74.28,3.738,2809.05,2.2,4.54,425.29,2168.04,212.416993,8.572
max,45.0,99.0,693099.36,219622.0,100.14,4.468,88646.76,104519.54,141630.61,67474.85,108519.28,227.232807,14.313


In [22]:
# Processed_test
# Overwrite negative entries with 0.0 in the four test columns that the
# describe() output showed to contain negative values.
for col in ('MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown5'):
    processed_test.loc[processed_test[col] < 0.0, col] = 0.0
processed_test.describe()

Unnamed: 0,Store,Dept,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
count,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0
mean,22.238207,44.339524,136497.688921,53.941804,3.581546,7681.106189,2805.085103,2198.021241,2980.298267,3922.80606,118.270541,4.590657
std,12.80993,30.65641,61106.926438,18.724153,0.239442,10693.903634,7392.542823,13183.852299,7212.030921,19445.125049,89.878361,3.483338
min,1.0,1.0,34875.0,-7.29,2.872,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.0,18.0,93638.0,39.82,3.431,1963.55,0.0,7.77,70.37,1309.3,0.0,0.0
50%,22.0,37.0,140167.0,54.47,3.606,4828.72,316.88,60.0,600.58,2390.43,138.402033,5.771
75%,33.0,74.0,202505.0,67.35,3.766,9427.41,1575.85,244.2,2627.85,4227.27,201.21223,7.293
max,45.0,99.0,219622.0,101.95,4.125,103184.98,71074.17,149483.31,65344.64,771448.1,228.976456,10.199


### 5.3 Label-encode categorical and boolean data

In [23]:
# Check the datatype of every column in processed_train and processed_test.
# Both dtype listings are emitted by one print call, so they appear back to back.

print(processed_train.dtypes, processed_test.dtypes)

Store                    int64
Dept                     int64
Date            datetime64[ns]
Weekly_Sales           float64
IsHoliday_x               bool
Type                    object
Size                     int64
Temperature            float64
Fuel_Price             float64
MarkDown1              float64
MarkDown2              float64
MarkDown3              float64
MarkDown4              float64
MarkDown5              float64
CPI                    float64
Unemployment           float64
dtype: object Store                    int64
Dept                     int64
Date            datetime64[ns]
IsHoliday_x               bool
Type                    object
Size                     int64
Temperature            float64
Fuel_Price             float64
MarkDown1              float64
MarkDown2              float64
MarkDown3              float64
MarkDown4              float64
MarkDown5              float64
CPI                    float64
Unemployment           float64
dtype: object


In [24]:
# Non-numeric columns (bool holiday flag, string store type) to be integer-encoded
cat_col = ['IsHoliday_x','Type']

In [25]:
# Integer-encode each categorical column of the training frame in place.
# Values are cast to str first, so the boolean flag is encoded from the labels
# 'False' / 'True'.
for col in cat_col:
    raw_labels = processed_train[col].values.astype('str')
    lbl = preprocessing.LabelEncoder()
    processed_train[col] = lbl.fit_transform(raw_labels)

In [26]:
# Encode the test frame with the SAME label -> integer mapping as the training frame.
# The encoder is fitted on the raw training labels (combined_train still holds the
# un-encoded values) and applied to the raw test labels from combined_test. A
# category that is absent from the test split therefore cannot shift the codes, and
# re-running this cell gives the same result. A test label that never occurs in
# training raises ValueError instead of silently receiving an unrelated code.
for col in cat_col:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(combined_train[col].values.astype('str'))
    processed_test[col] = lbl.transform(combined_test[col].values.astype('str'))

In [27]:
# Persist both processed tables. The output folder is created first so the cell
# also works on a fresh checkout where Processed_data/ does not exist yet.
os.makedirs("Processed_data", exist_ok=True)
processed_train.to_csv("Processed_data/processed_train.csv", index=False)
processed_test.to_csv("Processed_data/processed_test.csv", index=False)

In [28]:
# First rows of the training table after zero-filling, clipping and label encoding
processed_train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,0,0,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
1,1,2,2010-02-05,50605.27,0,0,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
2,1,3,2010-02-05,13740.12,0,0,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
3,1,4,2010-02-05,39954.04,0,0,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
4,1,5,2010-02-05,32229.38,0,0,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106


In [29]:
# Reorder the columns so that the response ('Weekly_Sales') is the last one;
# the later X/y split relies on the target being the final column.
feature_cols = [
    'Store', 'Dept', 'Date', 'Unemployment', 'IsHoliday_x', 'Type', 'Size',
    'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI',
]
processed_train = processed_train[feature_cols + ['Weekly_Sales']]

In [31]:
# Save the re-ordered training table, overwriting the earlier export.
# The output folder is created first so the cell works even if it does not exist.
os.makedirs("Processed_data", exist_ok=True)
processed_train.to_csv("Processed_data/processed_train.csv", index=False)

# 6. Baseline Model

In [38]:
pip install keras
import keras
from sklearn.preprocessing import MinMaxScaler

SyntaxError: invalid syntax (<ipython-input-38-19dc88f14986>, line 1)

In [None]:
# %% Print the earliest and latest Date in processed_train; the cut-off dates for
# the train / validation / test split are chosen inside this range.
print(processed_train['Date'].min(), processed_train['Date'].max())

In [None]:
# %% Split train set into train-dev set
# pd.datetime was deprecated in pandas 1.0 and removed in 2.0; pd.Timestamp builds
# the same cut-off. Rows dated on or before the cut-off form the training set,
# later rows form the dev set.
split_date = pd.Timestamp(2012, 8, 24)
train_set = processed_train.loc[processed_train['Date'] <= split_date]
dev_set = processed_train.loc[processed_train['Date'] > split_date]

In [None]:
# %% Split dev set into validation and test set
# pd.datetime was deprecated in pandas 1.0 and removed in 2.0; pd.Timestamp builds
# the same cut-off. Dev rows dated on or before the cut-off form the validation
# set, later rows form the test set.
split_date_dev = pd.Timestamp(2012, 9, 25)
val_set = dev_set.loc[dev_set['Date'] <= split_date_dev]
test_set = dev_set.loc[dev_set['Date'] > split_date_dev]

In [None]:
# Move 'Date' from a column into the index of each split, so that only numeric
# columns remain as data columns.
train_set, val_set, test_set = (
    split.set_index('Date') for split in (train_set, val_set, test_set)
)

In [None]:
# Convert every split to a plain NumPy array containing all rows and columns.
train_set_array, val_set_array, test_set_array = (
    split.values for split in (train_set, val_set, test_set)
)

print("Shape of train, val and test array:\n",train_set_array.shape,"\n",val_set_array.shape,"\n",test_set_array.shape)

In [None]:
# Scaling
# Fit the scaler on the training split only and reuse its min/max for the
# validation and test splits. Re-fitting on each split leaks held-out statistics,
# puts the three splits on different scales, and would leave `sc` fitted to the
# test data by the time it is used for inverse_transform.
sc = MinMaxScaler(feature_range=(0,1))
train_set_scaled = sc.fit_transform(train_set_array)
val_set_scaled = sc.transform(val_set_array)
test_set_scaled = sc.transform(test_set_array)

print(train_set_scaled.shape, val_set_scaled.shape, test_set_scaled.shape)

In [None]:
# Split each scaled array into features (every column but the last) and target
# (the last column). The empty-list initialisations that used to precede these
# assignments were dead code: each name is bound directly here.
X_train, y_train = train_set_scaled[:,:-1], train_set_scaled[:,-1]
X_val, y_val = val_set_scaled[:,:-1], val_set_scaled[:,-1]
X_test, y_test = test_set_scaled[:,:-1], test_set_scaled[:,-1]

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

In [None]:
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features),
# the 3-D layout that is later fed to the LSTM layers.
X_train, X_val, X_test = (
    split.reshape((split.shape[0], 1, split.shape[1]))
    for split in (X_train, X_val, X_test)
)

In [None]:
# Confirm the 3-D shapes produced by the reshape step
print(X_train.shape, X_val.shape, X_test.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Flatten

In [None]:
# Initialising RNN: an empty Keras Sequential model; layers are appended to it
# one cell at a time below.
regressor = Sequential()

In [None]:
# Adding the first LSTM layer and some Dropout regularization
# (Dropout is added to reduce overfitting).
# The input shape is read from X_train as (timesteps, features) instead of
# hard-coding 14 features, so the model still builds if the feature set changes.
regressor.add(LSTM(units = 10, return_sequences = True, activation = 'relu',
                   input_shape = (X_train.shape[1], X_train.shape[2])))
regressor.add(Dropout(0.5))

In [None]:
# Adding a second LSTM layer (still returning the full sequence so another LSTM
# layer can follow) and Dropout regularization
regressor.add(LSTM(units = 10, return_sequences = True, activation = 'relu'))
regressor.add(Dropout(0.5))

In [None]:
# # adding a third LSTM layer and some dropout regularization
# regressor.add(LSTM(units = 10, return_sequences = True, activation = 'relu'))
# regressor.add(Dropout(0.7))

In [None]:
# Adding the last LSTM layer and Dropout regularization. With the optional layer
# in the previous cell commented out, this is the third LSTM layer of the model.
# return_sequences=False, so only the final output is passed on to the Dense layer.
regressor.add(LSTM(units = 10, return_sequences = False, activation = 'relu'))
regressor.add(Dropout(0.5))

In [None]:
# Adding the output layer: a single unit that predicts the scaled weekly sales.
# NOTE(review): the sigmoid bounds predictions to (0, 1), i.e. to the scaler's
# fitted target range — confirm that this cap is intended.
#regressor.add(Flatten())
regressor.add(Dense(units=1, activation = 'sigmoid'))

In [None]:
# Compiling the RNN: Adam optimiser with mean-squared-error loss.
# NOTE(review): 'accuracy' carries little meaning for a regression target; it is
# kept because the evaluation cells read the accuracy history entries.
compile_options = dict(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy'],
)
regressor.compile(**compile_options)

In [None]:
# Fitting the RNN to the training set; the validation split is evaluated after
# every epoch and the per-epoch metrics are kept in `history`.
history = regressor.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=512,
    verbose=1,
)

In [None]:
# Save Baseline Model

# regressor.save('models/model6_baseline.h5')

In [None]:
# To load the model

# regressor = load_model('models/model1_baseline.h5')

# 7 Predictions

In [None]:
# %% Predict the (scaled) weekly sales for the held-out test rows
predicted_sales = regressor.predict(X_test)

In [None]:
# Reshape X_test for inverse scaling: drop the length-1 middle axis so X_test is
# 2-D (rows, features) again. This rebinds X_test, so the cell is not meant to be
# run twice in a row.
X_test = X_test.reshape((X_test.shape[0], X_test.shape[2]))

In [None]:
# Both arrays must have the same number of rows before they are concatenated
print(X_test[:,:].shape, predicted_sales.shape)

In [None]:
# Rebuild rows in the column order the scaler was fitted on (features first,
# weekly sales last), undo the scaling, and keep only the sales column.
# The target is selected as the last column ([:, -1:]) rather than by the
# hard-coded position 14, so the cell keeps working if the feature count changes.
predicted_weekly_sales = np.concatenate((X_test, predicted_sales), axis=1)
predicted_weekly_sales = sc.inverse_transform(predicted_weekly_sales)
predicted_weekly_sales = predicted_weekly_sales[:, -1:]
predicted_weekly_sales

In [None]:
# Turn the target into a single-column 2-D array so it can be concatenated with X_test
y_test = y_test.reshape(-1, 1)

In [None]:
# Same procedure as for the predictions: rebuild full rows (features first,
# observed target last), undo the scaling, and keep only the sales column.
# The target is selected as the last column ([:, -1:]) rather than by the
# hard-coded position 14, so the cell keeps working if the feature count changes.
observed_weekly_sales = np.concatenate((X_test, y_test), axis=1)
observed_weekly_sales = sc.inverse_transform(observed_weekly_sales)
observed_weekly_sales = observed_weekly_sales[:, -1:]
observed_weekly_sales

In [None]:
# Observed and predicted sales should have identical shapes
print(observed_weekly_sales.shape, predicted_weekly_sales.shape)

In [None]:
# Overlay observed (red) and predicted (blue) weekly sales for the test rows.
fontP = FontProperties()
fontP.set_size('xx-large')

fig, ax = plt.subplots(figsize=(60,25))
ax.plot(observed_weekly_sales, color='red', label='Real weekly sales')
ax.plot(predicted_weekly_sales, color='blue', label='Predicted weekly sales')
ax.set_title('Walmart Weekly Sales', fontsize=40)
ax.set_xlabel('Time', fontsize=25)
ax.set_ylabel('Sales', fontsize=25)
ax.tick_params(axis='both', which='major', labelsize=20)
ax.legend(loc=0, ncol=1, bbox_to_anchor=(0, 0, 1, 1),
          prop=fontP, fancybox=True, shadow=False)

plt.show()

In [None]:
# obs_pred_df = pd.DataFrame({'Date': test_set['Date'],
#               'Observed Sales': observed_weekly_sales.reshape(observed_weekly_sales.shape[0]),
#               'Predicted Sales': predicted_weekly_sales.reshape(predicted_weekly_sales.shape[0])})

# obs_pred_df.head()

In [None]:
# obs_pred_df.set_index('Date', inplace=True)
# obs_pred_df.head()

In [None]:
# obs_pred_df.plot(figsize=(20,10), linewidth=5, fontsize=20)
# plt.xlabel('Year', fontsize=20)
# plt.show()

# 8 Model Evaluation

In [None]:
# Move 'Date' back from the index into a regular column; the evaluation frame
# below reads test_set['Date'].
test_set = test_set.reset_index()
test_set.head()

In [None]:
# Rows whose encoded holiday flag is 0 get weight 1; all other rows get weight 5.
weight = np.where(test_set['IsHoliday_x']==0, 1, 5)

observed_flat = observed_weekly_sales.ravel()
predicted_flat = predicted_weekly_sales.ravel()

df = pd.DataFrame({"Date": test_set['Date'],
                   "Weight": weight,
                   "Observed_Values": observed_flat,
                   "Predicted_Values": predicted_flat})

# Weighted absolute error of every row
absolute_error = (df['Observed_Values'] - df['Predicted_Values']).abs()
df['Derived'] = df['Weight'] * absolute_error

df.head()

In [None]:
# Weighted mean absolute error: total weighted absolute error divided by the
# total weight. Series.sum() is vectorised, unlike the builtin sum(), which
# iterates over the Series element by element in Python.
WMAE = df['Derived'].sum() / df['Weight'].sum()
WMAE

In [None]:
# results = regressor.evaluate(X_test, y_test)
# results

In [None]:
# history.history maps each logged metric name to its list of per-epoch values;
# listing the keys shows which names are available for plotting.
history_dict = history.history
history_dict.keys()

In [None]:
# Older Keras releases log the accuracy metric as 'acc' / 'val_acc', newer ones as
# 'accuracy' / 'val_accuracy'. Use whichever key is present instead of assuming
# 'acc', which raises KeyError on newer versions.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers start at 1 for the x-axis
epochs = range(1, len(loss)+1)

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Training (dots) versus validation (line) accuracy per epoch
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Drop the notebook's reference to the trained model
del regressor