# Load libraries

In [1]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# Load CSV Files

In [2]:
os.listdir() # Files in working directory

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'HP_LSTM_first_try.ipynb',
 'raw_data',
 'README.md']

In [3]:
os.listdir("raw_data")

['items.csv',
 'item_categories.csv',
 'sales_train.csv',
 'sample_submission.csv',
 'shops.csv',
 'test.csv']

In [4]:
items = pd.read_csv("raw_data/items.csv")
item_categories = pd.read_csv("raw_data/item_categories.csv")
sales = pd.read_csv("raw_data/sales_train.csv")
shops = pd.read_csv("raw_data/shops.csv")
test = pd.read_csv("raw_data/test.csv")
sample_submission = pd.read_csv("raw_data/sample_submission.csv")

# Preparing the dataset

## Define months, years and month-year from the date field

In [5]:
sales["date"] = sales["date"].apply(lambda x: x.replace(".", "-"))
sales["date"] = pd.to_datetime(sales["date"], dayfirst=True)
sales['year'] = pd.DatetimeIndex(sales['date']).year
sales['month'] = pd.DatetimeIndex(sales['date']).month
sales['month_year'] = sales["date"].dt.to_period('M')

## Grouping the sales by month 

In [6]:
sales = sales.groupby(["shop_id", "item_id", "year", "month", "month_year"]).agg(
    mean_item_price = pd.NamedAgg(column="item_price", aggfunc=np.mean),
    sd_item_price = pd.NamedAgg(column="item_price", aggfunc=np.std),
    item_cnt_month = pd.NamedAgg(column="item_cnt_day", aggfunc=np.sum)).reset_index()
sales

Unnamed: 0,shop_id,item_id,year,month,month_year,mean_item_price,sd_item_price,item_cnt_month
0,0,30,2013,2,2013-02,265.0,0.0,31.0
1,0,31,2013,2,2013-02,434.0,0.0,11.0
2,0,32,2013,1,2013-01,221.0,0.0,6.0
3,0,32,2013,2,2013-02,221.0,0.0,10.0
4,0,33,2013,1,2013-01,347.0,0.0,3.0
...,...,...,...,...,...,...,...,...
1609119,59,22164,2015,4,2015-04,699.0,0.0,2.0
1609120,59,22164,2015,7,2015-07,699.0,,1.0
1609121,59,22167,2013,10,2013-10,299.0,,1.0
1609122,59,22167,2013,12,2013-12,299.0,0.0,2.0


## Select shops and items with entries for each possible month in the dataset 

In [7]:
count_entries = sales.groupby(["shop_id", "item_id"]).count().sort_values("item_cnt_month", ascending=False)["item_cnt_month"].reset_index()
count_entries

Unnamed: 0,shop_id,item_id,item_cnt_month
0,52,1905,34
1,38,3076,34
2,52,5822,34
3,26,5822,34
4,31,32,34
...,...,...,...
424119,42,11768,1
424120,42,11769,1
424121,42,11770,1
424122,42,11771,1


In [8]:
full_entries = count_entries[["shop_id","item_id"]][count_entries["item_cnt_month"]==34]
full_entries = full_entries.merge(sales, on= ["shop_id", "item_id"], how="left")
full_entries

Unnamed: 0,shop_id,item_id,year,month,month_year,mean_item_price,sd_item_price,item_cnt_month
0,52,1905,2013,1,2013-01,249.0,0.000000,8.0
1,52,1905,2013,2,2013-02,249.0,0.000000,4.0
2,52,1905,2013,3,2013-03,249.0,0.000000,2.0
3,52,1905,2013,4,2013-04,249.0,0.000000,10.0
4,52,1905,2013,5,2013-05,249.0,0.000000,7.0
...,...,...,...,...,...,...,...,...
3463,28,13881,2015,6,2015-06,599.0,,1.0
3464,28,13881,2015,7,2015-07,599.0,0.000000,3.0
3465,28,13881,2015,8,2015-08,649.0,24.494897,6.0
3466,28,13881,2015,9,2015-09,659.0,0.000000,3.0


## Defining a LSTM model for each shop-item pair

For each shop-item pair with entries for every month, a dictionary stores the sequence corresponding to that shop-item pair, storing also a model of stacked LSTMs (50 units) taking a sequence of 3 time steps

### Defining a function that splits the initial sequence in 3 timesteps
Retrieves an X array of sequences of 3 timesteps and an Y array of the corresponding output for each 3 step sequence

In [9]:
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence)-1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

### Training the LSTMs for each shop-item pair
The last item of each sequence is excluded on the training, once these are the values to be predicted

In [10]:
full_entries_shops = np.unique(full_entries["shop_id"])

In [11]:
models={}
n_steps = 3
n_features = 1
for shop in full_entries_shops:
    models[shop]={}
    items = np.unique(full_entries["item_id"][full_entries["shop_id"]==shop])
    for item in items:
        models[shop][item]={}
        seq = full_entries["item_cnt_month"][(full_entries["shop_id"]==shop) & (full_entries["item_id"]==item)].values
        models[shop][item]["sequence"]=seq
        X, y = split_sequence(seq[:-1], n_steps)
        X = X.reshape((X.shape[0], X.shape[1], n_features))
        model = Sequential()
        model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(n_steps, n_features)))
        model.add(LSTM(50, activation='relu'))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        model.fit(X, y, epochs=200, verbose=0)
        models[shop][item]["model"]= model
        

# Evaluating the models
Takes the last entrie of a sequence as the real value to predict. Takes the the last 3 timesteps of each sequence as the input of the LSTM model used in prediction

In [12]:
real=[]
pred=[]
for shop in models.keys():
    for item in models[shop].keys():
        real.append(models[shop][item]["sequence"][-1])
        pred.append(models[shop][item]["model"].predict(models[shop][item]["sequence"][-n_steps:].reshape((1,n_steps,1))))



In [13]:
real = np.array(real)

In [14]:
pred = np.round(np.vstack(pred).reshape(-1),0)

## Computing RMSE

In [15]:
np.sqrt(np.mean((real-pred)**2))

208.24012817032204

In [16]:
real

array([3.000e+00, 1.200e+01, 6.000e+00, 1.800e+01, 1.000e+00, 1.000e+00,
       7.000e+00, 6.000e+00, 2.000e+00, 3.100e+01, 3.300e+01, 1.230e+02,
       2.253e+03, 2.000e+00, 8.000e+00, 6.000e+00, 5.000e+00, 8.000e+00,
       5.000e+00, 3.000e+00, 1.000e+00, 9.000e+00, 4.000e+00, 6.000e+00,
       6.000e+00, 3.000e+00, 1.400e+01, 2.000e+00, 1.000e+01, 8.000e+00,
       2.000e+00, 3.000e+00, 8.000e+00, 9.000e+00, 1.800e+01, 7.000e+00,
       1.000e+00, 5.000e+00, 3.000e+00, 1.000e+00, 3.000e+00, 7.000e+00,
       1.400e+01, 1.100e+01, 3.000e+00, 8.000e+00, 3.000e+00, 1.000e+00,
       3.000e+00, 4.000e+00, 4.000e+00, 2.000e+00, 2.000e+00, 3.000e+00,
       8.000e+00, 3.000e+00, 4.000e+00, 3.000e+00, 5.000e+00, 1.000e+00,
       9.000e+00, 4.000e+00, 1.000e+00, 2.000e+00, 7.000e+00, 1.400e+01,
       3.400e+01, 1.100e+01, 1.000e+00, 6.000e+00, 3.000e+00, 1.300e+01,
       2.000e+00, 4.000e+00, 4.000e+00, 6.000e+00, 2.000e+00, 5.000e+00,
       2.000e+00, 4.000e+00, 2.000e+00, 1.000e+00, 

In [17]:
pred

array([  5.,  15.,   6.,  18.,   1.,   6.,   7.,   6.,   3.,  85., 180.,
       140., 156.,   4.,   6.,   5.,   2.,  26.,   6.,   5.,   2.,   7.,
         6.,   6.,  12.,   4.,  12.,   3.,   8.,   6.,   4.,   4.,   7.,
         4.,  20.,   7.,   3.,   6.,   3.,   3.,   4.,   5.,  12.,  15.,
         3.,   3.,   5.,   5.,   5.,   8.,   8.,   4.,   3.,   6.,  10.,
         4.,   4.,   4.,   3.,   3.,   7.,   7.,   3.,  10.,   5.,  10.,
        32.,  12.,   1.,   3.,   5.,   7.,   2.,   4.,   4.,   6.,   3.,
         4.,   3.,   6.,   4.,   3.,  10.,   4.,  22.,   3.,   4.,   2.,
         7.,  14.,   6.,  10.,   7.,   5.,   6.,   6.,   5.,   3.,   5.,
         6.,   2.,   3.], dtype=float32)