## Load Packages

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
from tensorflow.keras.applications import vgg16
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2


## Load data from Postgres

In [5]:
# Set postgres credentials

load_dotenv()
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
db = os.getenv('DB')

In [6]:
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=user,
                        password=password,
                        ipaddress=host,
                        port=port,
                        dbname=db))
cnx = create_engine(postgres_str)

In [7]:
# List DB tables
conn = psycopg2.connect(postgres_str)
cursor = conn.cursor()
cursor.execute("""SELECT relname FROM pg_class WHERE relkind='r'
                  AND relname !~ '^(pg_|sql_)';""") # "rel" is short for relation.

tables_list = [i[0] for i in cursor.fetchall()] # A list() of tables.
tables_list #check available tables

['test',
 'shops',
 'sample_submission',
 'item_categories',
 'sales_train',
 'shops_en',
 'item_categories_en',
 'items_en',
 'items',
 'cleaning_store_id',
 'cleaning_item_category_id']

In [8]:
# Load datasets
shops_df = pd.read_sql_query('''SELECT * FROM shops_en;''', cnx)
item_categories_df = pd.read_sql_query('''SELECT * FROM item_categories_en;''', cnx)
test_df = pd.read_sql_query('''SELECT * FROM test;''', cnx)
sales_train_df = pd.read_sql_query('''SELECT * FROM sales_train;''', cnx)
items_df = pd.read_sql_query('''SELECT * FROM items_en;''', cnx)

In [9]:
items_df.info()
sales_train_df.isnull().sum() # check for NaN values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

## Preprocessing

In [10]:
# Merge with categories to get the Ids
sales_train_df = pd.merge(sales_train_df, items_df, left_on='item_id', right_on='item_id', how='left')

# Drop column that contais the item_name
sales_train_df.drop(labels='item_name', axis=1, inplace=True)

In [11]:
# Set date to YYYY/mm/dd
sales_train_df['date'] = pd.to_datetime(sales_train_df['date'], format='%d.%m.%Y')

# Drop duplicates
sales_train_df.drop_duplicates(inplace=True)

In [12]:
sales_train_df

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
0,2013-01-02,0,59,22154,999.00,1.0,37
1,2013-01-03,0,25,2552,899.00,1.0,58
2,2013-01-05,0,25,2552,899.00,-1.0,58
3,2013-01-06,0,25,2554,1709.05,1.0,58
4,2013-01-15,0,25,2555,1099.00,1.0,56
...,...,...,...,...,...,...,...
2935844,2015-10-10,33,25,7409,299.00,1.0,55
2935845,2015-10-09,33,25,7460,299.00,1.0,55
2935846,2015-10-14,33,25,7459,349.00,1.0,55
2935847,2015-10-22,33,25,7440,299.00,1.0,57


In [14]:
# Filtered shop_id
sales_train_df_31 = sales_train_df[sales_train_df['shop_id'] == 31]

In [15]:
# Sorted date in ascending order
sales_train_df_31 = sales_train_df_31.sort_values(by='date', ascending=True, na_position='first')

In [16]:
# Filtered item_category_id
sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]

  sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]


In [None]:
#def create_date_features(df):
    sales_train_df['month'] = sales_train_df.date.dt.month
    sales_train_df['day_of_month'] = sales_train_df.date.dt.day
    sales_train_df['day_of_year'] = sales_train_df.date.dt.dayofyear
    sales_train_df['week_of_year'] = sales_train_df.date.dt.isocalendar
    sales_train_df['day_of_week'] = sales_train_df.date.dt.dayofweek + 1
    sales_train_df['year'] = sales_train_df.date.dt.year
    sales_train_df["is_wknd"] = sales_train_df.date.dt.weekday // 4
    sales_train_df["quarter"] = sales_train_df.date.dt.quarter
    sales_train_df['is_month_start'] = sales_train_df.date.dt.is_month_start.astype(int)
    sales_train_df['is_month_end'] = sales_train_df.date.dt.is_month_end.astype(int)
    sales_train_df['is_quarter_start'] = sales_train_df.date.dt.is_quarter_start.astype(int)
    sales_train_df['is_quarter_end'] = sales_train_df.date.dt.is_quarter_end.astype(int)
    sales_train_df['is_year_start'] = sales_train_df.date.dt.is_year_start.astype(int)
    sales_train_df['is_year_end'] = sales_train_df.date.dt.is_year_end.astype(int)

sales_train_df_features = create_date_features(sales_train_df)

## Model set up 

### Setting up y list for RNN input with last monht's volume

In [41]:
# Pivoted into sales_train_df_31
tmp_df = sales_train_df_31[['item_id', 'date_block_num', 'item_cnt_day']]
pivot_table = tmp_df.pivot_table(
    index=['date_block_num'],
    columns=['item_id'],
    values=['item_cnt_day'],
    aggfunc={'item_cnt_day': ['sum']}
)
pivot_table = pivot_table.set_axis([flatten_column_header(col) for col in pivot_table.keys()], axis=1)
sales_train_df_31_pivot = pivot_table.reset_index()

# Filtered date_block_num
sales_train_df_31_pivot = sales_train_df_31_pivot[sales_train_df_31_pivot['date_block_num'] == 33]

# Pivoted into sales_train_df_31
sales_train_df_31_pivot = pd.DataFrame(data={})

# Pivoted into sales_train_df_31
tmp_df = sales_train_df_31[['item_id', 'date_block_num', 'item_cnt_day']]
pivot_table = tmp_df.pivot_table(
    index=['date_block_num'],
    columns=['item_id'],
    values=['item_cnt_day'],
    aggfunc={'item_cnt_day': ['sum']}
)

sales_train_df_31_pivot = pivot_table.reset_index()
# Filtered date_block_num
sales_train_df_31_pivot = sales_train_df_31_pivot[sales_train_df_31_pivot['date_block_num'] == 33]

In [60]:
#setting up y list with last values from month 33 
y_list = sales_train_df_31_pivot.fillna(0)

y = y_list.values.tolist()

flat_list = [item for sublist in y for item in sublist]

y2 = flat_list[1:]

y3 = np.array(y2).astype(np.float32)

In [96]:
y3

array([0., 0., 0., ..., 6., 1., 0.], dtype=float32)

### Creating a list of unique item_id in order to create arrays for the model

In [74]:
#creating list of unique items to run 
list_unique_item = sales_train_df_31['item_id'].unique()

In [75]:
list_unique_item.sort()

In [76]:
list_unique_item

array([   26,    27,    28, ..., 22163, 22164, 22167])

### Looping the list to create arrays of observations per item_id/day

In [77]:
sales_train_df_31_item = sales_train_df_31.set_index('item_id')

In [79]:
list_seq = []
for x in list_unique_item:
    
    s = np.array(sales_train_df_31_item.loc[x,['date_block_num','item_price','item_category_id','item_cnt_day']])
    if len(np.shape(s)) < 2:
        
        s = np.expand_dims(s,axis=0)
    
    list_seq.append(s)


In [80]:
X = list_seq

In [82]:
X_pad = pad_sequences(X) 

In [83]:
X_pad.shape

(14132, 863, 4)

### Defining and runnig RNN model 

In [95]:
model = Sequential()
model.add(layers.SimpleRNN(units=2, activation='tanh', input_shape=(863,4)))
model.add(layers.Dense(1, activation="linear"))

# The compilation
model.compile(loss='mse', 
              optimizer='rmsprop')  
# The fit
model.fit(X_pad, y3,
         batch_size=50,
         epochs=20)

# The prediction (one per sequence/city)
model.predict(X_pad)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


array([[0.3922189],
       [0.3922189],
       [0.3922189],
       ...,
       [0.3922189],
       [0.3922189],
       [0.3922189]], dtype=float32)