In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
from tensorflow.keras.applications import vgg16
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2


## Load data from Postgres

In [4]:
# Set postgres credentials

load_dotenv()
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
db = os.getenv('DB')

In [5]:
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=user,
                        password=password,
                        ipaddress=host,
                        port=port,
                        dbname=db))
cnx = create_engine(postgres_str)

In [6]:
# List DB tables
conn = psycopg2.connect(postgres_str)
cursor = conn.cursor()
cursor.execute("""SELECT relname FROM pg_class WHERE relkind='r'
                  AND relname !~ '^(pg_|sql_)';""") # "rel" is short for relation.

tables_list = [i[0] for i in cursor.fetchall()] # A list() of tables.
tables_list #check available tables

['test',
 'shops',
 'sample_submission',
 'item_categories',
 'sales_train',
 'shops_en',
 'item_categories_en',
 'items_en',
 'items',
 'cleaning_store_id',
 'cleaning_item_category_id']

In [7]:
# Load datasets
shops_df = pd.read_sql_query('''SELECT * FROM shops_en;''', cnx)
item_categories_df = pd.read_sql_query('''SELECT * FROM item_categories_en;''', cnx)
test_df = pd.read_sql_query('''SELECT * FROM test;''', cnx)
sales_train_df = pd.read_sql_query('''SELECT * FROM sales_train;''', cnx)
items_df = pd.read_sql_query('''SELECT * FROM items_en;''', cnx)

In [8]:
items_df.info()
sales_train_df.isnull().sum() # check for NaN values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

## Preprocessing

In [9]:
# Merge with categories to get the Ids
sales_train_df = pd.merge(sales_train_df, items_df, left_on='item_id', right_on='item_id', how='left')

# Drop column that contais the item_name
sales_train_df.drop(labels='item_name', axis=1, inplace=True)

In [10]:
# Set date to YYYY/mm/dd
sales_train_df['date'] = pd.to_datetime(sales_train_df['date'], format='%d.%m.%Y')

# Drop duplicates
sales_train_df.drop_duplicates(inplace=True)

In [None]:
sales_train_df

In [16]:
import mitosheet
mitosheet.sheet(sales_train_df, analysis_to_replay="id-xxotyeldrp")

MitoWidget(analysis_data_json='{"analysisName": "id-xxotyeldrp", "analysisToReplay": null, "code": [], "stepSu…

In [12]:
# Filtered shop_id
sales_train_df_31 = sales_train_df[sales_train_df['shop_id'] == 31]

In [13]:
# Sorted date in ascending order
sales_train_df_31 = sales_train_df_31.sort_values(by='date', ascending=True, na_position='first')

In [14]:
# Filtered item_category_id
sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]

  sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]


In [None]:
sales_train_df_31

In [17]:
import mitosheet
mitosheet.sheet(sales_train_df_31, analysis_to_replay="id-mcamyghxqp")

MitoWidget(analysis_data_json='{"analysisName": "id-mcamyghxqp", "analysisToReplay": null, "code": [], "stepSu…

In [None]:
from mitosheet import *; register_analysis("id-mcamyghxqp");
    
# Added column new-column-s41i
sales_train_df_31.insert(1, 'new-column-s41i', 0)

# Set formula of new-column-s41i
sales_train_df_31['new-column-s41i'] = MONTH(sales_train_df_31['date'])

# Added column new-column-sz7i
sales_train_df_31.insert(2, 'new-column-sz7i', 0)

# Deleted columns new-column-s41i
sales_train_df_31.drop(['new-column-s41i'], axis=1, inplace=True)

# Deleted columns new-column-sz7i
sales_train_df_31.drop(['new-column-sz7i'], axis=1, inplace=True)

# Pivoted into sales_train_df_31
sales_train_df_31_pivot = pd.DataFrame(data={})


In [40]:
#sales_31_group = sales_train_df_31.groupby(['date_block_num','month']).agg({'score': 'sum', 'num_attempts': 'sum'})

In [41]:
list_unique_cat = sales_train_df_31['item_category_id'].unique()

In [42]:
list_unique_cat

array([23, 40, 30,  6, 63, 22, 38, 75, 69,  5, 37,  2, 55, 45, 19, 21, 79,
       28, 25, 62, 41, 57, 15, 33, 72, 11, 14, 35, 83, 67, 70,  3, 65, 71,
       64, 43, 29, 61,  4, 73, 49, 60, 77, 56, 58, 18, 20, 12, 17,  8, 16,
       24,  7, 47, 80])

In [43]:
sales_train_df_31_cat = sales_train_df_31.set_index('item_category_id')

In [70]:
list_seq = []
for x in list_unique_cat:
    
    s = np.array(sales_train_df_31_cat.loc[x,['date_block_num','item_price','item_id','item_cnt_day']])
    if len(np.shape(s)) < 2:
        
        s = np.expand_dims(s,axis=0)
    
    list_seq.append(s)


In [71]:
X = list_seq

Running RNN Model 

In [121]:
def create_date_features(df):
    sales_train_df['month'] = sales_train_df.date.dt.month
    sales_train_df['day_of_month'] = sales_train_df.date.dt.day
    sales_train_df['day_of_year'] = sales_train_df.date.dt.dayofyear
    sales_train_df['week_of_year'] = sales_train_df.date.dt.isocalendar
    sales_train_df['day_of_week'] = sales_train_df.date.dt.dayofweek + 1
    sales_train_df['year'] = sales_train_df.date.dt.year
    sales_train_df["is_wknd"] = sales_train_df.date.dt.weekday // 4
    sales_train_df["quarter"] = sales_train_df.date.dt.quarter
    sales_train_df['is_month_start'] = sales_train_df.date.dt.is_month_start.astype(int)
    sales_train_df['is_month_end'] = sales_train_df.date.dt.is_month_end.astype(int)
    sales_train_df['is_quarter_start'] = sales_train_df.date.dt.is_quarter_start.astype(int)
    sales_train_df['is_quarter_end'] = sales_train_df.date.dt.is_quarter_end.astype(int)
    sales_train_df['is_year_start'] = sales_train_df.date.dt.is_year_start.astype(int)
    sales_train_df['is_year_end'] = sales_train_df.date.dt.is_year_end.astype(int)

sales_train_df_features = create_date_features(sales_train_df)

In [122]:
sales_train_df_features

In [68]:
list_seq[45]

array([[6, 1258.2, 3761, 1.0]], dtype=object)

In [64]:
np.shape(list_seq[45])

(4,)

In [72]:
X_pad = pad_sequences(X) 
X_pad

array([[[    0,     0,     0,     0],
        [    0,     0,     0,     0],
        [    0,     0,     0,     0],
        ...,
        [   33,   999,  7007,     1],
        [   33,  2199,  7135,     1],
        [   33,  1999,  3713,     1]],

       [[    0,   347, 17241,     1],
        [    0,   149, 21552,     1],
        [    0,   399, 21424,     1],
        ...,
        [   33,   399, 17581,     2],
        [   33,   499,  8735,     1],
        [   33,   149,  8754,     1]],

       [[    0,     0,     0,     0],
        [    0,     0,     0,     0],
        [    0,     0,     0,     0],
        ...,
        [   33,   299, 13159,     1],
        [   33,   199, 19985,     2],
        [   33,   599,  6710,     1]],

       ...,

       [[    0,     0,     0,     0],
        [    0,     0,     0,     0],
        [    0,     0,     0,     0],
        ...,
        [   33,  4990,  8454,     1],
        [   33,  1190,  7933,     1],
        [   33,  2990,  8452,     2]],

       [[    0,

In [90]:
X_pad.shape

(55, 62235, 4)

In [102]:
import random

In [119]:
res = random.sample(range(1, 2), 55)

ValueError: Sample larger than population or is negative

In [109]:
y = np.array(res).astype(np.float32)

In [110]:
y.shape

(55,)

In [117]:
model = Sequential()
model.add(layers.SimpleRNN(units=2, activation='tanh', input_shape=(62235,4)))
model.add(layers.Dense(1, activation="linear"))

# The compilation
model.compile(loss='mse', 
              optimizer='rmsprop')  
# The fit
model.fit(X_pad, y,
         batch_size=100,
         epochs=10)

# The prediction (one per sequence/city)
model.predict(X_pad)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


array([[2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],
       [2.1715512],


In [77]:
# date_block_num: 34
# shop_id: 60
# item_id: 21807

In [None]:
date_block_num, shop_id
0
1
2
3
4
5
6
7