In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2


TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

## Load data from Postgres

In [4]:
# Set postgres credentials

load_dotenv()
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
db = os.getenv('DB')

In [5]:
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=user,
                        password=password,
                        ipaddress=host,
                        port=port,
                        dbname=db))
cnx = create_engine(postgres_str)

In [6]:
# List DB tables
conn = psycopg2.connect(postgres_str)
cursor = conn.cursor()
cursor.execute("""SELECT relname FROM pg_class WHERE relkind='r'
                  AND relname !~ '^(pg_|sql_)';""") # "rel" is short for relation.

tables_list = [i[0] for i in cursor.fetchall()] # A list() of tables.
tables_list #check available tables

['test',
 'shops',
 'sample_submission',
 'item_categories',
 'sales_train',
 'shops_en',
 'item_categories_en',
 'items_en',
 'items',
 'cleaning_store_id',
 'cleaning_item_category_id']

In [7]:
# Load datasets
shops_df = pd.read_sql_query('''SELECT * FROM shops_en;''', cnx)
item_categories_df = pd.read_sql_query('''SELECT * FROM item_categories_en;''', cnx)
test_df = pd.read_sql_query('''SELECT * FROM test;''', cnx)
sales_train_df = pd.read_sql_query('''SELECT * FROM sales_train;''', cnx)
items_df = pd.read_sql_query('''SELECT * FROM items_en;''', cnx)

In [8]:
items_df.info()
sales_train_df.isnull().sum() # check for NaN values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

## Preprocessing

In [9]:
# Merge with categories to get the Ids
sales_train_df = pd.merge(sales_train_df, items_df, left_on='item_id', right_on='item_id', how='left')

# Drop column that contais the item_name
sales_train_df.drop(labels='item_name', axis=1, inplace=True)

In [10]:
# Set date to YYYY/mm/dd
sales_train_df['date'] = pd.to_datetime(sales_train_df['date'], format='%d.%m.%Y')

# Drop duplicates
sales_train_df.drop_duplicates(inplace=True)

In [11]:
# Filtered shop_id
sales_train_df_31 = sales_train_df[sales_train_df['shop_id'] == 31]

In [12]:
# Sorted date in ascending order
sales_train_df_31 = sales_train_df_31.sort_values(by='date', ascending=True, na_position='first')

In [13]:
# Filtered item_category_id
sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]

  sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]


In [14]:
sales_train_df_31

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
60440,2013-01-02,0,31,4248,1993.12,1.0,23
72614,2013-01-02,0,31,17241,347.11,1.0,40
71078,2013-01-02,0,31,15060,399.00,1.0,30
63528,2013-01-02,0,31,7893,1980.00,1.0,6
71610,2013-01-02,0,31,11854,199.00,1.0,63
...,...,...,...,...,...,...,...
2925574,2015-10-31,33,31,8478,349.00,1.0,43
2928312,2015-10-31,33,31,3713,1999.00,1.0,23
2890918,2015-10-31,33,31,12974,249.00,1.0,55
2928073,2015-10-31,33,31,2955,932.67,1.0,19


In [15]:
#sales_31_group = sales_train_df_31.groupby(['date_block_num','month']).agg({'score': 'sum', 'num_attempts': 'sum'})

In [16]:
list_unique_cat = sales_train_df_31['item_category_id'].unique()

In [17]:
list_unique_cat

array([23, 40, 30,  6, 63, 22, 38, 75, 69,  5, 37,  2, 55, 45, 19, 21, 79,
       28, 25, 62, 41, 57, 15, 33, 72, 11, 14, 35, 83, 67, 70,  3, 65, 71,
       64, 43, 29, 61,  4, 73, 49, 60, 77, 56, 58, 18, 20, 12, 17,  8, 16,
       24,  7, 47, 80])

In [18]:
sales_train_df_31_cat = sales_train_df_31.set_index('item_category_id')

In [19]:
list_seq = []
for x in list_unique_cat:
    
    s = np.array(sales_train_df_31_cat.loc[x,['date_block_num','item_price','item_id','item_cnt_day']])
    
    list_seq.append(s)


Running RNN Model 

In [32]:
def create_date_features(df):
    sales_train_df['month'] = sales_train_df.date.dt.month
    sales_train_df['day_of_month'] = sales_train_df.date.dt.day
    sales_train_df['day_of_year'] = sales_train_df.date.dt.dayofyear
    sales_train_df['week_of_year'] = sales_train_df.date.dt.weekofyear
    sales_train_df['day_of_week'] = sales_train_df.date.dt.dayofweek + 1
    sales_train_df['year'] = sales_train_df.date.dt.year
    sales_train_df["is_wknd"] = sales_train_df.date.dt.weekday // 4
    sales_train_df["quarter"] = sales_train_df.date.dt.quarter
    sales_train_df['is_month_start'] = sales_train_df.date.dt.is_month_start.astype(int)
    sales_train_df['is_month_end'] = sales_train_df.date.dt.is_month_end.astype(int)
    sales_train_df['is_quarter_start'] = sales_train_df.date.dt.is_quarter_start.astype(int)
    sales_train_df['is_quarter_end'] = sales_train_df.date.dt.is_quarter_end.astype(int)
    sales_train_df['is_year_start'] = sales_train_df.date.dt.is_year_start.astype(int)
    sales_train_df['is_year_end'] = sales_train_df.date.dt.is_year_end.astype(int)

sales_train_df_features = create_date_features(sales_train_df)

  sales_train_df['week_of_year'] = sales_train_df.date.dt.weekofyear


In [33]:
sales_train_df_features

In [21]:
X = np.array(list_seq)

  X = np.array(list_seq)


In [25]:
X_pad = pad_sequences(X, dtype='float32') 
X_pad

NameError: name 'pad_sequences' is not defined

In [None]:
model = Sequential()
model.add(layers.SimpleRNN(units=2, activation='tanh', input_shape=(4,3)))
model.add(layers.Dense(1, activation="linear"))

# The compilation
model.compile(loss='mse', 
              optimizer='rmsprop')  # Recommended optimizer for RNNs
# The fit
model.fit(X, y,
         batch_size=16,
         epochs=10, verbose=0)

# The prediction (one per sequence/city)
model.predict(X)

In [77]:
# date_block_num: 34
# shop_id: 60
# item_id: 21807

In [None]:
date_block_num, shop_id
0
1
2
3
4
5
6
7