## Load Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2
from tqdm import tqdm

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Masking
from tensorflow.keras import layers

## Load data from Postgres

In [3]:
# Postgres connection settings are taken from environment variables.
# load_dotenv() runs first so that values kept in a local .env file (if one
# exists) are visible to os.getenv; a missing variable yields None.
load_dotenv()
user, password, host, port, db = (
    os.getenv(key)
    for key in ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT', 'DB')
)

In [4]:
# Connection URL shared by SQLAlchemy (cnx) and psycopg2.
# NOTE(review): the credentials are interpolated verbatim; a password holding
# URL-reserved characters (e.g. '@' or '/') would presumably need
# percent-encoding before being placed in the URL — confirm.
postgres_str = f'postgresql://{user}:{password}@{host}:{port}/{db}'

# Engine used by every pd.read_sql_query call below
cnx = create_engine(postgres_str)

In [5]:
# List the user tables available in the database.
# The psycopg2 connection is only needed for this one catalogue lookup, so the
# cursor and the connection are closed as soon as the names have been fetched
# instead of staying open for the lifetime of the kernel.
# relkind='r' selects ordinary tables ("rel" is short for relation); the regex
# filters out the pg_* / sql_* system relations.
tables_query = """SELECT relname FROM pg_class WHERE relkind='r'
                  AND relname !~ '^(pg_|sql_)';"""

conn = psycopg2.connect(postgres_str)
try:
    with conn.cursor() as cursor:
        cursor.execute(tables_query)
        # Each fetched row is a 1-tuple holding the table name.
        tables_list = [row[0] for row in cursor.fetchall()]
finally:
    conn.close()
#tables_list #check available tables

In [6]:
# Read every source table in full through the SQLAlchemy engine `cnx`.
shops_df = pd.read_sql_query(sql="SELECT * FROM shops_en;", con=cnx)
item_categories_df = pd.read_sql_query(sql="SELECT * FROM item_categories_en;", con=cnx)
test_df = pd.read_sql_query(sql="SELECT * FROM test;", con=cnx)
sales_train_df = pd.read_sql_query(sql="SELECT * FROM sales_train;", con=cnx)
items_df = pd.read_sql_query(sql="SELECT * FROM items_en;", con=cnx)
# Lookup tables used further down to keep only active shops / categories
cleaning_store_df = pd.read_sql_query(sql="SELECT * FROM cleaning_store_id;", con=cnx)
cleaning_item_category_df = pd.read_sql_query(sql="SELECT * FROM cleaning_item_category_id;", con=cnx)

In [None]:
#items_df.info()
#sales_train_df.isnull().sum() # check for NaN values

## Preprocessing

In [7]:
# Enrich every sales row with its item, category-status and shop-status data.
# All joins are left joins, so no sales row is dropped when a lookup has no match.
sales_train_df = (
    sales_train_df
    # items: brings in the category id of each item
    .merge(items_df, on='item_id', how='left')
    # category lookup: active / non-active status of each category
    .merge(cleaning_item_category_df, on='item_category_id', how='left')
    # shop lookup: active / non-active status of each shop
    .merge(cleaning_store_df, on='shop_id', how='left')
)

In [8]:
# Drop the columns that contain the item name, the category status name and
# the shop status name; the numeric *_status_code columns are kept.
sales_train_df = sales_train_df.drop(columns=['item_name', 'category_status', 'shop_status'])

In [9]:
# Target dtype of every non-date column (nullable Int32 for ids / counts / codes)
column_dtypes = {
    'date_block_num': 'Int32',
    'shop_id': 'Int32',
    'item_id': 'Int32',
    'item_price': 'float32',
    'item_cnt_day': 'Int32',
    'item_category_id': 'Int32',
    'category_status_code': 'Int32',
    'shop_status_code': 'Int32',
}

# Parse the dd.mm.YYYY date strings into datetimes, then apply the dtypes above
sales_train_df = (
    sales_train_df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d.%m.%Y'))
    .astype(column_dtypes)
)

# Frame used for modelling: active categories and active shops only, without
# the two status-code columns (no longer needed once the filter is applied)
# and without exact duplicate rows.
sales_train_clean_df = (
    sales_train_df
    .query('category_status_code == 1')
    .query('shop_status_code == 1')
    .drop(columns=['category_status_code', 'shop_status_code'])
    .drop_duplicates()
)

### Transforming the working DataFrame

In [10]:
# Independent working copy; sales_train_clean_df itself is left untouched by
# the transformations applied to df below.
df = sales_train_clean_df.copy(deep=True)

# Calendar features that are currently disabled:
#df['quarter'] = df.date.dt.quarter
#df['is_quarter_start'] = df.date.dt.is_quarter_start.astype(int)
#df['is_quarter_end'] = df.date.dt.is_quarter_end.astype(int)
#df['is_year_start'] = df.date.dt.is_year_start.astype(int)
#df['is_year_end'] = df.date.dt.is_year_end.astype(int)

In [11]:
# Revenue of each sales row in a new column called total_item_amount_sold:
# price * units when units > 0, otherwise 0 (rows with a zero or negative
# count contribute no revenue).
# This is computed column-wise; a row-by-row `apply(axis=1)` calls a Python
# lambda once per sales row, which is needlessly slow on the full table.
# The product is taken in float64 so revenue is not limited to the float32
# precision of item_price.
units_sold = df['item_cnt_day'].astype('float64')
df['total_item_amount_sold'] = (
    (df['item_price'].astype('float64') * units_sold).where(units_sold > 0, 0.0)
)

# Month-year period of each sale ('date' is already a datetime column here)
df['month_year'] = df['date'].dt.to_period('M')

# Format used by the model: one row per (month, shop, item) holding the summed
# revenue and the summed unit count, ordered by the running month number.
df = (
    df
    .groupby(['month_year', 'date_block_num', 'shop_id', 'item_id', 'item_category_id'])
    .agg({'total_item_amount_sold': 'sum', 'item_cnt_day': 'sum'})
    .reset_index()
    .sort_values(['date_block_num'], ascending=True)
)


In [80]:
# Shops to build training sequences for. Only shop 31 is enabled; the
# commented-out expression would select every shop present in df instead.
list_shop_id = [31] #list(df.shop_id.unique())

In [81]:
# Distinct shop ids of the aggregated sales, in ascending order
list_unique_stores = df.loc[:, 'shop_id'].sort_values().unique()

In [82]:
# Monthly rows of shop 31, restricted to rows with a positive unit count
shop_df = df.loc[df['shop_id'] == 31].query('item_cnt_day > 0')

In [83]:
# Display the filtered rows of the selected shop
shop_df

Unnamed: 0,month_year,date_block_num,shop_id,item_id,item_category_id,total_item_amount_sold,item_cnt_day
25155,2013-01,0,31,8674,40,198.0,1
25156,2013-01,0,31,8685,55,199.0,1
25157,2013-01,0,31,8690,55,199.0,1
25158,2013-01,0,31,8707,40,149.0,1
25159,2013-01,0,31,8709,40,1047.0,3
...,...,...,...,...,...,...,...
1331082,2015-10,33,31,45,57,299.0,1
1331080,2015-10,33,31,33,37,398.0,2
1331079,2015-10,33,31,32,40,447.0,3
1331078,2015-10,33,31,30,40,129.0,1


In [63]:
# Distinct item ids sold by the selected shop, in ascending order; one
# sequence is built per entry further down
list_unique_item = shop_df.loc[:, 'item_id'].sort_values().unique()

In [64]:
# Display the sorted unique item ids
list_unique_item

<IntegerArray>
[   26,    27,    28,    29,    30,    31,    32,    33,    34,    35,
 ...
 22151, 22152, 22154, 22155, 22159, 22160, 22162, 22163, 22164, 22167]
Length: 14135, dtype: Int32

In [84]:
# Index the shop rows by item_id so the rows of one item can be selected
# with shop_df.loc[item_id]
shop_df = shop_df.set_index('item_id')


In [85]:
# Accumulator for the per-item sequences; the loop in the next cell appends to it
list_seq = list()

In [86]:
# Columns kept for every time step of an item sequence
sequence_columns = ['date_block_num',
                    'shop_id',
                    'item_category_id',
                    'total_item_amount_sold',
                    'item_cnt_day']

# Append one 2-D array (rows of the item x 5 columns) per item to list_seq,
# following the order of list_unique_item.
for item in list_unique_item:
    item_rows = shop_df.loc[item, sequence_columns]
    # An item with a single row comes back from .loc as a 1-D Series;
    # atleast_2d promotes it to shape (1, 5) and leaves 2-D input unchanged.
    list_seq.append(np.atleast_2d(np.array(item_rows)))

In [None]:
# Pad every item sequence with zeros at the front ('pre' is the Keras default)
# so that all sequences share the length of the longest one.
X = list_seq
X_pad = pad_sequences(sequences=X, dtype='float32', padding='pre', value=0)

# Target: last feature (item_cnt_day) of the final time step of each item.
# NOTE(review): that time step is still part of X_pad, so the target is also
# present in the inputs — confirm this is intended before fitting a model.
y = X_pad[:, -1, -1]


In [None]:
# Build the padded per-item tensors for every shop in list_shop_id.
# modeldict maps "X_<shop_id>" to the padded sequences of the shop and
# "y_<shop_id>" to the matching targets.
modeldict = {}

# Columns kept for every time step. item_cnt_day stays last because the
# target below is read from the last feature position.
sequence_columns = ['date_block_num',
                    'shop_id',
                    'item_category_id',
                    'total_item_amount_sold',
                    'item_cnt_day']

for ele in tqdm(list_shop_id):
    # Rows of the current shop with a positive unit count
    shop_df = df[df['shop_id'] == ele].query('item_cnt_day > 0')

    # Sorted unique items of the shop; one sequence is produced per item
    list_unique_item = shop_df['item_id'].sort_values().unique()
    #print(f"Unique items: {list_unique_item}")

    shop_df = shop_df.set_index('item_id')

    # One groupby pass hands over the rows of each item: groups are produced
    # in ascending item_id order (the order of list_unique_item) and rows keep
    # their original order inside a group. This avoids one `shop_df.loc[item]`
    # lookup on the non-unique index per item. to_numpy on a frame is always
    # 2-D, so an item with a single row gives shape (1, 5) directly.
    list_seq = [
        item_rows[sequence_columns].to_numpy(dtype='float32')
        for _, item_rows in shop_df.groupby(level='item_id', sort=True)
    ]

    if not list_seq:
        # Shop without any row with a positive count: nothing to pad, skip it
        continue

    X = list_seq
    # Zero padding at the front (Keras default) up to the longest sequence
    X_pad = pad_sequences(X, dtype='float32', value=0)
    # Target: item_cnt_day of the final time step of each item.
    # NOTE(review): that time step is still part of X_pad, so the target is
    # also present in the inputs — confirm this is intended before fitting.
    y = X_pad[:, -1, -1]

    modeldict[f"X_{ele}"] = X_pad
    modeldict[f"y_{ele}"] = y

#     # –– Model
#     model = Sequential()
#     model.add(layers.Masking(mask_value=-1000, input_shape=(32, 2)))
#     model.add(layers.SimpleRNN(units=2, activation='tanh'))
#     model.add(layers.Dense(10, activation='relu'))
#     model.add(layers.Dense(1, activation='linear'))

#     # –– Compilation
#     model.compile(loss='mse', 
#                   optimizer='rmsprop') # Use `rmsprop`

#     # –– Fit
#     model.fit(X_pad, y);
#     modeldict.update({ele:model})



 50%|█████████████████████████████████████████████████████████████████████▌                                                                     | 1/2 [01:58<01:58, 118.76s/it]

In [56]:
# Shape of the padded tensor: (items, time steps, features)
X_pad.shape

(4814, 33, 5)

In [57]:
# One target value per item sequence
y.shape

(4814,)

In [55]:
# Shape of the sixth (unpadded) item sequence: (rows of the item, features)
np.shape(list_seq[5])

(1, 5)

In [None]:
# Number of monthly rows (i.e. the unpadded sequence length) per item of the
# current shop. The string literal and the indexing bracket were left
# unterminated, which made the cell a SyntaxError.
shop_df.reset_index().groupby(by='item_id').count()['month_year']

In [None]:
# Shape of the padded tensor
np.shape(X_pad)

In [None]:
# Number of sequences excluding the last one (never below zero).
# NOTE(review): this rebinds X from the list of sequences to an integer.
X = max(len(list_seq) - 1, 0)

In [None]:
# Total number of item sequences
len(list_seq)

In [None]:
# Display the current value bound to X
X