## Load Packages

In [533]:
import math
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2
from tqdm import tqdm

In [209]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Masking
from tensorflow.keras import layers

## Load data from Postgres

In [3]:
# Set postgres credentials
load_dotenv()
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
db = os.getenv('DB')

In [4]:
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=user,
                        password=password,
                        ipaddress=host,
                        port=port,
                        dbname=db))
cnx = create_engine(postgres_str)

In [5]:
# List DB tables
conn = psycopg2.connect(postgres_str)
cursor = conn.cursor()
cursor.execute("""SELECT relname FROM pg_class WHERE relkind='r'
                  AND relname !~ '^(pg_|sql_)';""") # "rel" is short for relation.

tables_list = [i[0] for i in cursor.fetchall()] # A list() of tables.
#tables_list #check available tables

In [6]:
# Load datasets
shops_df = pd.read_sql_query('''SELECT * FROM shops_en;''', cnx)
item_categories_df = pd.read_sql_query('''SELECT * FROM item_categories_en;''', cnx)
test_df = pd.read_sql_query('''SELECT * FROM test;''', cnx)
sales_train_df = pd.read_sql_query('''SELECT * FROM sales_train;''', cnx)
items_df = pd.read_sql_query('''SELECT * FROM items_en;''', cnx)
cleaning_store_df = pd.read_sql_query('''SELECT * FROM cleaning_store_id;''', cnx)
cleaning_item_category_df = pd.read_sql_query('''SELECT * FROM cleaning_item_category_id;''', cnx)

In [None]:
#items_df.info()
#sales_train_df.isnull().sum() # check for NaN values

## Preprocessing

In [7]:
# Merge with categories to get the Ids
sales_train_df = pd.merge(sales_train_df, items_df, left_on='item_id', right_on='item_id', how='left')
# Merge with cleaning_item_category_df to get the status of active and non active categories
sales_train_df = pd.merge(sales_train_df, cleaning_item_category_df, left_on='item_category_id', right_on='item_category_id', how='left')
# Merge with cleaning_store_df to get the status of active and non active shops
sales_train_df = pd.merge(sales_train_df, cleaning_store_df, left_on='shop_id', right_on='shop_id', how='left')

In [8]:
# Drop column that contais the items name, category status name, and shop status name
sales_train_df.drop(labels=['item_name', 'category_status','shop_status'], axis=1, inplace=True)

In [9]:
# Set date to YYYY/mm/dd
sales_train_df['date'] = pd.to_datetime(sales_train_df['date'], format='%d.%m.%Y')

# Set property dtypes for all other columns
sales_train_df = sales_train_df.astype({'date_block_num':'Int32',
                                        'shop_id':'Int32',
                                        'item_id':'Int32',
                                        'item_price':'float32',
                                        'item_cnt_day':'Int32',
                                        'item_category_id':'Int32',
                                        'category_status_code':'Int32',
                                        'shop_status_code':'Int32'})

# Set a new DataFrame to work with models
sales_train_clean_df = sales_train_df.copy()

# Filter only active categories and stores
sales_train_clean_df = sales_train_clean_df.query('category_status_code == 1').query('shop_status_code == 1')

# Drop category_status_code and shop_status_code to lightweight the dataframe
sales_train_clean_df.drop(labels=['category_status_code', 'shop_status_code'], axis=1, inplace=True)

# Drop duplicates, just in case
sales_train_clean_df.drop_duplicates(inplace=True)

### Transforming work DataFrame

In [521]:
df = sales_train_clean_df.copy()

#df['quarter'] = df.date.dt.quarter
#df['is_quarter_start'] = df.date.dt.is_quarter_start.astype(int)
#df['is_quarter_end'] = df.date.dt.is_quarter_end.astype(int)
#df['is_year_start'] = df.date.dt.is_year_start.astype(int)
#df['is_year_end'] = df.date.dt.is_year_end.astype(int)

In [522]:
# Sum the total amount of sold products on a new column called total_item_amount_sold
df['total_item_amount_sold'] = df.apply(lambda x: (x["item_price"] * x["item_cnt_day"]) if x['item_cnt_day'] > 0 else 0, axis=1)

# Create a new column grouped by month and year concatenated
#df['month_year'] = pd.to_datetime(df['date']).dt.to_period('M')
df['month'] = pd.DatetimeIndex(df['date']).month

# Define the format of the DataFrame that will be used in the model
df = df.groupby(['month', 'date_block_num', 'shop_id', 'item_id','item_category_id']).aggregate({'total_item_amount_sold': 'sum','item_cnt_day':'sum','item_cnt_day':'count'}).reset_index().sort_values(['date_block_num'], ascending = True)


In [523]:
df

Unnamed: 0,month,date_block_num,shop_id,item_id,item_category_id,total_item_amount_sold,item_cnt_day
0,1,0,2,27,19,2499.000000,1
30961,1,0,41,2716,28,298.000000,2
30962,1,0,41,2727,21,898.500000,2
30963,1,0,41,2748,19,799.000000,1
30964,1,0,41,2801,30,56.000000,2
...,...,...,...,...,...,...,...
1142944,10,33,25,2861,25,4857.300049,3
1142943,10,33,25,2859,25,209.300003,1
1142942,10,33,25,2838,20,5798.000000,2
1142954,10,33,25,2959,19,1399.000000,1


In [547]:
list_shop_id = list(df.shop_id.unique()) #[2,31]

In [None]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse = False) 

modeldict = {}
exceptions = []

for ele in tqdm(list_shop_id):
    print("-------------------------------")     
    print(f"Start process of shop: {ele}")
    error_model = {}
    shop_df = df[df['shop_id'] == ele].query('item_cnt_day >= 0')
    
    #normalizing month
    shop_df["month_norm"] = 2 * math.pi * shop_df['month'] / shop_df['month'].max()

    #creating cos 
    shop_df["cos_x"] = np.cos(shop_df["month_norm"])

    #creating sin
    shop_df["sin_x"] = np.sin(shop_df["month_norm"])
    
    ohe.fit(shop_df[['item_category_id']]) 
    shop_df[ohe.get_feature_names_out()] = ohe.transform(shop_df[['item_category_id']])
    shop_df.drop(columns = ['item_category_id'], inplace = True)
    
    list_col = np.array(shop_df.T.index[1:])
    list_col = np.delete(list_col, np.where(list_col == 'item_cnt_day'))
    list_col = np.delete(list_col, np.where(list_col == 'item_id'))
    list_col = np.append(list_col, 'item_cnt_day')    

    # Creating list of unique items to predict the amount sold per item
    list_unique_item = shop_df['item_id'].sort_values().unique()
    #print(f"Unique items: {list_unique_item}")

    shop_df.set_index('item_id', inplace=True)
    shop_df.sort_index(ascending=True, inplace=True)

    list_seq = []

    for x in list_unique_item:
        #s = np.array(shop_df.loc[x,['date_block_num', 
        #                            'shop_id', 
        #                            'item_category_id',
        #                            'total_item_amount_sold',
        #                            'item_cnt_day']])
        
        s = np.array(shop_df.loc[x, list_col])
        
        if len(np.shape(s)) < 2:
            s = np.expand_dims(s,axis=0)

        list_seq.append(s)

    X = list_seq
    X_pad = pad_sequences(X, dtype='float32', value=0)
    #print(f"store:{ele} / shape:{X_pad.shape}")
    
    y = X_pad[:,-1,-1]
    
    X1 = X_pad[:,:-1,:]
    
    modeldict.update({f"list_unique_item_{ele}":list_unique_item})
    modeldict.update({f"X_{ele}":X_pad})
    modeldict.update({f"y_{ele}":y})
    modeldict.update({f"X1_{ele}":X1})

    
    X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.20)

    es = EarlyStopping(patience=5, restore_best_weights=True)
    
    model = Sequential()
    model.add(layers.LSTM(units=2, activation='tanh', input_shape=X1[0].shape))
    model.add(layers.Dense(1, activation="linear"))
    
    # The compilation
    model.compile(loss='mse', 
                  optimizer='rmsprop',
                  metrics=['RootMeanSquaredError', 'MeanAbsoluteError']
                  )
    
    # The fit
    model.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              batch_size=8,
              epochs=20,
              callbacks = [es]
              )

    # Populate dict to be inserted in the database
    error_model.update({f"loss":history.history['loss']})
    error_model.update({f"rmse":history.history['root_mean_squared_error']})
    error_model.update({f"mae":history.history['mean_absolute_error']})
    error_model.update({f"val_loss":history.history['val_loss']})
    error_model.update({f"val_root_mean_squared_error":history.history['val_root_mean_squared_error']})
    
    # Populate dict to be used in the code later
    modeldict.update({f"loss_{ele}":history.history['loss']})
    modeldict.update({f"rmse_{ele}":history.history['root_mean_squared_error']})
    modeldict.update({f"mae_{ele}":history.history['mean_absolute_error']})
    modeldict.update({f"val_loss_{ele}":history.history['val_loss']})
    modeldict.update({f"val_root_mean_squared_error_{ele}":history.history['val_root_mean_squared_error']})    

    #print(f"history_{ele}: {history}")

    # The prediction (one per sequence/city)
    pred = model.predict(X1)

    modeldict.update({f"pred_{ele}":pred})
    #print(f"pred_{ele}: {pred}")
    

    list_to_insert = []
    
    for i, value in enumerate(modeldict[f"list_unique_item_{ele}"]): 
        postgres_insert_query = """ INSERT INTO model_rnn (shop_id, item_id, pred, history) VALUES (%s,%s,%s,%s)"""
        record_to_insert = (int(ele), int(modeldict[f"list_unique_item_{ele}"][i]), float(modeldict[f"pred_{ele}"][i]), json.dumps(error_model))
        list_to_insert.append(record_to_insert)
        
    try:
        conn = psycopg2.connect(postgres_str)
        cursor = conn.cursor()


        # cursor.mogrify() to insert multiple values
        args = ','.join(cursor.mogrify("(%s,%s,%s,%s)", i).decode('utf-8')
                        for i in list_to_insert)

        # executing the sql statement
        cursor.execute("INSERT INTO model_rnn VALUES " + (args))

        conn.commit()
        print("Record inserted successfully into model_rnn table")

    except (Exception, psycopg2.Error) as error:
        print("Failed to insert record into model_rnn table", error)
        print(f"Query: INSERT INTO model_rnn VALUES {args}")
        exceptions.append(ele)

    finally:
        # closing database connection.
        if conn:
            cursor.close()
            conn.close()
            print("PostgreSQL connection closed")
            print(f"End process of shop: {ele}")
            print("-------------------------------")

  0%|                                                                                                                                                   | 0/45 [00:00<?, ?it/s]

-------------------------------
Start process of shop: 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


  2%|███                                                                                                                                        | 1/45 [00:54<40:15, 54.89s/it]

Record inserted successfully into model_rnn table
PostgreSQL connection closed
End process of shop: 2
-------------------------------
-------------------------------
Start process of shop: 41
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
