In [2]:
import lightgbm as lgb

In [3]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

In [4]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2

## Load data from Postgres

In [6]:
# Set postgres credentials

load_dotenv()
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
db = os.getenv('DB')

In [7]:
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=user,
                        password=password,
                        ipaddress=host,
                        port=port,
                        dbname=db))
cnx = create_engine(postgres_str)

In [8]:
# List DB tables
conn = psycopg2.connect(postgres_str)
cursor = conn.cursor()
cursor.execute("""SELECT relname FROM pg_class WHERE relkind='r'
                  AND relname !~ '^(pg_|sql_)';""") # "rel" is short for relation.

tables_list = [i[0] for i in cursor.fetchall()] # A list() of tables.
tables_list #check available tables

['test',
 'shops',
 'sample_submission',
 'item_categories',
 'sales_train',
 'shops_en',
 'item_categories_en',
 'items_en',
 'items',
 'cleaning_store_id',
 'cleaning_item_category_id']

In [9]:
# Load datasets
shops_df = pd.read_sql_query('''SELECT * FROM shops_en;''', cnx)
item_categories_df = pd.read_sql_query('''SELECT * FROM item_categories_en;''', cnx)
test_df = pd.read_sql_query('''SELECT * FROM test;''', cnx)
sales_train_df = pd.read_sql_query('''SELECT * FROM sales_train;''', cnx)
items_df = pd.read_sql_query('''SELECT * FROM items_en;''', cnx)

In [10]:
items_df.info()
sales_train_df.isnull().sum() # check for NaN values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

## Preprocessing

In [11]:
# Merge with categories to get the Ids
sales_train_df = pd.merge(sales_train_df, items_df, left_on='item_id', right_on='item_id', how='left')

# Drop column that contais the item_name
sales_train_df.drop(labels='item_name', axis=1, inplace=True)

In [12]:
# Set date to YYYY/mm/dd
sales_train_df['date'] = pd.to_datetime(sales_train_df['date'], format='%d.%m.%Y')

# Drop duplicates
sales_train_df.drop_duplicates(inplace=True)

In [17]:
# Filtered item_category_id
sales_train_df = sales_train_df[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]

In [19]:
# Sorted date in ascending order
sales_train_df = sales_train_df.sort_values(by='date', ascending=True, na_position='first')

Starting Model 

In [24]:
# Summary Stats for each store
sales_train_df.groupby(["shop_id"]).agg({"item_cnt_day": ["count","sum", "mean", "median", "std", "min", "max"]})

Unnamed: 0_level_0,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
Unnamed: 0_level_1,count,sum,mean,median,std,min,max
shop_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,9668,11435.0,1.182768,1.0,0.622336,1.0,11.0
1,5615,6137.0,1.092965,1.0,0.393995,1.0,7.0
2,25866,30460.0,1.177608,1.0,1.111459,-5.0,71.0
3,25458,28275.0,1.110653,1.0,0.992791,-2.0,83.0
4,37859,43388.0,1.146042,1.0,1.060178,-2.0,104.0
...,...,...,...,...,...,...,...
55,34769,63388.0,1.823118,1.0,8.002811,-1.0,637.0
56,68894,76901.0,1.116222,1.0,0.784564,-2.0,58.0
57,116471,139704.0,1.199475,1.0,1.104253,-2.0,98.0
58,71083,81125.0,1.141271,1.0,0.742400,-2.0,44.0


In [26]:
sales_train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
57605,2013-01-01,0,15,990,99.0,1.0,67
74711,2013-01-01,0,54,22134,399.0,2.0,40
74704,2013-01-01,0,54,22139,999.0,1.0,38
74684,2013-01-01,0,54,22151,399.0,1.0,40
48732,2013-01-01,0,15,11041,349.0,2.0,40


In [36]:
# Filtered shop_id
sales_train_df_31 = sales_train_df[sales_train_df['shop_id'] == 31]

In [37]:
sales_train_df_31

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
65477,2013-01-02,0,31,1013,299.00,2.0,67
70951,2013-01-02,0,31,12967,199.00,1.0,55
70959,2013-01-02,0,31,12946,149.00,1.0,40
70963,2013-01-02,0,31,13735,597.43,3.0,21
70981,2013-01-02,0,31,13736,598.50,1.0,21
...,...,...,...,...,...,...,...
2890492,2015-10-31,33,31,15256,399.00,1.0,63
2926378,2015-10-31,33,31,13159,299.00,1.0,30
2890567,2015-10-31,33,31,14839,249.00,2.0,65
2890547,2015-10-31,33,31,15151,299.00,1.0,43


In [39]:
# 1. Time Related Features
#####################################################
def create_date_features(df):
    sales_train_df_31['month'] = sales_train_df_31.date.dt.month
    sales_train_df_31['day_of_month'] = sales_train_df_31.date.dt.day
    sales_train_df_31['day_of_year'] = sales_train_df_31.date.dt.dayofyear
    sales_train_df_31['week_of_year'] = sales_train_df_31.date.dt.weekofyear
    sales_train_df_31['day_of_week'] = sales_train_df_31.date.dt.dayofweek + 1
    sales_train_df_31['year'] = sales_train_df_31.date.dt.year
    sales_train_df_31["is_wknd"] = sales_train_df_31.date.dt.weekday // 4
    sales_train_df_31["quarter"] = sales_train_df_31.date.dt.quarter
    sales_train_df_31['is_month_start'] = sales_train_df_31.date.dt.is_month_start.astype(int)
    sales_train_df_31['is_month_end'] = sales_train_df_31.date.dt.is_month_end.astype(int)
    sales_train_df_31['is_quarter_start'] = sales_train_df_31.date.dt.is_quarter_start.astype(int)
    sales_train_df_31['is_quarter_end'] = sales_train_df_31.date.dt.is_quarter_end.astype(int)
    sales_train_df_31['is_year_start'] = sales_train_df_31.date.dt.is_year_start.astype(int)
    sales_train_df_31['is_year_end'] = sales_train_df_31.date.dt.is_year_end.astype(int)
    # 0: Winter - 1: Spring - 2: Summer - 3: Fall
    sales_train_df_31["season"] = np.where(sales_train_df_31.month.isin([12,1,2]), 0, 1)
    sales_train_df_31["season"] = np.where(sales_train_df_31.month.isin([6,7,8]), 2, df["season"])
    sales_train_df_31["season"] = np.where(sales_train_df_31.month.isin([9, 10, 11]), 3, df["season"])
    return df
sales_train_df_31 = create_date_features(sales_train_df_31)

In [40]:
sales_train_df_31

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,month,day_of_month,day_of_year,week_of_year,day_of_week,year,is_wknd,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season
65477,2013-01-02,0,31,1013,299.00,2.0,67,1,2,2,1,3,2013,0,1,0,0,0,0,0,0,0
70951,2013-01-02,0,31,12967,199.00,1.0,55,1,2,2,1,3,2013,0,1,0,0,0,0,0,0,0
70959,2013-01-02,0,31,12946,149.00,1.0,40,1,2,2,1,3,2013,0,1,0,0,0,0,0,0,0
70963,2013-01-02,0,31,13735,597.43,3.0,21,1,2,2,1,3,2013,0,1,0,0,0,0,0,0,0
70981,2013-01-02,0,31,13736,598.50,1.0,21,1,2,2,1,3,2013,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2890492,2015-10-31,33,31,15256,399.00,1.0,63,10,31,304,44,6,2015,1,4,0,1,0,0,0,0,3
2926378,2015-10-31,33,31,13159,299.00,1.0,30,10,31,304,44,6,2015,1,4,0,1,0,0,0,0,3
2890567,2015-10-31,33,31,14839,249.00,2.0,65,10,31,304,44,6,2015,1,4,0,1,0,0,0,0,3
2890547,2015-10-31,33,31,15151,299.00,1.0,43,10,31,304,44,6,2015,1,4,0,1,0,0,0,0,3


In [21]:
# Filtered shop_id
sales_train_df_31 = sales_train_df[sales_train_df['shop_id'] == 31]

In [22]:
sales_train_df_31.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
65477,2013-01-02,0,31,1013,299.0,2.0,67
70951,2013-01-02,0,31,12967,199.0,1.0,55
70959,2013-01-02,0,31,12946,149.0,1.0,40
70963,2013-01-02,0,31,13735,597.43,3.0,21
70981,2013-01-02,0,31,13736,598.5,1.0,21


In [23]:
# Summary Stats for each store
sales_train_df_31.groupby(["shop_id"]).agg({"item_cnt_day": ["count","sum", "mean", "median", "std", "min", "max"]})

Unnamed: 0_level_0,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
Unnamed: 0_level_1,count,sum,mean,median,std,min,max
shop_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
31,233790,308603.0,1.320001,1.0,2.203512,-2.0,288.0


In [14]:
# Filtered shop_id
sales_train_df_31 = sales_train_df[sales_train_df['shop_id'] == 31]

In [15]:
# Sorted date in ascending order
sales_train_df_31 = sales_train_df_31.sort_values(by='date', ascending=True, na_position='first')

In [16]:
# Filtered item_category_id
sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]

  sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]


In [17]:
sales_train_df_31

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
60440,2013-01-02,0,31,4248,1993.12,1.0,23
72614,2013-01-02,0,31,17241,347.11,1.0,40
71078,2013-01-02,0,31,15060,399.00,1.0,30
63528,2013-01-02,0,31,7893,1980.00,1.0,6
71610,2013-01-02,0,31,11854,199.00,1.0,63
...,...,...,...,...,...,...,...
2925574,2015-10-31,33,31,8478,349.00,1.0,43
2928312,2015-10-31,33,31,3713,1999.00,1.0,23
2890918,2015-10-31,33,31,12974,249.00,1.0,55
2928073,2015-10-31,33,31,2955,932.67,1.0,19


In [None]:
#sales_31_group = sales_train_df_31.groupby(['date_block_num','month']).agg({'score': 'sum', 'num_attempts': 'sum'})

In [30]:
list_unique_cat = sales_train_df_31['item_category_id'].unique()

In [31]:
list_unique_cat

array([23, 40, 30,  6, 63, 22, 38, 75, 69,  5, 37,  2, 55, 45, 19, 21, 79,
       28, 25, 62, 41, 57, 15, 33, 72, 11, 14, 35, 83, 67, 70,  3, 65, 71,
       64, 43, 29, 61,  4, 73, 49, 60, 77, 56, 58, 18, 20, 12, 17,  8, 16,
       24,  7, 47, 80])

In [32]:
sales_train_df_31_cat = sales_train_df_31.set_index('item_category_id')

In [42]:
list_seq = []
for x in list_unique_cat:
    
    s = np.array(sales_train_df_31_cat.loc[x,['date_block_num','item_price','item_id','item_cnt_day']])
    
    list_seq.append(s)


Running RNN Model 

In [37]:
X = list_seq
y = 

In [None]:
model = Sequential()
model.add(layers.SimpleRNN(units=2, activation='tanh', input_shape=(4,3)))
model.add(layers.Dense(1, activation="linear"))

# The compilation
model.compile(loss='mse', 
              optimizer='rmsprop')  # Recommended optimizer for RNNs
# The fit
model.fit(X, y,
         batch_size=16,
         epochs=10, verbose=0)

# The prediction (one per sequence/city)
model.predict(X)

In [77]:
# date_block_num: 34
# shop_id: 60
# item_id: 21807

In [None]:
date_block_num, shop_id
0
1
2
3
4
5
6
7