In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm
from statsmodels.tsa.statespace.sarimax import SARIMAX


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import psycopg2


## Load data from Postgres

In [3]:
# Set postgres credentials

load_dotenv()
user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')
host = os.getenv('DB_HOST')
port = os.getenv('DB_PORT')
db = os.getenv('DB')

In [4]:
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=user,
                        password=password,
                        ipaddress=host,
                        port=port,
                        dbname=db))
cnx = create_engine(postgres_str)

In [5]:
# List DB tables
conn = psycopg2.connect(postgres_str)
cursor = conn.cursor()
cursor.execute("""SELECT relname FROM pg_class WHERE relkind='r'
                  AND relname !~ '^(pg_|sql_)';""") # "rel" is short for relation.

tables_list = [i[0] for i in cursor.fetchall()] # A list() of tables.
tables_list #check available tables

['test',
 'shops',
 'sample_submission',
 'item_categories',
 'sales_train',
 'shops_en',
 'item_categories_en',
 'items_en',
 'items',
 'cleaning_store_id',
 'cleaning_item_category_id']

In [6]:
# Load datasets
shops_df = pd.read_sql_query('''SELECT * FROM shops_en;''', cnx)
item_categories_df = pd.read_sql_query('''SELECT * FROM item_categories_en;''', cnx)
test_df = pd.read_sql_query('''SELECT * FROM test;''', cnx)
sales_train_df = pd.read_sql_query('''SELECT * FROM sales_train;''', cnx)
items_df = pd.read_sql_query('''SELECT * FROM items_en;''', cnx)

In [7]:
items_df.info()
sales_train_df.isnull().sum() # check for NaN values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

## Preprocessing

In [8]:
# Merge with categories to get the Ids
sales_train_df = pd.merge(sales_train_df, items_df, left_on='item_id', right_on='item_id', how='left')

# Drop column that contais the item_name
sales_train_df.drop(labels='item_name', axis=1, inplace=True)

In [9]:
# Set date to YYYY/mm/dd
sales_train_df['date'] = pd.to_datetime(sales_train_df['date'], format='%d.%m.%Y')

# Drop duplicates
sales_train_df.drop_duplicates(inplace=True)

In [10]:
# Filtered shop_id
sales_train_df_31 = sales_train_df[sales_train_df['shop_id'] == 31]

In [11]:
# Sorted date in ascending order
sales_train_df_31 = sales_train_df_31.sort_values(by='date', ascending=True, na_position='first')

In [12]:
# Filtered item_category_id
sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]

  sales_train_df_31 = sales_train_df_31[~sales_train_df['item_category_id'].isin([0, 1,10,13,32,39,42,46,50,51,52,53,59,66,68,82])]


In [13]:
sales_train_df_31

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
60440,2013-01-02,0,31,4248,1993.12,1.0,23
72614,2013-01-02,0,31,17241,347.11,1.0,40
71078,2013-01-02,0,31,15060,399.00,1.0,30
63528,2013-01-02,0,31,7893,1980.00,1.0,6
71610,2013-01-02,0,31,11854,199.00,1.0,63
...,...,...,...,...,...,...,...
2925574,2015-10-31,33,31,8478,349.00,1.0,43
2928312,2015-10-31,33,31,3713,1999.00,1.0,23
2890918,2015-10-31,33,31,12974,249.00,1.0,55
2928073,2015-10-31,33,31,2955,932.67,1.0,19


In [14]:
#sales_31_group = sales_train_df_31.groupby(['date_block_num','month']).agg({'score': 'sum', 'num_attempts': 'sum'})

In [15]:
sales_train_df_31.shape

(233790, 7)

In [16]:
233790*.7

163653.0

In [18]:
train = sales_train_df_31.item_cnt_day[0:163653]
test = sales_train_df_31.item_cnt_day[163653:]

  train = sales_train_df_31.item_cnt_day[0:163653]
  test = sales_train_df_31.item_cnt_day[163653:]


In [25]:
smodel = pm.auto_arima(train, seasonal=True, m=4, 
                       start_p=0, max_p=1, max_d=1, start_q=0, max_q=1,
                       start_P=0, max_P=2, max_D=1, start_Q=0, max_Q=2, 
                       trace=True, error_action='ignore', suppress_warnings=True)

Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[4] intercept   : AIC=842668.853, Time=3.03 sec
 ARIMA(1,1,0)(1,0,0)[4] intercept   : AIC=795058.550, Time=11.51 sec
 ARIMA(0,1,1)(0,0,1)[4] intercept   : AIC=inf, Time=112.64 sec
 ARIMA(0,1,0)(0,0,0)[4]             : AIC=842666.853, Time=1.64 sec
 ARIMA(1,1,0)(0,0,0)[4] intercept   : AIC=795059.757, Time=3.56 sec
 ARIMA(1,1,0)(2,0,0)[4] intercept   : AIC=795046.630, Time=29.00 sec
 ARIMA(1,1,0)(2,0,1)[4] intercept   : AIC=795048.605, Time=40.45 sec
 ARIMA(1,1,0)(1,0,1)[4] intercept   : AIC=795060.487, Time=18.26 sec
 ARIMA(0,1,0)(2,0,0)[4] intercept   : AIC=842528.643, Time=22.09 sec
 ARIMA(1,1,1)(2,0,0)[4] intercept   : AIC=inf, Time=237.74 sec
 ARIMA(0,1,1)(2,0,0)[4] intercept   : AIC=inf, Time=234.42 sec
 ARIMA(1,1,0)(2,0,0)[4]             : AIC=795044.630, Time=13.29 sec
 ARIMA(1,1,0)(1,0,0)[4]             : AIC=795056.550, Time=6.19 sec
 ARIMA(1,1,0)(2,0,1)[4]             : AIC=795046.605, Time=17.69 sec
 ARIMA(1,1,0)

In [26]:
# Build Model
sarima = SARIMAX(train, order=(1, 1, 0), seasonal_order=(2, 0, 0, 4))
sarima = sarima.fit(maxiter=75)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.42923D+00    |proj g|=  1.97120D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      4      6      1     0     0   9.489D-06   2.429D+00
  F =   2.4290316413103223     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


In [27]:
# Forecast
results = sarima.get_forecast(len(test), alpha=0.05)
forecast = results.predicted_mean
confidence_int = results.conf_int()

  return get_prediction_index(


In [28]:
forecast.tail()

233785    1.656727
233786    1.656727
233787    1.656727
233788    1.656727
233789    1.656727
Name: predicted_mean, dtype: float64

In [30]:
sales_train_df_31.tail(50)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
2928632,2015-10-31,33,31,1583,3999.00,2.0,20
2925648,2015-10-31,33,31,8327,399.00,1.0,40
2925649,2015-10-31,33,31,8219,249.00,2.0,55
2925666,2015-10-31,33,31,8528,850.00,1.0,75
2889246,2015-10-31,33,31,20609,1099.00,1.0,72
...,...,...,...,...,...,...,...
2925574,2015-10-31,33,31,8478,349.00,1.0,43
2928312,2015-10-31,33,31,3713,1999.00,1.0,23
2890918,2015-10-31,33,31,12974,249.00,1.0,55
2928073,2015-10-31,33,31,2955,932.67,1.0,19


In [15]:
list_unique_cat = sales_train_df_31['item_category_id'].unique()

In [16]:
list_unique_cat

array([23, 40, 30,  6, 63, 22, 38, 75, 69,  5, 37,  2, 55, 45, 19, 21, 79,
       28, 25, 62, 41, 57, 15, 33, 72, 11, 14, 35, 83, 67, 70,  3, 65, 71,
       64, 43, 29, 61,  4, 73, 49, 60, 77, 56, 58, 18, 20, 12, 17,  8, 16,
       24,  7, 47, 80])

In [17]:
sales_train_df_31_cat = sales_train_df_31.set_index('item_category_id')

In [18]:
list_seq = []
for x in list_unique_cat:
    
    s = np.array(sales_train_df_31_cat.loc[x,['date_block_num','item_price','item_id','item_cnt_day']])
    
    list_seq.append(s)


Running RNN Model 

In [19]:
def create_date_features(df):
    sales_train_df['month'] = sales_train_df.date.dt.month
    sales_train_df['day_of_month'] = sales_train_df.date.dt.day
    sales_train_df['day_of_year'] = sales_train_df.date.dt.dayofyear
    sales_train_df['week_of_year'] = sales_train_df.date.dt.week
    sales_train_df['day_of_week'] = sales_train_df.date.dt.dayofweek + 1
    sales_train_df['year'] = sales_train_df.date.dt.year
    sales_train_df["is_wknd"] = sales_train_df.date.dt.weekday // 4
    sales_train_df["quarter"] = sales_train_df.date.dt.quarter
    sales_train_df['is_month_start'] = sales_train_df.date.dt.is_month_start.astype(int)
    sales_train_df['is_month_end'] = sales_train_df.date.dt.is_month_end.astype(int)
    sales_train_df['is_quarter_start'] = sales_train_df.date.dt.is_quarter_start.astype(int)
    sales_train_df['is_quarter_end'] = sales_train_df.date.dt.is_quarter_end.astype(int)
    sales_train_df['is_year_start'] = sales_train_df.date.dt.is_year_start.astype(int)
    sales_train_df['is_year_end'] = sales_train_df.date.dt.is_year_end.astype(int)

sales_train_df_features = create_date_features(sales_train_df)

  sales_train_df['week_of_year'] = sales_train_df.date.dt.week


In [25]:
sales_train_df_features

AttributeError: 'list' object has no attribute 'shape'

In [24]:
X_pad = pad_sequences(X) 
X_pad

ValueError: Shape of sample () of sequence at position 45 is different from expected shape (4,)

In [None]:
model = Sequential()
model.add(layers.SimpleRNN(units=2, activation='tanh', input_shape=(4,3)))
model.add(layers.Dense(1, activation="linear"))

# The compilation
model.compile(loss='mse', 
              optimizer='rmsprop')  # Recommended optimizer for RNNs
# The fit
model.fit(X, y,
         batch_size=16,
         epochs=10, verbose=0)

# The prediction (one per sequence/city)
model.predict(X)

In [77]:
# date_block_num: 34
# shop_id: 60
# item_id: 21807

In [None]:
date_block_num, shop_id
0
1
2
3
4
5
6
7