In [5]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller

import xgboost as xgb

from math import sqrt
from datetime import datetime

from numpy import concatenate, percentile

from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, TimeDistributed, RepeatVector
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

!pip install pmdarima
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima.arima import auto_arima




In [6]:
#Set parameters to see all data
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [7]:
# Load Train dataset
train = pd.read_csv("train.csv")
train.head(5).append(train.tail(5))

Unnamed: 0,productsGroup_key,date_key,quantitySales
0,1,20190902,26784.0
1,1,20190903,7432.0
2,1,20190904,1424.0
3,1,20190905,608.0
4,1,20190906,776.0
242395,297,20211123,18.0
242396,297,20211124,16.0
242397,297,20211125,16.0
242398,297,20211126,16.0
242399,297,20211127,38.0


In [8]:
# Load submission dataset
submission = pd.read_csv("submission.csv")
submission.head(5).append(submission.tail(5))

Unnamed: 0,productsGroup_key,date_key,quantitySales,q_0850,q_0900,q_0920,q_0950,q_0990
0,1,20211128,0,0,0,0,0,0
1,1,20211129,0,0,0,0,0,0
2,1,20211130,0,0,0,0,0,0
3,1,20211201,0,0,0,0,0,0
4,1,20211202,0,0,0,0,0,0
8311,297,20211221,0,0,0,0,0,0
8312,297,20211222,0,0,0,0,0,0
8313,297,20211223,0,0,0,0,0,0
8314,297,20211224,0,0,0,0,0,0
8315,297,20211225,0,0,0,0,0,0


In [9]:
train['isStationary'] = 0
for product in train.productsGroup_key.unique():
  series = train[train.productsGroup_key == product].quantitySales
  X = series.values
  result = adfuller(X)
  if (result[1] > 0.05):
    train.loc[train['productsGroup_key'] == product, 'isStationary'] = 1

In [10]:
train['isStationary'].value_counts()

0    200862
1     41538
Name: isStationary, dtype: int64

In [11]:
def outlier_detection(arr, isStationary):
  q25, q75 = percentile(arr, 15), percentile(arr, 85)
  iqr = q75 - q25
  if (isStationary == 1):
    # calculate the outlier cutoff
    cut_off = iqr * 1.5
  else:
    # calculate the outlier cutoff
    cut_off = iqr * 0.5
  lower, upper = q25 - cut_off, q75 + cut_off
  return lower, upper

In [12]:
stationary_train = train[train['isStationary'] == 1].copy()
non_stationary_train = train[train['isStationary'] == 0].copy()

In [13]:
for key in stationary_train['productsGroup_key'].unique():
  lower_st, upper_st = outlier_detection(stationary_train[stationary_train['productsGroup_key'] == key]['quantitySales'], 1)

  stationary_train.loc[stationary_train['productsGroup_key'] == key, 'lower'] = lower_st
  stationary_train.loc[stationary_train['productsGroup_key'] == key, 'upper'] = upper_st

In [14]:
for key in non_stationary_train['productsGroup_key'].unique():
  lower_nst, upper_nst = outlier_detection(non_stationary_train[non_stationary_train['productsGroup_key'] == key]['quantitySales'], 0)

  non_stationary_train.loc[non_stationary_train['productsGroup_key'] == key, 'lower'] = lower_nst
  non_stationary_train.loc[non_stationary_train['productsGroup_key'] == key, 'upper'] = upper_nst

In [15]:
stationary_train.iloc[1]

productsGroup_key          60.00
date_key             20190902.00
quantitySales            2331.00
isStationary                1.00
lower                   -8474.85
upper                   14525.55
Name: 48315, dtype: float64

In [16]:
stationary_train.loc[stationary_train['quantitySales'] < stationary_train['lower'], 'quantitySales'] = stationary_train['lower']
stationary_train.loc[stationary_train['quantitySales'] > stationary_train['upper'], 'quantitySales'] = stationary_train['upper']
non_stationary_train.loc[non_stationary_train['quantitySales'] < non_stationary_train['lower'], 'quantitySales'] = non_stationary_train['lower']
non_stationary_train.loc[non_stationary_train['quantitySales'] > non_stationary_train['upper'], 'quantitySales'] = non_stationary_train['upper']


In [17]:
stationary_train.shape, non_stationary_train.shape

((41538, 6), (200862, 6))

In [18]:
train_df = stationary_train.append(non_stationary_train)
train_df.shape

(242400, 6)

In [19]:
train_df2 = train_df.copy()
train_df = train_df[train_df.productsGroup_key == 1]
train_df.shape

(818, 6)

In [20]:
# create a differenced series
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        # print(interval, dataset[i], dataset[i - interval])
        diff.append(value)
    return pd.Series(diff)

In [21]:
train_df.head()

Unnamed: 0,productsGroup_key,date_key,quantitySales,isStationary,lower,upper
0,1,20190902,11794.4,0,-3824.8,11794.4
1,1,20190903,7432.0,0,-3824.8,11794.4
2,1,20190904,1424.0,0,-3824.8,11794.4
3,1,20190905,608.0,0,-3824.8,11794.4
4,1,20190906,776.0,0,-3824.8,11794.4


In [22]:
train_df.index = pd.to_datetime(train_df.date_key)
train_df.head()

Unnamed: 0_level_0,productsGroup_key,date_key,quantitySales,isStationary,lower,upper
date_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1970-01-01 00:00:00.020190902,1,20190902,11794.4,0,-3824.8,11794.4
1970-01-01 00:00:00.020190903,1,20190903,7432.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020190904,1,20190904,1424.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020190905,1,20190905,608.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020190906,1,20190906,776.0,0,-3824.8,11794.4


In [23]:
diff_values = difference(train_df.quantitySales, 1)
diff_values


0     -4362.4
1     -6008.0
2      -816.0
3       168.0
4        32.0
        ...  
812   -8634.4
813    3664.0
814    -448.0
815    2184.0
816   -7984.0
Length: 817, dtype: float64

In [24]:
def split_data_ratio(df, train, submission):
  n_train = train.shape[0] / (train.shape[0] + submission.shape[0])
  n_train = round(df.shape[0] * n_train)
  return n_train

In [25]:
 # split train test datasets

n_train = split_data_ratio(diff_values, train, submission)
xtrain, xtest = diff_values[0:n_train], diff_values[n_train:]


In [26]:
xtrain.shape, xtest.shape

((790,), (27,))

In [27]:
xtrain

0     -4362.4
1     -6008.0
2      -816.0
3       168.0
4        32.0
        ...  
785     818.4
786       0.0
787   -8746.4
788   -2128.0
789    -840.0
Length: 790, dtype: float64

In [28]:
xtest

790       24.0
791     9832.0
792     1858.4
793    -8498.4
794     -712.0
795      232.0
796    -2736.0
797     1728.0
798     9986.4
799   -10466.4
800        0.0
801     8704.0
802    -5304.0
803    -4648.0
804     3904.0
805     7810.4
806    -1306.4
807   -10408.0
808     2184.0
809    -1664.0
810     -520.0
811    11714.4
812    -8634.4
813     3664.0
814     -448.0
815     2184.0
816    -7984.0
dtype: float64

In [29]:
# frame a sequence as a supervised learning problem
def timeseries_to_supervised(data, lag=1):
    dfx = pd.DataFrame(data)
    df = dfx.assign(**{
        '{} (t-{})'.format(col, t): dfx[col].shift(t)
        for t in range(lag+1)
        for col in dfx
    })

    df=df.drop([df.columns[0]], axis=1)
    df=df[df.columns[::-1]]
    return df[lag:]

In [30]:
# scale train and test data to [-1, 1] with MinMaxScaler
def scale(train, test):
    # fit scaler
    scaler = MinMaxScaler()  
    
    # StandardScaler()
    scaler = scaler.fit(train)

    # transform train
    train = train.reshape(train.shape[0], train.shape[1])
    tr_scaled = scaler.transform(train)

    # transform test
    test = test.reshape(test.shape[0], test.shape[1])
    ts_scaled = scaler.transform(test)
    
    return scaler, tr_scaled, ts_scaled

In [31]:
# reorganize dataset acording to window size
values_unscaled = np.concatenate((xtrain, xtest))
supervised_raw = timeseries_to_supervised(values_unscaled, 30)

In [32]:
supervised_raw


Unnamed: 0,0 (t-30),0 (t-29),0 (t-28),0 (t-27),0 (t-26),0 (t-25),0 (t-24),0 (t-23),0 (t-22),0 (t-21),0 (t-20),0 (t-19),0 (t-18),0 (t-17),0 (t-16),0 (t-15),0 (t-14),0 (t-13),0 (t-12),0 (t-11),0 (t-10),0 (t-9),0 (t-8),0 (t-7),0 (t-6),0 (t-5),0 (t-4),0 (t-3),0 (t-2),0 (t-1),0 (t-0)
30,-4362.4,-6008.0,-816.0,168.0,32.0,-704.0,528.0,6624.0,-6080.0,128.0,1432.0,-800.0,-1856.0,11714.4,-11178.4,5640.0,-5408.0,264.0,-664.0,-368.0,4928.0,-2648.0,2472.0,-1440.0,8402.4,-10898.4,-816.0,1112.0,448.0,-1184.0,-376.0
31,-6008.0,-816.0,168.0,32.0,-704.0,528.0,6624.0,-6080.0,128.0,1432.0,-800.0,-1856.0,11714.4,-11178.4,5640.0,-5408.0,264.0,-664.0,-368.0,4928.0,-2648.0,2472.0,-1440.0,8402.4,-10898.4,-816.0,1112.0,448.0,-1184.0,-376.0,728.0
32,-816.0,168.0,32.0,-704.0,528.0,6624.0,-6080.0,128.0,1432.0,-800.0,-1856.0,11714.4,-11178.4,5640.0,-5408.0,264.0,-664.0,-368.0,4928.0,-2648.0,2472.0,-1440.0,8402.4,-10898.4,-816.0,1112.0,448.0,-1184.0,-376.0,728.0,-104.0
33,168.0,32.0,-704.0,528.0,6624.0,-6080.0,128.0,1432.0,-800.0,-1856.0,11714.4,-11178.4,5640.0,-5408.0,264.0,-664.0,-368.0,4928.0,-2648.0,2472.0,-1440.0,8402.4,-10898.4,-816.0,1112.0,448.0,-1184.0,-376.0,728.0,-104.0,-624.0
34,32.0,-704.0,528.0,6624.0,-6080.0,128.0,1432.0,-800.0,-1856.0,11714.4,-11178.4,5640.0,-5408.0,264.0,-664.0,-368.0,4928.0,-2648.0,2472.0,-1440.0,8402.4,-10898.4,-816.0,1112.0,448.0,-1184.0,-376.0,728.0,-104.0,-624.0,576.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812,-384.0,7544.0,3352.0,818.4,0.0,-8746.4,-2128.0,-840.0,24.0,9832.0,1858.4,-8498.4,-712.0,232.0,-2736.0,1728.0,9986.4,-10466.4,0.0,8704.0,-5304.0,-4648.0,3904.0,7810.4,-1306.4,-10408.0,2184.0,-1664.0,-520.0,11714.4,-8634.4
813,7544.0,3352.0,818.4,0.0,-8746.4,-2128.0,-840.0,24.0,9832.0,1858.4,-8498.4,-712.0,232.0,-2736.0,1728.0,9986.4,-10466.4,0.0,8704.0,-5304.0,-4648.0,3904.0,7810.4,-1306.4,-10408.0,2184.0,-1664.0,-520.0,11714.4,-8634.4,3664.0
814,3352.0,818.4,0.0,-8746.4,-2128.0,-840.0,24.0,9832.0,1858.4,-8498.4,-712.0,232.0,-2736.0,1728.0,9986.4,-10466.4,0.0,8704.0,-5304.0,-4648.0,3904.0,7810.4,-1306.4,-10408.0,2184.0,-1664.0,-520.0,11714.4,-8634.4,3664.0,-448.0
815,818.4,0.0,-8746.4,-2128.0,-840.0,24.0,9832.0,1858.4,-8498.4,-712.0,232.0,-2736.0,1728.0,9986.4,-10466.4,0.0,8704.0,-5304.0,-4648.0,3904.0,7810.4,-1306.4,-10408.0,2184.0,-1664.0,-520.0,11714.4,-8634.4,3664.0,-448.0,2184.0


In [33]:
supervised_raw = supervised_raw.values.astype("float32")


In [34]:
scaler, train_scaled, test_scaled = scale(xtrain.values.reshape(len(xtrain), 1), xtest.values.reshape(len(xtest), 1))


In [35]:
train_scaled.shape, test_scaled.shape

((790, 1), (27, 1))

In [36]:
values_scaled = np.concatenate((train_scaled, test_scaled))
values_scaled.shape

(817, 1)

In [37]:
n_train

790

In [38]:
supervised = timeseries_to_supervised(values_scaled, 30)

supervised_values = supervised.values.astype('float32')


In [39]:
supervised_values.shape

(787, 31)

In [40]:

##split supervised data into train and test-sets
n_train = split_data_ratio(supervised_raw, train, submission)

supervised_train, supervised_test = supervised_raw[0:n_train], supervised_raw[n_train:]


In [41]:
supervised_train.shape

(761, 31)

In [42]:
supervised_test.shape

(26, 31)

In [43]:
##split supervised data into train and test-sets
train_scaled, test_scaled = supervised_values[0:n_train], supervised_values[n_train:]

In [44]:
train_X, train_y = train_scaled[:, :-1], train_scaled[:, -1]
test_X, test_y = test_scaled[:, :-1], test_scaled[:, -1]

In [45]:
train_y.reshape(train_y.shape[0], 1)


array([[0.4837751 ],
       [0.5309127 ],
       [0.49538872],
       [0.47318622],
       [0.52442276],
       [0.576684  ],
       [0.5128091 ],
       [0.56643665],
       [0.76489276],
       [0.0834472 ],
       [0.47011203],
       [1.        ],
       [0.26028147],
       [0.2798538 ],
       [0.59171337],
       [0.4106777 ],
       [0.48582456],
       [0.47045362],
       [0.6272373 ],
       [0.39803934],
       [0.6204058 ],
       [0.44210276],
       [0.9115316 ],
       [0.03449925],
       [0.46498838],
       [0.8755636 ],
       [0.2064148 ],
       [0.91768   ],
       [0.26164776],
       [0.24774559],
       [0.5186159 ],
       [0.47113678],
       [1.        ],
       [0.4225304 ],
       [0.13160951],
       [0.9453477 ],
       [0.4998292 ],
       [0.02425195],
       [0.4752357 ],
       [0.54047686],
       [0.67539966],
       [0.37276268],
       [0.9108485 ],
       [0.10486405],
       [0.4280981 ],
       [0.4663547 ],
       [1.        ],
       [0.499

In [46]:
# prepare train dataset for lstm
train_X_lstm = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X_lstm = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

In [47]:
train_X_lstm.shape, test_X_lstm.shape####PAY ATTENTION#####

((761, 1, 30), (26, 1, 30))

In [48]:
# prepare test dataset for lstm
train_Y_lstm = train_y.reshape((train_y.shape[0], 1, 1))
test_Y_lstm = test_y.reshape((test_y.shape[0], 1, 1))

In [49]:
test_scaled[:, -1]


array([0.919627  , 0.5791775 , 0.13697226, 0.4694289 , 0.5097349 ,
       0.38300997, 0.57360977, 0.9262194 , 0.05294439, 0.4998292 ,
       0.87146467, 0.27336386, 0.30137312, 0.6665186 , 0.83331054,
       0.44404975, 0.0554379 , 0.5930796 , 0.42878124, 0.4776267 ,
       1.        , 0.13116546, 0.65627134, 0.4807009 , 0.5930796 ,
       0.15893565], dtype=float32)

# **XGBRegressor**

In [50]:
from sklearn.model_selection import GridSearchCV


In [51]:
# XGBRegressor Training
now = datetime.now()
print("Process started at : ",now)

parameters = { 'gamma' : [0, 0.1, 0.3, 1], 'learning_rate' : [0.001, 0.01, 0.1], 
              'max_depth' : [2, 4, 6, 7, 12], 
              'n_estimators' : [10, 45, 90, 100, 150, 250],
              'nthread' : [-1], 'reg_alpha' : [1], 'reg_lambda' : [1], 'seed' : [10] }

bst = xgb.XGBRegressor()
xgb_grid = GridSearchCV(bst,
                        parameters,
                        cv=5,
                        n_jobs=-1,
                        verbose=True,
                        )
xgb_grid.fit(train_X, train_y, eval_set=[(train_X, train_y)], early_stopping_rounds=50)

end = datetime.now()
print("Process finished at : ", end)
print("Process took : ", end-now)

Process started at :  2022-05-19 10:39:26.796699
Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[0]	validation_0-rmse:0.194675
Will train until validation_0-rmse hasn't improved in 50 rounds.
[1]	validation_0-rmse:0.190501
[2]	validation_0-rmse:0.186963
[3]	validation_0-rmse:0.183896
[4]	validation_0-rmse:0.181228
[5]	validation_0-rmse:0.178938
[6]	validation_0-rmse:0.176818
[7]	validation_0-rmse:0.174837
[8]	validation_0-rmse:0.173173
[9]	validation_0-rmse:0.171617
[10]	validation_0-rmse:0.170215
[11]	validation_0-rmse:0.168969
[12]	validation_0-rmse:0.167827
[13]	validation_0-rmse:0.166773
[14]	validation_0-rmse:0.165536
[15]	validation_0-rmse:0.164569
[16]	validation_0-rmse:0.163655
[17]	validation_0-rmse:0.162725
[18]	validation_0-rmse:0.161875
[19]	validation_0-rmse:0.161075
[20]	validation_0-rmse:0.160226
[21]	validation_0-rmse:0.159558
[22]	validation_0-rmse:0.158854
[23]	validation_0-rmse:0.158212
[24]	validation_0-rmse:0.157592
[25]	validation_0-rmse:0.156951


In [52]:
xgb_grid.best_estimator_


XGBRegressor(max_depth=2, n_estimators=250, nthread=-1, reg_alpha=1, seed=10)

In [53]:
# inverse scaling for a forecasted value
def invert_scale(scaler, X, value):
    new_row = [x for x in X] + [value]
    array = np.array(new_row)
    array = array.reshape(1, len(array))

    inverted = scaler.inverse_transform(array)
    ##print("converting %s to %s" % (value,inverted[0, -1]))
    return inverted[0, -1]

In [54]:
# invert differenced value
def inverse_difference(history, yhat, interval=1):
    return yhat + history[-interval]

In [55]:

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((np.array(y_true) - np.array(y_pred)) / y_true)) * 100

In [56]:
#Invert scale predictions to time series
predictions = list()
start = 26 # test period
l = len(test_scaled) - start
print(l)
rmse = []
mape = []

0


In [57]:
X1, y = test_scaled[l, 0:-1], test_scaled[l, -1]


In [58]:
X1.shape, y.shape

((30,), ())

In [59]:
#Invert scale predictions to time series
predictions = list()
start = 26 # test period
l = len(test_scaled) - start
print(l)
rmse = []
mape = []

for i in range(len(test_scaled)):
    X1, y = test_scaled[l, 0:-1], test_scaled[l, -1]
    X1 = X1[-30:] # ts_window

    # prediction 
    # pred = predict()
    X1 = X1.reshape((1,-1))
    pred = xgb_grid.predict(X1)

    yhat = invert_scale(scaler, X1[0], pred)
        
    l=l+1

    yhat = inverse_difference(train_df['quantitySales'], yhat, len(test_scaled) + 1 - i) # if not stationary 

    predictions.append(yhat)

    rmse.append(sqrt(mean_squared_error([train_df['quantitySales'][n_train:][i]], [yhat])))
    mape.append(mean_absolute_percentage_error([train_df['quantitySales'][n_train:][i]], [yhat]))

0


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [60]:
print("Test RMSE:", np.mean(rmse))
print("Test MAPE:", np.mean(mape))

Test RMSE: 7885.618155497756
Test MAPE: 2956.786490999079


In [61]:
result_df = train_df[-26:]
result_df.head()

Unnamed: 0_level_0,productsGroup_key,date_key,quantitySales,isStationary,lower,upper
date_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1970-01-01 00:00:00.020211102,1,20211102,9936.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020211103,1,20211103,11794.4,0,-3824.8,11794.4
1970-01-01 00:00:00.020211104,1,20211104,3296.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020211105,1,20211105,2584.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020211106,1,20211106,2816.0,0,-3824.8,11794.4


In [62]:
len(predictions), result_df.shape

(26, (26, 6))

In [63]:
result_df['XGB_Predictions'] = predictions
result_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,productsGroup_key,date_key,quantitySales,isStationary,lower,upper,XGB_Predictions
date_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1970-01-01 00:00:00.020211102,1,20211102,9936.0,0,-3824.8,11794.4,5007.238268
1970-01-01 00:00:00.020211103,1,20211103,11794.4,0,-3824.8,11794.4,12900.267526
1970-01-01 00:00:00.020211104,1,20211104,3296.0,0,-3824.8,11794.4,14797.963205
1970-01-01 00:00:00.020211105,1,20211105,2584.0,0,-3824.8,11794.4,-5225.839705
1970-01-01 00:00:00.020211106,1,20211106,2816.0,0,-3824.8,11794.4,1276.412998
1970-01-01 00:00:00.020211107,1,20211107,80.0,0,-3824.8,11794.4,2861.918742
1970-01-01 00:00:00.020211108,1,20211108,1808.0,0,-3824.8,11794.4,2914.182341
1970-01-01 00:00:00.020211109,1,20211109,11794.4,0,-3824.8,11794.4,3958.009791
1970-01-01 00:00:00.020211110,1,20211110,1328.0,0,-3824.8,11794.4,16753.46943
1970-01-01 00:00:00.020211111,1,20211111,1328.0,0,-3824.8,11794.4,-4364.202498


In [64]:
result_df.to_csv("result.csv")

# **AutoArima**

In [65]:
train_df[:-26].head(5)

Unnamed: 0_level_0,productsGroup_key,date_key,quantitySales,isStationary,lower,upper
date_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1970-01-01 00:00:00.020190902,1,20190902,11794.4,0,-3824.8,11794.4
1970-01-01 00:00:00.020190903,1,20190903,7432.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020190904,1,20190904,1424.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020190905,1,20190905,608.0,0,-3824.8,11794.4
1970-01-01 00:00:00.020190906,1,20190906,776.0,0,-3824.8,11794.4


In [74]:
train_df.columns


Index(['productsGroup_key', 'date_key', 'quantitySales', 'isStationary', 'lower', 'upper'], dtype='object')

In [75]:
train_df_arima = train_df.drop(columns = ['productsGroup_key', 'date_key', 'isStationary', 'lower', 'upper'], axis=1)
train_df_arima.head()

Unnamed: 0_level_0,quantitySales
date_key,Unnamed: 1_level_1
1970-01-01 00:00:00.020190902,11794.4
1970-01-01 00:00:00.020190903,7432.0
1970-01-01 00:00:00.020190904,1424.0
1970-01-01 00:00:00.020190905,608.0
1970-01-01 00:00:00.020190906,776.0


In [76]:
arima_train = train_df_arima[:-26]
arima_test = train[-26:]

In [77]:
# Arima Training
now = datetime.now()
print("Arima process started at : ",now)

arima_stepwise_model = auto_arima(train_df_arima, start_p=0, start_q=0,
                           max_p=13, max_q=13, m=12,
                           start_P=0, seasonal=True,
                           d=1, D=1, trace=True,
                           error_action='ignore',  
                           suppress_warnings=True, 
                           stepwise=True,
                           n_jobs=4)#n_jobs for parallel process
end = datetime.now()
print("Arima process finished at : ", end)
print("Process took : ", end-now)

print(arima_stepwise_model)

Arima process started at :  2022-05-19 11:10:01.445091
Performing stepwise search to minimize aic


  'Falling back to stepwise parameter search.' % n_jobs)


 ARIMA(0,1,0)(0,1,1)[12]             : AIC=inf, Time=1.57 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=16560.599, Time=0.08 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=16236.397, Time=2.70 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=inf, Time=3.86 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=16468.872, Time=0.16 sec
 ARIMA(1,1,0)(2,1,0)[12]             : AIC=16145.625, Time=2.99 sec
 ARIMA(1,1,0)(2,1,1)[12]             : AIC=inf, Time=13.13 sec
 ARIMA(1,1,0)(1,1,1)[12]             : AIC=inf, Time=3.76 sec
 ARIMA(0,1,0)(2,1,0)[12]             : AIC=16157.186, Time=1.51 sec
 ARIMA(2,1,0)(2,1,0)[12]             : AIC=16069.639, Time=3.49 sec
 ARIMA(2,1,0)(1,1,0)[12]             : AIC=16209.572, Time=1.21 sec
 ARIMA(2,1,0)(2,1,1)[12]             : AIC=inf, Time=13.69 sec
 ARIMA(2,1,0)(1,1,1)[12]             : AIC=inf, Time=8.19 sec
 ARIMA(3,1,0)(2,1,0)[12]             : AIC=16036.064, Time=3.52 sec
 ARIMA(3,1,0)(1,1,0)[12]             : AIC=16177.660, Time=1.44 sec
 ARIMA(3,1,0)(

In [78]:
# Fit arima model
arima_predicts = arima_stepwise_model.fit(arima_train)

In [79]:
# Predict results with auto-arima
future_forecast = arima_predicts.predict(n_periods=26)

In [80]:
arima_rmse = sqrt(mean_squared_error(train_df_arima['quantitySales'][-26:], future_forecast))

In [81]:
arima_mape = mean_absolute_percentage_error(train_df_arima['quantitySales'][-26:], future_forecast)

In [82]:
print("Test RMSE:", np.mean(arima_rmse))
print("Test MAPE:", np.mean(arima_mape))

Test RMSE: 4071.817181438609
Test MAPE: 711.1249516531625


In [83]:
result_df['Arima_Predictions'] = future_forecast
result_df.to_csv("result.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
result_df.columns

Index(['productsGroup_key', 'date_key', 'quantitySales', 'isStationary', 'lower', 'upper', 'XGB_Predictions', 'Arima_Predictions'], dtype='object')

In [91]:
result_df[:26].shape

(26, 8)

In [315]:
# define parameters
verbose, epochs, batch_size = 1, 25, 50
n_timesteps, n_features, n_outputs = train_X_lstm.shape[1], train_X_lstm.shape[2], train_Y_lstm.shape[1]

In [316]:
n_timesteps, n_features, n_outputs

(1, 30, 1)

In [324]:
# define model
lstm_model = Sequential()
lstm_model.add(Conv1D(filters=32, kernel_size=5,
                  strides=1, padding="causal",
                  activation="relu",
                  input_shape=(n_timesteps, n_features)))

In [325]:
lstm_model.add(LSTM(500, activation='relu', return_sequences=True))
lstm_model.add(LSTM(500, activation='relu', return_sequences=True))

lstm_model.add(TimeDistributed(Dense(1, activation='relu')))
lstm_model.compile(loss='mae', optimizer=Adam(lr=0.001),
              metrics=['acc'])

  super(Adam, self).__init__(name, **kwargs)


In [326]:
train_X_lstm.shape, train_Y_lstm.shape

((761, 1, 30), (761, 1, 1))

In [327]:
now = datetime.now()

history = lstm_model.fit(train_X_lstm, train_Y_lstm, epochs=epochs, batch_size=batch_size, validation_data=(test_X_lstm, test_Y_lstm), 
                    callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=verbose, shuffle=False)

lstm_model.summary()

end = datetime.now()
print("LSTM process finished at : ", end)
print("LSTM process took : ", end-now)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_27 (Conv1D)          (None, 1, 32)             4832      
                                                                 
 lstm_60 (LSTM)              (None, 1, 500)            1066000   
                                                                 
 lstm_61 (LSTM)              (None, 1, 500)            2002000   
                                                                 
 time_distributed_29 (TimeDi  (None, 1, 1)             501       
 stributed)                                                      
                                                                 
Total params: 3,073,333
Trainable params: 3,073,333
Non-trainable params: 0
____________________________________________________

In [328]:
# Predict results with LSTM Model
lstm_predicts = lstm_model.predict(test_X_lstm, verbose=verbose)



In [329]:
# Invert scaled predictons to time series
predictions = list()
start = 26 # test period
l = len(test_scaled) - start

lstm_rmse = []
lstm_mape = []

for i in range(len(test_scaled)):
    X1, y = test_scaled[l, 0:-1], test_scaled[l, -1]
    X1 = X1[-30:] # ts_window

    X1 = X1.reshape((1,-1))

    yhat = invert_scale(scaler, X1[0], lstm_predicts[i][0])
        
    l=l+1

    yhat = inverse_difference(train_df['quantitySales'], yhat, len(test_scaled) + 1 - i)

    predictions.append(yhat)

    lstm_rmse.append(sqrt(mean_squared_error([train_df['quantitySales'][n_train:][i]], [yhat])))
    lstm_mape.append(mean_absolute_percentage_error([train_df['quantitySales'][n_train:][i]], [yhat]))

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [330]:
print("LSTM RMSE:", np.mean(lstm_rmse))
print("LSTM MAPE:", np.mean(lstm_mape))

LSTM RMSE: 11830.27692307689
LSTM MAPE: 978.0004163824686


In [104]:
print("LSTM RMSE:", np.mean(lstm_rmse))
print("LSTM MAPE:", np.mean(lstm_mape))

LSTM RMSE: 5828.060676541682
LSTM MAPE: 1684.9838376581667


In [331]:
result_df['LSTM_Predictions'] = predictions
result_df.head(9)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,productsGroup_key,date_key,quantitySales,isStationary,lower,upper,XGB_Predictions,Arima_Predictions,LSTM_Predictions
date_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1970-01-01 00:00:00.020211102,1,20211102,9936.0,0,-3824.8,11794.4,5007.238268,9739.642394,-11602.4
1970-01-01 00:00:00.020211103,1,20211103,11794.4,0,-3824.8,11794.4,12900.267526,6892.675315,-1770.4
1970-01-01 00:00:00.020211104,1,20211104,3296.0,0,-3824.8,11794.4,14797.963205,9120.716069,88.0
1970-01-01 00:00:00.020211105,1,20211105,2584.0,0,-3824.8,11794.4,-5225.839705,4970.170877,-8410.4
1970-01-01 00:00:00.020211106,1,20211106,2816.0,0,-3824.8,11794.4,1276.412998,199.404751,-9122.4
1970-01-01 00:00:00.020211107,1,20211107,80.0,0,-3824.8,11794.4,2861.918742,-119.007167,-8890.4
1970-01-01 00:00:00.020211108,1,20211108,1808.0,0,-3824.8,11794.4,2914.182341,4414.576242,-11626.4
1970-01-01 00:00:00.020211109,1,20211109,11794.4,0,-3824.8,11794.4,3958.009791,6011.956767,-9898.4
1970-01-01 00:00:00.020211110,1,20211110,1328.0,0,-3824.8,11794.4,16753.46943,4730.531214,88.0


In [332]:
result_df.to_csv("result.csv")

In [333]:
result_df.head()

Unnamed: 0_level_0,productsGroup_key,date_key,quantitySales,isStationary,lower,upper,XGB_Predictions,Arima_Predictions,LSTM_Predictions
date_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1970-01-01 00:00:00.020211102,1,20211102,9936.0,0,-3824.8,11794.4,5007.238268,9739.642394,-11602.4
1970-01-01 00:00:00.020211103,1,20211103,11794.4,0,-3824.8,11794.4,12900.267526,6892.675315,-1770.4
1970-01-01 00:00:00.020211104,1,20211104,3296.0,0,-3824.8,11794.4,14797.963205,9120.716069,88.0
1970-01-01 00:00:00.020211105,1,20211105,2584.0,0,-3824.8,11794.4,-5225.839705,4970.170877,-8410.4
1970-01-01 00:00:00.020211106,1,20211106,2816.0,0,-3824.8,11794.4,1276.412998,199.404751,-9122.4
