# **Stacked models**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy.stats import skew

In [2]:
df_train = pd.read_csv('/content/train_data.csv', parse_dates=["date"])
df_test_data = pd.read_csv('/content/test_data.csv', parse_dates=["date"])
df_test = df_test_data.copy()
df_train

Unnamed: 0,date,hour,demand
0,2018-08-18,9,91
1,2018-08-18,10,21
2,2018-08-18,13,23
3,2018-08-18,14,104
4,2018-08-18,15,81
...,...,...,...
18242,2021-02-28,19,95
18243,2021-02-28,20,88
18244,2021-02-28,21,39
18245,2021-02-28,22,104


In [3]:
# generating multiple features from date for train data

df_train['dayofweek'] = df_train['date'].dt.dayofweek
df_train['quarter'] = df_train['date'].dt.quarter
df_train['month'] = df_train['date'].dt.month
df_train['monthofquarter'] = df_train['month']%4
df_train['dayofyear'] = df_train['date'].dt.dayofyear
df_train['dayofmonth'] = df_train['date'].dt.day
df_train['weekofyear'] = df_train['date'].dt.weekofyear
df_train['year'] = df_train['date'].dt.year
df_train.drop('date',axis=1, inplace=True)

cols = list(df_train.columns)
cols.remove('demand')
df_train

  if __name__ == '__main__':


Unnamed: 0,hour,demand,dayofweek,quarter,month,monthofquarter,dayofyear,dayofmonth,weekofyear,year
0,9,91,5,3,8,0,230,18,33,2018
1,10,21,5,3,8,0,230,18,33,2018
2,13,23,5,3,8,0,230,18,33,2018
3,14,104,5,3,8,0,230,18,33,2018
4,15,81,5,3,8,0,230,18,33,2018
...,...,...,...,...,...,...,...,...,...,...
18242,19,95,6,1,2,2,59,28,8,2021
18243,20,88,6,1,2,2,59,28,8,2021
18244,21,39,6,1,2,2,59,28,8,2021
18245,22,104,6,1,2,2,59,28,8,2021


In [4]:
# generating multiple features from date for test data

df_test['dayofweek'] = df_test['date'].dt.dayofweek
df_test['quarter'] = df_test['date'].dt.quarter
df_test['month'] = df_test['date'].dt.month
df_test['monthofquarter'] = df_test['month']%4
df_test['dayofyear'] = df_test['date'].dt.dayofyear
df_test['dayofmonth'] = df_test['date'].dt.day
df_test['weekofyear'] = df_test['date'].dt.weekofyear
df_test['year'] = df_test['date'].dt.year
df_test.drop('date',axis=1, inplace=True)

df_test

  if __name__ == '__main__':


Unnamed: 0,hour,dayofweek,quarter,month,monthofquarter,dayofyear,dayofmonth,weekofyear,year
0,0,0,1,3,3,60,1,9,2021
1,1,0,1,3,3,60,1,9,2021
2,2,0,1,3,3,60,1,9,2021
3,3,0,1,3,3,60,1,9,2021
4,5,0,1,3,3,60,1,9,2021
...,...,...,...,...,...,...,...,...,...
7645,19,0,1,3,3,87,28,13,2022
7646,20,0,1,3,3,87,28,13,2022
7647,21,0,1,3,3,87,28,13,2022
7648,22,0,1,3,3,87,28,13,2022


In [5]:
# we are going to keep only 5% of the dataset in test dataset

X = df_train.loc[:, cols].values
y = df_train.loc[:, 'demand'].values
X_test_dataset = df_test.loc[:, cols].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/20)

In [31]:
# hyperparameter tuning of xgb
"""
{'learning_rate': 0.07, 'max_depth': 7, 'n_estimators': 200}
{'learning_rate': 0.02, 'max_depth': 7, 'n_estimators': 400}
"""
# parameters = {
#                 'learning_rate': [0.02, 0.1],
#                 'max_depth': [7, 35, 100],
#                 'n_estimators': [200,  600]
#             }

# xgb = XGBRegressor()
# hyper_params = GridSearchCV(estimator=xgb, param_grid=parameters, n_jobs=-1, cv=5)
# hyper_params.fit(X_train, y_train)

# # find out the best hyper parameters
# hyper_params.best_params_

"\n{'learning_rate': 0.07, 'max_depth': 7, 'n_estimators': 200}\n{'learning_rate': 0.02, 'max_depth': 7, 'n_estimators': 400}\n"

In [30]:
# hyperparameter tuning of random forest
"""
{'max_depth': 7, 'n_estimators': 1000}
{'max_depth': 60, 'n_estimators': 1500}
"""
# parameters = {
#                 'max_depth': [30, 100],
#                 'n_estimators': [  1500, 2000]
#             }

# rf = RandomForestRegressor()
# hyper_params = GridSearchCV(estimator=rf, param_grid=parameters, n_jobs=-1, cv=5)

# hyper_params.fit(X_train, y_train)
# # find out the best hyper parameters
# hyper_params.best_params_

"\n{'max_depth': 7, 'n_estimators': 1000}\n{'max_depth': 60, 'n_estimators': 1500}\n"

In [25]:
# hyperparameter tuning of lgbm
"""
{'learning_rate': 0.15,
 'n_estimators': 200,
 'num_leaves': 12,
 'objective': 'regression'}
 
 {'learning_rate': 0.02,
 'n_estimators': 300,
 'num_leaves': 70,
 'objective': 'regression'}
"""
parameters = {  'objective':['regression'],
                'learning_rate': [0.02, 0.09],
                'num_leaves': [ 30, 70, 100],
                'n_estimators': [ 200, 300, 500]
            }
lgb1 = lgb.LGBMRegressor()
hyper_params = GridSearchCV(estimator=lgb1, param_grid=parameters, n_jobs=-1, cv=5)

hyper_params.fit(X_train, y_train)
# find out the best hyper parameters
hyper_params.best_params_

{'learning_rate': 0.02,
 'n_estimators': 300,
 'num_leaves': 70,
 'objective': 'regression'}

In [28]:
# ensembled prediction using rf over splitted test data
rand_forest_regressor = RandomForestRegressor(max_depth = 60, n_estimators = 1500)
rand_forest_regressor.fit(X_train, y_train)

y_rand_forest_predict = rand_forest_regressor.predict(X_test)
random_forest_model_error = sqrt(mean_squared_error(y_test, y_rand_forest_predict))
print(f' Random Forest Mean Squared Error - {random_forest_model_error}')

 Random Forest Mean Squared Error - 34.41734686532569


In [22]:
# ensembled prediction using xgb over splitted test data
XGB_model = XGBRegressor(learning_rate = 0.02, max_depth = 7, n_estimators = 400)
XGB_model.fit(X_train, y_train)
y_XGB_predict = XGB_model.predict(X_test)
print(y_XGB_predict.shape, y_test.shape)
XGB_model_error = sqrt(mean_squared_error(y_test, y_XGB_predict))

print(f'XGBoost Mean Squared Error - {XGB_model_error}')

(913,) (913,)
XGBoost Mean Squared Error - 32.31509767681853


In [20]:
# ensembled prediction using lgb over splitted test data
lgb_model = lgb.LGBMRegressor(objective='regression',num_leaves=70, n_estimators=300, learning_rate = 0.02 )

lgb_model.fit(X_train, y_train)
y_LGB_predict = lgb_model.predict(X_test)

LGB_model_error = sqrt(mean_squared_error(y_test, y_LGB_predict))

print(f'LGBM Mean Squared Error - {LGB_model_error}')

LGBM Mean Squared Error - 32.26877864425272


In [12]:
# taking average of different predictions
ensembled_prediction = (0.33*y_XGB_predict + 0.33*y_LGB_predict + 0.33 *y_rand_forest_predict)
ensembled_prediction_error = sqrt(mean_squared_error(y_test, ensembled_prediction))

print(f'Ensembled Mean Squared Error - {ensembled_prediction_error}')

Ensembled Mean Squared Error - 32.68483869663642


In [33]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn import preprocessing

x_data = df_train[cols].values 
y_data = df_train['demand']

min_max_scaler = preprocessing.MinMaxScaler()

x_data = min_max_scaler.fit_transform(x_data)
y_data = y_data.values
print(x_data.shape, y_data.shape)

regressor = Sequential()
regressor.add(LSTM(units = 50, return_sequences = True, input_shape = (x_data.shape[1], 1)))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.2))
regressor.add(Dense(units = 1))
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

regressor.fit(x_data, y_data, epochs = 150, batch_size = 32, validation_split=1/20)

x_test = min_max_scaler.transform(df_test.values)
print(x_test.shape)
y_predict = regressor.predict(x_test)

(18247, 9) (18247,)
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
E

In [34]:
# fit whole data using random forest 
rand_forest_regressor = RandomForestRegressor(max_depth = 60, n_estimators= 1500)
rand_forest_regressor.fit(X, y)
y_rand_forest_predict = rand_forest_regressor.predict(X_test_dataset)
# fit whole data using xgb
XGB_model = XGBRegressor(learning_rate = 0.02, max_depth = 7, n_estimators = 400)
XGB_model.fit(X, y)
y_XGB_predict = XGB_model.predict(X_test_dataset)
# fit whole data using lgbm
lgb_model = lgb.LGBMRegressor(objective='regression',num_leaves=70, n_estimators=300, learning_rate = 0.02)
lgb_model.fit(X, y)
y_LGB_predict = lgb_model.predict(X_test_dataset)

# ensembled prediction over test data using xgb and random forest
ensembled_prediction1 = (0.5*y_XGB_predict + 0.5 *y_rand_forest_predict)

# ensembled prediction over test data using lgb and random forest
ensembled_prediction2 = (0.5*y_LGB_predict + 0.5 *y_rand_forest_predict)

ensembled_prediction1a = (0.5*y_LGB_predict + 0.5 *y_XGB_predict)

# ensembled prediction over test data lgb, xgb, and rf
ensembled_prediction3 = (0.33*y_LGB_predict + 0.33*y_XGB_predict + 0.33 *y_rand_forest_predict)

# ensembled prediction over test data lgb, xgb, and rf
ensembled_prediction4 = (0.25*y_LGB_predict + 0.25*y_XGB_predict + 0.25 *y_rand_forest_predict + 0.25*y_predict)



In [35]:
df_test_data['demand'] = ensembled_prediction1
df_test_data.to_csv('xgb_rf.csv', index=False)
df_test_data['demand'] = ensembled_prediction2
df_test_data.to_csv('lgb_rf.csv', index=False)
df_test_data['demand'] = ensembled_prediction3
df_test_data.to_csv('lgb_xgb_rf.csv', index=False)
df_test_data['demand'] = ensembled_prediction1a
df_test_data.to_csv('lgb_xgb.csv', index=False)
df_test_data['demand'] = ensembled_prediction4
df_test_data.to_csv('lstm_lgb_xgb_rf.csv', index=False)