# LGBM for the traffic flow data

https://www.kaggle.com/robikscube/tutorial-time-series-forecasting-with-xgboost

Importing libraries

In [None]:
# general routine set up:

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# get wd and change the wd plus import libraries
import os
os.getcwd()
#os.chdir("/Users/Manu/Dropbox/CBS MSc Thesis Research Folder/DATA & Code/Model Specific Notebooks")


# standard
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
from math import sqrt
import pickle

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.multioutput import MultiOutputRegressor
import tensorflow.keras.backend as K
from lightgbm import LGBMRegressor

Loading the data

In [None]:
# this allows for accessing files stored in your google drive using the path "/gdrive/"
# mounting google drive locally:
# NOTE(review): `google.colab` is presumably only available inside a Colab runtime, and
# `drive.mount` prompts interactively for an authorization code (see the cell output),
# so this cell cannot run unattended.

from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


In [None]:
# loading the hourly traffic data (1 year of data; June 2018 to June 2019)
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only load
# pickles that come from a trusted source.
filename = "/gdrive/My Drive/Colab Notebooks/taxi_series_H"
# the context manager closes the file even if unpickling raises
with open(filename, 'rb') as infile:
    taxidemand_ts = pickle.load(infile)

In [None]:
# sanity check: one year of hourly observations should give 365 * 24 = 8760 points
taxidemand_ts.shape

(8760,)

Preprocessing: already done


Time Series data must be re-framed as a supervised learning dataset before we can start using machine learning algorithms.  
There is no concept of input and output features in time series. Instead, we must choose the variable to be predicted and use feature engineering to construct all of the inputs that will be used to make predictions for future time steps.

## Feature Engineering for Time Series

[feature_eng_ts.png](attachment:feature_eng_ts.png)

In this tutorial, we will look at three classes of features that we can create from our time series dataset:

    Date Time Features: these are components of the time step itself for each observation.
    Lag Features: these are values at prior time steps.
    Window Features: these are a summary of values over a fixed window of prior time steps.


Lag features are the classical way that time series forecasting problems are transformed into supervised learning problems.

The goal of feature engineering is to provide strong and ideally simple relationships between new input features and the output feature for the supervised learning algorithm to model.

**Preprocessing for Multioutput Forecasting**

### Preprocessing (For multi output model)
For supervised machine learning methods to work on time series, it is necessary to do a certain amount of preprocessing. This includes feature generation such as embedding (lags). 

Preprocessing for 48 steps ahead

In [None]:
## Trim the final 192 - 24 = 168 hourly observations from the full series;
## the last 48 points that remain become the test window in the split below.
taxi_ts_48 = taxidemand_ts.iloc[:-(192 - 24)]
taxi_ts_48.shape

(8592,)

In [None]:
## Set parameters
data = taxi_ts_48.copy()
input_lags = 60 ## number of lags to be used for input. should be 2.5 times the seasonal period
output_lags = 48 ## number of future observations to be forecasted
n_test = 48 ## size of test set 


In [None]:
## Hold out the final n_test observations as the test set; everything before them is training data
train = data.iloc[:-n_test]
test = data.iloc[-n_test:]
print(train.shape, test.shape, sep="\n")

(8544,)
(48,)


In [None]:
## Build the supervised-learning frame. Each row holds the previous input_lags observations
## (columns t-input_lags .. t-1) followed by the next output_lags observations
## (columns t+0 .. t+output_lags-1).
n_train = len(train)

## Collect every shifted column first and build the frame in a single step; inserting
## 100+ columns one at a time into an empty DataFrame fragments it.
lag_columns = {'t-' + str(i): train.shift(i) for i in range(input_lags, 0, -1)}   ## input lags
lag_columns.update({'t+' + str(j): train.shift(-j) for j in range(output_lags)})  ## output lags
df = pd.DataFrame(lag_columns)

## remove the first input_lags rows and the last output_lags - 1 rows, which contain
## NaNs introduced by the shifts
df = df.iloc[input_lags:(n_train - output_lags + 1)]

print(df.shape)

(8437, 108)


In [None]:
## Split train into features X and targets Y
X_train = df.iloc[:, :input_lags]
y_train = df.iloc[:, input_lags:]

## The test features must not use any of the held-out test data, so they are the last
## input_lags observations of the training series. In the final training row those
## observations are split between the feature columns (the last input_lags - output_lags
## of them) and the target columns (all output_lags of them), so the two pieces are
## stitched together. This construction requires input_lags >= output_lags.
## pd.concat is used because Series.append was removed in pandas 2.0.
X_test = pd.concat([
    X_train.iloc[-1, output_lags:],  ## last (input_lags - output_lags) feature values
    y_train.iloc[-1, :],             ## all output_lags target values
])

In [None]:
## Convert everything to numpy arrays; the single test window becomes one row
X_train_multi = X_train.to_numpy()  ## (n - input_lags - output_lags - n_test + 1) x (input_lags)
y_train_multi = y_train.to_numpy()  ## (n - input_lags - output_lags - n_test + 1) x (output_lags)
X_test_multi = X_test.to_numpy().reshape(1, input_lags)  ## (1) x (input_lags)
y_test_multi = test.to_numpy().reshape(1, n_test)  ## (1) x (n_test)

## Report the type and shape of each array
for label, arr in (
    ("X_train_multi", X_train_multi),
    ("y_train_multi", y_train_multi),
    ("X_test_multi", X_test_multi),
    ("y_test_multi", y_test_multi),
):
    print(f"{label}: type: {type(arr)}\tshape: {arr.shape}")

X_train_multi: type: <class 'numpy.ndarray'>	shape: (8437, 60)
y_train_multi: type: <class 'numpy.ndarray'>	shape: (8437, 48)
X_test_multi: type: <class 'numpy.ndarray'>	shape: (1, 60)
y_test_multi: type: <class 'numpy.ndarray'>	shape: (1, 48)


$\textbf{General setup of the algorithm}:$ As far as general parameters go, the booster "gbtree" has been used here, ie. I have been using tree based models in each iteration instead of a linear model, which is rarely used. I have started out with a number of estimators of 1,000 and then I have fine tuned the following parameters after try 2: max_depth and min_child_weight (gsearch1), reg_alpha (gsearch2), gamma (gsearch3), subsample and colsample_bytree (gsearch4), reg_lambda (gsearch5).  
I have used a feature importance plot of the XGBClassifier model to select the most important features to include. I have included the top 10 features out of the 24 features provided from try 3 onwards.
The fine tuned parameters have the following influence on the xg boosting algorithm:  
  
$\textit{max_depth:}$ specifies the max depth of a tree and can be used to control overfitting as higher depth will allow the model to learn relations very specific to a particular sample   
$\textit{min_child_weight:}$ this sets the minimum sum of weights of all observations required in a child. Higher values prevent the model from learning too specific relations.  
$\textit{reg_alpha:}$ L1 regularization term. Can be used in case of high dimensionality to make the algorithm run faster. Can be a solution to overfitting in case of a relatively small dataset.  
$\textit{gamma:}$ sets the minimum loss function required to make a split.  
$\textit{subsample:}$ sets the fraction of observations to be randomly sampled for each tree. Lower values prevent overfitting, but too small values might lead to underfitting.  
$\textit{colsample_bytree:}$ fraction of columns to be random samples of each tree.  
$\textit{reg_lambda:}$ L2 regularization term. can be a solution to overfitting in case of a relatively small dataset. Can be explored to reduce overfitting.

# LightGBM (Gradient Boosting)



https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html

lightgbm parameters:

https://lightgbm.readthedocs.io/en/latest/Parameters.html


## Fitting the optimal model

# LightGBM optimal model: 48-step-ahead predictions



In [None]:
# training the best model (48 steps ahead)
# NOTE: `eval_metric` is an argument of LGBMRegressor.fit(), not of the constructor, where
# it is forwarded to LightGBM as an unrecognised parameter. The constructor-level
# equivalent is `metric`.
lgbm_reg_opt_new = LGBMRegressor(
    boosting_type="gbdt",
    objective="root_mean_squared_error",
    metric="rmse",
    random_state=1,
    learning_rate=0.1,
    n_estimators=500,
    max_depth=8,
    num_leaves=30,
    min_child_samples=20,
    min_child_weight=3,
    subsample=1.0,
    colsample_bytree=0.8,
    reg_lambda=0.7,
    reg_alpha=0.0,
)

# MultiOutputRegressor fits one copy of the regressor per target column (forecast step)
lgbm_moreg_fit_new = MultiOutputRegressor(lgbm_reg_opt_new).fit(X_train_multi, y_train_multi)

## Obtaining predictions

**Multi-output model**

LIGHTGBM

In [None]:
# 48 steps ahead predictions: the single test window (1 x input_lags) goes in,
# one forecast per target column comes out
lgbm_preds_new = lgbm_moreg_fit_new.predict(X_test_multi)

In [None]:
# MSE between the held-out 48-step window and the forecast
mse_lgbm_new = mean_squared_error(y_test_multi, lgbm_preds_new)
mse_lgbm_new

719932.0755213862

In [None]:
# RMSE: square root of the MSE computed in the previous cell
sqrt(mse_lgbm_new)

848.4881115969665

LGBM predictions 72 steps ahead

In [None]:
## Trim the final 192 - 2*24 = 144 hourly observations from the full series;
## the last 72 points that remain become the test window in the split below.
taxi_ts_72 = taxidemand_ts.iloc[:-(192 - 24 * 2)]
taxi_ts_72.shape

(8616,)

In [None]:
## Set parameters
data = taxi_ts_72.copy()
input_lags = 60 # 2 and a half times the seasonal period
output_lags = 72 # we predict 72 hours ahead
n_test = 72 # size of test set, equal to output_lags (changed)

In [None]:
## Hold out the final n_test observations as the test set; everything before them is training data
train = data.iloc[:-n_test]
test = data.iloc[-n_test:]
print(train.shape, test.shape, sep="\n")

(8544,)
(72,)


In [None]:
## Create lagged values for both the input window (input_lags) and the output window (output_lags)
data = train.copy()
n_train = len(data)

## Collect every shifted column first and build the frame in a single step; inserting
## 100+ columns one at a time into an empty DataFrame fragments it.
lag_columns = {'t-' + str(i): data.shift(i) for i in range(input_lags, 0, -1)}   ## lagged values for input
lag_columns.update({'t+' + str(j): data.shift(-j) for j in range(output_lags)})  ## lagged values for output
df = pd.DataFrame(lag_columns)

## drop the first input_lags rows and the last output_lags - 1 rows, which contain
## NaNs introduced by the shifts
df = df.iloc[input_lags:(n_train - output_lags + 1)]

In [None]:
## splitting the training set into labels and features
X_train = df.iloc[:, :input_lags]  # from the beginning to input_lags
Y_train = df.iloc[:, input_lags:]  # from input_lags to the end

## The test features are the last input_lags observations of the training series, i.e. the
## window immediately preceding the held-out test period. Taking them straight from `train`
## works for any combination of input_lags and output_lags. The float cast keeps the
## features floating point, like the shifted (NaN-padded) training columns.
X_test = train.iloc[-input_lags:].to_numpy(dtype=float).reshape(1, input_lags)
Y_test = test.iloc[:output_lags].values.reshape(1, output_lags)

X_train = X_train.values  # input_lags steps back
Y_train = Y_train.values  # output_lags steps ahead

## Report the type and shape of each array
for label, arr in (("X_train", X_train), ("Y_train", Y_train), ("X_test", X_test), ("Y_test", Y_test)):
    print(f"{label}: type: {type(arr)}\tshape: {arr.shape}")

X_train: type: <class 'numpy.ndarray'>	shape: (8413, 60)
Y_train: type: <class 'numpy.ndarray'>	shape: (8413, 72)
X_test: type: <class 'numpy.ndarray'>	shape: (1, 60)
Y_test: type: <class 'numpy.ndarray'>	shape: (1, 72)


In [None]:
# best model (72 steps ahead)
# NOTE: `eval_metric` is an argument of LGBMRegressor.fit(), not of the constructor, where
# it is forwarded to LightGBM as an unrecognised parameter. The constructor-level
# equivalent is `metric`.
lgbm_reg_opt_new = LGBMRegressor(
    boosting_type="gbdt",
    objective="root_mean_squared_error",
    metric="rmse",
    random_state=1,
    learning_rate=0.1,
    n_estimators=500,
    max_depth=8,
    num_leaves=30,
    min_child_samples=20,
    min_child_weight=3,
    subsample=1.0,
    colsample_bytree=0.8,
    reg_lambda=0.7,
    reg_alpha=0.0,
)

# MultiOutputRegressor fits one copy of the regressor per target column (forecast step)
lgbm_moreg_fit_new = MultiOutputRegressor(lgbm_reg_opt_new).fit(X_train, Y_train)





Obtaining predictions 72 hours ahead


In [None]:
# Predict from X_test, the feature window built for the 72-step horizon
# (not X_test_multi, which belongs to the 48-step section).
lgbm_preds_new = lgbm_moreg_fit_new.predict(X_test)

In [None]:
# MSE between the held-out 72-step window and the forecast
mse_lgbm_new = mean_squared_error(Y_test, lgbm_preds_new)
mse_lgbm_new

725239.0657029544

In [None]:
# RMSE: square root of the MSE computed in the previous cell
sqrt(mse_lgbm_new)

851.6096909400188

96 hours ahead predictions

In [None]:
## Trim the final 192 - 3*24 = 120 hourly observations from the full series;
## the last 96 points that remain become the test window in the split below.
taxi_ts_96 = taxidemand_ts.iloc[:-(192 - 24 * 3)]
taxi_ts_96.shape

(8640,)

In [None]:
## Set parameters
data = taxi_ts_96.copy()
input_lags = 60 # 2 and a half times the seasonal period
output_lags = 96 # we predict 96 hours ahead
n_test = 96 # size of test set, equal to output_lags (changed)

In [None]:
## Hold out the final n_test observations as the test set; everything before them is training data
train = taxi_ts_96.iloc[:-n_test]
test = taxi_ts_96.iloc[-n_test:]
print(train.shape, test.shape, sep="\n")

(8544,)
(96,)


In [None]:
## Create lagged values for both the input window (input_lags) and the output window (output_lags)
data = train.copy()
n_train = len(data)

## Collect every shifted column first and build the frame in a single step; inserting
## 100+ columns one at a time into an empty DataFrame fragments it.
lag_columns = {'t-' + str(i): data.shift(i) for i in range(input_lags, 0, -1)}   ## lagged values for input
lag_columns.update({'t+' + str(j): data.shift(-j) for j in range(output_lags)})  ## lagged values for output
df = pd.DataFrame(lag_columns)

## drop the first input_lags rows and the last output_lags - 1 rows, which contain
## NaNs introduced by the shifts
df = df.iloc[input_lags:(n_train - output_lags + 1)]

In [None]:
## splitting the training set into labels and features
X_train = df.iloc[:, :input_lags]  # from the beginning to input_lags
Y_train = df.iloc[:, input_lags:]  # from input_lags to the end

## The test features are the last input_lags observations of the training series, i.e. the
## window immediately preceding the held-out test period. Taking them straight from `train`
## works for any combination of input_lags and output_lags. The float cast keeps the
## features floating point, like the shifted (NaN-padded) training columns.
X_test = train.iloc[-input_lags:].to_numpy(dtype=float).reshape(1, input_lags)
Y_test = test.iloc[:output_lags].values.reshape(1, output_lags)

X_train = X_train.values  # input_lags steps back
Y_train = Y_train.values  # output_lags steps ahead

## Report the type and shape of each array
for label, arr in (("X_train", X_train), ("Y_train", Y_train), ("X_test", X_test), ("Y_test", Y_test)):
    print(f"{label}: type: {type(arr)}\tshape: {arr.shape}")

X_train: type: <class 'numpy.ndarray'>	shape: (8389, 60)
Y_train: type: <class 'numpy.ndarray'>	shape: (8389, 96)
X_test: type: <class 'numpy.ndarray'>	shape: (1, 60)
Y_test: type: <class 'numpy.ndarray'>	shape: (1, 96)


In [None]:
# best model (96 steps ahead)
# NOTE: `eval_metric` is an argument of LGBMRegressor.fit(), not of the constructor, where
# it is forwarded to LightGBM as an unrecognised parameter. The constructor-level
# equivalent is `metric`.
lgbm_reg_opt_new = LGBMRegressor(
    boosting_type="gbdt",
    objective="root_mean_squared_error",
    metric="rmse",
    random_state=1,
    learning_rate=0.1,
    n_estimators=500,
    max_depth=8,
    num_leaves=30,
    min_child_samples=20,
    min_child_weight=3,
    subsample=1.0,
    colsample_bytree=0.8,
    reg_lambda=0.7,
    reg_alpha=0.0,
)

# MultiOutputRegressor fits one copy of the regressor per target column (forecast step)
lgbm_moreg_fit_new = MultiOutputRegressor(lgbm_reg_opt_new).fit(X_train, Y_train)

Obtaining predictions 96 hours ahead

In [None]:
# Predict from X_test, the feature window built for the 96-step horizon
# (not X_test_multi, which belongs to the 48-step section).
lgbm_preds_new = lgbm_moreg_fit_new.predict(X_test)

In [None]:
# MSE between the held-out 96-step window and the forecast
mse_lgbm_new = mean_squared_error(Y_test, lgbm_preds_new)
mse_lgbm_new

647761.0871661515

In [None]:
# RMSE: square root of the MSE computed in the previous cell
sqrt(mse_lgbm_new)

804.8360622922854