# Using XGBoost to Forecast NSW Energy Demand
This notebook develops an XGBoost model for forecasting energy demand in NSW, with the particular goal of examining how residential (small-scale) solar PV affects the daily maximum, minimum and range of demand.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import multiprocessing

In [2]:
# apply seaborn's default theme to all plots that follow
sns.set_theme()

In [3]:
# Read the raw daily data and index it by timestamp.
df = pd.read_csv('../data/raw/all_data.csv')
df['datetime'] = pd.to_datetime(df['datetime'])
# sort_index() returns a NEW frame, so its result must be assigned; the
# time-series cross-validation further down relies on rows being in
# chronological order.
df = df.set_index('datetime').sort_index()
df.head()

Unnamed: 0_level_0,demand_min,demand_max,temp_min,temp_max,temp_mean,units,cum_units,output,cum_output,population
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-01-01,6157.36,8922.42,22.1,28.8,25.094,1790.0,21518.0,2767.019,30362.926,7144292.0
2010-01-02,6112.73,9326.64,21.6,29.4,24.765385,1810.677419,21596.419355,2811.171258,30496.336935,7144495.0
2010-01-03,6014.91,8277.85,17.9,21.5,19.429825,1831.354839,21674.83871,2855.323516,30629.747871,7144699.0
2010-01-04,6023.79,9522.3,17.9,23.9,20.625926,1852.032258,21753.258065,2899.475774,30763.158806,7144902.0
2010-01-05,6287.12,10728.72,15.4,27.7,22.660417,1872.709677,21831.677419,2943.628032,30896.569742,7145106.0


In [4]:
def create_date_predictors(dataframe):
    """Return a copy of ``dataframe`` with calendar columns derived from its index.

    The columns ``dayofweek``, ``month``, ``year``, ``dayofyear`` and
    ``dayofmonth`` are appended in that order, each read from the matching
    attribute of the frame's index (``dayofmonth`` comes from ``index.day``).
    The input frame itself is not modified.
    """
    # new column name -> index attribute that supplies its values
    index_attr_for = {
        'dayofweek': 'dayofweek',
        'month': 'month',
        'year': 'year',
        'dayofyear': 'dayofyear',
        'dayofmonth': 'day',
    }
    enriched = dataframe.copy()
    for column, attr in index_attr_for.items():
        enriched[column] = getattr(enriched.index, attr)
    return enriched

In [5]:
# Add the calendar predictors (dayofweek, month, year, dayofyear, dayofmonth) derived from the datetime index
df = create_date_predictors(df)

In [6]:
# preview the first rows, now including the date predictor columns
df.head()

Unnamed: 0_level_0,demand_min,demand_max,temp_min,temp_max,temp_mean,units,cum_units,output,cum_output,population,dayofweek,month,year,dayofyear,dayofmonth
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2010-01-01,6157.36,8922.42,22.1,28.8,25.094,1790.0,21518.0,2767.019,30362.926,7144292.0,4,1,2010,1,1
2010-01-02,6112.73,9326.64,21.6,29.4,24.765385,1810.677419,21596.419355,2811.171258,30496.336935,7144495.0,5,1,2010,2,2
2010-01-03,6014.91,8277.85,17.9,21.5,19.429825,1831.354839,21674.83871,2855.323516,30629.747871,7144699.0,6,1,2010,3,3
2010-01-04,6023.79,9522.3,17.9,23.9,20.625926,1852.032258,21753.258065,2899.475774,30763.158806,7144902.0,0,1,2010,4,4
2010-01-05,6287.12,10728.72,15.4,27.7,22.660417,1872.709677,21831.677419,2943.628032,30896.569742,7145106.0,1,1,2010,5,5


In [7]:
# Five chronological CV folds, each evaluated on a 365-row test window
# (one year, given one row per day)
splits = TimeSeriesSplit(test_size=365, n_splits=5)
# Features: every column except the two demand targets
X = df.drop(columns=['demand_max', 'demand_min'])
# Targets: daily minimum and daily maximum demand
y_min, y_max = df['demand_min'], df['demand_max']

In [19]:
# Hyper-parameter grid: 4 depths x 5 ensemble sizes x 3 learning rates = 60 candidates
search_params = dict(
    max_depth=[2, 4, 6, 8],
    n_estimators=[100, 250, 500, 750, 1000],
    learning_rate=[0.01, 0.05, 0.1],
)
# Each XGBoost fit is given half of the reported CPUs, and the grid search
# runs two fits at a time.
xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2)
# No `scoring` is passed, so candidates are ranked by the estimator's own
# default score (R^2 for a regressor) across the time-series folds.
clf_max = GridSearchCV(
    estimator=xgb_model,
    param_grid=search_params,
    cv=splits,
    n_jobs=2,
    verbose=True,
)

In [20]:
# Run the grid search against daily maximum demand; extra keyword arguments
# such as `verbose` are forwarded to the underlying estimator's fit().
clf_max.fit(X, y_max, verbose=100)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [21]:
# hyper-parameter combination with the best mean cross-validated score
clf_max.best_params_

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}