# Load Forecasting Training

This notebook reads the consumption data that the pre-processing step joined with weather observations, and trains one forecasting model per group.

We use scikit-learn's `GradientBoostingRegressor`, with lagged consumption values as additional features, to build a time-series forecasting model.

In [None]:
# sklearn
# ==============================================================================
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_error, accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor, plot_importance
import cloudpickle

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

# Data manipulation
# ==============================================================================
import numpy as np
import pandas as pd

# PySpark 
# ==============================================================================
from pyspark.sql.functions import col, sum,avg, lit

import os



## Import Prepped Data

We import the data from the PreProcess Step

In [None]:
# mount the data lake
### 
###  It is up to you on how to mount the data lake container - however, it must be mounted to /mnt/lf for this example
###

Mounted load-forecasting successfully
Out[5]: 'wasbs://load-forecasting@stgmadlssharedcc.blob.core.windows.net'

In [None]:
# Location of the prepped consumption + weather data on the mounted data lake.
prepped_data_path = '/mnt/lf/training/consumptiondata_withweather_by_group.parquet'

training_df = spark.read.parquet(prepped_data_path)

# Preview the data as loaded; note this is rendered BEFORE nulls are replaced below.
display(training_df)

# Replace missing values with the sentinel 999 so no nulls reach model training.
training_df = training_df.na.fill(999)

ForecastTimestamp,TemperatureC,DewPointC,RelativeHumidity,PrecipitationAmountmm,WindDirectionDegrees,WindSpeedKmh,VisibilityKm,StationPressurekPa,Humidex,WindChillC,Weather,KWHConsumption,Grouping
2022-01-01T00:00:00.000+0000,-25.799999237060547,-29.200000762939453,73.0,0.0,23.0,15.0,,88.5999984741211,,-36.0,,63.92999992519617,80
2022-01-01T01:00:00.000+0000,-25.600000381469727,-29.200000762939453,72.0,0.0,23.0,13.0,,88.52999877929688,,-35.0,,81.27999954484403,80
2022-01-01T02:00:00.000+0000,-25.600000381469727,-29.399999618530277,70.0,0.0,23.0,15.0,,88.48999786376953,,-36.0,,77.45999948680401,80
2022-01-01T03:00:00.000+0000,-25.200000762939453,-28.799999237060547,72.0,0.0,23.0,9.0,,88.43000030517578,,-33.0,,74.98999984003603,80
2022-01-01T04:00:00.000+0000,-25.299999237060547,-29.100000381469727,70.0,0.0,20.0,4.0,,88.38999938964844,,-29.0,,75.10999999567866,80
2022-01-01T05:00:00.000+0000,-25.100000381469727,-28.899999618530277,71.0,0.0,21.0,4.0,,88.33000183105469,,-29.0,,64.67999978549778,80
2022-01-01T06:00:00.000+0000,-24.100000381469727,-27.899999618530277,71.0,0.0,22.0,8.0,,88.25,,-31.0,,65.93000025860965,80
2022-01-01T07:00:00.000+0000,-24.700000762939453,-28.5,71.0,0.0,22.0,1.0,,88.25,,-26.0,,63.04000023007393,80
2022-01-01T08:00:00.000+0000,-23.700000762939453,-27.5,71.0,0.0,24.0,4.0,,88.18000030517578,,-28.0,,58.24999981932342,80
2022-01-01T09:00:00.000+0000,-23.5,-27.700000762939453,68.0,0.0,20.0,5.0,,88.1500015258789,,-28.0,,53.65999955870211,80


## Hyperparameters

**TODO:** tune these hyperparameters with a grid search instead of hard-coding them.

In [None]:

# Fixed seed so that re-running the notebook reproduces the same models.
RANDOM_SEED = 42

# Settings for XGBRegressor (kept as an alternative to the sklearn model).
xgboost_hyperparams = {
    "n_estimators": 1000,
    "max_depth": 4,
    "subsample": 1.0,  # use all samples for every tree
    "colsample_bytree": 1.0,  # use all feature columns for every tree
    "eta": 0.01,
    "random_state": RANDOM_SEED,
}

# Settings for sklearn's GradientBoostingRegressor (the model currently trained).
gbr_hyperparams = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "random_state": RANDOM_SEED,
    #"loss": "squared_error",
}

## Pandas Training / sklearn Training

In [None]:
from datetime import datetime

def train_meter_model(meter_train_pdf):
    """Train, evaluate and persist a gradient-boosting load forecast for one group.

    Written to be applied per group (e.g. via ``groupby(...).applyInPandas``),
    so it receives all rows of a single ``Grouping`` value as a pandas frame.

    Parameters
    ----------
    meter_train_pdf : pandas.DataFrame
        Rows of one group. Must contain the columns ``ForecastTimestamp``,
        ``Weather``, ``Grouping`` and ``KWHConsumption``. Every other column
        is used as a model feature, together with calendar and lag features
        derived here. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``group``, ``mae``, ``rmse`` and
        ``model_name``. ``mae``, ``rmse`` and ``model_name`` are ``None`` when
        no rows are left for training after the lag warm-up rows and the
        hold-out rows have been removed.

    Side effects
    ------------
    Creates ``/dbfs/mnt/lf/models/<group>/`` and, when a model is trained,
    writes the pickled model and a PNG comparing predictions with the
    hold-out values into it. Progress is reported with ``print``.
    """
    start = datetime.now()

    # Positional access: the incoming index is not guaranteed to contain label 0.
    grouping = str(meter_train_pdf["Grouping"].iloc[0])

    model_name = 'MeterModel_'+grouping+'.pkl'
    model_dir = '/dbfs/mnt/lf/models/'+grouping
    model_save_path = model_dir + '/' + model_name

    # makedirs creates missing parent directories and, with exist_ok, does not
    # fail when the directory already exists (or is created concurrently).
    os.makedirs(model_dir, exist_ok=True)

    print("training meter model: "+str(grouping))

    target_column = 'KWHConsumption'
    lags_to_use = [1,2,3,4,22,23,24,46,47,48]
    # Number of most recent rows held out for evaluation
    # (presumably hourly rows, i.e. about 4 days - TODO confirm).
    test_size = 96

    # Index by time and drop the columns that are not model inputs.
    # assign() returns a new frame, so the caller's data is left untouched.
    # 'Weather' is dropped for now; using it is a possible future improvement.
    features_pdf = (
        meter_train_pdf
        .assign(ForecastTime=pd.to_datetime(meter_train_pdf['ForecastTimestamp']))
        .set_index('ForecastTime')
        .sort_index()
        .drop(columns=['ForecastTimestamp', 'Weather', 'Grouping'])
    )

    # Calendar features derived from the timestamp index.
    features_pdf['ReadingMonth'] = features_pdf.index.month
    features_pdf['ReadingWeekDay'] = features_pdf.index.day_of_week + 1
    features_pdf['ReadingHour'] = features_pdf.index.hour + 1
    features_pdf['ReadingDay'] = features_pdf.index.day

    # Lag features: the target value from `lag` rows earlier.
    for lag in lags_to_use:
        features_pdf['lag_'+str(lag)] = features_pdf[target_column].shift(lag)
    # The first max(lags) rows have incomplete lag values, so discard them.
    features_pdf = features_pdf.iloc[max(lags_to_use):]

    # Separate the target from the features.
    consumption = features_pdf[target_column]
    features_pdf = features_pdf.drop(columns=target_column)

    # Chronological split: the last `test_size` rows form the hold-out set.
    X_train = features_pdf.iloc[:-test_size]
    X_test = features_pdf.iloc[-test_size:]
    Y_train = consumption.iloc[:-test_size]
    Y_test = consumption.iloc[-test_size:]

    if len(X_train) == 0:
        # Not enough history for this group: report it without training.
        return pd.DataFrame({'group':grouping,'mae':[None], 'rmse': [None], 'model_name':[None]})

    print("training meter model size: "+str(grouping) +":"+str(len(X_train)))

    print("fiting meter model: "+str(grouping))

    # XGBRegressor(**xgboost_hyperparams) was tried here as an alternative model.
    model = GradientBoostingRegressor(**gbr_hyperparams)
    model.fit(X_train, Y_train)

    # Predict once and reuse the result for both metrics and the plot.
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(Y_test, y_pred)
    print("The mean absolute error (MAE) on test set: {:.4f}".format(mae))

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(Y_test, y_pred)))
    print("The root mean squared error (RMSE) on test set: {:.4f}".format(rmse))

    with open(model_save_path, mode='wb') as file:
        cloudpickle.dump(model, file)

    print("completed training meter model: "+str(grouping))

    delta = datetime.now() - start

    # time difference in seconds
    print(f"Model train time {grouping}: {delta.total_seconds()} seconds")

    # Save a plot of predictions against the hold-out values next to the model.
    fig, ax = plt.subplots(figsize=(12, 3.5))
    ax.plot(X_test.index, y_pred, linewidth=2, color="red", label="pred")
    ax.plot(X_test.index, Y_test, linewidth=2, color="blue", label="test")
    ax.legend()
    fig.savefig(model_dir + '/'+model_name+'_test_results.png')
    # Close the figure so figures do not pile up in memory across groups.
    plt.close(fig)

    return pd.DataFrame({'group':grouping,'mae':[mae], 'rmse': [rmse], 'model_name':[model_name]})

In [None]:
#training_df.repartition("MeterNumber")  
#Temp
# Define the per-group training job: each "Grouping" value is handed to
# train_meter_model as a pandas DataFrame, and the returned one-row frames
# are combined using the schema below.
result_schema = "group string, mae float, rmse float, model_name string"

train_result = (
    training_df
    .groupby("Grouping")
    .applyInPandas(train_meter_model, schema = result_schema)
)

In [None]:
#train ALL the models, let's gooooooo
#display(train_result.head(1))

# Run the training with a single Spark action. train_result is lazy and not
# cached, so every action on it re-runs the per-group training; the previous
# collect() + write pair therefore trained each model twice and threw the
# collected rows away.
# "overwrite" lets the notebook be re-run without failing on results left
# behind by an earlier run (the model files themselves are overwritten too).
train_result.write.mode("overwrite").parquet('/mnt/lf/training/train_results_group1.parquet')

