# Machine learning application: Forecasting wind power

<table>
  <tr><td>
    <img src="https://github.com/dmatrix/mlflow-workshop-part-3/raw/master/images/wind_farm.jpg"
         alt="Wind farm" width="800">
  </td></tr>
</table>

In this notebook, we will use the MLflow Model Registry to build a machine learning application that forecasts the daily power output of a [wind farm](https://en.wikipedia.org/wiki/Wind_farm). 

Wind farm power output depends on weather conditions: generally, more energy is produced at higher wind speeds. Accordingly, the machine learning models used in the notebook predict power output based on weather forecasts with three kinds of features: `wind direction`, `wind speed`, and `air temperature`. In the dataset each of these appears in three columns, suffixed `_00`, `_08`, and `_16`, for nine feature columns in total.

*This notebook uses altered data from the [National WIND Toolkit dataset](https://www.nrel.gov/grid/wind-toolkit.html) provided by NREL, which is publicly available and cited as follows:*

*Draxl, C., B.M. Hodge, A. Clifton, and J. McCaa. 2015. Overview and Meteorological Validation of the Wind Integration National Dataset Toolkit (Technical Report, NREL/TP-5000-61740). Golden, CO: National Renewable Energy Laboratory.*

*Draxl, C., B.M. Hodge, A. Clifton, and J. McCaa. 2015. "The Wind Integration National Dataset (WIND) Toolkit." Applied Energy 151: 355–366.*

*Lieberman-Cribbin, W., C. Draxl, and A. Clifton. 2014. Guide to Using the WIND Toolkit Validation Code (Technical Report, NREL/TP-5000-62595). Golden, CO: National Renewable Energy Laboratory.*

*King, J., A. Clifton, and B.M. Hodge. 2014. Validation of Power Output for the WIND Toolkit (Technical Report, NREL/TP-5D00-61714). Golden, CO: National Renewable Energy Laboratory.*


### Classes and Utility functions

In [1]:
!pip install mlflow



In [2]:
import pandas as pd
import time
import warnings
from mlflow.tracking.client import MlflowClient
from mlflow.entities.model_registry.model_version_status import ModelVersionStatus
warnings.filterwarnings("ignore")

In [3]:
class Utils:
  """Static helpers that load the wind farm CSV and slice it into datasets.

  The slicing helpers select rows with date-string labels such as
  "2014-01-01", so they expect a frame whose index holds dates (as produced
  by ``load_data`` on the wind farm CSV) and which has a "power" column.
  """

  @staticmethod
  def load_data(path, index_col=0):
    """Read a CSV file into a DataFrame.

    :param path: file path, URL or file-like object accepted by ``pd.read_csv``.
    :param index_col: column to use as the row index (default: first column).
    :return: the loaded DataFrame.
    """
    # Honour the caller's index_col; it used to be silently replaced by 0.
    return pd.read_csv(path, index_col=index_col)

  @staticmethod
  def _features_and_target(df, start, end):
    """Return (X, y) for the rows labelled ``start`` through ``end``, inclusive."""
    subset = pd.DataFrame(df[start:end])
    # "power" is the dependent variable; every other column is a feature.
    return subset.drop(columns="power"), subset["power"]

  @staticmethod
  def get_training_data(df):
    """Return features and power for rows labelled 2014-01-01 .. 2018-01-01.

    NOTE(review): label slicing includes both endpoints, so a row labelled
    "2018-01-01" is returned here and by ``get_validation_data`` as well.
    """
    return Utils._features_and_target(df, "2014-01-01", "2018-01-01")

  @staticmethod
  def get_validation_data(df):
    """Return features and power for rows labelled 2018-01-01 .. 2019-01-01."""
    return Utils._features_and_target(df, "2018-01-01", "2019-01-01")

  @staticmethod
  def get_weather_and_forecast(df):
    """Return weather features around today and the power observed so far.

    The window runs from 5 days before today to 5 days after today. If the
    frame holds fewer than 10 rows in that window, the last 10 rows of the
    frame are used instead, and the first 5 of those count as the past.

    :param df: frame indexed by "%Y-%m-%d" date strings with a "power" column.
    :return: tuple of (feature frame for the whole window without "power",
             "power" Series for the past part of the window).
    """
    format_date = lambda pd_date : pd_date.date().strftime("%Y-%m-%d")
    frame = pd.DataFrame(df)

    # Window boundaries: 5 days either side of today (midnight).
    today = pd.Timestamp('today').normalize()
    window_start = today - pd.Timedelta(days=5)
    window_end = today + pd.Timedelta(days=5)

    # Observed power output from the start of the window up to today
    past_power_output = frame[format_date(window_start):format_date(today)]

    # Weather and forecast over the whole window
    weather_and_forecast = frame[format_date(window_start):format_date(window_end)]
    if len(weather_and_forecast) < 10:
      # Not enough rows around today: fall back to the tail of the frame.
      # Rows 10th-from-last through 6th-from-last stand in for the past ...
      past_power_output = frame.iloc[-10:-5]
      # ... and the last 10 rows stand in for the whole window.
      weather_and_forecast = frame.iloc[-10:]

    return weather_and_forecast.drop(columns="power"), past_power_output["power"]

In [27]:
import pandas as pd
import matplotlib.dates as mdates
from matplotlib import pyplot as plt

In [28]:
class PlotUtils:
    """Helpers that score a model on recent weather data and plot the result."""

    @staticmethod
    def plot(model_uri, power_predictions, past_power_output):
        """Plot predicted power output against the power observed so far.

        :param model_uri: model URI; only used in the legend label.
        :param power_predictions: predictions whose ``index`` supplies the
            x-axis values (formatted as month/day).
        :param past_power_output: observed power, drawn against the first
            ``len(past_power_output)`` entries of the prediction index.
        """
        index = power_predictions.index
        fig = plt.figure(figsize=(11, 7))
        ax = fig.add_subplot(111)
        ax.set_xlabel("Date", size=20, labelpad=20)
        ax.set_ylabel("Power\noutput\n(MW)", size=20, labelpad=60, rotation=0)
        ax.tick_params(axis='both', which='major', labelsize=17)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d'))
        ax.plot(index[:len(past_power_output)], past_power_output, label="True", color="red", alpha=0.5, linewidth=4)
        ax.plot(index, power_predictions, "--", label="Predicted by {}".format(model_uri), color="blue", linewidth=3)

        # Leave 30% headroom above the highest prediction, but never show
        # less than 0..3500. Taking the max over the flattened values works
        # for a Series and for a DataFrame with any number of columns.
        peak_prediction = power_predictions.values.max()
        ax.set_ylim(ymin=0, ymax=max(3500, int(peak_prediction * 1.3)))
        ax.legend(fontsize=14)
        plt.title("Wind farm power output and projections", size=24, pad=20)
        plt.tight_layout()
        # plt.show() renders the figure and returns None, so wrapping it in
        # display() would only add a stray "None" to the cell output.
        plt.show()

    @staticmethod
    def forecast_power(model_uri, wind_farm_data):
        '''
        Load the pyfunc flavor of the model at ``model_uri``, predict power
        for the weather window selected by ``Utils.get_weather_and_forecast``
        and plot the predictions next to the observed power output.

        :param model_uri: URI understood by ``mlflow.pyfunc.load_model``.
        :param wind_farm_data: frame of weather features plus a "power" column.
        '''
        print("Loading registered model version from URI: '{model_uri}'".format(model_uri=model_uri))
        model = mlflow.pyfunc.load_model(model_uri)
        weather_data, past_power_output = Utils.get_weather_and_forecast(wind_farm_data)

        # Score our model
        power_predictions = pd.DataFrame(model.predict(weather_data))
        # Re-index the predictions by date so the x-axis formatter can render them.
        power_predictions.index = pd.to_datetime(weather_data.index)
        PlotUtils.plot(model_uri, power_predictions, past_power_output)

In [5]:
import mlflow

print("Using mlflow version {}".format(mlflow.__version__))

Using mlflow version 1.11.0


In [16]:
import mlflow.sklearn
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [17]:
class RFRModel():
  """Random forest regressor whose training run is tracked with MLflow.

  Wraps a scikit-learn ``RandomForestRegressor`` built from ``params`` and
  keeps the validation MSE / RMSE of the most recent ``mlflow_run``.
  """

  def __init__(self, params=None):
    """
      :param params: keyword arguments for ``RandomForestRegressor``;
                     ``None`` (the default) means the estimator's defaults.
    """
    # Copy the mapping: a mutable default would be shared by every instance,
    # and storing the caller's dict would alias it.
    self.params = dict(params) if params else {}
    self.rf = RandomForestRegressor(**self.params)
    self._mse = None
    self._rsme = None

  @classmethod
  def new_instance(cls, params=None):
    """Alternate constructor; equivalent to ``cls(params)``."""
    return cls(params)

  def model(self):
    """Return the underlying scikit-learn estimator."""
    return self.rf

  @property
  def mse(self):
    """Validation mean squared error of the last run (None before any run)."""
    return self._mse

  @mse.setter
  def mse(self, value):
    self._mse = value

  @property
  def rsme(self):
    """Validation root mean squared error (historical spelling, kept for callers)."""
    return self._rsme

  @rsme.setter
  def rsme(self, value):
    self._rsme = value

  @property
  def rmse(self):
    """Correctly spelled alias of ``rsme``."""
    return self._rsme

  @rmse.setter
  def rmse(self, value):
    self._rsme = value

  def mlflow_run(self, X_train, y_train, val_x, val_y, run_name="Random Forest Regressor: Power Forecasting Model"):
    """
      Train the model inside an MLflow run using the Fluent APIs and log
      its parameters, validation metrics and the fitted model as an artifact.

      :param X_train: training features.
      :param y_train: training targets.
      :param val_x: validation features.
      :param val_y: validation targets.
      :param run_name: name given to the MLflow run.
      :return: the id of the MLflow run.
    """
    with mlflow.start_run(run_name=run_name) as run:

      # Single call to log all parameters
      mlflow.log_params(self.params)

      # Standard code for training the model
      self.rf.fit(X_train, y_train)

      # scikit-learn's signature is (y_true, y_pred)
      predictions = self.rf.predict(val_x)
      self._mse = mean_squared_error(val_y, predictions)
      self._rsme = np.sqrt(self._mse)
      # %d prints only the integer part; the full values are logged below.
      print("Validation MSE: %d" % self._mse)
      print("Validation RMSE: %d" % self._rsme)

      # Log metrics
      mlflow.log_metric("mse", self._mse)
      mlflow.log_metric("rmse", self._rsme)

      # Log the model as an artifact
      mlflow.sklearn.log_model(sk_model=self.model(),
        artifact_path="sklearn-model")
      run_id = run.info.run_id
    return run_id

In [18]:
# Wind farm records (weather feature columns plus the "power" target), read
# straight from the workshop's GitHub repository.
csv_path = "https://raw.githubusercontent.com/dmatrix/mlflow-workshop-part-3/master/src/data/windfarm_data.csv"
wind_farm_data = Utils.load_data(csv_path, index_col=0)
# Model name that the "models:/" registry URI is built from further down.
model_name="PowerForecastingModel"

In [19]:
wind_farm_data.head(5)

Unnamed: 0,temperature_00,wind_direction_00,wind_speed_00,temperature_08,wind_direction_08,wind_speed_08,temperature_16,wind_direction_16,wind_speed_16,power
2014-01-01,4.702022,106.74259,4.743292,7.189482,100.41638,6.593832,8.172301,99.288,5.967206,1959.3535
2014-01-02,7.695733,98.036705,6.142715,9.977118,94.03181,4.383676,9.690135,204.25444,1.696528,1266.6239
2014-01-03,9.608235,274.0612,10.514304,10.840864,242.87563,16.869741,8.991079,250.2683,12.038399,7545.6797
2014-01-04,6.955563,257.91022,7.18917,5.317223,254.2617,9.069233,3.021174,284.06537,4.590843,3791.0408
2014-01-05,0.830547,265.3944,4.263086,2.480239,104.79496,3.042063,4.227131,263.4169,3.899182,880.6115


In [20]:
# Split the frame into feature matrices (X) and "power" targets (y) for
# training and for validation.
X_train, y_train = Utils.get_training_data(wind_farm_data)
X_val, y_val = Utils.get_validation_data(wind_farm_data)

In [21]:
# Use sqlite:///mlruns.db as the local store for tracking and the registry
mlflow.set_tracking_uri("sqlite:///mlruns.db")

In [22]:
# Hyperparameter combinations to try; each one produces its own MLflow run.
params_list = [
        {"n_estimators": 100, "max_depth": 10},
        {"n_estimators": 200,"max_depth": 15},
        {"n_estimators": 300, "max_depth": 20 }]


# Iterate over a few different tuning parameters
for params in params_list:
  
  # Create a new instance
  rfr = RFRModel.new_instance(params)
  print("Using paramerts={}".format(params))
  
  # Track all parameters, metrics, and artifacts (model)
  runID = rfr.mlflow_run(X_train, y_train, X_val, y_val)
  print("MLflow run_id={} completed with MSE={} and RMSE={}".format(runID, rfr.mse,rfr.rsme))

Using paramerts={'n_estimators': 100, 'max_depth': 10}


2020/09/13 19:43:45 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2020/09/13 19:43:45 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

Validation MSE: 46685
Validation RMSE: 216
MLflow run_id=0702cdac8e594b03b8d05c71c8b2ab79 completed with MSE=46685.9426201263 and RMSE=216.06930050362615
Using paramerts={'n_estimators': 200, 'max_depth': 15}


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


Validation MSE: 45393
Validation RMSE: 213


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


MLflow run_id=abe18254d4274d8fbe2757e23debfe99 completed with MSE=45393.60795511416 and RMSE=213.0577573220796
Using paramerts={'n_estimators': 300, 'max_depth': 20}


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


Validation MSE: 44030
Validation RMSE: 209


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


MLflow run_id=41a9f2334881462cac7af307eb3a6c7d completed with MSE=44030.004388595866 and RMSE=209.83327760056522


In [23]:
# Start the MLflow tracking UI on port 5000, pointed at the same SQLite store
# that was set as the tracking URI; the trailing "&" puts the command in the
# background.
get_ipython().system_raw("mlflow ui --backend-store-uri sqlite:///mlruns.db --port 5000 &")# run tracking UI in the background

In [31]:
import mlflow.pyfunc
# Registry URI for whichever version of the model is in the "production" stage.
# NOTE(review): despite its name, model_staging_uri points at production, not staging.
# NOTE(review): no cell in this notebook registers the model or moves a version
# to Production - presumably that is done by hand in the MLflow UI before this
# cell is run; confirm.
model_staging_uri = "models:/{model_name}/production".format(model_name=model_name)

print("Loading registered model version from URI: '{model_uri}'".format(model_uri=model_staging_uri))
model_production = mlflow.pyfunc.load_model(model_staging_uri)

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


Loading registered model version from URI: 'models:/PowerForecastingModel/production'


In [37]:
# Load test data
# NOTE(review): unlike the training CSV, this is a relative local path, so the
# file has to exist under data/ in the notebook's working directory.
score_weather_cvs = "data/score_windfarm_data.csv"
score_df = Utils.load_data(score_weather_cvs,index_col=0)
score_df.head()

Unnamed: 0,temperature_00,wind_direction_00,wind_speed_00,temperature_08,wind_direction_08,wind_speed_08,temperature_16,wind_direction_16,wind_speed_16,power
2020-12-27,7.123225,103.17663,8.133746,6.454002,107.79322,6.326991,7.219884,119.070526,3.062219,2621.476
2020-12-28,5.37627,118.08433,5.558247,8.118839,116.193535,8.565966,9.307176,120.26443,11.993913,5423.625
2020-12-29,8.593436,115.43259,12.18185,8.587968,112.93136,11.970859,8.956771,110.161095,11.301485,9132.115
2020-12-30,8.069033,103.169685,9.983466,7.930485,106.04551,6.381556,8.228901,111.60216,4.087358,3667.9927


In [36]:
# Drop the target column before scoring. score_df is reassigned in place, so on
# a second run of this cell "power" is already gone; errors="ignore" keeps the
# cell re-runnable instead of raising KeyError.
score_df = score_df.drop(columns=["power"], errors="ignore")
predictions = model_production.predict(score_df)
print(predictions)

[2801.430854   5112.81041533 8761.61678367 3800.698117  ]
