In [85]:
import datetime as dt
import math

import ephem
import plotly.express as px
import plotly.io as pio
import polars as pl
from flaml.automl import AutoML

from wtg_power_prediction.dataset import load_training_dataset, load_turbine_metadata

pio.templates.default = "plotly_dark"


turbines = [1, 2, 3, 4, 5, 7]
df = load_training_dataset()

In [86]:
def sun_altitude(timestamp_utc: dt.datetime, observer: ephem.Observer, sun: ephem.Sun) -> float:
    observer.date = timestamp_utc
    sun.compute(observer)
    return sun.alt


wf_lat_lon = load_turbine_metadata().select(pl.col("Latitude").mean(), pl.col("Longitude").mean()).collect()


observer = ephem.Observer()
observer.lat = str(wf_lat_lon.select("Latitude").item())
observer.lon = str(wf_lat_lon.select("Longitude").item())
sun = ephem.Sun()


df = (
    df.with_columns(
        pl.col("TimeStamp_StartFormat")
        .sub(dt.datetime(2016, 1, 1, tzinfo=dt.UTC))
        .dt.total_seconds()
        .alias("seconds_since_2016"),
        *[pl.col(f"wtc_ScYawPos_mean;{wtg}").radians().sin().alias(f"wtc_ScYawPos_mean_sin;{wtg}") for wtg in turbines],
        *[pl.col(f"wtc_ScYawPos_mean;{wtg}").radians().cos().alias(f"wtc_ScYawPos_mean_cos;{wtg}") for wtg in turbines],
        pl.col("TimeStamp_StartFormat").dt.minute().mul(2 * math.pi / 60).sin().alias("minutes_sin"),
        pl.col("TimeStamp_StartFormat").dt.minute().mul(2 * math.pi / 60).cos().alias("minutes_cos"),
        pl.col("TimeStamp_StartFormat").dt.hour().mul(2 * math.pi / 24).sin().alias("hours_sin"),
        pl.col("TimeStamp_StartFormat").dt.hour().mul(2 * math.pi / 24).cos().alias("hours_cos"),
        pl.col("TimeStamp_StartFormat").dt.ordinal_day().mul(2 * math.pi / 365).sin().alias("days_sin"),
        pl.col("TimeStamp_StartFormat").dt.ordinal_day().mul(2 * math.pi / 365).cos().alias("days_cos"),
        pl.col("TimeStamp_StartFormat").dt.month().mul(2 * math.pi / 12).sin().alias("months_sin"),
        pl.col("TimeStamp_StartFormat").dt.month().mul(2 * math.pi / 12).cos().alias("months_cos"),
        pl.concat_list([pl.col(f"wtc_AmbieTmp_mean;{wtg}") for wtg in turbines]).list.mean().alias("ambient_temp_mean"),
    )
    .filter(pl.col("is_valid"))
    .collect()
    .with_columns(
        pl.col("TimeStamp_StartFormat")
        .map_elements(lambda ts: sun_altitude(ts, observer, sun), return_dtype=pl.Float64)
        .mul(180 / math.pi)
        .alias("sun_altitude"),
    )
)

In [87]:
test_year = 2019
ref_wtgs = [2, 3, 4, 5, 7]
test_wtg = 1


target_ws = f"wtc_AcWindSp_mean;{test_wtg}"
target_power_data = df.select("target").to_series()


df_ws = df.select(
    pl.col("TimeStamp_StartFormat"),
    *[pl.col(f"wtc_AcWindSp_mean;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_AcWindSp_stddev;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_AcWindSp_min;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_AcWindSp_max;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_ScYawPos_mean_sin;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_ScYawPos_mean_cos;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_ScYawPos_stddev;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_ScReToOp_timeon;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_ActPower_mean;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_ActPower_stddev;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_ActPower_min;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_ActPower_max;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_GenRpm_mean;{ref_wtg}") for ref_wtg in ref_wtgs],
    *[pl.col(f"wtc_PitcPosA_mean;{ref_wtg}") for ref_wtg in ref_wtgs],
    pl.col("ambient_temp_mean"),
    pl.col("sun_altitude"),
    pl.col("seconds_since_2016"),
    pl.col("hours_sin"),
    pl.col("hours_cos"),
    pl.col("days_sin"),
    pl.col("days_cos"),
    pl.col("months_sin"),
    pl.col("months_cos"),
    pl.col(target_ws),
)

df_x_train_ws = (
    df_ws.filter(pl.col("TimeStamp_StartFormat").dt.year().ne(test_year))
    .select(pl.exclude(target_ws))
    .drop("TimeStamp_StartFormat")
)
df_y_train_ws = df_ws.filter(pl.col("TimeStamp_StartFormat").dt.year().ne(test_year)).select(target_ws).to_series()

df_x_test_ws = (
    df_ws.filter(pl.col("TimeStamp_StartFormat").dt.year().eq(test_year))
    .select(pl.exclude(target_ws))
    .drop("TimeStamp_StartFormat")
)
df_y_test_ws = df_ws.filter(pl.col("TimeStamp_StartFormat").dt.year().eq(test_year)).select(target_ws).to_series()

In [88]:
automl_ws = AutoML()

automl_settings = {
    "time_budget": 180,
    "task": "regression",
    "metric": "mae",
    "estimator_list": [
        "xgboost",
    ],
    "log_file_name": "automl.log",
    "seed": 42,
    "eval_method": "cv",
    "n_splits": 5,
    "split_type": "time",
    "early_stop": True,
}

automl_ws.fit(
    X_train=df_x_train_ws.to_pandas(),
    y_train=df_y_train_ws.to_pandas(),
    **automl_settings,
)

[flaml.automl.logger: 05-19 16:21:50] {1728} INFO - task = regression
[flaml.automl.logger: 05-19 16:21:50] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 05-19 16:21:50] {1838} INFO - Minimizing error metric: mae
[flaml.automl.logger: 05-19 16:21:50] {1955} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 05-19 16:21:50] {2258} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 05-19 16:21:51] {2393} INFO - Estimated sufficient time budget=10996s. Estimated necessary time budget=11s.
[flaml.automl.logger: 05-19 16:21:51] {2442} INFO -  at 2.5s,	estimator xgboost's best error=2.0385,	best estimator xgboost's best error=2.0385
[flaml.automl.logger: 05-19 16:21:51] {2258} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 05-19 16:21:52] {2442} INFO -  at 3.3s,	estimator xgboost's best error=2.0385,	best estimator xgboost's best error=2.0385
[flaml.automl.logger: 05-19 16:21:52] {2258} INFO - iteration 2, current learner

In [89]:
train_prediction_ws = automl_ws.predict(df_x_train_ws.to_pandas())
test_prediction_ws = automl_ws.predict(df_x_test_ws.to_pandas())

# get the MAE of the train set
mae = abs(df_y_train_ws - train_prediction_ws).mean()
print(f"MAE (train): {mae:.2f} m/s")

# get the MAE of the test set
mae = abs(df_y_test_ws - test_prediction_ws).mean()
print(f"MAE (test): {mae:.2f} m/s")

MAE (train): 0.27 m/s
MAE (test): 0.30 m/s


In [90]:
px.scatter(
    x=df_y_train_ws,
    y=train_prediction_ws,
    opacity=0.1,
).show()

px.scatter(
    x=df_y_test_ws,
    y=test_prediction_ws,
    opacity=0.1,
).show()

In [91]:
feature_importances_ws = pl.DataFrame(
    {
        "Name": automl_ws.feature_names_in_,
        "Importance": automl_ws.feature_importances_,
    },
)
px.bar(
    feature_importances_ws.sort("Importance"),
    x="Importance",
    y="Name",
    orientation="h",
    labels={"x": "Importance", "y": "Feature"},
    height=len(feature_importances_ws) * 20,
)

In [92]:
automl_ws.best_config

{'n_estimators': 1109,
 'max_leaves': 18,
 'min_child_weight': np.float64(0.40385496411102617),
 'learning_rate': np.float64(0.0951546340177734),
 'subsample': np.float64(0.7621325607358561),
 'colsample_bylevel': np.float64(0.896142769508154),
 'colsample_bytree': np.float64(0.9993271961638156),
 'reg_alpha': np.float64(0.0014585172191691575),
 'reg_lambda': np.float64(17.50258170562381)}

In [93]:
prediction = automl_ws.predict(df_ws.select(pl.exclude(target_ws)).drop("TimeStamp_StartFormat").to_pandas())

df_power = df_ws.with_columns(
    engineered_wind_speed=prediction,
    target=target_power_data,
).with_columns(
    pl.col("engineered_wind_speed").clip(0, 25).alias("engineered_wind_speed"),
)

df_x_train_power = (
    df_power.filter(pl.col("TimeStamp_StartFormat").dt.year().ne(test_year))
    .select(pl.exclude("target"))
    .drop("TimeStamp_StartFormat")
)
df_y_train_power = df_power.filter(pl.col("TimeStamp_StartFormat").dt.year().ne(test_year)).select("target").to_series()

df_x_test_power = (
    df_power.filter(pl.col("TimeStamp_StartFormat").dt.year().eq(test_year))
    .select(pl.exclude("target"))
    .drop("TimeStamp_StartFormat")
)
df_y_test_power = df_power.filter(pl.col("TimeStamp_StartFormat").dt.year().eq(test_year)).select("target").to_series()

In [94]:
automl_power = AutoML()

automl_power_settings = {
    "time_budget": 180,
    "task": "regression",
    "metric": "mae",
    "estimator_list": [
        "xgboost",
    ],
    "log_file_name": "automl.log",
    "seed": 42,
    "eval_method": "cv",
    "n_splits": 5,
    "split_type": "time",
    "early_stop": True,
}

automl_power.fit(
    X_train=df_x_train_power.to_pandas(),
    y_train=df_y_train_power.to_pandas(),
    **automl_settings,
)

[flaml.automl.logger: 05-19 16:25:00] {1728} INFO - task = regression
[flaml.automl.logger: 05-19 16:25:00] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 05-19 16:25:00] {1838} INFO - Minimizing error metric: mae
[flaml.automl.logger: 05-19 16:25:00] {1955} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 05-19 16:25:00] {2258} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 05-19 16:25:01] {2393} INFO - Estimated sufficient time budget=10948s. Estimated necessary time budget=11s.
[flaml.automl.logger: 05-19 16:25:01] {2442} INFO -  at 2.6s,	estimator xgboost's best error=405.1965,	best estimator xgboost's best error=405.1965
[flaml.automl.logger: 05-19 16:25:01] {2258} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 05-19 16:25:03] {2442} INFO -  at 4.0s,	estimator xgboost's best error=405.1965,	best estimator xgboost's best error=405.1965
[flaml.automl.logger: 05-19 16:25:03] {2258} INFO - iteration 2, current

In [96]:
train_prediction_power = automl_power.predict(df_x_train_power.to_pandas())
test_prediction_power = automl_power.predict(df_x_test_power.to_pandas())

# get the MAE of the train set
mae = abs(df_y_train_power - train_prediction_power).mean()
print(f"MAE (train): {mae:.2f} kW")

# get the MAE of the test set
mae = abs(df_y_test_power - test_prediction_power).mean()
print(f"MAE (test): {mae:.2f} kW")

MAE (train): 14.76 kW
MAE (test): 21.26 kW


In [97]:
px.scatter(
    x=df_y_train_power,
    y=train_prediction_power,
    opacity=0.1,
).show()

px.scatter(
    x=df_y_test_power,
    y=test_prediction_power,
    opacity=0.1,
).show()

In [98]:
feature_importances_power = pl.DataFrame(
    {
        "Name": automl_power.feature_names_in_,
        "Importance": automl_power.feature_importances_,
    },
)
px.bar(
    feature_importances_power.sort("Importance"),
    x="Importance",
    y="Name",
    orientation="h",
    labels={"x": "Importance", "y": "Feature"},
    height=len(feature_importances_power) * 20,
)

In [99]:
automl_power.best_config

{'n_estimators': 151,
 'max_leaves': 176,
 'min_child_weight': np.float64(0.9651241083426156),
 'learning_rate': np.float64(0.06822766148460396),
 'subsample': 1.0,
 'colsample_bylevel': np.float64(0.968674810769053),
 'colsample_bytree': 1.0,
 'reg_alpha': np.float64(0.0016766860807409002),
 'reg_lambda': np.float64(1.221110068606527)}