In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import matplotlib as plt
from datetime import datetime, timedelta
import re
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data_fpath =  "training_dataset.parquet"
submission_data_fpath =  "submission_dataset.parquet"

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
input_df = pd.read_parquet(train_data_fpath)
test_df = pd.read_parquet(submission_data_fpath)
input_df.head(3)
input_df = input_df[~input_df.target.isna()]

In [5]:
input_df["hour"] = input_df["TimeStamp_StartFormat"].dt.hour
test_df["hour"] = test_df["TimeStamp_StartFormat"].dt.hour

In [6]:
len(input_df)

207924

In [7]:
input_df.columns

Index(['TimeStamp_StartFormat', 'wtc_AcWindSp_mean;1', 'wtc_AcWindSp_mean;2',
       'wtc_AcWindSp_mean;3', 'wtc_AcWindSp_mean;4', 'wtc_AcWindSp_mean;5',
       'wtc_AcWindSp_mean;7', 'wtc_AcWindSp_min;1', 'wtc_AcWindSp_min;2',
       'wtc_AcWindSp_min;3',
       ...
       'ERA5_cloud_cover', 'ERA5_wind_speed_10m', 'ERA5_wind_speed_100m',
       'ERA5_wind_direction_10m', 'ERA5_wind_direction_100m',
       'ERA5_wind_gusts_10m', 'id', 'is_valid', 'target', 'hour'],
      dtype='object', length=190)

In [8]:
input_df.head(3)

Unnamed: 0,TimeStamp_StartFormat,wtc_AcWindSp_mean;1,wtc_AcWindSp_mean;2,wtc_AcWindSp_mean;3,wtc_AcWindSp_mean;4,wtc_AcWindSp_mean;5,wtc_AcWindSp_mean;7,wtc_AcWindSp_min;1,wtc_AcWindSp_min;2,wtc_AcWindSp_min;3,wtc_AcWindSp_min;4,wtc_AcWindSp_min;5,wtc_AcWindSp_min;7,wtc_AcWindSp_max;1,wtc_AcWindSp_max;2,wtc_AcWindSp_max;3,wtc_AcWindSp_max;4,wtc_AcWindSp_max;5,wtc_AcWindSp_max;7,wtc_AcWindSp_stddev;1,wtc_AcWindSp_stddev;2,wtc_AcWindSp_stddev;3,wtc_AcWindSp_stddev;4,wtc_AcWindSp_stddev;5,wtc_AcWindSp_stddev;7,wtc_ScYawPos_mean;1,wtc_ScYawPos_mean;2,wtc_ScYawPos_mean;3,wtc_ScYawPos_mean;4,wtc_ScYawPos_mean;5,wtc_ScYawPos_mean;7,wtc_ScYawPos_min;1,wtc_ScYawPos_min;2,wtc_ScYawPos_min;3,wtc_ScYawPos_min;4,wtc_ScYawPos_min;5,wtc_ScYawPos_min;7,wtc_ScYawPos_max;1,wtc_ScYawPos_max;2,wtc_ScYawPos_max;3,wtc_ScYawPos_max;4,wtc_ScYawPos_max;5,wtc_ScYawPos_max;7,wtc_ScYawPos_stddev;1,wtc_ScYawPos_stddev;2,wtc_ScYawPos_stddev;3,wtc_ScYawPos_stddev;4,wtc_ScYawPos_stddev;5,wtc_ScYawPos_stddev;7,wtc_NacelPos_mean;1,wtc_NacelPos_mean;2,wtc_NacelPos_mean;3,wtc_NacelPos_mean;4,wtc_NacelPos_mean;5,wtc_NacelPos_mean;7,wtc_NacelPos_min;1,wtc_NacelPos_min;2,wtc_NacelPos_min;3,wtc_NacelPos_min;4,wtc_NacelPos_min;5,wtc_NacelPos_min;7,wtc_NacelPos_max;1,wtc_NacelPos_max;2,wtc_NacelPos_max;3,wtc_NacelPos_max;4,wtc_NacelPos_max;5,wtc_NacelPos_max;7,wtc_GenRpm_mean;1,wtc_GenRpm_mean;2,wtc_GenRpm_mean;3,wtc_GenRpm_mean;4,wtc_GenRpm_mean;5,wtc_GenRpm_mean;7,wtc_GenRpm_min;1,wtc_GenRpm_min;2,wtc_GenRpm_min;3,wtc_GenRpm_min;4,wtc_GenRpm_min;5,wtc_GenRpm_min;7,wtc_GenRpm_max;1,wtc_GenRpm_max;2,wtc_GenRpm_max;3,wtc_GenRpm_max;4,wtc_GenRpm_max;5,wtc_GenRpm_max;7,wtc_GenRpm_stddev;1,wtc_GenRpm_stddev;2,wtc_GenRpm_stddev;3,wtc_GenRpm_stddev;4,wtc_GenRpm_stddev;5,wtc_GenRpm_stddev;7,wtc_PitcPosA_mean;1,wtc_PitcPosA_mean;2,wtc_PitcPosA_mean;3,wtc_PitcPosA_mean;4,wtc_PitcPosA_mean;5,wtc_PitcPosA_mean;7,wtc_PitcPosA_min;1,wtc_PitcPosA_min;2,wtc_PitcPosA_min;3,wtc_PitcPosA_min;4,wtc_PitcPosA_min;5,wtc_PitcPosA_min;7,wtc_PitcPosA_max;1,wtc_PitcPosA_max;2,wtc_PitcPosA_max;3,wtc_PitcPosA_max;4,wtc_PitcPosA_max;5,wtc_PitcPosA_max;7,wtc_PitcPosA_stddev;1,wtc_PitcPosA_stddev;2,wtc_PitcPosA_stddev;3,wtc_PitcPosA_stddev;4,wtc_PitcPosA_stddev;5,wtc_PitcPosA_stddev;7,wtc_PitcPosB_mean;1,wtc_PitcPosB_mean;2,wtc_PitcPosB_mean;3,wtc_PitcPosB_mean;4,wtc_PitcPosB_mean;5,wtc_PitcPosB_mean;7,wtc_PitcPosC_mean;1,wtc_PitcPosC_mean;2,wtc_PitcPosC_mean;3,wtc_PitcPosC_mean;4,wtc_PitcPosC_mean;5,wtc_PitcPosC_mean;7,wtc_PowerRef_endvalue;1,wtc_PowerRef_endvalue;2,wtc_PowerRef_endvalue;3,wtc_PowerRef_endvalue;4,wtc_PowerRef_endvalue;5,wtc_PowerRef_endvalue;7,wtc_ScReToOp_timeon;1,wtc_ScReToOp_timeon;2,wtc_ScReToOp_timeon;3,wtc_ScReToOp_timeon;4,wtc_ScReToOp_timeon;5,wtc_ScReToOp_timeon;7,wtc_ActPower_mean;1,wtc_ActPower_mean;2,wtc_ActPower_mean;3,wtc_ActPower_mean;4,wtc_ActPower_mean;5,wtc_ActPower_mean;7,wtc_ActPower_min;1,wtc_ActPower_min;2,wtc_ActPower_min;3,wtc_ActPower_min;4,wtc_ActPower_min;5,wtc_ActPower_min;7,wtc_ActPower_max;1,wtc_ActPower_max;2,wtc_ActPower_max;3,wtc_ActPower_max;4,wtc_ActPower_max;5,wtc_ActPower_max;7,wtc_ActPower_stddev;1,wtc_ActPower_stddev;2,wtc_ActPower_stddev;3,wtc_ActPower_stddev;4,wtc_ActPower_stddev;5,wtc_ActPower_stddev;7,wtc_AmbieTmp_mean;1,wtc_AmbieTmp_mean;2,wtc_AmbieTmp_mean;3,wtc_AmbieTmp_mean;4,wtc_AmbieTmp_mean;5,wtc_AmbieTmp_mean;7,ShutdownDuration;1,ShutdownDuration;2,ShutdownDuration;3,ShutdownDuration;4,ShutdownDuration;5,ShutdownDuration;7,ERA5_temperature_2m,ERA5_relative_humidity_2m,ERA5_dew_point_2m,ERA5_precipitation,ERA5_surface_pressure,ERA5_cloud_cover,ERA5_wind_speed_10m,ERA5_wind_speed_100m,ERA5_wind_direction_10m,ERA5_wind_direction_100m,ERA5_wind_gusts_10m,id,is_valid,target,hour
0,2016-01-01 00:00:00+00:00,6.085917,5.830675,6.613091,6.99851,7.645727,7.04079,2.0,1.1,1.6,2.4,4.3,3.9,10.0,10.0,11.0,10.6,11.8,10.5,1.484347,1.586648,1.571053,1.228338,1.09691,1.317204,83.754387,69.120506,-122.800903,-113.638397,-118.159103,-106.894501,79.5,66.400002,-124.400002,-114.800003,-120.5,-108.599998,86.800003,72.300003,-119.900002,-109.599998,-113.199997,-104.900002,2.266932,1.913408,1.426679,2.130993,1.8788,1.008626,83.751183,69.122231,237.192902,246.365005,241.846603,253.104004,79.5,66.400002,235.600006,245.199997,239.5,251.399994,86.800003,72.300003,240.100006,250.399994,246.800003,255.100006,941.06897,914.643799,1043.970947,1131.11499,1114.666016,1131.890991,649.900024,612.700012,730.099976,940.900024,935.400024,868.900024,1273.400024,1332.099976,1406.0,1535.599976,1391.800049,1430.800049,188.158005,211.196304,177.679596,129.183502,91.292953,128.462006,-0.974687,-0.973245,-0.937077,-0.966635,-0.927565,-0.971865,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-0.4,-0.4,-0.4,-0.4,-0.4,-0.4,0.517582,0.514683,0.522298,0.496113,0.497144,0.528061,-0.96637,-1.008135,-0.982902,-0.934692,-0.996075,-0.976442,-0.99509,-0.934227,-0.972555,-0.947758,-0.975915,-0.964212,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,352.522308,361.848511,468.959015,565.597473,533.635376,566.248108,78.0,72.0,180.0,309.0,287.0,248.0,760.0,869.0,974.0,1416.0,949.0,1018.0,187.078903,215.998001,213.243896,198.349503,127.434898,178.835403,0.0,0.827133,0.3154,0.0,0.0,0.0,0,0,0,0,0,0,1.792,73.957634,-2.358,0.0,976.166321,0.0,6.080296,10.040418,233.695404,236.784729,13.0,-210384,True,352.522308,0
1,2016-01-01 00:10:00+00:00,5.824693,5.810768,5.895642,6.5219,6.413868,6.599652,2.1,2.2,1.9,2.2,2.3,2.3,9.6,10.2,8.8,12.5,10.9,11.0,1.141584,1.547302,1.293487,1.441869,1.438043,1.334877,88.496246,75.914597,-115.395798,-110.868401,-113.227501,-102.6987,84.400002,72.300003,-123.900002,-112.5,-116.199997,-108.699997,91.400002,81.400002,-113.5,-109.599998,-112.400002,-101.0,1.879867,2.106318,2.478934,1.365305,1.014673,2.78289,88.491821,75.907959,244.613007,249.131195,246.777298,257.305389,84.400002,72.300003,236.100006,247.5,243.800003,251.300003,91.400002,81.400002,246.5,250.399994,247.600006,259.0,878.17627,891.244385,877.905701,1068.406006,901.705383,1027.93103,691.099976,641.400024,668.200012,780.799988,667.400024,684.5,1142.199951,1382.900024,1157.400024,1478.699951,1270.099976,1248.5,128.342499,172.585403,148.661896,162.483597,134.646103,146.772797,-0.976545,-0.973472,-0.943475,-0.968723,-0.927995,-0.967702,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-0.4,-0.4,-0.4,-0.4,-0.4,-0.4,0.514228,0.513102,0.521445,0.498363,0.498069,0.531581,-0.966463,-1.008063,-0.986907,-0.933587,-0.995267,-0.974115,-0.995205,-0.934798,-0.97826,-0.948613,-0.97302,-0.963473,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,294.078888,332.010406,308.005707,490.191193,314.5914,443.461914,119.0,103.0,109.0,233.0,109.0,132.0,564.0,948.0,599.0,1144.0,770.0,736.0,104.474998,165.113998,123.467796,211.253098,119.1884,153.566299,0.0,0.520733,0.07045,0.0,0.0,0.0,0,0,0,0,0,0,1.792,86.580147,-0.208,0.0,982.179504,100.0,3.301515,6.958448,178.264328,187.43132,6.9,-210383,True,294.078888,0
2,2016-01-01 00:20:00+00:00,7.10018,6.386981,7.606015,7.695034,7.812548,7.469052,2.2,2.1,0.0,4.2,4.0,3.4,11.6,12.3,13.6,12.8,11.6,12.8,1.59964,1.661032,2.328598,1.501421,1.426233,1.648712,83.129791,69.414749,-120.673698,-113.834702,-118.3255,-104.277702,81.300003,64.0,-124.699997,-114.599998,-122.199997,-106.400002,87.800003,77.199997,-113.900002,-110.099998,-112.400002,-98.800003,1.907214,2.861577,3.214255,1.593463,3.1479,2.727755,83.128883,69.409897,239.316895,246.165298,241.671005,255.7173,81.300003,64.0,235.300003,245.399994,237.800003,253.600006,87.800003,77.199997,246.100006,249.899994,247.600006,261.200012,1137.56604,1037.644043,1212.978027,1254.245972,1118.692993,1178.525024,752.0,646.099976,754.0,960.5,760.599976,938.299988,1547.199951,1473.300049,1582.0,1564.900024,1456.5,1535.900024,174.083206,234.354401,277.971313,145.854706,153.791,166.3965,-0.968695,-0.978538,-0.932615,-0.967485,-0.928043,-0.97015,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-0.4,-0.4,-0.4,-0.4,-0.4,-0.4,0.516025,0.512991,0.511504,0.495999,0.496092,0.528538,-0.963727,-1.008043,-0.977577,-0.935797,-0.994107,-0.976495,-0.986437,-0.937108,-0.970872,-0.948943,-0.974592,-0.966483,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,587.302795,501.303894,816.978821,771.708191,556.22052,646.179321,182.0,105.0,190.0,333.0,205.0,304.0,1458.0,1132.0,2189.0,1740.0,1062.0,1415.0,243.555801,261.407715,491.376892,298.910797,199.444397,274.355103,0.0,0.4834,0.32405,0.0,0.0,0.0,0,0,0,0,0,0,7.742,61.013905,0.692,0.0,985.24054,78.0,3.478505,4.648656,198.435043,198.824799,7.7,-210382,True,587.302795,0


In [9]:
input_df.tail(3)

Unnamed: 0,TimeStamp_StartFormat,wtc_AcWindSp_mean;1,wtc_AcWindSp_mean;2,wtc_AcWindSp_mean;3,wtc_AcWindSp_mean;4,wtc_AcWindSp_mean;5,wtc_AcWindSp_mean;7,wtc_AcWindSp_min;1,wtc_AcWindSp_min;2,wtc_AcWindSp_min;3,wtc_AcWindSp_min;4,wtc_AcWindSp_min;5,wtc_AcWindSp_min;7,wtc_AcWindSp_max;1,wtc_AcWindSp_max;2,wtc_AcWindSp_max;3,wtc_AcWindSp_max;4,wtc_AcWindSp_max;5,wtc_AcWindSp_max;7,wtc_AcWindSp_stddev;1,wtc_AcWindSp_stddev;2,wtc_AcWindSp_stddev;3,wtc_AcWindSp_stddev;4,wtc_AcWindSp_stddev;5,wtc_AcWindSp_stddev;7,wtc_ScYawPos_mean;1,wtc_ScYawPos_mean;2,wtc_ScYawPos_mean;3,wtc_ScYawPos_mean;4,wtc_ScYawPos_mean;5,wtc_ScYawPos_mean;7,wtc_ScYawPos_min;1,wtc_ScYawPos_min;2,wtc_ScYawPos_min;3,wtc_ScYawPos_min;4,wtc_ScYawPos_min;5,wtc_ScYawPos_min;7,wtc_ScYawPos_max;1,wtc_ScYawPos_max;2,wtc_ScYawPos_max;3,wtc_ScYawPos_max;4,wtc_ScYawPos_max;5,wtc_ScYawPos_max;7,wtc_ScYawPos_stddev;1,wtc_ScYawPos_stddev;2,wtc_ScYawPos_stddev;3,wtc_ScYawPos_stddev;4,wtc_ScYawPos_stddev;5,wtc_ScYawPos_stddev;7,wtc_NacelPos_mean;1,wtc_NacelPos_mean;2,wtc_NacelPos_mean;3,wtc_NacelPos_mean;4,wtc_NacelPos_mean;5,wtc_NacelPos_mean;7,wtc_NacelPos_min;1,wtc_NacelPos_min;2,wtc_NacelPos_min;3,wtc_NacelPos_min;4,wtc_NacelPos_min;5,wtc_NacelPos_min;7,wtc_NacelPos_max;1,wtc_NacelPos_max;2,wtc_NacelPos_max;3,wtc_NacelPos_max;4,wtc_NacelPos_max;5,wtc_NacelPos_max;7,wtc_GenRpm_mean;1,wtc_GenRpm_mean;2,wtc_GenRpm_mean;3,wtc_GenRpm_mean;4,wtc_GenRpm_mean;5,wtc_GenRpm_mean;7,wtc_GenRpm_min;1,wtc_GenRpm_min;2,wtc_GenRpm_min;3,wtc_GenRpm_min;4,wtc_GenRpm_min;5,wtc_GenRpm_min;7,wtc_GenRpm_max;1,wtc_GenRpm_max;2,wtc_GenRpm_max;3,wtc_GenRpm_max;4,wtc_GenRpm_max;5,wtc_GenRpm_max;7,wtc_GenRpm_stddev;1,wtc_GenRpm_stddev;2,wtc_GenRpm_stddev;3,wtc_GenRpm_stddev;4,wtc_GenRpm_stddev;5,wtc_GenRpm_stddev;7,wtc_PitcPosA_mean;1,wtc_PitcPosA_mean;2,wtc_PitcPosA_mean;3,wtc_PitcPosA_mean;4,wtc_PitcPosA_mean;5,wtc_PitcPosA_mean;7,wtc_PitcPosA_min;1,wtc_PitcPosA_min;2,wtc_PitcPosA_min;3,wtc_PitcPosA_min;4,wtc_PitcPosA_min;5,wtc_PitcPosA_min;7,wtc_PitcPosA_max;1,wtc_PitcPosA_max;2,wtc_PitcPosA_max;3,wtc_PitcPosA_max;4,wtc_PitcPosA_max;5,wtc_PitcPosA_max;7,wtc_PitcPosA_stddev;1,wtc_PitcPosA_stddev;2,wtc_PitcPosA_stddev;3,wtc_PitcPosA_stddev;4,wtc_PitcPosA_stddev;5,wtc_PitcPosA_stddev;7,wtc_PitcPosB_mean;1,wtc_PitcPosB_mean;2,wtc_PitcPosB_mean;3,wtc_PitcPosB_mean;4,wtc_PitcPosB_mean;5,wtc_PitcPosB_mean;7,wtc_PitcPosC_mean;1,wtc_PitcPosC_mean;2,wtc_PitcPosC_mean;3,wtc_PitcPosC_mean;4,wtc_PitcPosC_mean;5,wtc_PitcPosC_mean;7,wtc_PowerRef_endvalue;1,wtc_PowerRef_endvalue;2,wtc_PowerRef_endvalue;3,wtc_PowerRef_endvalue;4,wtc_PowerRef_endvalue;5,wtc_PowerRef_endvalue;7,wtc_ScReToOp_timeon;1,wtc_ScReToOp_timeon;2,wtc_ScReToOp_timeon;3,wtc_ScReToOp_timeon;4,wtc_ScReToOp_timeon;5,wtc_ScReToOp_timeon;7,wtc_ActPower_mean;1,wtc_ActPower_mean;2,wtc_ActPower_mean;3,wtc_ActPower_mean;4,wtc_ActPower_mean;5,wtc_ActPower_mean;7,wtc_ActPower_min;1,wtc_ActPower_min;2,wtc_ActPower_min;3,wtc_ActPower_min;4,wtc_ActPower_min;5,wtc_ActPower_min;7,wtc_ActPower_max;1,wtc_ActPower_max;2,wtc_ActPower_max;3,wtc_ActPower_max;4,wtc_ActPower_max;5,wtc_ActPower_max;7,wtc_ActPower_stddev;1,wtc_ActPower_stddev;2,wtc_ActPower_stddev;3,wtc_ActPower_stddev;4,wtc_ActPower_stddev;5,wtc_ActPower_stddev;7,wtc_AmbieTmp_mean;1,wtc_AmbieTmp_mean;2,wtc_AmbieTmp_mean;3,wtc_AmbieTmp_mean;4,wtc_AmbieTmp_mean;5,wtc_AmbieTmp_mean;7,ShutdownDuration;1,ShutdownDuration;2,ShutdownDuration;3,ShutdownDuration;4,ShutdownDuration;5,ShutdownDuration;7,ERA5_temperature_2m,ERA5_relative_humidity_2m,ERA5_dew_point_2m,ERA5_precipitation,ERA5_surface_pressure,ERA5_cloud_cover,ERA5_wind_speed_10m,ERA5_wind_speed_100m,ERA5_wind_direction_10m,ERA5_wind_direction_100m,ERA5_wind_gusts_10m,id,is_valid,target,hour
210381,2019-12-31 23:30:00+00:00,8.292965,7.242282,7.387715,8.898215,7.778623,7.802809,4.98,4.98,4.49,5.49,5.18,5.28,10.79,9.54,9.85,11.86,10.55,9.75,0.972607,0.881621,0.918921,0.909908,0.89253,0.779341,-96.173378,-124.966599,-116.936699,-113.882103,-133.844696,-102.936096,-103.0,-134.100006,-121.199997,-117.400002,-140.600006,-107.0,-89.900002,-120.0,-112.699997,-110.599998,-126.599998,-97.300003,3.659225,4.104998,2.660128,2.961186,4.256087,2.623132,263.833313,235.042892,243.069,246.121902,226.157303,257.065613,257.0,225.899994,238.800003,242.600006,219.399994,253.0,270.100006,240.0,247.300003,249.399994,233.399994,262.700012,1222.415039,1080.531006,1110.911011,1355.084961,1130.175049,1169.10498,1074.800049,999.299988,977.299988,1233.400024,973.299988,1094.199951,1318.800049,1179.699951,1209.800049,1504.099976,1268.0,1306.400024,51.818569,45.069172,51.467251,64.891129,66.333107,39.41888,-0.953298,-0.983398,-0.96784,-0.949465,-0.940548,-1.000095,-1.3,-1.3,-1.3,-1.3,-1.3,-1.3,-0.6,-0.6,-0.6,-0.6,-0.6,-0.7,0.338297,0.305152,0.32381,0.347525,0.310555,0.29857,-0.939138,-0.998945,-0.945945,-0.987365,-0.972578,-0.995072,-0.975448,-1.000477,-0.999778,-0.976407,-0.961352,-0.999022,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,687.353088,482.618988,525.045898,893.933289,556.460999,608.810974,470.0,374.0,354.0,698.0,357.0,496.0,834.0,628.0,673.0,1258.0,761.0,817.0,76.277786,61.76931,71.848351,117.3293,94.306686,58.581848,4.618834,4.787467,3.607067,4.939967,4.704933,2.973367,0,0,0,0,0,0,2.942,61.226002,-3.798,0.0,996.174988,55.0,4.701064,9.276314,218.088821,221.941223,8.8,-3,True,687.353088,23
210382,2019-12-31 23:40:00+00:00,9.617872,9.914719,8.692904,10.19371,10.44945,8.951912,6.63,5.9,5.9,7.15,6.11,6.22,12.81,14.24,11.5,12.81,14.0,11.26,0.957572,1.496119,1.032007,0.999657,1.459871,0.746953,-83.940231,-113.025002,-106.912804,-102.196404,-121.0886,-94.316544,-92.199997,-121.099998,-112.699997,-110.900002,-134.5,-97.300003,-80.0,-110.800003,-104.099998,-98.300003,-114.599998,-88.300003,3.346503,3.927145,1.935067,3.785844,4.88392,3.184263,276.072205,246.986893,253.094803,257.814911,238.930298,265.69281,267.899994,238.899994,247.300003,249.100006,225.5,262.700012,280.0,249.199997,255.899994,261.700012,245.399994,271.700012,1445.762939,1430.916016,1345.046997,1484.062012,1450.67395,1357.484985,1124.800049,1049.5,1100.099976,1219.900024,1067.800049,1262.199951,1550.5,1566.400024,1494.400024,1560.900024,1576.900024,1438.699951,86.603371,147.891602,96.851486,76.217873,134.283096,40.993629,-0.959085,-0.983298,-0.970378,-0.949795,-0.928628,-0.999802,-1.3,-1.3,-1.3,-1.3,-1.3,-1.3,-0.6,-0.6,-0.6,-0.6,-0.6,-0.7,0.33654,0.301926,0.32754,0.348239,0.32013,0.298574,-0.941788,-0.996338,-0.945393,-0.990102,-0.96985,-0.995697,-0.978313,-1.000408,-0.999797,-0.980613,-0.95837,-0.999188,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,1107.057007,1170.453003,881.196594,1240.265991,1237.072998,888.611816,540.0,439.0,503.0,682.0,464.0,740.0,1496.0,1829.0,1221.0,1689.0,2067.0,1025.0,195.377304,362.835999,158.093002,212.542206,361.223389,63.843021,3.230533,4.2476,3.0,3.471367,4.1088,2.720033,0,0,0,0,0,0,1.942,90.765717,0.592,0.0,1001.884033,69.0,3.088689,6.236986,209.054504,221.099411,5.6,-2,True,1107.057007,23
210383,2019-12-31 23:50:00+00:00,9.997997,10.40753,9.129176,10.60657,10.89676,9.398952,7.26,5.7,6.11,6.53,5.18,5.28,12.45,13.52,12.1,14.12,14.24,12.33,0.902917,1.177762,0.956576,1.312517,1.392926,1.278592,-77.398903,-104.9664,-102.875099,-96.931488,-112.689903,-90.385643,-83.099998,-110.800003,-107.900002,-99.199997,-115.800003,-98.800003,-74.5,-102.0,-98.900002,-95.900002,-111.300003,-85.900002,2.673303,2.853654,3.227096,1.36421,1.574227,4.197214,282.5979,255.034103,257.122192,263.066193,247.307404,269.609802,276.899994,249.199997,252.100006,260.799988,244.199997,261.200012,285.5,258.0,261.100006,264.100006,248.699997,274.100006,1512.80896,1535.756958,1435.734985,1519.063965,1527.599976,1455.229004,1453.400024,1484.900024,1228.599976,1364.900024,1401.5,1147.0,1560.199951,1567.900024,1550.400024,1574.5,1581.5,1564.699951,29.71623,17.42734,84.587067,44.064838,37.74931,111.423599,-0.961835,-0.975402,-0.970585,-0.946202,-0.932107,-0.999758,-1.3,-1.3,-1.3,-1.3,-1.3,-1.3,-0.6,-0.6,-0.6,-0.6,-0.6,-0.7,0.333266,0.31417,0.325811,0.34406,0.31594,0.298593,-0.938148,-0.999402,-0.947293,-0.991047,-0.974792,-0.993872,-0.979112,-1.000585,-0.999858,-0.981442,-0.954575,-0.99893,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,1328.432983,1478.899048,1095.19397,1501.407959,1533.442017,1210.182007,1043.0,1184.0,692.0,892.0,957.0,570.0,1708.0,1817.0,1488.0,2084.0,2188.0,1794.0,167.616104,149.195007,215.106796,316.128387,276.866394,330.417511,2.906667,3.281917,3.0,3.090867,3.0883,2.973267,0,0,0,0,0,0,5.292,76.816032,1.552,0.0,1000.020325,89.0,3.008322,5.950631,195.422226,204.842361,4.0,-1,True,1328.432983,23


In [10]:
def rel_dir(df):
    for turbine_num in [2, 3, 4, 5, 7]:
        yaw_col = f'wtc_ScYawPos_mean;{turbine_num}'
        wind_dir_col = 'ERA5_wind_direction_10m'  # or 100m depending on your needs
        
        # Calculate relative angle
        df[f'relative_angle_{turbine_num}'] = (df[wind_dir_col] - df[yaw_col]).abs()
        
        # Normalize to [-180, 180]
        df[f'relative_angle_{turbine_num}'] = df[f'relative_angle_{turbine_num}'].apply(
            lambda x: x if x <= 180 else 360 - x
        )
    return df

In [11]:
def day_feature_adder(df):
    day_of_year = df['TimeStamp_StartFormat'].dt.dayofyear
    df['daylight_hours'] = 8 + 4 * np.sin(2 * np.pi * (day_of_year - 80) / 365.25)
    df['is_peak_wind'] = np.where(df['TimeStamp_StartFormat'].dt.month.isin([12, 1, 2]),1,0)



    df['minute_sin'] = np.sin(2 * np.pi * df['TimeStamp_StartFormat'].dt.minute / 60)
    df['minute_cos'] = np.cos(2 * np.pi * df['TimeStamp_StartFormat'].dt.minute / 60)

    df['hour_sin'] = np.sin(2 * np.pi * df['TimeStamp_StartFormat'].dt.hour / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['TimeStamp_StartFormat'].dt.hour / 24)

    df['dayofyear_sin'] = np.sin(2 * np.pi * df['TimeStamp_StartFormat'].dt.dayofyear / 365)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['TimeStamp_StartFormat'].dt.dayofyear / 365)

    df['month_sin'] = np.sin(2 * np.pi * df['TimeStamp_StartFormat'].dt.month / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['TimeStamp_StartFormat'].dt.month / 12)
    return df

In [12]:
def add_lagged_wind_speeds(df, timestamp_col='TimeStamp_StartFormat', turbine_prefix=';'):
    """
    Add lagged wind speed features for each turbine (15min and 1hr lags)
    
    Args:
        df (pd.DataFrame): Must contain:
            - timestamp_col: DateTime column
            - f'Wind_speed_{turbine_prefix}N' for each turbine
        turbine_prefix (str): Prefix used in turbine column names
        
    Returns:
        pd.DataFrame: Original df + new lagged wind speed columns
    """
    # Make sure we're working with a copy
    df = df.copy()
    
    # Sort by time to ensure proper shifting
    df = df.sort_values(timestamp_col).reset_index(drop=True)
    
    # Time deltas for lags
    lag_30min = timedelta(minutes=10)
    lag_1hr = timedelta(minutes=60)
    
    for turbine_num in [2,3,4,5,7]:#,3,4]:  # For turbines 2,3,4,5,7 (6 being predicted)
        wind_col = f'wtc_ActPower_mean{turbine_prefix}{turbine_num}'#'wtc_GenRpm_mean
        
        # Create temporary DataFrame for merging
        lagged = df[[timestamp_col, wind_col]].copy()
        
        # Calculate 30min lag
        lagged['temp_30min'] = lagged[timestamp_col] + lag_30min
        lagged = lagged.rename(columns={wind_col: f'{wind_col}_lag30min'})
        df = pd.merge_asof(
            df.sort_values(timestamp_col),
            lagged[['temp_30min', f'{wind_col}_lag30min']].sort_values('temp_30min'),
            left_on=timestamp_col,
            right_on='temp_30min',
            direction='backward'
        ).drop(columns='temp_30min')
        
        # Calculate 1hr lag
        lagged['temp_1hr'] = lagged[timestamp_col] + lag_1hr
        lagged = lagged.rename(columns={f'{wind_col}_lag30min': f'{wind_col}_lag1hr'})
        df = pd.merge_asof(
            df.sort_values(timestamp_col),
            lagged[['temp_1hr', f'{wind_col}_lag1hr']].sort_values('temp_1hr'),
            left_on=timestamp_col,
            right_on='temp_1hr',
            direction='backward'
        ).drop(columns='temp_1hr')
    
    return df

In [13]:
def add_lagged_act_power(df, timestamp_col='TimeStamp_StartFormat', 
                         lags=[(10, '10min'),(60, '60min')], 
                         turbine_prefix=';'):
    """
    Add lagged active power features for each turbine
    
    Args:
        df (pd.DataFrame): Must contain:
            - timestamp_col: DateTime column
            - Columns containing 'ActPower' in their names
        lags (list of tuples): List of (minutes, suffix_name) for each lag
        turbine_prefix (str): Prefix used in turbine column names
        
    Returns:
        pd.DataFrame: Original df + new lagged active power columns
    """
    # Make sure we're working with a copy
    df = df.copy()
    
    # Sort by time to ensure proper shifting
    df = df.sort_values(timestamp_col).reset_index(drop=True)
    
    # Find all ActPower columns
    act_power_cols = [col for col in df.columns if "wtc_Ac" in col]#'ActPower''wtc_Ac'
    
    for act_power_col in act_power_cols:
        for minutes, suffix in lags:
            # Create temporary DataFrame for merging
            lagged = df[[timestamp_col, act_power_col]].copy()
            lag_delta = timedelta(minutes=minutes)
            
            # Calculate lag
            lagged['temp_time'] = lagged[timestamp_col] + lag_delta
            lagged = lagged.rename(columns={act_power_col: f'{act_power_col}_lag{suffix}'})
            
            df = pd.merge_asof(
                df.sort_values(timestamp_col),
                lagged[['temp_time', f'{act_power_col}_lag{suffix}']].sort_values('temp_time'),
                left_on=timestamp_col,
                right_on='temp_time',
                direction='backward'
            ).drop(columns='temp_time')
    
    return df

In [14]:
def add_trig_transforms(df, substrings=('Pos', 'direction'), radians=True):
    """
    Adds sin and cos transformations for columns containing any of the specified substrings.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame
    substrings : tuple of str (default=('Pos', 'direction'))
        Substrings to identify columns to transform
    radians : bool (default=True)
        If False, converts values from degrees to radians first
        
    Returns:
    --------
    pandas.DataFrame
        DataFrame with new sin/cos columns added
    """
    df_transformed = df.copy()
    
    for col in df.columns:
        if any(sub in col for sub in substrings):
            values = df[col]
            
            if not radians:
                values = np.radians(values)
                
            df_transformed[f'{col}_sin'] = np.sin(values)
            df_transformed[f'{col}_cos'] = np.cos(values)
    
    return df_transformed

In [15]:
from scipy.signal import periodogram

# Dominant frequency in last 6 hours (captures oscillations)
def get_dominant_freq(series):
    f, Pxx = periodogram(series.dropna())
    return f[np.argmax(Pxx)]


In [16]:
def new_features(df):
    df["TimeStamp_StartFormat"] = pd.to_datetime(df['TimeStamp_StartFormat'])
    df["wind_speed2_rolling_15min"] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;2']].groupby('wtc_AcWindSp_mean;2',as_index=False).rolling('30T', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;2":"wind_speed_rolling_15min"})
    df["wind_speed2_rolling_60min"]= df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;2']].groupby('wtc_AcWindSp_mean;2',as_index=False).rolling('1H', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;2":"wind_speed_rolling_60min"})
    #df["wind_speed3_rolling_15min"] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;3']].groupby('wtc_AcWindSp_mean;3',as_index=False).rolling('30T', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;3":"wind_speed_rolling_15min"})
    #df["wind_speed3_rolling_60min"]= df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;3']].groupby('wtc_AcWindSp_mean;3',as_index=False).rolling('1H', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;3":"wind_speed_rolling_60min"})
    #df["wind_speed4_rolling_15min"] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;4']].groupby('wtc_AcWindSp_mean;4',as_index=False).rolling('30T', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;4":"wind_speed_rolling_15min"})
    #df["wind_speed4_rolling_60min"]= df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;4']].groupby('wtc_AcWindSp_mean;4',as_index=False).rolling('1H', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;4":"wind_speed_rolling_60min"})
    #df["wind_speed5_rolling_15min"] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;5']].groupby('wtc_AcWindSp_mean;5',as_index=False).rolling('30T', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;5":"wind_speed_rolling_15min"})
    #df["wind_speed5_rolling_60min"]= df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;5']].groupby('wtc_AcWindSp_mean;5',as_index=False).rolling('1H', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;5":"wind_speed_rolling_60min"})

    # For neighboring turbines (e.g., Kelmarsh 2)
    #df["neighbor_rolling_30min"] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;3']].groupby('wtc_AcWindSp_mean;3',as_index=False).rolling('30T', on='TimeStamp_StartFormat').mean().rename(columns={"wtc_AcWindSp_mean;3":"neighbor_rolling_30min"})
    # Rate of change (derivative)
    df['wind_speed2_diff_10min'] = df['wtc_AcWindSp_mean;2'].diff(periods=6)  # For 10-min data
    df['wind_speed2_pct_change_30min'] = df['wtc_AcWindSp_mean;2'].pct_change(periods=3)
    df['wind_dir2_stability_1hr'] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;2']].groupby('wtc_AcWindSp_mean;2',as_index=False).rolling('1H', on='TimeStamp_StartFormat').std().rename(columns={"wtc_AcWindSp_mean;2":'wind_dir_stability_1hr'})
    #df['wind_speed3_diff_10min'] = df['wtc_AcWindSp_mean;3'].diff(periods=6)  # For 10-min data
    #df['wind_speed3_pct_change_30min'] = df['wtc_AcWindSp_mean;3'].pct_change(periods=3)
    #df['wind_dir3_stability_1hr'] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;3']].groupby('wtc_AcWindSp_mean;3',as_index=False).rolling('1H', on='TimeStamp_StartFormat').std().rename(columns={"wtc_AcWindSp_mean;3":'wind_dir_stability_1hr'})
    #df['wind_speed4_diff_10min'] = df['wtc_AcWindSp_mean;4'].diff(periods=6)  # For 10-min data
    #df['wind_speed4_pct_change_30min'] = df['wtc_AcWindSp_mean;4'].pct_change(periods=3)
    #df['wind_dir4_stability_1hr'] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;4']].groupby('wtc_AcWindSp_mean;4',as_index=False).rolling('1H', on='TimeStamp_StartFormat').std().rename(columns={"wtc_AcWindSp_mean;4":'wind_dir_stability_1hr'})
    #df['wind_speed5_diff_10min'] = df['wtc_AcWindSp_mean;5'].diff(periods=6)  # For 10-min data
    #df['wind_speed5_pct_change_30min'] = df['wtc_AcWindSp_mean;5'].pct_change(periods=3)
    #df['wind_dir5_stability_1hr'] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;5']].groupby('wtc_AcWindSp_mean;5',as_index=False).rolling('1H', on='TimeStamp_StartFormat').std().rename(columns={"wtc_AcWindSp_mean;5":'wind_dir_stability_1hr'})
    #df['wind_speed6_diff_10min'] = df['wtc_AcWindSp_mean;7'].diff(periods=6)  # For 10-min data
    #df['wind_speed6_pct_change_30min'] = df['wtc_AcWindSp_mean;7'].pct_change(periods=3)
    #df['wind_dir6_stability_1hr'] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;7']].groupby('wtc_AcWindSp_mean;7',as_index=False).rolling('1H', on='TimeStamp_StartFormat').std().rename(columns={"wtc_AcWindSp_mean;7":'wind_dir_stability_1hr'})


# Gust detection (1.5x rolling mean)
    df['gust_flag'] = (df['wtc_AcWindSp_mean;2'] > 1.5 * df['wind_speed2_rolling_15min']).astype(int)
    df['upwind_turbine_lag10min'] = df['wtc_AcWindSp_mean;3'].shift(periods=1)  # Assuming 10-min data

# Cross-turbine rolling correlations
    #df['rolling_corr_1hr'] = df[['wtc_AcWindSp_mean;3','wtc_AcWindSp_mean;2']].corr()["wtc_AcWindSp_mean;2"][0]


    df['dominant_freq2_2hr'] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;2']].groupby('wtc_AcWindSp_mean;2',as_index=False).rolling('2H', on='TimeStamp_StartFormat').apply(get_dominant_freq)
    #df['dominant_freq3_2hr'] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;3']].groupby('wtc_AcWindSp_mean;3',as_index=False).rolling('2H', on='TimeStamp_StartFormat').apply(get_dominant_freq)
    #df['dominant_freq4_2hr'] = df[['TimeStamp_StartFormat','wtc_AcWindSp_mean;4']].groupby('wtc_AcWindSp_mean;4',as_index=False).rolling('2H', on='TimeStamp_StartFormat').apply(get_dominant_freq)
    return df

In [17]:
turbine_data = {
    "T01": {"Latitude": 57.49921441, "Longitude": -3.086742896},
    "T02": {"Latitude": 57.49626574, "Longitude": -3.082817716},
    "T03": {"Latitude": 57.50206973, "Longitude": -3.088980479},
    "T04": {"Latitude": 57.50196736, "Longitude": -3.082085466},
    "T05": {"Latitude": 57.49891107, "Longitude": -3.078123820},
    "T06": {"Latitude": 57.50024464, "Longitude": -3.071321578},
    "T07": {"Latitude": 57.50513302, "Longitude": -3.085850762},
    "T08": {"Latitude": 57.50465629, "Longitude": -3.077542275},
    "T09": {"Latitude": 57.50827835, "Longitude": -3.082572740},
    "T10": {"Latitude": 57.50542459, "Longitude": -3.070655631},
    "T11": {"Latitude": 57.51184311, "Longitude": -3.080658605},
    "T12": {"Latitude": 57.51081569, "Longitude": -3.073684523},
    "T13": {"Latitude": 57.51666161, "Longitude": -3.078146735},
    "T14": {"Latitude": 57.51376046, "Longitude": -3.074939555},
    "T15": {"Latitude": 57.49941809, "Longitude": -3.062837374},
    "T16": {"Latitude": 57.50513916, "Longitude": -3.053340822},
    "T17": {"Latitude": 57.50662266, "Longitude": -3.047792556},
    "T18": {"Latitude": 57.50429684, "Longitude": -3.041183981},
    "T19": {"Latitude": 57.50859585, "Longitude": -3.041656952},
    "T20": {"Latitude": 57.51202849, "Longitude": -3.040436127},
    "T21": {"Latitude": 57.50978499, "Longitude": -3.034697307}
}

In [18]:
def weighted_wake_proximity(row, turbine_lat_lon, target_turbine="T01"):
    total_weight = 0
    for tid, coords in turbine_lat_lon.items():
        if tid == target_turbine:
            continue
        if row[f'upstream_{tid}']:
            distance = haversine(coords["Longitude"], coords["Latitude"],
                                turbine_lat_lon[target_turbine]["Longitude"],
                                turbine_lat_lon[target_turbine]["Latitude"])
            total_weight += 1 / (distance + 1e-6)  # Avoid division by zero
    return total_weight



In [19]:
def filter_wake_affected_turbines(
    df, 
    turbine_lat_lon, 
    turbine_id="T01", 
    wind_dir_col='ERA5_wind_direction_100m',
    wake_threshold_deg=10
):
    """
    Creates new wake status columns while preserving all original data.
    
    Args:
        df: Input DataFrame with turbine data columns
        turbine_lat_lon: Dict of {turbine_id: {"Latitude": y, "Longitude": x}}
        turbine_id: Reference turbine ID (default "T01")
        wind_dir_col: Column name for wind direction
        wake_threshold_deg: Wake angle threshold in degrees
        
    Returns:
        DataFrame with new wake status columns (*;waked and *;waking)
    """
    # Convert turbine_id to string
    turbine_id = str(turbine_id)
    
    # Wind direction vector
    wind_dir_rad = np.radians(df[wind_dir_col].values)
    wind_u, wind_v = np.cos(wind_dir_rad), np.sin(wind_dir_rad)
    
    # Reference turbine coordinates
    ref_lat = turbine_lat_lon[turbine_id]["Latitude"]
    ref_lon = turbine_lat_lon[turbine_id]["Longitude"]
    
    # Create new columns for wake status
    result_df = df.copy()
    
    for tid, coords in turbine_lat_lon.items():
        if tid == turbine_id:
            continue  # Skip reference turbine
            
        # Calculate direction vector
        dx = coords["Longitude"] - ref_lon
        dy = coords["Latitude"] - ref_lat
        dist = np.sqrt(dx**2 + dy**2)
        if dist == 0:
            continue
            
        # Normalized direction vector
        dx /= dist
        dy /= dist
        
        # Wake detection
        dot_product = dx * wind_u + dy * wind_v
        angle_diff = np.degrees(np.arccos(np.clip(dot_product, -1, 1)))
        is_waked = angle_diff <= wake_threshold_deg
        
        # Reverse wake detection
        dot_product_rev = -dx * wind_u + -dy * wind_v
        angle_diff_rev = np.degrees(np.arccos(np.clip(dot_product_rev, -1, 1)))
        is_waking = angle_diff_rev <= wake_threshold_deg
        
        # Add wake status columns for each variable
        for col in df.columns:
            if f';{tid[-1]}' in col:  # Match turbine number in column
                base_col = col.split(';')[0]
                result_df[f'{base_col};waked'] = is_waked.astype(int)
                result_df[f'{base_col};waking'] = is_waking.astype(int)
    
    return result_df

In [20]:
input_df= input_df[~input_df.target.isna()]

In [21]:
#pattern = re.compile(r'max|min')#max|min
#test_df = test_df.loc[:, ~test_df.columns.str.contains(pattern)]

In [22]:
#input_df = new_features(input_df)
#test_df= new_features(test_df)

In [23]:
def lagged_valid(df,timestamp_col='TimeStamp_StartFormat',wind_col="is_valid"):
        lag_30min = timedelta(minutes=30)
        lag_1hr = timedelta(minutes=60)
        wind_col
        lagged = df[[timestamp_col, wind_col]].copy()
        
        # Calculate 30min lag
        lagged['temp_30min'] = lagged[timestamp_col] + lag_30min
        lagged = lagged.rename(columns={wind_col: f'{wind_col}_lag30min'})
        df = pd.merge_asof(
            df.sort_values(timestamp_col),
            lagged[['temp_30min', f'{wind_col}_lag30min']].sort_values('temp_30min'),
            left_on=timestamp_col,
            right_on='temp_30min',
            direction='backward'
        ).drop(columns='temp_30min')
        
        # Calculate 1hr lag
        lagged['temp_1hr'] = lagged[timestamp_col] + lag_1hr
        lagged = lagged.rename(columns={f'{wind_col}_lag30min': f'{wind_col}_lag1hr'})
        df = pd.merge_asof(
            df.sort_values(timestamp_col),
            lagged[['temp_1hr', f'{wind_col}_lag1hr']].sort_values('temp_1hr'),
            left_on=timestamp_col,
            right_on='temp_1hr',
            direction='backward'
        ).drop(columns='temp_1hr')
    
        return df

    
   
        # Calculate 30min lag


In [24]:

from math import radians, sin, cos, sqrt, atan2, degrees

def calculate_relative_angle(wind_dir, turbine_dir):
    """Calculate smallest angle between wind and turbine direction"""
    angle_diff = abs(wind_dir - turbine_dir) % 360
    return min(angle_diff, 360 - angle_diff)

def identify_upstream_turbines(df, turbine_lat_lon, target_turbine="T01", wind_dir_col='ERA5_wind_direction_100m'):
    """
    Create mask of turbines upstream of target for each timestep
    Returns: DataFrame with boolean columns 'upstream_T##' for each turbine
    """
    target_lat = turbine_lat_lon[target_turbine]["Latitude"]
    target_lon = turbine_lat_lon[target_turbine]["Longitude"]
    
    upstream_mask = pd.DataFrame(index=df.index)
    
    for tid, coords in turbine_lat_lon.items():
        if tid == target_turbine:
            continue
            
        # Calculate direction from target to current turbine
        dx = coords["Longitude"] - target_lon
        dy = coords["Latitude"] - target_lat
        turbine_dir = np.degrees(np.arctan2(dy, dx)) % 360
        
        # Compare to wind direction
        angle_diffs = df[wind_dir_col].apply(
            lambda wd: calculate_relative_angle(wd, turbine_dir)
        )
        
        # Turbine is upstream if within ±90° of wind direction
        upstream_mask[f'upstream_{tid}'] = angle_diffs <= 45
        
    return upstream_mask

def jensen_wake_deficit(wind_speed, upstream_mask_row, turbine_lat_lon, target_turbine="T01", k=0.04, D_rotor=82):
    """
    Calculate combined wake deficit from all upstream turbines
    """
    target_coords = turbine_lat_lon[target_turbine]
    total_deficit = 0
    
    for tid, is_upstream in upstream_mask_row.items():
        if not is_upstream or not tid.startswith('upstream_'):
            continue
            
        tid = tid.replace('upstream_', '')  # Get original turbine ID
        turbine_coords = turbine_lat_lon[tid]
        
        # Distance between turbines (simplified)
        dx = turbine_coords["Longitude"] - target_coords["Longitude"]
        dy = turbine_coords["Latitude"] - target_coords["Latitude"]
        distance = 111.32 * sqrt(dx**2 + dy**2) * 1000  # Convert to meters
        
        # Jensen wake model
        D_wake = D_rotor + 2 * k * distance
        deficit = (1 - sqrt(1 - (D_rotor/D_wake)**2)) * wind_speed
        total_deficit += deficit
        
    return min(total_deficit, wind_speed)  # Can't lose more than all wind

def add_wake_metrics(df, turbine_lat_lon, target_turbine="T01", wind_speed_col='ERA5_wind_speed_100m'):
    """
    Main function to add wake metrics to DataFrame
    """
    df = df.copy()
    
    # 1. Identify upstream turbines
    upstream_mask = identify_upstream_turbines(df, turbine_lat_lon, target_turbine)
    df = pd.concat([df, upstream_mask], axis=1)
    
    # 2. Calculate wake deficit (Jensen model)
    df['T01_wake_deficit'] = df.apply(
        lambda row: jensen_wake_deficit(
            row[wind_speed_col],
            row[[c for c in upstream_mask.columns]],
            turbine_lat_lon,
            target_turbine
        ),
        axis=1
    )
    
    # 3. Calculate available wind speed
    df['T01_effective_ws'] = df[wind_speed_col] - df['T01_wake_deficit']
    
    
    
    return df

In [25]:
def calculate_relative_angle(wind_dir, turbine_dir):
    """Calculate smallest angle between wind and turbine direction"""
    angle_diff = abs(wind_dir - turbine_dir) % 360
    return min(angle_diff, 360 - angle_diff)

def haversine_distance(lat1, lon1, lat2, lon2):
    """Compute distance in meters between two lat/lon points using Haversine formula"""
    R = 6371000  # Earth radius in meters
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = radians(lat2 - lat1)
    dlambda = radians(lon2 - lon1)
    
    a = sin(dphi / 2)**2 + cos(phi1) * cos(phi2) * sin(dlambda / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

def identify_upstream_turbines(df, turbine_lat_lon, target_turbine="T01", wind_dir_col='ERA5_wind_direction_100m', wake_half_angle=15):
    """
    Create mask of turbines upstream of target within wake cone
    """
    target_lat = turbine_lat_lon[target_turbine]["Latitude"]
    target_lon = turbine_lat_lon[target_turbine]["Longitude"]
    
    upstream_mask = pd.DataFrame(index=df.index)
    
    for tid, coords in turbine_lat_lon.items():
        if tid == target_turbine:
            continue

        # Direction from target to this turbine (for alignment)
        dx = coords["Longitude"] - target_lon
        dy = coords["Latitude"] - target_lat
        turbine_dir = degrees(atan2(dy, dx)) % 360
        
        # Calculate angular difference between wind direction and this turbine direction
        angle_diffs = df[wind_dir_col].apply(
            lambda wd: calculate_relative_angle(wd, turbine_dir)
        )
        
        # Turbine is upstream only if it's within the wake cone
        upstream_mask[f'upstream_{tid}'] = angle_diffs <= wake_half_angle
        
    return upstream_mask

def jensen_wake_deficit(wind_speed, upstream_mask_row, turbine_lat_lon, target_turbine="T01", k=0.04, default_D_rotor=82):
    """
    Calculate combined wake deficit from upstream turbines using quadratic sum
    """
    target_coords = turbine_lat_lon[target_turbine]
    target_lat = target_coords["Latitude"]
    target_lon = target_coords["Longitude"]
    
    total_deficit_squared = 0
    
    for colname, is_upstream in upstream_mask_row.items():
        if not is_upstream or not colname.startswith('upstream_'):
            continue
            
        tid = colname.replace('upstream_', '')
        turbine_coords = turbine_lat_lon[tid]
        rotor_diameter = turbine_coords.get("Rotor_diameter", default_D_rotor)
        
        # Distance between turbines
        dist = haversine_distance(
            turbine_coords["Latitude"], turbine_coords["Longitude"],
            target_lat, target_lon
        )
        
        # Jensen model wake radius at downstream distance
        D_wake = rotor_diameter + 2 * k * dist
        
        # Wake deficit from this turbine
        if D_wake > 0:
            deficit = wind_speed * (1 - sqrt(1 - (rotor_diameter / D_wake) ** 2))
            total_deficit_squared += deficit ** 2

    total_deficit = sqrt(total_deficit_squared)
    return min(total_deficit, wind_speed)

def add_wake_metrics(df, turbine_lat_lon, target_turbine="T01", wind_speed_col='ERA5_wind_speed_100m', wind_dir_col='ERA5_wind_direction_100m'):
    """
    Main function to compute and add wake metrics to DataFrame
    """
    df = df.copy()
    
    # Step 1: Identify upstream turbines within wake cone
    upstream_mask = identify_upstream_turbines(df, turbine_lat_lon, target_turbine, wind_dir_col)
    df = pd.concat([df, upstream_mask], axis=1)
    
    # Step 2: Compute Jensen wake deficit
    upstream_cols = [c for c in upstream_mask.columns]
    df[f'{target_turbine}_wake_deficit'] = df.apply(
        lambda row: jensen_wake_deficit(
            row[wind_speed_col],
            row[upstream_cols],
            turbine_lat_lon,
            target_turbine
        ),
        axis=1
    )
    
    # Step 3: Compute effective wind speed
    df[f'{target_turbine}_effective_ws'] = df[wind_speed_col] - df[f'{target_turbine}_wake_deficit']
    
    return df

In [26]:
input_df= add_wake_metrics(input_df, turbine_data, target_turbine="T01")
test_df= add_wake_metrics(test_df, turbine_data, target_turbine="T01")

In [27]:
input_df = lagged_valid(input_df)
test_df= lagged_valid(test_df)

In [28]:
from math import radians, sin, cos, sqrt, asin

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance (in meters) between two points 
    on the Earth's surface given their latitude/longitude in degrees.
    """
    # Convert degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    
    # Haversine formula
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    
    # Earth radius in meters (mean radius = 6371 km)
    r = 6371 * 1000 
    return c * r

In [29]:
#input_df['weighted_wake_proximity'] = input_df.apply(
#    lambda row: weighted_wake_proximity(row, turbine_data),
#    axis=1
#)
#test_df['weighted_wake_proximity'] = test_df.apply(
#    lambda row: weighted_wake_proximity(row, turbine_data),
#    axis=1
#)

In [30]:
input_df = add_lagged_wind_speeds(input_df)
test_df= add_lagged_wind_speeds(test_df)

In [31]:
#input_df = day_feature_adder(input_df)
#test_df= day_feature_adder(test_df)

In [32]:
#input_df = add_trig_transforms(input_df)
#test_df= add_trig_transforms(test_df)

In [33]:
#input_df = filter_wake_affected_turbines(input_df, turbine_data)
#test_df = filter_wake_affected_turbines(test_df, turbine_data)

In [34]:
#input_df = add_lagged_act_power(input_df)
#test_df= add_lagged_act_power(test_df)

In [35]:
input_df.head()

Unnamed: 0,TimeStamp_StartFormat,wtc_AcWindSp_mean;1,wtc_AcWindSp_mean;2,wtc_AcWindSp_mean;3,wtc_AcWindSp_mean;4,wtc_AcWindSp_mean;5,wtc_AcWindSp_mean;7,wtc_AcWindSp_min;1,wtc_AcWindSp_min;2,wtc_AcWindSp_min;3,wtc_AcWindSp_min;4,wtc_AcWindSp_min;5,wtc_AcWindSp_min;7,wtc_AcWindSp_max;1,wtc_AcWindSp_max;2,wtc_AcWindSp_max;3,wtc_AcWindSp_max;4,wtc_AcWindSp_max;5,wtc_AcWindSp_max;7,wtc_AcWindSp_stddev;1,wtc_AcWindSp_stddev;2,wtc_AcWindSp_stddev;3,wtc_AcWindSp_stddev;4,wtc_AcWindSp_stddev;5,wtc_AcWindSp_stddev;7,wtc_ScYawPos_mean;1,wtc_ScYawPos_mean;2,wtc_ScYawPos_mean;3,wtc_ScYawPos_mean;4,wtc_ScYawPos_mean;5,wtc_ScYawPos_mean;7,wtc_ScYawPos_min;1,wtc_ScYawPos_min;2,wtc_ScYawPos_min;3,wtc_ScYawPos_min;4,wtc_ScYawPos_min;5,wtc_ScYawPos_min;7,wtc_ScYawPos_max;1,wtc_ScYawPos_max;2,wtc_ScYawPos_max;3,wtc_ScYawPos_max;4,wtc_ScYawPos_max;5,wtc_ScYawPos_max;7,wtc_ScYawPos_stddev;1,wtc_ScYawPos_stddev;2,wtc_ScYawPos_stddev;3,wtc_ScYawPos_stddev;4,wtc_ScYawPos_stddev;5,wtc_ScYawPos_stddev;7,wtc_NacelPos_mean;1,wtc_NacelPos_mean;2,wtc_NacelPos_mean;3,wtc_NacelPos_mean;4,wtc_NacelPos_mean;5,wtc_NacelPos_mean;7,wtc_NacelPos_min;1,wtc_NacelPos_min;2,wtc_NacelPos_min;3,wtc_NacelPos_min;4,wtc_NacelPos_min;5,wtc_NacelPos_min;7,wtc_NacelPos_max;1,wtc_NacelPos_max;2,wtc_NacelPos_max;3,wtc_NacelPos_max;4,wtc_NacelPos_max;5,wtc_NacelPos_max;7,wtc_GenRpm_mean;1,wtc_GenRpm_mean;2,wtc_GenRpm_mean;3,wtc_GenRpm_mean;4,wtc_GenRpm_mean;5,wtc_GenRpm_mean;7,wtc_GenRpm_min;1,wtc_GenRpm_min;2,wtc_GenRpm_min;3,wtc_GenRpm_min;4,wtc_GenRpm_min;5,wtc_GenRpm_min;7,wtc_GenRpm_max;1,wtc_GenRpm_max;2,wtc_GenRpm_max;3,wtc_GenRpm_max;4,wtc_GenRpm_max;5,wtc_GenRpm_max;7,wtc_GenRpm_stddev;1,wtc_GenRpm_stddev;2,wtc_GenRpm_stddev;3,wtc_GenRpm_stddev;4,wtc_GenRpm_stddev;5,wtc_GenRpm_stddev;7,wtc_PitcPosA_mean;1,wtc_PitcPosA_mean;2,wtc_PitcPosA_mean;3,wtc_PitcPosA_mean;4,wtc_PitcPosA_mean;5,wtc_PitcPosA_mean;7,wtc_PitcPosA_min;1,wtc_PitcPosA_min;2,wtc_PitcPosA_min;3,wtc_PitcPosA_min;4,wtc_PitcPosA_min;5,wtc_PitcPosA_min;7,wtc_PitcPosA_max;1,wtc_PitcPosA_max;2,wtc_PitcPosA_max;3,wtc_PitcPosA_max;4,wtc_PitcPosA_max;5,wtc_PitcPosA_max;7,wtc_PitcPosA_stddev;1,wtc_PitcPosA_stddev;2,wtc_PitcPosA_stddev;3,wtc_PitcPosA_stddev;4,wtc_PitcPosA_stddev;5,wtc_PitcPosA_stddev;7,wtc_PitcPosB_mean;1,wtc_PitcPosB_mean;2,wtc_PitcPosB_mean;3,wtc_PitcPosB_mean;4,wtc_PitcPosB_mean;5,wtc_PitcPosB_mean;7,wtc_PitcPosC_mean;1,wtc_PitcPosC_mean;2,wtc_PitcPosC_mean;3,wtc_PitcPosC_mean;4,wtc_PitcPosC_mean;5,wtc_PitcPosC_mean;7,wtc_PowerRef_endvalue;1,wtc_PowerRef_endvalue;2,wtc_PowerRef_endvalue;3,wtc_PowerRef_endvalue;4,wtc_PowerRef_endvalue;5,wtc_PowerRef_endvalue;7,wtc_ScReToOp_timeon;1,wtc_ScReToOp_timeon;2,wtc_ScReToOp_timeon;3,wtc_ScReToOp_timeon;4,wtc_ScReToOp_timeon;5,wtc_ScReToOp_timeon;7,wtc_ActPower_mean;1,wtc_ActPower_mean;2,wtc_ActPower_mean;3,wtc_ActPower_mean;4,wtc_ActPower_mean;5,wtc_ActPower_mean;7,wtc_ActPower_min;1,wtc_ActPower_min;2,wtc_ActPower_min;3,wtc_ActPower_min;4,wtc_ActPower_min;5,wtc_ActPower_min;7,wtc_ActPower_max;1,wtc_ActPower_max;2,wtc_ActPower_max;3,wtc_ActPower_max;4,wtc_ActPower_max;5,wtc_ActPower_max;7,wtc_ActPower_stddev;1,wtc_ActPower_stddev;2,wtc_ActPower_stddev;3,wtc_ActPower_stddev;4,wtc_ActPower_stddev;5,wtc_ActPower_stddev;7,wtc_AmbieTmp_mean;1,wtc_AmbieTmp_mean;2,wtc_AmbieTmp_mean;3,wtc_AmbieTmp_mean;4,wtc_AmbieTmp_mean;5,wtc_AmbieTmp_mean;7,ShutdownDuration;1,ShutdownDuration;2,ShutdownDuration;3,ShutdownDuration;4,ShutdownDuration;5,ShutdownDuration;7,ERA5_temperature_2m,ERA5_relative_humidity_2m,ERA5_dew_point_2m,ERA5_precipitation,ERA5_surface_pressure,ERA5_cloud_cover,ERA5_wind_speed_10m,ERA5_wind_speed_100m,ERA5_wind_direction_10m,ERA5_wind_direction_100m,ERA5_wind_gusts_10m,id,is_valid,target,hour,upstream_T02,upstream_T03,upstream_T04,upstream_T05,upstream_T06,upstream_T07,upstream_T08,upstream_T09,upstream_T10,upstream_T11,upstream_T12,upstream_T13,upstream_T14,upstream_T15,upstream_T16,upstream_T17,upstream_T18,upstream_T19,upstream_T20,upstream_T21,T01_wake_deficit,T01_effective_ws,is_valid_lag30min,is_valid_lag1hr,wtc_ActPower_mean;2_lag30min,wtc_ActPower_mean;2_lag1hr,wtc_ActPower_mean;3_lag30min,wtc_ActPower_mean;3_lag1hr,wtc_ActPower_mean;4_lag30min,wtc_ActPower_mean;4_lag1hr,wtc_ActPower_mean;5_lag30min,wtc_ActPower_mean;5_lag1hr,wtc_ActPower_mean;7_lag30min,wtc_ActPower_mean;7_lag1hr
0,2016-01-01 00:00:00+00:00,6.085917,5.830675,6.613091,6.99851,7.645727,7.04079,2.0,1.1,1.6,2.4,4.3,3.9,10.0,10.0,11.0,10.6,11.8,10.5,1.484347,1.586648,1.571053,1.228338,1.09691,1.317204,83.754387,69.120506,-122.800903,-113.638397,-118.159103,-106.894501,79.5,66.400002,-124.400002,-114.800003,-120.5,-108.599998,86.800003,72.300003,-119.900002,-109.599998,-113.199997,-104.900002,2.266932,1.913408,1.426679,2.130993,1.8788,1.008626,83.751183,69.122231,237.192902,246.365005,241.846603,253.104004,79.5,66.400002,235.600006,245.199997,239.5,251.399994,86.800003,72.300003,240.100006,250.399994,246.800003,255.100006,941.06897,914.643799,1043.970947,1131.11499,1114.666016,1131.890991,649.900024,612.700012,730.099976,940.900024,935.400024,868.900024,1273.400024,1332.099976,1406.0,1535.599976,1391.800049,1430.800049,188.158005,211.196304,177.679596,129.183502,91.292953,128.462006,-0.974687,-0.973245,-0.937077,-0.966635,-0.927565,-0.971865,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-0.4,-0.4,-0.4,-0.4,-0.4,-0.4,0.517582,0.514683,0.522298,0.496113,0.497144,0.528061,-0.96637,-1.008135,-0.982902,-0.934692,-0.996075,-0.976442,-0.99509,-0.934227,-0.972555,-0.947758,-0.975915,-0.964212,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,352.522308,361.848511,468.959015,565.597473,533.635376,566.248108,78.0,72.0,180.0,309.0,287.0,248.0,760.0,869.0,974.0,1416.0,949.0,1018.0,187.078903,215.998001,213.243896,198.349503,127.434898,178.835403,0.0,0.827133,0.3154,0.0,0.0,0.0,0,0,0,0,0,0,1.792,73.957634,-2.358,0.0,976.166321,0.0,6.080296,10.040418,233.695404,236.784729,13.0,-210384,True,352.522308,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,10.040418,,,,,,,,,,,,
1,2016-01-01 00:10:00+00:00,5.824693,5.810768,5.895642,6.5219,6.413868,6.599652,2.1,2.2,1.9,2.2,2.3,2.3,9.6,10.2,8.8,12.5,10.9,11.0,1.141584,1.547302,1.293487,1.441869,1.438043,1.334877,88.496246,75.914597,-115.395798,-110.868401,-113.227501,-102.6987,84.400002,72.300003,-123.900002,-112.5,-116.199997,-108.699997,91.400002,81.400002,-113.5,-109.599998,-112.400002,-101.0,1.879867,2.106318,2.478934,1.365305,1.014673,2.78289,88.491821,75.907959,244.613007,249.131195,246.777298,257.305389,84.400002,72.300003,236.100006,247.5,243.800003,251.300003,91.400002,81.400002,246.5,250.399994,247.600006,259.0,878.17627,891.244385,877.905701,1068.406006,901.705383,1027.93103,691.099976,641.400024,668.200012,780.799988,667.400024,684.5,1142.199951,1382.900024,1157.400024,1478.699951,1270.099976,1248.5,128.342499,172.585403,148.661896,162.483597,134.646103,146.772797,-0.976545,-0.973472,-0.943475,-0.968723,-0.927995,-0.967702,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-0.4,-0.4,-0.4,-0.4,-0.4,-0.4,0.514228,0.513102,0.521445,0.498363,0.498069,0.531581,-0.966463,-1.008063,-0.986907,-0.933587,-0.995267,-0.974115,-0.995205,-0.934798,-0.97826,-0.948613,-0.97302,-0.963473,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,294.078888,332.010406,308.005707,490.191193,314.5914,443.461914,119.0,103.0,109.0,233.0,109.0,132.0,564.0,948.0,599.0,1144.0,770.0,736.0,104.474998,165.113998,123.467796,211.253098,119.1884,153.566299,0.0,0.520733,0.07045,0.0,0.0,0.0,0,0,0,0,0,0,1.792,86.580147,-0.208,0.0,982.179504,100.0,3.301515,6.958448,178.264328,187.43132,6.9,-210383,True,294.078888,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,6.958448,,,361.848511,,468.959015,,565.597473,,533.635376,,566.248108,
2,2016-01-01 00:20:00+00:00,7.10018,6.386981,7.606015,7.695034,7.812548,7.469052,2.2,2.1,0.0,4.2,4.0,3.4,11.6,12.3,13.6,12.8,11.6,12.8,1.59964,1.661032,2.328598,1.501421,1.426233,1.648712,83.129791,69.414749,-120.673698,-113.834702,-118.3255,-104.277702,81.300003,64.0,-124.699997,-114.599998,-122.199997,-106.400002,87.800003,77.199997,-113.900002,-110.099998,-112.400002,-98.800003,1.907214,2.861577,3.214255,1.593463,3.1479,2.727755,83.128883,69.409897,239.316895,246.165298,241.671005,255.7173,81.300003,64.0,235.300003,245.399994,237.800003,253.600006,87.800003,77.199997,246.100006,249.899994,247.600006,261.200012,1137.56604,1037.644043,1212.978027,1254.245972,1118.692993,1178.525024,752.0,646.099976,754.0,960.5,760.599976,938.299988,1547.199951,1473.300049,1582.0,1564.900024,1456.5,1535.900024,174.083206,234.354401,277.971313,145.854706,153.791,166.3965,-0.968695,-0.978538,-0.932615,-0.967485,-0.928043,-0.97015,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-0.4,-0.4,-0.4,-0.4,-0.4,-0.4,0.516025,0.512991,0.511504,0.495999,0.496092,0.528538,-0.963727,-1.008043,-0.977577,-0.935797,-0.994107,-0.976495,-0.986437,-0.937108,-0.970872,-0.948943,-0.974592,-0.966483,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,587.302795,501.303894,816.978821,771.708191,556.22052,646.179321,182.0,105.0,190.0,333.0,205.0,304.0,1458.0,1132.0,2189.0,1740.0,1062.0,1415.0,243.555801,261.407715,491.376892,298.910797,199.444397,274.355103,0.0,0.4834,0.32405,0.0,0.0,0.0,0,0,0,0,0,0,7.742,61.013905,0.692,0.0,985.24054,78.0,3.478505,4.648656,198.435043,198.824799,7.7,-210382,True,587.302795,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,4.648656,,,332.010406,,308.005707,,490.191193,,314.5914,,443.461914,
3,2016-01-01 00:30:00+00:00,8.23228,7.787337,7.808475,9.624884,9.727865,8.529939,3.2,2.6,2.1,5.0,5.2,3.2,13.3,13.1,13.9,14.0,14.0,14.0,1.740764,1.739926,1.916947,1.526166,1.718514,1.670823,86.001266,72.334389,-116.544098,-111.4431,-115.163002,-101.1875,84.5,67.699997,-121.5,-114.599998,-117.300003,-104.599998,88.699997,74.400002,-113.099998,-107.300003,-113.199997,-98.099998,1.195644,1.684786,2.024229,2.212263,1.539548,1.795094,86.003304,72.337486,243.455795,248.555603,244.838501,258.816406,84.5,67.900002,238.5,245.399994,242.699997,255.399994,88.699997,74.400002,246.899994,252.699997,246.800003,261.899994,1326.420044,1267.133057,1314.844971,1498.531006,1422.848022,1386.286987,1047.699951,935.299988,908.5,1249.699951,1127.300049,1038.800049,1573.699951,1580.199951,1586.300049,1590.800049,1595.699951,1576.699951,132.203201,153.463898,170.619797,64.182907,126.391197,132.565598,-0.963905,-0.973183,-0.934377,-0.936525,-0.990278,-0.966398,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-0.1,-0.4,0.1,-0.2,0.3,-0.4,0.519505,0.512875,0.528237,0.487387,0.484581,0.526115,-0.964855,-1.004715,-0.9804,-0.910523,-1.05604,-0.971028,-0.985332,-0.9364,-0.973827,-0.922092,-1.037765,-0.961358,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,881.533325,799.274719,887.29071,1363.432983,1148.494995,1015.870972,412.0,319.0,235.0,717.0,540.0,406.0,1902.0,2040.0,2161.0,2266.0,2320.0,1953.0,285.7146,281.196808,345.216888,312.305389,381.315704,300.861908,0.0,0.778417,0.6814,0.0,0.042583,0.0,0,0,0,0,0,0,4.442,83.510201,1.892,0.0,983.532532,82.0,3.4,6.774216,180.0,188.488861,6.4,-210381,True,881.533325,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0,6.774216,True,,501.303894,,816.978821,,771.708191,,556.22052,,646.179321,
4,2016-01-01 00:40:00+00:00,9.534926,8.607587,9.26838,10.10559,9.908615,9.968257,4.6,2.7,5.2,4.0,5.6,5.5,17.4,17.200001,14.9,15.4,16.4,16.299999,2.1613,2.41416,1.713971,1.821593,2.207746,2.029599,81.58728,68.50782,-120.969101,-115.278,-120.016899,-105.753601,76.900002,64.900002,-124.300003,-119.0,-124.599998,-109.300003,88.5,72.699997,-113.400002,-112.900002,-114.800003,-102.199997,3.047187,2.998664,2.564816,2.133589,2.696447,1.842842,81.590797,68.504997,239.033997,244.722397,239.984695,254.243896,76.900002,64.900002,235.699997,241.0,235.399994,250.699997,88.5,72.699997,246.600006,247.100006,245.199997,257.799988,1436.891968,1385.954956,1458.185059,1511.962036,1404.344971,1494.881958,1059.699951,1022.0,1174.400024,1380.400024,1051.400024,1262.300049,1622.800049,1598.5,1600.699951,1605.099976,1618.400024,1607.5,132.491196,161.398102,89.934998,51.752781,147.227798,68.845284,-0.891823,-0.589367,-0.894888,-0.649507,-0.582958,-0.600953,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,8.1,7.8,4.3,7.3,10.9,8.0,1.093657,1.649069,0.705567,1.422941,1.70644,1.589473,-0.897377,-0.615583,-0.936103,-0.622947,-0.643735,-0.603192,-0.927097,-0.553957,-0.931835,-0.642977,-0.630328,-0.597603,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,600.0,600.0,600.0,600.0,600.0,600.0,1299.646973,1163.564941,1257.227051,1541.956055,1192.567017,1460.63501,437.0,414.0,609.0,912.0,429.0,745.0,2336.0,2329.0,2327.0,2330.0,2341.0,2342.0,531.022827,523.927002,396.175812,444.513397,508.840607,463.062988,0.0,0.94675,0.73675,0.0,0.009517,0.0,0,0,0,0,0,0,5.142,83.003822,2.492,0.6,976.71875,99.0,3.006659,4.401136,356.186005,358.698059,9.2,-210380,True,1299.646973,0,False,False,False,True,True,False,False,False,False,False,False,False,False,True,True,True,True,True,False,True,1.412466,2.98867,True,,799.274719,,887.29071,,1363.432983,,1148.494995,,1015.870972,


In [36]:
y = input_df["target"]


In [37]:
#input_df.replace([np.inf, -np.inf], np.nan, inplace=True)
#test_df = test_df.loc[:, ~test_df.columns.str.endswith("4")|~test_df.columns.str.endswith("5")|~test_df.columns.str.endswith("7")]
X_raw = input_df[test_df.columns].fillna(0).drop(["TimeStamp_StartFormat"],axis=1)
scaler = StandardScaler()
model = scaler.fit(X_raw)
X_scaled = model.transform(X_raw)

test_raw = test_df.fillna(0).drop(["TimeStamp_StartFormat"],axis=1)
#scaler = StandardScaler()
X_test = model.transform(test_raw)

In [38]:
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.25, random_state=123
)


In [39]:
train_data = pd.DataFrame(X_train,columns = X_raw.columns).copy()
train_data['target'] = y_train

In [40]:
mask = np.isfinite(y_train)

# Step 2: Filter X_train and y_train accordingly
X_train_clean = X_train[mask]
y_train_clean = y_train[mask]

In [41]:
X_raw = input_df[test_df.columns].fillna(0).drop(["TimeStamp_StartFormat"],axis=1)
X_raw["target"] = input_df["target"]
test_raw = test_df.fillna(0).drop(["TimeStamp_StartFormat"],axis=1)


In [43]:
train_data = X_raw.copy()#pd.DataFrame(X_train_clean,columns = X_raw.columns).copy()
#train_data['target'] = y_train_clean
train_data = train_data[~train_data['target'].isna()]

In [44]:
predictor = TabularPredictor(
    label='target',
    problem_type='regression',
    eval_metric='mae'
).fit(
    train_data,
	presets="best",
    time_limit=3600*8
)

# Predict on validation set
#val_pred = predictor.predict(val_data)
#val_mae = mean_absolute_error(val_data["target"], val_pred)
#print(f"Validation MAE: {val_mae:.4f}")

# Predict on test set
test_pred = predictor.predict(test_raw)

# Save predictions
np.savetxt("predictions_autogluon_xgb.csv", test_pred, delimiter=",")

No path specified. Models will be saved in: "AutogluonModels\ag-20250718_170627"
Preset alias specified: 'best' maps to 'best_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          12
Memory Avail:       20.20 GB / 31.86 GB (63.4%)
Disk Space Avail:   1566.28 GB / 1863.01 GB (84.1%)
Presets specified: ['best']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout 

[36m(_ray_fit pid=3504)[0m [1000]	valid_set's l1: 51.2086
[36m(_ray_fit pid=3504)[0m [2000]	valid_set's l1: 49.7179[32m [repeated 8x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=9384)[0m [3000]	valid_set's l1: 49.4093[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=9384)[0m [4000]	valid_set's l1: 48.9148[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=9384)[0m [5000]	valid_set's l1: 48.5847[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=6016)[0m [5000]	valid_set's l1: 48.1544[32m [repeated 5x across cluster][0m
[36m(_ray_fit pid=9384)[0m [6000]	valid_set's l1: 48.3474[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=15288)[0m [7000]	valid_set's l1: 48.0816[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=11960)[0m [7000]	val

[36m(_dystack pid=20068)[0m 	-47.6452	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=20068)[0m 	1085.36s	 = Training   runtime
[36m(_dystack pid=20068)[0m 	644.92s	 = Validation runtime
[36m(_dystack pid=20068)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 3428.19s of the 5823.87s of remaining time.
[36m(_dystack pid=20068)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=7.66%)


[36m(_ray_fit pid=21024)[0m [1000]	valid_set's l1: 50.6881[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=1804)[0m [2000]	valid_set's l1: 49.1929[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=1804)[0m [3000]	valid_set's l1: 48.5963[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=21024)[0m [4000]	valid_set's l1: 48.5698[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=20192)[0m [5000]	valid_set's l1: 48.9066[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=20192)[0m [6000]	valid_set's l1: 48.7338[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=20192)[0m [7000]	valid_set's l1: 48.6089[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=20192)[0m [8000]	valid_set's l1: 48.5323[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=8820)[0m [8000]	valid_set's l1: 48.0539[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=20192)[0m [9000]	valid_set's l1: 48.4751
[36m(_ray_fit pid=5652)[0m [9000]	valid_set's l1:

[36m(_dystack pid=20068)[0m 	-48.0518	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=20068)[0m 	1156.9s	 = Training   runtime
[36m(_dystack pid=20068)[0m 	641.62s	 = Validation runtime
[36m(_dystack pid=20068)[0m Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 2182.98s of the 4578.65s of remaining time.
[36m(_dystack pid=20068)[0m 	-54.1846	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=20068)[0m 	1610.45s	 = Training   runtime
[36m(_dystack pid=20068)[0m 	21.97s	 = Validation runtime
[36m(_dystack pid=20068)[0m Fitting model: CatBoost_BAG_L1 ... Training model for up to 548.77s of the 2944.45s of remaining time.
[36m(_dystack pid=20068)[0m 	Memory not enough to fit 8 folds in parallel. Will train 4 folds in parallel instead (Estimated 10.61% memory usage per fold, 42.45%/80.00% total).
[36m(_dystack pid=20068)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: 

[36m(_ray_fit pid=15108)[0m [1000]	valid_set's l1: 47.7392
[36m(_ray_fit pid=16792)[0m [1000]	valid_set's l1: 46.7568
[36m(_ray_fit pid=16792)[0m [2000]	valid_set's l1: 46.587[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=14888)[0m [3000]	valid_set's l1: 46.6253[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=15108)[0m [4000]	valid_set's l1: 47.5124[32m [repeated 5x across cluster][0m
[36m(_ray_fit pid=15108)[0m [5000]	valid_set's l1: 47.5393[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=11516)[0m [6000]	valid_set's l1: 46.9307[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=11516)[0m [7000]	valid_set's l1: 46.9616


[36m(_dystack pid=20068)[0m 	-46.7345	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=20068)[0m 	588.47s	 = Training   runtime
[36m(_dystack pid=20068)[0m 	36.68s	 = Validation runtime
[36m(_dystack pid=20068)[0m Fitting model: LightGBM_BAG_L2 ... Training model for up to 1447.33s of the 1447.25s of remaining time.
[36m(_dystack pid=20068)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=6.96%)
[36m(_dystack pid=20068)[0m 	-46.3063	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=20068)[0m 	58.35s	 = Training   runtime
[36m(_dystack pid=20068)[0m 	2.8s	 = Validation runtime
[36m(_dystack pid=20068)[0m Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 1382.45s of the 1382.37s of remaining time.
[36m(_dystack pid=20068)[0m 	-46.5219	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=20068)[0m 	1869.78s	 = Training   runtime
[36m(

[33m(raylet)[0m The node with node id: f340e74e9fd7e108cd26b94ce37c9d5953c8debd34567d9315c4d6d1 and address: 127.0.0.1 and node name: 127.0.0.1 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a 	(1) raylet crashes unexpectedly (OOM, etc.) 
	(2) raylet has lagging heartbeats due to slow network or busy workload.


In [45]:

# Prepare data as DataFrames with column names
train_data = X_raw.copy()#pd.DataFrame(X_train_clean,columns = X_raw.columns).copy()
#train_data['target'] = y_train_clean
train_data = train_data[~train_data['target'].isna()]
#val_data = pd.DataFrame(X_val,columns = X_raw.columns).copy()
#val_data['target'] = y_val
#val_data = val_data[~val_data['target'].isna()]
# Initialize predictor
predictor = TabularPredictor(
    label='target',
    problem_type='regression',
    eval_metric='mae'
).fit(
    train_data=train_data,
    #tuning_data=val_data,
    presets='experimental',
    #bagging=False,           # ✅ No Ray
    #num_bag_folds=0,         # ✅ Double confirm
    #use_bag_holdout=True,
    time_limit=3600*8,
    num_bag_folds=11, # to avoid raylets
    num_stack_levels=2 # when using bags is 2
     # Optional: increase if you want more output
)

# Predict on validation set
#val_pred = predictor.predict(val_data)
#val_mae = mean_absolute_error(val_data["target"], val_pred)
#print(f"Validation MAE: {val_mae:.4f}")

# Predict on test set
test_pred = predictor.predict(test_raw)

# Save predictions
np.savetxt("predictions_autogluon_xgb.csv", test_pred, delimiter=",")


No path specified. Models will be saved in: "AutogluonModels\ag-20250719_021226"
Preset alias specified: 'experimental' maps to 'experimental_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          12
Memory Avail:       20.07 GB / 31.86 GB (63.0%)
Disk Space Avail:   1563.23 GB / 1863.01 GB (83.9%)
Presets specified: ['experimental']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=2, num_bag_folds=11, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets o

LocalRayletDiedError: The task's local raylet died. Check raylet.out for more information.

In [None]:
input_df.head()

In [None]:
print(predictor.leaderboard(silent=True))

In [None]:
predictor.feature_metadata.get_features()

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Load data
#train_data = TabularDataset('train.csv')
#test_data = TabularDataset('test.csv')

# Train AutoGluon (automates everything)
predictor = TabularPredictor(label='target', problem_type='regression').fit(
    train_data,
    time_limit=3600,
    presets='best_quality'
)

# Predict and evaluate
predictions = predictor.predict(test_data)
leaderboard = predictor.leaderboard(test_data)
print(leaderboard)

In [None]:
xgb_model = xgb.XGBRegressor(
    objective='reg:absoluteerror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=12
    ,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1
)

# Train the model
xgb_model.fit(
    X_train, 
    y_train,
    eval_set=[(X_val, y_val)],
    
    verbose=True
)

# Evaluate on validation set
val_pred = xgb_model.predict(X_val)
val_mae = mean_absolute_error(y_val, val_pred)
print(f"Validation MAE: {val_mae:.4f}")


# Make predictions
test_pred = xgb_model.predict(X_test)

# Save predictions
np.savetxt("predictions_xgboost.csv", test_pred, delimiter=",")

In [None]:
#best 50.14 depth 12

In [None]:

# Train basic model
model = xgb.XGBRegressor().fit(X_train, y_train)

# Visualize feature importance
xgb.plot_importance(model, max_num_features=20)
plt.show()

In [None]:
from sklearn.feature_selection import mutual_info_regression

def evaluate_features(X, y):
    """Calculate mutual information scores"""
    mi_scores = mutual_info_regression(X, y)
    return pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

# Usage:
mi_scores = evaluate_features(X_train, y_train)
print(mi_scores.head(20)) 