In [1]:
import sys
from pathlib import Path

# The project root is taken to be two levels above the current working directory.
# Presumably this is what makes the later `windfarm_forecast` import resolve — TODO confirm.
sys.path.append(str(Path.cwd().parent.parent))  # Add project root to path

## Load config

In [2]:
# load config yaml
import yaml

# Read the YAML config with an explicit encoding so the result does not depend on the
# platform's default text encoding. safe_load only builds plain Python objects.
with open("../../config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [3]:
config

{'preprocessing': {'impute_power_when_turbine_stopped': {'enabled': True,
   'method': 'most_similar_turbine'}}}

## Load data

In [4]:
import pandas as pd

# load train and validation data
# Both splits are read from the "*_preprocessed" parquet files under data/modified;
# the paths are relative to the notebook's working directory.
df_train = pd.read_parquet("../../data/modified/sdwpf_train_214days_v1_preprocessed.parquet")
df_validation = pd.read_parquet("../../data/modified/sdwpf_validation_31days_v1_preprocessed.parquet")

## Identify most similar turbines

In [5]:
# for each turbine, calculate the pearson correlation coefficient of Patv with all the other turbines


def get_top_similar_turbines(
    df: pd.DataFrame,
    turbine_col: str = "TurbID",
    timestamp_col: str = "timestamp",
    value_col: str = "Patv",
    n_top: int = 10,
) -> dict:
    """
    Calculate top N most similar turbines for each turbine based on Pearson correlation.

    Turbines whose correlation with the reference turbine is undefined (NaN, e.g. a
    constant series or no overlapping observations) are never reported as similar, so a
    turbine may end up with fewer than ``n_top`` entries.

    Args:
        df: DataFrame containing turbine data in long format (one row per
            timestamp/turbine pair)
        turbine_col: Name of the column containing turbine IDs
        timestamp_col: Name of the column containing timestamps
        value_col: Name of the column containing values to correlate
        n_top: Maximum number of similar turbines to return per turbine (default: 10)

    Returns:
        Dictionary with structure:
        {
            turbine_id: {
                'most_similar_turbine_1': (turbine_id, correlation),
                'most_similar_turbine_2': (turbine_id, correlation),
                ...
                'most_similar_turbine_<n_top>': (turbine_id, correlation)
            },
            ...
        }
        Entries are ordered from highest to lowest correlation.

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` if a (timestamp, turbine) pair occurs
            more than once in ``df``.
    """
    # Pivot the data to get turbines as columns and timestamp as index
    pivot_df = df.pivot(index=timestamp_col, columns=turbine_col, values=value_col)

    # Calculate correlation matrix between all turbines
    corr_matrix = pivot_df.corr(method="pearson")

    most_similar_turbines = {}

    for turbine in corr_matrix.index:
        # Exclude the self-correlation and any undefined (NaN) correlations. Without the
        # dropna, NaN entries sort last and would still be picked up by head(n_top)
        # whenever fewer than n_top valid peers exist.
        correlations = corr_matrix[turbine].drop(turbine).dropna()

        # Sort correlations in descending order and get top N
        top_correlations = correlations.sort_values(ascending=False).head(n_top)

        # Keys are 1-based ranks: most_similar_turbine_1 is the best match
        most_similar_turbines[turbine] = {
            f"most_similar_turbine_{rank}": (similar_turbine, corr_value)
            for rank, (similar_turbine, corr_value) in enumerate(top_correlations.items(), 1)
        }

    return most_similar_turbines


def convert_similarities_to_df(similarities_dict: dict) -> pd.DataFrame:
    """
    Convert the similarities dictionary to a DataFrame in long format.

    Args:
        similarities_dict: Dictionary output from get_top_similar_turbines(), i.e.
            {turbine_id: {key: (similar_turbine_id, correlation), ...}, ...}

    Returns:
        DataFrame with columns [turbine_id, rank, similar_turbine_id, correlation].
        ``rank`` is the 1-based position of the entry within each turbine's dict.
        The columns are present even when ``similarities_dict`` is empty.
    """
    columns = ["turbine_id", "rank", "similar_turbine_id", "correlation"]

    # One record per (turbine, rank) pair; rank follows the dict's insertion order.
    rows = [
        (turbine_id, rank, similar_turbine, correlation)
        for turbine_id, similar_turbines in similarities_dict.items()
        for rank, (similar_turbine, correlation) in enumerate(similar_turbines.values(), 1)
    ]

    # Passing the column names explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

In [6]:
# get top 10 most similar turbines for each turbine
# (computed on the training split only; n_top is left at its default of 10)
similar_turbines = get_top_similar_turbines(df_train)

# convert to dataframe (long format: one row per turbine / rank pair)
similar_turbines_df = convert_similarities_to_df(similar_turbines)

# save similar turbines df to parquet
similar_turbines_df.to_parquet("../../data/modified/similar_turbines_df_train.parquet")

In [7]:
# Inspect the training rows flagged with impute_day_patv == 1 before the imputation step runs
df_train[df_train["impute_day_patv"] == 1]

Unnamed: 0,TurbID,Day,time,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv,timestamp,maintenance_day,turbine_stopped,turbine_at_rest,impute_day_patv
8640,1,61,00:00,11.84,2.54,28.33,38.59,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:00:00,0,1,0,1
8641,1,61,00:10,11.44,-0.38,28.39,38.44,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:10:00,0,1,0,1
8642,1,61,00:20,10.80,1.33,28.30,38.26,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:20:00,0,1,0,1
8643,1,61,00:30,13.35,4.67,28.06,38.16,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:30:00,0,1,0,1
8644,1,61,00:40,12.98,6.35,27.86,38.06,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:40:00,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4644859,132,161,23:10,0.87,-53.21,17.40,18.60,449.42,89.77,89.77,0.0,0.0,,2020-10-08 23:10:00,0,0,1,1
4644860,132,161,23:20,0.76,-51.92,17.40,18.48,449.42,89.77,89.77,0.0,0.0,,2020-10-08 23:20:00,0,0,1,1
4644861,132,161,23:30,0.94,-53.77,17.36,18.40,449.42,89.77,89.77,0.0,0.0,,2020-10-08 23:30:00,0,0,1,1
4644862,132,161,23:40,0.98,-52.85,17.20,18.39,449.42,89.77,89.77,0.0,0.0,,2020-10-08 23:40:00,0,0,1,1


## Impute values

In [8]:
# Impute power output on the train and validation sets, as configured
from windfarm_forecast.feature_engineering import impute_power_output

imputation_cfg = config["preprocessing"]["impute_power_when_turbine_stopped"]

# Column used as the target downstream; stays on the raw power unless imputation runs
target_col = "Patv"

if imputation_cfg["enabled"]:
    # Only one imputation method is supported; fail before touching the data otherwise
    if imputation_cfg["method"] != "most_similar_turbine":
        raise NotImplementedError("The chosen imputation method is not implemented.")

    # Both splits use the similarity table derived from the training data
    df_train = impute_power_output(df_train, similar_turbines_df)
    df_validation = impute_power_output(df_validation, similar_turbines_df)

    target_col = "Patv_imputed"

In [9]:
# Archived: when aggregating to daily level, create a feature for the percentage of samples where power output is NaN despite enough wind, and a feature for the proportion of samples containing NaN values for each day.

In [10]:
# Same filter as before the imputation step, to inspect the frame returned by impute_power_output
df_train[df_train["impute_day_patv"] == 1]

Unnamed: 0,TurbID,Day,time,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv,timestamp,maintenance_day,turbine_stopped,turbine_at_rest,impute_day_patv,Patv_imputed
8640,1,61,00:00,11.84,2.54,28.33,38.59,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:00:00,0,1,0,1,1400.65
8641,1,61,00:10,11.44,-0.38,28.39,38.44,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:10:00,0,1,0,1,1502.23
8642,1,61,00:20,10.80,1.33,28.30,38.26,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:20:00,0,1,0,1,1456.32
8643,1,61,00:30,13.35,4.67,28.06,38.16,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:30:00,0,1,0,1,1475.85
8644,1,61,00:40,12.98,6.35,27.86,38.06,219.95,1.90,1.00,1.9,-0.3,,2020-06-30 00:40:00,0,1,0,1,1451.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4060075,132,161,23:10,0.87,-53.21,17.40,18.60,449.42,89.77,89.77,0.0,0.0,,2020-10-08 23:10:00,0,0,1,1,0.00
4060076,132,161,23:20,0.76,-51.92,17.40,18.48,449.42,89.77,89.77,0.0,0.0,,2020-10-08 23:20:00,0,0,1,1,0.00
4060077,132,161,23:30,0.94,-53.77,17.36,18.40,449.42,89.77,89.77,0.0,0.0,,2020-10-08 23:30:00,0,0,1,1,0.00
4060078,132,161,23:40,0.98,-52.85,17.20,18.39,449.42,89.77,89.77,0.0,0.0,,2020-10-08 23:40:00,0,0,1,1,0.00


## Aggregate data on daily level

In [11]:
df_train.head()

Unnamed: 0,TurbID,Day,time,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv,timestamp,maintenance_day,turbine_stopped,turbine_at_rest,impute_day_patv,Patv_imputed
0,1,1,00:00,,,,,,,,,,,2020-05-01 00:00:00,0,0,0,0,
1,1,1,00:10,6.17,-3.99,30.73,41.8,25.92,1.0,1.0,1.0,-0.25,494.66,2020-05-01 00:10:00,0,0,0,0,494.66
2,1,1,00:20,6.27,-2.18,30.6,41.63,20.91,1.0,1.0,1.0,-0.24,509.76,2020-05-01 00:20:00,0,0,0,0,509.76
3,1,1,00:30,6.42,-0.73,30.52,41.52,20.91,1.0,1.0,1.0,-0.26,542.53,2020-05-01 00:30:00,0,0,0,0,542.53
4,1,1,00:40,6.25,0.89,30.49,41.38,20.91,1.0,1.0,1.0,-0.23,509.36,2020-05-01 00:40:00,0,0,0,0,509.36


In [12]:
# aggregate data on daily level
def aggregate_daily(df):
    """
    Collapse 10-minute wind turbine measurements into one row of statistics per day.

    Args:
        df (pd.DataFrame): Input DataFrame containing wind turbine measurements with columns:
            - Day: Day identifier (grouping key)
            - Wspd: Wind speed
            - Wdir: Wind direction
            - Etmp: Temperature column (mean only)
            - Patv: Power output
            - maintenance_day: Maintenance day indicator
            - turbine_stopped: Turbine stopped indicator
            - turbine_at_rest: Turbine at rest indicator

    Returns:
        pd.DataFrame: One row per Day with flat column names of the form
            "<column>_<statistic>":
            - Wspd_mean, Wspd_std, Wdir_mean, Wdir_std
            - Etmp_mean
            - Patv_sum
            - maintenance_day_mean, turbine_stopped_mean, turbine_at_rest_mean
    """
    # Named aggregation yields the flat "<column>_<statistic>" names directly.
    per_day = df.groupby("Day").agg(
        Wspd_mean=("Wspd", "mean"),
        Wspd_std=("Wspd", "std"),
        Wdir_mean=("Wdir", "mean"),
        Wdir_std=("Wdir", "std"),
        Etmp_mean=("Etmp", "mean"),
        Patv_sum=("Patv", "sum"),
        maintenance_day_mean=("maintenance_day", "mean"),
        turbine_stopped_mean=("turbine_stopped", "mean"),
        turbine_at_rest_mean=("turbine_at_rest", "mean"),
    )

    # Move Day from the index back into a regular column
    return per_day.reset_index()


# df_train_daily = aggregate_daily(df_train)
# df_validation_daily = aggregate_daily(df_validation)

# ## Convert kW to MW
# df_train_daily['Patv_sum'] = df_train_daily['Patv_sum'] / 1000
# df_validation_daily['Patv_sum'] = df_validation_daily['Patv_sum'] / 1000

In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4129344 entries, 0 to 4129343
Data columns (total 19 columns):
 #   Column           Dtype         
---  ------           -----         
 0   TurbID           object        
 1   Day              int64         
 2   time             object        
 3   Wspd             float64       
 4   Wdir             float64       
 5   Etmp             float64       
 6   Itmp             float64       
 7   Ndir             float64       
 8   Pab1             float64       
 9   Pab2             float64       
 10  Pab3             float64       
 11  Prtv             float64       
 12  Patv             float64       
 13  timestamp        datetime64[ns]
 14  maintenance_day  int64         
 15  turbine_stopped  int64         
 16  turbine_at_rest  int64         
 17  impute_day_patv  int64         
 18  Patv_imputed     float64       
dtypes: datetime64[ns](1), float64(11), int64(5), object(2)
memory usage: 598.6+ MB


In [14]:
# aggregate data over all turbines
def aggregate_across_turbines(df):
    """
    Aggregates wind farm data across all turbines for each timestamp.

    Args:
        df (pd.DataFrame): Input DataFrame containing wind turbine measurements, one row
            per (timestamp, turbine). The "Patv_imputed" column is optional: it is
            aggregated when present and skipped otherwise, so the function also works
            when the imputation step is disabled.

    Returns:
        pd.DataFrame: One row per timestamp containing:
            - num_turbines: count of non-null TurbID values
            - mean of Day, Wspd, Wdir, Etmp, Itmp, Ndir and of the indicator columns
              (maintenance_day, turbine_stopped, turbine_at_rest, impute_day_patv)
            - time: first value within the timestamp
            - Patv (and Patv_imputed, if present): sum across turbines; NaN values are
              skipped, so a timestamp with no valid reading sums to 0
            - n_active_turbines: number of turbines with power output > 0, based on
              Patv_imputed when available and on Patv otherwise
    """
    aggregations = {
        "TurbID": "count",
        "Day": "mean",
        "time": "first",
        "Wspd": "mean",
        "Wdir": "mean",
        "Etmp": "mean",
        "Itmp": "mean",
        "Ndir": "mean",
        "maintenance_day": "mean",
        "turbine_stopped": "mean",
        "turbine_at_rest": "mean",
        "impute_day_patv": "mean",
        "Patv": "sum",
    }

    # Patv_imputed only exists after the optional imputation step has run.
    has_imputed = "Patv_imputed" in df.columns
    if has_imputed:
        aggregations["Patv_imputed"] = "sum"
    power_col = "Patv_imputed" if has_imputed else "Patv"

    df_agg = df.groupby("timestamp").agg(aggregations).rename(columns={"TurbID": "num_turbines"})

    # Count turbines with positive power output per timestamp. Summing a boolean mask is
    # a vectorized replacement for a per-group Python lambda; NaN compares as False.
    df_agg["n_active_turbines"] = df[power_col].gt(0).groupby(df["timestamp"]).sum()

    return df_agg.reset_index()


# Collapse the per-turbine rows of each split into one row per timestamp
df_train_agg = aggregate_across_turbines(df_train)
df_validation_agg = aggregate_across_turbines(df_validation)

In [15]:
df_train_agg

Unnamed: 0,timestamp,num_turbines,Day,time,Wspd,Wdir,Etmp,Itmp,Ndir,maintenance_day,turbine_stopped,turbine_at_rest,impute_day_patv,Patv,Patv_imputed,n_active_turbines
0,2020-05-01 00:00:00,134,1.0,00:00,,,,,,0.0,0.000000,0.000000,0.007463,0.00,0.00,0
1,2020-05-01 00:10:00,134,1.0,00:10,5.653284,-5.581119,31.184240,36.925522,20.347015,0.0,0.022388,0.029851,0.007463,46766.08,47155.32,131
2,2020-05-01 00:20:00,134,1.0,00:20,5.514104,-5.615970,31.110720,36.851493,19.238955,0.0,0.022388,0.029851,0.007463,44853.70,45217.96,131
3,2020-05-01 00:30:00,134,1.0,00:30,5.641053,-5.070977,31.055403,36.803158,19.474211,0.0,0.022388,0.029851,0.007463,46268.76,46676.80,130
4,2020-05-01 00:40:00,134,1.0,00:40,5.360075,-5.564436,30.956129,36.659474,17.313233,0.0,0.022388,0.029851,0.007463,41189.26,41540.24,130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30811,2020-11-30 23:10:00,134,214.0,23:10,4.860224,1.989701,1.449764,11.251866,252.094701,0.0,0.000000,0.000000,0.000000,43688.69,43688.69,134
30812,2020-11-30 23:20:00,134,214.0,23:20,4.600299,2.525448,1.405433,11.197388,256.603731,0.0,0.000000,0.000000,0.000000,40150.28,40150.28,134
30813,2020-11-30 23:30:00,134,214.0,23:30,4.288134,2.386493,1.315039,11.132687,259.334030,0.0,0.000000,0.000000,0.000000,35560.85,35560.85,134
30814,2020-11-30 23:40:00,134,214.0,23:40,4.205299,1.267388,1.269055,11.099701,261.968209,0.0,0.000000,0.000000,0.000000,34561.02,34561.02,134


## Convert to MW



In [16]:
# Divide the target column by 1000 (presumably kW -> MW, per the section heading).
# NOTE(review): only `target_col` is rescaled. When it is "Patv_imputed", the "Patv"
# column keeps its original scale, so the saved frame mixes units — confirm this is intended.
# NOTE(review): the division overwrites the column in place, so re-running this cell
# without re-running the aggregation cell divides by 1000 a second time.
df_train_agg[target_col] = df_train_agg[target_col] / 1000
df_validation_agg[target_col] = df_validation_agg[target_col] / 1000

In [17]:
df_train_agg

Unnamed: 0,timestamp,num_turbines,Day,time,Wspd,Wdir,Etmp,Itmp,Ndir,maintenance_day,turbine_stopped,turbine_at_rest,impute_day_patv,Patv,Patv_imputed,n_active_turbines
0,2020-05-01 00:00:00,134,1.0,00:00,,,,,,0.0,0.000000,0.000000,0.007463,0.00,0.00000,0
1,2020-05-01 00:10:00,134,1.0,00:10,5.653284,-5.581119,31.184240,36.925522,20.347015,0.0,0.022388,0.029851,0.007463,46766.08,47.15532,131
2,2020-05-01 00:20:00,134,1.0,00:20,5.514104,-5.615970,31.110720,36.851493,19.238955,0.0,0.022388,0.029851,0.007463,44853.70,45.21796,131
3,2020-05-01 00:30:00,134,1.0,00:30,5.641053,-5.070977,31.055403,36.803158,19.474211,0.0,0.022388,0.029851,0.007463,46268.76,46.67680,130
4,2020-05-01 00:40:00,134,1.0,00:40,5.360075,-5.564436,30.956129,36.659474,17.313233,0.0,0.022388,0.029851,0.007463,41189.26,41.54024,130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30811,2020-11-30 23:10:00,134,214.0,23:10,4.860224,1.989701,1.449764,11.251866,252.094701,0.0,0.000000,0.000000,0.000000,43688.69,43.68869,134
30812,2020-11-30 23:20:00,134,214.0,23:20,4.600299,2.525448,1.405433,11.197388,256.603731,0.0,0.000000,0.000000,0.000000,40150.28,40.15028,134
30813,2020-11-30 23:30:00,134,214.0,23:30,4.288134,2.386493,1.315039,11.132687,259.334030,0.0,0.000000,0.000000,0.000000,35560.85,35.56085,134
30814,2020-11-30 23:40:00,134,214.0,23:40,4.205299,1.267388,1.269055,11.099701,261.968209,0.0,0.000000,0.000000,0.000000,34561.02,34.56102,134


In [18]:
# TODO: transform the data to sequences to predict from the last 14 days the next 2 days

In [19]:
# Write the aggregated train and validation frames to data/modified/feature_engineered
save_path = "../../data/modified/feature_engineered/"

# Create the output folder (and any missing parents); a no-op if it already exists
Path(save_path).mkdir(parents=True, exist_ok=True)

for frame, file_name in (
    (df_train_agg, "sdwpf_train_214days_v1_feature_engineered.parquet"),
    (df_validation_agg, "sdwpf_validation_31days_v1_feature_engineered.parquet"),
):
    frame.to_parquet(save_path + file_name)

In [20]:
# TODO: add scaling of features and check the effect on performance

In [21]:
# TODO: improvements:
# Add lag features
# Add rolling statistics
# Add seasonal features (month, day of week, etc.)
# add wind power (wspd ** 3), cyclical encoding of wdir?, rolling statistics of wspd, wdir, etmp, patv