<h1>Build feature and output arrays

In [None]:
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import OneHotEncoder
import joblib

# lists for all data
all_turbine_types = []
all_hub_heights = []
all_capacities = []
all_commissioning_dates = []
all_production_data = []

with open(r"E:\MA_data\WPPs+production+reanalysis\WPPs+production+wind_new.json", "r", encoding="utf-8") as file:
    WPP_production_wind = json.load(file)

# collect data
for wpp in WPP_production_wind:
    all_turbine_types.append(str(wpp["Turbine"]))
    all_hub_heights.append(wpp["Hub_height"])
    all_capacities.append(wpp["Capacity"])
    all_commissioning_dates.append(f"{wpp['Commissioning_date']}/06" if isinstance(wpp["Commissioning_date"], str) and "/" not in wpp["Commissioning_date"] else wpp["Commissioning_date"])
    all_production_data.append(wpp["Production"])

# One-Hot-Encoding for turbine types
encoder = OneHotEncoder(sparse_output=False)
turbine_types_onehot = encoder.fit_transform(np.array(all_turbine_types).reshape(-1, 1))

# convert to datetime
standardised_dates = pd.to_datetime(all_commissioning_dates, format='%Y/%m')

# calculate age
ref_date = pd.Timestamp("2024-12-01")
ages = ref_date.year * 12 + ref_date.month - (standardised_dates.year * 12 + standardised_dates.month)

# create combined features and output lists
combined_features_raw = []
output_raw = []

# convert data in feature arrays
for idx, production_data in enumerate(all_production_data):
    num_rows = len(production_data)

    # repetitions for common features
    turbine_type_repeated = np.tile(turbine_types_onehot[idx], (num_rows, 1))
    hub_height_repeated = np.full((num_rows, 1), all_hub_heights[idx])
    age_repeated = np.full((num_rows, 1), ages[idx])
    capacities_repeated = np.full((num_rows, 1), all_capacities[idx])

    # extract production values and wind speeds
    production_values = np.array([entry[1] for entry in production_data]).reshape(-1, 1) / all_capacities[idx]
    wind_speeds = np.array([entry[2] for entry in production_data]).reshape(-1, 1)

    # combine all features
    combined_chunk = np.hstack((
        turbine_type_repeated,
        hub_height_repeated,
        age_repeated,
        wind_speeds
    ))

    # add the data
    combined_features_raw.append(combined_chunk)
    output_raw.append(production_values)

joblib.dump(encoder, "parameters/encoder.pkl")
joblib.dump(encoder, "parameters_deployment/encoder.pkl")

# combine all data chunks to one array
combined_features_raw = np.vstack(combined_features_raw)
output_raw = np.vstack(output_raw)

# round all values to two decimal places
combined_features_raw = np.round(combined_features_raw, decimals=4)
output_raw = np.round(output_raw, decimals=4)

<h1>Scale feature vector and define Dataset

In [None]:
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset
import torch
import joblib
import numpy as np

# Erstelle eine Maske mit 10.000 zufälligen Indizes
random_indices = np.random.choice(combined_features_raw.shape[0], 10000, replace=False)

combined_features_subset = combined_features_raw.copy()
output_subset = output_raw.copy()

# Extrahiere die entsprechenden Werte aus den Arrays
combined_features_subset = combined_features_subset[random_indices]
output_subset = output_subset[random_indices]

choice = 0
if choice == 0:
    combined_features = combined_features_raw.copy()
    output = output_raw.copy()
elif choice == 1:
    combined_features = combined_features_subset.copy()
    output = output_subset.copy()


# Separate Scaler für jedes Feature
scaler_wind = StandardScaler()
scaler_ages = StandardScaler()
scaler_hub_heights = StandardScaler()

# Skalieren der einzelnen Features
combined_features[:, -1] = scaler_wind.fit_transform(combined_features[:, -1].reshape(-1, 1)).flatten() # scale wind speeds
combined_features[:, -2] = scaler_ages.fit_transform(combined_features[:, -2].reshape(-1, 1)).flatten()  # scale ages
combined_features[:, -3] = scaler_hub_heights.fit_transform(combined_features[:, -3].reshape(-1, 1)).flatten()  # scale hub heights

# Speichere alle Scaler in einem Dictionary
scalers = {
    "winds": scaler_wind,
    "ages": scaler_ages,
    "hub_heights": scaler_hub_heights,
}

# Speichere das Dictionary mit Joblib
joblib.dump(scalers, "parameters/scalers.pkl")
joblib.dump(scalers, "parameters_deployment/scalers.pkl")

# Dataset-Klasse für PyTorch
class WindPowerDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        x = self.features[index]
        y = self.targets[index]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Erstellung der PyTorch-Datasets
dataset = WindPowerDataset(combined_features, output)

<h1>Define Model

In [14]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.relu3 = nn.ReLU()
        self.dropout = nn.Dropout(0.3366)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)  # No activation in the output layer for regression
        return x


<h1>Training and Testing with reanalysis data</h1>
The computing resources on the personal PC are

• 4 physical CPU cores, with Hyper-Threading 4 additional virtual CPU cores --> 8 logical CPU cores

• 1 integrated GPU, that can be used with import torch_directml, device = torch_directml.device(), but it is much slower than the CPUs (and doesn't support HuberLoss: this calculation must be outsourced to the CPU. And no float64, only float32 datatypes)

In [None]:
import torch
import torch.optim as optim
from torch.nn import HuberLoss, MSELoss, L1Loss
from torch.utils.data import DataLoader, random_split

params = {"batch_size": 128,
          "lr": 0.00010155,
          "number_epochs": 10}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Model
input_size = combined_features.shape[1]

# use static instead of dynamic computational graphs
model = torch.jit.script(MLP(input_size=input_size)).to(device)
model.to(device)

# Trainings-Konfiguration
mae_criterion = L1Loss()
mse_criterion = MSELoss()
huber_criterion = HuberLoss()
optimizer = optim.Adam(model.parameters(), lr=params["lr"])

# shuffling doesn't matter here, has already taken place during random_split
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
test_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)

# Training
for epoch in range(params["number_epochs"]):
    print(f"Epoch {epoch + 1}/{params['number_epochs']}")
    model.train()
    train_loss_mae, train_loss_mse, train_loss_huber = 0, 0, 0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        
        # Calculate metrics for each criterion
        loss_mae = mae_criterion(outputs, batch_y)
        loss_mse = mse_criterion(outputs, batch_y)
        loss_huber = huber_criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss_huber.backward()
        optimizer.step()

        # Accumulate metrics for logging
        train_loss_mae += loss_mae.item()
        train_loss_mse += loss_mse.item()
        train_loss_huber += loss_huber.item()

    train_loss_mae /= len(train_loader)
    train_loss_mse /= len(train_loader)
    train_loss_huber /= len(train_loader)

print(f"Training metrics")
print(f"    Huber: {train_loss_huber}")
print(f"    MSE: {train_loss_mse}")
print(f"    MAE: {train_loss_mae}")
print(f"    RMSE: {train_loss_mse}")

# Testen
model.eval()

test_loss_mae, test_loss_mse, test_loss_huber = 0, 0, 0

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        preds = model(batch_x)
        
        test_loss_mae += mae_criterion(preds, batch_y).item()
        test_loss_mse += mse_criterion(preds, batch_y).item()
        test_loss_huber += huber_criterion(preds, batch_y).item()

test_loss_mae /= len(test_loader)
test_loss_mse /= len(test_loader)
test_loss_huber /= len(test_loader)

print(f"Testing metrics")
print(f"    Huber: {test_loss_huber}")
print(f"    MSE: {test_loss_mse}")
print(f"    MAE: {test_loss_mae}")
print(f"    RMSE: {np.sqrt(test_loss_mse)}")

# Save model
torch.save(model.state_dict(), "parameters/model.pth")
print(f"Model parameters saved")

joblib.dump(input_size, "parameters/input_size.pkl")
print(f"Input size saved")


metrics = {
    "Training": {
        "Huber": train_loss_huber,
        "MAE": train_loss_mae,
        "MSE": train_loss_mse,
        "RMSE": np.sqrt(test_loss_mse)

    },
    "Testing": {
        "Huber": test_loss_huber,
        "MAE": test_loss_mae,
        "MSE": test_loss_mse,
        "RMSE": np.sqrt(test_loss_mse)

    }
}

joblib.dump(metrics, f"parameters/metrics.pkl")
print(f"metrics saved")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Losses
    Huber: 0.016687246595590552
    MSE: 0.03337471886359991
    MAE: 0.1301842309129482
    RMSE: 0.03337471886359991
Training Losses
    Huber: 0.01599478508873228
    MSE: 0.03198958444470639
    MAE: 0.12471857018331882
    RMSE: 0.17885632346860536
Model parameters saved
Input size saved
Losses saved


<h1>Training on all reanalysis data

In [None]:
import torch
import torch.optim as optim
from torch.nn import HuberLoss, MSELoss, L1Loss
from torch.utils.data import DataLoader

params = {"batch_size": 128,
          "lr": 0.00010155,
          "number_epochs": 10}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DataLoader
data_loader = DataLoader(dataset, batch_size=params["batch_size"], shuffle=True)

# Model
input_size = combined_features[0].shape[1]

# use static instead of dynamic computational graphs
model = torch.jit.script(MLP(input_size=input_size)).to(device)
model.to(device)

# Trainings-Konfiguration
mae_criterion = L1Loss()
mse_criterion = MSELoss()
huber_criterion = HuberLoss()
optimizer = optim.Adam(model.parameters(), lr=params["lr"])

# Training
for epoch in range(params["number_epochs"]):
    print(f"Epoch {epoch + 1}/{params['number_epochs']}")
    model.train()
    train_loss_mae, train_loss_mse, train_loss_huber = 0, 0, 0

    for batch_x, batch_y in data_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        
        # Calculate metrics for each criterion
        loss_mae = mae_criterion(outputs, batch_y)
        loss_mse = mse_criterion(outputs, batch_y)
        loss_huber = huber_criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss_huber.backward()
        optimizer.step()

        # Accumulate metrics for logging
        train_loss_mae += loss_mae.item() # average loss of a single batch
        train_loss_mse += loss_mse.item()
        train_loss_huber += loss_huber.item()

    train_loss_mae /= len(data_loader)
    train_loss_mse /= len(data_loader)
    train_loss_huber /= len(data_loader)

# Save model
torch.save(model.state_dict(), "parameters_deployment/model.pth")
print(f"Model parameters saved")

joblib.dump(input_size, "parameters_deployment/input_size.pkl")
print(f"Input size saved")

print(f"Training metrics")
print(f"    Huber: {train_loss_huber}")
print(f"    MSE: {train_loss_mse}")
print(f"    MAE: {train_loss_mae}")
print(f"    RMSE: {train_loss_mse}")

metrics = {
    "Huber": train_loss_huber,
    "MAE": train_loss_mae,
    "MSE": train_loss_mse,
    "RMSE": np.sqrt(train_loss_mse)
}

joblib.dump(metrics, f"parameters_deployment/metrics.pkl")
print(f"metrics saved")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model parameters saved
Input size saved
Training Losses
    Huber: 0.016676704020100366
    MSE: 0.03335354720986296
    MAE: 0.13082658460584123
    RMSE: 0.03335354720986296
Losses saved


<h1>Load reforecast data

In [None]:
import os
import joblib

encoder = joblib.load("parameters/encoder.pkl")
scalers = joblib.load("parameters/scalers.pkl")

class WindPowerDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        x = self.features[index]
        y = self.targets[index]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

input_dir = r"E:\MA_data\WPPs+production+reforecast"
dataset_lead_times = {}

for file in os.listdir(input_dir):
    file_path = os.path.join(input_dir, file)
    if os.path.isfile(file_path):  # Ensure it's a file (not a folder)
        lead_time = int(file.split("_")[-1].replace(".json", ""))
        with open(file_path, "r", encoding="utf-8") as file:
            forecast_data = json.load(file)

    print(f"Processing lead time: {lead_time}")

    all_turbine_types = []
    all_hub_heights = []
    all_capacities = []
    all_commissioning_dates = []
    all_production_data = []

    for unique_key, wpp in forecast_data.items():
        all_turbine_types.append(str(wpp["Turbine"]))
        all_hub_heights.append(wpp["Hub_height"])
        all_capacities.append(wpp["Capacity"])
        all_commissioning_dates.append(f"{wpp['Commissioning_date']}/06" if isinstance(wpp["Commissioning_date"], str) and "/" not in wpp["Commissioning_date"] else wpp["Commissioning_date"])
        all_production_data.append(wpp["Time Series"])

    # One-Hot-Encoding for turbine types
    turbine_types_onehot = encoder.transform(np.array(all_turbine_types).reshape(-1, 1))

    # convert to datetime
    standardised_dates = pd.to_datetime(all_commissioning_dates, format='%Y/%m')

    # calculate age
    ref_date = pd.Timestamp("2024-12-01")
    ages = ref_date.year * 12 + ref_date.month - (standardised_dates.year * 12 + standardised_dates.month)

    # create combined features and output lists
    combined_features_raw = []
    output_raw = []
    
    # convert data in feature arrays
    for idx, production_data in enumerate(all_production_data):
        num_rows = len(production_data)

        # Repetitions for common features
        turbine_type_repeated = np.tile(turbine_types_onehot[idx], (num_rows, 1))
        hub_height_repeated = np.full((num_rows, 1), float(all_hub_heights[idx]))
        age_repeated = np.full((num_rows, 1), ages[idx])

        # Extract production values and wind speeds
        production_values = np.array([entry[1] for entry in production_data]).reshape(-1, 1) / all_capacities[idx]
        wind_speeds = np.array([entry[2] for entry in production_data]).reshape(-1, 1)

        # combine all features
        combined_chunk = np.hstack((
            turbine_type_repeated,
            hub_height_repeated,
            age_repeated,
            wind_speeds
        ))

        # add the data
        combined_features_raw.append(combined_chunk)
        output_raw.append(production_values)

    # combine all data chunks to one array
    combined_features = np.vstack(combined_features_raw)
    output = np.vstack(output_raw)

    # Interpolate missing values (linear interpolation) in pandas
    wind_speed_series = pd.Series(combined_features[:, -1])
    wind_speed_series.interpolate(method='linear', inplace=True)
    combined_features[:, -1] = wind_speed_series.to_numpy()

    # round all values to two decimal places
    combined_features = np.round(combined_features, decimals=4)
    output = np.round(output, decimals=4)

    # Skalieren der einzelnen Features
    combined_features[:, -1] = scalers["winds"].transform(combined_features[:, -1].reshape(-1, 1)).flatten() # scale wind speeds
    combined_features[:, -2] = scalers["ages"].transform(combined_features[:, -2].reshape(-1, 1)).flatten()  # scale ages
    combined_features[:, -3] = scalers["hub_heights"].transform(combined_features[:, -3].reshape(-1, 1)).flatten()  # scale hub heights
    
    # Convert to PyTorch Dataset
    dataset_lead_times[lead_time] = WindPowerDataset(combined_features, output)


Processing lead time: 0
Processing lead time: 102
Processing lead time: 105
Processing lead time: 108
Processing lead time: 111
Processing lead time: 114
Processing lead time: 117
Processing lead time: 12
Processing lead time: 120
Processing lead time: 123
Processing lead time: 126
Processing lead time: 129
Processing lead time: 132
Processing lead time: 135
Processing lead time: 138
Processing lead time: 141
Processing lead time: 144
Processing lead time: 15
Processing lead time: 18
Processing lead time: 21
Processing lead time: 24
Processing lead time: 27
Processing lead time: 3
Processing lead time: 30
Processing lead time: 33
Processing lead time: 36
Processing lead time: 39
Processing lead time: 42
Processing lead time: 45
Processing lead time: 48
Processing lead time: 51
Processing lead time: 54
Processing lead time: 57
Processing lead time: 6
Processing lead time: 60
Processing lead time: 63
Processing lead time: 66
Processing lead time: 69
Processing lead time: 72
Processing le

<h1>Testing of model B (version not for deployment) on reforecast data

In [None]:
import torch
from torch.nn import HuberLoss, MSELoss, L1Loss
from torch.utils.data import DataLoader
import joblib

input_size = joblib.load("parameters/input_size.pkl")
model_state_dict = torch.load("parameters/model.pth", weights_only=True)
model = MLP(input_size)
model.load_state_dict(model_state_dict)
model.eval()

# Trainings-Konfiguration
mae_criterion = L1Loss()
mse_criterion = MSELoss()
huber_criterion = HuberLoss()

params = {"batch_size": 128,
        "lr": 0.00010155,
        "number_epochs": 10}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

metrics = {}

for key, dataset in dataset_lead_times.items():
    lead_time = key
    print(f"Testing for lead time {lead_time}")

    data_loader = DataLoader(dataset, batch_size=params["batch_size"], shuffle=True)

    test_loss_mae, test_loss_mse, test_loss_huber = 0, 0, 0

    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            preds = model(batch_x)
            
            test_loss_mae += mae_criterion(preds, batch_y).item()
            test_loss_mse += mse_criterion(preds, batch_y).item()
            test_loss_huber += huber_criterion(preds, batch_y).item()

    test_loss_mae /= len(data_loader)
    test_loss_mse /= len(data_loader)
    test_loss_huber /= len(data_loader)

    metrics[lead_time] = {
        "Huber": test_loss_huber,
        "MAE": test_loss_mae,
        "MSE": test_loss_mse,
        "RMSE": np.sqrt(test_loss_mse)
    }

joblib.dump(metrics, f"metrics_reforecast/metrics.pkl")
print(f"metrics saved")

Testing for lead time lead_time_0
Testing for lead time lead_time_102
Testing for lead time lead_time_105


KeyboardInterrupt: 

<h1>TensorBoard graph

In [None]:
import SummaryWriter

dummy_input = torch.rand(1, input_size)  # Batch size of 1, with "input_size" features

# Instantiate your model
model = MLP(input_size=input_size)

# Initialize the TensorBoard writer
writer = SummaryWriter("runs/mlp_model")

# Add the model graph to TensorBoard
writer.add_graph(model, dummy_input)

# Close the writer
writer.close()