Data

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import KFold, train_test_split
import pandas as pd

# Load the per-plant table. Columns read below: 'Turbine', 'Hub_height',
# 'Capacity', 'Commissioning_date', plus one column per entry of `months`.
WPP_production_wind = pd.read_excel("data/WPPs+production+wind.xlsx")

#ids = WPP_production_wind['JSON-ID'].values
turbine_types = WPP_production_wind['Turbine'].values
hub_heights = WPP_production_wind['Hub_height'].values
#number_of_turbines = WPP_production_wind['Number_of_turbines'].values
capacities = WPP_production_wind['Capacity'].values
commissioning_dates = WPP_production_wind['Commissioning_date'].values

# One-hot encoding of the turbine types
encoder = OneHotEncoder(sparse_output=False)
turbine_types_onehot = encoder.fit_transform(turbine_types.reshape(-1, 1))

# Normalise the commissioning dates to 'YYYY/MM': entries without a '/' get
# '/06' appended (presumably year-only values -- TODO confirm against the data).
# str() keeps this working when a cell was read as a number instead of text.
standardised_dates = [
    date if '/' in date else f"{date}/06"
    for date in map(str, commissioning_dates)
]

# Convert to datetime
standardised_dates = pd.to_datetime(standardised_dates, format='%Y/%m')

# Age of each plant in months, relative to a fixed reference date
current_date = pd.Timestamp('2024-12-01')
ages = np.asarray(
    current_date.year * 12 + current_date.month
    - (standardised_dates.year * 12 + standardised_dates.month)
)

# Per-chunk feature matrices, targets and chunk ids (one chunk = one month of one plant)
combined_features = []
output = []
chunk_indices = []
chunk_id = 0
skipped_chunks = 0

# NOTE(review): `months` is not defined in the visible cells; it must be an
# iterable of column names that exists before this cell runs.
for position, label in enumerate(WPP_production_wind.index):
    for month in months:
        # Check the column first so a missing month never reaches the parser.
        if month not in WPP_production_wind.columns:
            continue

        # NOTE(review): eval() executes whatever text the spreadsheet cell holds.
        # If the cells only contain plain list literals, ast.literal_eval is the
        # safe replacement.
        wind_power_speed = eval(WPP_production_wind.at[label, month])
        if wind_power_speed == []:
            continue  # no data for this plant in this month

        # Element 0 is used as the target (wind power), element 1 as the input (wind speed).
        wind_powers = np.array(wind_power_speed[0]).reshape(-1)      # 1-D target vector
        wind_speeds = np.array(wind_power_speed[1]).reshape(-1, 1)   # column vector
        num_hours = len(wind_speeds)  # samples in this chunk (e.g. 720 or 744)

        # The two series must pair up one-to-one. When their lengths differ we
        # cannot tell which samples are missing, so the chunk is dropped instead
        # of silently misaligning inputs and targets (a length mismatch here is
        # what previously made features and targets differ in row count).
        if len(wind_powers) != num_hours:
            skipped_chunks += 1
            continue

        chunk_id += 1  # every month of a different WPP is a chunk

        # Repeat the plant-level values once per sample of the chunk
        turbine_type_repeated = np.tile(turbine_types_onehot[position, :], (num_hours, 1))
        hub_height_repeated = np.repeat(hub_heights[position], num_hours)
        capacity_repeated = np.repeat(capacities[position], num_hours)
        age_repeated = np.repeat(ages[position], num_hours)

        # Column layout: one-hot turbine type | hub height | capacity | age | wind speed
        combined_features_chunk = np.column_stack((
            turbine_type_repeated,
            hub_height_repeated,
            capacity_repeated,
            age_repeated,
            wind_speeds
        ))

        # Tag every sample with the id of the chunk it belongs to
        chunk_indices.extend([chunk_id] * num_hours)

        combined_features.append(combined_features_chunk)
        output.append(wind_powers)

if skipped_chunks:
    print(f"Skipped {skipped_chunks} chunk(s) with mismatched wind power / wind speed lengths")
if not combined_features:
    raise ValueError("No usable (plant, month) chunks were found; cannot build the dataset.")

# Merge the per-chunk pieces
combined_features = np.vstack(combined_features)  # 2-D: (samples, features)
output = np.concatenate(output)                   # 1-D: (samples,)
chunk_indices = np.array(chunk_indices)           # 1-D: (samples,)

# The numerical columns to standardise are the last four
# (hub height, capacity, age, wind speed), right after the one-hot block.
number_onehot_columns = turbine_types_onehot.shape[1]
numerical_columns = list(range(number_onehot_columns, number_onehot_columns + 4))

# Train/test split over the *unique* chunk ids, so that every chunk ends up in
# exactly one partition. Splitting the per-sample id array instead puts almost
# every chunk id on both sides and the two masks below would overlap.
# NOTE(review): the split unit is a (plant, month) chunk, so the same plant can
# appear in both partitions through different months.
train_plants, test_plants = train_test_split(np.unique(chunk_indices), test_size=0.25, random_state=1)

# Boolean row masks for training and test data
train_indices = np.isin(chunk_indices, train_plants)
test_indices = np.isin(chunk_indices, test_plants)

# Fit the scaler on training rows only and apply it to all rows, so that no
# test-set statistics leak into the features.
scaler = StandardScaler()
scaler.fit(combined_features[train_indices][:, numerical_columns])
combined_features[:, numerical_columns] = scaler.transform(combined_features[:, numerical_columns])

train_features = combined_features[train_indices]
train_targets = output[train_indices]

test_features = combined_features[test_indices]
test_targets = output[test_indices]

# Build the dataset
class WindPowerDataset(Dataset):
    """Map-style dataset pairing each feature row with its target value.

    Args:
        features: indexable collection of per-sample inputs.
        targets: indexable collection of per-sample targets; its length
            defines the size of the dataset.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        sample_count = len(self.targets)
        return sample_count

    def __getitem__(self, index):
        """Return the sample at ``index`` as an (input, target) pair of float32 tensors."""
        sample = (self.features[index], self.targets[index])
        inputs, target = (torch.tensor(part, dtype=torch.float32) for part in sample)
        return inputs, target

# Wrap the training(+validation) arrays and the held-out test arrays as datasets.
train_val_dataset = WindPowerDataset(train_features, train_targets)
test_dataset = WindPowerDataset(test_features, test_targets)

# Define the MLP model
class MLP(nn.Module):
    """Fully connected network: two LeakyReLU hidden layers and a linear output layer.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of both hidden layers.
        output_size: number of output values per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names define the state_dict keys, so they are kept stable.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.leaky_relu = nn.LeakyReLU()

    def forward(self, x):
        """Apply both activated hidden layers, then the un-activated output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.leaky_relu(layer(hidden))
        return self.fc3(hidden)

# # create lists of wind powers and wind speeds for each WPP, containing sublists for each month
# wind_powers = []
# wind_speeds = []
# for index in range(len(ids)): # for all WPPs. Empty sublists only originate from sparse data which won't be present later on
#     wind_powers.append([])
#     wind_speeds.append([])
#     for month in months:
#         data = eval(WPP_production_weather.at[index, month])
#         if data != []:
#             wind_powers[index].append(data[0] / number_of_turbines[index])
#             wind_speeds[index].append(data[1])
#     # concatenate sublists
#     wind_powers[index] = [item for sublist in wind_powers[index] for item in sublist]
#     wind_speeds[index] = [item for sublist in wind_speeds[index] for item in sublist]

# # Create chunks
# chunks = []
# for index in range(len(ids)): # for all WPPs. Empty sublists only originate from sparse data which won't be present later on
#     wind_power_speed = eval(WPP_production_weather.at[index, month])
#     for month in months:
#         wind_power_speed = eval(WPP_production_weather.at[index, month])
#         counter = 0
#         if data != []:
#             counter += 1
#             wind_powers[index].append(data[0] / number_of_turbines[index])
#             wind_speeds[index].append(data[1])
#         chunks.append(data)
#     wind_powers.append([])
#     wind_speeds.append([])
#     for month in months:
#         data = eval(WPP_production_weather.at[index, month])
#         if data != []:
#             wind_powers[index].append(data[0] / number_of_turbines[index])
#             wind_speeds[index].append(data[1])
#     # concatenate sublists
#     wind_powers[index] = [item for sublist in wind_powers[index] for item in sublist]
#     wind_speeds[index] = [item for sublist in wind_speeds[index] for item in sublist]

# for i, (ws, wp) in enumerate(zip(wind_speeds, wind_powers)):
#     for month in months:
#         if len(ws) > 0:  # Check if there is data for the month
#             ws_scaled = scaler.fit_transform(ws.reshape(-1, 1)).flatten()
#             wp_flat = wp.flatten()
#             turbine_type_repeated = np.repeat(turbine_types_onehot[i], len(ws)).reshape(-1, turbine_types_onehot.shape[1])
#             features_repeated = np.repeat(features_scaled[i], len(ws)).reshape(-1, features_scaled.shape[1])
#             combined_features = np.concatenate([turbine_type_repeated, features_repeated, ws_scaled.reshape(-1, 1)], axis=1)
#             chunks.append((combined_features, wp_flat))

720
720
744
744
744
742


IndexError: boolean index did not match indexed array along dimension 0; dimension is 2206 but corresponding boolean dimension is 2208

Model

In [None]:

# NOTE(review): this cell does not run as written. The recorded output below it is
# "AttributeError: 'list' object has no attribute 'reshape'", raised where
# `wind_speeds_scaled` (a Python list) is reshaped. It also reads `number_chunks`
# and `plant_indices`, which are not assigned in any visible cell, and it reuses
# `wind_speeds` / `wind_powers`, which at this point hold whatever the last loop
# iteration of the Data cell left in them. If it did run, it would overwrite
# `combined_features`, `chunk_indices` and the train/test arrays built in the
# Data cell. It looks like an earlier variant of the Data cell -- confirm
# whether it is still needed.

# Standardise the static per-plant features (hub height, age, capacity) column-wise
scaler = StandardScaler()
features_scaled = scaler.fit_transform(np.stack([hub_heights, ages, capacities], axis=1))
# Scale each wind-speed list separately. Every fit_transform call re-fits `scaler`,
# so the fit on the static features above is discarded.
wind_speeds_scaled = [scaler.fit_transform(np.array(ws).reshape(-1, 1)).flatten() for ws in wind_speeds]

# Flatten the wind speeds.
# The list comprehension above yields a list, which has no .reshape -> AttributeError here.
wind_speeds_scaled_flat = wind_speeds_scaled.reshape(-1, 1)  # intended shape per original note: (t * n, 1)

# Repeat the static features and turbine types.
# NOTE(review): 48 repeats per row is hard-coded, whereas the chunks printed by the
# Data cell have 720-744 samples -- confirm the intended number of samples per plant.
turbine_types_repeated = np.repeat(turbine_types_onehot, repeats=48, axis=0)  # intended shape per original note: (t * n, ...)
features_repeated = np.repeat(features_scaled, repeats=48, axis=0)  # intended shape per original note: (t * n, ...)

# Combined input data: one-hot turbine type | static features | wind speed
combined_features = np.concatenate([turbine_types_repeated, features_repeated, wind_speeds_scaled_flat], axis=1)

# Flatten the target values
wind_powers_flat = wind_powers.T.flatten()  # intended shape per original note: (t * n,)

# Split the wind power plants (train/test)
# NOTE(review): `number_chunks` is not defined in the visible cells.
chunk_indices = np.arange(number_chunks)
train_plants, test_plants = train_test_split(chunk_indices, test_size=0.25, random_state=1)

# Training data and test data
# NOTE(review): `plant_indices` is not defined in the visible cells.
train_indices = np.isin(plant_indices, train_plants)
test_indices = np.isin(plant_indices, test_plants)

# Expand the per-plant masks to per-sample masks (48 samples per plant assumed)
train_features = combined_features[train_indices.repeat(48)]
train_targets = wind_powers_flat[train_indices.repeat(48)]

test_features = combined_features[test_indices.repeat(48)]
test_targets = wind_powers_flat[test_indices.repeat(48)]

# Build the dataset
# NOTE(review): a class with this name is already defined in the Data cell;
# this later definition shadows it.
class WindPowerDataset(Dataset):
    """Map-style dataset pairing each feature row with its target value.

    Args:
        features: indexable collection of per-sample inputs.
        targets: indexable collection of per-sample targets; its length
            defines the size of the dataset.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        sample_count = len(self.targets)
        return sample_count

    def __getitem__(self, index):
        """Return the sample at ``index`` as an (input, target) pair of float32 tensors."""
        sample = (self.features[index], self.targets[index])
        inputs, target = (torch.tensor(part, dtype=torch.float32) for part in sample)
        return inputs, target

# Wrap the training(+validation) arrays and the held-out test arrays as datasets.
train_val_dataset = WindPowerDataset(train_features, train_targets)
test_dataset = WindPowerDataset(test_features, test_targets)

# Define the MLP model
# NOTE(review): a class with this name is already defined in the Data cell;
# this later definition shadows it.
class MLP(nn.Module):
    """Fully connected network: two LeakyReLU hidden layers and a linear output layer.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of both hidden layers.
        output_size: number of output values per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names define the state_dict keys, so they are kept stable.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.leaky_relu = nn.LeakyReLU()

    def forward(self, x):
        """Apply both activated hidden layers, then the un-activated output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.leaky_relu(layer(hidden))
        return self.fc3(hidden)

AttributeError: 'list' object has no attribute 'reshape'

Training and Validation

In [None]:
# Training settings
number_epochs = 50
batch_size = 32  # as a power of 2 for higher efficiency
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# Each dataset item is an (input tensor, target tensor) pair, so the length of the
# first item's input tensor is the number of input features
# (original note: 20 + 3 + 1 here -- TODO confirm against the encoder output).
input_size = len(train_val_dataset[0][0])
len_train_val_dataset = len(train_val_dataset)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cross-validation
# NOTE(review): KFold shuffles individual samples, so samples from the same
# (plant, month) chunk can land in both the training and the validation fold.
for fold, (train_idx, val_idx) in enumerate(kf.split(range(len_train_val_dataset)), start=1):
    print(f"Fold {fold}/{kf.n_splits}")

    # Training and validation subsets for this fold
    train_fold_dataset = Subset(train_val_dataset, train_idx)
    val_fold_dataset = Subset(train_val_dataset, val_idx)

    train_loader = DataLoader(train_fold_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_fold_dataset, batch_size=batch_size, shuffle=False)

    # Fresh model, loss and optimizer for every fold
    model = MLP(input_size=input_size, hidden_size=64, output_size=1).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(number_epochs):
        # Training
        model.train()
        train_loss = 0.0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            outputs = model(batch_x)
            # Flatten predictions and targets to shape (batch,). A bare squeeze()
            # leaves (batch,) vs. (batch, 1) when targets carry a trailing
            # dimension, which MSELoss broadcasts to (batch, batch); it also
            # collapses a batch of one to a 0-dim tensor.
            loss = criterion(outputs.reshape(-1), batch_y.reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch value is a per-sample mean
            train_loss += loss.item() * batch_y.size(0)
        train_loss /= len(train_fold_dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                val_outputs = model(batch_x)
                batch_loss = criterion(val_outputs.reshape(-1), batch_y.reshape(-1))
                val_loss += batch_loss.item() * batch_y.size(0)
        val_loss /= len(val_fold_dataset)

        print(f"Epoch [{epoch+1}/{number_epochs}], Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Save the model. Note that `model` is re-created in every fold, so this stores
# the weights trained in the last fold only.
torch.save(model.state_dict(), "mlp_wind_power_model.pth")

Testing

In [None]:
# Evaluate on the held-out test data.
# Uses `model`, `criterion`, `device` and `batch_size` as left by the training cell.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model.eval()
test_loss = 0.0
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        test_outputs = model(batch_x)
        # Flatten both sides to (batch,): a bare squeeze() leaves (batch,) vs.
        # (batch, 1) when targets carry a trailing dimension, which MSELoss
        # broadcasts to (batch, batch).
        batch_loss = criterion(test_outputs.reshape(-1), batch_y.reshape(-1))
        # Weight by batch size so a smaller final batch does not skew the mean
        test_loss += batch_loss.item() * batch_y.size(0)

test_loss /= len(test_dataset)
print(f"Test Loss: {test_loss:.4f}")