calculate spearman correlation coefficient matrix

In [None]:
import pandas as pd
from scipy.stats import spearmanr

# Build example data. Every column needs the same number of observations,
# otherwise pd.DataFrame raises "All arrays must be of the same length".
data = {
    'wind_speed': [1, 2, 3, 4, 5],
    'temperature': [5, 6, 7, 8, 7],
    'pressure': [10, 9, 2, 4, 3],
    'production': [0, 10, 35, 80, 155],  # illustrative placeholder values; replace with real production data
}
df = pd.DataFrame(data)

# Pairwise Spearman rank correlation between all columns
spearman_corr_matrix = df.corr(method='spearman')

print("Spearman-Korrelationsmatrix:")
print(spearman_corr_matrix)


load historical weather data (wind speed)

In [None]:
import numpy as np
import netCDF4 as nc
from netCDF4 import num2date

# Extract the first `number_hours` time steps of the COSMO-REA6 100 m wind speed
# file and store coordinates, timestamps and wind speeds as .npy files.
wind_speed_file = "data/weather_history/COSMO_REA6/WS_100m.2D.201501.nc4"

number_hours = 48

# The context manager closes the netCDF file handle even if reading fails.
with nc.Dataset(wind_speed_file) as wind_speed:
    lons = wind_speed['RLON'][:].filled(np.nan)
    lons = np.where(lons > 180, lons - 360, lons)  # wrap longitudes above 180 into the negative range
    lats = wind_speed['RLAT'][:].filled(np.nan)
    time_unit = wind_speed['time'].units
    times = num2date(wind_speed['time'][:number_hours], time_unit)
    # Convert the decoded time objects to numpy datetime64 via an ISO string
    times = np.array([np.datetime64(t.strftime('%Y-%m-%dT%H:%M:%S')) for t in times])
    # Convert the masked array to a regular array; masked values become NaN
    wind_speeds = wind_speed['wind_speed'][:number_hours,:,:].filled(np.nan)

np.save("data/weather_history/COSMO_REA6/lons.npy", lons)
np.save("data/weather_history/COSMO_REA6/lats.npy", lats)
np.save("data/weather_history/COSMO_REA6/times.npy", times)
np.save("data/weather_history/COSMO_REA6/wind_speeds.npy", wind_speeds)

save WPPs in parquet file (The Wind Power)

In [None]:
import pandas as pd

# Load the Excel file once, keep only the relevant data and store it as a Parquet file
WPP_file = "data/WPPs/Windfarms_Europe_20241123.xlsx"
df = pd.read_excel(WPP_file, sheet_name='Windfarms', na_values=["#ND"])
df = df.drop(0) # remove first row (after headlines)

# Keep only the required columns
df = df[['ID', 'Name', '2nd name', 'Latitude', 'Longitude', 'Manufacturer', 'Turbine', 'Hub height', 'Number of turbines', 'Total power', 'Developer', 'Operator', 'Owner', 'Commissioning date', 'Status']]

# Drop rows in which Name, Total power, Latitude or Longitude is NaN
df = df.dropna(subset=['Name', 'Total power', 'Latitude', 'Longitude'])

# Keep only rows with Status == "Production". The explicit copy makes the
# dtype conversions below operate on an independent frame instead of a slice
# (avoids SettingWithCopyWarning).
df = df.loc[df['Status'] == 'Production'].copy()

# A missing turbine count is stored as 0 so that the column can be an integer column
df['Number of turbines'] = df['Number of turbines'].fillna(0)

# Convert the columns explicitly to their target dtypes.
# Note: astype(str) turns missing values into the literal string "nan".
df = df.astype({
    'ID': int,
    'Name': str,
    '2nd name': str,
    'Latitude': float,
    'Longitude': float,
    'Manufacturer': str,
    'Turbine': str,
    'Hub height': float,
    'Number of turbines': int,
    'Total power': float,
    'Developer': str,
    'Operator': str,
    'Owner': str,
    'Commissioning date': str,
    'Status': str,
})

# One row per entry of the 'Windfarms' sheet (the original message had a stray ')')
print(f"number of wind turbines: {len(df['ID'])}")

# Store the filtered data in Parquet format (much faster to read and write than Excel files, and more compact)
df.to_parquet("data/WPPs/The_Wind_Power.parquet")


save production history example as parquet file

In [None]:
import pandas as pd

# One-off conversion: read the example time series from Excel and write it
# back unchanged as Parquet (faster to read/write and smaller than Excel).
example_file = "data/production_history/Example/example_time_series.xlsx"
df = pd.read_excel(io=example_file)

df.to_parquet(path="data/production_history/Example/example_time_series.parquet")

wind speeds (COSMO-REA6)

In [None]:
import netCDF4 as nc
import numpy as np

# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
fn = r"C:\Users\alexa\Documents\Webapp\data\weather history\WS_100m.2D.199501.nc4" # January 1995

# Read everything needed inside the context manager so the file is closed afterwards.
with nc.Dataset(fn) as ds:
    print(ds)

    time = ds.variables['time'][:]
    lon = ds.variables['RLON'][:]
    lat = ds.variables['RLAT'][:]
    wind_speed = ds.variables['wind_speed'][:]

# Wrap longitudes above 180 into the negative range in one vectorised step
# (replaces the element-wise Python loop over the 2D longitude grid).
lon = np.ma.where(lon > 180, lon - 360, lon)
    

three different visualisations

In [None]:
import matplotlib.pyplot as plt
import cartopy.crs as ccrs


# Filled-contour map of the first wind speed time step on a plate carree map
plate_carree = ccrs.PlateCarree()
fig = plt.figure(figsize=(10, 5))
ax = plt.axes(projection=plate_carree)

# Draw the wind speed; the data coordinates are plain longitude/latitude
filled_contours = ax.contourf(lon, lat, wind_speed[0,:,:], transform=plate_carree, cmap='viridis')

# Coastlines for orientation
ax.coastlines()

# Colour scale and display
plt.colorbar(filled_contours, ax=ax, label="Windgeschwindigkeit (m/s)")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

# Same map as a pcolormesh, which draws each cell of the curvilinear grid
plate_carree = ccrs.PlateCarree()
fig = plt.figure(figsize=(10, 5))
ax = plt.axes(projection=plate_carree)

# Draw the wind speed of the first time step on the curvilinear grid
mesh = ax.pcolormesh(lon, lat, wind_speed[0,:,:], transform=plate_carree, cmap='viridis')

# Coastlines for orientation
ax.coastlines()

# Colour scale and display
plt.colorbar(mesh, ax=ax, label="Windgeschwindigkeit (m/s)")
plt.show()


In [None]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt

# Filled-contour map in a Lambert conformal projection
ax = plt.axes(projection=ccrs.LambertConformal())

# The data are given in longitude/latitude, hence the PlateCarree transform
data_crs = ccrs.PlateCarree()
filled_contours = ax.contourf(lon, lat, wind_speed[0,:,:], transform=data_crs, cmap='coolwarm')

# Add coastlines and a coordinate grid
ax.coastlines()
ax.gridlines()

plt.colorbar(filled_contours, ax=ax, label="Windgeschwindigkeit (m/s)")
plt.show()


power curve

In [None]:
import numpy as np
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt

# Discrete power curve: one power value per 0.5 m/s wind speed step from 0 to 25 m/s
wind_speeds = np.arange(0, 25.5, 0.5)
ramp_kw = [
    35, 80, 155, 238, 350, 474, 630, 802, 1018, 1234, 1504, 1773, 2076,
    2379, 2664, 2948, 3141, 3334, 3425, 3515, 3546, 3577, 3586, 3594, 3598, 3599,
]
# 7 zero entries first, then the ramp, then 18 entries at 3600
power_output = [0] * 7 + ramp_kw + [3600] * 18
max_cap = 3600
power_output_norm = [power / max_cap for power in power_output]  # fraction of max_cap
wind_speed_max = 25

# Cubic interpolation through the discrete points; values outside the grid are extrapolated
interpolation_function = interp1d(
    wind_speeds,
    power_output_norm,
    kind='cubic',
    fill_value="extrapolate",
)

# Example: evaluate the curve on a finer wind speed grid
fine_wind_speeds = np.linspace(0, 25, 100)
interpolated_power_output = interpolation_function(fine_wind_speeds)

# Plot the discrete and the interpolated power curve
plt.figure(figsize=(10, 6))
plt.plot(wind_speeds, power_output_norm, 'o', label='Diskrete Werte')  # discrete points
plt.plot(fine_wind_speeds, interpolated_power_output, '-', label='Interpolierte Werte')  # interpolated values
plt.xlabel('Windgeschwindigkeit (m/s)')
# Both plotted series are normalised by max_cap, so the axis is dimensionless
# (the previous label claimed kW).
plt.ylabel('Normierte Leistung (-)')
plt.title('Interpolierte Power Curve')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pandas as pd

# Load the tracker workbook (relative path)
file_path = "./Global-Wind-Power-Tracker-June-2024.xlsx"
df = pd.read_excel(file_path, sheet_name='Data')

# Bounding box used for Europe
lat_min, lat_max = 35, 72
lon_min, lon_max = -25, 45

# Keep the rows whose coordinates lie inside the bounding box (bounds inclusive)
inside_latitudes = df['Latitude'].between(lat_min, lat_max)
inside_longitudes = df['Longitude'].between(lon_min, lon_max)
df_filtered = df[inside_latitudes & inside_longitudes]

WPPs capacity distribution

In [None]:
import matplotlib.pyplot as plt

# Number of occurrences of each capacity value
capacity_distribution = df_filtered['Capacity (MW)'].value_counts()

# Histogram of all known capacities, restricted to the range 0 to 2000 MW
known_capacities = df_filtered['Capacity (MW)'].dropna()
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
ax.hist(known_capacities, bins=1000, range=(0, 2000), edgecolor='black')
ax.set_title('Verteilung der Windkraftanlagenkapazitäten in Europa')
ax.set_xlabel('Kapazität (MW)')
ax.set_ylabel('Anzahl')
ax.grid(True)
plt.show()

visualisation of WPPs

In [None]:
from ipyleaflet import Map, Marker, MarkerCluster
from ipywidgets import Layout

# Create the map, centred on the middle of the bounding box
map_center = [(lat_min + lat_max) / 2, (lon_min + lon_max) / 2]
m = Map(center=map_center, zoom=5, layout=Layout(width='100%', height='500px'))

# One marker per wind power plant
markers = [
    Marker(location=(latitude, longitude))
    for latitude, longitude in zip(df_filtered['Latitude'], df_filtered['Longitude'])
]

# Group the markers in a cluster layer and add it to the map
marker_cluster = MarkerCluster(markers=markers, disable_clustering_at_zoom=18)
m.add_layer(marker_cluster)

# Display the map
m

data generation with power curve

In [None]:
import numpy as np

# Convert the masked array of the first time step into a regular array (masked -> NaN).
# Only time index 0 is used here, not the whole month.
wind_speed_array = np.ma.filled(wind_speed[0,:,:], np.nan)

# All known capacities from 'Capacity (MW)'
capacity_distribution = df_filtered['Capacity (MW)'].dropna().values

# Frequency of each distinct capacity
unique_capacities, counts = np.unique(capacity_distribution, return_counts=True)

# Probability of each capacity (relative frequency)
probabilities = counts / counts.sum()

# Row-major flattening gives every grid cell (i, j) the unique position
# i * n_columns + j. The previous loop used shape[0] as stride, which makes
# entries collide or be skipped on non-square grids.
wind_speed_flat = wind_speed_array.ravel()

# Draw one capacity per grid cell, weighted by its relative frequency
capacities = np.random.choice(unique_capacities, size=wind_speed_flat.size, p=probabilities)

# Production from the normalised power curve scaled by the drawn capacity.
# Cells at or above wind_speed_max, and NaN cells, get a production of 0.
production = np.where(
    wind_speed_flat < wind_speed_max,
    interpolation_function(wind_speed_flat) * capacities,
    0,
)

# 3 rows: wind speed, capacity, production
data = np.zeros((3, wind_speed_array.size))
data[0] = wind_speed_flat
data[1] = capacities
data[2] = production


Preprocessing of historical production data: discard the data of unwanted power plants and keep one file per month. Old version, no longer used, because too many NaN values or missing rows were detected.

In [None]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Base directories (absolute, machine-specific paths; adjust before running on another machine)
input_dir = r"E:\MA_data\raw production history ENTSO-E"
output_dir = r"C:\Users\alexa\Documents\Webapp\data\production_history\processed_old"

# Funktion zum Auffüllen fehlender Stunden und Zählen der fehlenden Werte
def fill_missing_hours(data, start_time, end_time):
    """Put one plant's production records on a complete hourly time axis.

    Parameters
    ----------
    data : pandas.DataFrame
        Records with the columns 'DateTime (UTC)' and
        'ActualGenerationOutput(MW)'. The frame is not modified (the previous
        implementation converted and re-indexed the caller's frame in place).
    start_time, end_time
        First and last timestamp of the hourly axis (both inclusive), in any
        form accepted by ``pd.date_range``.

    Returns
    -------
    tuple
        ``(filled, missing_count)``. ``filled`` has one row per hour; the
        timestamps are in a column named 'index' and missing production values
        are replaced by 0. ``missing_count`` is the number of production
        values that were NaN after re-indexing, i.e. hours without a row plus
        rows that already had no value.
    """
    # Complete hourly time axis for the requested period
    full_time_range = pd.date_range(start=start_time, end=end_time, freq='h')

    # Build a new, datetime-indexed frame instead of mutating the input
    timestamps = pd.to_datetime(data['DateTime (UTC)'])
    indexed = data.assign(**{'DateTime (UTC)': timestamps}).set_index('DateTime (UTC)')

    # Hours without a record become rows of NaN
    hourly = indexed.reindex(full_time_range)

    # Count before filling, otherwise the gaps are no longer visible
    missing_count = hourly['ActualGenerationOutput(MW)'].isna().sum()

    # Replace missing production values by 0 (new rows and pre-existing gaps alike)
    hourly['ActualGenerationOutput(MW)'] = hourly['ActualGenerationOutput(MW)'].fillna(0)

    # Move the timestamps back into a regular column
    hourly = hourly.reset_index()

    return hourly, missing_count

# One list per month; each holds the missing-value count of every wind farm in that month
number_missing_values = []

# Generate the list of months from 2015-01 to 2024-10 (formatted as YYYY_MM)
months = pd.date_range(start="2015-01", end="2024-10", freq="MS").strftime("%Y_%m").tolist()

# Process one monthly input file per iteration
for i, month in enumerate(months):

    # Build the input and output file paths for this month
    input_file = os.path.join(input_dir, f"{month}_ActualGenerationOutputPerGenerationUnit_16.1.A_r2.1.csv")
    output_file = os.path.join(output_dir, f"production_summary_{month}.xlsx")

    # Read the tab-separated input file
    print(f"Bearbeite Datei: {input_file}")
    data = pd.read_csv(input_file, sep='\t')

    # Keep only rows whose GenerationUnitType is 'Wind Onshore ' or 'Wind Offshore '
    # NOTE(review): the compared values end with a space - presumably the raw files contain it; confirm.
    filtered_data = data[(data['GenerationUnitType'] == 'Wind Onshore ') | (data['GenerationUnitType'] == 'Wind Offshore ')]

    # Identify the distinct wind farms by their descriptive columns
    # NOTE(review): the trailing remark advises against 'AreaCode' as a criterion, yet 'AreaCode' is part of this key and of the filter below - confirm the intent.
    unique_windfarms = filtered_data[['GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', 'GenerationUnitInstalledCapacity(MW)', 'AreaCode']].drop_duplicates() # 'AreaCode', 'AreaDisplayName', 'AreaTypeCode' and 'MapCode' of identical WPPs may differ --> don't use as criterion to identify unique windfarms
    
    # Time span of the month, taken from all rows of the file (not only the wind rows)
    start_time = data['DateTime (UTC)'].min()
    end_time = data['DateTime (UTC)'].max()

    number_missing_values.append([])

    # Collect one record per wind farm with its list of hourly production values
    production_data = []
    for _, row in unique_windfarms.iterrows():
        windfarm_data = filtered_data[(filtered_data['GenerationUnitName'] == row['GenerationUnitName']) & (filtered_data['AreaCode'] == row['AreaCode'])]
        windfarm_data, missing_count = fill_missing_hours(windfarm_data, start_time, end_time)
        production_values = windfarm_data['ActualGenerationOutput(MW)'].tolist() # missing production values were set to 0 by fill_missing_hours; presumably NaN means the plant consumed rather than generated power - confirm
        row_data = {
            'GenerationUnitName': row['GenerationUnitName'],
            'GenerationUnitCode': row['GenerationUnitCode'],
            'GenerationUnitType': row['GenerationUnitType'],
            'GenerationUnitInstalledCapacity(MW)': row['GenerationUnitInstalledCapacity(MW)'],
            'Production (MW)': production_values
        }
        production_data.append(row_data)

        # Record the number of missing values of this wind farm for this month
        number_missing_values[i].append(missing_count)

    # Build a DataFrame (one row per wind farm) and save it as an Excel file
    output_df = pd.DataFrame(production_data)
    output_df.to_excel(output_file, index=False)

    print("Anzahl der fehlenden Werte je Windkraftwerk für diesen Monat:", number_missing_values[i])
    print(f"Excel-Datei wurde erfolgreich erstellt: {output_file}")

In [None]:
# Flatten the per-month lists into one list with all missing-value counts
number_missing_values_flat = [item for sublist in number_missing_values for item in sublist]

# Average number of missing values per wind power plant and month.
# An empty list (no file processed) yields NaN instead of a ZeroDivisionError.
if number_missing_values_flat:
    average = sum(number_missing_values_flat) / len(number_missing_values_flat)
else:
    average = float("nan")
print(f"Der Durchschnitt ist: {average}")

# Column heading
column_name = "number of missing elements per wind power plant for all investigated months"

# One row per month, holding that month's list as text
df = pd.DataFrame({column_name: [str(sublist) for sublist in number_missing_values]})

# Save the file. A forward slash works on every platform and matches the
# other relative paths in this notebook (a backslash is only a separator on Windows).
output_file = "data/number_missing_values.xlsx"  # adjust path and file name as needed
df.to_excel(output_file, index=False)

print(f"Die Excel-Datei wurde erfolgreich gespeichert: {output_file}")

merge all monthly production data files to one combined file

In [None]:
import pandas as pd
import os

# Directory with the monthly files and path of the combined output file
input_dir = r"C:\Users\alexa\Documents\Webapp\data\production_history\processed"
output_file = r"C:\Users\alexa\Documents\Webapp\data\production_history\production_summary_all.xlsx"

# Months from 2015_01 to 2024_10
months = pd.date_range(start="2015-01", end="2024-10", freq="MS").strftime("%Y_%m").tolist()

# Columns that identify a wind power plant; they serve as merge key
columns_old = ['GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', 'GenerationUnitInstalledCapacity(MW)', 'AreaDisplayName', 'MapCode', 'AreaTypeCode']
# Start with an empty frame that already has the final column layout
columns = columns_old + months
final_df = pd.DataFrame(columns=columns)

# Read the monthly files one by one and merge them into the result
for month in months:
    input_file = os.path.join(input_dir, f"production_summary_{month}.xlsx")

    if os.path.exists(input_file):
        print(f"Verarbeite Datei: {input_file}")
        df = pd.read_excel(input_file)

        if 'Production (MW)' in df.columns:
            # The production column is named after its month in the combined table
            df = df.rename(columns={'Production (MW)': month})

            # First usable file: take it as is. Afterwards: outer merge, so that
            # identical plants share a row and new plants are appended.
            final_df = df if final_df.empty else pd.merge(final_df, df, how='outer', on=columns_old)
        else:
            print(f"Spalte 'Production (MW)' fehlt in {input_file}")
    else:
        print(f"Datei nicht gefunden: {input_file}")

# Save the combined table
final_df.to_excel(output_file, index=False)
print(f"Zusammengeführte Excel-Tabelle wurde erfolgreich gespeichert unter: {output_file}")


Visualise filling rate of production data file (1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the combined production file
# NOTE(review): this reads from ...\processed\, whereas the merge cell writes production_summary_all.xlsx one directory higher - confirm the location.
file_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\processed\production_summary_all.xlsx"
df = pd.read_excel(file_path)

# Number of wind power plants per AreaDisplayName
counts = df['AreaDisplayName'].value_counts()

# Production columns are the month columns (named YYYY_MM). Selecting them by
# name avoids the fixed offset of 6, which let a metadata column count as
# production data when more than 6 metadata columns precede the months.
month_columns = [col for col in df.columns if str(col).startswith("20")]

# Percentage of filled production cells per AreaDisplayName
percentages = {}
for area in counts.index:
    production_cells = df.loc[df['AreaDisplayName'] == area, month_columns]
    total_cells = production_cells.size
    filled_cells = production_cells.notna().sum().sum()
    # Without any month column there is nothing to fill; report 0 instead of dividing by zero
    percentages[area] = (filled_cells / total_cells) * 100 if total_cells else 0.0

# Bar chart of the number of plants per area
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(counts.index, counts.values, color='skyblue')
ax.set_title('Anzahl der Windkraftanlagen mit Produktionsdaten pro Land')
ax.set_xlabel('Land (AreaDisplayName)')
ax.set_ylabel('Anzahl der Windkraftanlagen')
ax.tick_params(axis='x', rotation=45)

# Write the filling rate above each bar
for bar, area in zip(bars, counts.index):
    height = bar.get_height()
    percentage = percentages[area]
    ax.text(bar.get_x() + bar.get_width() / 2, height, f'{percentage:.1f} %', ha='center', va='bottom')

plt.tight_layout()
plt.show()


Visualise filling rate of production data file (2)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the combined production file
file_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\processed\production_summary_all.xlsx"
df = pd.read_excel(file_path)

# Month columns are the ones whose name starts with "20"
month_columns = [col for col in df.columns if col.startswith("20")]

plants_by_area = df.groupby("AreaDisplayName")

# Number of wind power plants per AreaDisplayName (country)
windfarm_count = plants_by_area.size()

# Share of filled month columns per plant, and its mean per country
fill_rates = df[month_columns].notna().mean(axis=1)
average_fill_rate_per_country = plants_by_area[month_columns].apply(
    lambda plants: plants.notna().mean(axis=1).mean()
)

fig, ax1 = plt.subplots(figsize=(12, 6))

# Left y-axis: number of wind power plants
ax1.bar(windfarm_count.index, windfarm_count.values, label="Anzahl der Windkraftanlagen", alpha=0.7)
ax1.set_ylabel("Anzahl der Windkraftanlagen", fontsize=12)
ax1.set_xlabel("AreaDisplayName (Land)", fontsize=12)
ax1.tick_params(axis="x", rotation=45)
ax1.legend(loc="upper left")

# Right y-axis: average filling rate in percent
ax2 = ax1.twinx()
fill_rate_percent = average_fill_rate_per_country.values * 100
ax2.bar(
    average_fill_rate_per_country.index,
    fill_rate_percent,
    label="Durchschnittliche Ausfüllquote (%)",
    alpha=0.5,
    color="orange",
)
ax2.set_ylabel("Durchschnittliche Ausfüllquote (%)", fontsize=12)
ax2.legend(loc="upper right")

# Title and layout
plt.title("Windkraftanlagen und Ausfüllquote pro Land", fontsize=14)
plt.tight_layout()

# Show the plot
plt.show()


Download the table of common names for UK power plants from the website

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the web page
url = "https://osuked.github.io/Power-Station-Dictionary/dictionary.html"

# Fetch the page. The timeout keeps the cell from hanging forever, and an
# exception stops the cell with a clear message (exit() would shut down the
# notebook kernel instead).
response = requests.get(url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Fehler beim Abrufen der Webseite (HTTP {response.status_code}).")

# Parse the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all tables on the page
tables = soup.find_all('table')
if len(tables) < 3:
    raise RuntimeError("Weniger als 3 Tabellen auf der Seite gefunden.")

# Select the third table (index 2, because Python counts from zero)
table = tables[2]

# Extract the column names
headers = [header.text.strip() for header in table.find_all('th')]

# Extract the data rows
rows = []
for row in table.find_all('tr')[1:]:  # skips the header row
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:  # rows without <td> cells carry no data and would break the DataFrame
        rows.append(cells)

# Build the DataFrame
df = pd.DataFrame(rows, columns=headers)

# Save the file
output_path = "data/WPPs/Power_Station_Dictionary.xlsx"
df.to_excel(output_path, index=False)
print(f"Die dritte Tabelle wurde erfolgreich als '{output_path}' gespeichert.")


Add the common names from the downloaded table to the assignment file

In [None]:
import pandas as pd

# Load the two provided files
file_1_path = r"C:\Users\alexa\Documents\Webapp\data\WPPs\Power_Station_Dictionary.xlsx"
file_2_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\production_summary_all.xlsx"

# read the data
df1 = pd.read_excel(file_1_path)
df2 = pd.read_excel(file_2_path)

# insert the three new, empty columns directly behind the column 'AreaTypeCode'
area_type_code_index = df2.columns.get_loc('AreaTypeCode') + 1
for offset, new_column in enumerate(['Names_UK_Plants', 'ID_The-Wind-Power', 'Comment']):
    df2.insert(area_type_code_index + offset, new_column, None)

# Match every generation unit against the "National Grid BMU ID" column of the
# dictionary and store the common name(s) found in "Names_UK_Plants"
for index, generation_unit_name in df2['GenerationUnitName'].items():

    # A missing or empty name cannot be matched (an empty pattern would match every row)
    if not isinstance(generation_unit_name, str) or not generation_unit_name:
        continue

    # Plain substring search: plant names may contain characters such as
    # '(' or '+' that must not be interpreted as a regular expression
    matching_rows = df1[df1['National Grid BMU ID'].str.contains(generation_unit_name, na=False, case=False, regex=False)]

    # Rows without a common name are ignored, they would break the join below
    common_names = matching_rows['Common Name'].dropna().astype(str).tolist()
    if common_names:
        df2.at[index, 'Names_UK_Plants'] = ', '.join(common_names)

# Save the updated DataFrame to a new Excel file
output_path = r"C:\Users\alexa\Documents\Webapp\data\Assignment.xlsx"
df2.to_excel(output_path, index=False)

output_path

load WPPs and assignment file (after manual assignment has been conducted) and combine the information

In [None]:
import pandas as pd

# Load the plant master data and the manually completed assignment table
df_wind_power = pd.read_parquet(path="data/WPPs/The_Wind_Power.parquet")
df_assignment = pd.read_excel(io="data/Assignment.xlsx")

In [None]:
# Drop the rows whose "ID_The-Wind-Power" entry is the marker text "not found"
has_assignment = df_assignment["ID_The-Wind-Power"].ne("not found")
df_assignment = df_assignment[has_assignment]

# Extrahiere und entpacke alle gültigen IDs aus der Spalte "ID_The-Wind-Power"
def extract_ids(value):
    """Return the plant IDs contained in one cell as a list of ints.

    Accepted cell contents:

    * a string holding a list literal such as "[12, 34]", parsed with
      ``ast.literal_eval`` (replaces ``eval``, so text from the spreadsheet is
      never executed as code),
    * a string holding a single integer such as "12",
    * an integer, including numpy integer types,
    * a float with an integral value such as 12.0 (whole numbers can arrive as
      floats once a column contains empty cells).

    Anything else, in particular NaN and None, yields an empty list.

    Raises
    ------
    ValueError
        If a string is neither a list literal nor an integer.
    """
    import ast
    import math
    import numbers

    if isinstance(value, str):
        text = value.strip()
        if text.startswith("[") and text.endswith("]"):
            return [int(item) for item in ast.literal_eval(text)]
        return [int(text)]  # a single ID becomes a one-element list
    if isinstance(value, numbers.Integral):
        return [int(value)]
    if isinstance(value, numbers.Real) and math.isfinite(value) and float(value).is_integer():
        return [int(value)]
    return []

# Collect every ID that occurs anywhere in the assignment table
valid_ids = set()
for raw_value in df_assignment["ID_The-Wind-Power"]:
    valid_ids.update(extract_ids(raw_value))

# Plants that exist in the master data, and IDs without a matching plant
id_is_assigned = df_wind_power['ID'].isin(valid_ids)
df_filtered = df_wind_power[id_is_assigned].copy()
actual_ids = set(df_filtered['ID'])
suspended_ids = valid_ids - actual_ids

print("number potential WPPs:", len(valid_ids))
print("number actual WPPs:", len(actual_ids))
print("number suspended WPPs (no name, location, capacity or status not in operation):", len(suspended_ids))

# One production column per month from 2015_01 to 2024_12 that exists in the
# assignment table; every cell starts as its own empty list
month_labels = [f"{year}_{month:02d}" for year in range(2015, 2025) for month in range(1, 13)]
new_columns = {}
for label in month_labels:
    if label in df_assignment.columns:
        new_columns[label] = [[] for _ in range(len(df_filtered))]

# Append the new columns to the DataFrame
df_filtered = pd.concat([df_filtered, pd.DataFrame(new_columns, index=df_filtered.index)], axis=1)

# Month columns that exist in the assignment table (2024_11 and 2024_12 are absent)
production_columns = [
    f"{year}_{month:02d}"
    for year in range(2015, 2025)
    for month in range(1, 13)
    if f"{year}_{month:02d}" in df_assignment.columns
]

# Walk through the assignment table and attach the production data to the plants
for _, row in df_assignment.iterrows():

    ids_in_row = extract_ids(row["ID_The-Wind-Power"])
    if not ids_in_row:
        continue # row without any ID (e.g. empty cell): nothing to attach

    first_id = ids_in_row[0]

    if first_id in suspended_ids:
        continue # jump to next iteration, because following line would fail for suspended_ids

    current_index = df_filtered.loc[df_filtered['ID'] == first_id].index[0]

    # add production values for each month, only requires the first ID
    for column_name in production_columns:
        production_month = row[column_name]
        if isinstance(production_month, str): # a string cell holds the production values
            production_month = production_month.replace("nan", "0") # missing production values are treated as 0
            # NOTE(review): eval executes the cell text as Python code; safe only
            # while Assignment.xlsx is trusted. Consider ast.literal_eval.
            production_month = eval(production_month) # [[]] <-- "[[]]"
            production_month = production_month[0] # [] <-- [[]]
            existing_production = df_filtered.at[current_index, column_name]

            if existing_production == []: # no production values in cell for this month
                df_filtered.at[current_index, column_name] = production_month
            else: # several production series are summed up for one WPP (zip stops at the shorter series)
                combined_production = [a + b for a, b in zip(existing_production, production_month)]
                df_filtered.at[current_index, column_name] = combined_production

    # add capacities of WPPs, if several are assigned to one row in the assignment table
    if first_id in actual_ids and len(ids_in_row) > 1: # only treat every id once here, because rows are discarded
        total_power = 0
        for plant_id in ids_in_row:
            plant_mask = df_filtered['ID'] == plant_id
            if not plant_mask.any():
                continue # suspended or already merged into another cluster: no capacity row to add
            total_power += df_filtered.loc[plant_mask, "Total power"].item() # add power
            if plant_id != first_id:
                df_filtered = df_filtered[~plant_mask] # delete from dataframe
        df_filtered.loc[df_filtered['ID'] == first_id, "Total power"] = total_power

    # keep track of treated IDs to not try to delete rows twice
    actual_ids.difference_update(ids_in_row)

actual_cluster_ids = set(df_filtered['ID'])
print("number WPPs after clustering", len(actual_cluster_ids))
df_filtered.to_excel("data/WPPs+production.xlsx", index=False)

assign wind speeds to WPPs and productions

In [None]:
import xarray as xr

import pandas as pd

# The data are loaded lazily, so wind_speed_file must stay valid while the next cell runs.
# The files are split into years, because data points at large indices of too
# large files can't be loaded into memory during lazy loading.
wind_speed_file = r"C:\Users\alexa\Documents\Webapp\data\weather_history\2024.grib"

# The "time" dimension is chunked, because the wind speed computation
# np.sqrt(u100**2 + v100**2) in the following cell does not work otherwise.
time_chunks = {"time": 100}
wind_speed = xr.open_dataset(wind_speed_file, engine="cfgrib", chunks=time_chunks)

# Coordinate axes of the dataset
times = pd.to_datetime(wind_speed['time'].values)
latitudes, longitudes = (wind_speed[axis].values for axis in ('latitude', 'longitude'))

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import interp2d
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Plants with their production data, and the manually completed assignment table
WPP_production = pd.read_excel(io="data/WPPs+production.xlsx")
df_assignment = pd.read_excel(io="data/Assignment.xlsx")

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import interp2d
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

WPP_production = pd.read_excel("data/WPPs+production.xlsx")

# all WPPs
# ids = WPP_production['ID'].values
# lats_plants = WPP_production['Latitude'].values
# lons_plants = WPP_production['Longitude'].values

# only two WPPs for test reasons
ids = WPP_production['ID'].iloc[0:2]
lats_plants = WPP_production['Latitude'].iloc[0:2]
lons_plants = WPP_production['Longitude'].iloc[0:2]

months = [f"{year}_{month:02d}" for year in range(2024, 2025) for month in range(10, 11) # range(2015, 2025) for month in range(1, 13)
            if f"{year}_{month:02d}" in WPP_production.columns]

for month in months:
    print(f"month {month}")

    month_data = times[times.strftime('%Y_%m') == month]
    if len(month_data) == 0:
        # the opened weather file does not cover this month
        print(f"no weather data for month {month}, skipped")
        continue

    start = times.get_loc(month_data[0])
    end = times.get_loc(month_data[-1])
    wind_speeds_month = wind_speed.isel(time=slice(start, end+1))
    # Wind speed magnitude from the u and v components of THIS month only.
    # Using the whole dataset here would make wind_speeds[i] below refer to the
    # first time steps of the file instead of the month, and load the complete
    # file into memory. This operation requires chunking.
    wind_speeds = xr.apply_ufunc(
        np.sqrt,
        wind_speeds_month['u100']**2 + wind_speeds_month['v100']**2,
        dask="parallelized")
    wind_speeds = wind_speeds.load()

    for j in range(len(ids)):
        print(f"Wind Power Plant {j+1} / {len(ids)}")
        lon = lons_plants[j]
        lat = lats_plants[j]
        if WPP_production.at[j, month] != "[]":  # Check if there is production data
            interpolated_wind_speeds = np.zeros(len(month_data))
            for i, _ in enumerate(month_data):
                wind_speeds_i = wind_speeds[i].values # time is the first dimension, i. e. wind_speeds[i] = wind_speeds[i, :, :]
                # NOTE(review): interp2d is deprecated in SciPy (the DeprecationWarning is silenced in this cell) and presumably unavailable in recent releases - confirm the installed version before upgrading.
                spatial_interpolator = interp2d(longitudes, latitudes, wind_speeds_i, kind='cubic')
                interpolated_value = spatial_interpolator(lon, lat)[0]
                interpolated_value = round(interpolated_value, 3) # saves memory and computing resources
                interpolated_wind_speeds[i] = interpolated_value

            # Store the data as a NumPy array with production in row 0 and wind speed in row 1;
            # np.stack requires both series to have the same length
            production_data = np.array(eval(WPP_production.at[j, month]))  # previous data, parsed from its text form
            combined_data = np.stack((production_data, interpolated_wind_speeds), axis=0)
            WPP_production.at[j, month] = combined_data  # store the combined data

In [None]:
# Columns to export: the plant master data followed by the two month columns
metadata_columns = [
    'ID', 'Name', '2nd name', 'Latitude', 'Longitude', 'Manufacturer', 'Turbine',
    'Hub height', 'Number of turbines', 'Total power', 'Developer', 'Operator',
    'Owner', 'Commissioning date', 'Status',
]
columns_to_save = metadata_columns + ['2024_09', '2024_10']

# Keep only the plants whose ID is contained in ids, restricted to the export columns
is_selected_plant = WPP_production['ID'].isin(ids)
filtered_WPP_production = WPP_production.loc[is_selected_plant, columns_to_save]

# Write the selection to an Excel file
filtered_WPP_production.to_excel("data/WPPs+production+weather.xlsx", index=False)