<h1>1. Preprocessing of historical production data: discard data of unwanted power plants, retain monthly files</h1>
<ul>
    <li>WPPs are considered unique if they have the same name and installed capacity (capacity can change for the same WPP --> consider it as a new WPP)</li>
    <ul>
        <li>real duplicates are avoided by creating a set unique_windfarms_set and comparing, if already added</li>
        <li>distinguish between real duplicates (same 'GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', 'GenerationUnitInstalledCapacity(MW)', only different 'AreaCode', 'AreaDisplayName', 'AreaTypeCode' and 'MapCode') and WPPs that have the same 'GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', but a different 'GenerationUnitInstalledCapacity(MW)' (changed over time)</li>
    </ul>
    <li>takes 1 to 2 minutes per month, partially because only values for full hours are retained</li>
</ul>

In [None]:
import pandas as pd
import os
import json
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Base directories: raw ENTSO-E exports (input) and monthly JSON summaries (output)
input_dir = r"E:\MA_data\raw production history ENTSO-E"
output_dir = r"C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON"
os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') below does not create missing folders

# Months to process, formatted like the prefix of the ENTSO-E file names (YYYY_MM).
# NOTE: start/end currently select 2019-09 only; widen them (e.g. "2015-01" to "2024-10")
# to regenerate the complete history.
months = pd.date_range(start="2019-09", end="2019-09", freq="MS").strftime("%Y_%m").tolist()

# One monthly file per iteration
for month in months:
    # Build the input and output paths
    input_file = os.path.join(input_dir, f"{month}_ActualGenerationOutputPerGenerationUnit_16.1.A_r2.1.csv")
    output_file = os.path.join(output_dir, f"production_summary_{month}.json")

    # Skip months for which no raw file is available
    if not os.path.exists(input_file):
        print(f"Datei nicht gefunden: {input_file}")
        continue

    # Read the tab-separated raw file
    print(f"Bearbeite Datei: {input_file}")
    data = pd.read_csv(input_file, sep='\t')

    # Keep wind units only. The type labels are compared including their trailing blank,
    # exactly as before (presumably this is how the raw export spells them - confirm).
    # .copy() makes the result independent of `data`, so the column assignment below
    # does not write into a slice.
    is_wind = data['GenerationUnitType'].isin(['Wind Onshore ', 'Wind Offshore '])
    filtered_data = data[is_wind].copy()

    # Parse the timestamps once and store them in ISO-8601 format
    timestamps = pd.to_datetime(filtered_data['DateTime (UTC)'])
    filtered_data['DateTime (UTC)'] = timestamps.dt.strftime('%Y-%m-%dT%H:%M:%S')

    # Only full hours are retained (the weather data are hourly) and missing values are
    # skipped. Doing this once for the whole month on the parsed timestamps replaces the
    # per-row pd.to_datetime() call that dominated the run time.
    usable = (timestamps.dt.minute == 0) & filtered_data['ActualGenerationOutput(MW)'].notna()
    hourly_data = filtered_data[usable]

    # 'AreaCode', 'AreaDisplayName', 'AreaTypeCode' and 'MapCode' of identical WPPs may differ
    # --> use one of them as a criterion to identify unique wind farms and sort out the
    # duplicates below, because otherwise the production data are appended twice to the
    # same wind farm.
    unique_windfarms = filtered_data[['GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', 'GenerationUnitInstalledCapacity(MW)', 'AreaCode']].drop_duplicates()
    unique_windfarms_set = set(unique_windfarms['GenerationUnitName'])

    # One dictionary per wind farm, including its hourly production
    production_data = []
    for _, row in unique_windfarms.iterrows():
        name = row['GenerationUnitName']

        # Don't add duplicates: only the first row per name is exported.
        # NOTE(review): the key is the name alone, so a unit whose installed capacity
        # changes within one month is exported once (with the first capacity) - confirm
        # that this is intended.
        if name not in unique_windfarms_set:
            continue
        unique_windfarms_set.discard(name)

        # Rows of the current wind farm; the AreaCode criterion is important to avoid
        # adding the production data of all its duplicates
        windfarm_data = hourly_data[
            (hourly_data['GenerationUnitName'] == name) &
            (hourly_data['AreaCode'] == row['AreaCode'])
        ]

        # List of [time, production] pairs, because JSON cannot store arrays
        production_array = [
            [time, production]
            for time, production in zip(
                windfarm_data['DateTime (UTC)'],
                windfarm_data['ActualGenerationOutput(MW)']
            )
        ]

        production_data.append({
            'GenerationUnitName': name,
            'GenerationUnitCode': row['GenerationUnitCode'],
            'GenerationUnitType': row['GenerationUnitType'],
            'GenerationUnitInstalledCapacity(MW)': row['GenerationUnitInstalledCapacity(MW)'],
            'Production': production_array
        })

    # Save the monthly summary as JSON
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(production_data, json_file, ensure_ascii=False, indent=4)

    print(f"JSON-Datei wurde erfolgreich erstellt: {output_file}")


<h1>2. Merge all monthly production data files into one combined file</h1>

In [None]:
import pandas as pd
import os
import json

# Folder with the monthly summaries and path of the combined result
input_dir = r"C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON"
output_file = r"C:\Users\alexa\Documents\Webapp\data\production_history\production_summary_all.json"

# All months from 2015_01 to 2024_10, formatted like the file name suffix
months = pd.date_range(start="2015-01", end="2024-10", freq="MS").strftime("%Y_%m").tolist()

# A WPP is identified by name, code, type and installed capacity
columns_merge = ['GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', 'GenerationUnitInstalledCapacity(MW)']
final_data = {}

for month in months:
    input_file = os.path.join(input_dir, f"production_summary_{month}.json")

    if os.path.exists(input_file):
        print(f"Verarbeite Datei: {input_file}")
        with open(input_file, 'r', encoding='utf-8') as file:
            monthly_data = json.load(file)

        for windfarm in monthly_data:
            # Unique key per WPP, built from the columns in columns_merge
            key = tuple([windfarm[column] for column in columns_merge])

            # The first occurrence becomes the stored entry; every later occurrence only
            # contributes its production values to that entry
            stored = final_data.setdefault(key, windfarm)
            if stored is not windfarm:
                stored['Production'].extend(windfarm['Production'])
    else:
        # No summary for this month
        print(f"Datei nicht gefunden: {input_file}")

# Final JSON layout: a plain list of WPP dictionaries
merged_data = list(final_data.values())

# Write the combined file
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(merged_data, json_file, ensure_ascii=False, indent=4)

print(f"Zusammengeführte JSON-Datei wurde erfolgreich gespeichert unter: {output_file}")


<h1>3. Create Excel file listing the WPPs in the JSON file</h1>

In [None]:
import pandas as pd
import json

# Combined production file (input) and Excel overview (output)
file_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\production_summary_all.json"
output_excel_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\JSON_File.xlsx"

# Read the JSON file
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Give every WPP an ID equal to its position in the list of dictionaries
for position, windfarm in enumerate(data):
    windfarm.update({'JSON-ID': position})

# Columns that go into the Excel overview (the production values are left out)
overview_columns = [
    'GenerationUnitName',
    'GenerationUnitCode',
    'GenerationUnitType',
    'GenerationUnitInstalledCapacity(MW)',
    'JSON-ID',
]
df = pd.DataFrame(data).loc[:, overview_columns]

# Alphabetical order by unit name
df_sorted = df.sort_values('GenerationUnitName')

# Write the overview to Excel
df_sorted.to_excel(output_excel_path, index=False)

# Show the output path as the cell result
output_excel_path

<h1>4. Download table from website to find common names for UK power plants</h1>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the web page
url = "https://osuked.github.io/Power-Station-Dictionary/dictionary.html"

# Fetch the page; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, timeout=30)
if response.status_code != 200:
    # Raise instead of calling exit(): exit() would stop the interpreter / notebook kernel
    raise RuntimeError(f"Fehler beim Abrufen der Webseite. (HTTP {response.status_code})")

# Parse the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all tables on the page
tables = soup.find_all('table')
if len(tables) < 3:
    raise RuntimeError("Weniger als 3 Tabellen auf der Seite gefunden.")

# Select the third table (index 2, Python counts from zero)
table = tables[2]

# Column names
headers = [header.text.strip() for header in table.find_all('th')]

# Data rows; the first <tr> is the header row
rows = []
for row in table.find_all('tr')[1:]:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:  # rows without any <td> carry no data and would only add an empty line
        rows.append(cells)

# Build the DataFrame
df = pd.DataFrame(rows, columns=headers)

# Save the table
output_path = "data/WPPs/Power_Station_Dictionary.xlsx"
df.to_excel(output_path, index=False)
print(f"Die dritte Tabelle wurde erfolgreich als '{output_path}' gespeichert.")

<h1>5. Add common names from the downloaded table to the WPP list derived from the JSON file</h1>

In [None]:
import pandas as pd

# Paths of the power station dictionary and of the WPP list created from the JSON file
file_1_path = r"C:\Users\alexa\Documents\Webapp\data\WPPs\Power_Station_Dictionary.xlsx"
file_2_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\JSON_File.xlsx"

# Read the data
df1 = pd.read_excel(file_1_path)
df2 = pd.read_excel(file_2_path)

# Introduce new columns at the end; the last two are filled in manually later
df2['Names_UK_Plants'] = None
df2['ID_The-Wind-Power'] = None
df2['Comment'] = None

bmu_ids = df1['National Grid BMU ID']

# For every unit name, collect the common names of all dictionary rows whose
# "National Grid BMU ID" contains that name
for index, generation_unit_name in df2['GenerationUnitName'].items():
    # Missing or empty names cannot be matched (an empty string would match every row)
    if not isinstance(generation_unit_name, str) or not generation_unit_name:
        continue

    # regex=False: the name is searched literally, so characters such as "(", "+" or "."
    # in a plant name are neither interpreted as a pattern nor able to raise a regex error
    is_match = bmu_ids.str.contains(generation_unit_name, case=False, na=False, regex=False)

    # Rows without a common name are skipped, because ', '.join() only accepts strings
    common_names = df1.loc[is_match, 'Common Name'].dropna().astype(str).tolist()

    if common_names:
        df2.at[index, 'Names_UK_Plants'] = ', '.join(common_names)

# Save the updated DataFrame to a new Excel file
output_path = r"C:\Users\alexa\Documents\Webapp\data\Assignment.xlsx"
df2.to_excel(output_path, index=False)

# Show the output path as the cell result
output_path

<h1>6. Perform manual assignment to The Wind Power database indices</h1>

<h1>7. save WPPs in parquet file (The Wind Power)</h1>
<ul>
    <li>saves a lot of time when loading the map</li>
</ul>

In [None]:
import pandas as pd

# Load the Excel file once, keep only the relevant data and store them as a Parquet file
WPP_file = "data/WPPs/Windfarms_Europe_20241123.xlsx"
df = pd.read_excel(WPP_file, sheet_name='Windfarms', na_values=["#ND"])
df = df.drop(0) # remove first row (after headlines)

# Keep only the required columns
df = df[['ID', 'Name', '2nd name', 'Latitude', 'Longitude', 'Manufacturer', 'Turbine', 'Hub height', 'Number of turbines', 'Total power', 'Developer', 'Operator', 'Owner', 'Commissioning date', 'Status']]

# Drop rows in which Name, Total power, Latitude or Longitude is missing
df = df.dropna(subset=['Name', 'Total power', 'Latitude', 'Longitude'])

# Keep only rows with Status == "Production".
# .copy() makes the result independent, so the conversions below do not write into a slice.
df = df.loc[df['Status'] == 'Production'].copy()

# A missing number of turbines is stored as 0 so that the column can be converted to int
df['Number of turbines'] = df['Number of turbines'].fillna(0)

# Convert every column explicitly to its data type.
# Note: astype(str) turns missing text cells into the literal string "nan".
df = df.astype({
    'ID': int,
    'Name': str,
    '2nd name': str,
    'Latitude': float,
    'Longitude': float,
    'Manufacturer': str,
    'Turbine': str,
    'Hub height': float,
    'Number of turbines': int,
    'Total power': float,
    'Developer': str,
    'Operator': str,
    'Owner': str,
    'Commissioning date': str,
    'Status': str,
})

# One row per wind farm (sheet 'Windfarms'); the old message had a stray ")" and said "turbines"
print(f"number of wind farms: {len(df)}")

# Save the filtered data in Parquet format (much faster to read and write than Excel files, and smaller)
df.to_parquet("data/WPPs/The_Wind_Power.parquet")

<h1>8. load WPPs and assignment file and combine the information</h1>
<ul>
    <li>assignment file matches parquet file (ID_The-Wind-Power) to json file (JSON-ID)</li>
    <li>all three files are loaded and a new JSON file is created</li>
    <li>the rows in the Excel file correspond exactly to the rows in the JSON file (same number)</li>
    <li>JSON-IDs in the produced JSON file are those of WPPs that have a match in The Wind Power database (no "not found"); where the production data of several WPPs are added up, the JSON-ID of the first of these WPPs is kept</li>
    <li>When the capacity of a wind park changes over time due to extension (see e. g. CLDCW-1), its name and generation unit code number remain the same, but the wind farm must be considered as a new one, resulting in a new row in the excel file / new dictionary in the json file.</li>
</ul>

In [2]:
import pandas as pd
import json

# Load the data: The Wind Power database, the manual assignment table and the combined
# production history (a list of WPP dictionaries, despite the df_ prefix)
df_wind_power = pd.read_parquet("data/WPPs/The_Wind_Power.parquet")
df_assignment = pd.read_excel("data/Assignment_manual.xlsx", sheet_name="Sheet1")
# The file is written as UTF-8 with ensure_ascii=False, so it has to be read as UTF-8 as
# well; the platform default encoding would garble or reject non-ASCII plant names
with open(r"C:\Users\alexa\Documents\Webapp\data\production_history\production_summary_all.json", "r", encoding="utf-8") as file:
    df_json = json.load(file)

In [7]:
output_file = "data/WPPs+production.json"

# Drop the rows whose "ID_The-Wind-Power" entry is "not found"
has_match = df_assignment["ID_The-Wind-Power"].ne("not found")
df_assignment = df_assignment.loc[has_match]

# Set with the unique generation unit codes of the remaining rows
generation_unit_code_set = set(df_assignment['GenerationUnitCode'].tolist())

# Extract and unpack all valid IDs from one cell of the "ID_The-Wind-Power" column
def extract_ids(value):
    """Return the IDs encoded in one assignment cell as a list of ints.

    Args:
        value: Cell content. Either a single ID (integer, integral float or numeric
            string) or a string holding a list literal such as "[123, 456]".

    Returns:
        list[int]: The IDs in the order given; an empty list for values that carry
        no ID (None, NaN, non-integral numbers, other types).

    Raises:
        ValueError: If a string is neither a list literal nor a number.
        SyntaxError: If a bracketed string is not a valid Python literal.
    """
    # Local imports keep this definition self-contained
    import ast
    import numbers

    if isinstance(value, str):
        text = value.strip()
        if text.startswith("[") and text.endswith("]"):
            # literal_eval only accepts literals, so a spreadsheet cell can never
            # execute code the way eval() would
            return [int(entry) for entry in ast.literal_eval(text)]
        return [int(text)]  # single ID stored as text

    # Covers Python ints as well as numpy integers
    if isinstance(value, numbers.Integral):
        return [int(value)]

    # A column that contains empty cells is read as float, so 123 arrives as 123.0;
    # NaN, inf and non-integral values carry no ID
    if isinstance(value, numbers.Real) and float(value).is_integer():
        return [int(value)]

    return []

# Collect every The Wind Power ID that is referenced in the assignment file
# (apply() is used only for its side effect of filling valid_ids)
valid_ids = set()
df_assignment["ID_The-Wind-Power"].apply(lambda x: valid_ids.update(extract_ids(x)))

# Rows of The Wind Power database that are referenced by the assignment file
df_filtered = df_wind_power[df_wind_power['ID'].isin(valid_ids)].copy()
actual_ids = set(df_filtered['ID'])  # referenced IDs that exist in the database; IDs are removed again once a row has used them
suspended_ids = valid_ids - actual_ids  # referenced IDs that are missing from the database

print("number potential WPPs:", len(valid_ids))
print("number actual WPPs:", len(actual_ids))
print("number suspended WPPs (no name, location, capacity or status not in operation):", len(suspended_ids))

production_data = [] # content of the new JSON file: one dictionary per WPP including its production data
temporal_wpps = [] # WPPs whose production was merged; stored temporarily and moved to the end of production_data later

# Walk through every row of the assignment file and attach the production data
for _, row in df_assignment.iterrows():

    ids_in_row = extract_ids(row["ID_The-Wind-Power"])
    # NOTE(review): raises IndexError if extract_ids() returns an empty list for this cell
    first_id = ids_in_row[0] # dismiss other ids in the same row, because the capacity of the WPP is not taken from the wind power database anyway and other statistics should be the same for all indices

    if first_id in suspended_ids:
        continue # jump to next iteration, because following line would fail for suspended_ids

    production_array = df_json[row['JSON-ID']]['Production']
    capacity = row['GenerationUnitInstalledCapacity(MW)']

    # first_id is no longer in actual_ids when an earlier row has already used this ID
    if first_id not in actual_ids: # several lines in assignment files for one WPP in The Wind Power file
        if row['GenerationUnitCode'] not in generation_unit_code_set: # another row with the same generation unit code as a previous row --> create new WPP although its first_id is identical, because the capacity differs
            pass # continue at current_index = ...
        else: # add production data to existing WPP
            pass
            # Every WPP already stored under this ID receives the production of the current row
            for _, wpp in enumerate(production_data):
                if wpp['ID_The-Wind-Power'] == first_id:

                    existing_production = wpp['Production']

                    # Compare the timestamps and add the values only where they match
                    # NOTE(review): the walk below presumably relies on both lists being sorted by ascending timestamp - confirm
                    i, j = 0, 0  # two pointers, one for existing_production and one for production_array
                    updated_production = []

                    while i < len(existing_production) and j < len(production_array):
                        time, existing_value = existing_production[i]
                        time_comp, new_value = production_array[j]

                        if time == time_comp:
                            updated_production.append([time, existing_value + new_value])
                            i += 1
                            j += 1
                        elif time < time_comp:
                            i += 1 # timestamp exists only in existing_production --> dropped
                        else:
                            j += 1 # timestamp exists only in production_array --> dropped

                    # Without any common timestamp the stored WPP is left unchanged
                    if updated_production != []:
                        wpp['Production'] = updated_production # update production data (the result only contains entries with matching timestamps)
                        wpp['Capacity'] = wpp['Capacity'] + capacity # update capacity
                        temporal_wpps.append(wpp) # same dictionary object that is stored in production_data
            continue # don't add another time to the production data
    else: # after wpps' production has been changed, treat temporal_wpps. Only possible now, because some wpps were needed multiple times
        if len(temporal_wpps) > 0:
            for wpp_new in temporal_wpps:
                # if available, delete the wpp from production data (recognised by GenerationUnitCode and GenerationUnitInstalledCapacity(MW))
                production_data = [wpp for wpp in production_data if not (wpp['Code'] == wpp_new['Code'] and wpp['Capacity'] == wpp_new['Capacity'])]
                production_data.append(wpp_new)
            temporal_wpps = []

    # Index label of the database row that belongs to first_id
    current_index = df_filtered.loc[df_filtered['ID'] == first_id].index[0]

    # Assemble the entry for this WPP
    row_data = {
        'Name': row['GenerationUnitName'], # from assignment file
        'ID_The-Wind-Power': first_id, # from assignment file
        'JSON-ID': row['JSON-ID'], # from assignment file
        'Code': row['GenerationUnitCode'], # from assignment file
        'Type': row['GenerationUnitType'], # from assignment file
        'Capacity': capacity, # from assignment file
        'Hub_height': df_filtered.at[current_index, "Hub height"], # from The Wind Power file
        'Commissioning_date': df_filtered.at[current_index, "Commissioning date"], # from The Wind Power file
        'Number_of_turbines': int(df_filtered.at[current_index, "Number of turbines"]), # from The Wind Power file (value only valid for latest WPPs)
        'Turbine': df_filtered.at[current_index, "Turbine"], # from The Wind Power file
        'Latitude': df_filtered.at[current_index, "Latitude"], # from The Wind Power file
        'Longitude': df_filtered.at[current_index, "Longitude"], # from The Wind Power file
        'Production': production_array # from JSON file
    }

    production_data.append(row_data)

    # keep track of treated generation unit codes
    generation_unit_code_set.discard(row['GenerationUnitCode'])

    # keep track of treated IDs to not try deleting rows twice (note: the loop variable shadows the built-in id())
    for id in ids_in_row:
        if id in actual_ids:
            actual_ids.discard(id)

print("number WPPs after clustering", len(production_data))

# Save the JSON file
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(production_data, json_file, ensure_ascii=False, indent=4)

print(f"Zusammengeführte JSON-Datei wurde erfolgreich gespeichert unter: {output_file}")

# Convert the list to a DataFrame
df_production_data = pd.DataFrame(production_data)

# Save the DataFrame to an Excel file
df_production_data.to_excel("data/WPPs+production.xlsx", index=False)

number potential WPPs: 116
number actual WPPs: 115
number suspended WPPs (no name, location, capacity or status not in operation): 1
number WPPs after clustering 106
Zusammengeführte JSON-Datei wurde erfolgreich gespeichert unter: data/WPPs+production.json


<h1>9. Add weather data</h1>
<ul>
<li>programme execution accelerated by</li>
        <ul>
                <li>interp2d instead of RegularGridInterpolator</li>
                <li>interpolation method linear instead of cubic</li>
                <li>only perform the single most time consuming step (extraction of u and v values) once at the beginning outside of any loop</li>
                <li>outer iteration over WPPs, even if this means creating a new interpolator not only for each time step, but also for each WPP, because it saves the search for a matching time step among all time steps in the production array of a WPP for all WPPs in the inner WPP loop</li>
        </ul>
</ul>

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.interpolate import interp2d
import warnings
import json
import xarray as xr
warnings.filterwarnings("ignore", category=DeprecationWarning)

# interp2d is deprecated and, from SciPy 1.14 on, no longer usable. RectBivariateSpline
# with kx=ky=1 is the replacement the SciPy transition guide names for data on a regular
# grid; it yields the same bilinear interpolation.
# The interp2d import at the top of this cell is no longer used by the code below.
from scipy.interpolate import RectBivariateSpline

# Base directories for input and output
input_dir = r"C:\Users\alexa\Documents\Webapp\data\weather_history"
output_dir = "data"

# The production data are the same for every year, so they are read only once
WPP_production = pd.read_json("data/WPPs+production.json")

# Loop over the years 2015 to 2024
for year in range(2015, 2025):
    print(f"Processing year {year}...")

    # File names for input and output
    wind_speed_file = os.path.join(input_dir, f"{year}.grib")
    output_file_json = os.path.join(output_dir, f"WPPs+production+wind_{year}.json")
    output_file_excel = os.path.join(output_dir, f"WPPs+production+wind_{year}.xlsx")

    # Load the wind data into memory once (the most time-consuming step) and release the
    # GRIB file handle before the next year is opened
    with xr.open_dataset(wind_speed_file, engine="cfgrib", chunks={"time": 100}) as wind_speed:
        times = pd.to_datetime(wind_speed['time'].values)
        latitudes = wind_speed['latitude'].values
        longitudes = wind_speed['longitude'].values
        u = wind_speed['u100'].values
        v = wind_speed['v100'].values

    # RectBivariateSpline requires strictly ascending axes (interp2d sorted them itself).
    # Descending axes are reversed as views, i.e. without copying the fields.
    # Assumes u100/v100 are laid out as (time, latitude, longitude), as the indexing
    # u[time_index] below already does.
    if latitudes[0] > latitudes[-1]:
        latitudes = latitudes[::-1]
        u = u[:, ::-1, :]
        v = v[:, ::-1, :]
    if longitudes[0] > longitudes[-1]:
        longitudes = longitudes[::-1]
        u = u[:, :, ::-1]
        v = v[:, :, ::-1]

    year_prefix = f"{year}-"  # the ISO timestamps start with the year
    WPP_production_wind = []

    # Iterate over all wind power plants
    for i, wpp in WPP_production.iterrows():
        lon = wpp['Longitude']
        lat = wpp['Latitude']

        # Production entries of the current year
        production_subset = [entry for entry in wpp['Production'] if entry[0].startswith(year_prefix)]

        if not production_subset:
            print(f"Wind power plant {i+1}/{len(WPP_production)} has no production data for {year}, skipping...")
            continue

        print(f"Wind power plant {i+1}/{len(WPP_production)} for year {year}")
        interpolated_production = []
        for time_str, production_value in production_subset:
            try:
                time_index = times.get_loc(pd.to_datetime(time_str))
            except KeyError:
                continue  # no weather data for this time step

            # Wind speed on the whole grid, then bilinear interpolation at the WPP location
            wind_speeds = np.sqrt(u[time_index]**2 + v[time_index]**2)
            spatial_interpolator = RectBivariateSpline(latitudes, longitudes, wind_speeds, kx=1, ky=1)
            wind_speed_value = round(float(spatial_interpolator(lat, lon)[0, 0]), 2)

            interpolated_production.append([time_str, production_value, wind_speed_value])

        # Store the WPP with its [time, production, wind speed] entries
        wpp_record = wpp.to_dict()
        wpp_record['Production'] = interpolated_production
        WPP_production_wind.append(wpp_record)

    # Save the updated production data
    with open(output_file_json, 'w', encoding='utf-8') as json_file:
        json.dump(WPP_production_wind, json_file, ensure_ascii=False, indent=4)
    print(f"Updated JSON file for {year} saved to: {output_file_json}")

    # Convert the list to a DataFrame and save it as Excel
    df_WPP_production = pd.DataFrame(WPP_production_wind)
    df_WPP_production.to_excel(output_file_excel, index=False)
    print(f"Updated Excel file for {year} saved to: {output_file_excel}")

<h1>10. Merge all data into one file</h1>

In [None]:
import os
import json
import pandas as pd

# Directory and files
input_dir = r"C:\Users\alexa\Documents\Webapp\data"
output_json = os.path.join(input_dir, "WPPs+production+wind.json")
output_excel = os.path.join(input_dir, "WPPs+production+wind.xlsx")

# Headers of the Excel overview (unchanged, so existing readers of the file keep working)
columns = [
    "Name",
    "ID_The-Wind-Power",
    "JSON-ID",
    "Code",
    "Type",
    "Capacity",
    "Hub_height",
    "Commission_date",
    "Number_of_turbines",
    "Turbine",
    "Latitude",
    "Longitude",
]
# Excel header --> key in the JSON records, where the two differ. The records store the
# date under "Commissioning_date"; looking up "Commission_date" always returned None and
# left the Excel column empty.
json_key_by_column = {"Commission_date": "Commissioning_date"}

all_wpp_data = {}  # JSON-ID --> WPP dictionary with the production of all years
excel_rows = []

# Iterate over the yearly JSON files
for year in range(2015, 2025):
    file_path = os.path.join(input_dir, f"WPPs+production+wind_{year}.json")

    # Skip years without a file
    if not os.path.isfile(file_path):
        print(f"Datei {file_path} nicht gefunden, überspringe...")
        continue

    # Load the JSON file
    with open(file_path, "r", encoding="utf-8") as file:
        yearly_data = json.load(file)

    for wpp in yearly_data:
        json_id = wpp["JSON-ID"]

        # Known WPP: only append the production data of this year
        # (dictionary membership replaces the linear search in a separate ID list)
        if json_id in all_wpp_data:
            all_wpp_data[json_id]["Production"].extend(wpp["Production"])
            continue

        # New WPP: store it and collect its row for the Excel overview
        all_wpp_data[json_id] = wpp
        excel_rows.append([wpp.get(json_key_by_column.get(column, column)) for column in columns])

# Write the JSON file
with open(output_json, "w", encoding="utf-8") as file:
    json.dump(list(all_wpp_data.values()), file, indent=4, ensure_ascii=False)

# Write the Excel file
excel_df = pd.DataFrame(excel_rows, columns=columns)
excel_df.to_excel(output_excel, index=False)

print(f"Zusammengeführte JSON-Datei gespeichert unter: {output_json}")
print(f"Zusammengeführte Excel-Datei gespeichert unter: {output_excel}")


Zusammengeführte JSON-Datei gespeichert unter: C:\Users\alexa\Documents\Webapp\data\WPPs+production+wind.json
Zusammengeführte Excel-Datei gespeichert unter: C:\Users\alexa\Documents\Webapp\data\WPPs+production+wind.xlsx


<h1>11. Manually add as many missing technical specifications as possible</h1>

<h1>12. Adopt technical specifications in JSON file</h1>

In [8]:
import os
import json
import pandas as pd

# Directory and files
input_dir = r"C:\Users\alexa\Documents\Webapp\data"
json_file = os.path.join(input_dir, "WPPs+production+wind.json")
excel_file = os.path.join(input_dir, "WPPs+production+wind.xlsx")

# Load the files
with open(json_file, "r", encoding="utf-8") as file:
    wpp_data = json.load(file)

excel_df = pd.read_excel(excel_file)

# One lookup table instead of scanning the whole column for every WPP. If a JSON-ID
# occurs several times, the first row wins, as it did with .values[0] before.
specs_by_id = excel_df.drop_duplicates(subset="JSON-ID").set_index("JSON-ID")[["Hub_height", "Turbine"]]


def _as_builtin(value):
    """Convert a numpy scalar to the matching Python type; json cannot serialise e.g. numpy integers."""
    return value.item() if hasattr(value, "item") else value


# Take over hub height and turbine type from the manually completed Excel file
for wpp in wpp_data:
    json_id = wpp["JSON-ID"]

    # A WPP without an Excel row keeps its current values instead of aborting the update
    if json_id not in specs_by_id.index:
        print(f"No Excel row found for JSON-ID {json_id}; keeping existing values")
        continue

    wpp["Hub_height"] = _as_builtin(specs_by_id.at[json_id, "Hub_height"])
    wpp["Turbine"] = _as_builtin(specs_by_id.at[json_id, "Turbine"])

# Save the updated JSON file (same path it was read from)
with open(json_file, "w", encoding="utf-8") as file:
    json.dump(wpp_data, file, indent=4, ensure_ascii=False)

# Double quotes nested inside an f-string are a SyntaxError before Python 3.12,
# so the path variable is used here
print(f"Aktualisierte JSON-Datei gespeichert unter: {json_file}")

Aktualisierte JSON-Datei gespeichert unter: C:\Users\alexa\Documents\Webapp\data\WPPs+production+wind.json


<h1>13. Verify that datapoints where production > capacity are not unreasonably numerous and significant</h1>

In [8]:
import pandas as pd
import json

# Read the combined file with production and wind data
with open("data/WPPs+production+wind.json", "r", encoding="utf-8") as file:
    WPP_production_wind = json.load(file)

# Every record whose production exceeds the capacity of its WPP
entries_with_over_capacity = []
for plant in WPP_production_wind:
    limit = plant["Capacity"]
    entries_with_over_capacity.extend(
        {
            "Name": plant["Name"],
            "Timestamp": record[0],
            "Production (MW)": record[1],
            "Capacity (MW)": limit
        }
        for record in plant["Production"]
        if record[1] > limit
    )

# Target file for the findings
output_excel_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\over_capacity_entries.xlsx"

# Only write an Excel file when at least one record exceeds the capacity
if not entries_with_over_capacity:
    print("Keine Werte mit Überkapazität gefunden.")
else:
    df_over_capacity = pd.DataFrame(entries_with_over_capacity)
    df_over_capacity.to_excel(output_excel_path, index=False)
    print(f"Überkapazitätswerte erfolgreich gespeichert unter: {output_excel_path}")

Überkapazitätswerte erfolgreich gespeichert unter: C:\Users\alexa\Documents\Webapp\data\production_history\over_capacity_entries.xlsx
