<h1>1. Preprocessing of historical production data: discard data of unwanted power plants, retain monthly files</h1>
<p>duplicates are avoided by creating a set unique_windfarms_set and comparing, if already added<br>
takes 1 to 2 minutes per month, partially because only values for full hours are retained

In [115]:
import pandas as pd
import os
import json
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Basisverzeichnisse
input_dir = r"E:\MA_data\raw production history ENTSO-E"
output_dir = r"C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON"

# Liste der Monate von 2015-01 bis 2024-10 generieren
months = pd.date_range(start="2019-09", end="2019-09", freq="MS").strftime("%Y_%m").tolist()

# For-Schleife für jede Datei
for month in months:
    # Dateipfad erstellen
    input_file = os.path.join(input_dir, f"{month}_ActualGenerationOutputPerGenerationUnit_16.1.A_r2.1.csv")
    output_file = os.path.join(output_dir, f"production_summary_{month}.json")

    # Überprüfen, ob die Datei existiert
    if not os.path.exists(input_file):
        print(f"Datei nicht gefunden: {input_file}")
        continue

    # Datei einlesen
    print(f"Bearbeite Datei: {input_file}")
    data = pd.read_csv(input_file, sep='\t')

    # Filtere nach GenerationUnitType == 'Wind Onshore' oder 'Wind Offshore'
    filtered_data = data[(data['GenerationUnitType'] == 'Wind Onshore ') | (data['GenerationUnitType'] == 'Wind Offshore ')]

    # Konvertiere 'DateTime (UTC)' direkt in das ISO-8601-Format
    filtered_data.loc[:, 'DateTime (UTC)'] = pd.to_datetime(filtered_data['DateTime (UTC)']).dt.strftime('%Y-%m-%dT%H:%M:%S')

    # Wichtige Spalten identifizieren, 'AreaCode', 'AreaDisplayName', 'AreaTypeCode' and 'MapCode' of identical WPPs may differ --> use at least one of them as a criterion to identify unique windfarms, and sort out the duplicates manually, because otherwise, the production data are appended twice to the same wind farm
    unique_windfarms = filtered_data[['GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', 'GenerationUnitInstalledCapacity(MW)', 'AreaCode']].drop_duplicates()
    unique_windfarms_set = set(unique_windfarms['GenerationUnitName'])

    # Listen für die Produktion zu jeder Stunde hinzufügen
    production_data = []
    for _, row in unique_windfarms.iterrows():
        # Filtern der Daten für das aktuelle Windkraftwerk
        windfarm_data = filtered_data[
            (filtered_data['GenerationUnitName'] == row['GenerationUnitName']) &
            (filtered_data['AreaCode'] == row['AreaCode']) # important to avoid adding to a wind farm production data of all its duplicates
        ]

        # Erstelle 2D-Array (Liste von Listen) mit Zeit und Produktion, da JSON keine Arrays speichern kann
        production_array = [
            [time, production]
            for time, production in zip(
                windfarm_data['DateTime (UTC)'],
                windfarm_data['ActualGenerationOutput(MW)']
            )
            if pd.notna(production) and pd.to_datetime(time).minute == 0  # Nur volle Stunden übernehmen (Resolution of weather data is hourly), although it significantly increases the execution time of the programme, und fehlende Werte überspringen
        ]

        # Daten für das Windkraftwerk hinzufügen
        row_data = {
            'GenerationUnitName': row['GenerationUnitName'],
            'GenerationUnitCode': row['GenerationUnitCode'],
            'GenerationUnitType': row['GenerationUnitType'],
            'GenerationUnitInstalledCapacity(MW)': row['GenerationUnitInstalledCapacity(MW)'],
            'Production': production_array
        }

        # don't add duplicates
        if row_data['GenerationUnitName'] in unique_windfarms_set:
            production_data.append(row_data)
            unique_windfarms_set.discard(row_data['GenerationUnitName'])

    # JSON-Datei speichern
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(production_data, json_file, ensure_ascii=False, indent=4)

    print(f"JSON-Datei wurde erfolgreich erstellt: {output_file}")


Bearbeite Datei: E:\MA_data\raw production history ENTSO-E\2019_09_ActualGenerationOutputPerGenerationUnit_16.1.A_r2.1.csv
JSON-Datei wurde erfolgreich erstellt: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2019_09.json


<h1>2. merge all monthly production data files to one combined file

In [127]:
import pandas as pd
import os
import json

# Verzeichnisse
input_dir = r"C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON"
output_file = r"C:\Users\alexa\Documents\Webapp\data\production_history\production_summary_all.json"

# Liste der Monate von 2015_01 bis 2024_10
months = pd.date_range(start="2015-01", end="2024-10", freq="MS").strftime("%Y_%m").tolist()

columns_merge = ['GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', 'GenerationUnitInstalledCapacity(MW)']
final_data = {}

# Einlesen der einzelnen Dateien
for month in months:
    input_file = os.path.join(input_dir, f"production_summary_{month}.json")

    # Überprüfen, ob die Datei existiert
    if not os.path.exists(input_file):
        print(f"Datei nicht gefunden: {input_file}")
        continue

    # Datei einlesen
    print(f"Verarbeite Datei: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as file:
        monthly_data = json.load(file)

    # Zusammenführen: Gleiche Windkraftanlagen zusammenführen
    for windfarm in monthly_data:
        key = tuple(windfarm[col] for col in columns_merge) # unique key per WPP, defined by column_merge (name, code, type, capacity) --> duplicates are not added (already assured in previous cell)
        if key not in final_data:
            # Neu hinzufügen
            final_data[key] = windfarm
        else:
            # Produktion zusammenführen
            final_data[key]['Production'].extend(windfarm['Production'])

# Finales JSON-Datenformat vorbereiten
merged_data = list(final_data.values())

# JSON-Datei speichern
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(merged_data, json_file, ensure_ascii=False, indent=4)

print(f"Zusammengeführte JSON-Datei wurde erfolgreich gespeichert unter: {output_file}")


Verarbeite Datei: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2015_01.json
Verarbeite Datei: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2015_02.json
Verarbeite Datei: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2015_03.json
Verarbeite Datei: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2015_04.json
Verarbeite Datei: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2015_05.json
Verarbeite Datei: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2015_06.json
Verarbeite Datei: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2015_07.json
Verarbeite Datei: C:\Users\alexa\Documents\Webapp\data\production_history\processed_new\JSON\production_summary_2015_08.json


<h1>3. Perform manual assignment to The Wind Power database indices

Create Excel File with WPPs in JSON file

In [None]:
import pandas as pd
import json

# Datei laden
file_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\production_summary_all.json"
output_excel_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\JSON_File.xlsx"

# JSON-Datei einlesen
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# assign an ID to each WPP in the JSON file that corresponds to its position in the list of dictionaries
for i, item in enumerate(data):
    item['JSON-ID'] = i

# Relevante Daten extrahieren
df = pd.DataFrame(data)[['GenerationUnitName', 'GenerationUnitCode', 'GenerationUnitType', 'GenerationUnitInstalledCapacity(MW)', 'JSON-ID']]

# Nach 'GenerationUnitName' sortieren
df_sorted = df.sort_values(by='GenerationUnitName')

# Daten in eine Excel-Datei speichern
df_sorted.to_excel(output_excel_path, index=False)

# Fertigmeldung
output_excel_path

'C:\\Users\\alexa\\Documents\\Webapp\\data\\production_history\\JSON_File.xlsx'

Download table to find common name for UK power plants from website

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL der Webseite
url = "https://osuked.github.io/Power-Station-Dictionary/dictionary.html"

# Abrufen der Webseite
response = requests.get(url)
if response.status_code != 200:
    print("Fehler beim Abrufen der Webseite.")
    exit()

# Parsing der Webseite mit BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Finden aller Tabellen auf der Seite
tables = soup.find_all('table')
if len(tables) < 3:
    print("Weniger als 3 Tabellen auf der Seite gefunden.")
    exit()

# Die dritte Tabelle auswählen (Index 2, da Python nullbasiert zählt)
table = tables[2]

# Spaltennamen extrahieren
headers = [header.text.strip() for header in table.find_all('th')]

# Zeilen extrahieren
rows = []
for row in table.find_all('tr')[1:]:  # Überspringt die Kopfzeile
    cells = [cell.text.strip() for cell in row.find_all('td')]
    rows.append(cells)

# DataFrame erstellen
df = pd.DataFrame(rows, columns=headers)

# Datei speichern
output_path = "data/WPPs/Power_Station_Dictionary.xlsx"
df.to_excel(output_path, index=False)
print(f"Die dritte Tabelle wurde erfolgreich als '{output_path}' gespeichert.")

Add common names from downloaded to assignment file

In [106]:
import pandas as pd

# Load the two provided files
file_1_path = r"C:\Users\alexa\Documents\Webapp\data\WPPs\Power_Station_Dictionary.xlsx"
file_2_path = r"C:\Users\alexa\Documents\Webapp\data\production_history\JSON_File.xlsx"

# read the data
df1 = pd.read_excel(file_1_path)
df2 = pd.read_excel(file_2_path)

# introduce new columns at the end
df2['Names_UK_Plants'] = None
df2['ID_The-Wind-Power'] = None
df2['Comment'] = None

# Iterate over rows in df2 to match and update the "Names_UK_Plants" column
for index, row in df2.iterrows():
    generation_unit_name = row['GenerationUnitName']
    
    # Check if this name appears in the "National Grid BMU ID" of the first file
    matching_rows = df1[df1['National Grid BMU ID'].str.contains(generation_unit_name, na=False, case=False)]
    
    if not matching_rows.empty:
        # Get the "Common Name" value(s) and update the "Names_UK_Plants" column in df2
        common_names = matching_rows['Common Name'].tolist()
        df2.at[index, 'Names_UK_Plants'] = ', '.join(common_names)

# Save the updated DataFrame to a new Excel file
output_path = r"C:\Users\alexa\Documents\Webapp\data\Assignment.xlsx"
df2.to_excel(output_path, index=False)

output_path

'C:\\Users\\alexa\\Documents\\Webapp\\data\\Assignment.xlsx'

perform manual assignment

load WPPs and assignment file (after manual assignment has been conducted) and combine the information - To Do !!!!!!!!!!<br>
assignment file matches parquet file (ID_The-Wind-Power) to json file (JSON-ID)<br>
all three files are uploaded and a new json file is created<br>
the rows in the excel file correspond excatly to the rows in the json file (same number)<br>
JSON-IDs in outgoing JSON-file are those of WPPs with matching in the wind power database (no "not found"), and more specifically that of the first WPP when WPP production data are added

In [145]:
import pandas as pd

# Laden der Daten
df_wind_power = pd.read_parquet("data/WPPs/The_Wind_Power.parquet")
df_assignment = pd.read_excel("data/Assignment.xlsx", sheet_name="Sheet1")
with open(r"C:\Users\alexa\Documents\Webapp\data\production_history\production_summary_all.json", "r") as file:
    df_json = json.load(file)

In [151]:
output_file = "data/WPPs+production.json"

# Filtere nur Zeilen, bei denen "ID_The-Wind-Power" nicht "not found" ist
df_assignment = df_assignment[df_assignment["ID_The-Wind-Power"] != "not found"]

# set wirh unique generation unit codes
generation_unit_code_set = set(df_assignment['GenerationUnitCode'])

# Extrahiere und entpacke alle gültigen IDs aus der Spalte "ID_The-Wind-Power"
def extract_ids(value):
    # Überprüfen, ob der Wert eine Liste ist, und ggf. in einzelne IDs zerlegen
    if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
        return eval(value)  # Konvertiert die Zeichenkette in eine Liste
    elif isinstance(value, (int, str)):
        return [int(value)]  # Einzelne IDs werden in eine Liste gewandelt
    return []

valid_ids = set()
df_assignment["ID_The-Wind-Power"].apply(lambda x: valid_ids.update(extract_ids(x)))

df_filtered = df_wind_power[df_wind_power['ID'].isin(valid_ids)].copy()
actual_ids = set(df_filtered['ID'])
suspended_ids = valid_ids - actual_ids

print("number potential WPPs:", len(valid_ids))
print("number actual WPPs:", len(actual_ids))
print("number suspended WPPs (no name, location, capacity or status not in operation):", len(suspended_ids))

# Erstelle neues JSON-File mit Produktionsdaten für die WPPs
production_data = []

# Gehe durch jede Zeile der Assignment-Datei und füge Produktionsdaten hinzu
for _, row in df_assignment.iterrows():
    
    ids_in_row = extract_ids(row["ID_The-Wind-Power"])
    first_id = ids_in_row[0] # dismiss other ids in the same row, because the capacity of the WPP is not taken from the wind power database anyway and other statistics should be the same for all indices

    if first_id in suspended_ids:
        continue # jump to next iteration, because following line would fail for suspended_ids

    production_array = df_json[row['JSON-ID']]['Production']
    capacity = row['GenerationUnitInstalledCapacity(MW)']
    if first_id not in actual_ids: # several lines in assignment files for one WPP in The Wind Power file
        if row['GenerationUnitCode'] not in generation_unit_code_set: # another row with the same generation unit code as a previous row --> create new WPP although its first_id is identical, because the capacity differs
            break # continue at current_index = ...
        else: # add production data to existing WPP
            for i, wpp in enumerate(production_data):
                if wpp['ID_The-Wind-Power'] == first_id:
                    existing_production = wpp['Production']
                    production_array = [
                        [time, existing_value + new_value]
                        for (time, existing_value), (time_comp, new_value) in zip(existing_production, production_array)
                        if time == time_comp
                    ]
                    production_data[i]['Production'] = production_array # update production data

                    existing_capacity = wpp['Capacity']
                    capacity = existing_capacity + capacity
                    production_data[i]['Capacity'] = capacity # update capacity
    else:
        continue # don't add another time to the production data

    current_index = df_filtered.loc[df_filtered['ID'] == first_id].index[0]

    # Daten für das Windkraftwerk hinzufügen
    row_data = {
        'Name': row['GenerationUnitName'], # from assignment file
        'ID_The-Wind-Power': first_id, # from assignment file
        'JSON-ID': row['JSON-ID'], # from assignment file
        'Code': row['GenerationUnitCode'], # from assignment file
        'Type': row['GenerationUnitType'], # from assignment file
        'Capacity': capacity, # from assignment file
        'Hub_height': df_filtered.at[current_index, "Hub height"], # from The Wind Power file
        'Commission_date': df_filtered.at[current_index, "Commission date"], # from The Wind Power file
        'Number_of_turbines': df_filtered.at[current_index, "Number of turbines"], # from The Wind Power file (value only valid for latest WPPs)
        'Production': production_array # from JSON file
    }

    production_data.append(row_data)

    # keep track of treated generation unit codes
    generation_unit_code_set.discard(row_data['GenerationUnitName'])

    # keep track of treated IDs to not try deleting rows twice 
    for id in ids_in_row:
        if id in actual_ids:
            actual_ids.discard(id)

actual_cluster_ids = len(production_data)
print("number WPPs after clustering", len(actual_cluster_ids))

# JSON-Datei speichern
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(production_data, json_file, ensure_ascii=False, indent=4)

print(f"Zusammengeführte JSON-Datei wurde erfolgreich gespeichert unter: {output_file}")

# Convert the list to a DataFrame
df_production_data = pd.DataFrame(production_data)

# Save the DataFrame to an Excel file
df_production_data.to_excel("data/WPPs+production.xlsx", index=False)

number potential WPPs: 116
number actual WPPs: 115
number suspended WPPs (no name, location, capacity or status not in operation): 1


TypeError: object of type 'int' has no len()