In [1]:
import glob
import json
import os

import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.cluster import KMeans

## 1. Set up the combined CDFs

In [2]:
def merge_csv_files(file_paths):
    """
    Load several CSV files and stack them row-wise into a single DataFrame.

    Values belonging to the same column name end up underneath each other,
    and the row index of the result is renumbered starting at 0.

    :param file_paths: list of paths to the CSV files, in the desired row order
    :return: the combined DataFrame
    """
    frames = []
    for path in file_paths:
        frames.append(pd.read_csv(path))
    return pd.concat(frames, ignore_index=True)

# Order matters: the files are stacked in exactly this order, and later cells
# read the radiation CDFs from row 0, the temperature CDFs from row 1 and the
# consumption CDFs from row 2 of the combined table.
# NOTE(review): this assumes each input file contributes exactly one row — confirm.
file_paths = ["saved_csv/CDF_Functions_Radiation.csv",
              "saved_csv/CDF_Functions_Temperature.csv",
              "saved_csv/CDF_Functions_Consumption.csv"]
merged_df = merge_csv_files(file_paths)

# Persist the combined table without the pandas row index.
merged_df.to_csv("saved_csv/CDF_Functions.csv", index=False)


## 2. Read the combined CDFs correctly

``CDF: 589 float values in list``

``Radiation_CDF: df["Month_X_Hour_Y"][0]``

``Temperature_CDF: df["Month_X_Hour_Y"][1]``

``Consumption_CDF: df["Month_X_Hour_Y"][2]``

``X = 1, ..., 12; Y = 0, ..., 23``

In [3]:
file_path = "saved_csv/CDF_Functions.csv"

# Read the combined CDF table from disk.
df = pd.read_csv(file_path)


def _parse_cdf_cell(cell):
    """Turn a ';'-separated string into a list of floats; any non-string cell becomes []."""
    if not isinstance(cell, str):
        return []
    return [float(token) for token in cell.split(";")]


# The first column is "CDFs", so the conversion starts at the second column:
# every remaining column holds its CDFs as serialized strings.
for column in df.columns[1:]:
    df[column] = df[column].apply(_parse_cdf_cell)

## 3. Calculate the correlation

In [4]:
# Load and preprocess consumption data
df_consumption = pd.read_csv('data/household_power_consumption.txt', sep=';')
df_consumption['time'] = pd.to_datetime(df_consumption['Date'] + ' ' + df_consumption['Time'], format='%d/%m/%Y %H:%M:%S')
df_consumption.drop(columns=['Date', 'Time'], inplace=True)
df_consumption.drop(columns=['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3','Global_reactive_power', 'Global_intensity', 'Voltage'], inplace=True)
def to_numeric_or_nan(value):
    try:
        return pd.to_numeric(value)
    except ValueError:
        return np.nan
# Convert values in the 'Global_active_power' column to numeric, non-numeric values become NaN
df_consumption['Global_active_power'] = df_consumption['Global_active_power'].apply(to_numeric_or_nan)
# Drop nan's
df_consumption = df_consumption.dropna(subset=['Global_active_power'])
df_consumption['Global_active_power'] = df_consumption['Global_active_power'].astype(float)
df_consumption = df_consumption.set_index("time")
df_consumption = df_consumption['Global_active_power'].resample('H').sum().reset_index()
df_consumption

  df_consumption = pd.read_csv('data/household_power_consumption.txt', sep=';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_consumption['Global_active_power'] = df_consumption['Global_active_power'].astype(float)


Unnamed: 0,time,Global_active_power
0,2006-12-16 17:00:00,152.024
1,2006-12-16 18:00:00,217.932
2,2006-12-16 19:00:00,204.014
3,2006-12-16 20:00:00,196.114
4,2006-12-16 21:00:00,183.388
...,...,...
34584,2010-11-26 17:00:00,103.554
34585,2010-11-26 18:00:00,94.408
34586,2010-11-26 19:00:00,99.560
34587,2010-11-26 20:00:00,69.822


In [5]:
# Load and preprocess sun data
df_solar = pd.read_csv("data/Solar_Timeseries_2005_2023.csv")
# Zeitstempel umwandeln
df_solar['time'] = pd.to_datetime(df_solar['time'], format='%Y%m%d:%H%M', errors='coerce')
df_solar = df_solar[['time', 'G(i) (Globalstrahlung)', 'T2m (Temperatur)']].dropna()
df_solar = df_solar[['time', 'G(i) (Globalstrahlung)', 'T2m (Temperatur)']]
df_solar = df_solar.set_index("time")
df_solar = df_solar[['G(i) (Globalstrahlung)', 'T2m (Temperatur)']].resample('H').sum().reset_index()
df_solar

Unnamed: 0,time,G(i) (Globalstrahlung),T2m (Temperatur)
0,2005-01-01 00:00:00,0.0,7.97
1,2005-01-01 01:00:00,0.0,7.74
2,2005-01-01 02:00:00,0.0,7.27
3,2005-01-01 03:00:00,0.0,6.59
4,2005-01-01 04:00:00,0.0,5.90
...,...,...,...
166531,2023-12-31 19:00:00,0.0,7.74
166532,2023-12-31 20:00:00,0.0,7.81
166533,2023-12-31 21:00:00,0.0,7.72
166534,2023-12-31 22:00:00,0.0,7.71


In [7]:
# Inner-join consumption and solar data on the shared hourly timestamp and
# discard every row that still contains a missing value.
# NOTE(review): this assumes both sources use the same time zone — confirm.
merged_df = df_consumption.merge(df_solar, on='time', how='inner').dropna()
merged_df

Unnamed: 0,time,Global_active_power,G(i) (Globalstrahlung),T2m (Temperatur)
0,2006-12-16 17:00:00,152.024,0.0,6.64
1,2006-12-16 18:00:00,217.932,0.0,6.11
2,2006-12-16 19:00:00,204.014,0.0,5.81
3,2006-12-16 20:00:00,196.114,0.0,5.61
4,2006-12-16 21:00:00,183.388,0.0,5.26
...,...,...,...,...
34584,2010-11-26 17:00:00,103.554,0.0,-0.68
34585,2010-11-26 18:00:00,94.408,0.0,-0.44
34586,2010-11-26 19:00:00,99.560,0.0,-0.76
34587,2010-11-26 20:00:00,69.822,0.0,-2.04


In [8]:
# Pairwise correlation (pandas default method) between radiation, temperature
# and active power; the column order chosen here fixes the matrix layout.
correlation_columns = ['G(i) (Globalstrahlung)', 'T2m (Temperatur)', 'Global_active_power']
correlation = merged_df.loc[:, correlation_columns].corr()
correlation

Unnamed: 0,G(i) (Globalstrahlung),T2m (Temperatur),Global_active_power
G(i) (Globalstrahlung),1.0,0.380393,0.007108
T2m (Temperatur),0.380393,1.0,-0.192351
Global_active_power,0.007108,-0.192351,1.0


In [9]:
# Convert the correlation table into a plain NumPy array. The row/column order
# (radiation, temperature, active power) is the order in which the columns
# were selected when the correlation was computed.
correlation = np.array(correlation)
correlation

array([[ 1.        ,  0.38039259,  0.0071077 ],
       [ 0.38039259,  1.        , -0.192351  ],
       [ 0.0071077 , -0.192351  ,  1.        ]])

In [None]:
def generate_correlated_samples(n_samples, correlation, radiation_cdf, temperature_cdf, active_power_cdf=None, rng=None):
    """
    Draw ``n_samples`` samples whose dependence follows a Gaussian copula and
    whose marginal distributions follow the given CDFs (passed as quantiles).

    Parameters
    ----------
    n_samples : int
        Number of samples to generate.
    correlation : float or 2-D array-like
        Either a single correlation coefficient between radiation and
        temperature (only valid when ``active_power_cdf`` is None), or a square
        correlation matrix ordered (radiation, temperature, active power).
        If the matrix is larger than the number of marginals, its leading
        block is used.
    radiation_cdf : list or np.array
        Sorted quantile values of the radiation distribution
        (589 values in this notebook).
    temperature_cdf : list or np.array
        Sorted quantile values of the temperature distribution.
    active_power_cdf : list or np.array, optional
        Sorted quantile values of the active-power distribution. Default: None.
    rng : np.random.Generator or np.random.RandomState, optional
        Random source for the multivariate normal draw. When None, NumPy's
        global generator is used (so ``np.random.seed`` still applies).

    Returns
    -------
    tuple of np.array
        ``(radiation_samples, temperature_samples)`` or, if
        ``active_power_cdf`` is given,
        ``(radiation_samples, temperature_samples, active_power_samples)``.

    Raises
    ------
    ValueError
        If ``correlation`` does not fit the number of marginals, or if one of
        the CDFs is empty.
    """
    marginals = [radiation_cdf, temperature_cdf]
    if active_power_cdf is not None:
        marginals.append(active_power_cdf)
    n_vars = len(marginals)

    # Build the covariance of the latent normal variables. Unit variances are
    # implied, so a correlation matrix can be used directly.
    corr = np.asarray(correlation, dtype=float)
    if corr.ndim == 0:
        if n_vars != 2:
            raise ValueError(
                "A scalar correlation only describes two variables; "
                "pass a 3x3 correlation matrix when active_power_cdf is given."
            )
        rho = float(corr)
        cov = np.array([[1.0, rho], [rho, 1.0]])
    elif corr.ndim == 2 and corr.shape[0] == corr.shape[1] and corr.shape[0] >= n_vars:
        cov = corr[:n_vars, :n_vars]
    else:
        raise ValueError(
            f"correlation must be a scalar or a square matrix of size >= {n_vars}, "
            f"got shape {corr.shape}."
        )

    # Draw from the multivariate normal distribution.
    sampler = np.random if rng is None else rng
    mv_samples = sampler.multivariate_normal(mean=np.zeros(n_vars), cov=cov, size=n_samples)

    # Map to uniform variables through the standard normal CDF.
    u = norm.cdf(mv_samples)

    # Inverse transform sampling: the i-th quantile value is assigned the
    # probability i / (len - 1), and values in between are interpolated linearly.
    samples = []
    for position, quantiles in enumerate(marginals):
        quantiles = np.asarray(quantiles, dtype=float)
        if quantiles.size == 0:
            raise ValueError(f"CDF number {position} is empty; cannot sample from it.")
        probabilities = np.linspace(0, 1, quantiles.size)
        samples.append(np.interp(u[:, position], probabilities, quantiles))

    return tuple(samples)

def generate_samples_for_hour_from_df(df, column, n_samples=1000, correlation=0.3786):
    """
    Generate correlated samples for one hour (one column) of the already
    loaded CDF DataFrame.

    Assumptions:
      - The DataFrame has columns named like "Month_X_Hour_Y"
        (X = 1, ..., 12; Y = 0, ..., 23).
      - In each of these columns, by row position:
            row 0 holds the radiation CDF (list of floats)
            row 1 holds the temperature CDF (list of floats)
            row 2 holds the active-power CDF (list of floats) (optional)

    The rows are read by position, so the result does not depend on the
    DataFrame's index labels.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the CDFs.
    column : str
        Column name, e.g. "Month_4_Hour_13".
    n_samples : int, optional
        Number of samples to generate (default: 1000).
    correlation : float or 2-D array, optional
        Passed through unchanged to ``generate_correlated_samples``
        (default: 0.3786). When the column has a third row, a 3x3 correlation
        matrix has to be supplied; the scalar default only suits two-row frames.

    Returns
    -------
    tuple of np.array
        Whatever ``generate_correlated_samples`` returns: radiation and
        temperature samples, plus active-power samples when a third row exists.
    """
    cdf_rows = df[column]
    radiation_cdf = cdf_rows.iloc[0]
    temperature_cdf = cdf_rows.iloc[1]
    active_power_cdf = cdf_rows.iloc[2] if len(cdf_rows) > 2 else None

    return generate_correlated_samples(n_samples, correlation, radiation_cdf, temperature_cdf, active_power_cdf)


In [None]:
def cluster_scenarios_with_probabilities(column, correlation, n_samples=1000, n_clusters=5, cdf_df=None):
    """
    Condense sampled values for one column into a few scenarios (k-means
    cluster centres) and compute the probability of each scenario.

    Parameters
    ----------
    column : str
        Name of the column used to generate the samples.
        Format: Month_{month}_Hour_{hour}
    correlation : float or 2-D array
        Correlation forwarded to the sample generation (computed earlier in
        this notebook).
    n_samples : int, optional
        Number of samples to generate. Default: 1000.
    n_clusters : int, optional
        Number of clusters for the KMeans algorithm. Default: 5.
    cdf_df : pd.DataFrame, optional
        DataFrame holding the CDFs. When None, the notebook-level ``df`` is used.

    Returns
    -------
    tuple
        A tuple of two lists of equal length ``n_clusters``:
        - scenarios : List[List[float]]
            The cluster centres that represent the scenarios.
        - probabilities : List[float]
            The share of samples assigned to each cluster; entry ``i`` belongs
            to ``scenarios[i]``.
    """
    source_df = df if cdf_df is None else cdf_df

    # One 1-D array per variable; stacking them as columns gives one row per
    # sample and works for two as well as three variables.
    drawn = generate_samples_for_hour_from_df(source_df, column, n_samples=n_samples, correlation=correlation)
    samples = np.column_stack(drawn)

    # Fit the k-means model (fixed random_state for repeatable clustering).
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(samples)

    # Relative frequency of the points per cluster. bincount with minlength
    # keeps a slot for every cluster id, so a cluster without members gets
    # probability 0 and the list stays aligned with the cluster centres.
    counts = np.bincount(kmeans.labels_, minlength=n_clusters)
    probs = counts / len(samples)

    scenarios = kmeans.cluster_centers_.tolist()
    probabilities = probs.tolist()
    return scenarios, probabilities

In [12]:
# Seed NumPy's global generator so the sampled scenarios are the same on every
# "Restart & Run All" (the clustering itself already uses a fixed random_state).
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Results are appended month by month and, within a month, hour by hour, so the
# entry for (month, hour) sits at list position (month - 1) * 24 + hour.
all_scenarios = []
all_probabilities = []
for month in range(1, 13):
    for hour in range(0, 24):
        column = f"Month_{month}_Hour_{hour}"
        scenarios, probabilities = cluster_scenarios_with_probabilities(column, correlation, n_samples=1000, n_clusters=5)
        all_scenarios.append(scenarios)
        all_probabilities.append(probabilities)

In [14]:
# Sanity check: one scenario set per (month, hour) pair, i.e. 12 * 24 = 288 entries.
len(all_scenarios)

288

In [15]:
# Sanity check: one probability list per (month, hour) pair, i.e. 12 * 24 = 288 entries.
len(all_probabilities)

288

In [16]:
# Ergebnisse in JSON-Dateien speichern
with open('results/all_scenarios.json', 'w') as f:
    json.dump(all_scenarios, f)

with open('results/all_probabilities.json', 'w') as f:
    json.dump(all_probabilities, f)