In [11]:
import pandas as pd
import glob
import numpy as np


## 1. Setup the combined CDFs

In [12]:
def merge_csv_files(file_paths):
    """
    Diese Funktion lädt mehrere CSV-Dateien und kombiniert sie, indem sie die Werte für gleiche Spalten untereinander in neue Zeilen schreibt.

    :param file_paths: Liste der Dateipfade zu den CSV-Dateien
    :return: Zusammengeführter DataFrame
    """
    dataframes = [pd.read_csv(file) for file in file_paths]
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

file_paths = ["saved_csv/CDF_Functions_Radiation.csv",
              "saved_csv/CDF_Functions_Temperature.csv",
              "saved_csv/CDF_Functions_Consumption.csv"]
merged_df = merge_csv_files(file_paths)

merged_df.to_csv("saved_csv/CDF_Functions.csv", index=False)


## 2. Read the combined CDFs correctly

``CDF: 589 float values in list``

``Radiation_CDF: df["Month_X_Hour_Y"][0]``

``Temperature_CDF: df["Month_X_Hour_Y"][1]``

``X = 1, ..., 12; Y = 0, ..., 23``

In [29]:
file_path = "saved_csv/CDF_Functions.csv"

# CSV einlesen
df = pd.read_csv(file_path)

# Umwandlung der CDF-Spalten von Strings zu Listen von Floats
for column in df.columns[1:]:  # Erste Spalte ist "CDFs", daher erst ab Spalte 1
    df[column] = df[column].apply(lambda x: list(map(float, x.split(";"))) if isinstance(x, str) else [])

## Calculate Correlation

In [18]:
# Load and preprocess consumption data
df_consumption = pd.read_csv('data/household_power_consumption.txt', sep=';')
df_consumption['time'] = pd.to_datetime(df_consumption['Date'] + ' ' + df_consumption['Time'], format='%d/%m/%Y %H:%M:%S')
df_consumption.drop(columns=['Date', 'Time'], inplace=True)
df_consumption.drop(columns=['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3','Global_reactive_power', 'Global_intensity', 'Voltage'], inplace=True)
def to_numeric_or_nan(value):
    try:
        return pd.to_numeric(value)
    except ValueError:
        return np.nan
# Convert values in the 'Global_active_power' column to numeric, non-numeric values become NaN
df_consumption['Global_active_power'] = df_consumption['Global_active_power'].apply(to_numeric_or_nan)
# Drop nan's
df_consumption = df_consumption.dropna(subset=['Global_active_power'])
df_consumption['Global_active_power'] = df_consumption['Global_active_power'].astype(float)
df_consumption = df_consumption.set_index("time")
df_consumption = df_consumption['Global_active_power'].resample('H').sum().reset_index()
df_consumption

  df_consumption = pd.read_csv('data/household_power_consumption.txt', sep=';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_consumption['Global_active_power'] = df_consumption['Global_active_power'].astype(float)


Unnamed: 0,time,Global_active_power
0,2006-12-16 17:00:00,152.024
1,2006-12-16 18:00:00,217.932
2,2006-12-16 19:00:00,204.014
3,2006-12-16 20:00:00,196.114
4,2006-12-16 21:00:00,183.388
...,...,...
34584,2010-11-26 17:00:00,103.554
34585,2010-11-26 18:00:00,94.408
34586,2010-11-26 19:00:00,99.560
34587,2010-11-26 20:00:00,69.822


In [20]:
# Load and preprocess sun data
df_solar = pd.read_csv("data/Solar_Timeseries_2005_2023.csv")
# Zeitstempel umwandeln
df_solar['time'] = pd.to_datetime(df_solar['time'], format='%Y%m%d:%H%M', errors='coerce')
df_solar = df_solar[['time', 'G(i) (Globalstrahlung)', 'T2m (Temperatur)']].dropna()
df_solar = df_solar[['time', 'G(i) (Globalstrahlung)', 'T2m (Temperatur)']]
df_solar = df_solar.set_index("time")
df_solar = df_solar[['G(i) (Globalstrahlung)', 'T2m (Temperatur)']].resample('H').sum().reset_index()
df_solar

Unnamed: 0,time,G(i) (Globalstrahlung),T2m (Temperatur)
0,2005-01-01 00:00:00,0.0,7.97
1,2005-01-01 01:00:00,0.0,7.74
2,2005-01-01 02:00:00,0.0,7.27
3,2005-01-01 03:00:00,0.0,6.59
4,2005-01-01 04:00:00,0.0,5.90
...,...,...,...
166531,2023-12-31 19:00:00,0.0,7.74
166532,2023-12-31 20:00:00,0.0,7.81
166533,2023-12-31 21:00:00,0.0,7.72
166534,2023-12-31 22:00:00,0.0,7.71


In [21]:
# Zusammenführen der DataFrames anhand der Zeitspalte
merged_df = pd.merge(df_consumption, df_solar, on='time')
merged_df

Unnamed: 0,time,Global_active_power,G(i) (Globalstrahlung),T2m (Temperatur)
0,2006-12-16 17:00:00,152.024,0.0,6.64
1,2006-12-16 18:00:00,217.932,0.0,6.11
2,2006-12-16 19:00:00,204.014,0.0,5.81
3,2006-12-16 20:00:00,196.114,0.0,5.61
4,2006-12-16 21:00:00,183.388,0.0,5.26
...,...,...,...,...
34584,2010-11-26 17:00:00,103.554,0.0,-0.68
34585,2010-11-26 18:00:00,94.408,0.0,-0.44
34586,2010-11-26 19:00:00,99.560,0.0,-0.76
34587,2010-11-26 20:00:00,69.822,0.0,-2.04


In [22]:
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,time,Global_active_power,G(i) (Globalstrahlung),T2m (Temperatur)
0,2006-12-16 17:00:00,152.024,0.0,6.64
1,2006-12-16 18:00:00,217.932,0.0,6.11
2,2006-12-16 19:00:00,204.014,0.0,5.81
3,2006-12-16 20:00:00,196.114,0.0,5.61
4,2006-12-16 21:00:00,183.388,0.0,5.26
...,...,...,...,...
34584,2010-11-26 17:00:00,103.554,0.0,-0.68
34585,2010-11-26 18:00:00,94.408,0.0,-0.44
34586,2010-11-26 19:00:00,99.560,0.0,-0.76
34587,2010-11-26 20:00:00,69.822,0.0,-2.04


In [23]:
correlation = merged_df[['G(i) (Globalstrahlung)', 'T2m (Temperatur)', 'Global_active_power']].corr()
correlation


Unnamed: 0,G(i) (Globalstrahlung),T2m (Temperatur),Global_active_power
G(i) (Globalstrahlung),1.0,0.380393,0.007108
T2m (Temperatur),0.380393,1.0,-0.192351
Global_active_power,0.007108,-0.192351,1.0


In [27]:
correlation = np.array(correlation)
correlation

array([[ 1.        ,  0.38039259,  0.0071077 ],
       [ 0.38039259,  1.        , -0.192351  ],
       [ 0.0071077 , -0.192351  ,  1.        ]])

In [30]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def generate_correlated_samples(n_samples, correlation, radiation_cdf, temperature_cdf, active_power_cdf=None):
    """
    Generiert n_samples Samples, die mittels Gaussian Copula so korreliert werden,
    dass die marginalen Verteilungen den vorgegebenen CDFs (als Quantile) entsprechen.

    Parameter
    ----------
    n_samples : int
        Anzahl der zu generierenden Samples.
    correlation : float
        Gewünschter Korrelationswert zwischen Globalstrahlung und Temperatur.
    radiation_cdf : list oder np.array
        Sortierte Liste von Werten der Strahlungsverteilung (589 Werte).
    temperature_cdf : list oder np.array
        Sortierte Liste von Werten der Temperaturverteilung (589 Werte).
    active_power_cdf : list oder np.array, optional
        Sortierte Liste von Werten der active_power-Verteilung (589 Werte). Standard: None.

    Returns
    -------
    radiation_samples : np.array
        Samples aus der Strahlungsverteilung.
    temperature_samples : np.array
        Samples aus der Temperaturverteilung.
    active_power_samples : np.array
        Samples aus der active_power-Verteilung (falls active_power_cdf angegeben).
    """
    # Kovarianzmatrix für 3 Variablen (wenn active_power_cdf angegeben ist)
    if active_power_cdf is not None:
        # cov = np.array([[1, correlation, correlation],
        #                 [correlation, 1, correlation],
        #                 [correlation, correlation, 1]])
        cov = correlation
    else:
        cov = np.array([[1, correlation],
                        [correlation, 1]])

    # Generiere Samples aus der multivariaten Normalverteilung
    mv_samples = np.random.multivariate_normal(mean=[0]*cov.shape[0], cov=cov, size=n_samples)

    # Transformation in Uniformvariablen über die Standard-Normal-CDF
    u = norm.cdf(mv_samples)

    # Erstelle äquidistante Wahrscheinlichkeiten, die den Quantilen der CDFs entsprechen
    p_radiation = np.linspace(0, 1, len(radiation_cdf))
    p_temperature = np.linspace(0, 1, len(temperature_cdf))

    # Inverse Transform Sampling mittels Interpolation
    radiation_samples = np.interp(u[:, 0], p_radiation, radiation_cdf)
    temperature_samples = np.interp(u[:, 1], p_temperature, temperature_cdf)

    if active_power_cdf is not None:
        p_active_power = np.linspace(0, 1, len(active_power_cdf))
        active_power_samples = np.interp(u[:, 2], p_active_power, active_power_cdf)
        return radiation_samples, temperature_samples, active_power_samples

    return radiation_samples, temperature_samples

def generate_samples_for_hour_from_df(df, column, n_samples=1000, correlation=0.3786):
    """
    Generiert für die angegebene Stunde (Spalte) korrelierte Samples aus dem
    bereits geladenen DataFrame.

    Annahmen:
      - Der DataFrame enthält Spalten mit Namen wie "Month_X_Hour_Y"
        (X = 1, ..., 12; Y = 0, ..., 23).
      - In jeder dieser Spalten:
            Zeile 0 enthält die Radiation_CDF (Liste von 589 floats)
            Zeile 1 enthält die Temperature_CDF (Liste von 589 floats)
            Zeile 2 enthält die active_power_CDF (Liste von 589 floats) (optional)

    Parameter
    ----------
    df : pd.DataFrame
        DataFrame, der die CDFs enthält.
    column : str
        Name der Spalte, z.B. "Month_4_Hour_13".
    n_samples : int, optional
        Anzahl der zu generierenden Samples (Standard: 1000).
    correlation : float, optional
        Gewünschter Korrelationswert (Standard: 0.3786).

    Returns
    -------
    radiation_samples, temperature_samples, active_power_samples : tuple von np.array
        Korrelierte Samples aus der Strahlungs-, Temperatur- und active_power-Verteilung.
    """
    radiation_cdf = df[column][0]
    temperature_cdf = df[column][1]
    active_power_cdf = df[column][2] if len(df[column]) > 2 else None

    return generate_correlated_samples(n_samples, correlation, radiation_cdf, temperature_cdf, active_power_cdf)

# Beispielaufruf für "Month_4_Hour_13":
samples_rad, samples_temp, samples_active_power = generate_samples_for_hour_from_df(df, "Month_4_Hour_13", n_samples=1000, correlation=correlation)

# Optional: Ausgabe der ersten 10 Samples
print("Radiation Samples:", samples_rad[:10])
print("Temperature Samples:", samples_temp[:10])
print("active_power Samples:", samples_active_power[:10])


Radiation Samples: [276.34014752 342.18768008 426.70831618  96.7119164  138.54
 694.193262   810.90358798 276.48712635 771.1422642  966.04892537]
Temperature Samples: [15.49        8.56537049 11.11       17.42546375 12.71       10.27
 15.35514319 13.80888029 17.53372573 12.80165382]
active_power Samples: [ 81.24899107 131.20150322  99.00201114  62.83585528  19.64245946
  47.79318903  88.86045234  35.82632819   0.          87.56952746]


In [65]:
df

Unnamed: 0,CDFs,Month_1_Hour_0,Month_1_Hour_1,Month_1_Hour_2,Month_1_Hour_3,Month_1_Hour_4,Month_1_Hour_5,Month_1_Hour_6,Month_1_Hour_7,Month_1_Hour_8,...,Month_12_Hour_14,Month_12_Hour_15,Month_12_Hour_16,Month_12_Hour_17,Month_12_Hour_18,Month_12_Hour_19,Month_12_Hour_20,Month_12_Hour_21,Month_12_Hour_22,Month_12_Hour_23
0,Radiation_CDF,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.91,...",...,"[0.0, 0.0, 7.29, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Temperature_CDF,"[-14.95, -12.79, -12.55, -9.89, -8.06, -7.06, ...","[-14.87, -13.1, -12.53, -10.37, -9.24, -8.27, ...","[-14.54, -13.15, -12.53, -10.67, -9.86, -9.14,...","[-14.79, -12.97, -12.49, -12.1, -9.02, -8.69, ...","[-15.6, -13.14, -12.67, -12.59, -10.22, -9.01,...","[-15.24, -13.64, -13.17, -12.2, -11.65, -8.49,...","[-14.75, -13.83, -13.45, -12.03, -11.67, -9.14...","[-14.03, -13.73, -13.72, -11.81, -10.7, -10.41...","[-13.75, -12.57, -12.48, -11.38, -10.6, -9.97,...",...,"[-8.94, -5.27, -5.07, -4.91, -4.76, -4.69, -3....","[-9.04, -5.64, -5.53, -5.43, -5.35, -5.21, -4....","[-9.28, -6.67, -6.33, -5.94, -5.79, -5.46, -5....","[-9.38, -7.69, -7.03, -6.77, -6.66, -6.19, -5....","[-9.55, -9.48, -8.07, -7.22, -6.78, -6.53, -5....","[-10.47, -10.22, -8.07, -7.39, -7.08, -7.04, -...","[-10.55, -10.33, -8.94, -8.34, -7.41, -6.42, -...","[-10.58, -9.93, -9.86, -8.52, -8.42, -7.2, -6....","[-10.89, -10.13, -9.15, -8.92, -8.79, -8.34, -...","[-11.15, -10.68, -9.71, -8.88, -8.32, -7.79, -..."
2,Consumption_CDF,"[0.0, 0.0, 15.158, 15.958, 16.19, 16.286, 16.3...","[0.0, 0.0, 14.788, 15.19, 15.264, 15.276, 15.6...","[0.0, 0.0, 15.386, 15.762, 16.004, 16.064, 16....","[0.0, 0.0, 15.414, 15.462, 15.504, 15.59, 15.6...","[0.0, 0.0, 15.52, 15.662, 15.966000000000001, ...","[0.0, 0.0, 14.64, 15.376, 15.608, 15.65, 15.72...","[0.0, 0.0, 16.548000000000002, 17.118, 17.892,...","[0.0, 0.0, 16.982, 17.096, 17.454, 17.888, 18....","[0.0, 0.0, 17.724, 18.142, 18.602, 19.176, 21....",...,"[15.43, 15.83, 16.740000000000002, 17.058, 18....","[15.315999999999999, 16.868, 17.338, 17.474, 1...","[15.566, 16.172, 16.18, 16.71, 17.76, 17.808, ...","[15.222, 15.826, 17.544, 21.04, 21.438, 24.05,...","[20.532, 28.086000000000002, 30.312, 30.876, 3...","[16.738, 22.174, 25.674, 27.242, 35.0899999999...","[15.53, 16.172, 20.488, 20.936, 25.312, 27.954...","[15.532, 17.042, 18.456, 21.626, 22.376, 23.11...","[15.584, 16.816, 17.466, 21.074, 22.178, 23.54...","[15.712, 16.226, 16.428, 16.764, 17.052, 17.57..."


In [None]:
from sklearn.cluster import KMeans


In [67]:
def cluster_scenarios_with_probabilities(column, correlation, n_samples=1000, n_clusters=5):
    samples_rad, samples_temp, samples_active_power = generate_samples_for_hour_from_df(df, column, n_samples=n_samples, correlation=correlation)
    samples = np.vstack((samples_rad, samples_temp, samples_active_power)).T

    # Erstellen eines KMeans-Modells
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)

    # Anpassen des Modells auf die Daten
    kmeans.fit(samples)

    # Zuordnung der Cluster
    labels = kmeans.labels_

    # Cluster-Zentren anzeigen
    cluster_centers = kmeans.cluster_centers_

    # Wahrscheinlichkeiten berechnen (relative Häufigkeit der Punkte in jedem Cluster)
    unique, counts = np.unique(labels, return_counts=True)
    probs = counts / len(samples)

    # Ergebnisse speichern
    scenarios = cluster_centers.tolist()
    probabilities = probs.tolist()
    return scenarios, probabilities

In [None]:

for month in range(1,13):
    # print(month)
    for hour in range(0, 24):
        # print(hour)
        

In [83]:
all_scenarios = []
all_probabilities = []
for month in range(1,3):
    # print(month)
    for hour in range(0, 2):
        # print(hour)
        column = f"Month_{month}_Hour_{hour}"
        print(column)
        scenarios, probabilities = cluster_scenarios_with_probabilities(column, correlation, n_samples=1000, n_clusters=5)
        all_scenarios.append(scenarios)
        all_probabilities.append(probabilities)       

Month_1_Hour_0
Month_1_Hour_1
Month_2_Hour_0
Month_2_Hour_1


In [85]:
import json
# Ergebnisse in JSON-Dateien speichern
with open('all_scenarios.json', 'w') as f:
    json.dump(all_scenarios, f)

with open('all_probabilities.json', 'w') as f:
    json.dump(all_probabilities, f)

In [84]:
all_scenarios

[[[0.0, 0.9263133722427637, 130.76595198679007],
  [0.0, 2.8349664375005306, 21.84312392879872],
  [0.0, 1.620512579847407, 85.60967179715638],
  [0.0, 0.6941064953171023, 202.72803804166367],
  [0.0, 1.8764864533945596, 50.229796365577805]],
 [[0.0, 2.528436695945473, 20.491955914687242],
  [0.0, 0.8778370960429722, 130.6350930818232],
  [0.0, 2.183263345925802, 187.95904597699294],
  [0.0, 1.417586972847828, 48.813712844224014],
  [0.0, 1.4173358455763474, 82.0870380363599]],
 [[0.0, 2.568612612225955, 21.075052168527755],
  [0.0, -0.47245965520975686, 142.40184007978223],
  [0.0, -0.14810958773986238, 230.83804716457757],
  [0.0, 1.606591007531054, 55.72330644515859],
  [0.0, 1.4093593462379426, 91.69082395781913]],
 [[0.0, 2.26999034269693, 19.650772141068213],
  [0.0, 1.2710953595911445, 87.00942886665817],
  [0.0, -0.1586712253113134, 209.64358195339983],
  [0.0, 1.322056714251599, 55.20695420444507],
  [0.0, -0.1721830160784814, 134.39644541136573]]]

In [72]:
s, p = cluster_scenarios_with_probabilities('Month_4_Hour_13', correlation, n_samples=1000, n_clusters=5)
print(s)
print()
print(p)

[[722.9060246907807, 13.954278430580944, 63.883849880777774], [154.02271950130887, 12.382336620024937, 56.01007625507782], [341.6757977147761, 13.123094006147767, 62.03047057782793], [908.5352951990928, 15.872110952862698, 61.54193808024208], [526.2235235626501, 13.987129844641354, 60.594622873293254]]

[0.149, 0.242, 0.14, 0.318, 0.151]


In [None]:
# length_arr = 10
# samples = np.vstack((samples_rad[:length_arr], samples_temp[:length_arr], samples_active_power[:length_arr])).T
samples = np.vstack((samples_rad, samples_temp, samples_active_power)).T
samples

array([[276.34014752,  15.49      ,  81.24899107],
       [342.18768008,   8.56537049, 131.20150322],
       [426.70831618,  11.11      ,  99.00201114],
       ...,
       [541.2499884 ,  12.44766278,  24.71403091],
       [702.27081448,  12.15766116,  42.98650076],
       [898.65297743,  19.51639865,  51.63137557]])

In [53]:
import numpy as np
from sklearn.cluster import KMeans

# Anzahl der Cluster definieren
n_clusters = 5

In [57]:
# Erstellen eines KMeans-Modells
kmeans = KMeans(n_clusters=n_clusters, random_state=0)

# Anpassen des Modells auf die Daten
kmeans.fit(samples)

# Zuordnung der Cluster
labels = kmeans.labels_

# Cluster-Zentren anzeigen
cluster_centers = kmeans.cluster_centers_

print("Cluster-Zuordnung:", labels)
print("Cluster-Zentren:", cluster_centers)


Cluster-Zuordnung: [1 4 4 3 3 0 0 1 0 2]
Cluster-Zentren: [[758.7463714   14.38628964  45.55121379]
 [276.41363693  14.64944014  58.53765963]
 [966.04892537  12.80165382  87.56952746]
 [117.6259582   15.06773188  41.23915737]
 [384.44799813   9.83768524 115.10175718]]


In [58]:
# Wahrscheinlichkeiten berechnen (relative Häufigkeit der Punkte in jedem Cluster)
assignments = kmeans.labels_
unique, counts = np.unique(assignments, return_counts=True)
probs = counts / len(samples)

# Ergebnisse speichern
scenarios = cluster_centers.tolist()
probabilities = probs.tolist()


In [59]:
scenarios

[[758.7463713962163, 14.386289639320077, 45.5512137898761],
 [276.41363693195535, 14.649440143264068, 58.537659633011074],
 [966.0489253651216, 12.8016538228519, 87.56952746244649],
 [117.62595820162437, 15.06773187553078, 41.239157370016336],
 [384.44799812914334, 9.837685243893034, 115.10175718242425]]

In [63]:
# [1 4 4 3 3 0 0 1 0 2]
sorted([1, 4, 4, 3, 3, 0, 0, 1, 0, 2])

[0, 0, 0, 1, 1, 2, 3, 3, 4, 4]

In [60]:
probabilities

[0.3, 0.2, 0.1, 0.2, 0.2]

# Chris:

In [None]:


# Relevante Spalten auswählen
df_filtered = df[['time', 'G(i) (Globalstrahlung)', 'T2m (Temperatur)']].dropna()

# Korrelation zwischen Globalstrahlung und Temperatur berechnen
correlation = df_filtered[['G(i) (Globalstrahlung)', 'T2m (Temperatur)']].corr()
correlation


Unnamed: 0,G(i) (Globalstrahlung),T2m (Temperatur)
G(i) (Globalstrahlung),1.0,0.378649
T2m (Temperatur),0.378649,1.0


In [36]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def generate_correlated_samples(n_samples, correlation, radiation_cdf, temperature_cdf):
    """
    Generiert n_samples Samples, die mittels Gaussian Copula so korreliert werden,
    dass die marginalen Verteilungen den vorgegebenen CDFs (als Quantile) entsprechen.

    Parameter
    ----------
    n_samples : int
        Anzahl der zu generierenden Samples.
    correlation : float
        Gewünschter Korrelationswert zwischen Globalstrahlung und Temperatur.
    radiation_cdf : list oder np.array
        Sortierte Liste von Werten der Strahlungsverteilung (589 Werte).
    temperature_cdf : list oder np.array
        Sortierte Liste von Werten der Temperaturverteilung (589 Werte).

    Returns
    -------
    radiation_samples : np.array
        Samples aus der Strahlungsverteilung.
    temperature_samples : np.array
        Samples aus der Temperaturverteilung.
    """
    # Kovarianzmatrix für 2 Variablen
    cov = np.array([[1, correlation],
                    [correlation, 1]])

    # Generiere Samples aus der multivariaten Normalverteilung
    mv_samples = np.random.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)

    # Transformation in Uniformvariablen über die Standard-Normal-CDF
    u = norm.cdf(mv_samples)

    # Erstelle äquidistante Wahrscheinlichkeiten, die den Quantilen der CDFs entsprechen
    p_radiation = np.linspace(0, 1, len(radiation_cdf))
    p_temperature = np.linspace(0, 1, len(temperature_cdf))

    # Inverse Transform Sampling mittels Interpolation
    radiation_samples = np.interp(u[:, 0], p_radiation, radiation_cdf)
    temperature_samples = np.interp(u[:, 1], p_temperature, temperature_cdf)

    return radiation_samples, temperature_samples

def generate_samples_for_hour_from_df(df, column, n_samples=1000, correlation=0.3786):
    """
    Generiert für die angegebene Stunde (Spalte) korrelierte Samples aus dem
    bereits geladenen DataFrame.

    Annahmen:
      - Der DataFrame enthält Spalten mit Namen wie "Month_X_Hour_Y"
        (X = 1, ..., 12; Y = 0, ..., 23).
      - In jeder dieser Spalten:
            Zeile 0 enthält die Radiation_CDF (Liste von 589 floats)
            Zeile 1 enthält die Temperature_CDF (Liste von 589 floats)

    Parameter
    ----------
    df : pd.DataFrame
        DataFrame, der die CDFs enthält.
    column : str
        Name der Spalte, z.B. "Month_4_Hour_13".
    n_samples : int, optional
        Anzahl der zu generierenden Samples (Standard: 1000).
    correlation : float, optional
        Gewünschter Korrelationswert (Standard: 0.3786).

    Returns
    -------
    radiation_samples, temperature_samples : tuple von np.array
        Korrelierte Samples aus der Strahlungs- und Temperaturverteilung.
    """
    radiation_cdf = df[column][0]
    temperature_cdf = df[column][1]

    return generate_correlated_samples(n_samples, correlation, radiation_cdf, temperature_cdf)


# Beispielaufruf für "Month_4_Hour_13":
samples_rad, samples_temp = generate_samples_for_hour_from_df(df, "Month_4_Hour_13", n_samples=1000, correlation=0.3786)

# Optional: Ausgabe der ersten 10 Samples
print("Radiation Samples:", samples_rad[:10])
print("Temperature Samples:", samples_temp[:10])


Radiation Samples: [121.09710185 211.55480841 107.08612446 474.64651504 561.56619769
 485.29411037 891.05603966 660.36670471 959.16466731 297.00029994]
Temperature Samples: [ 8.8809792  12.31000496 16.13       17.53597052 15.53       18.01458442
 12.06685529 13.90889791 22.87131265 15.2563865 ]
