In [1]:
import pandas as pd
import glob
import numpy as np
from sklearn.cluster import KMeans
from scipy.stats import norm
import json

  from pandas.core import (


## 1. Setup the combined CDFs

In [2]:
def merge_csv_files(file_paths):
    """
    This function loads multiple CSV files and combines them by appending the values for the same columns into new rows.

    :param file_paths: List of file paths to the CSV files
    :return: Merged DataFrame
    """
    import pandas as pd
    
    dataframes = [pd.read_csv(file) for file in file_paths]
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

file_paths = ["data/saved_csv/CDF_Functions_Radiation.csv",
              "data/saved_csv/CDF_Functions_Temperature.csv",
              "data/saved_csv/CDF_Functions_Consumption.csv"]
merged_df = merge_csv_files(file_paths)

merged_df.to_csv("data/saved_csv/CDF_Functions.csv", index=False)


## 2. Read the combined CDFs correctly

``CDF: 589 float values in list``

``Radiation_CDF: df["Month_X_Hour_Y"][0]``

``Temperature_CDF: df["Month_X_Hour_Y"][1]``

``X = 1, ..., 12; Y = 0, ..., 23``

In [3]:

file_path = "data/saved_csv/CDF_Functions.csv"

# Read CSV file
df = pd.read_csv(file_path)

# Convert CDF columns from strings to lists of floats
for column in df.columns[1:]:  # First column is "CDFs", so start from column 1
    df[column] = df[column].apply(lambda x: list(map(float, x.split(";"))) if isinstance(x, str) else [])


## Calculate Correlation

In [4]:
# Load and preprocess consumption data
df_consumption = pd.read_csv('data/household_power_consumption.txt', sep=';')
df_consumption['time'] = pd.to_datetime(df_consumption['Date'] + ' ' + df_consumption['Time'], format='%d/%m/%Y %H:%M:%S')
df_consumption.drop(columns=['Date', 'Time'], inplace=True)
df_consumption.drop(columns=['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3','Global_reactive_power', 'Global_intensity', 'Voltage'], inplace=True)
def to_numeric_or_nan(value):
    try:
        return pd.to_numeric(value)
    except ValueError:
        return np.nan
# Convert values in the 'Global_active_power' column to numeric, non-numeric values become NaN
df_consumption['Global_active_power'] = df_consumption['Global_active_power'].apply(to_numeric_or_nan)
# Drop nan's
df_consumption = df_consumption.dropna(subset=['Global_active_power'])
df_consumption['Global_active_power'] = df_consumption['Global_active_power'].astype(float)
df_consumption = df_consumption.set_index("time")
df_consumption = df_consumption['Global_active_power'].resample('H').sum().reset_index()
df_consumption['Global_active_power'] = df_consumption['Global_active_power']/60
df_consumption

  df_consumption = pd.read_csv('data/household_power_consumption.txt', sep=';')
  df_consumption = df_consumption['Global_active_power'].resample('H').sum().reset_index()


Unnamed: 0,time,Global_active_power
0,2006-12-16 17:00:00,2.533733
1,2006-12-16 18:00:00,3.632200
2,2006-12-16 19:00:00,3.400233
3,2006-12-16 20:00:00,3.268567
4,2006-12-16 21:00:00,3.056467
...,...,...
34584,2010-11-26 17:00:00,1.725900
34585,2010-11-26 18:00:00,1.573467
34586,2010-11-26 19:00:00,1.659333
34587,2010-11-26 20:00:00,1.163700


In [5]:
# Load and preprocess sun data
df_solar = pd.read_csv("data/Solar_Timeseries_2005_2023.csv")
# Convert timestamp
df_solar['time'] = pd.to_datetime(df_solar['time'], format='%Y%m%d:%H%M', errors='coerce')
df_solar = df_solar[['time', 'G(i) (Globalstrahlung)', 'T2m (Temperatur)']].dropna()
df_solar = df_solar[['time', 'G(i) (Globalstrahlung)', 'T2m (Temperatur)']]
df_solar = df_solar.set_index("time")
df_solar = df_solar[['G(i) (Globalstrahlung)', 'T2m (Temperatur)']].resample('H').sum().reset_index()
df_solar

  df_solar = df_solar[['G(i) (Globalstrahlung)', 'T2m (Temperatur)']].resample('H').sum().reset_index()


Unnamed: 0,time,G(i) (Globalstrahlung),T2m (Temperatur)
0,2005-01-01 00:00:00,0.0,7.97
1,2005-01-01 01:00:00,0.0,7.74
2,2005-01-01 02:00:00,0.0,7.27
3,2005-01-01 03:00:00,0.0,6.59
4,2005-01-01 04:00:00,0.0,5.90
...,...,...,...
166531,2023-12-31 19:00:00,0.0,7.74
166532,2023-12-31 20:00:00,0.0,7.81
166533,2023-12-31 21:00:00,0.0,7.72
166534,2023-12-31 22:00:00,0.0,7.71


In [6]:
# Merge the DataFrames using the time column
merged_df = pd.merge(df_consumption, df_solar, on='time')
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,time,Global_active_power,G(i) (Globalstrahlung),T2m (Temperatur)
0,2006-12-16 17:00:00,2.533733,0.0,6.64
1,2006-12-16 18:00:00,3.632200,0.0,6.11
2,2006-12-16 19:00:00,3.400233,0.0,5.81
3,2006-12-16 20:00:00,3.268567,0.0,5.61
4,2006-12-16 21:00:00,3.056467,0.0,5.26
...,...,...,...,...
34584,2010-11-26 17:00:00,1.725900,0.0,-0.68
34585,2010-11-26 18:00:00,1.573467,0.0,-0.44
34586,2010-11-26 19:00:00,1.659333,0.0,-0.76
34587,2010-11-26 20:00:00,1.163700,0.0,-2.04


In [7]:
correlation = merged_df[['G(i) (Globalstrahlung)', 'T2m (Temperatur)', 'Global_active_power']].corr()
correlation

Unnamed: 0,G(i) (Globalstrahlung),T2m (Temperatur),Global_active_power
G(i) (Globalstrahlung),1.0,0.380393,0.007108
T2m (Temperatur),0.380393,1.0,-0.192351
Global_active_power,0.007108,-0.192351,1.0


In [8]:
correlation = np.array(correlation)
correlation

array([[ 1.        ,  0.38039259,  0.0071077 ],
       [ 0.38039259,  1.        , -0.192351  ],
       [ 0.0071077 , -0.192351  ,  1.        ]])

In [9]:
def generate_correlated_samples(n_samples, correlation, radiation_cdf, temperature_cdf, active_power_cdf=None):
    """
    Generates n_samples that are correlated using a Gaussian Copula,
    ensuring that the marginal distributions match the given CDFs (as quantiles).

    Parameters
    ----------
    n_samples : int
        Number of samples to generate.
    correlation : float
        Desired correlation value between global radiation and temperature.
    radiation_cdf : list or np.array
        Sorted list of values for the radiation distribution (589 values).
    temperature_cdf : list or np.array
        Sorted list of values for the temperature distribution (589 values).
    active_power_cdf : list or np.array, optional
        Sorted list of values for the active_power distribution (589 values). Default: None.

    Returns
    -------
    radiation_samples : np.array
        Samples from the radiation distribution.
    temperature_samples : np.array
        Samples from the temperature distribution.
    active_power_samples : np.array
        Samples from the active_power distribution (if active_power_cdf is provided).
    """
    # Covariance matrix for 3 variables (if active_power_cdf is provided)
    if active_power_cdf is not None:
        cov = correlation
    else:
        cov = np.array([[1, correlation],
                        [correlation, 1]])

    # Generate samples from a multivariate normal distribution
    mv_samples = np.random.multivariate_normal(mean=[0]*cov.shape[0], cov=cov, size=n_samples)

    # Transform to uniform variables using the standard normal CDF
    u = norm.cdf(mv_samples)

    # Create equidistant probabilities corresponding to CDF quantiles
    p_radiation = np.linspace(0, 1, len(radiation_cdf))
    p_temperature = np.linspace(0, 1, len(temperature_cdf))

    # Inverse transform sampling via interpolation
    radiation_samples = np.interp(u[:, 0], p_radiation, radiation_cdf)
    temperature_samples = np.interp(u[:, 1], p_temperature, temperature_cdf)

    if active_power_cdf is not None:
        p_active_power = np.linspace(0, 1, len(active_power_cdf))
        active_power_samples = np.interp(u[:, 2], p_active_power, active_power_cdf)
        return radiation_samples, temperature_samples, active_power_samples

    return radiation_samples, temperature_samples

def generate_samples_for_hour_from_df(df, column, n_samples=1000, correlation=0.3786):
    """
    Generates correlated samples for a given hour (column) from the provided DataFrame.

    Assumptions:
      - The DataFrame contains columns named like "Month_X_Hour_Y"
        (X = 1, ..., 12; Y = 0, ..., 23).
      - Each of these columns:
            Row 0 contains the Radiation_CDF (list of 589 floats)
            Row 1 contains the Temperature_CDF (list of 589 floats)
            Row 2 contains the active_power_CDF (list of 589 floats) (optional)

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the CDFs.
    column : str
        Name of the column, e.g., "Month_4_Hour_13".
    n_samples : int, optional
        Number of samples to generate (default: 1000).
    correlation : float, optional
        Desired correlation value (default: 0.3786).

    Returns
    -------
    radiation_samples, temperature_samples, active_power_samples : tuple of np.array
        Correlated samples from the radiation, temperature, and active_power distributions.
    """
    radiation_cdf = df[column][0]
    temperature_cdf = df[column][1]
    active_power_cdf = df[column][2] if len(df[column]) > 2 else None

    return generate_correlated_samples(n_samples, correlation, radiation_cdf, temperature_cdf, active_power_cdf)


In [10]:
def cluster_scenarios_with_probabilities(column, correlation, n_samples=1000, n_clusters=5):
    """
    Groups scenarios based on the specified column values and calculates their probabilities.

    Parameters
    ----------
    column : str
        The name of the column used for generating samples. Format: Month_{month}_Hour_{hour}
    correlation : float
        The correlation used for generating samples (computed at the beginning of the document).
    n_samples : int, optional
        The number of samples to generate. Default is 1000, as used by us.
    n_clusters : int, optional
        The number of clusters for the KMeans algorithm. Default is 5, as used by us.
    
    Returns
    -------
    tuple
        A tuple with two lists:
        - scenarios : List[List[float]]
            The cluster centers representing the scenarios.
        - probabilities : List[float]
            The probabilities of each cluster/scenario.
    """

    samples_rad, samples_temp, samples_active_power = generate_samples_for_hour_from_df(df, column, n_samples=n_samples, correlation=correlation)
    samples = np.vstack((samples_rad, samples_temp, samples_active_power)).T

    # Create a KMeans model
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)

    # Fit the model to the data
    kmeans.fit(samples)

    # Assign clusters
    labels = kmeans.labels_

    # Display cluster centers
    cluster_centers = kmeans.cluster_centers_

    # Calculate probabilities (relative frequency of points in each cluster)
    unique, counts = np.unique(labels, return_counts=True)
    probs = counts / len(samples)

    # Store results
    scenarios = cluster_centers.tolist()
    probabilities = probs.tolist()
    return scenarios, probabilities


In [11]:
all_scenarios = []
all_probabilities = []
for month in range(1,13):
    for hour in range(0, 24):
        column = f"Month_{month}_Hour_{hour}"
        scenarios, probabilities = cluster_scenarios_with_probabilities(column, correlation, n_samples=1000, n_clusters=5)
        all_scenarios.append(scenarios)
        all_probabilities.append(probabilities)   

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

In [12]:
len(all_scenarios)

288

In [13]:
len(all_probabilities)

288

In [None]:
# store results in json file
with open('data/results/all_scenarios.json', 'w') as f:
    json.dump(all_scenarios, f)

with open('data/results/all_probabilities.json', 'w') as f:
    json.dump(all_probabilities, f)