# REDUCEDHEATCARB analysis

This JupyterLab notebook can be used for physics-informed machine learning analysis in the REDUCEDHEATCARB project.
Don't forget to install the requirements listed in [requirements.txt](../requirements.txt) first!

## Setting the stage

First, several imports and variables need to be defined.


### Imports and generic settings

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

%load_ext autoreload

    
from tqdm.notebook import tqdm
from scipy.interpolate import RectBivariateSpline

from gekko import GEKKO

import sys
sys.path.append('../data/')
sys.path.append('../view/')
sys.path.append('../analysis/')

from plotter import Plot
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from ipywidgets import interact, FloatSlider, SelectMultiple, VBox, HBox, Output, Checkbox
from IPython.display import display
from scipy.stats import linregress

%matplotlib inline
%matplotlib widget


from nfh_utils import *

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

from rhc_analysis import BoilerEfficiency, Learner, Model, Comfort

# Input file with preprocessed properties; the commented-out line is the alternative input file
# (presumably 5-minute instead of 1-minute intervals, judging by the file names).
# NOTE: 'poperties' in the variable name is a typo for 'properties'; the name is kept because later cells reference it.
# rhc_preprocessed_poperties_file='rhc_preprocessed_properties_intv_5_min.parquet'
rhc_preprocessed_poperties_file='rhc_preprocessed_properties_intv_1_min.parquet'

# Input files with building, home and boiler data
home_BAG_data_file_path = 'buildings_properties_detailed.xlsx'
home_metadata_file_path = 'home_properties_detailed.parquet'
boilers_excel_file_path = "boilers.xlsx"
homes_boilers_excel_file_path = "homes_boilers.xlsx"

# Path to the Parquet file with boiler efficiency data
boiler_returntemp_load_efficiency_file_path = 'boiler_returntemp_load_efficiency.parquet'
# Initialize the BoilerEfficiency class with the path to that Parquet file
boiler_efficiency = BoilerEfficiency(boiler_returntemp_load_efficiency_file_path)

# Input files describing the interventions (regime changes) A1 and B1 per home
regime_change_A1_file_path = 'homes_interventions_A1.xlsx'
regime_change_B1_file_path = 'homes_interventions_B1.xlsx'

# File for intermediate output (including preprocessing that may need to migrate to the GEKKO model code for the what-if simulations)
rhc_heat_dstr_preprocessed_poperties_file='rhc_heat_dstr_preprocessed_properties.parquet'

# Output files for analysis results
rhc_analysis_results_file = 'rhc_results.parquet'
rhc_analysis_results_per_period_file = 'rhc_results_per_period.xlsx'

# Output files for heat distribution analysis results
rhc_dstr_analysis_results_per_period_file = 'rhc_dstr_results_per_period.parquet'
rhc_dstr_analysis_results_per_period_excel_file = 'rhc_dstr_results_per_period.xlsx'

import logging

# Set up logging to the console (stderr) only.
# force=True makes basicConfig() remove *and close* any handlers that are already attached to the
# root logger (e.g. from an earlier run of this cell), which avoids duplicate log lines without
# leaving stale handlers open.
logging.basicConfig(
    level=logging.WARNING,  # Adjust log level as necessary (DEBUG, INFO, WARNING, etc.)
    # level=logging.INFO,  # Adjust log level as necessary (DEBUG, INFO, WARNING, etc.)
    # level=logging.DEBUG,  # Adjust log level as necessary (DEBUG, INFO, WARNING, etc.)
    stream=sys.stderr,    # Send logs to stderr (use sys.stdout instead to log to stdout)
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)

In [None]:
# Mathtext label (for plot axes/legends) per unit string.
# The keys appear to be the unit suffixes of property names (the part after '__') -- TODO confirm.
units_to_mathtext = {
    'degC': r'$°C$',
    'ppm': r'$ppm$',
    '0': r'$[-]$',
    'bool': r'$0 = False; 1 = True$',
    'p': r'$persons$',
    'W': r'$W$',
    'W_m_2': r'$W/m^{2}$',
    'm_s_1': r'$m/s$',
    'W0': r'$W^{0}$',
    'l_min_1': r'$L/min$',
    'dm3_s_1': r'$dm^{3}/s$',
    'pct': '%',
    'min_1': r'$min^{-1}$',
    'K': r'$K$',
}

# Second name for the very same dictionary object (not a copy)
property_types = units_to_mathtext

### Reading building and installation parameters

In [None]:
df_bldng_data  = pd.read_excel(home_BAG_data_file_path,  index_col='id')

In [None]:
df_bldng_data.rename(columns={'floors__m2': 'usable_area__m2'}, inplace=True)

In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_bldng_data.T)

In [None]:
# Columns of the boiler specification sheet that are used in this analysis
_boiler_spec_columns = [
    'brand_model',
    'fan_min_ch_rotations__min_1',
    'fan_max_ch_rotations__min_1',
    'Qnh_min_lhv__kW',
    'Qnh_max_lhv__kW',
    'desired_temp_delta_flow_ret__K',
    'overheat_hysteresis__K',
    'hydronic_pump_brand',
    'hydronic_pump_model',
    'pump_head__m',
]

# Boiler specifications, one row per boiler model
df_boilers = pd.read_excel(boilers_excel_file_path)[_boiler_spec_columns]

# Home-to-boiler table; the 'pseudonym' column is renamed to 'id'
df_homes_boilers = pd.read_excel(homes_boilers_excel_file_path)
df_homes_boilers = df_homes_boilers.rename(columns={'pseudonym': 'id'})

# Add the boiler specifications to each home (left join on brand_model), indexed by home id
df_homes_boilers = (
    df_homes_boilers
    .merge(df_boilers, on='brand_model', how='left')
    .set_index('id')
)

In [None]:
df_boilers

In [None]:
df_homes_boilers

In [None]:
# Boiler and installation columns to add to the building data
_installation_columns = [
    'fan_min_ch_rotations__min_1',
    'fan_max_ch_rotations__min_1',
    'Qnh_min_lhv__kW',
    'Qnh_max_lhv__kW',
    'desired_temp_delta_flow_ret__K',
    'overheat_hysteresis__K',
    'floor_heating__bool',
    'brand_model',
    'hydronic_pump_brand',
    'hydronic_pump_model',
    'pump_head__m',
]

# Left join on 'id': every building row is kept, also when no boiler data is known for it
df_bldng_data = df_bldng_data.merge(
    df_homes_boilers[_installation_columns],
    on='id',
    how='left',
)



In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_bldng_data.T)

In [None]:
df_bldng_data.columns.to_list()

In [None]:
df_bldng_data.describe().T

### Reading preprocessed interpolated properties from a parquet file

In [None]:
%%time

# Attempt to read the Parquet file
try:
    df_prep = pd.read_parquet(
        rhc_preprocessed_poperties_file, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

In [None]:
# Sorting the DataFrame index is needed to get good performance on certain filters.
# This guard checks whether the index is sorted (monotonically increasing) and sorts it if not.
if not df_prep.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prep = df_prep.sort_index()  

In [None]:
print(f"df_prep.count().sum(): {df_prep.count().sum():_}")

In [None]:
df_prep.info()

## Determine post_pump_run__pct

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[df_prep['batch_import_remeha_boiler_status_pump_post_run__bool'] == True]['batch_import_remeha_flow_dstr_pump_speed__pct'].to_frame(), property_col='batch_import_remeha_flow_dstr_pump_speed__pct')

In [None]:
%%time
median_pump_speeds = (
    df_prep[df_prep['batch_import_remeha_boiler_status_pump_post_run__bool'] == True]
    ['batch_import_remeha_flow_dstr_pump_speed__pct']
    .groupby('id')
    .median()
)


In [None]:
%%time
df_bldng_data['post_pump_run__pct'] = df_bldng_data.index.map(median_pump_speeds)


In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_bldng_data.sort_values('brand_model').T)

## Do other data inspections

In [None]:
%%time
boolean_cols = [col for col in df_prep.columns if df_prep[col].dtype.name == 'boolean']

summary = (
    df_prep[boolean_cols]
    .apply(lambda col: col.value_counts(dropna=False).reindex([True, False, pd.NA], fill_value=0))
    .T
    # .rename(columns={True: '% True', False: '% False', pd.NA: '% NA'})
)

In [None]:
display(summary.style.format("{:,}"))

In [None]:
# Count True / False / NA per boolean column (one row per column) ...
_value_counts = df_prep[boolean_cols].apply(
    lambda col: col.value_counts(dropna=False).reindex([True, False, pd.NA], fill_value=0)
).T
_value_counts = _value_counts.rename(columns={True: '% True', False: '% False', pd.NA: '% NA'})

# ... and express each count as a percentage of its row total
_row_totals = _value_counts.sum(axis=1)
summary__pct = (_value_counts.div(_row_totals, axis=0) * 100).round(2)

In [None]:
with pd.option_context('display.float_format', '{:.4f}'.format):
    display(summary__pct.sort_values(by='% True', ascending=False))

In [None]:
%%time
%autoreload 2
# Build a temporary column in which False is replaced by pd.NA, so that only True values remain non-missing,
# and pass it to Learner.valid_learn_list() as the only required property
# (presumably this yields one row per uninterrupted streak of True values -- TODO confirm against Learner).
# NOTE(review): the column is named 'is_post_pump_running__bool', but it is currently derived from the
# 'burning_ch' status; the pump-post-run variant is the commented-out line below. Confirm which one is intended.
# df_prep['is_post_pump_running__bool'] = df_prep['batch_import_remeha_boiler_status_pump_post_run__bool'].replace(False, pd.NA)
df_prep['is_post_pump_running__bool'] = df_prep['batch_import_remeha_boiler_status_burning_ch__bool'].replace(False, pd.NA)

df_post_pump_run_streaks = Learner.valid_learn_list(
    df_data=df_prep,
    req_props=['is_post_pump_running__bool'],  # Use the temporary column
    property_sources={'is_post_pump_running__bool': 'is_post_pump_running__bool'},
    duration_threshold=timedelta(minutes=0)  # no minimum streak duration
)

In [None]:
df_post_pump_run_streaks

In [None]:
# Streak durations as a Series ('duration' is moved from the index into a column first)
_streak_durations = df_post_pump_run_streaks.reset_index(level='duration')['duration']

# Descriptive statistics over all streaks, plus an explicit 'median' entry
# (same value as the '50%' entry that describe() already provides)
stats = _streak_durations.describe()
stats['median'] = _streak_durations.median()

# Show as a single-column table with one decimal for floats
with pd.option_context('display.float_format', '{:.1f}'.format):
    display(stats.to_frame())

In [None]:
%matplotlib inline
%matplotlib widget
df_post_pump_run_streaks.reset_index()['duration'].dt.total_seconds().div(60).plot.hist(bins=100, edgecolor='black',  title = "pump streak duration (minutes)")

In [None]:
# Streak durations grouped per home ('duration' is moved from the index into a column first)
_durations_per_id = (
    df_post_pump_run_streaks
    .reset_index(level='duration')['duration']
    .groupby('id')
)

# Descriptive statistics per home, plus an explicit 'median' column
# (same value as the '50%' column that describe() already provides)
grouped_stats = _durations_per_id.describe()
grouped_stats['median'] = _durations_per_id.median()

# Show with the homes as columns and one decimal for floats
with pd.option_context('display.float_format', '{:.1f}'.format):
    display(grouped_stats.T)

In [None]:
# Boxplot of streak durations per id

# Reset index to bring 'id' as a column (if it's part of MultiIndex)
df_plot = df_post_pump_run_streaks.reset_index()
# Convert 'duration' to minutes (s_min_1: presumably the number of seconds per minute, from nfh_utils)
df_plot['duration__min'] = df_plot['duration'].dt.total_seconds() / s_min_1
# Create the figure and axis
fig, ax = plt.subplots(figsize=(12, 6))

# Prepare data: one Series of durations per id, in order of first appearance
# (the unique ids are determined once and reused for the tick labels)
home_ids = df_plot['id'].unique()
data = [df_plot.loc[df_plot['id'] == id_, 'duration__min'] for id_ in home_ids]

# Create the boxplot; the tick labels are set separately because the 'labels' keyword of
# boxplot() is deprecated since Matplotlib 3.9 (renamed to 'tick_labels'), whereas
# set_xticklabels() works in both older and newer versions
ax.boxplot(data)
ax.set_xticklabels(home_ids)

# Customize the plot
ax.set_title('Boxplot of Durations per ID')
ax.set_xlabel('ID')
ax.set_ylabel('Duration (minutes)')
ax.tick_params(axis='x', rotation=90)  # Rotate x-axis labels if needed

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# # visualize all input data
# df_plot = df_prep

# list(df_plot.index.unique('id').dropna())

# df_plot.index.unique('id').dropna()

# # df_plot.loc[[401632]][[prop for prop in df_plot.columns.values if prop.split('__')[-1] in ('degC', 'W', '0', 'bool', 'ppm', 'W_m_2')]]

# #Plot all properties from all sources for all ids
# Plot.dataframe_preprocessed_plot(df_plot.loc[[401632]][[prop for prop in df_plot.columns.values if prop.split('__')[-1] in ('degC', 'ppm', 'W_m_2')]], units_to_mathtext)

In [None]:
mandatory_sourceprops =  ['batch_import_KNMI_temp_outdoor__degC', 
                          'batch_import_KNMI_sol_ghi__W_m_2',
                          'batch_import_KNMI_wind__m_s_1',
                          'batch_import_KNMI_air_outdoor__Pa',
                          'batch_import_remeha_temp_indoor__degC',
                          'batch_import_remeha_temp_ret__degC',
                          'batch_import_remeha_temp_flow__degC',
                          'batch_import_remeha_fan_rotations__min_1',
                          'batch_import_EDSN_actual_gas_std_hhv__J_m_3',
                          # 'device_p1-reader_g_use_cum__m3'
                         ]

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=mandatory_sourceprops, freq='1W', title_fontsize=8)

# Calculate additional properties 
We may have to move some of these calculations to inside the GEKKO Python model code (e.g. for the what-if scenario simulation)

## Calculate comfortable__bool

In [None]:
%%time
%autoreload 2
# Store, per row, the result of Comfort.is_comfortable() for the indoor temperature and the setpoint temperature,
# with a target of 10 % (target_ppd__pct; presumably 'predicted percentage dissatisfied' -- TODO confirm in rhc_analysis).
# initially, we do not use occupancy yet
df_prep.loc[:,'comfortable__bool'] = Comfort.is_comfortable(
    df_prep['batch_import_remeha_temp_indoor__degC'], 
    df_prep['batch_import_remeha_temp_set__degC'], 
    target_ppd__pct=10, 
    # occupancy__bool=(df_prep['device_living_room_occupancy__p'] > 0)
)

In [None]:
# Get an overview of True, False, and NaN values
df_prep['comfortable__bool'].value_counts(normalize=True, dropna=False).to_frame().T

In [None]:
# Get an overview of True, False, and NaN values
df_prep['comfortable__bool'].value_counts(normalize=True, dropna=False).to_frame().T

In [None]:
# Get an overview of True, False, and NaN values
df_prep['comfortable__bool'].groupby('id').value_counts(normalize=True, dropna=False).to_frame().unstack().T

## Use boiler-specific efficiency to calculate heat_ch__W and flow_dstr__dm3_s_1

In [None]:
%%time
# Create a new column to store the interpolated efficiency values
df_prep['eta_ch_hhv__W0'] = np.nan

# Iterate over each unique ID (grouping by 'id' in the MultiIndex)
for idx, group in tqdm(df_prep.groupby(level='id')):
    # Get the brand_model for the current id
    brand_model = group['brand_model'].iloc[0]
    
    # Get the interpolator function for the current brand_model
    boiler_efficiency_hhv = boiler_efficiency.get_efficiency_hhv_interpolator(brand_model)
    
    # Filter rows where interpolation can be applied
    valid_rows = group[
        group['g_use_fan_load__pct'].notna() & 
        group['batch_import_remeha_temp_ret__degC'].notna()
    ]
    
    # Apply the interpolator to the filtered rows (no rounding applied)
    boiler_efficiency_values = valid_rows.apply(
        lambda row: boiler_efficiency_hhv(
            row['g_use_fan_load__pct'], 
            row['batch_import_remeha_temp_ret__degC']
        ),
        axis=1
    )
    
    # Update the main DataFrame with the interpolated values
    df_prep.loc[valid_rows.index, 'eta_ch_hhv__W0'] = boiler_efficiency_values

### Calculate heat_ch__W

In [None]:
%%time
df_prep.loc[:,'heat_ch__W'] = df_prep['g_use_fan_ch_hhv__W'] * df_prep['eta_ch_hhv__W0']

#### Calculate efficiency error if we would only have temp_dstr__degC as input for efficiency lookup

In [None]:
# Overall efficiency over all homes: total heat output divided by total gas input
# NOTE(review): sum() skips missing values, so rows where heat_ch__W is missing (e.g. because no efficiency
# could be looked up) drop out of the numerator while their gas input still counts in the denominator.
# Confirm whether both sums should be restricted to the same rows.
base_efficiency = df_prep['heat_ch__W'].sum() / df_prep['g_use_fan_ch_hhv__W'].sum()
print(f"base_efficiency: {base_efficiency}")


# Calculate per-home efficiency (same ratio, with both sums taken per id)
base_efficiency_per_home = (
    df_prep.groupby('id')['heat_ch__W'].sum() /
    df_prep.groupby('id')['g_use_fan_ch_hhv__W'].sum()
)

In [None]:
%%time
columns_needed = [
    'batch_import_remeha_temp_ret__degC',
    'batch_import_remeha_temp_flow__degC',
    'g_use_fan_load__pct',
    'brand_model',
    'g_use_fan_ch_hhv__W',
]
# Create a minimal DataFrame for the scenario
df_prep_scenario = df_prep[columns_needed].copy()



# Calculate temp_dstr__degC
df_prep_scenario['temp_dstr__degC'] = (
    (df_prep_scenario['batch_import_remeha_temp_ret__degC'] +
     df_prep_scenario['batch_import_remeha_temp_flow__degC']) / 2
)


# Create a new column to store the interpolated efficiency values
df_prep_scenario['eta_ch_hhv__W0'] = np.nan

# Iterate over each unique ID (grouping by 'id' in the MultiIndex)
for idx, group in tqdm(df_prep_scenario.groupby(level='id')):
    # Get the brand_model for the current id
    brand_model = group['brand_model'].iloc[0]
    
    # Get the interpolator function for the current brand_model
    boiler_efficiency_hhv = boiler_efficiency.get_efficiency_hhv_interpolator(brand_model)
    
    # Filter rows where interpolation can be applied
    valid_rows = group[
        group['g_use_fan_load__pct'].notna() & 
        group['batch_import_remeha_temp_ret__degC'].notna()
    ]
    
    # Apply the interpolator to the filtered rows (no rounding applied)
    boiler_efficiency_values = valid_rows.apply(
        lambda row: boiler_efficiency_hhv(
            row['g_use_fan_load__pct'], 
            row['temp_dstr__degC']
        ),
        axis=1
    )
    
    # Update the main DataFrame with the interpolated values
    df_prep_scenario.loc[valid_rows.index, 'eta_ch_hhv__W0'] = boiler_efficiency_values

In [None]:
# Calculate heat_ch__W
df_prep_scenario['heat_ch__W'] = df_prep_scenario['g_use_fan_ch_hhv__W'] * df_prep_scenario['eta_ch_hhv__W0']

In [None]:
# Calculate overall efficiency for the scenario: total heat output divided by total gas input
scenario_efficiency = df_prep_scenario['heat_ch__W'].sum() / df_prep_scenario['g_use_fan_ch_hhv__W'].sum()
print(f"scenario_efficiency: {scenario_efficiency}")

# Calculate per-home efficiency
scenario_efficiency_per_home = (
    df_prep_scenario.groupby('id')['heat_ch__W'].sum() /
    df_prep_scenario.groupby('id')['g_use_fan_ch_hhv__W'].sum()
)

# Convert both per-home efficiencies from fractions to percentages and give them descriptive names.
# base_efficiency_per_home comes from an earlier cell and is rebound here; the name check makes the
# conversion idempotent, so re-running this cell does not multiply it by 100 a second time.
if base_efficiency_per_home.name != "eta_hhv_using_temp_ret__pct":
    base_efficiency_per_home = (base_efficiency_per_home * 100).rename("eta_hhv_using_temp_ret__pct")
scenario_efficiency_per_home = (scenario_efficiency_per_home * 100).rename("eta_hhv_using_temp_dstr__pct")

# Put both efficiencies side by side, one row per home
comparison_df = pd.concat([base_efficiency_per_home, scenario_efficiency_per_home], axis=1)

# Difference in percentage points: efficiency using temp_ret minus efficiency using temp_dstr
comparison_df["absolute_diff__pctpt"] = (comparison_df["eta_hhv_using_temp_ret__pct"] - comparison_df["eta_hhv_using_temp_dstr__pct"])

# The same difference, relative to the efficiency using temp_dstr, in percent
comparison_df["relative_diff__pct"] = (
    comparison_df["absolute_diff__pctpt"] / comparison_df["eta_hhv_using_temp_dstr__pct"] * 100
)



In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(comparison_df.describe())


In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(comparison_df.sort_values(by='eta_hhv_using_temp_ret__pct'))


In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(comparison_df.mean())

### Calculate flow_dstr__dm3_s_1

In [None]:
%%time
# Ensure the temperature difference column is present
df_prep['delta_t_flow_ret__K'] = (
    df_prep['batch_import_remeha_temp_flow_ch__degC'] - 
    df_prep['batch_import_remeha_temp_ret_ch__degC']
)

In [None]:
vectorized_water_volumetric_heat_capacity__J_dm_3_K_1 = np.vectorize(water_volumetric_heat_capacity__J_dm_3_K_1)

In [None]:
%%time
# Use np.where to conditionally replace results based on the temperature difference
df_prep.loc[:, 'flow_dstr__dm3_s_1'] = np.where(
    (df_prep['delta_t_flow_ret__K'] <= 0) | pd.isna(df_prep['delta_t_flow_ret__K']),  # Check if the temperature difference is NA, zero, or below zero
    np.nan,  # Use np.nan for missing values
    (
        df_prep['heat_ch__W']
        / (
            vectorized_water_volumetric_heat_capacity__J_dm_3_K_1(
                (df_prep['batch_import_remeha_temp_flow_ch__degC'] + df_prep['batch_import_remeha_temp_ret_ch__degC'])/2,
                heat_dstr_nl_avg_abs__Pa
            ) 
            * df_prep['delta_t_flow_ret__K']
        )
    )
)

In [None]:
# Clip the calculated flow values to the range [0, flow_ch_max__dm3_s_1] and store them as float32.
# The upper bound is said to be based on a maximum flow of 6.5 [m³/h]
# (NOTE(review): flow_ch_max__dm3_s_1 is not defined in this notebook -- confirm its value where it is defined).
df_prep['flow_dstr__dm3_s_1'] = df_prep['flow_dstr__dm3_s_1'].clip(lower=0, upper=flow_ch_max__dm3_s_1).astype('float32')

## Inspect results 

In [None]:
# Building blocks: boiler and gas-valve status flags, and availability of the indoor temperature
_burning_ch = df_prep['batch_import_remeha_boiler_status_burning_ch__bool'] == True
_burning_dhw = df_prep['batch_import_remeha_boiler_status_burning_dhw__bool'] == True
_gas_valve_open = df_prep['batch_import_remeha_gas_valve_open__bool'] == True
_gas_valve_closed = df_prep['batch_import_remeha_gas_valve_closed__bool'] == True
_temp_indoor_notna = df_prep['batch_import_remeha_temp_indoor__degC'].notna()

# Masks: burning (for central heating or domestic hot water) with the gas valve open ...
boiler_burning_mask = (_burning_ch | _burning_dhw) & _gas_valve_open
boiler_ch_mask = _burning_ch & _gas_valve_open
boiler_dhw_mask = _burning_dhw & _gas_valve_open
# ... both indoor temperature and smart meter gas use available ...
remeha_data_notna_mask = _temp_indoor_notna & (df_prep['device_p1-reader_g_use_hhv__W'].notna())
# ... and indoor temperature available while the gas valve is closed
boiler_valve_closed_mask = _temp_indoor_notna & _gas_valve_closed

In [None]:
# Select mask
boiler_status_mask = remeha_data_notna_mask 


### Check heat distribution descriptive statistics

In [None]:
# Descriptive statistics of the heat distribution properties, over all rows.
# Each property is listed once ('flow_dstr__dm3_s_1' used to be listed twice, which duplicated its row).
heat_dstr_props = [
    'batch_import_remeha_flow_dstr_pump_speed__pct',
    'flow_dstr__dm3_s_1',
    'batch_import_remeha_temp_flow__degC',
    'batch_import_remeha_temp_flow_ch__degC',
    'batch_import_remeha_temp_ret__degC',
    'batch_import_remeha_temp_ret_ch__degC',
    'heat_ch__W',
    'batch_import_remeha_dhw_flow__l_min_1',
]

with pd.option_context('display.float_format', '{:.4f}'.format):
    display(df_prep[heat_dstr_props].describe().T)

In [None]:
# Descriptive statistics of the heat distribution properties, for rows selected by remeha_data_notna_mask.
# Each property is listed once ('flow_dstr__dm3_s_1' used to be listed twice, which duplicated its row).
heat_dstr_props = [
    'batch_import_remeha_flow_dstr_pump_speed__pct',
    'flow_dstr__dm3_s_1',
    'batch_import_remeha_temp_flow__degC',
    'batch_import_remeha_temp_flow_ch__degC',
    'batch_import_remeha_temp_ret__degC',
    'batch_import_remeha_temp_ret_ch__degC',
    'heat_ch__W',
    'batch_import_remeha_dhw_flow__l_min_1',
]

with pd.option_context('display.float_format', '{:.4f}'.format):
    display(df_prep[remeha_data_notna_mask][heat_dstr_props].describe().T)

In [None]:
# Descriptive statistics of the heat distribution properties, for rows selected by boiler_ch_mask.
# Each property is listed once ('flow_dstr__dm3_s_1' used to be listed twice, which duplicated its row).
heat_dstr_props = [
    'batch_import_remeha_flow_dstr_pump_speed__pct',
    'flow_dstr__dm3_s_1',
    'batch_import_remeha_temp_flow__degC',
    'batch_import_remeha_temp_flow_ch__degC',
    'batch_import_remeha_temp_ret__degC',
    'batch_import_remeha_temp_ret_ch__degC',
    'heat_ch__W',
    'batch_import_remeha_dhw_flow__l_min_1',
]

with pd.option_context('display.float_format', '{:.4f}'.format):
    display(df_prep[boiler_ch_mask][heat_dstr_props].describe().T)

#### Estimating flow capacity

In [None]:
# Estimated flow at 100 % pump speed: mean flow divided by the mean pump speed (as a fraction),
# both taken over the rows selected by boiler_ch_mask
_mean_flow = df_prep[boiler_ch_mask]['flow_dstr__dm3_s_1'].mean()
_mean_pump_speed_fraction = df_prep[boiler_ch_mask]['batch_import_remeha_flow_dstr_pump_speed__pct'].mean()/100
est_flow_dstr_capacity__dm3_s_1 = _mean_flow / _mean_pump_speed_fraction

In [None]:
est_flow_dstr_capacity__dm3_s_1

In [None]:
pd.DataFrame(df_prep[boiler_ch_mask]['flow_dstr__dm3_s_1'].groupby('id').mean()).describe().T

In [None]:
pd.DataFrame(df_prep[boiler_ch_mask]['batch_import_remeha_flow_dstr_pump_speed__pct'].groupby('id').mean()).describe().T

In [None]:
pd.DataFrame(df_prep[boiler_ch_mask]['flow_dstr__dm3_s_1'].groupby('id').mean()/ (df_prep[boiler_ch_mask]['batch_import_remeha_flow_dstr_pump_speed__pct'].groupby('id').mean()/100)).describe().T

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'flow_dstr__dm3_s_1'
df_prep[prop].plot.hist(bins=200, alpha=0.5, title = prop)


In [None]:
df_prep[['batch_import_remeha_dhw_flow__l_min_1']].groupby('id').count()

### Inspecting gas power based on various sources: fan, boilercounter and P1 device

In [None]:
with pd.option_context('display.float_format', '{:.4f}'.format):
    display(df_prep[remeha_data_notna_mask][[
        'fan_speed__pct',
        'g_use_fan_load__pct',
        'g25_3_use_fan_lhv__W',
        'device_p1-reader_g_use_hhv__W',
        'batch_import_EDSN_actual_gas_std_hhv__J_m_3',
        'gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0',
        'gas_calorific_factor_groningen_hhv_to_actual_hhv__J0',
        'gas_temp_factor_ref_to_actual__J0',
        'gas_pressure_factor_ref_to_actual__J0',
        'gas_pressure_factor_correct_smart_meter_to_actual__J0',
        'g_use_fan_ch_hhv__W',
        'g_use_fan_dhw_hhv__W',
        'g_use_fan_hhv__W',
        'g_use_boilercounter_ch_hhv__W',
        'g_use_boilercounter_dhw_hhv__W',
        'g_use_boilercounter_hhv__W', 
        'g_use_p1_hhv__W',
    ]].describe().T)

In [None]:
with pd.option_context('display.float_format', '{:.4f}'.format):
    display(df_prep[boiler_ch_mask][[
        'fan_speed__pct',
        'g_use_fan_load__pct',
        'g25_3_use_fan_lhv__W',
        'device_p1-reader_g_use_hhv__W',
        'batch_import_EDSN_actual_gas_std_hhv__J_m_3',
        'gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0',
        'gas_calorific_factor_groningen_hhv_to_actual_hhv__J0',
        'gas_temp_factor_ref_to_actual__J0',
        'gas_pressure_factor_ref_to_actual__J0',
        'gas_pressure_factor_correct_smart_meter_to_actual__J0',
        'g_use_fan_ch_hhv__W',
        'g_use_fan_dhw_hhv__W',
        'g_use_fan_hhv__W',
        'g_use_boilercounter_ch_hhv__W',
        'g_use_boilercounter_dhw_hhv__W',
        'g_use_boilercounter_hhv__W', 
        'g_use_p1_hhv__W',
    ]].describe().T)

#### Inspecting weighted conversion factors

In [None]:
(df_prep[boiler_status_mask]['batch_import_EDSN_actual_gas_std_hhv__J_m_3'] * df_prep[boiler_status_mask]['g_use_fan_hhv__W']).sum() / df_prep[boiler_status_mask]['g_use_fan_hhv__W'].sum() / 1e6

In [None]:
df_prep[boiler_status_mask]['batch_import_EDSN_actual_gas_std_hhv__J_m_3'].mean()/ gas_g25_3_ref_lhv__J_m_3

In [None]:
df_prep[boiler_status_mask]['batch_import_EDSN_actual_gas_std_hhv__J_m_3'].mean()/gas_groningen_nl_avg_std_hhv__J_m_3

In [None]:
gas_groningen_nl_avg_std_hhv__J_m_3 / gas_g25_3_ref_lhv__J_m_3

In [None]:
conversion_factor = (
    df_prep[boiler_status_mask]['gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0'].mean()
    * df_prep[boiler_status_mask]['gas_pressure_factor_ref_to_actual__J0'].mean()
    * df_prep[boiler_status_mask]['gas_temp_factor_ref_to_actual__J0'].mean()
)
conversion_factor

In [None]:
conversion_factor_smart_meter = (
    df_prep[boiler_status_mask]['gas_calorific_factor_groningen_hhv_to_actual_hhv__J0'].mean()
    * df_prep[boiler_status_mask]['gas_pressure_factor_correct_smart_meter_to_actual__J0'].mean()
)
conversion_factor_smart_meter

In [None]:
average_gas_smart_meter__W = df_prep[boiler_status_mask]['g_use_p1_hhv__W'].mean()
average_gas_smart_meter__W

### Inspecting gas power used by the boiler, based on boiler counters

#### Inspecting boiler counters while valve is closed

In [None]:
with pd.option_context('display.float_format', '{:.4f}'.format):
    display(df_prep[boiler_valve_closed_mask][[
        'fan_speed__pct',
        'g_use_fan_load__pct',
        'g25_3_use_fan_lhv__W',
        'device_p1-reader_g_use_hhv__W',
        'batch_import_EDSN_actual_gas_std_hhv__J_m_3',
        'gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0',
        'gas_calorific_factor_groningen_hhv_to_actual_hhv__J0',
        'gas_temp_factor_ref_to_actual__J0',
        'gas_pressure_factor_ref_to_actual__J0',
        'gas_pressure_factor_correct_smart_meter_to_actual__J0',
        'g_use_fan_ch_hhv__W',
        'g_use_fan_dhw_hhv__W',
        'g_use_fan_hhv__W',
        'g_use_boilercounter_ch_hhv__W',
        'g_use_boilercounter_dhw_hhv__W',
        'g_use_boilercounter_hhv__W', 
        'g_use_p1_hhv__W',
    ]].describe().T) 

#### Compare gas power calculated in various ways

In [None]:
# Group by 'id' and calculate the means
df_prep[boiler_status_mask].groupby(level='id').agg({
    'batch_import_remeha_temp_flow_ch_max__degC': 'mean',
    'batch_import_remeha_temp_flow_ch__degC': 'mean',
    'batch_import_remeha_temp_ret_ch__degC': 'mean',
    'eta_ch_hhv__W0': 'mean',
}).sort_values(by='eta_ch_hhv__W0', ascending=False)

In [None]:
df_prep[boiler_ch_mask & (df_prep['batch_import_remeha_temp_ret_ch__degC'] <30)]['batch_import_remeha_temp_ret_ch__degC'].count() / df_prep[boiler_ch_mask]['batch_import_remeha_temp_ret_ch__degC'].count()

In [None]:
# average efficiency on higher heating value, NOT weighted by gas input
df_prep[boiler_ch_mask]['eta_ch_hhv__W0'].mean() 

In [None]:
# average efficiency on higher heating value, weighted by gas input
(df_prep.loc[boiler_ch_mask, 'eta_ch_hhv__W0'] * df_prep.loc[boiler_ch_mask, 'g_use_boilercounter_hhv__W']).sum() / df_prep.loc[boiler_ch_mask, 'g_use_boilercounter_hhv__W'].sum()


In [None]:
mask_efficiency_over_1 = (df_prep['eta_ch_hhv__W0'] > 1.0) & boiler_ch_mask

In [None]:
df_prep[mask_efficiency_over_1]['eta_ch_hhv__W0'].count() / df_prep[boiler_ch_mask]['eta_ch_hhv__W0'].count()

In [None]:
df_prep[mask_efficiency_over_1]['g_use_fan_hhv__W'].sum() / df_prep[boiler_ch_mask]['g_use_fan_hhv__W'].sum()

In [None]:
# Filter the DataFrame based on boiler_status_mask
df_filtered = df_prep.loc[boiler_status_mask]

# Group by 'id' and calculate, per home, the mean gas use according to the boiler fan and according to
# the smart meter, plus the first DSMR version value found
grouped = df_filtered.groupby('id').agg(
    mean_boiler_use=('g_use_fan_hhv__W', 'mean'),
    mean_smart_meter_use=('g_use_p1_hhv__W', 'mean'),
    dsmr_version=('device_p1-reader_dsmr_version__0', 'first')  # Assuming the DSMR version doesn't change within an id
)

# Calculate the ratio of mean boiler gas use to mean smart meter gas use
grouped['ratio'] = grouped['mean_boiler_use'] / grouped['mean_smart_meter_use']

In [None]:
grouped

In [None]:
# Now, group by DSMR version and calculate the mean ratio for each version
version_grouped = grouped.groupby('dsmr_version').agg(
    sum_mean_boiler_use=('mean_boiler_use', 'sum'),
    sum_mean_smart_meter_use=('mean_smart_meter_use', 'sum'),
    mean_ratio=('ratio', 'mean'),
    count=('ratio', 'size'))

version_grouped['ratio_sums'] =  version_grouped['sum_mean_boiler_use'] / version_grouped['sum_mean_smart_meter_use']


In [None]:
version_grouped

### Boxplots per home (when boiler is burning for central heating & gas valve is open)

In [None]:
# Group by 'id' (rows selected by boiler_ch_mask only) and calculate per home: the mean of the maximum flow
# temperature setting, the flow temperature, the return temperature and 'eta_ch_hhv__W0', and the maximum
# of 'heat_ch__W'; homes are sorted from highest to lowest mean efficiency
df_prep[boiler_ch_mask].groupby(level='id').agg({
    'batch_import_remeha_temp_flow_ch_max__degC': 'mean',
    'batch_import_remeha_temp_flow__degC': 'mean',
    'batch_import_remeha_temp_ret__degC': 'mean',
    'eta_ch_hhv__W0': 'mean',
    'heat_ch__W': 'max',
}).sort_values(by='eta_ch_hhv__W0', ascending=False)

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='batch_import_remeha_temp_ret_ch__degC')

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='g_use_fan_load__pct')

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='eta_ch_hhv__W0')

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='heat_ch__W')

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='batch_import_remeha_temp_flow_ch__degC')

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='batch_import_remeha_temp_outdoor__degC')

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'batch_import_remeha_temp_outdoor__degC'
df_prep[prop].plot.hist(bins=200, alpha=0.5, title = prop)


## Inspect internal heat from electrical energy


In [None]:
df_prep[['device_p1-reader_e__W', 'device_p1-reader_e_use__W', 'device_p1-reader_e_ret__W']].describe().T

In [None]:
df_prep[['device_p1-reader_e__W', 'device_p1-reader_e_use__W', 'device_p1-reader_e_ret__W']].groupby('id').describe().T

In [None]:
%%time
# Calculate the fraction of time across all homes that net electricity usage is less than zero (which results in a relative 'cooling' effect: the sun is less effectively heating the home)
(df_prep['device_p1-reader_e__W'] < 0).mean()

In [None]:
%%time
# Fraction of samples per home in which net electricity usage is below zero
# (which results in a relative 'cooling' effect: the sun is less effectively heating the home)
(df_prep['device_p1-reader_e__W'] < 0).groupby(level='id').mean()

## Optional: write heat distribution preprocessing results

### Writing heat distribution preprocessing results to parquet file

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it does not make a copy.
# Use the commented subset below instead to write only a selection of columns.
df_heat_dist = df_prep

# # optional: subset
# df_heat_dist = df_prep[['batch_import_KNMI_sol_ghi__W_m_2',
#                         'batch_import_KNMI_temp_outdoor__degC',
#                         'batch_import_KNMI_wind__m_s_1',
#                         'device_p1-reader_g_use_hhv__W',
#                         'batch_import_remeha_boiler_status_burning_ch__bool',
#                         'batch_import_remeha_boiler_status_burning_dhw__bool', 
#                         'batch_import_remeha_gas_valve_closed__bool',
#                         'batch_import_remeha_gas_valve_open__bool',
#                         'batch_import_remeha_fan_rotations__min_1', 
#                         'batch_import_remeha_flow_dstr_pump_speed__pct', 
#                         'batch_import_remeha_g_use_ch_lhv__W',
#                         'batch_import_remeha_g_use_dhw_lhv__W',
#                         'batch_import_remeha_temp_set__degC',
#                         'batch_import_remeha_temp_indoor__degC',
#                         'device_living_room_calibrated_temp_indoor__degC',
#                         'batch_import_remeha_temp_flow__degC',
#                         'batch_import_remeha_temp_ret__degC',
#                         'batch_import_remeha_temp_flow_ch__degC',
#                         'batch_import_remeha_temp_ret_ch__degC',
#                         'interpolated_batch_import_remeha_temp_flow_ch__degC',
#                         'interpolated_batch_import_remeha_temp_ret_ch__degC',
#                         'batch_import_remeha_temp_flow_ch_max__degC', 
#                         'g_use_p1_hhv__W',
#                         'g_use_fan_hhv__W',
#                         'g_use_fan_dhw_hhv__W',
#                         'g_use_fan_ch_hhv__W',
#                         'eta_ch_hhv__W0',
#                         'heat_ch__W',
#                         'g_use_fan_load__pct',
#                        ]]

In [None]:
# Print index, column dtypes, non-null counts and memory usage of the frame that is about to be written.
df_heat_dist.info()

In [None]:
%%time 
# Write the frame, including its index, to a Parquet file using the pyarrow engine.
df_heat_dist.to_parquet(rhc_heat_dstr_preprocessed_poperties_file, index=True, engine='pyarrow')

### Optional: writing heat distribution preprocessing results to multiple zipped CSV files

In [None]:
# %%time 
# # uncomment this entire block of code to enable it 
# # for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()[:3]):
# # for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()[3:]):
# # for home_id in [483173]:
# for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()):
#     df_heat_dist.xs(home_id, drop_level=False).to_csv(
#         f'{home_id}_heat_dstr_preprocessed_properties.zip',
#         encoding='utf-8',
#         compression= dict(method='zip',
#                           archive_name=f'{home_id}_heat_dstr_preprocessed_properties.csv'),
#         date_format='%Y-%m-%dT%H:%M:%S%z'
#     )


## Inspect  return temperatures

In [None]:
# row mask for a single home (id 403603) from 2024-02-04 up to (not including) 2024-03-11, i.e. five weeks;
# swap in the commented-out id line to inspect another home
returntemp_mask = (
    # (df_prep.index.get_level_values('id') == 483173)
    (df_prep.index.get_level_values('id') == 403603)
    & 
    (df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-02-04 00:00:00+01:00'))
    & 
    (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-03-11 00:00:00+01:00'))
)


In [None]:
# Boolean boiler-status columns for which integer copies are created in the cells below,
# so that they can be passed to the plotting helper together with numeric columns.
replace_boolprops = ['batch_import_remeha_boiler_status_burning_ch__bool',
                     'batch_import_remeha_boiler_status_pump_post_run__bool', 
                     'batch_import_remeha_boiler_status_burning_dhw__bool',
                    ]

In [None]:
%%time
# Add a copy of each listed boolean column, cast to pandas' nullable 'Int8' dtype,
# under the original column name with a '01' suffix.
for prop in replace_boolprops:
    df_prep[f"{prop}01"] = df_prep[prop].astype('Int8')

In [None]:
# Names of the '01'-suffixed integer copies of the columns in replace_boolprops.
replaced_boolprops = [f"{prop}01" for prop in replace_boolprops]

In [None]:
# Summary statistics of the integer status columns; for 0/1 data the mean is the fraction of rows equal to 1.
df_prep[replaced_boolprops].describe()

In [None]:
# Plot the rows selected by returntemp_mask for pump speed, raw and interpolated flow/return temperatures,
# the integer status columns, DHW flow and the gas use columns for CH and DHW.
Plot.dataframe_preprocessed_plot(df_prep[returntemp_mask][['batch_import_remeha_flow_dstr_pump_speed__pct',
                                                           'batch_import_remeha_temp_flow__degC',
                                                           'batch_import_remeha_temp_ret__degC',
                                                           'batch_import_remeha_temp_flow_ch__degC',
                                                           'batch_import_remeha_temp_ret_ch__degC',
                                                           'interpolated_batch_import_remeha_temp_flow_ch__degC',
                                                           'interpolated_batch_import_remeha_temp_ret_ch__degC',
                                                           'batch_import_remeha_boiler_status_burning_ch__bool01',
                                                           'batch_import_remeha_boiler_status_pump_post_run__bool01', 
                                                           'batch_import_remeha_boiler_status_burning_dhw__bool01',
                                                           'batch_import_remeha_dhw_flow__l_min_1',
                                                           'g_use_fan_ch_hhv__W',
                                                           'g_use_fan_dhw_hhv__W']
                                 ], units_to_mathtext)

In [None]:
# Columns for the return temperature plot: general and interpolated CH flow/return temperatures,
# gas use for CH and DHW, and the integer status columns. Pump speed is left out (commented).
returntemp_plot_columns = ['batch_import_remeha_temp_flow__degC',
                           'batch_import_remeha_temp_ret__degC',
                           'interpolated_batch_import_remeha_temp_flow_ch__degC',
                           'interpolated_batch_import_remeha_temp_ret_ch__degC',
                           'g_use_fan_ch_hhv__W',
                           'g_use_fan_dhw_hhv__W',
                           'batch_import_remeha_boiler_status_burning_ch__bool01',
                           'batch_import_remeha_boiler_status_pump_post_run__bool01', 
                           'batch_import_remeha_boiler_status_burning_dhw__bool01',
                           # 'batch_import_remeha_flow_dstr_pump_speed__pct',
                           ]

In [None]:
# Summary statistics of the plot columns for the rows selected by returntemp_mask (one row per column).
df_prep[returntemp_mask][returntemp_plot_columns].describe().T

In [None]:
%autoreload 2
# Plot the columns in returntemp_plot_columns for the rows selected by returntemp_mask.
Plot.dataframe_preprocessed_plot(df_prep[returntemp_mask][returntemp_plot_columns], units_to_mathtext)

In [None]:
# Summary statistics of the plot columns for the rows selected by returntemp_mask (one row per column).
# NOTE(review): this repeats the describe() call made before the plot — presumably kept for convenience.
df_prep[returntemp_mask][returntemp_plot_columns].describe().T

In [None]:
# Alternative column selection for the plot: interpolated CH flow/return temperatures only
# (no general flow/return temperatures), gas use, the integer status columns, plus pump speed.
filtered_returntemp_plot_columns = ['interpolated_batch_import_remeha_temp_flow_ch__degC',
                                    'interpolated_batch_import_remeha_temp_ret_ch__degC',
                                    'g_use_fan_ch_hhv__W',
                                    'g_use_fan_dhw_hhv__W',
                                    'batch_import_remeha_boiler_status_burning_ch__bool01',
                                    'batch_import_remeha_boiler_status_pump_post_run__bool01', 
                                    'batch_import_remeha_boiler_status_burning_dhw__bool01',
                                    'batch_import_remeha_flow_dstr_pump_speed__pct',
                                   ]

In [None]:
# Plot the columns in filtered_returntemp_plot_columns for the rows selected by returntemp_mask.
Plot.dataframe_preprocessed_plot(df_prep[returntemp_mask][filtered_returntemp_plot_columns], units_to_mathtext)

## Inspect relation between flow_dstr_pump_speed__pct and flow_dstr__dm3_s_1

### Histogram and frequently occurring values

In [None]:
# Ten most frequently occurring pump speed values over all rows, with their counts.
df_prep['batch_import_remeha_flow_dstr_pump_speed__pct'].value_counts().nlargest(10)

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of the pump speed over all rows of df_prep.
prop = 'batch_import_remeha_flow_dstr_pump_speed__pct'
df_prep[prop].plot.hist(bins=200, alpha=0.5, title = prop)


### Histogram and frequently occurring values: burning for DHW

In [None]:
# Ten most frequently occurring pump speed values while the boiler reports burning for DHW.
df_prep.loc[
    df_prep['batch_import_remeha_boiler_status_burning_dhw__bool'] == True,
    'batch_import_remeha_flow_dstr_pump_speed__pct',
].value_counts().nlargest(10)

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of the pump speed, restricted to rows where the boiler reports burning for DHW.
prop = 'batch_import_remeha_flow_dstr_pump_speed__pct'
df_prep.loc[
    df_prep['batch_import_remeha_boiler_status_burning_dhw__bool'] == True,
    prop,
].plot.hist(bins=200, alpha=0.5, title=prop)


### Histogram and frequently occurring values: burning for CH

In [None]:
# Ten most frequently occurring pump speed values while the boiler reports burning for CH.
df_prep.loc[
    df_prep['batch_import_remeha_boiler_status_burning_ch__bool'] == True,
    'batch_import_remeha_flow_dstr_pump_speed__pct',
].value_counts().nlargest(10)

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of the pump speed, restricted to rows where the boiler reports burning for CH.
prop = 'batch_import_remeha_flow_dstr_pump_speed__pct'
df_prep.loc[
    df_prep['batch_import_remeha_boiler_status_burning_ch__bool'] == True,
    prop,
].plot.hist(bins=200, alpha=0.5, title=prop)


### Scatterplot flow vs pump_speed

In [None]:
%%time
# Add the hydronic pump brand and model of each row's boiler to df_prep, looked up via 'brand_model'.
#
# The lookup is done with Series.map() instead of reset_index() / merge() / set_index(), because:
# - the cell can be re-run safely (a second merge would create '_x'/'_y' suffixed columns and then fail);
# - duplicate 'brand_model' rows in df_boilers can no longer multiply the rows of df_prep;
# - the (id, timestamp) MultiIndex of df_prep is left untouched and no full copies of df_prep are made.
columns_to_merge = ['brand_model', 'hydronic_pump_brand', 'hydronic_pump_model']
pump_columns = [col for col in columns_to_merge if col != 'brand_model']

if not all(col in df_boilers.columns for col in columns_to_merge):
    print("Some required columns are missing in df_boilers!")
elif 'brand_model' not in df_prep.columns:
    print("Column 'brand_model' is missing in df_prep!")
else:
    # One row per boiler model; when df_boilers lists a model more than once, the first row is used
    pump_lookup = (
        df_boilers[columns_to_merge]
        .dropna(subset=['brand_model'])
        .drop_duplicates(subset='brand_model')
        .set_index('brand_model')
    )

    # Rows whose brand_model is unknown in df_boilers get a missing value (same as a left join);
    # the result is stored as categorical to save memory
    for col in pump_columns:
        df_prep[col] = df_prep['brand_model'].map(pump_lookup[col]).astype('category')

In [None]:
# Scatter plot of pump speed versus flow, coloured per hydronic pump brand or model,
# for a random sample of rows where the boiler burns for CH and the gas valve is open.
required_columns = ['batch_import_remeha_flow_dstr_pump_speed__pct', 'flow_dstr__dm3_s_1']
# color_by = 'hydronic_pump_brand'
color_by = 'hydronic_pump_model'
max_sample_size = 100000

# Stop with a clear message instead of failing later on an arbitrary column lookup
missing_columns = [col for col in required_columns + [color_by] if col not in df_prep.columns]
if missing_columns:
    raise KeyError(f"Some required columns are missing: {missing_columns}")

# Alternative selections that were used during exploration:
# - all rows of df_prep
# - only rows with 'batch_import_remeha_boiler_status_burning_ch__bool' == True
# - only rows with 'batch_import_remeha_boiler_status_controlled_stop__bool' == True
burning_ch_mask = (
    (df_prep['batch_import_remeha_boiler_status_burning_ch__bool']==True)
    & (df_prep['batch_import_remeha_gas_valve_open__bool']==True)
    # & (df_prep['batch_import_remeha_boiler_status_pump_post_run__bool']==False)
    # & (df_prep['batch_import_remeha_boiler_status_controlled_stop__bool']==False)
)

# sample() raises when asked for more rows than available, e.g. after restricting df_prep for testing
sample_size = min(max_sample_size, int(burning_ch_mask.sum()))
df_sample = df_prep[burning_ch_mask].sample(n=sample_size, random_state=42)

plt.figure(figsize=(10, 6))

# One colour per category of the color_by column (cast is a no-op when it is already categorical)
brands = df_sample[color_by].astype('category').cat.categories
num_brands = len(brands)

# Use a colormap from matplotlib
cmap = plt.get_cmap('tab10', num_brands)  # 'tab10' is good for discrete categories

# Scatter plot with color based on the color_by column
for idx, brand in enumerate(brands):
    brand_data = df_sample[df_sample[color_by] == brand]
    plt.scatter(brand_data['batch_import_remeha_flow_dstr_pump_speed__pct'],
                brand_data['flow_dstr__dm3_s_1'],
                color=cmap(idx),  # Use the colormap to get the color for the current category
                alpha=0.5, s=1, label=brand)  # Label for legend

plt.title('Scatter plot of Water Pump Speed vs. Calculated Flow (all ids)')
plt.xlabel('batch_import_remeha_flow_dstr_pump_speed__pct')
plt.ylabel('flow_dstr__dm3_s_1')
plt.grid(True)

# Add a legend
plt.legend(title=color_by, loc='best', markerscale=5)

plt.show()

In [None]:
# Draw a reproducible random sample (at most 100000 rows) of the rows where the boiler burns for CH
# and the gas valve is open, keeping only the columns needed for the flow versus pump speed plots.

# The temperature difference column is (re)computed in a later cell; make sure it exists here too,
# so this cell does not depend on cells further down having been run first.
if 'delta_t_flow_ret__K' not in df_prep.columns:
    df_prep['delta_t_flow_ret__K'] = (
        df_prep['batch_import_remeha_temp_flow_ch__degC'] -
        df_prep['batch_import_remeha_temp_ret_ch__degC']
    )

burning_ch_mask = (
    (df_prep['batch_import_remeha_boiler_status_burning_ch__bool'] == True) &
    (df_prep['batch_import_remeha_gas_valve_open__bool'] == True)
)

# sample() raises when asked for more rows than available, e.g. after restricting df_prep for testing
sample_size = min(100000, int(burning_ch_mask.sum()))

df_sample = df_prep.loc[
    burning_ch_mask,
    ['batch_import_remeha_flow_dstr_pump_speed__pct', 'flow_dstr__dm3_s_1', 'delta_t_flow_ret__K'],
].sample(n=sample_size, random_state=42)

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of the flow-return temperature difference in the sample.
prop = 'delta_t_flow_ret__K'
df_sample[prop].plot.hist(bins=200, alpha=0.5, title = prop)


In [None]:
# Summary statistics of the sampled columns.
df_sample.describe()

In [None]:
# Define a colormap normalization to focus on the range [0, 8] K
# (values outside this range are drawn in the colours at the ends of the colormap)
norm = Normalize(vmin=0, vmax=8)

plt.figure(figsize=(10, 6))

# Scatter plot with color based on temperature difference
scatter = plt.scatter(
    df_sample['batch_import_remeha_flow_dstr_pump_speed__pct'], 
    df_sample['flow_dstr__dm3_s_1'], 
    c=df_sample['delta_t_flow_ret__K'],  # Use delta_t_flow_ret__K for color
    cmap='viridis',  # Use a colormap suitable for continuous data
    norm=norm,  # Apply normalization
    alpha=0.5, 
    s=1
)

plt.title('Pump Speed vs. Flow Rate based on ΔT:flow-return')
plt.xlabel('Pump Speed (%)')
plt.ylabel('Flow Rate (L/s)')
plt.grid(True)

# Add a colorbar to represent the temperature difference scale
cbar = plt.colorbar(scatter)
cbar.set_label('ΔT:flow-return [K]')
cbar.ax.set_yticks([0, 1, 2, 4, 8])  # Highlight specific values within the range

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ipywidgets import interact, FloatSlider, SelectMultiple, VBox, HBox, Output, Checkbox
from matplotlib.colors import Normalize
from IPython.display import display
from scipy.stats import linregress

# (Re)compute the 'delta_t_flow_ret__K' column on the full frame: CH flow temperature minus CH return temperature.
# NOTE(review): df_sample was drawn from df_prep in an earlier cell, so this assignment does not change df_sample.
df_prep['delta_t_flow_ret__K'] = (
    df_prep['batch_import_remeha_temp_flow_ch__degC'] -
    df_prep['batch_import_remeha_temp_ret_ch__degC']
)

# Helper function to calculate metrics, plot the graph, and optionally add the regression line
def update_graph(threshold, selected_ids, color_min, color_max, show_regression_line):
    """Plot pump speed versus flow for the sampled points with ΔT at or above a threshold.

    Reads the module-level ``df_sample`` frame. Prints how many points remain after
    thresholding (overall and for the home that loses most points), then draws a scatter
    plot coloured by ΔT and, optionally, a least-squares line through the plotted points.

    Args:
        threshold: minimum 'delta_t_flow_ret__K' value (in K) for a point to be kept.
        selected_ids: home ids to plot; an empty selection plots all homes.
            The printed percentages are always computed over all homes.
        color_min: lower limit of the colour scale (K).
        color_max: upper limit of the colour scale (K).
        show_regression_line: whether to fit and draw a linear regression line.
    """
    x_col = 'batch_import_remeha_flow_dstr_pump_speed__pct'
    y_col = 'flow_dstr__dm3_s_1'
    delta_col = 'delta_t_flow_ret__K'

    if df_sample.empty:
        # Nothing to compare against; avoids a division by zero below
        print("No sampled data available to plot.")
        return

    # The sliders allow the minimum to be dragged above the maximum; keep the colour scale valid
    color_min, color_max = sorted((color_min, color_max))

    # Filter data based on the temperature difference threshold
    df_filtered = df_sample[df_sample[delta_col] >= threshold]

    # Calculate percentage of dots left compared to unfiltered
    total_dots_left_pct = len(df_filtered) / len(df_sample) * 100

    # Calculate smallest percentage of dots left for any ID
    # (homes without any remaining dots are missing from the filtered counts, hence fillna(0))
    group_sizes_original = df_sample.groupby(level='id').size()
    group_sizes_filtered = df_filtered.groupby(level='id').size()
    fraction_left_per_id = (group_sizes_filtered / group_sizes_original).fillna(0)
    id_with_smallest_pct = fraction_left_per_id.idxmin()
    smallest_pct_per_id = fraction_left_per_id.min() * 100

    # Display metrics dynamically
    print(f"Threshold: {threshold:.2f} K")
    print(f"Percentage of dots left: {total_dots_left_pct:.2f}%")
    print(f"Smallest percentage of dots left for a specific ID: {smallest_pct_per_id:.2f}%")
    print(f"ID with the smallest percentage: {id_with_smallest_pct}")

    # Filter based on selected IDs
    if selected_ids:
        df_filtered = df_filtered[df_filtered.index.get_level_values('id').isin(selected_ids)]

    # Scatter plot
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(
        df_filtered[x_col],
        df_filtered[y_col],
        c=df_filtered[delta_col],
        cmap='viridis',
        norm=Normalize(vmin=color_min, vmax=color_max),  # Dynamic color scale
        alpha=0.5,
        s=1
    )

    # The filter above keeps values equal to the threshold, so the title says '≥'
    plt.title(f"Pump Speed vs. Flow Rate based on ΔT:flow-return ≥ {threshold:.1f} K")
    plt.xlabel('Pump Speed (%)')
    plt.ylabel('Flow Rate (L/s)')
    plt.grid(True)

    # Color bar with intermediary labels
    color_ticks = np.linspace(color_min, color_max, 5)
    cbar = plt.colorbar(scatter)
    cbar.set_label('ΔT:flow-return [K]')
    cbar.set_ticks(color_ticks)  # Intermediary labels
    cbar.set_ticklabels([f"{x:.1f}" for x in color_ticks])  # Custom labels

    # Optionally fit a linear regression line
    fit = None
    if show_regression_line:
        # linregress cannot deal with missing values and needs at least two distinct x values
        fit_data = df_filtered[[x_col, y_col]].dropna().astype('float64')
        if len(fit_data) >= 2 and fit_data[x_col].nunique() > 1:
            fit = linregress(fit_data[x_col], fit_data[y_col])
        else:
            print("Not enough valid data points for a linear fit.")

    if fit is not None:
        # Calculate the estimated flow at 100% pump speed
        flow_at_100_pct = fit.slope * 100 + fit.intercept

        # Plot the regression line over the range of pump speeds that were fitted
        x_vals = np.linspace(fit_data[x_col].min(), fit_data[x_col].max(), 100)
        y_vals = fit.slope * x_vals + fit.intercept
        plt.plot(x_vals, y_vals, 'r--', label='Linear Fit', linewidth=2)  # Dashed red line
        plt.legend(loc='best')  # without a legend, the 'Linear Fit' label is never shown
        print(f"Slope: {fit.slope:.4f} L/s per %")
        print(f"Intercept: {fit.intercept:.4f} L/s")
        print(f"Flow at 100%: {flow_at_100_pct:.4f} L/s")
    else:
        print("Slope: None")
        print("Intercept: None")
        print("Flow at 100%: None")

    plt.show()

# Interactive controls for update_graph(): threshold, home selection, colour scale limits and regression toggle.
from ipywidgets import interactive_output

# Create a list of unique IDs
unique_ids = tuple(df_prep.index.get_level_values('id').unique())

# Create interactive widgets
threshold_slider = FloatSlider(
    value=0, min=0, max=15, step=0.2,
    description='Threshold (K)', continuous_update=False
)

id_selector = SelectMultiple(
    options=unique_ids,
    value=unique_ids,  # Default: all IDs selected
    description="IDs",
    disabled=False,
    layout={'height': '400px'}  # Set height for 20 items
)

# Sliders for dynamic color scale limits
color_min_slider = FloatSlider(
    value=0, min=0, max=10, step=0.2,
    description='Color Min (K)', continuous_update=False
)

color_max_slider = FloatSlider(
    value=10, min=0, max=15, step=0.2,
    description='Color Max (K)', continuous_update=False
)

# Checkbox for toggling regression line
show_regression_checkbox = Checkbox(
    value=True,
    description='Show Linear Fit',
    disabled=False
)

controls = [threshold_slider, id_selector, color_min_slider, color_max_slider, show_regression_checkbox]

# Connect the widgets to update_graph. interactive_output() only creates the output area and
# re-runs update_graph on every widget change; unlike interact() it does not display the widgets
# itself, so the controls appear only once (in the layout below) instead of twice.
output = interactive_output(
    update_graph,
    {
        'threshold': threshold_slider,
        'selected_ids': id_selector,
        'color_min': color_min_slider,
        'color_max': color_max_slider,
        'show_regression_line': show_regression_checkbox,
    },
)

# Display widgets and graph side by side using HBox and VBox
display(HBox([VBox(controls), output]))


# Optional: Read heat distribution preprocessing results
This step can be used to test learning algorithms without having to redo the analysis preprocessing.

In [None]:
%%time

# Load the preprocessed heat distribution data into df_prep (nullable pandas dtypes).
# A failure is reported but does not stop the notebook; df_prep then keeps its previous value, if any.
try:
    df_prep = pd.read_parquet(
        rhc_heat_dstr_preprocessed_poperties_file,
        dtype_backend='numpy_nullable',
        engine='pyarrow',
    )
except Exception as read_error:
    print(f"Error reading file: {read_error}")
else:
    print("File was successfully read without specifying compression codec.")

# Learn heat performance signature parameters
Most of the heavy lifting is done by the `learn_system_parameters()` function, which again uses the [GEKKO Python](https://machinelearning.byu.edu/) dynamic optimization toolkit.

In [None]:
%autoreload 2

# select which property columns in df_prep are used as properties needed by the learning algorithm
# keys:   generic property names used by the learning code
# values: the df_prep column that supplies each property
# NOTE: the CH flow/return temperatures point to the interpolated columns here.
property_sources = {
    'temp_indoor__degC':        'batch_import_remeha_temp_indoor__degC',
    'temp_set__degC':           'batch_import_remeha_temp_set__degC',
    'comfortable__bool':        'comfortable__bool',
    'temp_outdoor__degC':       'batch_import_KNMI_temp_outdoor__degC',
    'wind__m_s_1':              'batch_import_KNMI_wind__m_s_1',
    'sol_ghi__W_m_2':           'batch_import_KNMI_sol_ghi__W_m_2', 
    'g_use_ch_hhv__W':          'g_use_fan_ch_hhv__W',
    'eta_ch_hhv__W0':           'eta_ch_hhv__W0', 
    'g_use_dhw_hhv__W':         'g_use_fan_dhw_hhv__W',
    'e__W':                     'device_p1-reader_e__W',    
    'occupancy__p':             'device_living_room_occupancy__p',
    'co2_indoor__ppm':          'device_living_room_co2_indoor__ppm',
    'temp_flow__degC':          'batch_import_remeha_temp_flow__degC',
    'temp_ret__degC':           'batch_import_remeha_temp_ret__degC',
    'temp_flow_ch__degC':       'interpolated_batch_import_remeha_temp_flow_ch__degC',
    'temp_ret_ch__degC':        'interpolated_batch_import_remeha_temp_ret_ch__degC',
    'temp_flow_ch_max__degC':   'batch_import_remeha_temp_flow_ch_max__degC', 
    'fan_rotations__min_1':     'batch_import_remeha_fan_rotations__min_1', 
    'fan_speed__pct':           'fan_speed__pct', 
    'flow_dstr_pump_speed__pct':'batch_import_remeha_flow_dstr_pump_speed__pct',
    'temp_dstr__degC':          'temp_dstr__degC', # added to get mae and rmse values
    'ventilation__dm3_s_1':     'predicted_ventilation__dm3_s_1',
    'temp_flow_ch_set__degC':   'predicted_temp_flow_ch_set__degC',
    'heat_ch__W':               'heat_ch__W',
    'flow_dstr__dm3_s_1':       'flow_dstr__dm3_s_1',
}

### Define (subsets of) learning periods and (subsets of) home ids to perform the learning on

In [None]:
# time mask for core of winter with most data: 1 January 2024 up to (not including) 1 April 2024
# NOTE: the mask is a boolean array matching the rows of df_prep at the moment this cell is run;
# recompute it after df_prep has been replaced or restricted
janfebmrt24_mask = ((df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-01-01 00:00:00+01:00')) 
                    & 
                    (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-04-01 00:00:00+02:00'))
                   )

In [None]:
# time mask for one week in winter with almost all ids having data: 18 February 2024 up to (not including) 25 February 2024
# NOTE: the mask is a boolean array matching the rows of df_prep at the moment this cell is run;
# recompute it after df_prep has been replaced or restricted
febwk3_mask = ((df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-02-18 00:00:00+01:00'))
               & 
               (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-02-25 00:00:00+01:00'))
              )


In [None]:
# time mask for the whole month of February 2024
# 2024 is a leap year: the (exclusive) upper bound is 1 March, so that 29 February is included
# NOTE: the mask is a boolean array matching the rows of df_prep at the moment this cell is run;
# recompute it after df_prep has been replaced or restricted
feb_mask = ((df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-02-01 00:00:00+01:00'))
            &
            (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-03-01 00:00:00+01:00'))
           )


In [None]:
# Home ids used for quick tests: a single home, and a small set of homes
# (see the commented-out restrictions of df_prep in the next cell).
specific_id = 434931
specific_ids = [434931, 450298, 495906]

### Optional: Restrict the analysis size

In [None]:
# Slim dataset down for testing (work on full df_prep during final analysis)
# Enable exactly one of the assignments below.
# NOTE(review): the masks were computed on df_prep as it was before this cell ran; running this cell
# a second time without recomputing the masks will presumably fail on a length mismatch — confirm.
idx = pd.IndexSlice
# df_prep = df_prep[febwk3_mask].loc[idx[[specific_id]], :]
# df_prep = df_prep[febwk3_mask].loc[idx[specific_ids], :]
# df_prep = df_prep[feb_mask].loc[idx[specific_ids], :]                                                                
df_prep = df_prep[janfebmrt24_mask]   

In [None]:
# Maximum number of learning periods: use None for full learning,
# or enable one of the smaller values below while testing learning algorithms
# max_periods = 10
# max_periods = 100
# max_periods = 1000
max_periods = None # set max_periods to None for full learning

## Learn thermostat parameters

### Initial explorations to discover the type of thermostat control

In [None]:
# Thermostat control error: setpoint temperature minus indoor temperature
# (positive when the home is colder than the setpoint)
df_prep['temp_delta_error_temp_delta_indoor_set__K'] = df_prep[property_sources['temp_set__degC']] - df_prep[property_sources['temp_indoor__degC']]

In [None]:
# Name of the boolean status column in which ON/OFF transitions are detected in the cells below
status_col = 'batch_import_remeha_boiler_status_burning_ch__bool'

In [None]:
# Print dtype and non-null count of the status column
df_prep[status_col].info()

In [None]:
# Status of the previous and the next sample of the same home.
# The shift is done per home ('id' index level): shifting the whole column at once would carry the
# last status of one home over into the first row of the next home (and vice versa), which can
# produce false transitions at home boundaries. Rows without a neighbour get False.
df_prep['prev_status'] = df_prep.groupby(level='id')[status_col].shift(1, fill_value=False)
df_prep['next_status'] = df_prep.groupby(level='id')[status_col].shift(-1, fill_value=False)

In [None]:
# Detect transitions
# is_ON_minus_1:  status is off now and on in the next sample (last sample before switching on)
# is_ON_plus_0:   status was off in the previous sample and is on now (first sample after switching on)
df_prep['is_ON_minus_1'] = (~df_prep[status_col]) & (df_prep['next_status'])
df_prep['is_ON_plus_0'] = (~df_prep['prev_status']) & (df_prep[status_col])

# is_OFF_minus_1: status is on now and off in the next sample (last sample before switching off)
# is_OFF_plus_0:  status was on in the previous sample and is off now (first sample after switching off)
df_prep['is_OFF_minus_1'] = (df_prep[status_col]) & (~df_prep['next_status'])
df_prep['is_OFF_plus_0'] = (df_prep['prev_status']) & (~df_prep[status_col])

In [None]:
# Extract rows for each transition type
# Each Series holds the control error (setpoint minus indoor temperature) at the flagged samples
ON_minus_1 = df_prep.loc[df_prep['is_ON_minus_1'], 'temp_delta_error_temp_delta_indoor_set__K']
ON_plus_0 = df_prep.loc[df_prep['is_ON_plus_0'], 'temp_delta_error_temp_delta_indoor_set__K']

OFF_minus_1 = df_prep.loc[df_prep['is_OFF_minus_1'], 'temp_delta_error_temp_delta_indoor_set__K']
OFF_plus_0 = df_prep.loc[df_prep['is_OFF_plus_0'], 'temp_delta_error_temp_delta_indoor_set__K']

In [None]:
# Median and mean control error per transition type, keyed by the transition label
hysteresis_values = {
    label: {'median': errors.median(), 'mean': errors.mean()}
    for label, errors in (
        ('TURN-ON-1', ON_minus_1),
        ('TURN-ON+0', ON_plus_0),
        ('TURN-OFF-1', OFF_minus_1),
        ('TURN-OFF+0', OFF_plus_0),
    )
}

In [None]:
# Print the results: one line per transition type with its median and mean control error
for key, values in hysteresis_values.items():
    print(f"{key}: Median = {values['median']}, Mean = {values['mean']}")

In [None]:
# Boxplots of the control error (setpoint minus indoor temperature), one box per transition type.

# Collect the data for the boxplots
data = [
    ON_minus_1.dropna(),  # Drop NaN to avoid issues with boxplot
    ON_plus_0.dropna(),
    OFF_minus_1.dropna(),
    OFF_plus_0.dropna()
]

# Define labels for the boxplots (same order as the entries in data)
labels = ['TURN-ON-1', 'TURN-ON+0', 'TURN-OFF-1', 'TURN-OFF+0']

# Create the figure and axis
plt.figure(figsize=(10, 6))
plt.boxplot(data, labels=labels, patch_artist=True, boxprops=dict(facecolor="lightblue"))

# Add title and labels
plt.title("Boxplots of temp_delta_error_temp_delta_indoor_set__K by Transition Type")
plt.ylabel("temp_delta_error_temp_delta_indoor_set__K (K)")
plt.xlabel("Transition Type")

# Show the grid for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Display the plot
plt.show()

In [None]:
%%time
# TO DO: make sure that dataframe_preprocessed_plot can deal with nullable pandas datatype 'Boolean', so this boolean conversion is no longer needed
# Create numeric (Float32) copies of the boolean status columns, named <column>01.
# The copies are stored under exactly the names listed in converted_column_names, because the cells
# below select the columns by those names (they used to be written to 'converted_<column>' instead,
# so the selection only worked when the '01' columns happened to exist already).
convert_column_names = ['batch_import_remeha_boiler_status_burning_ch__bool', 'batch_import_remeha_boiler_status_burning_dhw__bool']
converted_column_names = [col+'01' for col in convert_column_names]
for col, converted_column_name in zip(convert_column_names, converted_column_names):
    df_prep[converted_column_name] = df_prep[col].astype('Float32')
    

In [None]:
# Sign-flipped control error: indoor temperature minus setpoint temperature
df_prep['temp_delta_error_temp_delta_set_indoor__K'] = -df_prep['temp_delta_error_temp_delta_indoor_set__K']

In [None]:
# Display the list of converted column names as this cell's output.
converted_column_names

In [None]:
# Display summary statistics of the converted columns to confirm the conversion
df_prep[converted_column_names].describe()

In [None]:
# Columns to plot for inspecting hysteresis: control error (indoor minus setpoint), setpoint and
# indoor temperature, and the numeric copies of the burning status columns
hysteresis_cols = ['temp_delta_error_temp_delta_set_indoor__K', property_sources['temp_set__degC'], property_sources['temp_indoor__degC']] + converted_column_names

In [None]:
%autoreload 2
# Plot the hysteresis columns for the single home specific_id (the 'id' index level is kept)
Plot.dataframe_preprocessed_plot(df_prep[hysteresis_cols].xs(specific_id, level='id', drop_level=False), units_to_mathtext)

### Learn thermostat control parameters

In [None]:
%%time
# We did not measure the temperature flow setpoint. However, we can create a proxy for learning thermostat based on the correspondence of flow temperatures and flow temperature setpoints
# Proxy: the CH flow temperature of the next sample of the same home (the last sample of a home keeps
# its own value), set to 0 wherever the indoor temperature is above the thermostat setpoint.
# The column is only created when it does not exist yet.
temp_flow_ch_set_col = property_sources['temp_flow_ch_set__degC']

if temp_flow_ch_set_col not in df_prep.columns:
    # transform() always returns a Series aligned with df_prep, also when df_prep holds a single home
    # (groupby().apply() returns a wide one-row DataFrame in that case, which broke the assignment)
    temp_flow_ch_set = df_prep.groupby(level='id')[property_sources['temp_flow_ch__degC']].transform(
        lambda temps: temps.shift(-1, fill_value=temps.iloc[-1])
    )

    # No heat is requested while the indoor temperature is above the setpoint
    too_warm = df_prep[property_sources['temp_indoor__degC']] > df_prep[property_sources['temp_set__degC']]
    temp_flow_ch_set[too_warm] = 0

    df_prep[temp_flow_ch_set_col] = temp_flow_ch_set


In [None]:
# Compare summary statistics of the flow setpoint proxy with those of the CH flow temperature it was derived from
df_prep[[property_sources['temp_flow_ch_set__degC'], property_sources['temp_flow_ch__degC']]].describe()

In [None]:
# optionally change the selection which property columns in df_prep are used as properties needed by the learning algorithm
# keys:   generic property names used by the learning code
# values: the df_prep column that supplies each property
# Compared with the first definition of property_sources, only the sources of 'temp_flow_ch__degC' and
# 'temp_ret_ch__degC' differ (measured instead of interpolated); all other entries, including
# 'flow_dstr__dm3_s_1', are kept so that code relying on those keys keeps working.
property_sources = {
    'temp_indoor__degC':        'batch_import_remeha_temp_indoor__degC',
    'temp_set__degC':           'batch_import_remeha_temp_set__degC',
    'comfortable__bool':        'comfortable__bool',
    'temp_outdoor__degC':       'batch_import_KNMI_temp_outdoor__degC',
    'wind__m_s_1':              'batch_import_KNMI_wind__m_s_1',
    'sol_ghi__W_m_2':           'batch_import_KNMI_sol_ghi__W_m_2',
    'g_use_ch_hhv__W':          'g_use_fan_ch_hhv__W',
    'eta_ch_hhv__W0':           'eta_ch_hhv__W0',
    'g_use_dhw_hhv__W':         'g_use_fan_dhw_hhv__W',
    'e__W':                     'device_p1-reader_e__W',
    'occupancy__p':             'device_living_room_occupancy__p',
    'co2_indoor__ppm':          'device_living_room_co2_indoor__ppm',
    'temp_flow__degC':          'batch_import_remeha_temp_flow__degC',
    'temp_ret__degC':           'batch_import_remeha_temp_ret__degC',
    'temp_flow_ch__degC':       'batch_import_remeha_temp_flow_ch__degC', # choose between interpolated or not depending on needs of the analysis algorithm
    'temp_ret_ch__degC':        'batch_import_remeha_temp_ret_ch__degC',  # choose between interpolated or not depending on needs of the analysis algorithm
    # 'temp_flow_ch__degC':       'interpolated_batch_import_remeha_temp_flow_ch__degC', # choose between interpolated or not depending on needs of the analysis algorithm
    # 'temp_ret_ch__degC':        'interpolated_batch_import_remeha_temp_ret_ch__degC',  # choose between interpolated or not depending on needs of the analysis algorithm
    'temp_flow_ch_max__degC':   'batch_import_remeha_temp_flow_ch_max__degC',
    'fan_rotations__min_1':     'batch_import_remeha_fan_rotations__min_1',
    'fan_speed__pct':           'fan_speed__pct',
    'flow_dstr_pump_speed__pct':'batch_import_remeha_flow_dstr_pump_speed__pct',
    'temp_dstr__degC':          'temp_dstr__degC', # added to get mae and rmse values
    'ventilation__dm3_s_1':     'predicted_ventilation__dm3_s_1',
    'temp_flow_ch_set__degC':   'predicted_temp_flow_ch_set__degC',
    'heat_ch__W':               'heat_ch__W',
    'flow_dstr__dm3_s_1':       'flow_dstr__dm3_s_1',
}

In [None]:
# Settings for learning the thermostat model

# minimum duration in minutes (passed on as duration threshold to the learning call)
minimum_duration__min = 20

# properties to be predicted
predict_props_thrm = {'temp_flow_ch_set__degC'}

# Parameters that are learned, and the names under which the learned values are reported
learn_params_thrm = {'thermostat_hysteresis__K'}
learned_params_thrm = {f'learned_{name}' for name in learn_params_thrm}

# properties that are required as input
req_props_thrm = {
    'temp_flow_ch_max__degC',
    'temp_indoor__degC',
    'temp_set__degC',
}

In [None]:
# df_prep column names that supply the required thermostat properties
# (required properties without an entry in property_sources are silently left out)
req_cols_thrm = {property_sources[prop] for prop in req_props_thrm & property_sources.keys()}

In [None]:
# Display the set of required column names as this cell's output.
req_cols_thrm

#### Assuming algorithmic control (hysteresis)

In [None]:
# No parameter hints are given for the thermostat model
param_hints_thrm = dict()

# Learn under the assumption of algorithmic (hysteresis) thermostat control
mode= Model.ControlMode.ALGORITHMIC

In [None]:
# Length of the learning period, in days
learn_period__d = 7
# learn_period__d = 3


In [None]:
# %%time
# %autoreload 2
# df_learned_parameters_per_period, df_predicted_properties = Learner.learn_system_parameters(
#     df_prep,
#     df_bldng_data=df_bldng_data,
#     system_model_fn = Model.thermostat,
#     job_identification_fn = Learner.periodic_learn_list,
#     property_sources=property_sources,
#     learn_params=learn_params_thrm,
#     param_hints=param_hints_thrm,
#     predict_props=predict_props_thrm,
#     req_props=req_props_thrm,
#     duration_threshold=timedelta(minutes=minimum_duration__min),
#     learn_period__d=learn_period__d,
#     max_periods=10, # set max_periods to a value other than None, e.g. 100 during testing of learning algorithms
#     mode=mode,
#     max_iter=10,
# )


#### Process learned parameters

In [None]:
# print(f"We learned parameters for {0 if df_learned_parameters_per_period.empty else len(df_learned_parameters_per_period.index.get_level_values('id').unique())} out of {len(df_prep.index.get_level_values('id').unique())} homes")

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     error_cols = [prop for prop in df_learned_parameters_per_period.columns if prop.startswith('mae_') or prop.startswith('rmse_')]
#     if error_cols:
#         display(df_learned_parameters_per_period[error_cols].describe())

In [None]:
# df_learned_parameters_per_period

In [None]:
# # Determine learned parameters 
# df_learned_parameters = df_learned_parameters_per_period.groupby('id').median()

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     display(df_learned_parameters.describe())

In [None]:
# # merge learned data into building metadata
# df_bldng_data = df_bldng_data.merge(df_learned_parameters[[prop for prop in df_learned_parameters.columns if prop.startswith('learned_')]],
#                                     how='left',
#                                     left_on='id',
#                                     right_index=True
#                                    )

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     display(df_bldng_data.T)

In [None]:
# df_bldng_data.to_parquet(home_metadata_file_path, index=True, engine='pyarrow')

#### Process predicted properties

In [None]:
# # Merge predicted properties back into the main DataFrame
# if not df_predicted_properties.empty:
#     df_prep = df_prep.drop(columns=df_prep.columns.intersection(df_predicted_properties.columns))
#     df_prep = df_prep.merge(df_predicted_properties, left_index=True, right_index=True, how="left")

In [None]:
# df_prep.columns

#### Assuming PID control
(NB Code disabled by default because learning takes too long)

In [None]:
# param_hints_thrm = {
#     'thermostat': {
#         'p': {'initial_guess': 1.0, 'lower_bound': 0.1, 'upper_bound': 10.0},
#         'i': {'initial_guess': 0.1, 'lower_bound': 0.0, 'upper_bound': 1.0},
#         'd': {'initial_guess': 0.01, 'lower_bound': 0.0, 'upper_bound': 1.0},
#     }
# }

# mode= Model.ControlMode.PID

In [None]:
# %%time
# %autoreload 2
# df_learned_parameters_per_period, df_predicted_properties = Learner.learn_system_parameters(
#     df_prep,
#     df_bldng_data=df_bldng_data,
#     system_model_fn = Model.thermostat,
#     job_identification_fn = Learner.periodic_learn_list,
#     property_sources=property_sources,
#     learn_params=learn_params_thrm,
#     param_hints=param_hints_thrm,
#     predict_props=predict_props_thrm,
#     req_props=req_props_thrm,
#     duration_threshold=timedelta(minutes=minimum_duration__min),
#     learn_period__d=learn_period__d,
#     max_periods=max_periods, # set max_periods to a value other than None, e.g. 100 during testing of learning algorithms
#     mode=mode,
# )


#### Process learned parameters

In [None]:
# print(f"We learned parameters for {0 if df_learned_parameters_per_period.empty else len(df_learned_parameters_per_period.index.get_level_values('id').unique())} out of {len(df_prep.index.get_level_values('id').unique())} homes")

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     error_cols = [prop for prop in df_learned_parameters_per_period.columns if prop.startswith('mae_') or prop.startswith('rmse_')]
#     if error_cols:
#         display(df_learned_parameters_per_period[error_cols].describe())

In [None]:
# df_learned_parameters_per_period

In [None]:
# # Determine learned parameters 
# if not df_learned_parameters_per_period.empty:
#     df_learned_parameters = df_learned_parameters_per_period.groupby('id').median()

In [None]:
# if not df_learned_parameters_per_period.empty:
#     with pd.option_context('display.float_format', '{:.2f}'.format):
#         display(df_learned_parameters.describe())

In [None]:
# if not df_learned_parameters_per_period.empty:
#     # merge learned data into building metadata
#     df_bldng_data = df_bldng_data.merge(df_learned_parameters[[prop for prop in df_learned_parameters.columns if prop.startswith('learned_')]],
#                                         how='left',
#                                         left_on='id',
#                                         right_index=True
#                                        )

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     display(df_bldng_data.T)

In [None]:
# df_bldng_data.to_parquet(home_metadata_file_path, index=True, engine='pyarrow')

#### Process predicted properties

In [None]:
# # Merge predicted properties back into the main DataFrame
# if not df_predicted_properties.empty:
#     df_prep = df_prep.drop(columns=df_prep.columns.intersection(df_predicted_properties.columns))
#     df_prep = df_prep.merge(df_predicted_properties, left_index=True, right_index=True, how="left")

In [None]:
# df_prep.columns

## Learn boiler control characteristics

In [None]:
# Maps each property name used by the learning algorithm to the df_prep column that supplies it.
# Optionally change this selection to use other df_prep columns as source.
property_sources = {
    # indoor temperature, setpoint and comfort
    'temp_indoor__degC': 'batch_import_remeha_temp_indoor__degC',
    'temp_set__degC': 'batch_import_remeha_temp_set__degC',
    'comfortable__bool': 'comfortable__bool',
    # KNMI batch-import columns
    'temp_outdoor__degC': 'batch_import_KNMI_temp_outdoor__degC',
    'wind__m_s_1': 'batch_import_KNMI_wind__m_s_1',
    'sol_ghi__W_m_2': 'batch_import_KNMI_sol_ghi__W_m_2',
    # gas use, efficiency and electricity
    'g_use_ch_hhv__W': 'g_use_fan_ch_hhv__W',
    'eta_ch_hhv__W0': 'eta_ch_hhv__W0',
    'g_use_dhw_hhv__W': 'g_use_fan_dhw_hhv__W',
    'e__W': 'device_p1-reader_e__W',
    # living-room device columns
    'occupancy__p': 'device_living_room_occupancy__p',
    'co2_indoor__ppm': 'device_living_room_co2_indoor__ppm',
    # flow and return temperatures
    'temp_flow__degC': 'batch_import_remeha_temp_flow__degC',
    'temp_ret__degC': 'batch_import_remeha_temp_ret__degC',
    # choose between interpolated or not depending on needs of the analysis algorithm
    'temp_flow_ch__degC': 'batch_import_remeha_temp_flow_ch__degC',
    'temp_ret_ch__degC': 'batch_import_remeha_temp_ret_ch__degC',
    # 'temp_flow_ch__degC': 'interpolated_batch_import_remeha_temp_flow_ch__degC',
    # 'temp_ret_ch__degC': 'interpolated_batch_import_remeha_temp_ret_ch__degC',
    'temp_flow_ch_max__degC': 'batch_import_remeha_temp_flow_ch_max__degC',
    # fan and pump
    'fan_rotations__min_1': 'batch_import_remeha_fan_rotations__min_1',
    'fan_speed__pct': 'fan_speed__pct',
    'flow_dstr_pump_speed__pct': 'batch_import_remeha_flow_dstr_pump_speed__pct',
    # added to get mae and rmse values
    'temp_dstr__degC': 'temp_dstr__degC',
    # columns with a 'predicted_' prefix, and heat
    'ventilation__dm3_s_1': 'predicted_ventilation__dm3_s_1',
    'temp_flow_ch_set__degC': 'predicted_temp_flow_ch_set__degC',
    'heat_ch__W': 'heat_ch__W',
}

In [None]:
# Properties required as input for learning the boiler control model
req_props_boiler = {
    'temp_flow_ch_set__degC',
    'temp_flow_ch__degC',
    'temp_ret_ch__degC',
    'temp_flow_ch_max__degC',
    'fan_speed__pct',
    'flow_dstr_pump_speed__pct',
}

# Properties to be predicted by the boiler control model
predict_props_boiler = {'fan_speed__pct', 'flow_dstr_pump_speed__pct'}

# Minimum duration, in minutes
minimum_duration__min = 20
# minimum_duration__min = 5

# No learning period length is set for the boiler control model
learn_period__d = None



In [None]:
# Use the calculated flow setpoint from the thermostat model; if that column is not present in df_prep
# (i.e. it was not learnt), fall back to a fixed flow setpoint of 60 degC for learning the boiler control
if property_sources['temp_flow_ch_set__degC'] not in df_prep.columns:
    print("temp_flow_ch_set__degC temperatures not learned; setting to 60 degC")
    df_prep[property_sources['temp_flow_ch_set__degC']] = 60

### Assuming algorithmic control 
(NB Code disabled by default because learning takes too long)

In [None]:
# mode = Model.ControlMode.ALGORITHMIC

# learn_params_boiler = {
#     'fan_rotations_max_gain__pct_min_1',
#     'error_threshold_temp_delta_flow_flowset__K',
#     'flow_dstr_pump_speed_max_gain__pct_min_1',
#     'error_threshold_temp_delta_flow_ret__K',
# } 

# learned_params_boiler = {'learned_' + param for param in learn_params_boiler}

In [None]:
# %%time
# %autoreload 2
# df_learned_parameters_per_period, df_predicted_properties = Learner.learn_system_parameters(
#     df_prep,
#     df_bldng_data=df_bldng_data,
#     system_model_fn = Model.boiler,
#     job_identification_fn = Learner.valid_learn_list,
#     property_sources=property_sources,
#     learn_params=learn_params_boiler,
#     param_hints=None,
#     predict_props=predict_props_boiler,
#     req_props=req_props_boiler,
#     duration_threshold=timedelta(minutes=minimum_duration__min),
#     learn_period__d=learn_period__d,
#     max_periods=10, # set max_periods to a value other than None, e.g. 100 during testing of learning algorithms
#     mode=mode,
#     max_iter=10, # Limit iterations to prevent long runtimes if the optimizer struggles to converge due to discontinuities in algorithmic models.
# )

#### Process learned parameters

In [None]:
# print(f"We learned parameters for {0 if df_learned_parameters_per_period.empty else len(df_learned_parameters_per_period.index.get_level_values('id').unique())} out of {len(df_prep.index.get_level_values('id').unique())} homes")

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     error_cols = [prop for prop in df_learned_parameters_per_period.columns if prop.startswith('mae_') or prop.startswith('rmse_')]
#     if error_cols:
#         display(df_learned_parameters_per_period[error_cols].describe())

In [None]:
# df_learned_parameters_per_period

In [None]:
# # Determine learned parameters 
# if not df_learned_parameters_per_period.empty:
#     df_learned_parameters = df_learned_parameters_per_period.groupby('id').median()

In [None]:
# if not df_learned_parameters_per_period.empty:
#     with pd.option_context('display.float_format', '{:.2f}'.format):
#         display(df_learned_parameters.describe())

In [None]:
# if not df_learned_parameters_per_period.empty:
#     for learned_param in df_learned_parameters_per_period:
#         Plot.nfh_property_per_id_boxplot(df_learned_parameters_per_period, property_col=learned_param)

In [None]:
# if not df_learned_parameters_per_period.empty:
#     # merge learned data into building metadata
#     df_bldng_data = df_bldng_data.merge(df_learned_parameters[[prop for prop in df_learned_parameters.columns if prop.startswith('learned_')]],
#                                         how='left',
#                                         left_on='id',
#                                         right_index=True
#                                        )

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     display(df_bldng_data.T)

In [None]:
# df_bldng_data.to_parquet(home_metadata_file_path, index=True, engine='pyarrow')

#### Process predicted properties

In [None]:
# # Merge predicted properties back into the main DataFrame
# if not df_predicted_properties.empty:
#     df_prep = df_prep.drop(columns=df_prep.columns.intersection(df_predicted_properties.columns))
#     df_prep = df_prep.merge(df_predicted_properties, left_index=True, right_index=True, how="left")

In [None]:
# df_prep.columns

### Assuming PID control

In [None]:
# Initial guess plus lower and upper bound for the p, i and d terms of the fan and pump controllers
boiler_pid_hints_bounds = dict(
    # thermostat=dict(
    #     p=dict(initial_guess=1.0, lower_bound=0.1, upper_bound=10.0),
    #     i=dict(initial_guess=0.1, lower_bound=0.0, upper_bound=1.0),
    #     d=dict(initial_guess=0.01, lower_bound=0.0, upper_bound=1.0),
    # ),
    fan=dict(
        p=dict(initial_guess=0.5, lower_bound=0.1, upper_bound=2.0),
        i=dict(initial_guess=0.1, lower_bound=0.01, upper_bound=1.0),
        d=dict(initial_guess=0.01, lower_bound=0.001, upper_bound=0.1),
    ),
    pump=dict(
        p=dict(initial_guess=0.7, lower_bound=0.2, upper_bound=3.0),
        i=dict(initial_guess=0.2, lower_bound=0.05, upper_bound=1.5),
        d=dict(initial_guess=0.02, lower_bound=0.005, upper_bound=0.2),
    ),
)


# Select PID as the control mode for the boiler model
mode = Model.ControlMode.PID

# NB learn_params_boiler and learned_params_boiler are currently actually determined inside the boiler() function;
# learn_params_boiler must nevertheless not be None, because otherwise simulation mode is selected.
learn_params_boiler = {
    f'{controller}_K{term}'
    for controller in ('fan', 'pump')
    for term in ('p', 'i', 'd')
}

learned_params_boiler = {f'learned_{name}' for name in learn_params_boiler}


In [None]:
# %%time
# %autoreload 2
# df_learned_parameters_per_period, df_predicted_properties = Learner.learn_system_parameters(
#     df_prep,
#     df_bldng_data=df_bldng_data,
#     system_model_fn = Model.boiler,
#     job_identification_fn = Learner.valid_learn_list,
#     property_sources=property_sources,
#     learn_params=learn_params_boiler,
#     param_hints=boiler_pid_hints_bounds,
#     predict_props=predict_props_boiler,
#     req_props=req_props_boiler,
#     duration_threshold=timedelta(minutes=minimum_duration__min),
#     learn_period__d=learn_period__d,
#     max_periods=max_periods, # set max_periods to a value other than None, e.g. 100 during testing of learning algorithms
#     mode=mode,
#     # max_iter=10, # Limit iterations to prevent long runtimes if the optimizer struggles to converge due to discontinuities in algorithmic models.
# )

#### Process learned parameters

In [None]:
# print(f"We learned parameters for {0 if df_learned_parameters_per_period.empty else len(df_learned_parameters_per_period.index.get_level_values('id').unique())} out of {len(df_prep.index.get_level_values('id').unique())} homes")

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     error_cols = [prop for prop in df_learned_parameters_per_period.columns if prop.startswith('mae_') or prop.startswith('rmse_')]
#     if error_cols:
#         display(df_learned_parameters_per_period[error_cols].describe())

In [None]:
# df_learned_parameters_per_period

In [None]:
# # Determine learned parameters 
# if not df_learned_parameters_per_period.empty:
#     df_learned_parameters = df_learned_parameters_per_period.groupby('id').median()

In [None]:
# if not df_learned_parameters_per_period.empty:
#     with pd.option_context('display.float_format', '{:.2f}'.format):
#         display(df_learned_parameters.describe())

In [None]:
# if not df_learned_parameters_per_period.empty:
#     for learned_param in df_learned_parameters_per_period:
#         if df_learned_parameters_per_period[learned_param].notna().any():
#             Plot.nfh_property_per_id_boxplot(df_learned_parameters_per_period, property_col=learned_param)

In [None]:
# if not df_learned_parameters_per_period.empty:
#     # merge learned data into building metadata
#     df_bldng_data = df_bldng_data.merge(df_learned_parameters[[prop for prop in df_learned_parameters.columns if prop.startswith('learned_')]],
#                                         how='left',
#                                         left_on='id',
#                                         right_index=True
#                                        )

In [None]:
# with pd.option_context('display.float_format', '{:.2f}'.format):
#     display(df_bldng_data.T)

In [None]:
# df_bldng_data.to_parquet(home_metadata_file_path, index=True, engine='pyarrow')

#### Process predicted properties

In [None]:
# # Merge predicted properties back into the main DataFrame
# if not df_predicted_properties.empty:
#     df_prep = df_prep.drop(columns=df_prep.columns.intersection(df_predicted_properties.columns))
#     df_prep = df_prep.merge(df_predicted_properties, left_index=True, right_index=True, how="left")

In [None]:
# df_prep.columns

## Learn heat distribution characteristics

In [None]:
# Maps each property name used by the learning algorithm to the df_prep column that supplies it.
# Optionally change this selection to use other df_prep columns as source.
property_sources = {
    # indoor temperature, setpoint and comfort
    'temp_indoor__degC': 'batch_import_remeha_temp_indoor__degC',
    'temp_set__degC': 'batch_import_remeha_temp_set__degC',
    'comfortable__bool': 'comfortable__bool',
    # KNMI batch-import columns
    'temp_outdoor__degC': 'batch_import_KNMI_temp_outdoor__degC',
    'wind__m_s_1': 'batch_import_KNMI_wind__m_s_1',
    'sol_ghi__W_m_2': 'batch_import_KNMI_sol_ghi__W_m_2',
    # gas use, efficiency and electricity
    'g_use_ch_hhv__W': 'g_use_fan_ch_hhv__W',
    'eta_ch_hhv__W0': 'eta_ch_hhv__W0',
    'g_use_dhw_hhv__W': 'g_use_fan_dhw_hhv__W',
    'e__W': 'device_p1-reader_e__W',
    # living-room device columns
    'occupancy__p': 'device_living_room_occupancy__p',
    'co2_indoor__ppm': 'device_living_room_co2_indoor__ppm',
    # flow and return temperatures
    'temp_flow__degC': 'batch_import_remeha_temp_flow__degC',
    'temp_ret__degC': 'batch_import_remeha_temp_ret__degC',
    # choose between interpolated or not depending on needs of the analysis algorithm
    # 'temp_flow_ch__degC': 'batch_import_remeha_temp_flow_ch__degC',
    # 'temp_ret_ch__degC': 'batch_import_remeha_temp_ret_ch__degC',
    'temp_flow_ch__degC': 'interpolated_batch_import_remeha_temp_flow_ch__degC',
    'temp_ret_ch__degC': 'interpolated_batch_import_remeha_temp_ret_ch__degC',
    'temp_flow_ch_max__degC': 'batch_import_remeha_temp_flow_ch_max__degC',
    # fan and pump
    'fan_rotations__min_1': 'batch_import_remeha_fan_rotations__min_1',
    'fan_speed__pct': 'fan_speed__pct',
    'flow_dstr_pump_speed__pct': 'batch_import_remeha_flow_dstr_pump_speed__pct',
    # added to get mae and rmse values
    'temp_dstr__degC': 'temp_dstr__degC',
    # columns with a 'predicted_' prefix, heat and flow
    'ventilation__dm3_s_1': 'predicted_ventilation__dm3_s_1',
    'temp_flow_ch_set__degC': 'predicted_temp_flow_ch_set__degC',
    'heat_ch__W': 'heat_ch__W',
    'flow_dstr__dm3_s_1': 'flow_dstr__dm3_s_1',
}

In [None]:
%%time
df_prep['delta_dstr']=df_prep[property_sources['temp_dstr__degC']]-df_prep[property_sources['temp_indoor__degC']]
df_prep['delta_inout']=df_prep[property_sources['temp_indoor__degC']]-df_prep[property_sources['temp_outdoor__degC']]

In [None]:
# Summary statistics of both temperature differences, one row per difference
df_prep[['delta_dstr', 'delta_inout']].describe().transpose()

In [None]:
# Summary statistics of both temperature differences per id, one column per id
df_prep[['delta_dstr', 'delta_inout']].groupby('id').describe().transpose()

In [None]:
# Summary statistics of the column that supplies heat_ch__W
df_prep.loc[:, property_sources['heat_ch__W']].describe()

In [None]:
# Properties required as input for learning the heat distribution model
req_props_dstr = {
    'temp_indoor__degC',
    'temp_flow_ch__degC',
    'temp_ret_ch__degC',
    'heat_ch__W',
    # 'flow_dstr_pump_speed__pct',
    # 'flow_dstr__dm3_s_1',
}

# Parameters that are learned
learn_params_dstr = {
    'heat_tr_dstr__W_K_1',
    'th_mass_dstr__Wh_K_1',
    'th_inert_dstr__h',
    # 'flow_dstr_capacity__dm3_s_1',
    # 'flow_dstr_resistance__Pa_dm_6_s2',  # this can only be learnt if df_bldng_data['pump_head__m'] is available
}

# Hints per parameter, all taken from (or derived from) the *_nl_avg__* average values
param_hints_dstr = dict(
    heat_tr_dstr__W_K_1=heat_tr_dstr_nl_avg__W_K_1,
    th_mass_dstr__Wh_K_1=th_mass_dstr_nl_avg__Wh_K_1,
    # thermal inertia hint: ratio of the thermal mass hint and the heat transfer hint
    th_inert_dstr__h=th_mass_dstr_nl_avg__Wh_K_1 / heat_tr_dstr_nl_avg__W_K_1,
    flow_dstr_capacity__dm3_s_1=flow_dstr_capacity_nl_avg__dm3_s_1,
)

# Names of the learned parameters, carrying the 'learned_' prefix
learned_params_dstr = {f'learned_{name}' for name in learn_params_dstr}

# Properties to be predicted
predict_props_dstr = {'temp_ret_ch__degC', 'temp_dstr__degC'}

# Minimum duration, in minutes
minimum_duration__min = 20
# minimum_duration__min = 5

# Length of the learning period, in days
learn_period__d = 7


In [None]:
# Source columns that supply the properties required by the heat distribution model
req_cols_dstr = {column for prop, column in property_sources.items() if prop in req_props_dstr}

In [None]:
# Summary statistics of the required source columns over all rows
df_prep[list(req_cols_dstr)].describe().transpose()

In [None]:
# Summary statistics of the required source columns, restricted to rows selected by boiler_burning_mask
df_prep[list(req_cols_dstr)][boiler_burning_mask].describe().transpose()

In [None]:
# Summary statistics of the required source columns, restricted to rows selected by boiler_ch_mask
df_prep[list(req_cols_dstr)][boiler_ch_mask].describe().transpose()

In [None]:
%%time
%autoreload 2
# Learn the heat distribution parameters with the Model.heat_distribution system model.
# The call yields two results: the learned parameters per learning period and the predicted properties.
# The minimum duration (in minutes) is passed on as a timedelta via duration_threshold.
df_learned_parameters_per_period, df_predicted_properties = Learner.learn_system_parameters(
    df_prep,
    df_bldng_data=df_bldng_data,
    system_model_fn = Model.heat_distribution,
    job_identification_fn = Learner.valid_learn_list,
    property_sources=property_sources,
    learn_params=learn_params_dstr,
    param_hints=param_hints_dstr,
    predict_props=predict_props_dstr,
    req_props=req_props_dstr,
    duration_threshold=timedelta(minutes=minimum_duration__min),
    learn_period__d=learn_period__d,
    max_periods=max_periods, # set max_periods to a value other than None, e.g. 100 during testing of learning algorithms
)

#### Process learned parameters

In [None]:
# Report for how many unique ids parameters were learned, out of all unique ids in df_prep
if df_learned_parameters_per_period.empty:
    n_homes_learned = 0
else:
    n_homes_learned = len(df_learned_parameters_per_period.index.get_level_values('id').unique())
n_homes_total = len(df_prep.index.get_level_values('id').unique())
print(f"We learned parameters for {n_homes_learned} out of {n_homes_total} homes")

In [None]:
# Summarise the error columns (names starting with 'mae_' or 'rmse_'), if any, with two decimals
error_cols = [col for col in df_learned_parameters_per_period.columns if col.startswith(('mae_', 'rmse_'))]
if error_cols:
    with pd.option_context('display.float_format', '{:.2f}'.format):
        display(df_learned_parameters_per_period[error_cols].describe())

In [None]:
# Show the learned parameters per learning period
df_learned_parameters_per_period

In [None]:
# Determine learned parameters per id: the median over all learning periods
# whose 'duration' index level is at least threshold_duration
threshold_duration = pd.Timedelta(minutes=20)
if df_learned_parameters_per_period.empty:
    # Nothing was learned: skip the index lookup (which may fail on an empty result without a
    # 'duration' level) and avoid reusing a df_learned_parameters left over from an earlier learning run
    df_learned_parameters = pd.DataFrame()
else:
    duration_ok = df_learned_parameters_per_period.index.get_level_values('duration') >= threshold_duration
    df_learned_parameters = df_learned_parameters_per_period[duration_ok].groupby('id').median()

In [None]:
# Summary statistics of the learned parameters, shown with two decimals (only when something was learned)
with pd.option_context('display.float_format', '{:.2f}'.format):
    if not df_learned_parameters_per_period.empty:
        display(df_learned_parameters.describe())

In [None]:
# One boxplot per id for every column of the per-period results (only when something was learned)
if not df_learned_parameters_per_period.empty:
    for column_name in df_learned_parameters_per_period.columns:
        Plot.nfh_property_per_id_boxplot(df_learned_parameters_per_period, property_col=column_name)

In [None]:
if not df_learned_parameters_per_period.empty:
    # merge learned data into building metadata
    learned_cols = [prop for prop in df_learned_parameters.columns if prop.startswith('learned_')]
    # Drop same-named columns first (e.g. left by an earlier run of this cell or loaded from a previously
    # saved metadata file); otherwise merge() would keep both versions with _x/_y suffixes.
    # NB this replaces earlier learned values of these parameters, also for ids that were not learned in this run.
    df_bldng_data = df_bldng_data.drop(columns=df_bldng_data.columns.intersection(learned_cols))
    df_bldng_data = df_bldng_data.merge(df_learned_parameters[learned_cols],
                                        how='left',
                                        left_on='id',
                                        right_index=True
                                       )

In [None]:
# Show the building metadata transposed, with two decimals
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_bldng_data.transpose())

In [None]:
# Write the building metadata, including its index, to the home metadata parquet file
df_bldng_data.to_parquet(home_metadata_file_path, index=True, engine='pyarrow')

#### Process predicted properties

In [None]:
# Merge predicted properties back into the main DataFrame, replacing columns that carry the same name
if not df_predicted_properties.empty:
    overlapping_cols = df_prep.columns.intersection(df_predicted_properties.columns)
    df_prep = df_prep.drop(columns=overlapping_cols).merge(
        df_predicted_properties,
        how="left",
        left_index=True,
        right_index=True,
    )

In [None]:
# Show the columns of df_prep after merging the predicted properties
df_prep.columns

### Visualize distribution of learn duration for distribution system parameters

In [None]:
# Keep the heat distribution results under their own name (plain assignment: same object, not a copy)
df_dstr_results_per_period = df_learned_parameters_per_period

In [None]:
# Show the first five rows of the heat distribution results
df_dstr_results_per_period.head(n=5)

### Compare heat distribution parameters for homes with and without floor heating

In [None]:
%%time
# Merging info about floor heating from df_homes_boilers
# The check on the index names makes this cell safe to re-run: the merge is skipped
# once 'floor_heating__bool' has been appended as an index level.
if 'floor_heating__bool' not in df_dstr_results_per_period.index.names:
    # Left merge on 'id' against the index of df_homes_boilers, then append the merged column to the index
    df_dstr_results_per_period= df_dstr_results_per_period.merge(df_homes_boilers[['floor_heating__bool']],
                                                                 how='left',
                                                                 left_on='id',
                                                                 right_index=True
                                                                 ).set_index('floor_heating__bool', append=True)

In [None]:
# Median, mean and standard deviation of every result column, grouped by floor heating (two decimals)
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_dstr_results_per_period.groupby(['floor_heating__bool']).agg(['median', 'mean', 'std']).transpose())

In [None]:
# Show the hint for the heat dissipation capacity of the heat distribution system
# (the value passed as 'heat_tr_dstr__W_K_1' in param_hints_dstr)
heat_tr_dstr_nl_avg__W_K_1

In [None]:
# Show the hint for the thermal mass of the heat distribution system
# (the value passed as 'th_mass_dstr__Wh_K_1' in param_hints_dstr)
th_mass_dstr_nl_avg__Wh_K_1

In [None]:
# Show the hint for the thermal inertia of the heat distribution system:
# the thermal mass hint divided by the heat dissipation capacity hint
th_mass_dstr_nl_avg__Wh_K_1 / heat_tr_dstr_nl_avg__W_K_1

In [None]:
# Median of every result column, grouped by floor heating (two decimals)
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_dstr_results_per_period.groupby(['floor_heating__bool']).median().transpose())

In [None]:
# Median of every result column, grouped by floor heating and id (two decimals)
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_dstr_results_per_period.groupby(['floor_heating__bool', 'id']).median().transpose())

In [None]:
# Mean of every result column, grouped by floor heating and id (two decimals)
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_dstr_results_per_period.groupby(['floor_heating__bool', 'id']).mean().transpose())

In [None]:
# Show the names of the learned heat distribution parameters
learned_params_dstr

In [None]:
# Boxplot per id of every learned parameter, using only periods that lasted at least threshold_duration
for learned_param in learned_params_dstr:
    long_enough = df_dstr_results_per_period.index.get_level_values('duration') >= threshold_duration
    Plot.nfh_property_per_id_boxplot(df_dstr_results_per_period[long_enough], property_col=learned_param)

In [None]:
%autoreload 2
for learned_param in learned_params_dstr:
    Plot.nfh_property_grouped_boxplot(df_dstr_results_per_period[df_dstr_results_per_period.index.get_level_values('duration') >= threshold_duration],
                                      property_col=learned_param,
                                      groupby_level='floor_heating__bool'
                                     )

### Inspect influence of duration minimum

In [None]:
%%time
# Initialize a dictionary to store results
results = {'duration': []}

# Dynamically add keys for each parameter
for learned_param in learned_params_dstr:
    results[f'median_{learned_param}'] = []
    results[f'weighted_mean_{learned_param}'] = []
    results[f'weighted_std_{learned_param}'] = []


# Loop over duration thresholds
for threshold in range(minimum_duration__min, 241, 5):  # From 15 to 240, steps of 5
    # Filter DataFrame based on duration__min
    filtered_df = df_dstr_results_per_period[
        df_dstr_results_per_period.index.get_level_values('duration') >= pd.Timedelta(minutes=threshold)
    ]
    
    if not filtered_df.empty:
        # Extract weights (duration__min)
        weights = filtered_df.index.get_level_values('duration').total_seconds() / 60

        # Loop through the properties
        for learned_param in learned_params_dstr:
            values = filtered_df[learned_param]
            # Calculate median
            median = np.median(values)
            # Calculate weighted mean
            weighted_mean = np.average(values, weights=weights)
            # Calculate weighted standard deviation
            weighted_std = np.sqrt(np.average((values - weighted_mean) ** 2, weights=weights))
            # Store results
            results[f'median_{learned_param}'].append(median)
            results[f'weighted_mean_{learned_param}'].append(weighted_mean)
            results[f'weighted_std_{learned_param}'].append(weighted_std)

        # Store the threshold
        results['duration'].append(threshold)

# Convert results dictionary to a DataFrame for plotting
results_df = pd.DataFrame(results)

#### Inspect influence of the minimum duration on median, weighted mean and weighted standard deviation

In [None]:
# Plot how the median, weighted mean and weighted standard deviation of every learned
# parameter depend on the minimum duration threshold (one line per metric and parameter).

# Start from a fresh figure. NB no plt.clf() beforehand: that call acts on whichever figure is
# current (or creates an extra empty one when there is none), not on the figure created here.
plt.figure(figsize=(10, 6))
for learned_param in learned_params_dstr:
    for metric in ['median', 'weighted_mean', 'weighted_std']:
        plt.plot(
            results_df['duration'],
            results_df[f"{metric}_{learned_param}"],
            '.--',
            label=f"{metric}__{learned_param}",
        )

# Customize plot
plt.xlabel('Minimum Duration (minutes)')
plt.ylabel('Median / Weighted Mean / Weighted Standard Deviation')
plt.title('Impact of Minimum Duration on Learned Parameters')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

# Show plot
plt.show()

In [None]:
# For every learned parameter: one figure with, per id, the duration-weighted mean (line)
# and the duration-weighted standard deviation (shaded band) as a function of the minimum duration.

# select ids (the selection and the colours do not depend on the parameter, so they are determined once)
# all ids learned: only uncomment the line below
unique_ids = df_dstr_results_per_period.index.get_level_values('id').unique()
# ids learned WITHOUT floor heating: only uncomment the line below
# unique_ids = [id for id in df_dstr_results_per_period.index.get_level_values('id').unique() if df_bldng_data.loc[id]['floor_heating__bool'] == False]
# ids learned WITH floor heating: only uncomment the line below
# unique_ids = [id for id in df_dstr_results_per_period.index.get_level_values('id').unique() if df_bldng_data.loc[id]['floor_heating__bool'] == True]

# Prepare a colormap for distinct IDs
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_ids)))

# Minimum-duration thresholds in minutes: from minimum_duration__min up to and including 240, in steps of 1
duration_thresholds = range(minimum_duration__min, 241, 1)

for learned_parameter in learned_params_dstr:

    # Start a new figure for this learned parameter
    plt.figure(figsize=(12, 8))

    # Iterate over unique IDs to create separate lines per ID
    for id_value, color in zip(unique_ids, colors):
        # Filter the DataFrame for the current ID
        df_id = df_dstr_results_per_period.loc[df_dstr_results_per_period.index.get_level_values('id') == id_value]

        # Thresholds that yielded data, with the matching weighted means and weighted stds
        thresholds = []
        weighted_means = []
        weighted_stds = []

        # Loop over duration thresholds
        for threshold in duration_thresholds:
            # Keep only the periods whose 'duration' index level is at least the threshold
            filtered_df = df_id[df_id.index.get_level_values('duration') >= pd.Timedelta(minutes=threshold)]

            if not filtered_df.empty:
                # Weights: the duration of each period, in minutes
                weights = filtered_df.index.get_level_values('duration').total_seconds() / 60

                # Compute weighted mean and std for the chosen learned parameter
                values = filtered_df[learned_parameter]

                weighted_mean = np.average(values, weights=weights)
                weighted_std = np.sqrt(np.average((values - weighted_mean)**2, weights=weights))

                # Append the results
                thresholds.append(threshold)
                weighted_means.append(weighted_mean)
                weighted_stds.append(weighted_std)

        # Plot the weighted mean for the current ID
        plt.plot(thresholds, weighted_means, '.--', color=color, label=f'ID: {id_value}')

        # Add the shaded region for the weighted standard deviation
        plt.fill_between(
            thresholds,
            np.array(weighted_means) - np.array(weighted_stds),
            np.array(weighted_means) + np.array(weighted_stds),
            color=color,
            alpha=0.2,
        )

    # Customize the plot
    plt.xlabel('Minimum Duration (minutes)')
    plt.ylabel(f'Learned {learned_parameter}')
    plt.title('Impact of Minimum Duration on Weighted Mean and StDev per ID')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='ID')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    # Show the plot
    plt.show()


#### Inspect influence of duration minimum on median and interquartile range

In [None]:
for learned_parameter in learned_params_dstr:
    # For each learned parameter: plot, per id, how the median and the interquartile range (Q1..Q3)
    # of the learned values change when periods shorter than a minimum duration are excluded.

    #select ids
    # all ids learned: only uncomment the line below
    unique_ids = df_dstr_results_per_period.index.get_level_values('id').unique()
    # ids learned WITHOUT floor heating: only uncomment the line below
    # unique_ids = [id for id in df_dstr_results_per_period.index.get_level_values('id').unique() if df_bldng_data.loc[id]['floor_heating__bool'] == False]
    # ids learned WITH floor heating: only uncomment the line below
    # unique_ids = [id for id in df_dstr_results_per_period.index.get_level_values('id').unique() if df_bldng_data.loc[id]['floor_heating__bool'] == True]

    # Prepare one color per id, sampled evenly from the tab20 colormap
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_ids)))

    # Start a new figure for this learned parameter
    plt.figure(figsize=(12, 8))

    # Iterate over unique IDs to create a separate line per ID
    for id_value, color in zip(unique_ids, colors):
        # Filter the DataFrame for the current ID
        df_id = df_dstr_results_per_period.loc[df_dstr_results_per_period.index.get_level_values('id') == id_value]

        # The period durations do not depend on the threshold, so look them up only once per ID
        durations = df_id.index.get_level_values('duration')

        # Initialize lists for thresholds, medians and quartiles
        thresholds = []
        medians = []
        Q1_values = []
        Q3_values = []

        # Loop over duration thresholds: from minimum_duration__min up to and including 240 minutes, in steps of 1
        for threshold in range(minimum_duration__min, 241):
            # Keep only the values learned over periods that last at least `threshold` minutes
            values = df_id.loc[durations >= pd.Timedelta(minutes=threshold), learned_parameter]

            # Thresholds for which no period remains are left out of the plot
            if not values.empty:
                # Compute median and quartiles for the chosen learned parameter
                thresholds.append(threshold)
                medians.append(np.median(values))
                Q1_values.append(np.quantile(values, 0.25))
                Q3_values.append(np.quantile(values, 0.75))

        # Plot the median for the current ID
        plt.plot(thresholds, medians, '.--', color=color, label=f'ID: {id_value}')

        # Add the shaded region for the interquartile range (Q1 to Q3)
        plt.fill_between(
            thresholds,
            np.array(Q1_values),
            np.array(Q3_values),
            color=color,
            alpha=0.2,
        )

    # Customize the plot
    plt.xlabel('Minimum Duration (minutes)')
    plt.ylabel(f'Learned {learned_parameter}')
    plt.title('Impact of Minimum Duration on Median and Interquartile Range per ID')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='ID')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()

    # Show the plot
    plt.show()


### Calculate flow capacity and flow resistance

In [None]:
columns_to_check = ['flow_dstr__dm3_s_1', property_sources['flow_dstr_pump_speed__pct'], 'flow_dstr_capacity__dm3_s_1']

In [None]:
# Add flow_dstr_capacity__dm3_s_1 to df_prep: the measured flow scaled to 100% pump speed.
# Pump speeds that are not positive are masked as NaN first: dividing by a pump speed of 0 would
# yield inf (or NaN for 0/0), which distorts statistics such as max() and median() of this column.
pump_speed__0 = df_prep[property_sources['flow_dstr_pump_speed__pct']] / 100
df_prep['flow_dstr_capacity__dm3_s_1'] = (
    df_prep['flow_dstr__dm3_s_1'] / pump_speed__0.where(pump_speed__0 > 0)
)

In [None]:
df_prep.groupby('id')['flow_dstr_capacity__dm3_s_1'].max()

In [None]:
# Group by id and calculate medians for flow_dstr_capacity__dm3_s_1
# flow_capacity_median = df_prep.groupby('id')['flow_dstr_capacity__dm3_s_1'].median() # this did not work for some weird reason
# Workaround: compute the median of each id's values via apply() instead of the built-in groupby median
flow_capacity_median = df_prep.groupby('id')['flow_dstr_capacity__dm3_s_1'].apply(lambda x: x.median())

In [None]:
with pd.option_context('display.float_format', '{:.6f}'.format):
    display(flow_capacity_median.to_frame())

In [None]:
# Update df_bldng_data with learned flow capacities
# df_bldng_data['learned_flow_dstr_capacity__dm3_s_1'] = flow_capacity_median

In [None]:
# # Calculate flow_dstr_resistance__Pa_dm_6_s2 for each row in df_prep
# df_prep['flow_dstr_resistance__Pa_dm_6_s2'] = (
#     water_density__kg_dm_3(df_prep[property_sources['temp_ret_ch__degC']], heat_dstr_nl_avg_abs__Pa)
#     * g__m_s_2
#     * df_bldng_data.loc[df_prep.index.get_level_values('id'), 'pump_head__m'].values
#     / (df_prep['flow_dstr_capacity__dm3_s_1'] ** 2 / dm3_m_3)
# )

In [None]:
# # Group by id and calculate medians for flow_dstr_resistance__Pa_dm_6_s2
# flow_resistance_median = df_prep.groupby('id')['flow_dstr_resistance__Pa_dm_6_s2'].median()

In [None]:
# # Update df_bldng_data with learned flow resistances
# df_bldng_data['flow_dstr_resistance__Pa_dm_6_s2'] = flow_resistance_median

## Learn ventilation characteristics

In [None]:
# optionally change the selection which property columns in df_prep are used as properties needed by the learning algorithm
property_sources = {
    'temp_indoor__degC':        'batch_import_remeha_temp_indoor__degC',
    'temp_set__degC':           'batch_import_remeha_temp_set__degC',
    'comfortable__bool':        'comfortable__bool',
    'temp_outdoor__degC':       'batch_import_KNMI_temp_outdoor__degC',
    'wind__m_s_1':              'batch_import_KNMI_wind__m_s_1',
    'sol_ghi__W_m_2':           'batch_import_KNMI_sol_ghi__W_m_2', 
    'g_use_ch_hhv__W':          'g_use_fan_ch_hhv__W',
    'eta_ch_hhv__W0':           'eta_ch_hhv__W0', 
    'g_use_dhw_hhv__W':         'g_use_fan_dhw_hhv__W',
    'e__W':                     'device_p1-reader_e__W',    
    'occupancy__p':             'device_living_room_occupancy__p',
    'co2_indoor__ppm':          'device_living_room_co2_indoor__ppm',
    'temp_flow__degC':          'batch_import_remeha_temp_flow__degC',
    'temp_ret__degC':           'batch_import_remeha_temp_ret__degC',
    'temp_flow_ch__degC':       'batch_import_remeha_temp_flow_ch__degC', # choose between interpolated or not depending on needs of the analysis algorithm
    'temp_ret_ch__degC':        'batch_import_remeha_temp_ret_ch__degC',  # choose between interpolated or not depending on needs of the analysis algorithm
    # 'temp_flow_ch__degC':       'interpolated_batch_import_remeha_temp_flow_ch__degC', # choose between interpolated or not depending on needs of the analysis algorithm
    # 'temp_ret_ch__degC':        'interpolated_batch_import_remeha_temp_ret_ch__degC',  # choose between interpolated or not depending on needs of the analysis algorithm
    'temp_flow_ch_max__degC':   'batch_import_remeha_temp_flow_ch_max__degC', 
    'fan_rotations__min_1':     'batch_import_remeha_fan_rotations__min_1', 
    'fan_speed__pct':           'fan_speed__pct', 
    'flow_dstr_pump_speed__pct':'batch_import_remeha_flow_dstr_pump_speed__pct',
    'temp_dstr__degC':          'temp_dstr__degC', # added to get mae and rmse values
    'ventilation__dm3_s_1':     'predicted_ventilation__dm3_s_1', 
    'temp_flow_ch_set__degC':   'predicted_temp_flow_ch_set__degC',
    'heat_ch__W':               'heat_ch__W',
}

In [None]:
req_props_vent ={
    'co2_indoor__ppm',
    'occupancy__p',
    'wind__m_s_1'
}

req_cols_vent = {property_sources[prop] for prop in req_props_vent & property_sources.keys()}

learn_params_vent = {
    'aperture_inf_vent__cm2',
}

learned_params_vent = {'learned_' + param for param in learn_params_vent}

predict_props_vent = {
    'ventilation__dm3_s_1'
}

param_hints_vent = {
    'wind_chill__K_s_m_1':          wind_chill_nl_avg__K_s_m_1,          # wind chill factor
    'aperture_inf__cm2':            aperture_inf_nl_avg__cm2,            # effective infiltration area 
    'occupancy__p':                 occupancy_nl_avg__p,                 # house occupancy
    'ventilation_default__dm3_s_1': 7.0,                                 # default ventilation rate for the entire home
    'ventilation_max__dm3_s_1_m_2': 1.0,                                 # maximum ventilation rate per m2 floor area
    'co2_outdoor__ppm':             co2_outdoor_eu_avg_2022__ppm,        # average CO₂ outdoor concentration
}

In [None]:
req_props_vent

In [None]:
req_cols_vent

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=[property_sources[prop] for prop in req_props_vent], freq='1W', title_fontsize=8)

In [None]:
# Length of each learning period, in days
# learn_period__d = 3
learn_period__d = 7
# NOTE(review): both max_periods assignments below are commented out, while the learner call in this
# section passes max_periods=max_periods; it therefore relies on a value assigned earlier in the notebook
# (not visible here) -- confirm, or uncomment one of the lines below.
# max_periods = 10
# max_periods = None

# Minimum duration in minutes; passed to the learner as duration_threshold
minimum_duration__min = 60

In [None]:
%%time
%autoreload 2
df_learned_parameters_per_period, df_predicted_properties = Learner.learn_system_parameters(
    df_prep,
    df_bldng_data=df_bldng_data,
    system_model_fn = Model.ventilation,
    job_identification_fn = Learner.periodic_learn_list,
    property_sources=property_sources,
    learn_params=learn_params_vent,
    param_hints=param_hints_vent,
    predict_props=predict_props_vent,
    req_props=req_props_vent,
    duration_threshold=timedelta(minutes=minimum_duration__min),
    learn_period__d=learn_period__d,
    max_periods=max_periods, # set max_periods to a value other than None, e.g. 100 during testing of learning algorithms
    # max_iter=10,
)


In [None]:
df_prep[property_sources['co2_indoor__ppm']].groupby('id').count()

In [None]:
df_predicted_properties['predicted_ventilation__dm3_s_1'].groupby('id').count()

In [None]:
%%time

# Postprocess ventilation values
def resample_and_fill(group, column='predicted_ventilation__dm3_s_1', freq='1min', limit=30):
    """Put the rows of one id on a regular time grid and forward-fill short gaps in one column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single id, indexed by a MultiIndex with an 'id' level and a datetime level.
    column : str, default 'predicted_ventilation__dm3_s_1'
        Column in which gaps are forward-filled; all other columns are left untouched.
    freq : str, default '1min'
        Frequency of the regular time grid.
    limit : int, default 30
        Maximum number of consecutive missing values that is forward-filled; longer gaps
        remain NaN beyond the first `limit` rows.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame indexed by time only (the 'id' level is dropped), with one row per
        `freq` step between the first and the last timestamp of the group.
    """
    # Resampling needs a plain datetime index, so drop the 'id' level first
    group = group.droplevel('id')
    # asfreq() inserts rows (filled with NaN) for the time steps that were missing
    group = group.resample(freq).asfreq()
    group[column] = group[column].ffill(limit=limit)
    return group

# Apply the function per 'id'
df_filled = (
    df_predicted_properties
    .groupby(level='id')
    .apply(resample_and_fill)
)

df_predicted_properties = df_filled.reset_index().set_index(['id', 'timestamp'])

#### Process learned parameters

In [None]:
print(f"We learned parameters for {0 if df_learned_parameters_per_period.empty else len(df_learned_parameters_per_period.index.get_level_values('id').unique())} out of {len(df_prep.index.get_level_values('id').unique())} homes")

In [None]:
# Summarize the error metrics (columns starting with 'mae_' or 'rmse_'), if there are any
error_cols = [col for col in df_learned_parameters_per_period.columns if col.startswith(('mae_', 'rmse_'))]
if error_cols:
    with pd.option_context('display.float_format', '{:.2f}'.format):
        display(df_learned_parameters_per_period[error_cols].describe())

In [None]:
df_learned_parameters_per_period

In [None]:
# Determine learned parameters 
if not df_learned_parameters_per_period.empty:
    df_learned_parameters = df_learned_parameters_per_period.groupby('id').median()

In [None]:
if not df_learned_parameters_per_period.empty:
    with pd.option_context('display.float_format', '{:.2f}'.format):
        display(df_learned_parameters.describe())

In [None]:
if not df_learned_parameters_per_period.empty:
    # merge learned data into building metadata
    learned_cols = [prop for prop in df_learned_parameters.columns if prop.startswith('learned_')]
    # First drop learned columns that are already present in df_bldng_data (e.g. from a previous run of
    # this cell); otherwise the merge would add '_x'/'_y' suffixed duplicates instead of refreshing the values.
    df_bldng_data = df_bldng_data.drop(columns=df_bldng_data.columns.intersection(learned_cols))
    df_bldng_data = df_bldng_data.merge(df_learned_parameters[learned_cols],
                                        how='left',
                                        left_on='id',
                                        right_index=True
                                       )

In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_bldng_data.T)

In [None]:
df_bldng_data.to_parquet(home_metadata_file_path, index=True, engine='pyarrow')

#### Process predicted properties

In [None]:
print(f"We predicted properties for {0 if df_predicted_properties.empty else len(df_predicted_properties.index.get_level_values('id').unique())} out of {len(df_prep.index.get_level_values('id').unique())} homes")

In [None]:
Plot.plot_missing_data_overview(df_predicted_properties, properties_include=[property_sources[prop] for prop in predict_props_vent], freq='1W', title_fontsize=8)

In [None]:
# Bring the predicted properties into df_prep, replacing columns with the same name from earlier runs
if not df_predicted_properties.empty:
    overlapping_cols = df_prep.columns.intersection(df_predicted_properties.columns)
    df_prep = df_prep.drop(columns=overlapping_cols).merge(
        df_predicted_properties,
        how="left",
        left_index=True,
        right_index=True,
    )

In [None]:
df_prep.columns

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=[property_sources[prop] for prop in predict_props_vent], freq='1W', title_fontsize=8)

## Learn building parameters

In [None]:
# optionally change the selection which property columns in df_prep are used as properties needed by the learning algorithm
property_sources = {
    'temp_indoor__degC':        'batch_import_remeha_temp_indoor__degC',
    'temp_set__degC':           'batch_import_remeha_temp_set__degC',
    'comfortable__bool':        'comfortable__bool',
    'temp_outdoor__degC':       'batch_import_KNMI_temp_outdoor__degC',
    'wind__m_s_1':              'batch_import_KNMI_wind__m_s_1',
    'sol_ghi__W_m_2':           'batch_import_KNMI_sol_ghi__W_m_2', 
    'g_use_ch_hhv__W':          'g_use_fan_ch_hhv__W',
    'eta_ch_hhv__W0':           'eta_ch_hhv__W0', 
    'g_use_dhw_hhv__W':         'g_use_fan_dhw_hhv__W',
    'e__W':                     'device_p1-reader_e__W',    
    'occupancy__p':             'device_living_room_occupancy__p',
    'co2_indoor__ppm':          'device_living_room_co2_indoor__ppm',
    'temp_flow__degC':          'batch_import_remeha_temp_flow__degC',
    'temp_ret__degC':           'batch_import_remeha_temp_ret__degC',
    'temp_flow_ch__degC':       'batch_import_remeha_temp_flow_ch__degC', # choose between interpolated or not depending on needs of the analysis algorithm
    'temp_ret_ch__degC':        'batch_import_remeha_temp_ret_ch__degC',  # choose between interpolated or not depending on needs of the analysis algorithm
    # 'temp_flow_ch__degC':       'interpolated_batch_import_remeha_temp_flow_ch__degC', # choose between interpolated or not depending on needs of the analysis algorithm
    # 'temp_ret_ch__degC':        'interpolated_batch_import_remeha_temp_ret_ch__degC',  # choose between interpolated or not depending on needs of the analysis algorithm
    'temp_flow_ch_max__degC':   'batch_import_remeha_temp_flow_ch_max__degC', 
    'fan_rotations__min_1':     'batch_import_remeha_fan_rotations__min_1', 
    'fan_speed__pct':           'fan_speed__pct', 
    'flow_dstr_pump_speed__pct':'batch_import_remeha_flow_dstr_pump_speed__pct',
    'temp_dstr__degC':          'temp_dstr__degC', # added to get mae and rmse values
    'ventilation__dm3_s_1':     'predicted_ventilation__dm3_s_1', 
    'temp_flow_ch_set__degC':   'predicted_temp_flow_ch_set__degC',
    'heat_ch__W':               'heat_ch__W',
}

In [None]:
# use calulated flow setpoint from the thermostat, but if not learnt, set flow setpoint to 60 degC for learning boiler control
if property_sources['temp_flow_ch_set__degC'] not in df_prep.columns:
    print("temp_flow_ch_set__degC temperatures not learned; setting to 60 degrees")
    df_prep[property_sources['temp_flow_ch_set__degC']] = 60 

In [None]:
req_props_bldng = {
    'e__W',
    'g_use_ch_hhv__W',
    'eta_ch_hhv__W0', 
    'g_use_dhw_hhv__W',
    'occupancy__p',
    'sol_ghi__W_m_2',
    'wind__m_s_1',
    'temp_outdoor__degC',
    'temp_indoor__degC',
    'heat_ch__W', 
    # 'ventilation__dm3_s_1', 
    # 'co2_indoor__ppm',
    # 'fan_rotations__min_1',
    # 'fan_speed__pct',
    # 'flow_dstr_pump_speed__pct',
}

# Additional properties that are passed to the learner as helper_props (next to req_props_bldng).
# Each property is listed once; a set literal silently ignores repeated entries.
helper_props_bldng = {
    'comfortable__bool',
    'temp_set__degC',
    'temp_flow_ch_max__degC',
    'temp_flow_ch__degC',
    'temp_ret_ch__degC',
    'temp_flow_ch_set__degC',
    'temp_flow__degC',
    'temp_ret__degC',
    'temp_dstr__degC',
    'eta_ch_hhv__W0',
    # 'fan_speed__pct',
    # 'flow_dstr_pump_speed__pct',
}

learn_params_bldng = {
    'heat_tr_bldng_cond__W_K_1',
    'th_inert_bldng__h',
    'th_mass_bldng__Wh_K_1',
    'aperture_sol__m2',
    'aperture_inf__cm2',
    # 'heat_tr_dstr__W_K_1',
    # 'th_mass_dstr__J_K_1',
}

param_hints_bldng = {
    'heat_tr_bldng_cond__W_K_1':    heat_tr_bldng_nl_avg__W_K_1,         # specific heat loss of the building via conduction
    'th_inert_bldng__h':            th_inert_bldng_nl_avg__h,            # thermal inertia
    'aperture_sol__m2':             aperture_sol_nl_avg__m2,             # apparent solar aperture
    'wind_chill__K_s_m_1':          wind_chill_nl_avg__K_s_m_1,          # wind chill factor
    'aperture_inf__cm2':            aperture_inf_nl_avg__cm2,            # effective infiltration area 
    'occupancy__p':                 occupancy_nl_avg__p,                 # house occupancy
    'heat_int__W_p_1':              heat_int_nl_avg__W_p_1,              # heat gain per occupant
    'eta_ch_hhv__W0':               eta_ch_nl_avg_hhv__W0,               # home heating efficiency of a gas boiler (based on higher heating value)
    'eta_dhw_hhv__W0':              eta_dhw_nl_avg_hhv__W0,              # domestic hot water efficiency
    'frac_remain_dhw__0':           frac_remain_dhw_nl_avg__0,           # fraction of domestic hot water heat contributing to heating the home
    'g_use_cooking_hhv__W':         g_use_cooking_nl_avg_hhv__W,         # gas power (higher heating value) for cooking 
    'eta_cooking_hhv__W0':          eta_cooking_nl_avg_hhv__W0,          # cooking efficiency
    'frac_remain_cooking__0':       frac_remain_cooking_nl_avg__0,       # fraction of cooking heat contributing to heating the home
    'heat_tr_dstr__W_K_1':          heat_tr_dstr_nl_avg__W_K_1,          # heat dissipation capacity of the heat distribution system
    'th_mass_dstr__Wh_K_1':         th_mass_dstr_nl_avg__Wh_K_1,         # thermal mass of the heat distribution system
    'ventilation_default__dm3_s_1': 7.0,                                 # default ventilation rate for the entire home
    'ventilation_max__dm3_s_1_m_2': 1.0,                                 # maximum ventilation rate per m2 floor area
    'co2_outdoor__ppm':             co2_outdoor_eu_avg_2022__ppm,        # average CO₂ outdoor concentration
}

learned_params = {'learned_' + param for param in learn_params_bldng}

predict_props_bldng = {
    'temp_indoor__degC',
}


In [None]:
#define the length of the learning period
learn_period__d=7
# learn_period__d=3

#TO DO: check whether minimum_duration__min is really needed for building parameter learning, or whether there is some other threshold effective (make settable?)
minimum_duration__min = 60

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=[property_sources[prop] for prop in req_props_bldng & property_sources.keys()], freq='1W', title_fontsize=8)

In [None]:
df_prep[[property_sources[prop] for prop in req_props_bldng]].describe().T

In [None]:
%%time
%autoreload 2
df_learned_parameters_per_period, df_predicted_properties = Learner.learn_system_parameters(
    df_prep,
    df_bldng_data=df_bldng_data,
    system_model_fn = Model.building,
    job_identification_fn = Learner.periodic_learn_list,
    property_sources=property_sources,
    learn_params=learn_params_bldng,
    param_hints=param_hints_bldng,
    predict_props=predict_props_bldng,
    req_props=req_props_bldng,
    helper_props=helper_props_bldng,
    duration_threshold=timedelta(minutes=minimum_duration__min),
    learn_period__d=learn_period__d,
    max_periods=max_periods, # set max_periods to a value other than None, e.g. 100 during testing of learning algorithms
    # max_iter=10,
)

In [None]:
# Primary metric how well the training phase went.
# Guarded like the other result cells: without learned results (or without the MAE column)
# there is nothing to average.
if df_learned_parameters_per_period.empty or 'mae_temp_indoor__degC' not in df_learned_parameters_per_period.columns:
    print("No learned parameters available; MAE of the learning phase cannot be calculated")
else:
    print(f"MAE = {df_learned_parameters_per_period['mae_temp_indoor__degC'].mean():.2f} K (Mean Absolute Error of the learning phase)")

#### Process learned parameters

In [None]:
print(f"We learned parameters for {0 if df_learned_parameters_per_period.empty else len(df_learned_parameters_per_period.index.get_level_values('id').unique())} out of {len(df_prep.index.get_level_values('id').unique())} homes")

In [None]:
# Summarize the error metrics (columns starting with 'mae_' or 'rmse_'), if there are any
error_cols = [col for col in df_learned_parameters_per_period.columns if col.startswith(('mae_', 'rmse_'))]
if error_cols:
    with pd.option_context('display.float_format', '{:.2f}'.format):
        display(df_learned_parameters_per_period[error_cols].describe())

In [None]:
df_learned_parameters_per_period

In [None]:
# Determine learned parameters 
threshold_duration = timedelta(days=1)

df_learned_parameters = df_learned_parameters_per_period[df_learned_parameters_per_period.index.get_level_values('duration') >= threshold_duration].groupby('id').median()

In [None]:
df_learned_parameters

In [None]:
if not df_learned_parameters_per_period.empty:
    with pd.option_context('display.float_format', '{:.2f}'.format):
        display(df_learned_parameters.describe().T)

#### Calculate how well parameters were learned

In [None]:
# show essential statistics for the errors; all periods per home
df_learned_parameters_per_period[['mae_temp_indoor__degC', 'rmse_temp_indoor__degC']].groupby('id').describe().T

In [None]:
if not df_learned_parameters_per_period.empty:
    for learned_param in df_learned_parameters_per_period:
        Plot.nfh_property_per_id_boxplot(df_learned_parameters_per_period, property_col=learned_param)

In [None]:
df_bldng_data.T

In [None]:
if not df_learned_parameters_per_period.empty:
    # merge learned data into building metadata
    learned_cols = [prop for prop in df_learned_parameters.columns if prop.startswith('learned_')]
    # First drop learned columns that are already present in df_bldng_data (e.g. from a previous run of
    # this cell); otherwise the merge would add '_x'/'_y' suffixed duplicates instead of refreshing the values.
    df_bldng_data = df_bldng_data.drop(columns=df_bldng_data.columns.intersection(learned_cols))
    df_bldng_data = df_bldng_data.merge(df_learned_parameters[learned_cols],
                                        how='left',
                                        left_on='id',
                                        right_index=True
                                       )

In [None]:
# temporary solution
if 'learned_aperture_inf__cm2_y' in df_bldng_data.columns:
    df_bldng_data.rename(columns={'learned_aperture_inf__cm2_y': 'learned_aperture_inf__cm2'}, inplace=True)

In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_bldng_data.T)

In [None]:
# # Uncomment this code when needed, e.g. when multiple runs are done

# # Remove columns that end with '_x'
# df_bldng_data = df_bldng_data.loc[:, ~df_bldng_data.columns.str.endswith('_x')]

# # Rename columns to remove '_y' postfix
# df_bldng_data = df_bldng_data.rename(columns=lambda col: col[:-2] if col.endswith('_y') else col)


In [None]:
df_bldng_data.to_parquet(home_metadata_file_path, index=True, engine='pyarrow')

In [None]:
[f"learned_{col}" for col in (learn_params_bldng | learn_params_dstr)]

In [None]:
df_bldng_data.columns

In [None]:
# Overview of the learned parameters plus some building metadata; transposed, so that each
# row of df_bldng_data becomes a column.
overview_cols = [
    'learned_heat_tr_bldng_cond__W_K_1',
    'learned_th_mass_bldng__Wh_K_1',
    'learned_th_inert_bldng__h',
    'learned_aperture_sol__m2',
    'learned_aperture_inf__cm2',
    'learned_heat_tr_dstr__W_K_1',
    'learned_th_mass_dstr__Wh_K_1',
    'learned_th_inert_dstr__h',
    'usable_area__m2',
    'floor_heating__bool',
    'pump_head__m',
]
# Columns can be absent (e.g. when one of the learning sections was not run); report and skip
# them instead of failing with a KeyError.
missing_cols = [col for col in overview_cols if col not in df_bldng_data.columns]
if missing_cols:
    print(f"Columns not available in df_bldng_data (skipped): {missing_cols}")
df_learned_parameters_per_pseudonym = df_bldng_data[[col for col in overview_cols if col not in missing_cols]].T

In [None]:
df_learned_parameters_per_pseudonym

In [None]:
df_learned_parameters_per_pseudonym.to_excel('rhc_learned_parameters_per_pseudonym.xlsx', index=True)

#### Process predicted properties

In [None]:
# Bring the predicted properties into df_prep, replacing columns with the same name from earlier runs
if not df_predicted_properties.empty:
    overlapping_cols = df_prep.columns.intersection(df_predicted_properties.columns)
    df_prep = df_prep.drop(columns=overlapping_cols).merge(
        df_predicted_properties,
        how="left",
        left_index=True,
        right_index=True,
    )

In [None]:
df_prep.columns

#### Display learned averages

In [None]:
df_learned_parameters

In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_learned_parameters[[prop for prop in df_learned_parameters.columns if prop.startswith('avg_')]].mean().T)

In [None]:
df_prep['heat_ch__W'].mean()

In [None]:
(df_learned_parameters['avg_g_use_fan_ch_hhv__W']*df_learned_parameters['avg_eta_ch_hhv__W0'])

In [None]:
%%time 
# Move the index levels into regular columns, then make every datetime column timezone-naive
df_learned_parameters_per_period_no_tz = df_learned_parameters_per_period.reset_index(drop=False)
df_learned_parameters_per_period_no_tz = df_learned_parameters_per_period_no_tz.apply(
    lambda col: col.dt.tz_localize(None) if col.dtype.kind == 'M' else col
)

# Store 'duration' as a number of seconds instead of a timedelta
duration__s = df_learned_parameters_per_period_no_tz['duration'].dt.total_seconds()
df_learned_parameters_per_period_no_tz['duration'] = duration__s

# Write the result to Excel, without the (now meaningless) row index
df_learned_parameters_per_period_no_tz.to_excel(rhc_analysis_results_per_period_file, index=False)

In [None]:
%%time 
df_predicted_properties.to_parquet(rhc_analysis_results_file, index=True, engine='pyarrow')

# Scenario analysis

## Add correction factors to make periods comparable

In [None]:
# Work on a copy: a plain assignment would only give the same DataFrame a second name, so every
# column added to df_results_per_period would also show up in df_learned_parameters_per_period.
df_results_per_period = df_learned_parameters_per_period.copy()

In [None]:
# Calculate period length in seconds from the 'duration' level of the MultiIndex
df_results_per_period['duration__s'] = df_results_per_period.index.get_level_values('duration').total_seconds()

In [None]:
# Calculate total period duration per `id`
total_duration__s = df_results_per_period.groupby('id')['duration__s'].transform('sum')

In [None]:
# Calculate `period_avg_int_heat__K`, `period_avg_inf_chill__K` and `period_avg_vent_chill__K`:
# each is a period-average heat flow divided by `learned_heat_tr_bldng_cond__W_K_1`.
# Rows where that learned value is exactly 0 get NaN instead of the quotient.
heat_tr_bldng_cond__W_K_1 = df_results_per_period['learned_heat_tr_bldng_cond__W_K_1']

for quotient_col, heat_flow_col in [
    ('period_avg_int_heat__K', 'avg_heat_int__W'),
    ('period_avg_inf_chill__K', 'avg_heat_loss_bldng_inf__W'),
    ('period_avg_vent_chill__K', 'avg_heat_loss_bldng_vent__W'),
]:
    df_results_per_period[quotient_col] = np.where(
        heat_tr_bldng_cond__W_K_1 != 0,
        df_results_per_period[heat_flow_col] / heat_tr_bldng_cond__W_K_1,
        np.nan,
    )

In [None]:
# Calculate the weighted sums
df_results_per_period['int_heat__K_s'] = df_results_per_period['period_avg_int_heat__K'] * df_results_per_period['duration__s']
df_results_per_period['inf_chill__K_s'] = df_results_per_period['period_avg_inf_chill__K'] * df_results_per_period['duration__s']
df_results_per_period['vent_chill__K_s'] = df_results_per_period['period_avg_vent_chill__K'] * df_results_per_period['duration__s']
df_results_per_period['temp_outdoor__degC_s'] = df_results_per_period['avg_temp_outdoor__degC'] * df_results_per_period['duration__s']

In [None]:
# Calculate time-weighted averages by dividing the weighted sums by `total_duration__s`
df_results_per_period['all_periods_avg_int_heat__K'] = df_results_per_period.groupby('id')['int_heat__K_s'].transform('sum') / total_duration__s
df_results_per_period['all_periods_avg_inf_chill__K'] = df_results_per_period.groupby('id')['inf_chill__K_s'].transform('sum') / total_duration__s
df_results_per_period['all_periods_avg_vent_chill__K'] = df_results_per_period.groupby('id')['vent_chill__K_s'].transform('sum') / total_duration__s
df_results_per_period['all_periods_avg_temp_outdoor__degC'] = df_results_per_period.groupby('id')['temp_outdoor__degC_s'].transform('sum') / total_duration__s

In [None]:
# Calculate period heat demand
df_results_per_period['period_heat_demand__K'] = (
    df_results_per_period['avg_indoor_outdoor_delta__K'] +
    df_results_per_period['period_avg_inf_chill__K'] +
    df_results_per_period['period_avg_vent_chill__K'] -
    df_results_per_period['period_avg_int_heat__K']
).clip(lower=0)

In [None]:
# Calculate period corrected heat demand 
df_results_per_period['corrected_heat_demand__K'] = (
    df_results_per_period['avg_indoor_outdoor_delta__K'] +
    df_results_per_period['all_periods_avg_inf_chill__K'] +
    df_results_per_period['all_periods_avg_vent_chill__K'] -
    df_results_per_period['all_periods_avg_int_heat__K'] +
    (df_results_per_period['avg_temp_outdoor__degC'] - df_results_per_period['all_periods_avg_temp_outdoor__degC'])
).clip(lower=0)

In [None]:
# Calculate the correction factor, avoiding division by zero
df_results_per_period['energy_ch_correction_factor__0'] = np.where(
    df_results_per_period['period_heat_demand__K'] != 0,
    df_results_per_period['corrected_heat_demand__K'] / df_results_per_period['period_heat_demand__K'],
    1.0  # Default to 1.0 if period_heat_demand__K is zero to avoid division by zero
)

In [None]:
# Apply the correction factor to calculate corrected_avg_energy_ch__W and corrected_avg_co2_ch__g_s_1
df_results_per_period['corrected_avg_energy_ch__W'] = (
    df_results_per_period['avg_energy_ch__W'] * df_results_per_period['energy_ch_correction_factor__0']
)
df_results_per_period['corrected_avg_co2_ch__g_s_1'] = (
    df_results_per_period['avg_co2_ch__g_s_1'] * df_results_per_period['energy_ch_correction_factor__0']
)

In [None]:
with pd.option_context('display.float_format', '{:.2f}'.format):
    display(df_results_per_period.describe().T)


In [None]:
# Optionally drop intermediate columns for a clean final DataFrame
# df_results_per_period = df_results_per_period.drop(columns=['period_avg_int_heat__K', 'period_avg_inf_chill__K', 'period_avg_vent_chill__K',
#                                                             'int_heat__K_s', 'inf_chill__K_s', 'vent_chill__K_s', 'temp_outdoor__degC_s',
#                                                             'period_heat_demand__K', 'corrected_heat_demand__K'])

## Visualize learned parameters (boxplots per id to illustrate variability)

In [None]:
df_results_per_period[list(learned_params)].describe().T

In [None]:
df_results_per_period[list(learned_params)].groupby('id').describe().T

In [None]:
for learned_param in learned_params:
    Plot.nfh_property_per_id_boxplot(df_results_per_period, property_col=learned_param)

### Visualization of predicted temperatures

In [None]:
df_prep['predicted_temp_indoor__degC'].groupby('id').count().to_frame().T

In [None]:
if 'ventilation__dm3_s_1' in predict_props_bldng:
    display(df_prep['predicted_ventilation__dm3_s_1'].groupby('id').count().to_frame().T)
    display(df_prep['predicted_ventilation__dm3_s_1'].groupby('id').describe().T)

In [None]:
if 'ventilation__dm3_s_1' in predict_props_bldng:
    display(df_prep['predicted_ventilation__dm3_s_1'])

In [None]:
# Plot the imported and the predicted indoor temperature for specific_id only;
# the predicted ventilation is added when it was one of the predicted building properties
temp_plot_columns = ['batch_import_remeha_temp_indoor__degC', 'predicted_temp_indoor__degC']
if 'ventilation__dm3_s_1' in predict_props_bldng:
    temp_plot_columns.append('predicted_ventilation__dm3_s_1')

Plot.dataframe_preprocessed_plot(
    df_prep[temp_plot_columns].xs(specific_id, level='id', drop_level=False),
    units_to_mathtext
)

## Visualize calculated metrics for Energy Case, Comfort Case and Carbon Case

In [None]:
# Result columns used as metrics for the Energy Case, Comfort Case and Carbon Case.
# This is a set, so it has no defined order.
metrics = {
    'corrected_avg_energy_ch__W',
    'avg_comfortable__0',
    'corrected_avg_co2_ch__g_s_1'
}

In [None]:
# Summary statistics per id, limited to the metrics that are actually present as result columns
df_results_per_period[list(metrics & set(df_results_per_period.columns))].groupby('id').describe().T

## Analyze and visualize intervention A1: change thermostat program

In [None]:
# Load the A1 regime change data from the Excel file configured in regime_change_A1_file_path
regime_changes_A1 = pd.read_excel(regime_change_A1_file_path)

In [None]:
# Parse the thermostat program change timestamps; values that cannot be parsed become NaT
regime_changes_A1['A_thermostat_program_change_datetime'] = pd.to_datetime(
    regime_changes_A1['A_thermostat_program_change_datetime'], errors='coerce'
)


def _localize_a1_change(row):
    """Return the row's change timestamp, localized to the row's timezone when both are present."""
    change_time = row['A_thermostat_program_change_datetime']
    if pd.isna(change_time) or pd.isna(row['timezone']):
        # Nothing to localize (or no timezone to localize to): keep the value as it is
        return change_time
    return change_time.tz_localize(row['timezone'])


# Localize row by row, because each row carries its own timezone
regime_changes_A1['A_thermostat_program_change_datetime'] = regime_changes_A1.apply(_localize_a1_change, axis=1)

In [None]:
# Remove the timezone column from the A1 regime change table
regime_changes_A1 = regime_changes_A1.drop(columns=['timezone'])

In [None]:
# Number the regime changes of each id 1, 2, 3, ... in the order in which the rows appear in the table
# NOTE(review): this is only chronological if the Excel rows are sorted by change datetime per id - confirm
regime_changes_A1['regime_sequence_number'] = regime_changes_A1.groupby('id').cumcount() + 1

In [None]:
# Inspect the A1 regime changes, including their sequence numbers
regime_changes_A1

In [None]:
# Inspect the results per period
df_results_per_period

In [None]:
# Label each period with the thermostat-program regime that applied during the whole period.
# Periods that contain a regime change, and periods that start before the first known regime
# change of their id, keep a missing label.

# Start with a missing label for every period
df_results_per_period['regime_sequence_number'] = None

# Turn the index levels into ordinary columns so that id, start and end can be read per row
df_results_per_period = df_results_per_period.reset_index()

# Split the regime change table per id once, instead of re-filtering the full table for every period
regime_changes_A1_by_id = {home_id: changes for home_id, changes in regime_changes_A1.groupby('id')}

for index, row in df_results_per_period.iterrows():
    changes_for_id = regime_changes_A1_by_id.get(row['id'])
    if changes_for_id is None:
        continue  # No regime changes are known for this id, so there is nothing to assign

    change_times = changes_for_id['A_thermostat_program_change_datetime']

    # A regime change inside the period (boundaries included) makes the period ambiguous: leave it unlabelled
    if ((change_times >= row['start']) & (change_times <= row['end'])).any():
        continue

    # Otherwise the applicable regime is the most recent change at or before the start of the period
    earlier_changes = changes_for_id[change_times <= row['start']].sort_values(
        by='A_thermostat_program_change_datetime', ascending=False
    )
    if not earlier_changes.empty:
        df_results_per_period.at[index, 'regime_sequence_number'] = earlier_changes.iloc[0]['regime_sequence_number']

# Make the labels numeric. Missing labels become NaN, so the column only ends up with an
# integer dtype when every period received a label.
df_results_per_period['regime_sequence_number'] = pd.to_numeric(df_results_per_period['regime_sequence_number'], downcast='integer')

# Restore the index
# NOTE(review): the corresponding B1 cell restores only ['id', 'start', 'end'] - confirm which index layout is intended
df_results_per_period = df_results_per_period.set_index(['id', 'start', 'end', 'duration'])

In [None]:
# Summary statistics of the regime sequence numbers per id in the A1 regime change table
regime_changes_A1[['id', 'regime_sequence_number']].groupby('id').describe()

In [None]:
# Summary statistics per id of the regime sequence numbers assigned to the periods
df_results_per_period[['regime_sequence_number']].groupby('id').describe()

In [None]:
# Spread of the regime sequence numbers per id. The standard deviation is NaN for an id with
# a single regime, so only ids with more than one regime get a value above zero.
std_values = (
    regime_changes_A1
    .groupby('id')['regime_sequence_number']
    .std()
)

# Ids that have more than one regime
id_mask_intervention_A1 = std_values.loc[std_values > 0].index.tolist()

In [None]:
# List the available result columns
df_results_per_period.columns.to_list()

In [None]:
# Columns used in the A1 (thermostat program) analysis: the regime label, period duration,
# average temperatures and uncorrected energy/CO2 averages, plus the metrics.
# This is a set, so it has no defined order.
intervention_results_columns_a1 = {
    'regime_sequence_number',
    'duration__s',
    'avg_temp_set__degC',
    'avg_temp_indoor__degC',
    'avg_temp_outdoor__degC',
    'avg_energy_ch__W',
    'avg_co2_ch__g_s_1'
} | metrics

In [None]:
# Build the per-period results for the homes with an A1 intervention.
# The columns are taken in the order in which they occur in df_results_per_period: converting the
# (unordered) column set to a list directly would make the column order differ from run to run.
missing_columns_a1 = set(intervention_results_columns_a1) - set(df_results_per_period.columns)
if missing_columns_a1:
    raise KeyError(f"Columns missing from df_results_per_period: {sorted(missing_columns_a1)}")
selected_columns_a1 = [col for col in df_results_per_period.columns if col in intervention_results_columns_a1]

df_results_per_a1_intervention = (df_results_per_period[df_results_per_period.index.get_level_values('id').isin(id_mask_intervention_A1)][selected_columns_a1]
                                  .reset_index()
                                  .dropna(subset=['regime_sequence_number'])  # Drop rows where 'regime_sequence_number' is NaN
                                  .dropna(subset=list(metrics)) # Drop rows where any of the metrics is NaN
                                  .set_index(['id', 'regime_sequence_number', 'start', 'end']))

In [None]:
# Inspect the per-period results of the homes with an A1 intervention
df_results_per_a1_intervention

In [None]:
# Count the homes that remain in the A1 analysis
homes_with_thermostat_program_change = len(df_results_per_a1_intervention.index.get_level_values('id').unique())
print(f"Number of homes that did a thermostat program change: {homes_with_thermostat_program_change}")

# Count the distinct (id, regime) combinations. len() counts rows; DataFrame.size would count
# rows x columns and therefore double the number for this two-column frame.
thermostat_programs_in_homes_with_thermostat_change = len(df_results_per_a1_intervention.index.to_frame(index=False)[['id', 'regime_sequence_number']].drop_duplicates())
print(f"Number of thermostat programs of these homes: {thermostat_programs_in_homes_with_thermostat_change}")

# Avoid a division by zero when no home qualifies
if homes_with_thermostat_program_change > 0:
    print(f"Average number of thermostat programs in homes with thermostat program change: {thermostat_programs_in_homes_with_thermostat_change/homes_with_thermostat_program_change}")

### Filter out learning periods that don't cover a full thermostat program 

In [None]:
# Express the period duration in days
# (s_d_1 is presumably the number of seconds per day, defined outside this cell - confirm)
df_results_per_a1_intervention['duration__d'] = df_results_per_a1_intervention['duration__s'] / s_d_1

In [None]:
# Keep only the periods whose duration is exactly learn_period__d days
# NOTE(review): this is an exact floating-point comparison, whereas the B1 analysis keeps
# periods with duration__d > (learn_period__d - 1) - confirm that the difference is intended
df_results_per_a1_intervention = df_results_per_a1_intervention[df_results_per_a1_intervention['duration__d'] == learn_period__d]

### Aggregate regime metrics and keep only ids for which 2 or more regimes can be compared

In [None]:
# Per (id, regime): the total number of days covered and the mean of every other column
a1_aggregations = {'duration__d': 'sum'}
a1_aggregations.update(
    {col: 'mean' for col in df_results_per_a1_intervention.columns if col != 'duration__d'}
)
df_results_per_a1_intervention_grouped = (
    df_results_per_a1_intervention
    .groupby(['id', 'regime_sequence_number'])
    .agg(a1_aggregations)
)

# Count the distinct regimes that are left for each id
unique_counts = (
    df_results_per_a1_intervention_grouped.index
    .to_frame(index=False)
    .groupby('id')['regime_sequence_number']
    .nunique()
)

# A comparison needs at least two regimes for the same id
ids_with_multiple_regimes = unique_counts.loc[unique_counts > 1].index

# Drop the ids that cannot be compared
a1_rows_to_keep = df_results_per_a1_intervention_grouped.index.get_level_values('id').isin(ids_with_multiple_regimes)
df_results_per_a1_intervention_grouped = df_results_per_a1_intervention_grouped[a1_rows_to_keep]

In [None]:
# Currently no column selection is applied: the "limited" frame is the grouped frame itself
df_results_per_a1_intervention_limited_grouped = df_results_per_a1_intervention_grouped

# Alternative (disabled): limit the output to a fixed selection of columns
# df_results_per_a1_intervention_limited_grouped = df_results_per_a1_intervention_grouped[['duration__d',
#  'avg_temp_set__degC',
#  'avg_temp_indoor__degC',
#  'avg_temp_outdoor__degC',
#  'corrected_avg_energy_ch__W',
#  'avg_comfortable__0',
#  'corrected_avg_co2_ch__g_s_1']]

In [None]:
# Inspect the aggregated A1 results per id and regime
df_results_per_a1_intervention_limited_grouped

In [None]:
%%time 
# Export the aggregated A1 results to results_interventions_A1.xlsx, with the index levels written
# as ordinary columns; any datetime column is made timezone-naive before it is written
df_results_per_a1_intervention_limited_grouped.reset_index(drop=False).apply(lambda x: x.dt.tz_localize(None) if x.dtype.kind == 'M' else x).to_excel("results_interventions_A1.xlsx", index=False)

### Check thermostat program changes via temperature plots

In [None]:
# Temperature properties to plot for the A1 check, mapped to the df_prep column that supplies them.
# Entries that are commented out are left out of the plots.
temp_property_sources_a1 = {
    'temp_indoor__degC':    'batch_import_remeha_temp_indoor__degC',
    'temp_set__degC':       'batch_import_remeha_temp_set__degC',
    # 'comfortable__bool':    'comfortable__bool',
    'temp_outdoor__degC':   'batch_import_KNMI_temp_outdoor__degC',
    # 'temp_flow_ch__degC':    'batch_import_remeha_temp_flow_ch__degC',
    # 'temp_ret_ch__degC':    'batch_import_remeha_temp_ret_ch__degC',
    # 'temp_flow_ch_max__degC':'batch_import_remeha_temp_flow_ch_max__degC', 
}

# Source column names, in the order of the mapping above
tempprops_a1 = [*temp_property_sources_a1.values()]

In [None]:
# For every id that is left in the A1 comparison: print the id, show its aggregated results
# per regime and plot its temperature series, so the thermostat program changes can be checked visually.
# The loop variable is deliberately not called `id`, to avoid shadowing the builtin in the notebook.
for home_id in df_results_per_a1_intervention_limited_grouped.index.get_level_values('id').unique():
    print(home_id)
    display(df_results_per_a1_intervention_limited_grouped.loc[home_id])
    Plot.dataframe_preprocessed_plot(df_prep[tempprops_a1].xs(home_id, level='id', drop_level=False), units_to_mathtext)

In [None]:
# Display format per metric
metrics_format = {
    'corrected_avg_energy_ch__W': '{:.0f}',  # 0 decimal places
    'avg_comfortable__0': '{:.2f}',  # 2 decimal places
    'corrected_avg_co2_ch__g_s_1': '{:.3f}'  # 3 decimal places
}

# Show the summary statistics per (id, regime) of each metric, each with its own precision
for metric in metrics_format:
    print(metric)
    float_formatter = metrics_format[metric].format
    with pd.option_context('display.float_format', float_formatter):
        metric_stats = df_results_per_a1_intervention[metric].groupby(['id', 'regime_sequence_number']).describe()
        display(metric_stats.T)


In [None]:
# Plot.calculated_intervention_metrics_boxplot(df_results_per_a1_intervention_limited_grouped, metrics)

## Analyze and visualize intervention B1: lower maximum supply temperatures

In [None]:
# Load the B1 regime change data from the Excel file configured in regime_change_B1_file_path
regime_changes_B1 = pd.read_excel(regime_change_B1_file_path)

In [None]:
# Parse the timestamps of the maximum flow temperature changes; values that cannot be parsed become NaT
regime_changes_B1['B_temp_flow_ch_max__datetime'] = pd.to_datetime(
    regime_changes_B1['B_temp_flow_ch_max__datetime'], errors='coerce'
)


def _localize_b1_change(row):
    """Return the row's change timestamp, localized to the row's timezone when both are present."""
    change_time = row['B_temp_flow_ch_max__datetime']
    if pd.isna(change_time) or pd.isna(row['timezone']):
        # Nothing to localize (or no timezone to localize to): keep the value as it is
        return change_time
    return change_time.tz_localize(row['timezone'])


# Localize row by row, because each row carries its own timezone
regime_changes_B1['B_temp_flow_ch_max__datetime'] = regime_changes_B1.apply(_localize_b1_change, axis=1)

In [None]:
# Remove the timezone column from the B1 regime change table
regime_changes_B1 = regime_changes_B1.drop(columns=['timezone'])

In [None]:
# Number the regime changes of each id 1, 2, 3, ... in the order in which the rows appear in the table
# NOTE(review): this is only chronological if the Excel rows are sorted by change datetime per id - confirm
regime_changes_B1['regime_sequence_number'] = regime_changes_B1.groupby('id').cumcount() + 1

In [None]:
# Inspect the B1 regime changes, including their sequence numbers
regime_changes_B1

In [None]:
# Label each period with the maximum-flow-temperature regime that applied during the whole period.
# Periods that contain a regime change, and periods that start before the first known regime
# change of their id, keep a missing label. Any existing regime labels are overwritten.

# Start with a missing label for every period
df_results_per_period['regime_sequence_number'] = None

# Turn the index levels into ordinary columns so that id, start and end can be read per row
df_results_per_period = df_results_per_period.reset_index()

# Split the regime change table per id once, instead of re-filtering the full table for every period
regime_changes_B1_by_id = {home_id: changes for home_id, changes in regime_changes_B1.groupby('id')}

for index, row in df_results_per_period.iterrows():
    changes_for_id = regime_changes_B1_by_id.get(row['id'])
    if changes_for_id is None:
        continue  # No regime changes are known for this id, so there is nothing to assign

    change_times = changes_for_id['B_temp_flow_ch_max__datetime']

    # A regime change inside the period (boundaries included) makes the period ambiguous: leave it unlabelled
    if ((change_times >= row['start']) & (change_times <= row['end'])).any():
        continue

    # Otherwise the applicable regime is the most recent change at or before the start of the period
    earlier_changes = changes_for_id[change_times <= row['start']].sort_values(
        by='B_temp_flow_ch_max__datetime', ascending=False
    )
    if not earlier_changes.empty:
        df_results_per_period.at[index, 'regime_sequence_number'] = earlier_changes.iloc[0]['regime_sequence_number']

# Make the labels numeric. Missing labels become NaN, so the column only ends up with an
# integer dtype when every period received a label.
df_results_per_period['regime_sequence_number'] = pd.to_numeric(df_results_per_period['regime_sequence_number'], downcast='integer')

# Restore the index
# NOTE(review): the corresponding A1 cell also restores 'duration' as an index level - confirm which layout is intended
df_results_per_period = df_results_per_period.set_index(['id', 'start', 'end'])

In [None]:
# Summary statistics of the regime sequence numbers per id in the B1 regime change table
regime_changes_B1[['id', 'regime_sequence_number']].groupby('id').describe()

In [None]:
# Summary statistics per id of the regime sequence numbers assigned to the periods
df_results_per_period[['regime_sequence_number']].groupby('id').describe()

In [None]:
# Spread of the regime sequence numbers per id. The standard deviation is NaN for an id with
# a single regime, so only ids with more than one regime get a value above zero.
std_values = (
    regime_changes_B1
    .groupby('id')['regime_sequence_number']
    .std()
)

# Ids that have more than one regime
id_mask_intervention_B1 = std_values.loc[std_values > 0].index.tolist()

In [None]:
# List the available result columns
df_results_per_period.columns.to_list()

In [None]:
# Columns used in the B1 (maximum supply temperature) analysis: the regime label, period duration,
# average temperatures and uncorrected energy/CO2 averages, plus the metrics.
# This is a set, so it has no defined order.
intervention_results_columns_b1 = {
    'regime_sequence_number',
    'duration__s',
    'avg_temp_flow_ch_max__degC',
    'avg_temp_indoor__degC',
    'avg_temp_outdoor__degC',
    'avg_energy_ch__W',
    'avg_co2_ch__g_s_1'
} |  metrics

In [None]:
# Build the per-period results for the homes with a B1 intervention.
# The columns are taken in the order in which they occur in df_results_per_period: converting the
# (unordered) column set to a list directly would make the column order differ from run to run.
missing_columns_b1 = set(intervention_results_columns_b1) - set(df_results_per_period.columns)
if missing_columns_b1:
    raise KeyError(f"Columns missing from df_results_per_period: {sorted(missing_columns_b1)}")
selected_columns_b1 = [col for col in df_results_per_period.columns if col in intervention_results_columns_b1]

df_results_per_b1_intervention = (df_results_per_period[df_results_per_period.index.get_level_values('id').isin(id_mask_intervention_B1)][selected_columns_b1]
                                  .reset_index()
                                  .dropna(subset=['regime_sequence_number'])  # Drop rows where 'regime_sequence_number' is NaN
                                  .dropna(subset=list(metrics)) # Drop rows where any of the metrics is NaN
                                  .set_index(['id', 'regime_sequence_number', 'start', 'end']))

In [None]:
# Inspect the per-period results of the homes with a B1 intervention
df_results_per_b1_intervention

In [None]:
# The index levels as a plain table, used for both counts below
b1_index_frame = df_results_per_b1_intervention.index.to_frame(index=False)

# Homes that remain in the B1 analysis
homes_with_max_flow_temp_change = len(b1_index_frame['id'].unique())
print(f"Number of homes that changed their max supply temperature: {homes_with_max_flow_temp_change}")

# Distinct (id, regime) combinations of these homes
b1_regimes_per_home = b1_index_frame[['id', 'regime_sequence_number']].drop_duplicates()
max_flow_temp_regimes_in_homes_with_max_flow_temp_change = len(b1_regimes_per_home)
print(f"Number of max supply temperature settings in these homes: {max_flow_temp_regimes_in_homes_with_max_flow_temp_change}")

# Avoid a division by zero when no home qualifies
if homes_with_max_flow_temp_change > 0:
    print(f"Average number of max supply temperatures settings in homes with such a change: {max_flow_temp_regimes_in_homes_with_max_flow_temp_change/homes_with_max_flow_temp_change}")


### Filter out learning periods that are at least 1 day shorter than a full learning period

In [None]:
# Express the period duration in days
# (s_d_1 is presumably the number of seconds per day, defined outside this cell - confirm)
df_results_per_b1_intervention['duration__d'] = df_results_per_b1_intervention['duration__s'] / s_d_1

In [None]:
# Keep only the periods that last longer than (learn_period__d - 1) days
df_results_per_b1_intervention = df_results_per_b1_intervention[df_results_per_b1_intervention['duration__d'] > (learn_period__d - 1)]


In [None]:
# Inspect the B1 results that remain after the duration filter
df_results_per_b1_intervention

### Aggregate regime metrics and keep only ids for which 2 or more regimes can be compared

In [None]:
# Per (id, regime): the total number of days covered and the mean of every other column
b1_aggregations = {'duration__d': 'sum'}
b1_aggregations.update(
    {col: 'mean' for col in df_results_per_b1_intervention.columns if col != 'duration__d'}
)
df_results_per_b1_intervention_grouped = (
    df_results_per_b1_intervention
    .groupby(['id', 'regime_sequence_number'])
    .agg(b1_aggregations)
)

# Count the distinct regimes that are left for each id
unique_counts = (
    df_results_per_b1_intervention_grouped.index
    .to_frame(index=False)
    .groupby('id')['regime_sequence_number']
    .nunique()
)

# A comparison needs at least two regimes for the same id
ids_with_multiple_regimes = unique_counts.loc[unique_counts > 1].index

# Drop the ids that cannot be compared
b1_rows_to_keep = df_results_per_b1_intervention_grouped.index.get_level_values('id').isin(ids_with_multiple_regimes)
df_results_per_b1_intervention_grouped = df_results_per_b1_intervention_grouped[b1_rows_to_keep]

In [None]:
# Currently no column selection is applied: the "limited" frame is the grouped frame itself
df_results_per_b1_intervention_limited_grouped = df_results_per_b1_intervention_grouped

# Alternative (disabled): limit the output to a fixed selection of columns
# df_results_per_b1_intervention_limited_grouped = df_results_per_b1_intervention_grouped[['duration__d',
#  'avg_temp_flow_ch_max__degC',
#  'avg_temp_indoor__degC',
#  'avg_temp_outdoor__degC',
#  'corrected_avg_energy_ch__W',
#  'avg_comfortable__0',
#  'corrected_avg_co2_ch__g_s_1']]

In [None]:
# Inspect the aggregated B1 results per id and regime
df_results_per_b1_intervention_limited_grouped

In [None]:
%%time 
# Export the aggregated B1 results to results_interventions_B1.xlsx, with the index levels written
# as ordinary columns; any datetime column is made timezone-naive before it is written
df_results_per_b1_intervention_limited_grouped.reset_index(drop=False).apply(lambda x: x.dt.tz_localize(None) if x.dtype.kind == 'M' else x).to_excel("results_interventions_B1.xlsx", index=False)

In [None]:
# Display format per metric
metrics_format = {
    'corrected_avg_energy_ch__W': '{:.0f}',  # 0 decimal places
    'avg_comfortable__0': '{:.2f}',  # 2 decimal places
    'corrected_avg_co2_ch__g_s_1': '{:.3f}'  # 3 decimal places
}

# Show the summary statistics per (id, regime) of each metric, each with its own precision
for metric in metrics_format:
    print(metric)
    float_formatter = metrics_format[metric].format
    with pd.option_context('display.float_format', float_formatter):
        metric_stats = df_results_per_b1_intervention[metric].groupby(['id', 'regime_sequence_number']).describe()
        display(metric_stats.T)


In [None]:
# Temperature properties to plot for the B1 check, mapped to the df_prep column that supplies them.
# Entries that are commented out are left out of the plots.
temp_property_sources_b1 = {
    'temp_indoor__degC':    'batch_import_remeha_temp_indoor__degC',
    'temp_set__degC':       'batch_import_remeha_temp_set__degC',
    # 'comfortable__bool':    'comfortable__bool',
    'temp_outdoor__degC':   'batch_import_KNMI_temp_outdoor__degC',
    'temp_flow_ch__degC':    'batch_import_remeha_temp_flow_ch__degC',
    'temp_ret_ch__degC':    'batch_import_remeha_temp_ret_ch__degC',
    'temp_flow_ch_max__degC':'batch_import_remeha_temp_flow_ch_max__degC', 
}

# Source column names, in the order of the mapping above
tempprops_b1 = [*temp_property_sources_b1.values()]

In [None]:
# For every id that is left in the B1 comparison: print the id, show its aggregated results
# per regime and plot its temperature series, so the regime changes can be checked visually.
# The loop variable is deliberately not called `id`, to avoid shadowing the builtin in the notebook.
for home_id in df_results_per_b1_intervention_limited_grouped.index.get_level_values('id').unique():
    print(home_id)
    display(df_results_per_b1_intervention_limited_grouped.loc[home_id])
    Plot.dataframe_preprocessed_plot(df_prep[tempprops_b1].xs(home_id, level='id', drop_level=False), units_to_mathtext)

In [None]:
# Plot.calculated_intervention_metrics_boxplot(df_results_per_b1_intervention_limited_grouped, metrics)