# REDUCEDHEATCARB analysis

This JupyterLab notebook can be used for physics-informed machine learning analysis in the REDUCEDHEATCARB project.
Don't forget to install the requirements listed in [requirements.txt](../requirements.txt) first!

## Setting the stage

First, several imports and variables need to be defined.


### Imports and generic settings

In [None]:
import numpy as np
import pandas as pd

%load_ext autoreload

    
from tqdm.notebook import tqdm

from gekko import GEKKO

import sys
sys.path.append('../data/')
sys.path.append('../view/')
sys.path.append('../analysis/')

from plotter import Plot
import matplotlib.pyplot as plt
%matplotlib inline
%matplotlib widget


from nfh_utils import *

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

from rhc_analysis import Learner

# Preprocessed input data file; the commented-out lines are alternative inputs
# rhc_preprocessed_poperties_file='rhc_preprocessed_properties_intv_5_min.parquet'
rhc_preprocessed_poperties_file='rhc_preprocessed_properties_intv_1_min.parquet'
#rhc_preprocessed_poperties_file='rhc_heat_dist_preprocessed_properties.parquet'

# Home metadata (read further below with pd.read_excel, indexed by 'id') and the boiler efficiency lookup table
home_BAG_data_file_path = 'homes_properties_detailed.xlsx'
boiler_returntemp_load_efficiency_file_path = 'boiler_returntemp_load_efficiency.parquet'

# File for intermediate output (including preprocessing that may need to migrate to the GEKKO model code for the what-if simulations)
rhc_heat_dist_preprocessed_poperties_file='rhc_heat_dist_preprocessed_properties.parquet'

# Output files for the learning results (df_results and df_results_per_period, respectively)
rhc_analysis_results_file = 'rhc_results.parquet'
rhc_analysis_results_per_period_file = 'rhc_results_per_period.xlsx'

import logging 

# Clear any existing handlers to avoid duplicate logs; this also matters because
# logging.basicConfig() does nothing when the root logger already has handlers
# (e.g. when this cell is re-run in the same kernel)
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# Set up logging to the console only
logging.basicConfig(
    level=logging.WARNING,  # Adjust log level as necessary (DEBUG, INFO, WARNING, etc.)
    # level=logging.INFO,  # Adjust log level as necessary (DEBUG, INFO, WARNING, etc.)
    stream=sys.stderr,    # Send logs to stderr (use sys.stdout instead if preferred)
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

### Reading preprocessed interpolated properties from a parquet file

In [None]:
%%time

# Attempt to read the Parquet file
try:
    df_prep = pd.read_parquet(
        rhc_preprocessed_poperties_file, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

In [None]:
# Sorting the DataFrame index is needed to get good performance on certain filters.
# This guard checks whether the index is already sorted and sorts it only when needed.
if not df_prep.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prep = df_prep.sort_index()  

In [None]:
print(f"df_prep.count().sum(): {df_prep.count().sum():_}")

In [None]:
df_prep.info()

In [None]:
print("[\n'","', \n'".join(sorted(df_prep.columns)),"'\n]")

In [None]:
# visualize all input data (df_plot refers to the same object as df_prep; it is not a copy)
df_plot = df_prep

In [None]:
list(df_plot.index.unique('id').dropna())

In [None]:
df_plot.index.unique('id').dropna()

In [None]:
# df_plot.loc[[401632]][[prop for prop in df_plot.columns.values if prop.split('__')[-1] in ('degC', 'W', '0', 'bool', 'ppm', 'W_m_2')]]

In [None]:
#Plot all properties from all sources for all ids
#Plot.dataframe_preprocessed_plot(df_plot.loc[[401632]][[prop for prop in df_plot.columns.values if prop.split('__')[-1] in ('degC', 'ppm', 'W_m_2')]], units_to_mathtext)

# Calculate additional properties 
We may have to move some of these calculations to inside the GEKKO Python model code (e.g. for the what-if scenario simulation)

## Use boiler-specific efficiency to calculate heat_ch__W

### Reading boiler efficiency data

In [None]:
%%time
# Attempt to read the Parquet file
try:
    df_boiler_efficiency = pd.read_parquet(
        # boiler_returntemp_efficiency_file_path, 
        boiler_returntemp_load_efficiency_file_path, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

In [None]:
df_boiler_efficiency

### Lookup of momentary boiler efficiency from boiler-specific efficiency curves

In [None]:
# Derive brand-specific min and max for load and return temperature
brand_specific_ranges = df_boiler_efficiency.reset_index().groupby('brand_model').agg({
    'rounded_load__%': ['min', 'max'],
    'rounded_temp_ret__degC': ['min', 'max']
})

In [None]:
# Rename columns for clarity
brand_specific_ranges.columns = ['min_load__%', 'max_load__%', 'min_temp_ret__degC', 'max_temp_ret__degC']

In [None]:
brand_specific_ranges

In [None]:
%%time
# Merge df_prep with brand_specific_ranges to get the min/max values for each brand_model
df_prep = (df_prep
           .reset_index()
           .merge(brand_specific_ranges, on='brand_model', how='left')
           .set_index(['id', 'timestamp'])
           .sort_index())

In [None]:
%%time
# Round return temperatures to whole degrees
df_prep['rounded_temp_ret__degC'] = (df_prep['batch_import_remeha_temp_ret__degC']
                                     .round()
                                     .clip(lower=df_prep['min_temp_ret__degC'], upper=df_prep['max_temp_ret__degC'])
                                     .astype('Int8')
                                    )


In [None]:
%%time
# Estimate the boiler load as a percentage:
#   load fraction = fan_frac + (1 - fan_frac) * Qnh_min / Qnh_max
# i.e. a linear scale from Qnh_min/Qnh_max (at fan fraction 0) to 1 (at fan fraction 1),
# multiplied by 100 and rounded to whole percent.
# NOTE(review): the clip to min_load__% is applied AFTER the gas-valve multiplication, so a
# closed-valve load of 0 is raised to min_load__% whenever that minimum is above 0 — confirm this is intended.
df_prep.loc[:,'rounded_load__%'] = (
    (df_prep['calculated_fan_frac__0'] + (1 - df_prep['calculated_fan_frac__0']) * df_prep['Qnh_min_lhv__kW'] / df_prep['Qnh_max_lhv__kW'])
    * 100
    * df_prep['batch_import_remeha_gas_valve_open__bool'] # make sure load = 0 when gas valve is closed
).round().clip(lower=df_prep['min_load__%'], upper=df_prep['max_load__%']).astype('Int16')

In [None]:
%%time
# Merging DataFrames
df_prep = (df_prep
           .reset_index()
           .merge(df_boiler_efficiency.reset_index(),
                  on=['brand_model', 'rounded_load__%', 'rounded_temp_ret__degC'], 
                  how='left'
                  )
           .set_index(['id', 'timestamp'])
           .sort_index()
          )  


### Calculate heat_ch__W

In [None]:
%%time
df_prep.loc[:,'calculated_heat_ch__W'] = df_prep['calculated_g_use_ch_hhv__W'] * df_prep['eta_ch_hhv__W0']

### Calculate flow_ch__dm3_min_1

In [None]:
water__kg_m_3 = 994 # Approximation of the density of water at 34°C calculated using CoolProp and an average of all observed supply and return temperatures for central heating

In [None]:
%%time
# Calculate the temperature difference
temp_diff = df_prep['batch_import_remeha_temp_sup_ch__degC'] - df_prep['batch_import_remeha_temp_ret_ch__degC']

# Use np.where to conditionally replace results based on the temperature difference
df_prep.loc[:, 'calculated_flow_ch__dm3_min_1'] = np.where(
    (temp_diff == 0) | pd.isna(temp_diff),  # Check if the temperature difference is zero or NA
    pd.NA,  # Assign NA if the condition is true
    (
        df_prep['calculated_heat_ch__W']
        / (
            water__J_kg_1_K_1 
            * temp_diff
            * water__kg_m_3
        )
        * dm3_m_3
        * s_min_1
    )
)

# Clipping the calculated flow values, based on maximum flow of 6.5 [m³/h] 
# (Grundfos UPM4 15-75, UPM4S 15-60; Wilo Yonos Para MS/6B-PWM1, Yonos Para MS/78-PWM1)
flow_ch_max__dm3_min_1 = 6.5 * dm3_m_3 / min_h_1
df_prep['calculated_flow_ch__dm3_min_1'] = df_prep['calculated_flow_ch__dm3_min_1'].clip(lower=0, upper=flow_ch_max__dm3_min_1)

In [None]:
df_prep['calculated_flow_ch__dm3_min_1'].describe()

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'calculated_flow_ch__dm3_min_1'
df_prep[prop].plot.hist(bins=200, alpha=0.5, title = prop)


## Inspect results 

In [None]:
sorted(list(df_prep.columns))

In [None]:
df_prep[['batch_import_remeha_ch_water_pump_speed__0',
         'batch_import_remeha_temp_sup__degC',
         'batch_import_remeha_temp_sup_ch__degC',
         'batch_import_remeha_temp_ret__degC',
         'batch_import_remeha_temp_ret_ch__degC',
         'calculated_heat_ch__W',
         'calculated_flow_ch__dm3_min_1',
         'batch_import_remeha_dhw_flow__l_min_1'
        ]].describe()

In [None]:
df_prep[['batch_import_remeha_ch_water_pump_speed__0',
         'batch_import_remeha_temp_sup__degC',
         'batch_import_remeha_temp_sup_ch__degC',
         'batch_import_remeha_temp_ret__degC',
         'batch_import_remeha_temp_ret_ch__degC',
         'calculated_flow_ch__dm3_min_1',
         'batch_import_remeha_dhw_flow__l_min_1'
        ]].groupby('id').count().T

In [None]:
# Create masks
# Shared building blocks (each is the same comparison the masks below are built from)
_burning_ch = df_prep['batch_import_remeha_boiler_status_burning_ch__bool'] == True
_burning_dhw = df_prep['batch_import_remeha_boiler_status_burning_dhw__bool'] == True
_valve_open = df_prep['batch_import_remeha_gas_valve_open__bool'] == True
_temp_indoor_known = df_prep['batch_import_remeha_temp_indoor__degC'].notna()

# Boiler burning (for central heating or domestic hot water) with the gas valve open
boiler_burning_mask = (_burning_ch | _burning_dhw) & _valve_open
# Boiler burning for central heating with the gas valve open
boiler_ch_mask = _burning_ch & _valve_open
# Boiler burning for domestic hot water with the gas valve open
boiler_dhw_mask = _burning_dhw & _valve_open
# Both boiler indoor temperature and smart meter gas use are available
remeha_data_notna_mask = _temp_indoor_known & (df_prep['device_p1-reader_g_use_hhv__W'].notna())
# Boiler indoor temperature is available and the gas valve is reported closed
boiler_valve_closed_mask = _temp_indoor_known & (df_prep['batch_import_remeha_gas_valve_closed__bool'] == True)

In [None]:
# Select mask
boiler_status_mask = remeha_data_notna_mask 


### Inspecting gas power used by boiler based on fan speed [rpm]

In [None]:
df_prep[boiler_status_mask]['calculated_fan_frac__0'].mean()

In [None]:
# Unweighted mean of the calorific value over the selected rows; the source column is in J/m³,
# so dividing by 1e6 yields MJ/m³.
# NOTE(review): the `_mm_3` suffix in the variable name looks like a typo for `_m_3` — confirm before renaming.
average_gas_std_hhv__MJ_mm_3 = df_prep[boiler_status_mask]['batch_import_EDSN_actual_gas_std_hhv__J_m_3'].mean() / 1e6
average_gas_std_hhv__MJ_mm_3


In [None]:
(df_prep[boiler_status_mask]['batch_import_EDSN_actual_gas_std_hhv__J_m_3'] * df_prep[boiler_status_mask]['calculated_g_use_boiler_hhv__W']).sum() / df_prep[boiler_status_mask]['calculated_g_use_boiler_hhv__W'].sum() / 1e6



In [None]:
df_prep[boiler_status_mask]['calculated_g25_3_use_boiler_lhv__W'].mean()

In [None]:
df_prep[boiler_status_mask]['gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0'].mean()                                           

In [None]:
df_prep[boiler_status_mask]['gas_pressure_factor_ref_to_actual__J0'].mean()                                           

In [None]:
df_prep[boiler_status_mask]['gas_temp_factor_ref_to_actual__J0'].mean()                                           

In [None]:
conversion_factor_boiler = (
    df_prep[boiler_status_mask]['gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0'].mean()
    * df_prep[boiler_status_mask]['gas_pressure_factor_ref_to_actual__J0'].mean()
    * df_prep[boiler_status_mask]['gas_temp_factor_ref_to_actual__J0'].mean()
)
conversion_factor_boiler

In [None]:
average_gas_boiler__W = df_prep[boiler_status_mask]['calculated_g_use_boiler_hhv__W'].mean()                              
average_gas_boiler__W

### Inspecting gas power used by home, based on smart meter measurements

In [None]:
df_prep[boiler_status_mask]['device_p1-reader_g_use_hhv__W'].mean()

In [None]:
df_prep[boiler_status_mask]['gas_calorific_factor_groningen_hhv_to_actual_hhv__J0'].mean()

In [None]:
df_prep[boiler_status_mask]['gas_calorific_factor_groningen_hhv_to_actual_hhv__J0'].mean()

In [None]:
df_prep[boiler_status_mask]['gas_pressure_factor_correct_smart_meter_to_actual__J0'].mean()

In [None]:
conversion_factor_smart_meter = (
    df_prep[boiler_status_mask]['gas_calorific_factor_groningen_hhv_to_actual_hhv__J0'].mean()
    * df_prep[boiler_status_mask]['gas_pressure_factor_correct_smart_meter_to_actual__J0'].mean()
)
conversion_factor_smart_meter

In [None]:
average_gas_smart_meter__W = df_prep[boiler_status_mask]['calculated_g_use_hhv__W'].mean()
average_gas_smart_meter__W

### Inspecting gas power used by boiler, based on boiler counters

In [None]:
average_gas_input_bdr_energy_counter_ch_lhv__W = df_prep[boiler_status_mask]['batch_import_remeha_g_use_ch_lhv__W'].mean()                              
average_gas_input_bdr_energy_counter_ch_lhv__W

In [None]:
df_prep[boiler_status_mask]['calculated_remeha_g_use_ch_hhv__W'].mean()

In [None]:
average_gas_input_bdr_energy_counter_dhw_lhv__W = df_prep[boiler_status_mask]['batch_import_remeha_g_use_dhw_lhv__W'].mean()                              
average_gas_input_bdr_energy_counter_dhw_lhv__W

In [None]:
df_prep[boiler_status_mask]['calculated_remeha_g_use_dhw_hhv__W'].mean()

In [None]:
average_gas_input_bdr_energy_counter_lhv__W = (average_gas_input_bdr_energy_counter_ch_lhv__W +  average_gas_input_bdr_energy_counter_dhw_lhv__W) 
average_gas_input_bdr_energy_counter_lhv__W

In [None]:
average_gas_input_bdr_energy_counter__W = df_prep[boiler_status_mask]['calculated_remeha_g_use_ch_hhv__W'].mean() + df_prep[boiler_status_mask]['calculated_remeha_g_use_dhw_hhv__W'].mean()
average_gas_input_bdr_energy_counter__W

#### Inspecting boiler counters while valve is closed

In [None]:
df_prep[boiler_valve_closed_mask][[
    'batch_import_remeha_fan_rotations__min_1',
    'batch_import_remeha_g_use_ch_lhv__W',
    'batch_import_remeha_g_use_dhw_lhv__W',
    'calculated_remeha_g_use_ch_hhv__W',
    'calculated_remeha_g_use_dhw_hhv__W',
]].describe().T   

### Compare gas power calculated in various ways

In [None]:
average_gas_smart_meter__W - average_gas_boiler__W

In [None]:
average_gas_boiler__W / average_gas_smart_meter__W

In [None]:
average_gas_boiler__W / average_gas_input_bdr_energy_counter__W

In [None]:
df_prep[boiler_status_mask].groupby(level='id').agg({
    'batch_import_remeha_g_use_ch_lhv__W': ['mean'],    
    'calculated_remeha_g_use_ch_hhv__W': ['mean'],
    'calculated_remeha_g_use_dhw_hhv__W': ['mean'],
    'calculated_remeha_g_use_hhv__W': ['mean'],
    'calculated_g25_3_use_boiler_lhv__W': ['mean'],
    'calculated_g_use_ch_hhv__W': ['mean'],
    'calculated_g_use_dhw_hhv__W': ['mean'],
    'calculated_g_use_boiler_hhv__W': ['mean'],
    'calculated_g_use_hhv__W': ['mean'],
    'calculated_heat_ch__W': ['mean'],
}).T

In [None]:
df_prep[boiler_status_mask][[
    'batch_import_remeha_g_use_ch_lhv_cum__kWh',
    'batch_import_remeha_g_use_ch_lhv__W',
    'batch_import_remeha_g_use_dhw_lhv_cum__kWh',
    'batch_import_remeha_g_use_dhw_lhv__W']
].groupby(level='id').describe().T
 

In [None]:
# Group by 'id' and calculate the means
df_prep[boiler_status_mask].groupby(level='id').agg({
    'batch_import_remeha_temp_ch_sup_max__degC': 'mean',
    'batch_import_remeha_temp_sup_ch__degC': 'mean',
    'batch_import_remeha_temp_ret_ch__degC': 'mean',
    'rounded_temp_ret__degC': 'mean',
    'rounded_load__%': 'mean',
    'eta_ch_hhv__W0': 'mean',
}).sort_values(by='eta_ch_hhv__W0', ascending=False)

In [None]:
df_prep[boiler_ch_mask]['rounded_temp_ret__degC'].mean() 

In [None]:
df_prep[boiler_ch_mask & (df_prep['rounded_temp_ret__degC'] <30)]['rounded_temp_ret__degC'].count() / df_prep[boiler_ch_mask]['rounded_temp_ret__degC'].count()

In [None]:
# average efficiency on higher heating value, NOT weighted by gas input
df_prep[boiler_ch_mask]['eta_ch_hhv__W0'].mean() 

In [None]:
# average efficiency on higher heating value, weighted by gas input
(df_prep.loc[boiler_ch_mask, 'eta_ch_hhv__W0'] * df_prep.loc[boiler_ch_mask, 'calculated_remeha_g_use_hhv__W']).sum() / df_prep.loc[boiler_ch_mask, 'calculated_remeha_g_use_hhv__W'].sum()


In [None]:
df_prep[boiler_ch_mask]['rounded_load__%'].mean() 

In [None]:
df_boiler_efficiency

In [None]:
# Filter the DataFrame based on boiler_status_mask
df_filtered = df_prep.loc[boiler_status_mask]

# Group by 'id' and calculate the mean of the relevant columns
grouped = df_filtered.groupby('id').agg(
    mean_boiler_use=('calculated_g_use_boiler_hhv__W', 'mean'),
    mean_smart_meter_use=('calculated_g_use_hhv__W', 'mean'),
    dsmr_version=('device_p1-reader_dsmr_version__0', 'first')  # Assuming the DSMR version doesn't change within an id
)

# Calculate the per-home ratio of mean boiler gas use to mean smart meter gas use
grouped['ratio'] = grouped['mean_boiler_use'] / grouped['mean_smart_meter_use']

In [None]:
grouped

In [None]:
# Now, group by DSMR version and calculate the mean ratio for each version
version_grouped = grouped.groupby('dsmr_version').agg(
    sum_mean_boiler_use=('mean_boiler_use', 'sum'),
    sum_mean_smart_meter_use=('mean_smart_meter_use', 'sum'),
    mean_ratio=('ratio', 'mean'),
    count=('ratio', 'size'))

version_grouped['ratio_sums'] =  version_grouped['sum_mean_boiler_use'] / version_grouped['sum_mean_smart_meter_use']


In [None]:
version_grouped

### Boxplots per home (when boiler is burning for central heating & gas valve is open)

In [None]:
# Group by 'id' and calculate the mean for both 'eta_ch_hhv__W0' and 'batch_import_remeha_temp_ret__degC'
df_prep[boiler_ch_mask].groupby(level='id').agg({
    'batch_import_remeha_temp_ch_sup_max__degC': 'mean',
    'batch_import_remeha_temp_sup__degC': 'mean',
    'batch_import_remeha_temp_ret__degC': 'mean',
    'rounded_temp_ret__degC': 'mean',
    'rounded_load__%': 'mean',
    'eta_ch_hhv__W0': 'mean',
    'calculated_heat_ch__W': 'max',
}).sort_values(by='eta_ch_hhv__W0', ascending=False)

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='rounded_temp_ret__degC')

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='rounded_load__%')

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='eta_ch_hhv__W0')

In [None]:
Plot.nfh_property_per_id_boxplot(df_prep[boiler_ch_mask], property_col='calculated_heat_ch__W')

## Write heat distribution results

### Writing heat distribution preprocessing results to parquet file

In [None]:
df_heat_dist = df_prep[['batch_import_KNMI_ghi__W_m_2',
                        'batch_import_KNMI_temp_outdoor__degC',
                        'batch_import_KNMI_wind__m_s_1',
                        'device_p1-reader_g_use_hhv__W',
                        'batch_import_remeha_boiler_status_burning_ch__bool',
                        'batch_import_remeha_boiler_status_burning_dhw__bool', 
                        'batch_import_remeha_gas_valve_closed__bool',
                        'batch_import_remeha_gas_valve_open__bool',
                        'batch_import_remeha_fan_rotations__min_1', 
                        'batch_import_remeha_ch_water_pump_speed__0', 
                        'batch_import_remeha_g_use_ch_lhv__W',
                        'batch_import_remeha_g_use_dhw_lhv__W',
                        'batch_import_remeha_temp_set__degC',
                        'batch_import_remeha_temp_indoor__degC',
                        'device_living_room_calibrated_temp_indoor__degC',
                        'batch_import_remeha_temp_sup__degC',
                        'batch_import_remeha_temp_ret__degC',
                        'batch_import_remeha_temp_sup_ch__degC',
                        'batch_import_remeha_temp_ret_ch__degC',
                        'batch_import_remeha_temp_ch_sup_max__degC', 
                        'calculated_g_use_hhv__W',
                        'calculated_g_use_boiler_hhv__W',
                        'calculated_g_use_dhw_hhv__W',
                        'calculated_g_use_ch_hhv__W',
                        'eta_ch_hhv__W0',
                        'calculated_heat_ch__W',
                        'rounded_load__%',
                        'rounded_temp_ret__degC',
                       ]]

In [None]:
df_heat_dist.info()

In [None]:
# %%time 
# df_heat_dist.to_parquet(rhc_heat_dist_preprocessed_poperties_file, index=True, engine='pyarrow')

### Writing heat distribution preprocessing results to multiple zipped CSV files

In [None]:
# %%time 
# # uncomment this entire block of code to enable it 
# # for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()[:3]):
# # for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()[3:]):
# # for home_id in [483173]:
# for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()):
#     df_heat_dist.xs(home_id, drop_level=False).to_csv(
#         f'{home_id}_heat_dist_preprocessed_properties.zip',
#         encoding='utf-8',
#         compression= dict(method='zip',
#                           archive_name=f'{home_id}_heat_dist_preprocessed_properties.csv'),
#         date_format='%Y-%m-%dT%H:%M:%S%z'
#     )


## Inspect  return temperatures

In [None]:
units_to_mathtext = property_types = {
    'degC' : r'$°C$',
    'ppm' : r'$ppm$',
    '0' : r'$[-]$',
    'bool': r'$0 = False; 1 = True$',
    'p' : r'$persons$',
    'W' : r'$W$',
    'W_m_2' : r'$W/m^{2}$',
    'm_s_1' : r'$m/s$',
    'W0' : r'$W^{0}$',
    'l_min_1' : r'$L/min$',
}

In [None]:
# mask for a single home (id 483173) during one week:
# from 2024-03-04 00:00 (inclusive) to 2024-03-11 00:00 (exclusive), both at UTC+01:00
returntemp_mask = (
    (df_prep.index.get_level_values('id') == 483173)
    & 
    (df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-03-04 00:00:00+01:00'))
    & 
    (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-03-11 00:00:00+01:00'))
)


In [None]:
Plot.dataframe_preprocessed_plot(df_prep[returntemp_mask][['batch_import_remeha_ch_water_pump_speed__0',
                                                           'batch_import_remeha_temp_sup__degC',
                                                           'batch_import_remeha_temp_ret__degC',
                                                           'batch_import_remeha_temp_sup_ch__degC',
                                                           'batch_import_remeha_temp_ret_ch__degC',
                                                           'batch_import_remeha_dhw_flow__l_min_1',
                                                           'calculated_g_use_ch_hhv__W',
                                                           'calculated_g_use_dhw_hhv__W']
                                 ], units_to_mathtext)

## Learn energy profile parameters

Most of the heavy lifting is done by the `learn_energy_profile()` function, which again uses the [GEKKO Python](https://machinelearning.byu.edu/) dynamic optimization toolkit.

In [None]:
# Reload all changed modules before executing code, so that edits to the imported
# project modules are picked up without restarting the kernel
%autoreload 2

# Hints passed to Learner.learn_energy_profile() via its `hints` argument.
# Most values are named constants (presumably provided by the `from nfh_utils import *` above — verify).
hints = {
    'heat_tr_building_cond__W_K_1': heat_tr_building_nl_avg__W_K_1,      # specific heat loss
    'th_inertia_building__h':       th_inertia_building_nl_avg__h,       # thermal inertia
    'aperture_sol__m2':             aperture_sol_nl_avg__m2,             # apparent solar aperture
    'wind_chill__K_s_m_1':          wind_chill_nl_avg__K_s_m_1,          # wind chill factor
    'aperture_inf__cm2':            aperture_inf_nl_avg__cm2,            # effective infiltration area
    'occupancy__p':                 occupancy_nl_avg__p,                 # house occupancy
    'heat_int__W_p_1':              heat_int_nl_avg__W_p_1,              # heat gain per occupant
    'eta_ch_hhv__W0':               eta_ch_nl_avg_hhv__W0,               # home heating efficiency of a gas boiler (based on higher heating value)
    'eta_dhw_hhv__W0':              eta_dhw_nl_avg_hhv__W0,              # domestic hot water efficiency
    'frac_remain_dhw__0':           frac_remain_dhw_nl_avg__0,           # fraction of domestic hot water heat contributing to heating the home
    'g_use_cooking_hhv__W':         g_use_cooking_nl_avg_hhv__W,         # gas power (higher heating value) for cooking
    'eta_cooking_hhv__W0':          eta_cooking_nl_avg_hhv__W0,          # cooking efficiency
    'frac_remain_cooking__0':       frac_remain_cooking_nl_avg__0,       # fraction of cooking heat contributing to heating the home
    'heat_tr_dist__W_K_1':          heat_tr_dist_nl_avg__W_K_1,          # heat dissipation capacity of the heat distribution system
    # NOTE(review): the source constant carries a `__W_K_1` suffix, while dividing by s_h_1 to obtain
    # Wh/K suggests it holds a value in J/K — confirm the unit of th_mass_dist_nl_avg__W_K_1 in nfh_utils
    'th_mass_dist__Wh_K_1':         (th_mass_dist_nl_avg__W_K_1/s_h_1),  # thermal mass of the heat distribution system
    'ventilation_default__dm3_s_1': 7.0,                                 # default ventilation rate for the entire home
    'ventilation_max__dm3_s_1_m_2': 1.0,                                 # maximum ventilation rate per m2 floor area
    'co2_outdoor__ppm':             co2_outdoor_eu_avg_2022__ppm,        # average CO₂ outdoor concentration
}

learn = [
    'heat_tr_building_cond__W_K_1',
    'th_inertia_building__h',
    'th_mass_building__Wh_K_1',
    'aperture_sol__m2',
    'aperture_inf__cm2',
    'ventilation__dm3_s_1', 
    # 'heat_tr_dist__W_K_1', 'th_mass_dist__J_K_1',
]


learned_properties = [f'learned_{prop}' for prop in learn]
# mae_learned_properties = [f'mae_{prop}' for prop in learn]
# rmse_learned_properties = [f'rse_{prop}' for prop in learn]

# select which property columns in df_prep are used as properties needed by the learning algorithm
property_sources = {
    'temp_indoor__degC':    'batch_import_remeha_temp_indoor__degC',
    'temp_outdoor__degC':   'batch_import_KNMI_temp_outdoor__degC',
    'wind__m_s_1':          'batch_import_KNMI_wind__m_s_1',
    'ghi__W_m_2':           'batch_import_KNMI_ghi__W_m_2', 
    'g_use_ch_hhv__W':      'calculated_g_use_ch_hhv__W',
    'eta_ch_hhv__W0':       'eta_ch_hhv__W0', 
    'g_use_dhw_hhv__W' :    'calculated_g_use_dhw_hhv__W',
    'e__W' :                'calculated_device_p1-reader_e__W',    
    'occupancy__p':         'device_living_room_occupancy__p',
    'co2_indoor__ppm':      'device_living_room_co2_indoor__ppm',
    # 'temp_sup_ch__degC':   'batch_import_remeha_temp_sup_ch__degC',
    # 'temp_ret_ch__degC':   'batch_import_remeha_temp_ret_ch__degC',
}

In [None]:
props = list(property_sources.values())

In [None]:
props

In [None]:
%%time 
df_home_bag_data = pd.read_excel(home_BAG_data_file_path,  index_col='id')

### Define (subsets of) learning periods and (subsets of) home ids to perform the learning on

In [None]:
# time mask for core of winter with most data
janfebmrt24_mask = ((df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-01-01 00:00:00+01:00')) 
                    & 
                    (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-04-01 00:00:00+02:00'))
                   )

In [None]:
# time mask for one week in winter with almost all ids having data
febwk3_mask = ((df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-02-18 00:00:00+01:00'))
               & 
               (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-02-25 00:00:00+01:00'))
              )


In [None]:
# time mask for February 2024: from 2024-02-01 00:00 (inclusive) to 2024-02-29 00:00 (exclusive),
# i.e. exactly 28 days (four whole weeks); 29 February itself is NOT included
# NOTE(review): confirm that excluding 29 February is intended
feb_mask = ((df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-02-01 00:00:00+01:00'))
            & 
            (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-02-29 00:00:00+01:00'))
           )


In [None]:
specific_id = 434931
specific_ids = [434931, 450298, 495906]

### Check presence of properties

In [None]:
Plot.plot_missing_data_overview(df_prep[janfebmrt24_mask], properties_include=props, freq='1W', title_fontsize=8)

In [None]:
df_prep[febwk3_mask][props].xs(specific_id, level='id', drop_level=False).describe().T

In [None]:
df_prep[feb_mask][props].xs(specific_id, level='id', drop_level=False).describe().T

In [None]:
df_prep[janfebmrt24_mask][props].xs(specific_id, level='id', drop_level=False).describe().T

In [None]:
Plot.dataframe_preprocessed_plot(df_prep[feb_mask][props].xs(specific_id, level='id', drop_level=False), units_to_mathtext)

### Use GEKKO model to learn the energy profile parameters

In [None]:
hints

In [None]:
th_inertia_dist_nl_avg__h = hints['th_mass_dist__Wh_K_1']  / hints['heat_tr_dist__W_K_1']
th_inertia_dist_nl_avg__h

In [None]:
learn

In [None]:
# learn the model parameters and write results to a dataframe
%autoreload 2
idx = pd.IndexSlice
df_results_per_period, df_results = Learner.learn_energy_profile(df_prep[febwk3_mask][props].loc[idx[[specific_id]], :],                                                                 
# df_results_per_period, df_results = Learner.learn_energy_profile(df_prep[feb_mask][props].loc[idx[specific_ids], :],                                                                 
# df_results_per_period, df_results = Learner.learn_energy_profile(df_prep[janfebmrt24_mask][props],
                                                                 property_sources = property_sources,
                                                                 df_building_data = df_home_bag_data,
                                                                 learn = learn, 
                                                                 hints = hints,
                                                                 learn_period__d=7,
                                                                 learn_change_interval__min = 30,
                                                                 req_col = list(property_sources.values()),
                                                                 ev_type = 2
                                                                )

In [None]:
%%time 
# Convert all datetime columns to timezone-naive
df_results_per_period_no_tz = df_results_per_period.apply(lambda x: x.dt.tz_localize(None) if x.dtype.kind == 'M' else x)

# Export to Excel
df_results_per_period_no_tz.to_excel(rhc_analysis_results_per_period_file)


In [None]:
%%time 
df_results.to_parquet(rhc_analysis_results_file, index=True, engine='pyarrow')

## Result visualization

### (optional) Read results

In [None]:
%%time

# Attempt to read the Parquet file
try:
    df_results = pd.read_parquet(
        rhc_analysis_results_file, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

In [None]:
%%time 
df_results_per_period = pd.read_excel(rhc_analysis_results_per_period_file,  index_col='id')

### Result per learning period

In [None]:
# Exclude the learned ventilation from the per-period summaries below
# (presumably because it is reported per timestamp in df_results rather than per learning period — verify).
# The membership test is done on learned_properties itself, so that re-running this cell
# does not raise a ValueError once the entry has already been removed.
if 'learned_ventilation__dm3_s_1' in learned_properties:
    learned_properties.remove('learned_ventilation__dm3_s_1')

In [None]:
df_results_per_period[learned_properties].describe().T

In [None]:
df_results_per_period[learned_properties].groupby('id').describe().T

In [None]:
for learned_prop in learned_properties:
    Plot.nfh_property_per_id_boxplot(df_results_per_period, property_col=learned_prop)

In [None]:
# show essential statistics for the errors; all homeperiods combined
df_results_per_period[['mae_temp_indoor__degC', 'rmse_temp_indoor__degC']].describe()

In [None]:
# show essential statistics for the errors; all periods per home
df_results_per_period[['mae_temp_indoor__degC', 'rmse_temp_indoor__degC']].groupby('id').describe().T

### Visualization of simulated indoor temperatures

In [None]:
df_results

In [None]:
df_results['sim_temp_indoor__degC'].groupby('id').count().to_frame().T

In [None]:
if 'ventilation__dm3_s_1' in learn:
    display(df_results['learned_ventilation__dm3_s_1'].groupby('id').count().to_frame().T)
    display(df_results['learned_ventilation__dm3_s_1'].groupby('id').describe().T)

In [None]:
df_results['learned_ventilation__dm3_s_1']

In [None]:
# Plot measured and simulated indoor temperatures for the single home selected by specific_id
Plot.dataframe_preprocessed_plot(df_results[['batch_import_remeha_temp_indoor__degC', 'sim_temp_indoor__degC']].xs(specific_id, level='id', drop_level=False), units_to_mathtext)
# Alternative: plot the same two properties for all ids
# Plot.dataframe_preprocessed_plot(df_results[['batch_import_remeha_temp_indoor__degC', 'sim_temp_indoor__degC']], units_to_mathtext)