# Processing REDUCEDHEATARB data for heat distribution system model check

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
# Files needed as input
# rhc_preprocessed_poperties_file='rhc_preprocessed_properties_intv_5_min.parquet'
rhc_preprocessed_poperties_file='rhc_preprocessed_properties_intv_1_min.parquet'
home_data_file_path = "home_data.parquet"
boiler_returntemp_load_efficiency_file_path = "boiler_returntemp_load_efficiency.parquet"
boiler_returntemp_efficiency_file_path = "boiler_returntemp_efficiency.parquet"

# Files written as output
rhc_heat_dist_preprocessed_poperties_file='rhc_heat_dist_preprocessed_properties.parquet'

%load_ext autoreload

import matplotlib.pyplot as plt

%matplotlib inline
%matplotlib widget

import sys
sys.path.append('../analysis')
from nfh_utils import *

## Reading preprocessed interpolated properties from a parquet file

In [None]:
%%time


# Read the preprocessed, interpolated properties from the Parquet file.
try:
    df_prep = pd.read_parquet(
        rhc_preprocessed_poperties_file,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise instead of carrying on: without a freshly read df_prep the code
    # below fails with a NameError, or silently reuses a stale df_prep that is
    # still in the kernel from an earlier run.
    raise

# A sorted index is needed to get good performance on certain filters;
# this guard sorts the DataFrame only when its index is not already sorted.
if not df_prep.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prep = df_prep.sort_index()

In [None]:
print(f"df_prep.count().sum(): {df_prep.count().sum():_}")

In [None]:
df_prep.info()

In [None]:
print("[\n'","', \n'".join(sorted(df_prep.columns)),"'\n]")

## Filtering values immediately relevant for heat distribution system modelling

In [None]:
# Columns selected from df_prep for the heat distribution system model check.
# The order of this list is the column order of df_heat_dist.
heat_distribution_props = [
    # KNMI batch imports
    'batch_import_KNMI_ghi__W_m_2',
    'batch_import_KNMI_temp_out__degC',
    'batch_import_KNMI_wind__m_s_1',
    # P1 reader
    'device_p1-reader_g_use_hhv__W',
    # Remeha batch imports: boiler status, gas valve, fan and gas use
    'batch_import_remeha_boiler_status_burning_ch__bool',
    'batch_import_remeha_gas_valve_closed__bool',
    'batch_import_remeha_gas_valve_open__bool',
    'batch_import_remeha_fan_rotations__min_1',
    'batch_import_remeha_ch_set_fan_rotations_min__min_1',
    'batch_import_remeha_ch_set_fan_rotations_max__min_1',
    'batch_import_remeha_g_use_ch_lhv__W',
    # Temperatures (Remeha batch imports and living room device)
    'batch_import_remeha_temp_set__degC',
    'batch_import_remeha_temp_in__degC',
    'device_living_room_calibrated_temp_in__degC',
    'batch_import_remeha_temp_sup__degC',
    'batch_import_remeha_temp_ch_sup_max__degC',
    'batch_import_remeha_temp_ret__degC',
]

In [None]:
df_heat_dist = df_prep[heat_distribution_props].copy()

In [None]:
print(f"df_heat_dist.count().sum(): {df_heat_dist.count().sum():_}")

In [None]:
df_heat_dist.info()

## Calculating additional values relevant for heat distribution system modelling

In [None]:
%%time

# Store the mean of supply and return temperature as calculated_temp_rad__degC,
# but only for rows in which both temperatures are present; all other rows are
# left without a value.
sup_ret_cols = ['batch_import_remeha_temp_sup__degC', 'batch_import_remeha_temp_ret__degC']
valid_rows = df_heat_dist[sup_ret_cols].notna().all(axis=1)
df_heat_dist.loc[valid_rows, 'calculated_temp_rad__degC'] = (
    df_heat_dist.loc[valid_rows, sup_ret_cols].mean(axis=1)
)

## Reading boiler data

In [None]:
%%time
# Read the per-home data from the Parquet file.
try:
    df_homes = pd.read_parquet(
        home_data_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise: later cells need df_homes, and continuing would either fail
    # with a NameError or silently use a stale df_homes left in the kernel.
    raise

In [None]:
df_homes.info()

In [None]:
%%time
# Read the boiler efficiency table (return temperature and load variant).
try:
    df_boiler_efficiency = pd.read_parquet(
        # boiler_returntemp_efficiency_file_path,
        boiler_returntemp_load_efficiency_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise: continuing would either fail with a NameError in the next cell
    # or silently use a stale df_boiler_efficiency left in the kernel.
    raise

In [None]:
df_boiler_efficiency

## Calculating heat production

In [None]:
# # Calculate Q_gain_ch__W based on national average efficiency
# df_heat_dist['calculated_Q_gain_ch__W'] = df_heat_dist['batch_import_remeha_g_use_ch_lhv__W'] *  g_groningen_hhv___MJ_m_3 /  g_groningen_lhv___MJ_m_3 * eta_ch_nl_avg_hhv__J0

In [None]:
g_groningen_hhv___MJ_m_3

In [None]:
g_groningen_lhv___MJ_m_3

In [None]:
%%time
# Round return temperatures to whole degrees
# NOTE(review): this column is not part of the column selection applied after
# the merge with df_homes further down, and it is recomputed (as Int8) in the
# boiler load calculation cell, so this assignment looks redundant; confirm.
df_heat_dist['rounded_temp_ret__degC'] = df_heat_dist['batch_import_remeha_temp_ret__degC'].round()

In [None]:
df_heat_dist.info()

In [None]:
df_heat_dist.index.dtypes

In [None]:
df_heat_dist.count()

In [None]:
df_heat_dist.groupby(level='id').count().T

In [None]:
# Distinct configured minimum / maximum CH fan rotation settings per home,
# taken from rows without any missing value; dropna() and groupby() are
# evaluated once and reused for both columns.
complete_rows_by_id = df_heat_dist.dropna().groupby(level='id')
pd.concat(
    [
        complete_rows_by_id['batch_import_remeha_ch_set_fan_rotations_min__min_1'].unique(),
        complete_rows_by_id['batch_import_remeha_ch_set_fan_rotations_max__min_1'].unique(),
    ],
    axis=1,
).sort_values(by='batch_import_remeha_ch_set_fan_rotations_max__min_1', ascending=True)

In [None]:
df_heat_dist.groupby(level='id')['batch_import_remeha_ch_set_fan_rotations_max__min_1'].unique()

In [None]:
# Attach the columns of df_homes to every measurement row of the same id.
# reset_index() makes 'id' and 'timestamp' ordinary columns for the merge;
# set_index() restores the (id, timestamp) index afterwards.
# NOTE: merge() uses an inner join by default, so rows whose id does not occur
# in df_homes are dropped. This cell overwrites df_heat_dist, so it should be
# run exactly once per kernel session.
df_heat_dist = (
    df_heat_dist
    .reset_index()
    .merge(df_homes, on='id')
    .set_index(['id', 'timestamp'])
)

In [None]:
df_heat_dist.info()

In [None]:
df_heat_dist.count()

In [None]:
# Keep the measurement columns plus the boiler parameters that came in through
# the merge with df_homes; every column not listed here is dropped.
df_heat_dist = df_heat_dist[['batch_import_KNMI_ghi__W_m_2',
                             'batch_import_KNMI_temp_out__degC',
                             'batch_import_KNMI_wind__m_s_1',
                             'device_p1-reader_g_use_hhv__W',
                             'batch_import_remeha_boiler_status_burning_ch__bool',
                             'batch_import_remeha_gas_valve_closed__bool',
                             'batch_import_remeha_gas_valve_open__bool',
                             'batch_import_remeha_fan_rotations__min_1',
                             'batch_import_remeha_ch_set_fan_rotations_min__min_1',
                             'batch_import_remeha_ch_set_fan_rotations_max__min_1',
                             'boiler_default_min_fan_rotations__min_1',
                             'boiler_default_max_ch_fan_rotations__min_1',
                             'batch_import_remeha_g_use_ch_lhv__W',
                             'batch_import_remeha_temp_set__degC',
                             'batch_import_remeha_temp_in__degC',
                             'device_living_room_calibrated_temp_in__degC',
                             'batch_import_remeha_temp_sup__degC',
                             'batch_import_remeha_temp_ret__degC',
                             'batch_import_remeha_temp_ch_sup_max__degC',
                             # Computed earlier in this notebook as the mean of supply and
                             # return temperature. It used to be missing from this list, so
                             # it was dropped here and never reached the output file.
                             'calculated_temp_rad__degC',
                             'min_fan_rotations__min_1',
                             'max_ch_fan_rotations__min_1',
                             'power_ch_min__kW',
                             'power_ch_max__kW']]

In [None]:
# Translation table: the keys are the identifiers used in the formulas below,
# the values are the corresponding column names.
remeha2nfh = dict([
    ('parHeFanRpmChMax', 'batch_import_remeha_ch_set_fan_rotations_max__min_1'),
    ('parHeFanRpmMin', 'batch_import_remeha_ch_set_fan_rotations_min__min_1'),
    ('parHePowerMax', 'power_ch_max__kW'),
    ('parHePowerMin', 'power_ch_min__kW'),
    ('parHeFanRpmMin.Minimum', 'min_fan_rotations__min_1'),
    ('parHeFanRpmChMax.Maximum', 'max_ch_fan_rotations__min_1'),
    ('varHeFanRpm', 'batch_import_remeha_fan_rotations__min_1'),
    ('varApPowerActual', 'batch_import_remeha_g_use_ch_and_dhw__kW0'),
])


In [None]:
%%time
# Input columns used by the formulas below.
p_ch_min__kW = df_heat_dist[remeha2nfh['parHePowerMin']]
p_ch_max__kW = df_heat_dist[remeha2nfh['parHePowerMax']]
fan_rpm = df_heat_dist[remeha2nfh['varHeFanRpm']]
fan_rpm_lower_limit = df_heat_dist[remeha2nfh['parHeFanRpmMin.Minimum']]
fan_rpm_upper_limit = df_heat_dist[remeha2nfh['parHeFanRpmChMax.Maximum']]
fan_rpm_span = fan_rpm_upper_limit - fan_rpm_lower_limit

# Power difference (max - min) divided by the fan speed range
df_heat_dist.loc[:, 'calculated_boiler_ramp_kW_min'] = (p_ch_max__kW - p_ch_min__kW) / fan_rpm_span

# Create lookup value for boiler load: the position of the actual fan speed
# within the fan speed range, as a percentage rounded to a whole number.
# NOTE: this value is negative whenever the fan speed is below the lower limit.
df_heat_dist.loc[:, 'calculated_p_load__kW%'] = (
    (fan_rpm - fan_rpm_lower_limit) / fan_rpm_span * 100
).round().astype('Int16')

# Return temperature rounded to whole degrees, used as efficiency lookup key
df_heat_dist.loc[:, 'rounded_temp_ret__degC'] = df_heat_dist['batch_import_remeha_temp_ret__degC'].round().astype('Int8')

# NOTE(review): the term (p_ch_min__kW - fan_rpm_lower_limit) subtracts a fan
# speed from a power, and the ramp (already divided by the fan speed range) is
# divided by that range once more here and in the rpm_direct formula below.
# Left unchanged because the intended formula cannot be derived from this
# notebook alone; confirm before relying on these columns.
df_heat_dist.loc[:, 'calculated_g_use_min_lhv__kW'] = (
    p_ch_min__kW
    + df_heat_dist['calculated_boiler_ramp_kW_min'] * (p_ch_min__kW - fan_rpm_lower_limit) / fan_rpm_span
)

df_heat_dist.loc[:, 'calculated_g_use_ch_lhv__kW'] = (
    df_heat_dist['calculated_g_use_min_lhv__kW']
    + df_heat_dist['calculated_p_load__kW%']/100 * df_heat_dist['calculated_boiler_ramp_kW_min']
)

# Set the calculated gas use for central heating to zero when the boiler was not burning for central heating.
# (Previously this statement zeroed 'calculated_g_use_min_lhv__kW' by mistake, so the
# central heating gas use column, and the HHV column derived from it below, were never zeroed.)
df_heat_dist.loc[df_heat_dist['batch_import_remeha_boiler_status_burning_ch__bool'] == False, 'calculated_g_use_ch_lhv__kW'] = 0

# Same quantity, calculated from the fan speed directly instead of via the rounded load percentage
df_heat_dist.loc[:, 'calculated_g_use_ch_lhv_rpm_direct__kW'] = (
    p_ch_min__kW
    + df_heat_dist['calculated_boiler_ramp_kW_min'] * (fan_rpm - fan_rpm_lower_limit) / fan_rpm_span
)

# Set the calculated gas use for central heating to zero when the boiler was not burning for central heating
df_heat_dist.loc[df_heat_dist['batch_import_remeha_boiler_status_burning_ch__bool'] == False, 'calculated_g_use_ch_lhv_rpm_direct__kW'] = 0

# Convert both variants from lower to higher heating value
df_heat_dist.loc[:, 'calculated_g_use_ch_hhv__kW'] = df_heat_dist['calculated_g_use_ch_lhv__kW'] / g_groningen_lhv___MJ_m_3 * g_groningen_hhv___MJ_m_3

df_heat_dist.loc[:, 'calculated_g_use_ch_hhv_rpm_direct__kW'] = df_heat_dist['calculated_g_use_ch_lhv_rpm_direct__kW'] / g_groningen_lhv___MJ_m_3 * g_groningen_hhv___MJ_m_3

In [None]:
df_heat_dist.info()

In [None]:
# Per-home overview of boiler parameters and calculated load / gas use,
# restricted to samples where the boiler was burning for central heating.
burning_for_ch = df_heat_dist['batch_import_remeha_boiler_status_burning_ch__bool'] == True
min_mean_max = ['min', 'mean', 'max']
df_heat_dist[burning_for_ch].groupby(level='id').agg({
    'power_ch_min__kW': 'mean',
    'power_ch_max__kW': 'mean',
    'min_fan_rotations__min_1': 'mean',
    'batch_import_remeha_ch_set_fan_rotations_min__min_1': 'mean',
    'boiler_default_min_fan_rotations__min_1': 'mean',
    'boiler_default_max_ch_fan_rotations__min_1': 'mean',
    'batch_import_remeha_ch_set_fan_rotations_max__min_1': 'mean',
    'max_ch_fan_rotations__min_1': 'mean',
    'batch_import_remeha_fan_rotations__min_1': min_mean_max,
    'calculated_boiler_ramp_kW_min': 'mean',
    'calculated_p_load__kW%': min_mean_max,
    'calculated_g_use_ch_lhv__kW': min_mean_max,
    'calculated_g_use_ch_lhv_rpm_direct__kW': min_mean_max,
    'calculated_g_use_ch_hhv__kW': min_mean_max,
    'calculated_g_use_ch_hhv_rpm_direct__kW': min_mean_max,
}).T

### The load-based calculation yields negative values for `calculated_p_load__kW%`, which I don't trust, so revert to looking up efficiency by return temperature only

In [None]:
%%time
# Read the boiler efficiency table (return temperature only variant).
try:
    df_boiler_efficiency = pd.read_parquet(
        boiler_returntemp_efficiency_file_path,
        # boiler_returntemp_load_efficiency_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise: if this read fails, df_boiler_efficiency would still hold the
    # table loaded earlier from the other efficiency file, and the efficiency
    # lookup below would silently run against the wrong table.
    raise

In [None]:
%%time
# Extract id and rounded_temp_ret__degC values
id_values = df_heat_dist.index.get_level_values('id')
temp_ret_values = df_heat_dist['rounded_temp_ret__degC']

# Lookup efficiency vectorized
def lookup_efficiency_vectorized(id_vals, temp_vals, df_efficiency, df_homes):
    """Look up 'eta_ch_hhv__W0' for every (home id, temperature) pair.

    Parameters
    ----------
    id_vals : index-like
        Home ids, one per sample. Every id must be present in the index of
        ``df_homes``; otherwise the ``.loc`` lookup raises a ``KeyError``.
    temp_vals : array-like
        Temperature lookup keys, one per sample, same length as ``id_vals``.
    df_efficiency : pandas.DataFrame
        Table with a column 'eta_ch_hhv__W0' and a two-level index of
        (brand_model, temperature). The index must be unique.
    df_homes : pandas.DataFrame
        Table indexed by home id with a column 'brand_model'.

    Returns
    -------
    numpy.ndarray
        float64 array with one efficiency per sample; NaN where the
        (brand_model, temperature) combination is not in ``df_efficiency``.
    """
    brand_models = df_homes.loc[id_vals, 'brand_model'].to_numpy()

    # Resolve all keys in one index operation instead of one Python-level
    # .loc lookup per sample; keys missing from the table come back as NaN.
    lookup_keys = pd.MultiIndex.from_arrays([brand_models, temp_vals])
    efficiency = df_efficiency['eta_ch_hhv__W0'].reindex(lookup_keys)

    return efficiency.to_numpy(dtype='float64', na_value=np.nan)


# Apply the vectorized lookup
df_heat_dist['eta_ch_hhv__W0'] = lookup_efficiency_vectorized(id_values, temp_ret_values, df_boiler_efficiency, df_homes)

In [None]:
# Heat gain from central heating: the LHV-based gas power column is multiplied
# by the hhv/lhv ratio of the two g_groningen_* constants and by the looked-up
# efficiency column.
# NOTE(review): the g_groningen_* constants are not defined in this notebook;
# presumably they come from the nfh_utils star import — confirm.
df_heat_dist['calculated_Q_gain_ch__W'] = df_heat_dist['batch_import_remeha_g_use_ch_lhv__W'] *  g_groningen_hhv___MJ_m_3 /  g_groningen_lhv___MJ_m_3 * df_heat_dist['eta_ch_hhv__W0']

In [None]:
list(df_heat_dist.columns)

In [None]:
df_heat_dist.describe().T

In [None]:
df_heat_dist

## Writing results to parquet file

In [None]:
%%time 
df_heat_dist.to_parquet(rhc_heat_dist_preprocessed_poperties_file, index=True, engine='pyarrow')

## Overview of efficiencies in practice

In [None]:
df_heat_dist[df_heat_dist['batch_import_remeha_boiler_status_burning_ch__bool'] == True]['eta_ch_hhv__W0'].mean()

In [None]:
# Per-home means of the looked-up efficiency and of the return, maximum supply
# and supply temperatures, for samples where the boiler was burning for central
# heating; homes with the highest mean efficiency come first.
burning_for_ch = df_heat_dist['batch_import_remeha_boiler_status_burning_ch__bool'] == True
mean_of_columns = dict.fromkeys(
    [
        'eta_ch_hhv__W0',
        'batch_import_remeha_temp_ret__degC',
        'batch_import_remeha_temp_ch_sup_max__degC',
        'batch_import_remeha_temp_sup__degC',
    ],
    'mean',
)
(
    df_heat_dist[burning_for_ch]
    .groupby(level='id')
    .agg(mean_of_columns)
    .sort_values(by='eta_ch_hhv__W0', ascending=False)
)

In [None]:
# Create a boxplot of the looked-up efficiency per home, for samples where the
# boiler was burning for central heating, with homes ordered by descending
# mean efficiency.

# Step 1: Filter the DataFrame to include only rows where the boiler is burning
df_filtered = df_heat_dist[df_heat_dist['batch_import_remeha_boiler_status_burning_ch__bool'] == True]

# Step 2: Calculate the mean efficiency per id and sort in descending order
mean_per_id = df_filtered.groupby(level='id')['eta_ch_hhv__W0'].mean().sort_values(ascending=False)

# Step 3: Extract 'id' and 'eta_ch_hhv__W0' into a new DataFrame and drop missing values
df_boxplot = df_filtered.reset_index()[['id', 'eta_ch_hhv__W0']].dropna()

# Step 4: Convert 'id' to a categorical type based on the sorted 'id' values
df_boxplot['id'] = pd.Categorical(df_boxplot['id'], categories=mean_per_id.index, ordered=True)
df_boxplot = df_boxplot.sort_values('id')

# Step 5: Group by 'id' and collect the 'eta_ch_hhv__W0' values.
# observed=True restricts the result to ids that still have rows after
# dropna(); otherwise ids whose efficiencies are all missing can show up as
# non-list placeholder entries that boxplot cannot draw (and recent pandas
# versions warn about the implicit default).
grouped = df_boxplot.groupby('id', observed=True)['eta_ch_hhv__W0'].apply(list)

# Step 6: Create a list of lists for the boxplot (one list per id, in category order)
data = grouped.tolist()

# Step 7: Create the boxplot using matplotlib
plt.figure(figsize=(12, 6))
plt.boxplot(data, labels=grouped.index)
plt.title('Efficiency per id (Sorted by High Average Efficiency)')
plt.xlabel('ID')
plt.ylabel('eta_ch_hhv__W0')
plt.xticks(rotation=45)  # Rotate x labels if needed
plt.show()


## Writing results to multiple zipped CSV files

In [None]:
# %%time 
# # uncomment this entire block of code to enable it 
# for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()):
#     df_heat_dist.xs(home_id, drop_level=False).to_csv(
#         f'{home_id}_heat_dist_preprocessed_properties.zip',
#         encoding='utf-8',
#         compression= dict(method='zip',
#                           archive_name=f'{home_id}_heat_dist_preprocessed_properties.csv'),
#         date_format='%Y-%m-%dT%H:%M:%S%z'
#     )
