# Processing REDUCEDHEATCARB data for heat distribution system model check

In [1]:
import pandas as pd
from tqdm.notebook import tqdm

rhc_preprocessed_poperties_file='rhc_preprocessed_properties.parquet'
rhc_heat_dist_preprocessed_poperties_file='rhc_heat_dist_preprocessed_properties.parquet'
%load_ext autoreload

%matplotlib inline
%matplotlib widget

import sys
sys.path.append('../analysis')
from nfh_utils import *

## Reading preprocessed interpolated properties from a parquet file

In [2]:
%%time


# Attempt to read the Parquet file
try:
    df_prep = pd.read_parquet(
        rhc_preprocessed_poperties_file, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

#sorting the DataFrame index is needed to get good performance on certain filters
#this guarding code to check whether DataFrames are properly sorted
if not df_prep.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prep = df_prep.sort_index()  

File was successfully read without specifying compression codec.
CPU times: user 1.22 s, sys: 882 ms, total: 2.1 s
Wall time: 1.18 s


In [3]:
# Total number of non-null values over all columns of df_prep, with "_" as thousands separator.
print("df_prep.count().sum(): " + format(df_prep.count().sum(), "_"))

df_prep.count().sum(): 38_913_996


In [4]:
# Summarise df_prep: index range, column names and dtypes.
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1916540 entries, (401632, Timestamp('2023-04-01 00:00:00+0200', tz='Europe/Amsterdam')) to (495906, Timestamp('2024-04-04 01:00:00+0200', tz='Europe/Amsterdam'))
Data columns (total 78 columns):
 #   Column                                                   Dtype  
---  ------                                                   -----  
 0   batch_import_KNMI_ghi__W_m_2                             Float32
 1   batch_import_KNMI_temp_out__degC                         Float32
 2   batch_import_remeha_temp_out__degC                       Float32
 3   batch_import_KNMI_wind__m_s_1                            Float32
 4   batch_import_enelogic_e_ret_monthly_hi_cum__kWh          Float64
 5   batch_import_enelogic_e_ret_monthly_lo_cum__kWh          Float64
 6   batch_import_enelogic_e_use_monthly_hi_cum__kWh          Float64
 7   batch_import_enelogic_e_use_monthly_lo_cum__kWh          Float64
 8   batch_import_enelogic_g_use_monthly_cum__m3       

In [5]:
# Print the sorted column names of df_prep as a bracketed list of quoted names, one per line.
# The pieces are concatenated instead of being passed as separate print() arguments:
# print() joins its arguments with a space, which put a stray space inside the first
# and the last quoted column name.
print("[\n'" + "', \n'".join(sorted(df_prep.columns)) + "'\n]")

[
' batch_import_KNMI_ghi__W_m_2', 
'batch_import_KNMI_temp_out__degC', 
'batch_import_KNMI_wind__m_s_1', 
'batch_import_enelogic_e_ret_monthly_hi__W', 
'batch_import_enelogic_e_ret_monthly_hi_cum__kWh', 
'batch_import_enelogic_e_ret_monthly_lo__W', 
'batch_import_enelogic_e_ret_monthly_lo_cum__kWh', 
'batch_import_enelogic_e_use_monthly_hi__W', 
'batch_import_enelogic_e_use_monthly_hi_cum__kWh', 
'batch_import_enelogic_e_use_monthly_lo__W', 
'batch_import_enelogic_e_use_monthly_lo_cum__kWh', 
'batch_import_enelogic_g_use_monthly_cum__m3', 
'batch_import_enelogic_g_use_monthly_hhv__W', 
'batch_import_remeha_boiler_status_blocking_mode__bool', 
'batch_import_remeha_boiler_status_burner_start__bool', 
'batch_import_remeha_boiler_status_burner_stop__bool', 
'batch_import_remeha_boiler_status_burning_ch__bool', 
'batch_import_remeha_boiler_status_burning_dhw__bool', 
'batch_import_remeha_boiler_status_controlled_stop__bool', 
'batch_import_remeha_boiler_status_de_air__bool', 
'batch_import

## Filtering values immediately relevant for heat distribution system modelling

In [6]:
# Column names of df_prep that are kept for heat distribution system modelling;
# this list is used to select the df_heat_dist subset.
heat_distribution_props = [
    'batch_import_KNMI_ghi__W_m_2', 
    'batch_import_KNMI_temp_out__degC', 
    'batch_import_KNMI_wind__m_s_1', 
    'device_p1-reader_g_use_hhv__W', 
    'batch_import_remeha_boiler_status_burning_ch__bool',
    'batch_import_remeha_gas_valve_closed__bool', 
    'batch_import_remeha_gas_valve_open__bool', 
    'batch_import_remeha_g_use_ch_lhv__W',
    'batch_import_remeha_temp_set__degC', 
    'batch_import_remeha_temp_in__degC', 
    'device_living_room_calibrated_temp_in__degC', 
    'batch_import_remeha_temp_sup__degC', 
    'batch_import_remeha_temp_ret__degC' 
]

In [7]:
# Independent copy of the selected columns, so later assignments do not touch df_prep.
df_heat_dist = df_prep.loc[:, heat_distribution_props].copy()

In [8]:
# Total number of non-null values over all columns of df_heat_dist, with "_" as thousands separator.
print("df_heat_dist.count().sum(): " + format(df_heat_dist.count().sum(), "_"))

df_heat_dist.count().sum(): 5_907_999


In [9]:
# Summarise df_heat_dist: index range, column names and dtypes of the selected subset.
df_heat_dist.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1916540 entries, (401632, Timestamp('2023-04-01 00:00:00+0200', tz='Europe/Amsterdam')) to (495906, Timestamp('2024-04-04 01:00:00+0200', tz='Europe/Amsterdam'))
Data columns (total 13 columns):
 #   Column                                              Dtype  
---  ------                                              -----  
 0   batch_import_KNMI_ghi__W_m_2                        Float32
 1   batch_import_KNMI_temp_out__degC                    Float32
 2   batch_import_KNMI_wind__m_s_1                       Float32
 3   device_p1-reader_g_use_hhv__W                       Float64
 4   batch_import_remeha_boiler_status_burning_ch__bool  boolean
 5   batch_import_remeha_gas_valve_closed__bool          boolean
 6   batch_import_remeha_gas_valve_open__bool            boolean
 7   batch_import_remeha_g_use_ch_lhv__W                 Float64
 8   batch_import_remeha_temp_set__degC                  Float32
 9   batch_import_remeha_temp_in__degC  

## Calculating additional values relevant for heat distribution system modelling

In [10]:
%%time

# Filter out rows where either of the columns has NaN
valid_rows = df_heat_dist[['batch_import_remeha_temp_sup__degC', 'batch_import_remeha_temp_ret__degC']].notna().all(axis=1)

# Calculate the mean only for valid rows
df_heat_dist.loc[valid_rows, 'calculated_temp_rad__degC'] = df_heat_dist.loc[valid_rows, ['batch_import_remeha_temp_sup__degC', 'batch_import_remeha_temp_ret__degC']].mean(axis=1)

# Calculate Q_gain_ch__W based on national average efficiency
df_heat_dist['calculated_Q_gain_ch__W'] = df_heat_dist['batch_import_remeha_g_use_ch_lhv__W'] *  g_groningen_hhv___MJ_m_3 /  g_groningen_lhv___MJ_m_3 * eta_ch_nl_avg_hhv__J0

CPU times: user 1 s, sys: 89.9 ms, total: 1.09 s
Wall time: 979 ms


In [11]:
# Non-null count per column, now including the two calculated columns.
df_heat_dist.count()

batch_import_KNMI_ghi__W_m_2                          727112
batch_import_KNMI_temp_out__degC                      731300
batch_import_KNMI_wind__m_s_1                         730706
device_p1-reader_g_use_hhv__W                         361654
batch_import_remeha_boiler_status_burning_ch__bool    370548
batch_import_remeha_gas_valve_closed__bool            360192
batch_import_remeha_gas_valve_open__bool              360192
batch_import_remeha_g_use_ch_lhv__W                   427895
batch_import_remeha_temp_set__degC                    363192
batch_import_remeha_temp_in__degC                     369083
device_living_room_calibrated_temp_in__degC           365992
batch_import_remeha_temp_sup__degC                    370548
batch_import_remeha_temp_ret__degC                    369585
calculated_temp_rad__degC                             369585
calculated_Q_gain_ch__W                               427895
dtype: int64

In [12]:
# NOTE(review): this repeats the df_heat_dist.count() of the previous cell unchanged;
# candidate for removal.
df_heat_dist.count()

batch_import_KNMI_ghi__W_m_2                          727112
batch_import_KNMI_temp_out__degC                      731300
batch_import_KNMI_wind__m_s_1                         730706
device_p1-reader_g_use_hhv__W                         361654
batch_import_remeha_boiler_status_burning_ch__bool    370548
batch_import_remeha_gas_valve_closed__bool            360192
batch_import_remeha_gas_valve_open__bool              360192
batch_import_remeha_g_use_ch_lhv__W                   427895
batch_import_remeha_temp_set__degC                    363192
batch_import_remeha_temp_in__degC                     369083
device_living_room_calibrated_temp_in__degC           365992
batch_import_remeha_temp_sup__degC                    370548
batch_import_remeha_temp_ret__degC                    369585
calculated_temp_rad__degC                             369585
calculated_Q_gain_ch__W                               427895
dtype: int64

In [13]:
# Summary statistics per column, transposed so that each column becomes one row.
df_heat_dist.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
batch_import_KNMI_ghi__W_m_2,727112.0,47.510342,94.626961,0.0,0.0,0.0,52.242214,691.650757
batch_import_KNMI_temp_out__degC,731300.0,6.66037,4.116151,-8.733984,4.141963,7.592608,9.609388,18.942167
batch_import_KNMI_wind__m_s_1,730706.0,4.817799,2.657396,0.00075,2.909,4.32976,6.248661,21.418507
device_p1-reader_g_use_hhv__W,361654.0,1763.329544,3208.873868,0.0,0.0,29.308333,2432.591667,69753.833333
batch_import_remeha_g_use_ch_lhv__W,427895.0,722.75857,2426.335966,0.0,0.0,0.0,0.0,26400.0
batch_import_remeha_temp_set__degC,363192.0,17.584879,3.248365,6.0,16.0,18.200001,20.0,30.0
batch_import_remeha_temp_in__degC,369083.0,18.995916,1.536783,9.296,17.942001,19.040001,20.083557,27.432001
device_living_room_calibrated_temp_in__degC,365992.0,18.928291,1.501862,12.846417,17.840815,18.862556,20.092236,30.146008
batch_import_remeha_temp_sup__degC,370548.0,38.680077,12.919238,10.368417,27.280001,38.579998,48.392002,88.480003
batch_import_remeha_temp_ret__degC,369585.0,36.111893,12.517842,14.21,25.362,34.09,45.765999,89.094002


In [14]:
# Display df_heat_dist; the notebook renders a truncated view of the first and last rows.
df_heat_dist

Unnamed: 0_level_0,Unnamed: 1_level_0,batch_import_KNMI_ghi__W_m_2,batch_import_KNMI_temp_out__degC,batch_import_KNMI_wind__m_s_1,device_p1-reader_g_use_hhv__W,batch_import_remeha_boiler_status_burning_ch__bool,batch_import_remeha_gas_valve_closed__bool,batch_import_remeha_gas_valve_open__bool,batch_import_remeha_g_use_ch_lhv__W,batch_import_remeha_temp_set__degC,batch_import_remeha_temp_in__degC,device_living_room_calibrated_temp_in__degC,batch_import_remeha_temp_sup__degC,batch_import_remeha_temp_ret__degC,calculated_temp_rad__degC,calculated_Q_gain_ch__W
id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
401632,2023-04-01 00:00:00+02:00,,,,,,,,,,,,,,,
401632,2023-04-01 00:05:00+02:00,,,,,,,,,,,,,,,
401632,2023-04-01 00:10:00+02:00,,,,,,,,,,,,,,,
401632,2023-04-01 00:15:00+02:00,,,,,,,,,,,,,,,
401632,2023-04-01 00:20:00+02:00,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495906,2024-04-04 00:40:00+02:00,0.0,10.248547,5.110382,,,,,,,,,,,,
495906,2024-04-04 00:45:00+02:00,0.0,10.178299,5.050424,,,,,,,,,,,,
495906,2024-04-04 00:50:00+02:00,0.0,10.108052,4.990466,,,,,,,,,,,,
495906,2024-04-04 00:55:00+02:00,0.0,10.037806,4.930508,,,,,,,,,,,,


In [15]:
%%time 
df_heat_dist.to_parquet(rhc_heat_dist_preprocessed_poperties_file, index=True, engine='pyarrow')

CPU times: user 800 ms, sys: 62.9 ms, total: 862 ms
Wall time: 793 ms


In [16]:
# %%time 
# # Optional export: uncomment this entire block of code to write one zipped CSV file per home id
# for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()):
#     df_heat_dist.xs(home_id, drop_level=False).to_csv(
#         f'{home_id}_preprocessed_properties.zip',
#         encoding='utf-8',
#         compression= dict(method='zip',
#                           archive_name=f'{home_id}_preprocessed_properties.csv'),
#         date_format='%Y-%m-%dT%H:%M:%S%z'
#     )
