# Processing REDUCEDHEATCARB data for heat distribution system model check

In [None]:
import pandas as pd
from tqdm.notebook import tqdm

# Parquet file names used in this notebook:
# the first is read as input, the second is written with the heat distribution subset.
rhc_preprocessed_poperties_file = "rhc_preprocessed_properties.parquet"
rhc_heat_dist_preprocessed_poperties_file = "rhc_heat_dist_preprocessed_properties.parquet"
%load_ext autoreload

%matplotlib inline
%matplotlib widget

import sys
sys.path.append('../analysis')
from nfh_utils import *

## Reading preprocessed interpolated properties from a parquet file

In [None]:
%%time


# Read the preprocessed properties from the Parquet file (no compression codec is passed).
try:
    df_prep = pd.read_parquet(
        rhc_preprocessed_poperties_file,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    # Report the problem, then re-raise: without df_prep the rest of the notebook
    # cannot run, and swallowing the error would only surface later as a NameError.
    print(f"Error reading file: {e}")
    raise
else:
    print("File was successfully read without specifying compression codec.")

# Sorting the DataFrame index is needed to get good performance on certain filters.
# This guard checks whether the index is sorted and sorts it only when necessary.
if not df_prep.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prep = df_prep.sort_index()

In [None]:
# Total number of non-null values over all columns, printed with `_` as thousands separator.
df_prep_non_null_total = df_prep.count().sum()
print("df_prep.count().sum(): {:_}".format(df_prep_non_null_total))

In [None]:
# Print a summary of df_prep (index, columns, dtypes, non-null counts, memory usage).
df_prep.info()

In [None]:
# Print the sorted column names as a quoted, bracketed list that can be copied into code.
# The text is built as a single string: passing several arguments to print() inserts
# its default separator (a space), which put a stray space inside the first and the
# last quoted column name.
quoted_columns = ", \n".join(f"'{column}'" for column in sorted(df_prep.columns))
print(f"[\n{quoted_columns}\n]")

## Filtering values immediately relevant for heat distribution system modelling

In [None]:
# Columns selected from df_prep for the heat distribution system model check.
# The order of this list determines the column order of the selected DataFrame.
heat_distribution_props = [
    "batch_import_KNMI_ghi__W_m_2",
    "batch_import_KNMI_temp_out__degC",
    "batch_import_KNMI_wind__m_s_1",
    "device_p1-reader_g_use_hhv__W",
    "batch_import_remeha_boiler_status_burning_ch__bool",
    "batch_import_remeha_gas_valve_closed__bool",
    "batch_import_remeha_gas_valve_open__bool",
    "batch_import_remeha_g_use_ch_lhv__W",
    "batch_import_remeha_temp_set__degC",
    "batch_import_remeha_temp_in__degC",
    "device_living_room_calibrated_temp_in__degC",
    "batch_import_remeha_temp_sup__degC",
    "batch_import_remeha_temp_ret__degC",
]

In [None]:
# Take an independent copy of the selected columns so that columns added to
# df_heat_dist later do not touch df_prep.
df_heat_dist = df_prep.loc[:, heat_distribution_props].copy()

In [None]:
# Total number of non-null values over all columns, printed with `_` as thousands separator.
df_heat_dist_non_null_total = df_heat_dist.count().sum()
print("df_heat_dist.count().sum(): {:_}".format(df_heat_dist_non_null_total))

In [None]:
# Print a summary of df_heat_dist (index, columns, dtypes, non-null counts, memory usage).
df_heat_dist.info()

## Calculating additional values relevant for heat distribution system modelling

In [None]:
%%time

# The two temperature columns that are averaged below.
temp_sup_ret_cols = [
    'batch_import_remeha_temp_sup__degC',
    'batch_import_remeha_temp_ret__degC',
]

# Boolean mask: True only for rows where both temperature columns hold a value.
valid_rows = df_heat_dist[temp_sup_ret_cols].notna().all(axis=1)

# Row-wise mean of the two columns, assigned only where both values are present;
# all other rows are left without a value in the new column.
df_heat_dist.loc[valid_rows, 'calculated_temp_rad__degC'] = (
    df_heat_dist.loc[valid_rows, temp_sup_ret_cols].mean(axis=1)
)

# Q_gain_ch__W: scale the lhv-based value by the hhv/lhv ratio, then apply the
# national average efficiency (same left-to-right evaluation order as before).
df_heat_dist['calculated_Q_gain_ch__W'] = (
    df_heat_dist['batch_import_remeha_g_use_ch_lhv__W']
    * g_groningen_hhv___MJ_m_3
    / g_groningen_lhv___MJ_m_3
    * eta_ch_nl_avg_hhv__J0
)

In [None]:
# Show the number of non-null values per column of df_heat_dist.
df_heat_dist.count()

In [None]:
# NOTE(review): this cell repeats the per-column non-null count of the previous cell
# with no change to df_heat_dist in between — likely a leftover; consider removing it.
df_heat_dist.count()

In [None]:
# Descriptive statistics, transposed so that each column of df_heat_dist becomes a row.
df_heat_dist.describe().transpose()

In [None]:
# Show df_heat_dist as the cell output for a visual check.
df_heat_dist

In [None]:
%%time 
# Write df_heat_dist, including its index, to the output Parquet file.
df_heat_dist.to_parquet(
    rhc_heat_dist_preprocessed_poperties_file,
    engine='pyarrow',
    index=True,
)

In [None]:
# %%time 
# # Optional: uncomment this entire block of code to also write one zipped CSV file per home id
# for home_id in tqdm(df_heat_dist.index.get_level_values('id').unique()):
#     df_heat_dist.xs(home_id, drop_level=False).to_csv(
#         f'{home_id}_preprocessed_properties.zip',
#         encoding='utf-8',
#         compression= dict(method='zip',
#                           archive_name=f'{home_id}_preprocessed_properties.csv'),
#         date_format='%Y-%m-%dT%H:%M:%S%z'
#     )
