# REDUCEDHEATCARB sanity check

In [None]:
import pandas as pd
import numpy as np

# Input: raw measurements, read at the start of this notebook
rhc_file_path='rhc_raw_measurements.parquet'
# Output: measurements after the sanity steps in this notebook (written, then read back)
rhc_file_write_path='rhc_sane_measurements.parquet'
# Output: properties of all homes combined, written at the end of this notebook
rhc_raw_properties_file_path='rhc_raw_properties.parquet'

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

import sys
sys.path.append('../data/')
sys.path.append('../view/')

%load_ext autoreload


%matplotlib inline
%matplotlib widget
import pylab as plt
import itertools
import gc

from tqdm.notebook import tqdm
from preprocessor import Preprocessor
from measurements import Measurements

In [None]:
%%time
# Read the raw measurements.
# A failed read is reported and then re-raised: every later cell needs `df`,
# so carrying on would only surface later as a NameError, or silently reuse a
# stale `df` left over from an earlier run in the same kernel.
try:
    df = pd.read_parquet(
        rhc_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
except Exception as e:
    print(f"Error reading file: {e}")
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
df.info()

In [None]:
df.index.dtypes

In [None]:
print(f"df.count().sum(): {df.count().sum():_}")

In [None]:
df

## Initial exploration: size, ids, start & stop times per id

In [None]:
# As it turns out, the original reducedheatcarb dataset (without Enelogic data added afterwards) does not contain duplicates.
# As the deduplication code below typically takes 1 - 2 minutes to run on a powerful server, it is commented out;
# you only need to deduplicate after adding data, e.g. after downloading more Enelogic data.

# %%time
# # deduplicate the measurements
# df = df.reset_index().drop_duplicates().set_index(['id', 'source_category', 'source_type', 'timestamp', 'property']).sort_index()

In [None]:
%%time
# First and last timestamp per id and source type
df_minmaxpersource = (
    df.reset_index()
    .groupby(by=['id', 'source_type'])['timestamp']
    .aggregate(['min', 'max'])
)

In [None]:
df_minmaxpersource

In [None]:
df.describe()

In [None]:
# First and last 'enelogic' timestamp per id and source category, earliest start first
(
    df.xs('enelogic', level='source_type', drop_level=False)
    .reset_index()
    .groupby(by=['id', 'source_category'])
    .agg(first_timestamp=('timestamp', 'min'), last_timestamp=('timestamp', 'max'))
    .sort_values(by='first_timestamp')
)

## Remove ids with only batch_import data from Remeha

In [None]:
# Number of measurements per id (rows) and source category (columns);
# NaN marks a combination that does not occur in the data.
df_size_per_id_and_source = df.groupby(['id', 'source_category']).size().unstack()

# Counts for the two categories an id needs data from in order to be kept.
# reindex() supplies a category that is absent from the data altogether as an
# all-NaN column (filled with 0 here) instead of failing with a KeyError.
live_counts = df_size_per_id_and_source.reindex(columns=['cloud_feed', 'device']).fillna(0)

# Select ids where both 'cloud_feed' and 'device' are not greater than 0
filtered_ids = live_counts[(live_counts <= 0).all(axis=1)].index.tolist()

print("Removing ids with only Remeha data, i.e. where both 'cloud_feed' and 'device' are not > 0:")
print(filtered_ids)

# Remove rows associated with ids in filtered_ids
df = df[~df.index.get_level_values('id').isin(filtered_ids)]

In [None]:
df.info()

In [None]:
print(f"df.count().sum(): {df.count().sum():_}")

In [None]:
# Distinct values found in each index level
print('#ids: ', df.index.unique(level='id').size)
print('ids: ', [*df.index.unique(level='id')])
print('source categories: ', [*df.index.unique(level='source_category')])
print('source types: ', [*df.index.unique(level='source_type')])
print('properties: ', [*df.index.unique(level='property')])

# Number of non-null values, followed by row counts per grouping
print('values: ', df.loc[:, 'value'].count())
print('#values per: \n', df.groupby(by=['source_type']).size())
print('#values per: \n', df.groupby(by=['source_category', 'source_type', 'property']).size())

In [None]:
# Number of distinct properties in the index
len(df.index.unique(level='property'))

In [None]:
print('#values per: \n', df.groupby(['source_category']).size())

In [None]:
# Distinct values of the 'property' level.
# index.unique(level=...) is the form the summary cells in this notebook use;
# it avoids building the full-length array that get_level_values() returns
# before de-duplicating it.
list(df.index.unique(level='property'))

## Remove trailing decimals to avoid problems with int conversion


In [None]:
# Strip a trailing '.0', '.00', ... from the textual values
df['value'] = df['value'].str.replace(pat=r'\.0+$', repl='', regex=True)

In [None]:
print(f"df.count().sum(): {df.count().sum():_}")

## Preprocessing categorical data

In [None]:
# Show the distinct raw values that occur for the 'gas_valve__str' property
print('unique gas_valve__str values: ', [*df.loc[df.index.get_level_values('property') == 'gas_valve__str', 'value'].unique()])


In [None]:
# Translation table for the 'gas_valve__str' property: raw value (as text) -> category name.
# It is passed to Preprocessor.encode_categorical_property_as_boolean_properties in the next cell.
gas_valve_categories = {
    '0': 'gas_valve_open',
    '1': 'gas_valve_closed',
    '2': 'gas_valve_off'
}


In [None]:
%%time
%autoreload 2
# Encode the categorical 'gas_valve__str' property using gas_valve_categories.
# NOTE(review): judging by the helper's name this yields one boolean property per category — confirm in preprocessor.py
df = Preprocessor.encode_categorical_property_as_boolean_properties(df, 'gas_valve__str', gas_valve_categories)

In [None]:
%%time
# Show the distinct raw values that occur for the 'boiler_status__str' property
print("Unique categories for 'boiler_status__str':", [*df.loc[df.index.get_level_values('property') == 'boiler_status__str', 'value'].unique()])



In [None]:
# Translation table for the 'boiler_status__str' property: raw value (as text) -> category name.
# It is passed to Preprocessor.encode_categorical_property_as_boolean_properties in the next cell.
# NOTE(review): there is no entry for '14' (the keys jump from '13' to '15') — confirm this is intentional.
boiler_status_categories = {
    '0': 'boiler_status_standby',
    '1': 'boiler_status_heat_demand',
    '2': 'boiler_status_burner_start',
    '3': 'boiler_status_burning_ch',
    '4': 'boiler_status_burning_dhw',
    '5': 'boiler_status_burner_stop',
    '6': 'boiler_status_pump_post_run',
    '7': 'boiler_status_cooling_active',
    '8': 'boiler_status_controlled_stop',
    '9': 'boiler_status_blocking_mode',
    '10': 'boiler_status_locking_mode',
    '11': 'boiler_status_cs_mode_l_ch',
    '12': 'boiler_status_cs_mode_h_ch',
    '13': 'boiler_status_cs_mode_hdhw',
    '15': 'boiler_status_manual_hd_ch_on',
    '16': 'boiler_status_boiler_frost_prot',
    '17': 'boiler_status_de_air',
    '18': 'boiler_status_cu_cooling',
    '19': 'boiler_status_reset_in_progress',
    '20': 'boiler_status_auto_filling',
    '21': 'boiler_status_halted',
    '22': 'boiler_status_forced_calibration',
    '23': 'boiler_status_factory_test',
    '200': 'boiler_status_device_mode',
    '254': 'boiler_status_unknown'
}

In [None]:
%%time
# Encode the categorical 'boiler_status__str' property using boiler_status_categories.
# NOTE(review): judging by the helper's name this yields one boolean property per category — confirm in preprocessor.py
df = Preprocessor.encode_categorical_property_as_boolean_properties(df, 'boiler_status__str', boiler_status_categories)

In [None]:
df.index.unique(level='id').values

In [None]:
# Distinct values found in each index level
print('#ids: ', df.index.unique(level='id').size)
print('ids: ', [*df.index.unique(level='id')])
print('source categories: ', [*df.index.unique(level='source_category')])
print('source types: ', [*df.index.unique(level='source_type')])
print('properties: ', [*df.index.unique(level='property')])

# Number of non-null values, followed by row counts per grouping
print('values: ', df.loc[:, 'value'].count())
print('#values per: \n', df.groupby(by=['source_type']).size())
print('#values per: \n', df.groupby(by=['source_category', 'source_type', 'property']).size())

In [None]:
print(f"df.count().sum(): {df.count().sum():_}")

## Write sane measurements to parquet file(s)

In [None]:
%%time 
df.to_parquet(rhc_file_write_path, index=True, engine='pyarrow')

In [None]:
%%time
# Read the file that was just written back into `df`.
# A failed read is reported and then re-raised: swallowing it would leave the
# previous in-memory `df` in place, so the following cells would appear to
# succeed without the written file ever having been read.
try:
    df = pd.read_parquet(
        rhc_file_write_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
except Exception as e:
    print(f"Error reading file: {e}")
    raise
else:
    print("File was successfully read without specifying compression codec.")

In [None]:
print(f"df.count().sum(): {df.count().sum():_}")

### Write sane measurements per home to parquet files

In [None]:
# One entry per distinct id in the index
homes = [*df.index.unique(level='id')]
# homes = [*df.index.unique(level='id')][:2]

In [None]:
homes

In [None]:
%%time 
# Write the measurements of each home to its own parquet file.
# The loop variable is named home_id rather than id: a module-level `id`
# would keep shadowing the builtin id() for the rest of the notebook session.
for home_id in tqdm(homes):
    df.xs(home_id, drop_level=False).to_parquet(f'{home_id}_sane_measurements.parquet', index=True, engine='pyarrow')

## Put properties in separate columns, apply types and write parquet file(s)

In [None]:
# unstacking might take a lot of memory, hence do it home by home. example: unstacking entire Twomes dataset uses 32 GB memory
# drop the reference to the full DataFrame and ask the garbage collector to run before continuing
del df
gc.collect()

### Writing raw properties per home to a parquet file

In [None]:
homes

In [None]:
# Target dtype per property name; passed to Measurements.to_properties_with_source_category_and_type below.
# NOTE(review): not every category in gas_valve_categories / boiler_status_categories has a matching
# '<category>__bool' entry here (e.g. no 'gas_valve_off__bool', no 'boiler_status_unknown__bool') — confirm this is intended.
property_types = {
    'battery_voltage__V': 'float32',
    'boiler_status_blocking_mode__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_burner_start__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_burner_stop__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_burning_ch__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_burning_dhw__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_controlled_stop__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_de_air__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_heat_demand__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_locking_mode__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_pump_post_run__bool': pd.BooleanDtype(),  # Nullable boolean type
    'boiler_status_standby__bool': pd.BooleanDtype(),  # Nullable boolean type
    'ch_set_fan_rotations_max__min_1': 'Int16',  # Adjusted to Int16 based on the range
    'ch_set_fan_rotations_min__min_1': 'Int16',  # Adjusted to Int16 based on the range
    'ch_water_pump_speed__0': 'float32',  # Adjusted to float32
    'co2__ppm': 'float32',
    'dhw_flow__l_min_1': 'float32',  # Adjusted to float32
    'dhw_temp_out__degC': 'float32',
    'meter_code__str': pd.StringDtype(),
    'dsmr_version__0': 'float32',
    'e_ret_cum__kWh': 'float64',
    'e_ret_hi_cum__kWh': 'float64',
    'e_ret_lo_cum__kWh': 'float64',
    'e_ret_monthly_hi_cum__kWh': 'float64',
    'e_ret_monthly_lo_cum__kWh': 'float64',
    'e_use_cum__kWh': 'float64',
    'e_use_hi_cum__kWh': 'float64',
    'e_use_lo_cum__kWh': 'float64',
    'e_use_monthly_hi_cum__kWh': 'float64',
    'e_use_monthly_lo_cum__kWh': 'float64',
    'fan_rotations__min_1': 'Int16',  # Adjusted to Int16 based on the range
    'ghi__W_m_2': 'float32',                      
    'g_use_ch_lhv_cum__kWh': 'float64',
    'g_use_cum__m3': 'float64',
    'g_use_monthly_cum__m3': 'float64',
    'g_use_dhw_lhv_cum__kWh': 'float64',
    'gas_valve_closed__bool': pd.BooleanDtype(),  # Nullable boolean type
    'gas_valve_open__bool': pd.BooleanDtype(),  # Nullable boolean type
    'heartbeat__0': 'Int8',
    'occupancy__p': 'Int8',
    'onboarded__p': 'Int8',
    'power_ch_max__kW': 'float32',  # Adjusted to float32
    'rel_humidity__0': 'float32',
    'temp_ch_sup_max__degC': 'float32',
    'temp_in__degC': 'float32',
    'temp_out__degC': 'float32',
    'temp_ret__degC': 'float32',
    'temp_set__degC': 'float32',
    'temp_sup__degC': 'float32',
    'wind__m_s_1': 'float32'                              
}


In [None]:
%%time

# Convert the measurements to properties one home at a time, write each home's
# properties to its own parquet file, and combine all homes into df_prop.
#
# The per-home frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop copies all previously accumulated rows
# again on every iteration.
# The loop variable is named home_id rather than id, so the builtin id() is not
# shadowed for the rest of the notebook session.
prop_frames = []

for home_id in tqdm(homes):
    df_home = pd.read_parquet(f'{home_id}_sane_measurements.parquet', engine='pyarrow', dtype_backend='numpy_nullable')

    df_prop_home = Measurements.to_properties_with_source_category_and_type(
        df_home,
        property_types
    )
    df_prop_home.to_parquet(f'{home_id}_raw_properties.parquet', index=True, engine='pyarrow')
    prop_frames.append(df_prop_home)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no homes
df_prop = pd.concat(prop_frames) if prop_frames else pd.DataFrame()
del prop_frames  # drop the extra references to the per-home frames

if not df_prop.index.is_monotonic_increasing:
    df_prop = df_prop.sort_index()

In [None]:
# Summary statistics over all columns; print only the 'min' and 'max' rows, one line per column
prop_describe = df_prop.describe(include='all')
prop_min_max = prop_describe.loc[['min', 'max'], :]
print(prop_min_max.transpose())

In [None]:
# Summary statistics of the rows whose source type is 'KNMI', one line per column
df_prop.loc[df_prop.index.get_level_values('source_type') == 'KNMI'].describe().transpose()


In [None]:
df_prop.describe().T

In [None]:
# Cast every '__bool' column to float, then show its summary statistics formatted as percentages
df_bools_to_float = df_prop[[name for name in df_prop.columns if name.endswith('__bool')]].astype('float')
df_bools_to_float.describe().transpose().drop(columns='count').style.format("{:.2%}")

In [None]:
print(f"size: {df_prop.size:_}")

In [None]:
df_prop.info()

In [None]:
df_prop.index.dtypes

In [None]:
df_prop

### Writing raw properties to a parquet file

In [None]:
%%time 
df_prop.to_parquet(rhc_raw_properties_file_path, index=True, engine='pyarrow')