# Preprocessing REDUCEDHEATCARB data

In [None]:
import pandas as pd
import numpy as np
import pylab as plt
from tqdm.notebook import tqdm

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

import sys
sys.path.append('../data/')
sys.path.append('../view/')
sys.path.append('../analysis/')

rhc_raw_properties_file='rhc_raw_properties.parquet'

home_data_file_path = "home_data.parquet"
household_data_excel_file_path="household_properties_detailed.xlsx"
boiler_returntemp_load_efficiency_file_path = "boiler_returntemp_load_efficiency.parquet"

# Interval (in minutes) used for the temporal interpolation, together with a file name that encodes that interval.
# NOTE(review): 'poperties' in the variable name below looks like a misspelling of 'properties';
# it is left unchanged because cells that are not visible here may reference this exact name.
# NOTE(review): presumably this is the file the preprocessed result is written to — confirm.
interpolate__min=1 # interpolation to 1 minute intervals (takes more space & time!)
rhc_preprocessed_poperties_file='rhc_preprocessed_properties_intv_1_min.parquet'

# Alternative configuration: swap the two lines above for the two lines below to use 5 minute intervals
# interpolate__min=5 # default interpolation to 5 minute intervals
# rhc_preprocessed_poperties_file='rhc_preprocessed_properties_intv_5_min.parquet'


%load_ext autoreload

%matplotlib inline
%matplotlib widget

from preprocessor import Preprocessor
from plotter import Plot
from nfh_utils import *

meta_df = None


### Load Measured Data Properties from parquet file

In [None]:
%%time
# Prerequisite: for this example to work, you need to have the b4b_raw_properties.parquet, located in the ../data/ folder.
# One way to get this is to run NeedForHeatExtractionBackup.ipynb, REDUCEDHEATCARB_data_merge.ipynb and REDUCEDHEATCARB_sanity_check.ipynb first,
# but then you have to run this code on the energietransitiewindesheim.nl server

# Attempt to read the Parquet file
try:
    df_prop = pd.read_parquet(
        rhc_raw_properties_file, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

#sorting the DataFrame index is needed to get good performance on certain filters
#this guarding code to check whether DataFrames are properly sorted
if not df_prop.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prop = df_prop.sort_index()  

In [None]:
print(f"df_prop.count().sum(): {df_prop.count().sum():_}")

In [None]:
df_prop.index.unique(level='id').values

In [None]:
df_prop.index.unique(level='source_category').values

In [None]:
df_prop.index.unique(level='source_type').values

In [None]:
%%time
# Rename long source type names
# Maps the firmware-style names found in the 'source_type' index level to short labels;
# index values that are not listed as keys are left unchanged by rename().
rename_dict = {
    'twomes-co2-occupancy-scd41-m5coreink-firmware': 'living_room',
    'twomes-p1-reader-firmware': 'p1-reader'
}

# Only the 'source_type' level of the MultiIndex is renamed
df_prop = df_prop.rename(index=rename_dict, level='source_type')


In [None]:
df_prop.index.unique(level='source_type').values

In [None]:
df_prop

In [None]:
print(f"df_prop.count().sum(): {df_prop.count().sum():_}")

In [None]:
df_prop.info()

## Optional filter to try out new preprocessing code on a smaller subsample

In [None]:
# mask to filter a small sample
# Boolean mask over the rows of df_prop: a single id (403603) and the timestamps from
# 2024-02-23 00:00 (inclusive) up to 2024-02-26 00:00 (exclusive), both at UTC offset +01:00.
small_test__mask = (
    # alternative id that can be swapped in for the line below:
    # (df_prep.index.get_level_values('id') == 483173)
    (df_prop.index.get_level_values('id') == 403603)
    & 
    (df_prop.index.get_level_values('timestamp') >= pd.to_datetime('2024-02-23 00:00:00+01:00'))
    & 
    (df_prop.index.get_level_values('timestamp') < pd.to_datetime('2024-02-26 00:00:00+01:00'))
)

In [None]:
# # uncomment the line below to enable the filter (comment it out again to disable it)
# df_prop = df_prop[small_test__mask]

In [None]:
df_prop.info()

## Inspecting and preprocessing properties

In [None]:
df_prop.describe().T

In [None]:
# Summarise all '__bool' columns as fractions: cast them to float in one go so that
# describe() reports mean/std/quantiles, then render every statistic as a percentage.
bool_cols = [name for name in df_prop.columns if name.endswith('__bool')]
df_bools_to_float = df_prop[bool_cols].astype('float')
(
    df_bools_to_float
    .describe()
    .T
    .drop(columns='count')
    .style
    .format("{:.2%}")
)

### Preprocessing temp_outdoor__degC

In [None]:
prop = 'temp_outdoor__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
df_prop

In [None]:
%matplotlib inline
%matplotlib widget
# Group by 'id' to plot each ID's data separately
grouped = df_prop.groupby('id')[prop]

# Plotting
plt.figure(figsize=(10, 6))
for id_val, group_data in grouped:
    if not group_data.empty and not group_data.eq(0).all():
        group_data.plot.hist(bins=200, alpha=0.5, label=f'ID {id_val}')

plt.title(prop)
plt.xlabel(prop)
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=-28.0, max=40.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop

In [None]:
# df_prop['remeha_temp_outdoor__degC'].dropna().to_frame(name='remeha_temp_outdoor__degC').unstack(level='id').dropna(axis=1, how='all') # batch_import_remeha_temp_outdoor__degC
df_remeha_temp_outdoor__degC = df_prop.loc[(slice(None), slice(None), 'remeha', slice(None)), 'temp_outdoor__degC'].dropna().unstack(level='id').dropna(axis=1, how='all') # batch_import_remeha_temp_outdoor__degC


In [None]:
df_remeha_temp_outdoor__degC.count()

In [None]:
if not df_remeha_temp_outdoor__degC.empty:
    # Descriptive statistics of the differences between consecutive index values
    # (presumably timestamps — confirm) of the non-null readings in column 458000.
    # display() is required here: Jupyter only echoes a bare expression when it is the last
    # top-level statement of a cell, not when it is nested inside an `if` block.
    display(
        pd.Series(
            df_remeha_temp_outdoor__degC
            .xs('remeha', level='source_type')
            .xs('batch_import', level='source_category')[458000]
            .dropna()
            .to_frame()
            .index
            .diff()
        ).describe()
    )


In [None]:
if not df_remeha_temp_outdoor__degC.empty:
    # Descriptive statistics of the differences between consecutive index values
    # (presumably timestamps — confirm) of the non-null readings in column 478667.
    # display() is required here: Jupyter only echoes a bare expression when it is the last
    # top-level statement of a cell, not when it is nested inside an `if` block.
    display(
        pd.Series(
            df_remeha_temp_outdoor__degC
            .xs('remeha', level='source_type')
            .xs('batch_import', level='source_category')[478667]
            .dropna()
            .to_frame()
            .index
            .diff()
        ).describe()
    )


In [None]:
if not df_remeha_temp_outdoor__degC.empty:
    # Show the non-null values of column 458000 in index order; display() is needed because
    # an expression nested inside an `if` block is not echoed by Jupyter.
    display(df_remeha_temp_outdoor__degC[458000].dropna().sort_index())

In [None]:
if not df_remeha_temp_outdoor__degC.empty:
    # Show the non-null values of column 478667 in index order; display() is needed because
    # an expression nested inside an `if` block is not echoed by Jupyter.
    display(df_remeha_temp_outdoor__degC[478667].dropna().sort_index())

### Additional weather interpolation checks


In [None]:
%%time
# Attempt to read the Parquet file
# NOTE(review): rhc_raw_properties_file_path is not defined in the visible part of this notebook
# (the first cell defines rhc_raw_properties_file). If the name is indeed undefined, the resulting
# NameError is caught by the `except Exception` below, only an error message is printed, and
# df_prop silently keeps the contents produced by the preceding cells.
# NOTE(review): if the name is corrected, this cell reloads the raw data and thereby discards the
# source_type renaming and the temp_outdoor__degC filtering done above — confirm which is intended.
try:
    df_prop = pd.read_parquet(
        rhc_raw_properties_file_path, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

# Sorting the DataFrame index is needed to get good performance on certain filters;
# this guard checks whether the DataFrame is properly sorted and sorts it only when needed.
if not df_prop.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prop = df_prop.sort_index()  

In [None]:
weather_locations_path = "weather_interpolation_locations.parquet"

In [None]:
%%time
# read df_weather_locations from Parquet file
try:
    df_weather_locations = pd.read_parquet(
        weather_locations_path, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

In [None]:
df_weather_locations

In [None]:
# Collect the negative wind speeds among the rows with source_category 'batch_import'
# and source_type 'KNMI'; only the wind column itself is kept.
knmi_batch_import__mask = (
    (df_prop.index.get_level_values('source_category') == 'batch_import')
    & (df_prop.index.get_level_values('source_type') == 'KNMI')
)
negative_wind__mask = df_prop['wind__m_s_1'] < 0
suspect_df_prop_wind = df_prop.loc[knmi_batch_import__mask & negative_wind__mask, ['wind__m_s_1']]

In [None]:
# Number of suspect (negative) wind values versus all non-null wind values
suspect_wind = suspect_df_prop_wind.count().item()
total_wind = df_prop['wind__m_s_1'].count().item()
# Guard against ZeroDivisionError when the column holds no non-null values at all (e.g. on a small subsample)
fraction_wind = suspect_wind / total_wind if total_wind else float('nan')
print(f"wind: suspect {suspect_wind:_}; total: {total_wind:_}; fraction: {fraction_wind:%}")

In [None]:
suspect_df_prop_wind.index.get_level_values('id').unique()

In [None]:
suspect_wind_cells = list(df_weather_locations.loc[suspect_df_prop_wind.index.get_level_values('id').unique()]['weather_H3_cell_id'].unique())

In [None]:
suspect_wind_cells

In [None]:
suspect_df_prop_wind.describe().T

In [None]:
# Collect the negative global horizontal irradiance values among the rows with
# source_category 'batch_import' and source_type 'KNMI'; only the GHI column itself is kept.
knmi_batch_import__mask = (
    (df_prop.index.get_level_values('source_category') == 'batch_import')
    & (df_prop.index.get_level_values('source_type') == 'KNMI')
)
negative_sun__mask = df_prop['sol_ghi__W_m_2'] < 0
suspect_df_prop_sun = df_prop.loc[knmi_batch_import__mask & negative_sun__mask, ['sol_ghi__W_m_2']]

In [None]:
# Number of suspect (negative) GHI values versus all non-null GHI values
suspect_sun = suspect_df_prop_sun.count().item()
total_sun = df_prop['sol_ghi__W_m_2'].count().item()
# Guard against ZeroDivisionError when the column holds no non-null values at all (e.g. on a small subsample)
fraction_sun = suspect_sun / total_sun if total_sun else float('nan')
print(f"sun: suspect {suspect_sun:_}; total: {total_sun:_}; fraction: {fraction_sun:%}")

In [None]:
suspect_df_prop_sun.index.get_level_values('id').unique()

In [None]:
suspect_sun_cells = list(df_weather_locations.loc[suspect_df_prop_sun.index.get_level_values('id').unique()]['weather_H3_cell_id'].unique())

In [None]:
suspect_df_prop_sun.describe().T

### Preprocessing wind__m_s_1


In [None]:
prop = 'wind__m_s_1'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
# Group by 'id' to plot each ID's data separately
grouped = df_prop.groupby('id')[prop]

# Plotting
plt.figure(figsize=(10, 6))
for id_val, group_data in grouped:
    if not group_data.empty and not group_data.eq(0).all():
        group_data.plot.hist(bins=200, alpha=0.5, label=f'ID {id_val}')

plt.title(prop)
plt.xlabel(prop)
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Ensure that geospatial interpolation did not cause negative wind speed values
df_prop[prop] = df_prop[prop].clip(lower=0)

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

### Preprocessing sol_ghi__W_m_2

In [None]:
prop = 'sol_ghi__W_m_2'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
# Group by 'id' to plot each ID's data separately
grouped = df_prop.groupby('id')[prop]

# Plotting
plt.figure(figsize=(10, 6))
for id_val, group_data in grouped:
    if not group_data.empty and not group_data.eq(0).all():
        group_data.plot.hist(bins=200, alpha=0.5, label=f'ID {id_val}')

plt.title(prop)
plt.xlabel(prop)
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Ensure that geospatial interpolation did not cause negative GHI values
df_prop[prop] = df_prop[prop].clip(lower=0)

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

### Preprocessing temp_indoor__degC

In [None]:
prop = 'temp_indoor__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=0.0, max=40.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].info()

In [None]:
df_prop[prop].groupby(level='id').count()

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[['temp_indoor__degC', 'temp_set__degC']].plot.hist(bins=200, alpha=0.5, title = 'temp_indoor__degC, temp_set__degC')

#### Preprocess indoor temperatures per source_type

In [None]:
df_prop[prop].groupby(level='source_type', observed=True).describe()

In [None]:
%matplotlib inline
%matplotlib widget
source_types = df_prop.index.get_level_values('source_type').unique()
valid_source_types = [st for st in source_types if not df_prop.xs(st, level='source_type')[prop].dropna().empty]

# Create the plot
plt.figure(figsize=(10, 6))

for source_type in valid_source_types:
    # Filter DataFrame for the current source_type and drop NaN values
    df_filtered = df_prop.xs(source_type, level='source_type')[prop].dropna()
    
    # Plot the histogram using matplotlib directly
    plt.hist(df_filtered, bins=200, alpha=0.5, density=True, label=source_type)

plt.legend()
plt.show()

In [None]:
source_types

In [None]:
df_prop.index.dtypes

In [None]:
df_prop.info()

In [None]:
%autoreload 2 
df_calibration_factors = Preprocessor.compute_calibration_factors(df_prop, prop, 'living_room', 'remeha', 20)

In [None]:
df_calibration_factors.set_index('id').T

In [None]:
pd.DataFrame(df_calibration_factors.mean()).T

In [None]:
%%time
df_prop, metadata = Preprocessor.create_calibrated_property(df_prop, prop, 'living_room', 'remeha', 20)        


In [None]:
df_prop[prop].groupby(level='source_type', observed=True).describe()

In [None]:
metadata

In [None]:
%matplotlib inline
%matplotlib widget
source_types = df_prop.index.get_level_values('source_type').unique()
valid_source_types = [st for st in source_types if not df_prop.xs(st, level='source_type')[prop].dropna().empty]

# Create the plot
plt.figure(figsize=(10, 6))

for source_type in valid_source_types:
    # Filter DataFrame for the current source_type and drop NaN values
    df_filtered = df_prop.xs(source_type, level='source_type')[prop].dropna()
    
    # Plot the histogram using matplotlib directly
    plt.hist(df_filtered, bins=200, alpha=0.5, density=True, label=source_type)

plt.legend()
plt.show()

### Preprocessing dhw_temp__degC

In [None]:
prop = 'dhw_temp__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=0.0, max=100.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

### Preprocessing dhw_flow__l_min_1

In [None]:
prop = 'dhw_flow__l_min_1'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop[prop].info()

In [None]:
df_prop[prop].groupby(level='id').describe().style.map(Preprocessor.highlight_zero, subset=['std'])

In [None]:
df_prop, meta_df = Preprocessor.filter_id_prop_with_std_zero(df_prop, prop, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].groupby(level='id').describe().style.map(Preprocessor.highlight_zero, subset=['std'])

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop_filtered = df_prop[prop][df_prop[prop] != 0]

In [None]:
df_prop_filtered.info()

In [None]:
df_prop_filtered.describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop_filtered.plot.hist(bins=200, alpha=0.5, title = prop)

### Preprocessing flow and return temperatures

#### Preprocessing temp_flow__degC

In [None]:
prop = 'temp_flow__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=0.0, max=100.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].describe().T

#### Preprocessing temp_ret__degC

In [None]:
prop = 'temp_ret__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=0.0, max=100.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].describe().T

### Copying and filtering flow and return temperatures valid for CH


In [None]:
%autoreload 2
df_prop = Preprocessor.add_filtered_flow_ret_ch_temperatures(df_prop)

In [None]:
df_prop.info()

In [None]:
df_prop[['temp_flow__degC', 'temp_flow_ch__degC', 'temp_ret__degC', 'temp_ret_ch__degC']].describe().T

In [None]:
df_prop['temp_flow_ch__degC'].count()/df_prop['temp_flow__degC'].count()

In [None]:
df_prop['temp_ret_ch__degC'].count()/df_prop['temp_ret__degC'].count()

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[['temp_flow_ch__degC', 'temp_flow__degC']].plot.hist(bins=200, alpha=0.5, title = 'temp_flow_ch__degC, temp_flow__degC')

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[['temp_ret_ch__degC', 'temp_ret__degC']].plot.hist(bins=200, alpha=0.5, title = 'temp_ret_ch__degC, temp_ret__degC')

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[['temp_ret_ch__degC', 'temp_flow_ch__degC']].plot.hist(bins=200, alpha=0.5, title = 'temp_flow_ch__degC, temp_ret_ch__degC')

In [None]:
df_prop[['temp_flow__degC', 'temp_flow_ch__degC', 'temp_ret__degC', 'temp_ret_ch__degC']].groupby('id').describe().T

## Inspecting and preprocessing parameters, i.e. constant (?) properties


### Preprocessing temp_flow_ch_max__degC (f.k.a. temp_ch_sup_max__degC)

In [None]:
df_prop['temp_flow_ch_max__degC'].groupby(level='id').describe().T

### Inspecting power_ch_max__kW

In [None]:
prop = 'power_ch_max__kW'

In [None]:
df_prop[prop].groupby(level='id').describe().T

## Inspecting and preprocessing smart meter values

In [None]:
df_prop['meter_code__str'].unique()

In [None]:
df_prop['dsmr_version__0'].unique()

In [None]:
df_prop.groupby(['id', 'meter_code__str', 'dsmr_version__0']).size().reset_index(name='count').set_index(['id', 'meter_code__str', 'dsmr_version__0'])

In [None]:
use_e_meter_cols = ['e_use_hi_cum__kWh', 'e_use_lo_cum__kWh']
ret_e_meter_cols = ['e_ret_hi_cum__kWh', 'e_ret_lo_cum__kWh']
all_e_meter_cols = use_e_meter_cols + ret_e_meter_cols

In [None]:
props = all_e_meter_cols + ['dsmr_version__0']

In [None]:
df_prop['dsmr_version__0'].unique()

In [None]:
df_prop[props].describe()

In [None]:
df_prop[props].groupby(level='id').describe().T

In [None]:
# %%time
# df_prop, meta_df = Preprocessor.filter_electricity_meter_values_fast(df_prop, min_valid_cum__kWh=15.0, meta_df=meta_df)

In [None]:
%%time
df_prop, meta_df = Preprocessor.filter_electricity_meter_values(df_prop, min_valid_cum__kWh=15.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop['dsmr_version__0'].unique()

In [None]:
df_prop[props].groupby(level='id').describe().T

#### Inspecting and preprocessing g_use_cum__m3

In [None]:
prop = 'g_use_cum__m3'

In [None]:
df_prop[prop].describe()

In [None]:
df_prop[prop].groupby(level='id').describe().sort_values(by='std', ascending=False).T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=400, alpha=0.5, title = prop)

#### Inspecting and preprocessing meter_code__str

In [None]:
prop = 'meter_code__str'

In [None]:
df_prop[prop].unique()

In [None]:
df_metercodes = df_prop.groupby(['id', 'meter_code__str', 'dsmr_version__0']).size().reset_index(name='count').set_index(['id', 'meter_code__str', 'dsmr_version__0'])

In [None]:
df_metercodes

In [None]:
id_lists = (df_metercodes
            .sort_values(by='dsmr_version__0')
            .reset_index()
            .groupby('dsmr_version__0')['id']
             .apply(list))

In [None]:
id_lists

In [None]:
# Print the full content of the Series
for key, value in id_lists.items():
    print(f"dsmr_version: {key:.1f}, {len(value)} ids: {sorted(value)}")

# Inspect number of non-null measurements and time covered before interpolation

In [None]:
%%time
count_non_null_before_interpolation = Preprocessor.count_non_null_measurements(df_prop).sort_index(axis=1).sort_values(by='total', ascending=False)

In [None]:
count_non_null_before_interpolation.T.style.map(Preprocessor.highlight_zero)

In [None]:
%%time
covered_time_before_interpolation = Preprocessor.calculate_covered_time(df_prop).sort_index(axis=1).sort_values(by='total', ascending=False)

In [None]:
# Show the covered time in whole days with zero values highlighted; when the column
# 'remeha_temp_outdoor__degC' is present, sort by it (descending) before transposing.
covered_time_overview = covered_time_before_interpolation
if 'remeha_temp_outdoor__degC' in covered_time_overview.columns:
    covered_time_overview = covered_time_overview.sort_values(by='remeha_temp_outdoor__degC', ascending=False)

display(
    covered_time_overview
    .T
    .style
    .map(Preprocessor.highlight_zero)
    .format(lambda x: f'{x.days} d')
)

## Plotting data used in analysis

In [None]:
# %%time

# #Plot all properties with a single unit for a single id
# Plot.dataframe_properties_plot(df_prop.loc[[948634, 999169]][[prop for prop in df_prop.columns.values if prop.split('__')[-1] in['ppm', '0', 'bool']]], units_to_mathtext)

In [None]:
# props = [prop for prop in df_prop.columns.values if prop.split('__')[-1] in['ppm', '0', 'bool', 'p']]
props = ['co2_indoor__ppm', 'occupancy__p', 'onboarded__p']

In [None]:
props

In [None]:
df_prop.info()

In [None]:
list(df_prop.columns)

In [None]:
df = df_prop[props] 

In [None]:
df.describe().T


In [None]:
# Descriptive statistics of the selected properties per (source_category, source_type, id) group.
# After the transpose, the groups form the columns and the (property, statistic) pairs form the rows.
stats = df.groupby(level=['source_category', 'source_type','id'], observed=True).describe().transpose()
# Swap the first two column levels so that source_type comes before source_category ...
stats.columns = stats.columns.swaplevel(0,1)
# ... and order the columns by that (new) first level
stats = stats.sort_index(axis=1, level=0)
stats

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'co2_indoor__ppm'
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'flow_dstr_pump_speed__pct'
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'dhw_flow__l_min_1'
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

## Preprocessing co2_indoor__ppm
Filtering out measurement errors: values below 5 ppm (including readings of 0 ppm)

In [None]:
prop = 'co2_indoor__ppm'

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop[prop].groupby(level=['id']).describe().T

#### Filter out values below 5 ppm (which would be clear measurement errors)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=5, meta_df=meta_df)

In [None]:
meta_df

#### Check to see whether minimum changed

In [None]:
df_prop[prop].groupby(level=['id']).describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)


## Preprocessing occupancy__p

In [None]:
df_household = pd.read_excel(household_data_excel_file_path,index_col='id')
df_prop = df_prop.join(df_household['household__p'], on='id')

In [None]:
df_prop[['household__p', 'onboarded__p', 'occupancy__p']].describe().T

In [None]:
df_prop[['household__p', 'onboarded__p', 'occupancy__p']].groupby('id').describe().T

In [None]:
%%time
# Remove invalid occupancy__p data as long as not everybody is onboarded (i.e. as long as onboarded__p < household__p)
# Rows where onboarded__p is missing are treated as invalid as well.
#TODO: update meta_df?
df_prop.loc[df_prop['onboarded__p'].isna() | (df_prop['onboarded__p'] < df_prop['household__p']), 'occupancy__p'] = np.nan
# household__p and onboarded__p were only needed for the check above, so both columns are dropped in place
df_prop.drop(columns=['household__p', 'onboarded__p'], inplace=True)

In [None]:
df_prop['occupancy__p'].describe().T

In [None]:
df_prop['occupancy__p'].groupby('id').describe().T

## After preprocessing, before temporal interpolation

In [None]:
meta_df

In [None]:
meta_df.to_excel('rhc_preprocessing.xlsx')

In [None]:
df_prop.size

In [None]:
print(f"df_prop.count().sum(): {df_prop.count().sum():_}")

In [None]:
df_prop.info()

# Temporal interpolation of properties


In [None]:
%%time 

# Define the properties and their respective limit__min values
# All values are expressed in minutes. The dict is passed as `property_limits` to the Preprocessor
# calls further down, next to a `default_limit__min`.
# NOTE(review): presumably properties that are not listed here use default_limit__min — confirm in Preprocessor.
property_limits = {
    # boiler status flags and other boiler measurements: 5 minutes
    'boiler_status_blocking_mode__bool': 5,
    'boiler_status_burner_start__bool': 5,
    'boiler_status_burner_stop__bool': 5,
    'boiler_status_burning_ch__bool': 5,
    'boiler_status_burning_dhw__bool': 5,
    'boiler_status_controlled_stop__bool': 5,
    'boiler_status_de_air__bool': 5,
    'boiler_status_heat_demand__bool': 5,
    'boiler_status_locking_mode__bool': 5,
    'boiler_status_pump_post_run__bool': 5,
    'boiler_status_standby__bool': 5,
    'dhw_flow__l_min_1': 5,
    'dhw_temp__degC': 5,
    'gas_valve_closed__bool': 5,
    'gas_valve_open__bool': 5,
    'temp_flow__degC': 5,
    'temp_ret__degC': 5,
    'fan_rotations__min_1': 5,
    # NOTE(review): a limit of 0 presumably means these CH-only temperatures are not gap-filled — confirm
    'temp_flow_ch__degC': 0,
    'temp_ret_ch__degC': 0,
    'batch_import_KNMI_temp_outdoor__degC': 4 * 60, # 4 hours
    'batch_import_KNMI_sol_ghi__W_m_2': 4 * 60, # 4 hours
    'batch_import_KNMI_wind__m_s_1': 4 * 60, # 4 hours
    'batch_import_KNMI_air_outdoor__Pa': 4 * 60, # 4 hours
    'batch_import_KNMI_air_outdoor_rel_humidity__0': 4 * 60, # 4 hours
    'temp_flow_ch_max__degC': 2 * 24 * 60,  # 2 days in minutes
    'power_ch_max__kW': 2 * 24 * 60,     # 2 days in minutes
    'batch_import_EDSN_actual_gas_std_hhv__J_m_3': 2 * 24 * 60,  # 2 days in minutes
    'ch_set_fan_rotations_max__min_1': 2 * 24 * 60,  # 2 days in minutes
    'ch_set_fan_rotations_min__min_1': 2 * 24 * 60,   # 2 days in minutes
    'e_ret_monthly_hi_cum__kWh': 32 * 24 * 60,   # 32 days in minutes
    'e_ret_monthly_lo_cum__kWh': 32 * 24 * 60,   # 32 days in minutes
    'e_use_monthly_hi_cum__kWh': 32 * 24 * 60,   # 32 days in minutes
    'e_use_monthly_lo_cum__kWh': 32* 24 * 60,   # 32 days in minutes
    'g_use_monthly_cum__m3': 32 * 24 * 60,   # 32 days in minutes
}

In [None]:
# properly sort and index dataframe to prevent performance warnings
df_prop = df_prop.sort_index()

In [None]:
df_prop.info()

In [None]:
print(f"df_prop.count().sum(): {df_prop.count().sum():_}")

In [None]:
# Create overview of non-null counts per column, source_category, and source_type
df_prop.groupby(['source_category', 'source_type'], observed=True).apply(lambda x: x.notna().sum()).reset_index().T

## Inspect measurement intervals

In [None]:
%autoreload 2
df_intervals = Preprocessor.analyze_intervals(df_prop, default_limit__min=90, property_limits=property_limits, interpolate__min=5)

In [None]:
def mode(series):
    """Return the most frequent value of ``series``, or ``None`` if there is none.

    ``Series.mode()`` returns every modal value in sorted order, so on ties the
    first (smallest) one is returned. When the series has no non-null values the
    result of ``Series.mode()`` is empty and ``None`` is returned instead.
    """
    modes = series.mode()  # evaluate once; it was previously computed twice
    return None if modes.empty else modes.iloc[0]

In [None]:
# Group by 'id', 'source_category', and 'source_type' and calculate descriptive statistics
# (df_intervals.groupby(['id', 'source_category', 'source_type'], observed=True)
(df_intervals.groupby(['source_category', 'source_type'], observed=True)
 .agg({'modal_intv__min': ['mean', 'std', 'min', 'max', mode],
    'limit__min': ['mean', 'std', 'min', 'max', mode],
    'upsample__min': ['mean', 'std', 'min', 'max', mode],
    'interpolate__min': ['mean', 'std', 'min', 'max', mode],
    'limit': ['mean', 'std', 'min', 'max', mode]
})).T

In [None]:
# Slice df_intervals to show only rows where source_category is 'batch_import'
df_intervals.loc[(slice(None), 'batch_import', slice(None)), :].sort_values(by='upsample__min', ascending=False)


In [None]:
df_intervals.loc[(slice(None), 'batch_import', slice(None), 'temp_outdoor__degC'), :]

In [None]:
df_intervals.loc[(slice(None), 'cloud_feed', slice(None), 'g_use_cum__m3'), :]

In [None]:
df_intervals.loc[(slice(None), 'device', slice(None), 'g_use_cum__m3'), :].sort_values(by='modal_intv__min', ascending=False)

In [None]:
# Build a Series that maps each modal interval (minutes) to the list of ids having that
# modal interval for the smart meter gas reading.
id_lists = (df_intervals
 # select rows for 'device' / 'g_use_cum__m3' on the 2nd and 4th index levels
 # (presumably source_category and property; any value on the other levels) -- TODO confirm
 .loc[(slice(None), 'device', slice(None), 'g_use_cum__m3'), :]
 .sort_values(by='modal_intv__min', ascending=False)
 # turn the index levels (including 'id') into regular columns so 'id' can be collected
 .reset_index()
 .groupby('modal_intv__min')['id']
 .apply(list))

# Print the full content of the Series: one line per modal interval, with the number of
# ids and the ids themselves in ascending order
for key, value in id_lists.items():
    print(f"modal interval (minutes): {key}, {len(value)} ids: {sorted(value)}")

The IDs with a modal interval of 60 minutes correspond neatly to the IDs that have a smart meter that adheres to DSMR 4.2, which only supports a gas meter reading once per hour.

## Perform temporal interpolation

In [None]:
%%time
%autoreload 2

# Perform interpolation
df_interpolated = Preprocessor.interpolate_time(df_prop=df_prop,
                                                default_limit__min=90,
                                                property_limits=property_limits,
                                                interpolate__min=interpolate__min,
                                                restore_original_types=True,
                                                inplace=False)

In [None]:
df_interpolated

In [None]:
print(f"df_interpolated.count().sum(): {df_interpolated.count().sum():_}")

In [None]:
df_interpolated.info()

In [None]:
df_interpolated.index.dtypes

In [None]:
# Create overview of non-null counts per column, source_category, and source_type
df_interpolated.groupby(['source_category', 'source_type'], observed=True).apply(lambda x: x.notna().sum()).reset_index().T

In [None]:
df_prop.describe().T

In [None]:
df_interpolated.describe().T.sort_index()

In [None]:
df_bools_to_float = df_interpolated[[col for col in df_interpolated.columns if col.endswith('__bool')]].copy()
for col in df_bools_to_float.columns:
    df_bools_to_float[col] = df_bools_to_float[col].astype('float')
df_bools_to_float.describe().T.drop(columns='count').style.format("{:.2%}")

# Inspect number of non-null measurements and time covered after interpolation

In [None]:
count_non_null_after_interpolation = Preprocessor.count_non_null_measurements(df_interpolated).sort_index(axis=1).sort_values(by='total', ascending=False)

In [None]:
count_non_null_after_interpolation.T.style.map(Preprocessor.highlight_zero)

In [None]:
mandatory_props=['living_room_calibrated_temp_indoor__degC', 'p1-reader_g_use_cum__m3']
filter_props = mandatory_props + ['all_mandatory_props']

In [None]:
covered_time_after_interpolation = Preprocessor.calculate_covered_time(df_interpolated, mandatory_props=mandatory_props).sort_values(by='all_mandatory_props', ascending=False)[filter_props]

In [None]:
covered_time_after_interpolation.T.style.map(Preprocessor.highlight_zero).format(lambda x: f'{x.days} d')

In [None]:
covered_time_after_interpolation.to_excel('rhc_covered_time.xlsx')

In [None]:
%%time
covered_time_after_interpolation = Preprocessor.calculate_covered_time(df_interpolated).sort_index(axis=1).sort_values(by='total', ascending=False)

In [None]:
covered_time_after_interpolation.T.style.map(Preprocessor.highlight_zero).format(lambda x: f'{x.days} d')

# Converting raw properties dataframe to preprocessed dataframe


In [None]:
%%time
%autoreload 2
df_prep = Preprocessor.unstack_source_cat_and_type(df_interpolated)

In [None]:
df_prep.info()

In [None]:
df_prep.index.dtypes

In [None]:
df_prep

# Remove first few minutes of heat distribution flow and return data

After temporal interpolation, the first few minutes of flow and return temperatures may not be representative of the flow and return temperature of the heat distribution system. Some heat of a DHW tapping may still be influencing the sensors for the flow and return temperature.

In [None]:
filtered_flow_and_return_cols = {'batch_import_remeha_temp_flow_ch__degC', 'batch_import_remeha_temp_ret_ch__degC'}

In [None]:
valid_before = df_prep[list(filtered_flow_and_return_cols)].notna().sum().sum()
print(f"Number of valid flow and return temps before filtering: {valid_before}")

In [None]:
df_prep[['batch_import_remeha_temp_flow_ch__degC', 'batch_import_remeha_temp_ret_ch__degC']].describe()

In [None]:
# Define remove_first__min (number of initial minutes to exclude per streak)
%autoreload 2
min_streak_length__min = 10
remove_first__min = 4 
remove_last__min = 0 

In [None]:
clipped_flow_and_return_cols = {'filtered_batch_import_remeha_temp_flow_ch__degC', 'filtered_batch_import_remeha_temp_ret_ch__degC'}

In [None]:
%%time
%autoreload 2
# Pass the source columns and the names of the clipped target columns as sorted lists;
# sorted() accepts the sets directly and yields a deterministic order for both
# (presumably the two lists are paired positionally -- verify against Preprocessor).
df_prep = Preprocessor.add_clipped_flow_return_temps(
    df_prep,
    sorted(filtered_flow_and_return_cols),
    sorted(clipped_flow_and_return_cols),
    min_streak_length__min,
    remove_first__min,
    remove_last__min,
)

In [None]:
df_prep[list(clipped_flow_and_return_cols)].describe()

## Interpolate flow and return temperatures across gaps that are not too large

In [None]:
# gap_max_duration__min = 60  # in minutes
gap_max_duration__min = 5  # in minutes

In [None]:
interpolated_flow_and_return_cols = {f"interpolated_{col}" for col in filtered_flow_and_return_cols}

In [None]:
interpolated_flow_and_return_cols

In [None]:
%%time
%autoreload 2
# Source columns are the clipped ones, target columns the 'interpolated_' ones; both sets
# are turned into sorted lists so their order is deterministic
# (presumably the two lists are paired positionally -- verify against Preprocessor).
df_prep = Preprocessor.interpolate_with_gap_limit(
    df_prep,
    sorted(clipped_flow_and_return_cols),
    sorted(interpolated_flow_and_return_cols),
    gap_max_duration__min,
)

In [None]:
df_prep[list(interpolated_flow_and_return_cols)].describe()

In [None]:
valid_after = df_prep[list(clipped_flow_and_return_cols)].notna().sum().sum()
print(f"Removed {valid_before - valid_after} values not representative")
print(f"Number of valid flow and return temps after filtering: {valid_after}")

In [None]:
# Count the valid (non-null) values in the gap-interpolated columns and report how many
# values were added compared to the count stored in valid_after.
valid_interpolated_after = df_prep[list(interpolated_flow_and_return_cols)].notna().sum().sum()
print(f"Added {valid_interpolated_after - valid_after} values during interpolation")
print(f"Number of valid flow and return temps after filtering and interpolation: {valid_interpolated_after}")

## Calculate heat distribution system temperature


In [None]:
# Heat distribution system temperature: arithmetic mean of the flow and return temperature.
# NOTE(review): this uses the raw 'batch_import_remeha_temp_*_ch__degC' columns, not the
# 'filtered_' or 'interpolated_' variants -- confirm that this is intended.
df_prep['temp_dstr__degC'] = (df_prep['batch_import_remeha_temp_flow_ch__degC'] + df_prep['batch_import_remeha_temp_ret_ch__degC']) / 2

# Convert cumulative smart meter values to average power

In [None]:
# Cumulative meter columns: every column whose name ends in '_cum__kWh' or '_cum__m3', sorted by name
meter_props = sorted(name for name in df_prep.columns if name.endswith(('_cum__kWh', '_cum__m3')))

In [None]:
%%time
%autoreload 2
df_prep = Preprocessor.convert_cumulative_to_avg_power(df_prep,
                                                       props=meter_props,
                                                       heating_value__MJ_m_3=(gas_groningen_nl_avg_std_hhv__J_m_3 / 1e6),
                                                       heating_value_name__str='hhv'
                                                      )

In [None]:
print(df_prep.describe().T.to_string())

In [None]:
list(df_prep.columns)

In [None]:
power_props = [
    'device_p1-reader_e_ret_hi__W',
    'device_p1-reader_e_ret_lo__W',
    'device_p1-reader_e_use_hi__W',
    'device_p1-reader_e_use_lo__W'
]

In [None]:
df_prep[power_props].groupby('id').count().T

In [None]:
df_prep[power_props].groupby('id').describe().T

In [None]:
# Set return power to zero when NaN, but only if both usage value powers are not NaN
mask_use_not_nan = df_prep['device_p1-reader_e_use_hi__W'].notna() & df_prep['device_p1-reader_e_use_lo__W'].notna()

df_prep.loc[mask_use_not_nan, ['device_p1-reader_e_ret_hi__W', 'device_p1-reader_e_ret_lo__W']] = \
    df_prep.loc[mask_use_not_nan, ['device_p1-reader_e_ret_hi__W', 'device_p1-reader_e_ret_lo__W']].fillna(0)


In [None]:
df_prep[power_props].groupby('id').count().T

In [None]:
df_prep[power_props].groupby('id').describe().T

# Calculate additional properties 
We may have to move some of these calculations to inside the GEKKO Python model code (e.g. for the what-if scenario simulation)

### Calculating electricity data

In [None]:
%%time
df_prep['device_p1-reader_e_use__W'] = df_prep['device_p1-reader_e_use_hi__W'] + df_prep['device_p1-reader_e_use_lo__W']
df_prep['device_p1-reader_e_ret__W'] = df_prep['device_p1-reader_e_ret_hi__W'] + df_prep['device_p1-reader_e_ret_lo__W'] 
df_prep['device_p1-reader_e__W'] = df_prep['device_p1-reader_e_use__W'] - df_prep['device_p1-reader_e_ret__W'] 


### Reading and calculating boiler data 

#### Reading home metadata

In [None]:
%%time
# Attempt to read the Parquet file
try:
    df_homes = pd.read_parquet(
        home_data_file_path, 
        engine='pyarrow',
        dtype_backend='numpy_nullable'
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

#### Merge home data into df_prep

In [None]:
# Merge df_prep with df_homes to get the brand_model for each id
# The index is reset so 'id' can serve as merge key, and restored to ['id', 'timestamp'] afterwards.
# NOTE(review): merge() defaults to how='inner', so rows whose id does not occur in df_homes
# are dropped from df_prep -- confirm that this is intended.
df_prep = df_prep.reset_index().merge(df_homes, on='id').set_index(['id', 'timestamp'])

## Calculate gas input power

### Calculate conversion factors for calorific value, temperature and pressure

In [None]:
# use the actual higher heating value
df_prep.loc[:,'gas_std_hhv__J_m_3'] = df_prep['batch_import_EDSN_actual_gas_std_hhv__J_m_3']

# alternatively, use average of actual higher heating value
# df_prep.loc[:,'gas_std_hhv__J_m_3'] = df_prep[df_prep['batch_import_remeha_boiler_status_burning_ch__bool'] == True]['batch_import_EDSN_actual_gas_std_hhv__J_m_3'].mean()

# alternative, simpler solution: assume higher heating value of groningen gas
# df_prep.loc[:,'gas_std_hhv__J_m_3'] = gas_groningen_nl_avg_std_hhv__J_m_3

# Calorific value conversion factor from G25.3 gas to the actual gas used 
df_prep.loc[:,'gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0'] = df_prep['gas_std_hhv__J_m_3'] / gas_g25_3_ref_lhv__J_m_3

# Pressure conversion factor from reference pressure (1 atm) to actual pressure (KNMI), including constant overpressure
df_prep.loc[:,'gas_pressure_factor_ref_to_actual__J0'] = (
    (df_prep['batch_import_KNMI_air_outdoor__Pa'] + overpressure_gas_nl_avg__Pa) 
    / 
    (P_std__Pa + overpressure_gas_nl_avg__Pa)
)

# Temperature conversion factor from reference temperature to actual temperature (of which we only know an average value based on ACM)
df_prep.loc[:,'gas_temp_factor_ref_to_actual__J0'] = temp_gas_ref__K / temp_gas_nl_avg__K

In [None]:
# Calorific value conversion factor from Groningen gas to the actual gas used
df_prep.loc[:,'gas_calorific_factor_groningen_hhv_to_actual_hhv__J0'] = df_prep['gas_std_hhv__J_m_3'] / gas_groningen_nl_avg_std_hhv__J_m_3

# Pressure conversion factor to correct pressure conversion by smart meter (assumed: P_nl_avg__Pa) using actual pressure (KNMI), including constant overpressure
df_prep.loc[:,'gas_pressure_factor_correct_smart_meter_to_actual__J0'] = (
    (P_nl_avg__Pa + overpressure_gas_nl_avg__Pa)
    /
    (df_prep['batch_import_KNMI_air_outdoor__Pa'] + overpressure_gas_nl_avg__Pa)
)


In [None]:
df_prep[[
    'batch_import_EDSN_actual_gas_std_hhv__J_m_3',
    'gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0',
    'gas_calorific_factor_groningen_hhv_to_actual_hhv__J0',
    'gas_temp_factor_ref_to_actual__J0',
    'gas_pressure_factor_ref_to_actual__J0',
    'gas_pressure_factor_correct_smart_meter_to_actual__J0',
]].describe().T

### Calculate gas input power based on boiler fan speed

In [None]:
%%time                                         
# Create value for boiler load percentage (how far along the current fan RPM is between the min and max)
df_prep.loc[:, 'fan_speed__pct'] = (
    (df_prep['batch_import_remeha_fan_rotations__min_1'] - df_prep['fan_min_ch_rotations__min_1'])
    /
    (df_prep['fan_max_ch_rotations__min_1'] - df_prep['fan_min_ch_rotations__min_1'])
).astype('Float32').clip(lower=0.0, upper=1.0) * 100  # Convert to percentage

# Calculate input power of G25.3 gas as a linear interpolation between Qnh_min_lhv__kW and Qnh_max_lhv__kW based on fan_speed__pct
df_prep.loc[:, 'g25_3_use_fan_lhv__W'] = (
    df_prep['batch_import_remeha_gas_valve_open__bool']   #only include the gas input if the valve was open
    * 1e3                                                 # convert from kW to W 
    *
    (df_prep['Qnh_min_lhv__kW'] 
     + 
     df_prep['fan_speed__pct'] / 100 * (df_prep['Qnh_max_lhv__kW'] - df_prep['Qnh_min_lhv__kW'])
    )
).astype('Float32')

# Calculate the (unrounded) load percentage, as a value between the minimum and maximum load. 
df_prep.loc[:, 'g_use_fan_load__pct'] = (
    df_prep['g25_3_use_fan_lhv__W'] 
    /
    (df_prep['Qnh_max_lhv__kW'] * 1e3)
).astype('Float32').clip(lower=0.0, upper=1.0) * 100

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'fan_speed__pct'
df_prep[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'g_use_fan_load__pct'
df_prep[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%%time                                         
# estimate gas input power at actual pressure and temperature 
df_prep.loc[:,'g_use_fan_hhv__W'] = (
    df_prep['g25_3_use_fan_lhv__W']
    *
    df_prep['gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0']
    *
    df_prep['gas_pressure_factor_ref_to_actual__J0'] 
    *
    df_prep['gas_temp_factor_ref_to_actual__J0']
)

In [None]:
df_prep.loc[:,'g_use_fan_ch_hhv__W'] = df_prep['g_use_fan_hhv__W'] * df_prep['batch_import_remeha_boiler_status_burning_ch__bool']
df_prep.loc[:,'g_use_fan_dhw_hhv__W'] = df_prep['g_use_fan_hhv__W'] * df_prep['batch_import_remeha_boiler_status_burning_dhw__bool']

### Calculate gas input power based on Remeha energy counters

In [None]:
%%time                                         
# estimate gas input power at actual pressure and temperature 
df_prep.loc[:,'g_use_boilercounter_dhw_hhv__W'] = (
    df_prep['batch_import_remeha_g_use_dhw_lhv__W']
    *
    df_prep['gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0']
    *
    df_prep['gas_pressure_factor_ref_to_actual__J0'] 
    *
    df_prep['gas_temp_factor_ref_to_actual__J0']
)

df_prep.loc[:,'g_use_boilercounter_ch_hhv__W'] = (
    df_prep['batch_import_remeha_g_use_ch_lhv__W']
    *
    df_prep['gas_calorific_factor_g25_3_lhv_to_actual_hhv__J0']
    *
    df_prep['gas_pressure_factor_ref_to_actual__J0'] 
    *
    df_prep['gas_temp_factor_ref_to_actual__J0']
)

df_prep.loc[:,'g_use_boilercounter_hhv__W'] = (
    df_prep['g_use_boilercounter_ch_hhv__W'] 
    + 
    df_prep['g_use_boilercounter_dhw_hhv__W']
)

In [None]:
df_bools_to_float = df_prep[[col for col in df_prep.columns if col.endswith('__bool')]].copy()
for col in df_bools_to_float.columns:
    df_bools_to_float[col] = df_bools_to_float[col].astype('float')
df_bools_to_float.describe().T.drop(columns='count').style.format("{:.2%}")

### Calculate actual gas input power for all purposes based on smart meter data

In [None]:
%%time
# Smart meters measure and correct for temperature; correction for actual air pressure and actual calorific value is not yet done, so we do it here.

df_prep.loc[:,'g_use_p1_hhv__W'] = (
    df_prep['device_p1-reader_g_use_hhv__W']
    * df_prep['gas_calorific_factor_groningen_hhv_to_actual_hhv__J0']     # calorific conversion factor
    * df_prep['gas_pressure_factor_correct_smart_meter_to_actual__J0']    # pressure conversion factor
)

## Inspect filtered and interpolated return temperatures

In [None]:
units_to_mathtext = {
    'degC' : r'$°C$',
    'ppm' : r'$ppm$',
    '0' : r'$[-]$',
    'bool': r'$0:False 1:True$',
    'p' : r'$persons$',
    'W' : r'$W$',
    'W_m_2' : r'$W/m^{2}$',
    'm_s_1' : r'$m/s$',
    'W0' : r'$W^{0}$',
    'min_1' : r'$/min$',
    'l_min_1' : r'$L/min$',
    'dm3_s_1' : r'$dm^{3}/s$',
    'pct': '%',
}

In [None]:
#mask to filter a test day for flow and return temp filtering and gap bridging
returntemp_mask = (
    # (df_prep.index.get_level_values('id') == 483173)
    (df_prep.index.get_level_values('id') == 403603)
    & 
    (df_prep.index.get_level_values('timestamp') >= pd.to_datetime('2024-02-23 00:00:00+01:00'))
    & 
    (df_prep.index.get_level_values('timestamp') < pd.to_datetime('2024-02-26 00:00:00+01:00'))
)

In [None]:
replace_boolprops = {'batch_import_remeha_boiler_status_burning_ch__bool',
                     'batch_import_remeha_boiler_status_pump_post_run__bool', 
                     'batch_import_remeha_boiler_status_burning_dhw__bool',
                    }

In [None]:
%%time
for prop in replace_boolprops:
    df_prep[f"{prop}01"] = df_prep[prop].astype('Int8')

In [None]:
replaced_boolprops = {f"{prop}01" for prop in replace_boolprops}

In [None]:
flow_and_return_cols = {
    'batch_import_remeha_temp_flow__degC',
    'batch_import_remeha_temp_ret__degC',
}

In [None]:
base_cols = {
    # 'g_use_fan_ch_hhv__W',
    # 'g_use_fan_dhw_hhv__W',
    # 'batch_import_remeha_dhw_flow__l_min_1',
    'batch_import_remeha_flow_dstr_pump_speed__pct',
    'g_use_fan_load__pct',
    # 'fan_speed__pct',
} | replaced_boolprops

In [None]:
Plot.dataframe_preprocessed_plot(df_prep[returntemp_mask][list(base_cols | flow_and_return_cols)], units_to_mathtext)

In [None]:
Plot.dataframe_preprocessed_plot(df_prep[returntemp_mask][list(base_cols | filtered_flow_and_return_cols)], units_to_mathtext)

In [None]:
Plot.dataframe_preprocessed_plot(df_prep[returntemp_mask][list(base_cols | clipped_flow_and_return_cols)], units_to_mathtext)

In [None]:
Plot.dataframe_preprocessed_plot(df_prep[returntemp_mask][list(base_cols | interpolated_flow_and_return_cols)], units_to_mathtext)

# Writing preprocessed interpolated properties to a parquet file

In [None]:
df_prep.info()

In [None]:
df_prep.index.dtypes

In [None]:
%%time 
df_prep.to_parquet(rhc_preprocessed_poperties_file, index=True, engine='pyarrow')

# Missing value overview

In [None]:
print(f"df_prep.count().sum(): {df_prep.count().sum():_}")

In [None]:
list(sorted(df_prep.columns))

In [None]:
mandatory_sourceprops =  ['batch_import_KNMI_temp_outdoor__degC', 
                          'batch_import_KNMI_sol_ghi__W_m_2',
                          'batch_import_KNMI_wind__m_s_1',
                          'batch_import_KNMI_air_outdoor__Pa',
                          'batch_import_remeha_temp_indoor__degC',
                          'batch_import_remeha_temp_ret__degC',
                          'batch_import_remeha_temp_flow__degC',
                          'batch_import_remeha_fan_rotations__min_1',
                          'batch_import_EDSN_actual_gas_std_hhv__J_m_3',
                          # 'device_p1-reader_g_use_cum__m3'
                         ]

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=mandatory_sourceprops, freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep,
                                properties_include=['batch_import_KNMI_temp_outdoor__degC',
                                                    'batch_import_KNMI_sol_ghi__W_m_2',
                                                    'batch_import_KNMI_wind__m_s_1',
                                                    'batch_import_KNMI_air_outdoor__Pa',
                                                    'batch_import_KNMI_air_outdoor_rel_humidity__0',                                                   ],
                                freq='1W', 
                                title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['device_p1-reader_g_use_cum__m3'], freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['batch_import_remeha_temp_outdoor__degC'], freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['batch_import_KNMI_temp_outdoor__degC',
                                                             'batch_import_KNMI_sol_ghi__W_m_2',
                                                             'batch_import_KNMI_wind__m_s_1',
                                                             'batch_import_remeha_temp_indoor__degC',
                                                             'batch_import_remeha_temp_ret__degC',
                                                             'batch_import_remeha_temp_flow__degC',
                                                             # 'batch_import_remeha_g_use_ch_lhv__W',
                                                             'device_p1-reader_e_use_hi_cum__kWh', 
                                                             'device_p1-reader_e_use_lo_cum__kWh',
                                                             'device_living_room_co2_indoor__ppm',
                                                             'device_living_room_occupancy__p', 
                                                             'device_living_room_rel_humidity__0', 
                                                             'device_living_room_temp_indoor__degC',
                                                             'device_p1-reader_g_use_cum__m3' 
                                                            ], 
                                freq='1W', title_fontsize=6)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['device_p1-reader_e_use_hi_cum__kWh', 
                                                             'device_p1-reader_e_use_lo_cum__kWh',
                                                             'device_living_room_co2_indoor__ppm',
                                                             'device_living_room_occupancy__p', 
                                                             'device_living_room_rel_humidity__0', 
                                                             'device_living_room_temp_indoor__degC',
                                                             'device_p1-reader_g_use_cum__m3' 
                                                            ], 
                                freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['batch_import_remeha_temp_indoor__degC'], freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['device_living_room_co2_indoor__ppm',
                                                             'device_living_room_occupancy__p', 
                                                             'device_living_room_rel_humidity__0', 
                                                             'device_living_room_temp_indoor__degC'
                                                            ], freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['device_living_room_calibrated_temp_indoor__degC'], freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['batch_import_remeha_temp_indoor__degC', 'device_living_room_calibrated_temp_indoor__degC'], freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['cloud_feed_enelogic_g_use_cum__m3'], freq='1W', title_fontsize=8)

In [None]:
Plot.plot_missing_data_overview(df_prep, properties_include=['batch_import_enelogic_g_use_monthly_cum__m3'], freq='1M', title_fontsize=8)

## Alternative plot: using colored axes


In [None]:
%%time
Plot.plot_data_availability(df_prep, properties_include=mandatory_sourceprops, alpha=0.5, figsize=(12, 12), title_fontsize=8)


In [None]:
%%time
Plot.plot_data_availability(df_prep, properties_include=['cloud_feed_enelogic_g_use_cum__m3'], alpha=0.5, figsize=(12, 12), title_fontsize=8)


# Histograms of streak length


In [None]:
df_prep.info()

In [None]:
Preprocessor.calculate_streak_durations(df_prep, mandatory_sourceprops)

In [None]:
Preprocessor.calculate_streak_durations(df_prep, ['cloud_feed_enelogic_g_use_cum__m3'])

In [None]:
Preprocessor.calculate_streak_durations(df_prep, ['device_p1-reader_g_use_cum__m3'])

In [None]:
Preprocessor.calculate_streak_durations(df_prep, ['device_living_room_co2_indoor__ppm'])

In [None]:
Preprocessor.calculate_streak_durations(df_prep, ['batch_import_remeha_temp_ret_ch__degC'])

# Plotting results: time series before and after preprocessing

In [None]:
# %%time
# #Plot all properties with a single unit for a single id
# Plot.dataframe_properties_plot(df_prop.loc[[401632]][[prop for prop in df_prop.columns.values if prop.split('__')[-1] == 'm3']], units_to_mathtext)

In [None]:
# %%time
# #Plot all preprocessed properties with a single unit for a single id
# Plot.dataframe_preprocessed_plot(df_prep.loc[[401632]][[prop for prop in df_prep.columns.values if prop.split('__')[-1] == 'degC']], units_to_mathtext)

In [None]:
# TO DO: add prop_ and prep_ in a `prop-prep` column; merge into single dataframe and unstack, thus allowing for close inspection of preprocessed data

# Other examples: temp_indoor__degC, minmax filtering and using the static outlier filter, per id
Filtering out extreme temperatures based on mean and standard deviation per room

In [None]:
prop = 'temp_indoor__degC'

In [None]:
%matplotlib inline
%matplotlib widget

df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)
