# Preprocessing REDUCEDHEATCARB data

In [None]:
import pandas as pd
import numpy as np
import pylab as plt

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

import sys
sys.path.append('../data/')
sys.path.append('../view/')

rhc_raw_properties_file='rhc_raw_props.parquet'
rhc_preprocessed_poperties_file='rhc_prep_props.parquet'
%load_ext autoreload

%matplotlib inline
%matplotlib widget

from preprocessor import Preprocessor
from plotter import Plot

meta_df = None


### Load Measured Data Properties from parquet file

In [None]:
%%time
# Prerequisite: for this example to work, you need to have the raw properties parquet file (see rhc_raw_properties_file), located in the ../data/ folder.
# One way to get this is to run NeedForHeatExtractionBackup.ipynb, REDUCEDHEATCARB_data_merge.ipynb and REDUCEDHEATCARB_sanity_check.ipynb first,
# but then you have to run this code on the energietransitiewindesheim.nl server

# Attempt to read the Parquet file.
# A failed read is reported and then re-raised: swallowing it would leave df_prop
# undefined (NameError in the guard below on a fresh kernel) or silently keep a
# stale df_prop from an earlier run.
try:
    df_prop = pd.read_parquet(
        rhc_raw_properties_file,
        engine='pyarrow',
        use_nullable_dtypes=True
        )
except Exception as e:
    print(f"Error reading file: {e}")
    raise
else:
    print("File was successfully read without specifying compression codec.")

# Sorting the DataFrame index is needed to get good performance on certain filters;
# this guard checks whether the index is sorted and sorts it when it is not.
if not df_prop.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prop = df_prop.sort_index()

In [None]:
df_prop.index.unique(level='id').values

In [None]:
df_prop.index.unique(level='source_category').values

In [None]:
df_prop.index.unique(level='source_type').values

In [None]:
# Rename long source type names
rename_dict = {
    'twomes-co2-occupancy-scd41-m5coreink-firmware': 'living_room',
    'twomes-p1-reader-firmware': 'p1-reader'
}

df_prop = df_prop.rename(index=rename_dict, level='source_type')


In [None]:
df_prop.index.unique(level='source_type').values

In [None]:
df_prop

In [None]:
df_prop.count().sum()

In [None]:
df_prop.info()

## Inspecting and preprocessing properties

In [None]:
df_prop.describe().T

In [None]:
# Summarise the boolean properties: cast every '__bool' column to float so that
# describe() reports mean/std/quantiles, then render the statistics as percentages.
bool_columns = [name for name in df_prop.columns if name.endswith('__bool')]
df_bools_to_float = df_prop[bool_columns].astype('float')
df_bools_to_float.describe().T.drop(columns='count').style.format("{:.2%}")

### Preprocessing temp_out__degC

In [None]:
prop = 'temp_out__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=-28.0, max=40.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

### Preprocessing temp_in__degC

In [None]:
prop = 'temp_in__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=0.0, max=40.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].info()

In [None]:
df_prop[prop].groupby(level='id').count()

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[['temp_in__degC', 'temp_set__degC']].plot.hist(bins=200, alpha=0.5, title = 'temp_in__degC, temp_set__degC')

#### Preprocess indoor temperatures per source_type

In [None]:
df_prop[prop].groupby(level='source_type').describe()

In [None]:
%matplotlib inline
%matplotlib widget
# Collect the non-missing values of `prop` once per source_type
source_types = df_prop.index.get_level_values('source_type').unique()
values_per_source_type = {
    st: df_prop.xs(st, level='source_type')[prop].dropna()
    for st in source_types
}

# Only source types that actually have values for `prop` get a histogram
valid_source_types = [st for st in source_types if not values_per_source_type[st].empty]

# Overlay one normalised histogram per source type in a single figure
plt.figure(figsize=(10, 6))

for source_type in valid_source_types:
    df_filtered = values_per_source_type[source_type]
    plt.hist(df_filtered, bins=200, alpha=0.5, density=True, label=source_type)

plt.legend()
plt.show()

In [None]:
source_types

In [None]:
%autoreload 2 
df_calibration_factors = Preprocessor.compute_calibration_factors(df_prop, prop, 'living_room', 'remeha', 20)

In [None]:
df_calibration_factors

In [None]:
pd.DataFrame(df_calibration_factors.mean()).T

In [None]:
%%time
df_prop, metadata = Preprocessor.create_calibrated_property(df_prop, prop, 'living_room', 'remeha', 20)        


In [None]:
df_prop[prop].groupby(level='source_type').describe()

In [None]:
metadata

In [None]:
%matplotlib inline
%matplotlib widget
# Collect the non-missing values of `prop` once per source_type
source_types = df_prop.index.get_level_values('source_type').unique()
values_per_source_type = {
    st: df_prop.xs(st, level='source_type')[prop].dropna()
    for st in source_types
}

# Only source types that actually have values for `prop` get a histogram
valid_source_types = [st for st in source_types if not values_per_source_type[st].empty]

# Overlay one normalised histogram per source type in a single figure
plt.figure(figsize=(10, 6))

for source_type in valid_source_types:
    df_filtered = values_per_source_type[source_type]
    plt.hist(df_filtered, bins=200, alpha=0.5, density=True, label=source_type)

plt.legend()
plt.show()

### Preprocessing temp_sup__degC

In [None]:
prop = 'temp_sup__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=0.0, max=100.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop['meter_code__str'].unique()

In [None]:
df_prop['dsmr_version__0'].unique()

In [None]:
df_prop.groupby(['id', 'meter_code__str', 'dsmr_version__0']).size().reset_index(name='count').set_index(['id', 'meter_code__str', 'dsmr_version__0'])

### Preprocessing temp_ret__degC

In [None]:
prop = 'temp_ret__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=0.0, max=100.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[['temp_sup__degC', 'temp_ret__degC']].plot.hist(bins=200, alpha=0.5, title = 'temp_sup__degC, temp_ret__degC')

### Preprocessing dhw_temp_out__degC

In [None]:
prop = 'dhw_temp_out__degC'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=0.0, max=100.0, meta_df=meta_df)

In [None]:
meta_df

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

### Preprocessing dhw_flow__l_min_1

In [None]:
prop = 'dhw_flow__l_min_1'

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop[prop].info()

In [None]:
# Styler helper: mark cells whose value is exactly zero
def highlight_zero_std(s):
    """Return one CSS string per element of s: highlighted when the value equals 0.0.

    Missing values are never highlighted; all other elements get an empty string.
    """
    highlight_css = 'background-color: lightcoral; color: red;'
    # Exclude missing values explicitly so they never count as zero
    flagged = s.eq(0.0) & s.notna()
    return [highlight_css if hit else '' for hit in flagged]


In [None]:
stats_per_id = df_prop[prop].groupby(level='id').describe()

In [None]:
# Apply the custom styling to the DataFrame
styled_stats = stats_per_id.style.apply(
    highlight_zero_std, 
    subset=['std']
)

# Display the styled DataFrame
styled_stats

In [None]:
df_prop, meta_df = Preprocessor.filter_id_prop_with_std_zero(df_prop, prop, meta_df=meta_df)

In [None]:
meta_df

In [None]:
stats_per_id = df_prop[prop].groupby(level='id').describe()

In [None]:
# Apply the custom styling to the DataFrame
styled_stats = stats_per_id.style.apply(
    highlight_zero_std, 
    subset=['std']
)

# Display the styled DataFrame
styled_stats

In [None]:
df_prop[prop].describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop_filtered = df_prop[prop][df_prop[prop] != 0]

In [None]:
df_prop_filtered.info()

In [None]:
df_prop_filtered.describe().T

In [None]:
%matplotlib inline
%matplotlib widget
df_prop_filtered.plot.hist(bins=200, alpha=0.5, title = prop)

## Inspecting and preprocessing 'fixed' (?) properties


### Preprocessing temp_ch_sup_max__degC

In [None]:
df_prop['temp_ch_sup_max__degC'].groupby(level='id').describe()

### Inspecting power_ch_max__kW

In [None]:
prop = 'power_ch_max__kW'

In [None]:
df_prop[prop].groupby(level='id').describe()

## Inspecting and preprocessing smart meter values

#### Removing electricity meter values when dsmr_version__0 < 3.0

In [None]:
# Define columns for electricity meter values
use_meter_cols = ['e_use_hi_cum__kWh', 'e_use_lo_cum__kWh']
ret_meter_cols = ['e_ret_hi_cum__kWh', 'e_ret_lo_cum__kWh']
all_meter_cols = use_meter_cols + ret_meter_cols

# Apply mask to set use meter columns to NaN where dsmr_version__0 < 3.0 or values < 10 kWh
mask_version = df_prop['dsmr_version__0'] < 3.0
mask_use_values = df_prop[use_meter_cols] < 10

# Combine the masks for use meter columns: a row is masked when the version is too low
# or when ANY of the use meter columns is below 10
mask_use = mask_version | mask_use_values.any(axis=1)

# Both use meter columns are blanked on a masked row, even if only one of them is below 10
df_prop.loc[mask_use, use_meter_cols] = np.nan

# Identify ids whose maximum value is below 10 in ALL return meter columns
# (i.e. ids without any real return reading; not only ids whose maximum is exactly 0.0)
max_ret_values = df_prop.groupby('id')[ret_meter_cols].max()
ids_with_no_real_ret = max_ret_values[(max_ret_values < 10).all(axis=1)].index

In [None]:
ids_with_no_real_ret

In [None]:
# # Apply mask to set ret meter columns to NaN for identified ids and where values < 10 kWh
# mask_ret_ids = df_prop.index.get_level_values('id').isin(ids_with_max_zero_ret)
# mask_ret_values = df_prop[ret_meter_cols] < 10

# # Ensure masks have the same shape before combining them
# mask_ret_ids_reshaped = mask_ret_ids.values.reshape(-1, 1)

# # Combine masks for return meter columns
# mask_ret_combined = mask_ret_ids_reshaped & mask_ret_values

In [None]:
# Masks for filtering: split all rows by whether their id is in ids_with_no_real_ret
# (presumably homes without solar panels — naming taken from the mask names)
mask_with_solar = ~df_prop.index.get_level_values('id').isin(ids_with_no_real_ret)
mask_without_solar = df_prop.index.get_level_values('id').isin(ids_with_no_real_ret)

# Apply mask for use_meter_cols for ids not in ids_with_no_real_ret: keep only values >= 10
df_prop.loc[mask_with_solar, use_meter_cols] = df_prop.loc[mask_with_solar, use_meter_cols].where(lambda x: x >= 10)

# Apply mask for use_meter_cols for ids in ids_with_no_real_ret: keep only values >= 10
# NOTE(review): this is the same filter as for the other group, so together the two
# statements apply `>= 10` to every row — confirm no group-specific rule was intended.
df_prop.loc[mask_without_solar, use_meter_cols] = df_prop.loc[mask_without_solar, use_meter_cols].where(lambda x: x >= 10)

# Set dsmr_version__0 to NaN where dsmr_version__0 < 3.0
mask_version = df_prop['dsmr_version__0'] < 3.0
df_prop.loc[mask_version, 'dsmr_version__0'] = np.nan

In [None]:
# Additional filtering step for ret_meter_cols
# Apply mask for ret_meter_cols for ids with solar panels: keep only values >= 10
df_prop.loc[mask_with_solar, ret_meter_cols] = df_prop.loc[mask_with_solar, ret_meter_cols].where(lambda x: x >= 10)

# Apply mask for ret_meter_cols for ids without solar panels: keep values >= 10 and exact zeros,
# so that a return reading of 0 is kept as a valid value for these ids
df_prop.loc[mask_without_solar, ret_meter_cols] = df_prop.loc[mask_without_solar, ret_meter_cols].where(lambda x: (x >= 10) | (x == 0))


In [None]:
for prop in all_meter_cols:
    print('\n', prop, '\n', df_prop[prop].unique())

#### Inspecting and preprocessing dsmr_version__0

In [None]:
prop = 'dsmr_version__0'

In [None]:
df_prop[prop].unique()

In [None]:
df_prop[prop].describe()

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop[prop].groupby(level='id').describe()

In [None]:
# Styler helper: mark cells whose value is present and different from zero
def highlight_nonzero_std(s):
    """Return one CSS string per element of s: highlighted when the value is not 0.0.

    Missing values are never highlighted; all other elements get an empty string.
    """
    highlight_css = 'background-color: lightcoral; color: red;'
    # A missing value compares unequal to 0.0, so it has to be excluded explicitly
    flagged = s.ne(0.0) & s.notna()
    return [highlight_css if hit else '' for hit in flagged]


In [None]:
stats_per_id = df_prop[prop].groupby(level='id').describe()

In [None]:
# Apply the custom styling to the DataFrame
styled_stats = stats_per_id.style.apply(
    highlight_nonzero_std, 
    subset=['std']
)

# Display the styled DataFrame
styled_stats

#### Inspecting and preprocessing e_use_hi_cum__kWh

In [None]:
prop = 'e_use_hi_cum__kWh'

In [None]:
df_prop[prop].describe()

In [None]:
df_prop[prop].groupby(level='id').describe().sort_values(by='min', ascending=False)

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=400, alpha=0.5, title = prop)

#### Inspecting and preprocessing e_use_lo_cum__kWh

In [None]:
prop = 'e_use_lo_cum__kWh'

In [None]:
df_prop[prop].describe()

In [None]:
df_prop[prop].groupby(level='id').describe().sort_values(by='min', ascending=False)

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=400, alpha=0.5, title = prop)

#### Inspecting and preprocessing e_ret_hi_cum__kWh

In [None]:
prop = 'e_ret_hi_cum__kWh'

In [None]:
df_prop[prop].describe()

In [None]:
df_prop[prop].groupby(level='id').describe().sort_values(by='max', ascending=False)

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=400, alpha=0.5, title = prop)

#### Inspecting and preprocessing e_ret_lo_cum__kWh

In [None]:
prop = 'e_ret_lo_cum__kWh'

In [None]:
df_prop[prop].describe()

In [None]:
df_prop[prop].groupby(level='id').describe().sort_values(by='max', ascending=False)

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=400, alpha=0.5, title = prop)

#### Inspecting and preprocessing g_use_cum__m3

In [None]:
prop = 'g_use_cum__m3'

In [None]:
df_prop[prop].describe()

In [None]:
df_prop[prop].groupby(level='id').describe().sort_values(by='std', ascending=False)

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=400, alpha=0.5, title = prop)

#### Inspecting and preprocessing meter_code__str

In [None]:
prop = 'meter_code__str'

In [None]:
df_prop['meter_code__str'].unique()

In [None]:
df_prop.groupby(['id', 'meter_code__str', 'dsmr_version__0']).size().reset_index(name='count').set_index(['id', 'meter_code__str', 'dsmr_version__0'])

#### Analysing the relation between suspect values for dsmr_version__0 and electricity meter values

In [None]:
# %%time
# # Define columns for electricity meter values
# meter_cols = ['e_use_hi_cum__kWh', 'e_use_lo_cum__kWh', 'e_ret_hi_cum__kWh', 'e_ret_lo_cum__kWh']

# # Flag suspect readings (values < 10 kWh or negative)
# df_prop['suspect_reading'] = df_prop[meter_cols].apply(lambda x: any(v < 10 for v in x if not pd.isna(v)), axis=1)

# # Fill NaNs in suspect_reading as False (if there's no data, assume not suspect)
# df_prop['suspect_reading'].fillna(False, inplace=True)

# # Create a crosstab to analyze the distribution
# crosstab = pd.crosstab(df_prop['dsmr_version__0'], df_prop['suspect_reading'])

# # Calculate the percentage of suspect readings for each dsmr_version__0 value
# crosstab['percent_suspect'] = crosstab[True] / (crosstab[True] + crosstab[False]) * 100

# print(crosstab)

# del df_prop['suspect_reading']

## Inspecting total number of non-null measurements

In [None]:
# Define a function to apply the styling
def highlight_specific_value(val, specific_value=0):
    """Return the CSS for a single cell: red background when val equals specific_value.

    Parameters:
    - val: the cell value handed in by the element-wise Styler call.
    - specific_value: the value to highlight (default 0).

    Returns:
    - 'background-color: red' for a match, otherwise an empty string (no styling).
    """
    if val == specific_value:
        return 'background-color: red'
    # An empty string means "no style"; this avoids emitting a CSS declaration
    # with an empty value ('background-color: ') for every non-matching cell.
    return ''

# Count non-null values per column and per id
non_null_counts_per_col = df_prop.groupby(level='id').count()

# Sum across columns to get the total non-null values per id
non_null_counts_per_col['total_non_null'] = non_null_counts_per_col.sum(axis=1)

non_null_counts_per_col.sort_values(by='total_non_null', ascending=False).T.style.applymap(highlight_specific_value)



In [None]:
# Total time covered by the valid measurements of one property, ignoring long gaps
def calculate_covered_time(group, max_interval=90*60):
    """Sum the time between consecutive non-missing measurements, in seconds.

    Parameters:
    - group: Series whose index has a 'timestamp' level; missing values are dropped first.
    - max_interval: largest gap (in seconds) between two consecutive timestamps that still
      counts as covered time; larger gaps are left out of the sum.

    Returns:
    - the summed length of all gaps that are at most max_interval, in seconds.
    """
    timestamps = group.dropna().index.get_level_values('timestamp')
    gaps_in_seconds = timestamps.to_series().diff().dt.total_seconds()
    # The first gap is NaN (no predecessor) and drops out of the comparison
    return gaps_in_seconds[gaps_in_seconds <= max_interval].sum()
    

# Work on a de-duplicated copy (first occurrence of each index entry wins),
# sorted by 'id' and 'timestamp'
is_first_occurrence = ~df_prop.index.duplicated(keep='first')
df_analysis = df_prop.copy()[is_first_occurrence].sort_index(level=['id', 'timestamp'])

# Covered time per id (rows) and per property (columns), converted from seconds to days
covered_time = (
    df_analysis
    .groupby(level='id')
    .apply(lambda per_id: per_id.apply(calculate_covered_time, axis=0))
) / (24*60*60)

# Add a column for the total covered time per 'id'
covered_time['total'] = covered_time.sum(axis=1)

# Show properties as rows and ids as columns, ids ordered by total covered time
(
    covered_time
    .sort_values(by='total', ascending=False)
    .T
    .style
    .applymap(highlight_specific_value)
    .format(precision=1)
)

In [None]:
covered_time.sort_values(by='total', ascending=False).to_excel('covered_time_sorted.xlsx')

## Plotting data used in analysis

In [None]:
# Map the unit suffix of a property name (the part after the last '__')
# to the mathtext label used for that unit in plots
units_to_mathtext = {
    'ppm' : r'$ppm$',
    'kWh' : r'$kWh$',
    'm3' : r'$m^{3}$',
    'degC' : r'$°C$',
    'W' : r'$W$',
    'V' : r'$V$',
    '0' : r'$[-]$',
    'bool': r'$0 = False; 1 = True$',
    # W per square metre: '\cdot' must be separated from 'm' (otherwise mathtext sees
    # the unknown command '\cdotm') and the exponent is -2, matching the 'm_2' suffix
    'W_m_2' : r'$W\cdot m^{-2}$'
}

In [None]:
# %%time

# #Plot all properties with a single unit for a single id
# Plot.dataframe_properties_plot(df_prop.loc[[948634, 999169]][[prop for prop in df_prop.columns.values if prop.split('__')[-1] in['ppm', '0', 'bool']]], units_to_mathtext)

In [None]:
# props = [prop for prop in df_prop.columns.values if prop.split('__')[-1] in['ppm', '0', 'bool', 'p']]
props = ['co2__ppm', 'occupancy__p', 'onboarded__p']

In [None]:
props

In [None]:
df_prop[props].info()

In [None]:
df = df_prop[props] 

In [None]:
df.describe().T


In [None]:
stats = df.groupby(level=['source_category', 'source_type','id']).describe().transpose()
stats.columns = stats.columns.swaplevel(0,1)
stats = stats.sort_index(axis=1, level=0)
stats

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'co2__ppm'
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'ch_water_pump_speed__0'
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
%matplotlib inline
%matplotlib widget
prop = 'dhw_flow__l_min_1'
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

## Preprocessing co2__ppm
Filtering out measurement errors: values below 5 ppm (including readings of 0 ppm)

In [None]:
prop = 'co2__ppm'

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)

In [None]:
df_prop[prop].groupby(level=['id']).describe()

#### Filter out values below 5 ppm, these must be measurement errors

In [None]:
df_prop, meta_df = Preprocessor.filter_min_max(df_prop, prop, min=5, meta_df=meta_df)

In [None]:
meta_df

#### Check to see whether minimum is better now

In [None]:
df_prop[prop].groupby(level=['id']).describe()

In [None]:
%matplotlib inline
%matplotlib widget
df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)


## After preprocessing, before temporal interpolation

In [None]:
meta_df

In [None]:
meta_df.to_excel('rhc_preprocessing.xlsx')

In [None]:
df_prop.size

In [None]:
df_prop.count().sum()

In [None]:
df_prop.info()

# Interpolating  properties


In [None]:
%%time 
%autoreload 2

# Define the properties and their respective limit__min values (in minutes).
# 'default' applies to every property that is not listed explicitly.
property_limits = {
    'default': 90,
    'boiler_status_blocking_mode__bool': 5,
    'boiler_status_burner_start__bool': 5,
    'boiler_status_burner_stop__bool': 5,
    'boiler_status_burning_ch__bool': 5,
    'boiler_status_burning_dhw__bool': 5,
    'boiler_status_controlled_stop__bool': 5,
    'boiler_status_de_air__bool': 5,
    'boiler_status_heat_demand__bool': 5,
    'boiler_status_locking_mode__bool': 5,
    'boiler_status_pump_post_run__bool': 5,
    'boiler_status_standby__bool': 5,
    'dhw_flow__l_min_1': 5,
    'dhw_temp_out__degC': 5,
    'gas_valve_closed__bool': 5,
    'gas_valve_open__bool': 5,
    'temp_sup__degC': 5,
    'temp_ret__degC': 5,
    'fan_rotations__min_1': 5,
    # Key must match the column name 'temp_ch_sup_max__degC' as used elsewhere in this
    # notebook; the former spelling 'temp_ch_supmax__degC' could never match that column.
    'temp_ch_sup_max__degC': 2 * 24 * 60,  # 2 days in minutes
    'power_ch_max__kW': 2 * 24 * 60,     # 2 days in minutes
    'ch_set_fan_rotations_max__min_1': 2 * 24 * 60,  # 2 days in minutes
    'ch_set_fan_rotations_min__min_1': 2 * 24 * 60   # 2 days in minutes
}

In [None]:
# Perform interpolation
df_interpolated = Preprocessor.interpolate_time(df_prop=df_prop,
                                               property_limits =property_limits ,
                                               upsample__min=5,
                                               interpolate__min=15,
                                               restore_original_types=True,
                                               inplace=False)

In [None]:
df_prop.info()

In [None]:
df_interpolated.info()

In [None]:
df_prop.describe().T

In [None]:
df_interpolated.describe().T

In [None]:
# Summarise the boolean properties: cast every '__bool' column to float and show the
# describe() statistics as percentages.
# NOTE(review): this summarises df_prop, while the neighbouring cells compare df_prop with
# df_interpolated — confirm whether df_interpolated was intended here.
df_bools_to_float = df_prop[[col for col in df_prop.columns if col.endswith('__bool')]].copy()
for col in df_bools_to_float.columns:
    df_bools_to_float[col] = df_bools_to_float[col].astype('float')
df_bools_to_float.describe().T.drop(columns='count').style.format("{:.2%}")

In [None]:
df_interpolated

In [None]:
df_interpolated.info()

In [None]:
df_interpolated.count().sum()

## Writing preprocessed interpolated properties to a parquet file

In [None]:
%%time 
df_interpolated.to_parquet(rhc_preprocessed_poperties_file, index=True, engine='pyarrow')

## Missing value overview

In [None]:
import missingno as msno
import matplotlib.pyplot as plt

In [None]:
%%time 

# Read the preprocessed properties back from the parquet file.
# NOTE(review): a failed read is only printed, not re-raised; execution then continues with
# whatever df_interpolated is already in memory (or fails later if there is none).
try:
    df_interpolated = pd.read_parquet(
        rhc_preprocessed_poperties_file, 
        engine='pyarrow',
        use_nullable_dtypes=True
        )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")

In [None]:
# def plot_missing_data_overview(df, id_column, time_column, properties_include=None, properties_exclude=None):
#     """
#     Plots an overview of valid measurements over time for various IDs.
    
#     Parameters:
#     - df: DataFrame containing the data.
#     - id_column: The column name representing the IDs.
#     - time_column: The column name representing the timestamp.
#     - properties_include: List of properties to include for validation. If None, all properties are included.
#     - properties_exclude: List of properties to exclude for validation. If None, no properties are excluded.
#     """
#     # Ensure time_column is datetime type
#     df[time_column] = pd.to_datetime(df[time_column])
    
#     # If properties_include is specified, use it; otherwise, use all columns except id_column and time_column
#     if properties_include is not None:
#         properties = properties_include
#     else:
#         properties = df.columns.difference([id_column, time_column])
    
#     # Exclude specified properties if properties_exclude is provided
#     if properties_exclude is not None:
#         properties = properties.difference(properties_exclude)
    
#     # Filter the DataFrame to include only the relevant columns
#     df_filtered = df[[id_column, time_column] + list(properties)]
    
#     # Aggregate data to ensure no duplicate timestamps for any ID
#     df_aggregated = df_filtered.groupby([id_column, time_column]).first().reset_index()
    
#     # Set the time_column as index
#     df_aggregated = df_aggregated.set_index(time_column)
    
#     # Create a new DataFrame to track validity
#     df_validity = pd.DataFrame(index=df_aggregated.index)
    
#     # Create columns for each ID, marking True if all specified properties are non-null
#     for id_val in df_aggregated[id_column].unique():
#         df_id = df_aggregated[df_aggregated[id_column] == id_val]
#         df_validity[id_val] = df_id[properties].notnull().all(axis=1)
    
#     # Plot using missingno
#     msno.matrix(df_validity, sparkline=False)
#     plt.gca().invert_yaxis()  # Invert y-axis for time representation
#     plt.yticks(ticks=range(len(df_validity.index)), labels=df_validity.index.strftime('%Y-%m-%d %H:%M:%S'))
#     plt.title("Overview of Valid Measurements Over Time for Various IDs")
#     plt.xlabel("ID")
#     plt.ylabel("Time")
#     plt.show()


In [None]:
def plot_missing_data_overview(df, id_column, time_column, properties_include=None, properties_exclude=None):
    """
    Plots an overview of valid measurements over time.

    For every timestamp, the number of rows with a non-null value is counted per property.
    The counts are drawn as a missingno matrix in which a timestamp without any valid value
    for a property shows up as a gap.

    Parameters:
    - df: DataFrame containing the data, with id_column and time_column as regular columns.
      The DataFrame is not modified.
    - id_column: The column name representing the IDs. It is only used to keep that column
      out of the default property selection; rows of all IDs are combined per timestamp.
    - time_column: The column name representing the timestamp.
    - properties_include: List of properties to include for validation. If None, all columns
      except id_column and time_column are included.
    - properties_exclude: List of properties to exclude for validation. If None, no properties are excluded.
    """
    # Always work with a pandas Index, so that .difference() below also works when
    # properties_include is passed as a plain list
    if properties_include is not None:
        properties = pd.Index(properties_include)
    else:
        properties = df.columns.difference([id_column, time_column])

    # Exclude specified properties if properties_exclude is provided
    if properties_exclude is not None:
        properties = properties.difference(properties_exclude)

    # Copy only the relevant columns; the datetime conversion is done on the copy,
    # so the caller's DataFrame stays untouched
    df_filtered = df[[time_column] + list(properties)].copy()
    df_filtered[time_column] = pd.to_datetime(df_filtered[time_column])
    df_filtered = df_filtered.set_index(time_column)

    # Count non-null values per timestamp and property
    valid_counts = df_filtered.notnull().groupby(level=0).sum()

    # missingno visualises nulls, and a count is never null: turn a count of 0 into a
    # missing value so that timestamps without any valid measurement appear as gaps
    df_validity = valid_counts.where(valid_counts > 0)

    # Plot using missingno
    msno.matrix(df_validity, sparkline=False)
    plt.title("Overview of Valid Measurements Over Time")
    plt.xlabel("Properties")
    plt.ylabel("Time")
    plt.show()


In [None]:
# Example usage
# Assuming df_interpolated is your DataFrame with interpolated values
plot_missing_data_overview(df_interpolated.reset_index(), 
                           id_column='id', 
                           time_column='timestamp', 
                           properties_include=['temp_sup__degC', 'temp_ret__degC', 'fan_rotations__min_1'])

## Inspecting total number of non-null measurements

In [None]:
# Count non-null values per column and per id
non_null_counts_per_col = df_interpolated.groupby(level='id').count()

# Sum across columns to get the total non-null values per id
non_null_counts_per_col['total_non_null'] = non_null_counts_per_col.sum(axis=1)

non_null_counts_per_col.sort_values(by='total_non_null', ascending=False).T.style.applymap(highlight_specific_value)



#### Converting raw properties dataframe to preprocessed dataframe


In [None]:
df_prep = Preprocessor.unstack_prop(df_interpolated)

In [None]:
df_prep

In [None]:
df_prep.info()

## Plotting results: time series before and after preprocessing

In [None]:
%autoreload 2
# Map unit suffixes of property names to mathtext labels; both names refer to the same dict.
# NOTE(review): this replaces the earlier, larger units_to_mathtext and has no 'm3' entry,
# although a cell below plots the '__m3' properties with it — confirm that Plot copes with
# a unit that is missing from this mapping.
units_to_mathtext = property_types = {
    'degC' : r'$°C$',
    'ppm' : r'$ppm$',
    '0' : r'$[-]$',
    'bool': r'$0 = False; 1 = True$',
    'p' : r'$persons$'
}

In [None]:
%%time
#Plot all properties with a single unit for a single id
Plot.dataframe_properties_plot(df_prop.loc[[401632]][[prop for prop in df_prop.columns.values if prop.split('__')[-1] == 'm3']], units_to_mathtext)

In [None]:
%%time
#Plot all preprocessed properties with a single unit for a single id
Plot.dataframe_preprocessed_plot(df_prep.loc[[401632]][[prop for prop in df_prep.columns.values if prop.split('__')[-1] == 'degC']], units_to_mathtext)

In [None]:
# TO DO: add prop_ and prep_ in a `prop-prep` column; merge into single dataframe and unstack, thus allowing for close inspection of preprocessed data

# Other examples: temp_in__degC, minmax filtering and using the static outlier filter, per id
Filtering out extreme temperatures based on mean and standard deviation per room

In [None]:
prop = 'temp_in__degC'

In [None]:
df = df_prop[prop] 
stats = df.groupby(level=['source_type', 'id']).describe().transpose()
stats.columns = stats.columns.swaplevel(0,1)
stats = stats.sort_index(axis=1, level=0)
stats.loc[['count', 'mean', 'min', 'max', 'std'],:]

In [None]:
%matplotlib inline
%matplotlib widget

df_prop[prop].plot.hist(bins=200, alpha=0.5, title = prop)


In [None]:
# Reshape the property so that all-NaN rows and columns are dropped and the index is
# reordered and sorted; the loops below treat index level 0 as the id and level 1 as the
# source_type (presumably the result of the two swaplevel calls — verify against the
# level order of df_prop).
df = (df_prop[prop]
      .unstack([0,1])
      .dropna(how='all', axis=1)
      .dropna(how='all', axis=0)
      .stack([0,1])
      .swaplevel(0,2)
      .swaplevel(0,1)
      .sort_index()
     )

row_ids = df.index.levels[0]
col_source_types = df.index.levels[1]

# create subplots for each combination of id and source_type;
# squeeze=False keeps `axes` two-dimensional even when there is only one id or one
# source_type, so that axes[i, j] always works
fig, axes = plt.subplots(nrows=len(row_ids), ncols=len(col_source_types), figsize=(20, 10), squeeze=False)

# title every subplot and plot a histogram for each combination that has data
# (id_value instead of id: avoids shadowing the builtin for the rest of the session)
for i, id_value in enumerate(row_ids):
    for j, source_type in enumerate(col_source_types):
        ax = axes[i, j]
        ax.set_title(f"id: {id_value}, source_type: {source_type}")
        try:
            data = df.loc[(id_value, source_type)].dropna()
        except KeyError:
            # this combination does not occur in the data: keep the titled, empty subplot
            continue
        ax.hist(data, bins=100)

plt.tight_layout()
plt.show()

In [None]:
df_prop[prop].info()

In [None]:
df = df_prop[prop] 
stats = df.groupby(level=['source_type', 'id']).describe().transpose()
stats.columns = stats.columns.swaplevel(0,1)
stats = stats.sort_index(axis=1, level=0)
stats.loc[['count', 'mean', 'min', 'max', 'std'],:]

## Writing preprocessed properties to a parquet file

In [None]:
%%time 
# Write the preprocessed (not interpolated) properties, including the index, to parquet.
# NOTE(review): this uses the same file name as the earlier df_interpolated.to_parquet(...)
# call, so it overwrites the interpolated result with df_prop — confirm this is intended.
df_prop.to_parquet(rhc_preprocessed_poperties_file, index=True, engine='pyarrow')