# Preprocessing B4B data

In [None]:
import pandas as pd
import numpy as np
# NOTE(review): pylab is imported under the name `plt`; the visible cells only use pyplot-style calls (plt.subplots, plt.tight_layout, plt.show).
import pylab as plt

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

import sys
# extend the module search path; presumably preprocessor.py and plotter.py (imported below) live in these folders -- verify
sys.path.append('../data/')
sys.path.append('../view/')

# load the autoreload extension so that the `%autoreload 2` calls in later cells work
%load_ext autoreload

# NOTE(review): two matplotlib backends are requested back to back; presumably the second one (widget) is the one in effect -- confirm whether the inline line is needed
%matplotlib inline
%matplotlib widget

from preprocessor import Preprocessor
from plotter import Plot

### Load Measured Data from parquet file

In [None]:
%%time
# Prerequisite: for this example to work, the file b4b_raw_properties.parquet must be present in the ../data/ folder.
# One way to get this file is to run B4BExtractionBackup.ipynb first, but then you have to run this code on the energietransitiewindesheim.nl server.

df_prop = pd.read_parquet('../data/b4b_raw_properties.parquet', engine='pyarrow')

# Sorting the DataFrame index is needed to get good performance on certain filters.
# This guard checks whether the index is already sorted and sorts it only when it is not.
if not df_prop.index.is_monotonic_increasing:
    print('df needed index sorting')
    df_prop = df_prop.sort_index()  

In [None]:
# distinct values of the 'id' index level
df_prop.index.unique(level='id').values

In [None]:
# distinct values of the 'source' index level
df_prop.index.unique(level='source').values

In [None]:
# display the raw properties DataFrame
df_prop

In [None]:
# summary of the DataFrame (pandas DataFrame.info)
df_prop.info()

### Plotting data used in analysis

In [None]:
%autoreload 2
# Mathtext label per unit; the keys match the unit suffix of a property name
# (the part after the last '__', e.g. 'ppm' in 'co2__ppm').
# This table is handed to the Plot helpers as their second argument.
# It is deliberately NOT also bound to `property_types`: that name is used
# elsewhere in this notebook for the property -> dtype mapping given to
# Preprocessor.interpolate_time, and aliasing it here would silently replace
# that mapping with this units table.
units_to_mathtext = {
    'degC': r'$°C$',
    'ppm': r'$ppm$',
    '0': r'$[-]$',
    'bool': r'$0 = False; 1 = True$',
    'p': r'$persons$',
}

In [None]:
# %%time

# #Plot all properties with a single unit for a single id
# Plot.dataframe_properties_plot(df_prop.loc[[948634, 999169]][[prop for prop in df_prop.columns.values if prop.split('__')[-1] in['ppm', '0', 'bool']]], units_to_mathtext)

## Inspecting statistics

In [None]:
# Alternative selection: every column whose unit suffix (the part after the last '__') is ppm, 0, bool or p
# props = [prop for prop in df_prop.columns.values if prop.split('__')[-1] in['ppm', '0', 'bool', 'p']]
# Explicit list of the properties inspected in this section
props = ['co2__ppm', 'valve_frac__0', 'occupancy__p', 'occupancy__bool']

In [None]:
# show the selected property names
props

In [None]:
# summary restricted to the selected property columns
df_prop[props].info()

In [None]:
# Descriptive statistics of the selected properties per (source, id) group.
# After transposing, the groups become the columns; the two column levels are
# swapped and sorted so that the first column level is the id.
df = df_prop[props]
stats = (
    df.groupby(level=['source', 'id'])
      .describe()
      .T
      .swaplevel(0, 1, axis=1)
      .sort_index(axis=1, level=0)
)
stats

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of every value in the selected property column.
prop = 'co2__ppm'
df_prop[prop].plot(kind='hist', bins=200, alpha=0.5, title=prop)

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of every value in the selected property column.
prop = 'valve_frac__0'
df_prop[prop].plot(kind='hist', bins=200, alpha=0.5, title=prop)

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of every value in the selected property column.
prop = 'occupancy__p'
df_prop[prop].plot(kind='hist', bins=200, alpha=0.5, title=prop)

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of every value in the selected property column.
prop = 'occupancy__bool'
df_prop[prop].plot(kind='hist', bins=200, alpha=0.5, title=prop)

## Preprocessing co2__ppm
Filtering out measurement errors: CO2 values below 5 ppm (including readings of 0 ppm)

In [None]:
# property column that the following cells of this section operate on
prop = 'co2__ppm'

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of every value currently in the `prop` column.
df_prop[prop].plot(kind='hist', bins=200, alpha=0.5, title=prop)

In [None]:
# Reshape `prop` into a Series whose first two index levels are id and source:
# move index levels 0 and 1 into the columns, drop columns and rows that hold
# no value at all, stack them back and restore the level order.
# NOTE(review): assumes df_prop is indexed by (id, source, <third level>) -- confirm.
df = (df_prop[prop]
      .unstack([0,1])
      .dropna(how='all', axis=1)
      .dropna(how='all', axis=0)
      .stack([0,1])
      .swaplevel(0,2)
      .swaplevel(0,1)
      .sort_index()
     )

ids = df.index.levels[0]
sources = df.index.levels[1]

# One subplot per combination of id (row) and source (column).
# squeeze=False keeps `axes` two-dimensional even when there is only a single
# id or a single source, so axes[i, j] below is always valid.
fig, axes = plt.subplots(nrows=len(ids), ncols=len(sources), figsize=(20, 10), squeeze=False)

# `id_` rather than `id`: a top-level loop variable stays bound after the cell
# has run and would otherwise shadow the builtin id().
for i, id_ in enumerate(ids):
    for j, source in enumerate(sources):
        ax = axes[i, j]
        ax.set_title(f"id: {id_}, Source: {source}")
        try:
            data = df.loc[(id_, source)].dropna()
        except KeyError:
            # no rows for this id/source combination: keep the titled, empty panel
            continue
        ax.hist(data, bins=100)
        # ax.set_xlabel(prop)
        # ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# summary of the selected property column (pandas Series.info)
df_prop[prop].info()

In [None]:
# Descriptive statistics of `prop` per (source, id) group.
# After transposing, the groups become the columns; the two column levels are
# swapped and sorted so that the first column level is the id.
df = df_prop[prop]
stats = (
    df.groupby(level=['source', 'id'])
      .describe()
      .T
      .swaplevel(0, 1, axis=1)
      .sort_index(axis=1, level=0)
)
# show only the most relevant statistics
stats.loc[['count', 'mean', 'min', 'max', 'std'], :]

#### Filter out values below 5 ppm, these must be measurement errors

In [None]:
# Apply the project's min/max filter to the `prop` column with a lower bound of 5; the result replaces df_prop.
# NOTE(review): presumably out-of-range values are replaced by NaN rather than their rows being dropped -- confirm in Preprocessor.filter_min_max
df_prop = Preprocessor.filter_min_max(df_prop, prop, min=5)

#### Check to see whether minimum is better now

In [None]:
# summary of the selected property column (pandas Series.info)
df_prop[prop].info()

In [None]:
# Descriptive statistics of `prop` per (source, id) group.
# After transposing, the groups become the columns; the two column levels are
# swapped and sorted so that the first column level is the id.
df = df_prop[prop]
stats = (
    df.groupby(level=['source', 'id'])
      .describe()
      .T
      .swaplevel(0, 1, axis=1)
      .sort_index(axis=1, level=0)
)
# show only the most relevant statistics
stats.loc[['count', 'mean', 'min', 'max', 'std'], :]

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of every value currently in the `prop` column.
df_prop[prop].plot(kind='hist', bins=200, alpha=0.5, title=prop)


#### Filter out faulty co2 sensors that have std = 0 ppm 

In [None]:
# Standard deviation of `prop` per (id, source) group, broadcast back onto
# every row of that group so it lines up with df_prop's index.
std = df_prop[prop].groupby(level=['id', 'source']).transform('std')
# A standard deviation of exactly zero means the group only ever reported a
# single value. Groups whose std is NaN are not selected.
mask = std == 0
# Blank out only the `prop` column for those rows. Assigning through
# df_prop[mask] would overwrite every column of the matching rows, discarding
# the other properties measured at the same id/source/time.
df_prop.loc[mask, prop] = np.nan

In [None]:
# Descriptive statistics of `prop` per (source, id) group.
# After transposing, the groups become the columns; the two column levels are
# swapped and sorted so that the first column level is the id.
df = df_prop[prop]
stats = (
    df.groupby(level=['source', 'id'])
      .describe()
      .T
      .swaplevel(0, 1, axis=1)
      .sort_index(axis=1, level=0)
)
# show only the most relevant statistics
stats.loc[['count', 'mean', 'min', 'max', 'std'], :]

In [None]:
%matplotlib inline
%matplotlib widget
# Histogram (200 bins) of every value currently in the `prop` column.
df_prop[prop].plot(kind='hist', bins=200, alpha=0.5, title=prop)


In [None]:
# Reshape `prop` into a Series whose first two index levels are id and source:
# move index levels 0 and 1 into the columns, drop columns and rows that hold
# no value at all, stack them back and restore the level order.
# NOTE(review): assumes df_prop is indexed by (id, source, <third level>) -- confirm.
df = (df_prop[prop]
      .unstack([0,1])
      .dropna(how='all', axis=1)
      .dropna(how='all', axis=0)
      .stack([0,1])
      .swaplevel(0,2)
      .swaplevel(0,1)
      .sort_index()
     )

ids = df.index.levels[0]
sources = df.index.levels[1]

# One subplot per combination of id (row) and source (column).
# squeeze=False keeps `axes` two-dimensional even when there is only a single
# id or a single source, so axes[i, j] below is always valid.
fig, axes = plt.subplots(nrows=len(ids), ncols=len(sources), figsize=(20, 10), squeeze=False)

# `id_` rather than `id`: a top-level loop variable stays bound after the cell
# has run and would otherwise shadow the builtin id().
for i, id_ in enumerate(ids):
    for j, source in enumerate(sources):
        ax = axes[i, j]
        ax.set_title(f"id: {id_}, Source: {source}")
        try:
            data = df.loc[(id_, source)].dropna()
        except KeyError:
            # no rows for this id/source combination: keep the titled, empty panel
            continue
        ax.hist(data, bins=100)
        # ax.set_xlabel(prop)
        # ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

##### Interpolating  properties


In [None]:
%%time 
%autoreload 2

# Mapping of property name -> dtype string, passed to the interpolation below as property_dict.
# NOTE(review): presumably the dtype each property is cast to in the result -- confirm in Preprocessor.interpolate_time
property_types = {
    'temp_in__degC' : 'float32',
    'co2__ppm' : 'float32',
    'rel_humidity__0' : 'float32',
    'valve_frac__0' : 'float32',
    'door_open__bool': 'Int8',
    'window_open__bool': 'Int8',
    'occupancy__bool': 'Int8',
    'occupancy__p' : 'Int8'
}

# NOTE(review): judging by the `__min` suffix the three numeric arguments are durations in minutes; their exact meaning is defined by Preprocessor.interpolate_time -- confirm there
# inplace=False: the result is bound to df_interpolated; presumably df_prop itself is left unchanged -- confirm
df_interpolated = Preprocessor.interpolate_time(df_prop,
                                        property_dict = property_types,
                                        upsample__min = 5,
                                        interpolate__min = 15,
                                        limit__min = 90,
                                        inplace=False
                                       )

In [None]:
# summary of the (filtered) raw properties DataFrame
df_prop.info()

In [None]:
# descriptive statistics per column of the raw properties DataFrame
df_prop.describe()

In [None]:
# display the result returned by Preprocessor.interpolate_time
df_interpolated

In [None]:
# summary of the interpolated DataFrame
df_interpolated.info()

#### Converting raw properties dataframe to preprocessed dataframe


In [None]:
# Convert the interpolated properties into the preprocessed DataFrame df_prep.
# NOTE(review): the resulting layout is defined by Preprocessor.unstack_prop (not visible here) -- check there
df_prep = Preprocessor.unstack_prop(df_interpolated)

In [None]:
# display the preprocessed DataFrame
df_prep

In [None]:
# summary of the preprocessed DataFrame
df_prep.info()

#### Writing raw properties to a parquet file

In [None]:
%%time 
# Write df_prop, including its index, to a parquet file using the pyarrow engine.
# NOTE(review): the target path is relative to the current working directory, whereas this notebook reads its input from '../data/b4b_raw_properties.parquet' -- confirm the intended location.
# NOTE(review): at this point df_prop has already been modified by the filtering cells of this notebook, so the file written is not the unmodified raw data -- confirm that the file name is still appropriate.
df_prop.to_parquet('b4b_raw_properties.parquet', index=True, engine='pyarrow')

## Plotting results: time series before and after preprocessing

In [None]:
%autoreload 2
# Mathtext label per unit; the keys match the unit suffix of a property name
# (the part after the last '__', e.g. 'ppm' in 'co2__ppm').
# This table is handed to the Plot helpers as their second argument.
# It is deliberately NOT also bound to `property_types`: that name holds the
# property -> dtype mapping given to Preprocessor.interpolate_time, and
# aliasing it here would silently replace that mapping with this units table.
units_to_mathtext = {
    'degC': r'$°C$',
    'ppm': r'$ppm$',
    '0': r'$[-]$',
    'bool': r'$0 = False; 1 = True$',
    'p': r'$persons$',
}

In [None]:
%%time
# Plot, for id 999169 only, every raw property column whose unit suffix
# (the part after the last '__') is 'ppm'.
Plot.dataframe_properties_plot(
    df_prop.loc[
        [999169],
        [column for column in df_prop.columns.values if column.split('__')[-1] == 'ppm'],
    ],
    units_to_mathtext,
)

In [None]:
%%time
# Plot, for id 999169 only, every preprocessed property column whose unit
# suffix (the part after the last '__') is 'ppm'.
Plot.dataframe_preprocessed_plot(
    df_prep.loc[
        [999169],
        [column for column in df_prep.columns.values if column.split('__')[-1] == 'ppm'],
    ],
    units_to_mathtext,
)

In [None]:
# TO DO: add prop_ and prep_ in a `prop-prep` column; merge into single dataframe and unstack, thus allowing for close inspection of preprocessed data

# Other examples: temp_in__degC, minmax filtering and using the static outlier filter, per id
Filtering out extreme temperatures based on mean and standard deviation per room

In [None]:
# property column that the following cells of this section operate on
prop = 'temp_in__degC'

In [None]:
# Descriptive statistics of `prop` per (source, id) group.
# After transposing, the groups become the columns; the two column levels are
# swapped and sorted so that the first column level is the id.
df = df_prop[prop]
stats = (
    df.groupby(level=['source', 'id'])
      .describe()
      .T
      .swaplevel(0, 1, axis=1)
      .sort_index(axis=1, level=0)
)
# show only the most relevant statistics
stats.loc[['count', 'mean', 'min', 'max', 'std'], :]

In [None]:
%matplotlib inline
%matplotlib widget

# Histogram (200 bins) of every value currently in the `prop` column.
df_prop[prop].plot(kind='hist', bins=200, alpha=0.5, title=prop)


In [None]:
# Reshape `prop` into a Series whose first two index levels are id and source:
# move index levels 0 and 1 into the columns, drop columns and rows that hold
# no value at all, stack them back and restore the level order.
# NOTE(review): assumes df_prop is indexed by (id, source, <third level>) -- confirm.
df = (df_prop[prop]
      .unstack([0,1])
      .dropna(how='all', axis=1)
      .dropna(how='all', axis=0)
      .stack([0,1])
      .swaplevel(0,2)
      .swaplevel(0,1)
      .sort_index()
     )

ids = df.index.levels[0]
sources = df.index.levels[1]

# One subplot per combination of id (row) and source (column).
# squeeze=False keeps `axes` two-dimensional even when there is only a single
# id or a single source, so axes[i, j] below is always valid.
fig, axes = plt.subplots(nrows=len(ids), ncols=len(sources), figsize=(20, 10), squeeze=False)

# `id_` rather than `id`: a top-level loop variable stays bound after the cell
# has run and would otherwise shadow the builtin id().
for i, id_ in enumerate(ids):
    for j, source in enumerate(sources):
        ax = axes[i, j]
        ax.set_title(f"id: {id_}, Source: {source}")
        try:
            data = df.loc[(id_, source)].dropna()
        except KeyError:
            # no rows for this id/source combination: keep the titled, empty panel
            continue
        ax.hist(data, bins=100)
        # ax.set_xlabel(prop)
        # ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# summary of the selected property column (pandas Series.info)
df_prop[prop].info()

In [None]:
# Descriptive statistics of `prop` per (source, id) group.
# After transposing, the groups become the columns; the two column levels are
# swapped and sorted so that the first column level is the id.
df = df_prop[prop]
stats = (
    df.groupby(level=['source', 'id'])
      .describe()
      .T
      .swaplevel(0, 1, axis=1)
      .sort_index(axis=1, level=0)
)
# show only the most relevant statistics
stats.loc[['count', 'mean', 'min', 'max', 'std'], :]