# Preprocessing B4B data

In [None]:
import pandas as pd
import numpy as np
import pylab as plt

# Show DataFrames with two decimals; this only affects how values are displayed,
# the stored values keep their full internal precision.
pd.set_option('display.precision', 2)

import sys
sys.path.append('../data/')

%load_ext autoreload

%matplotlib inline
%matplotlib widget

from preprocessor import Preprocessor

In [None]:
%%time
# Prerequisite: for this example to work, you need to have the b4b_raw_properties.parquet, located e.g. in the ../data/ folder.
# One way to get this is to run B4BExtractionBackup.ipynb first
df = pd.read_parquet('b4b_raw_properties.parquet', engine='pyarrow')

#sorting the DataFrame index is needed to get good performance on certain filters
df = df.sort_index()

In [None]:
#if this plot does not show up at initial run, run the cell again (something fishy with interactive plotting of DataFrame.plot.hist())
%matplotlib widget
df.temp_in__degC.plot.hist(bins=50, alpha=0.5)

In [None]:
# Print a concise summary of df (index, column dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Descriptive statistics per column (for numeric columns: count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# Display the DataFrame loaded from the raw properties parquet file.
df

## Example: using the min-max filter
Filtering out extreme, highly unlikely temperature measurements

In [None]:
# Apply the min-max filter to column 'temp_in__degC' of df with bounds 5 and 40
# (degrees Celsius, going by the column name).
# NOTE(review): the return value is not assigned, so presumably the filter modifies
# df in place — confirm in the preprocessor module.
Preprocessor.filter_min_max(df, 'temp_in__degC', min=5, max=40)

In [None]:
# Summary of df after the min-max filter call, for comparison with the summary of the raw data.
df.info()

In [None]:
# Descriptive statistics of df after the min-max filter call, for comparison with the raw data.
df.describe()

In [None]:
#if this plot does not show up at initial run, run the cell again (something fishy with interactive plotting of DataFrame.plot.hist())
%matplotlib widget
df.temp_in__degC.plot.hist(bins=50, alpha=0.5)

## Example: using the static outlier filter, per id
Filtering out extreme temperatures based on mean and standard deviation per room

In [None]:
# Apply the static outlier filter to column 'temp_in__degC' of df, with n_sigma=3.0 and per_id=True.
# NOTE(review): presumably values further than n_sigma standard deviations from the mean,
# computed separately per id, are treated as outliers, and df is modified in place since the
# return value is not assigned — confirm in the preprocessor module.
Preprocessor.filter_static_outliers(df, 'temp_in__degC', n_sigma=3.0, per_id=True)

In [None]:
#if this plot does not show up at initial run, run the cell again (something fishy with interactive plotting of DataFrame.plot.hist())
%matplotlib widget
df.temp_in__degC.plot.hist(bins=50, alpha=0.5)

In [None]:
# Summary of df after the static outlier filter call, for comparison with the earlier summaries.
df.info()

In [None]:
# Descriptive statistics of df after the static outlier filter call, for comparison with the earlier ones.
df.describe()

## Converting raw properties dataframe to preprocessed dataframe


In [None]:
%%time
df_prep = df.unstack([1])
df_prep.columns = df_prep.columns.swaplevel(0,1)
df_prep.columns = ['_'.join(col) for col in df_prep.columns.values]

In [None]:
# Display the unstacked DataFrame with flattened column names.
df_prep

In [None]:
# Print a concise summary of df_prep (index, column dtypes, non-null counts, memory usage).
df_prep.info()

In [None]:
%%time
df_prep = df_prep.dropna(axis=1,how='all')

In [None]:
# Display df_prep after the all-missing columns were dropped.
df_prep

In [None]:
# Summary of df_prep after the all-missing columns were dropped, for comparison with the previous summary.
df_prep.info()