# Brains4Buildings data extraction and backup

This JupyterLab notebook can be used to download raw data from a Twomes database (see also [more information on how to set up a Twomes server](https://github.com/energietransitie/twomes-backoffice-configuration#jupyterlab)).

In particular, it has been set up to get data from the [Brains4Buildings data collection](https://www.energietransitiewindesheim.nl/brains4buildings2022/privacy/index.html).

Don't forget to install the requirements listed in [requirements.txt](../requirements.txt) first!



## Setting the stage

First, several imports and variables need to be defined.


### Imports and generic settings

In [None]:
# Imports and notebook-wide settings. The order matters: the sys.path additions
# and the IPython magics have to run before the project modules are imported.
from datetime import datetime, timedelta
import pytz
import math
import pylab as plt

import pandas as pd
import numpy as np

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

# Extend the module search path with three directories relative to the working
# directory; presumably this is where plotter, measurements and preprocessor live.
import sys
sys.path.append('../data/')
sys.path.append('../view/')
sys.path.append('../analysis/')

# Load the autoreload extension; later cells switch it on with `%autoreload 2`.
%load_ext autoreload

# Select the 'widget' matplotlib backend before the plotting helper is imported.
%matplotlib widget
from plotter import Plot

from measurements import Measurements
from preprocessor import Preprocessor

# Notebook flavour of the tqdm progress bar, used in the export loops below.
from tqdm.notebook import tqdm


# Send log records of level INFO and above to the file log_b4b.txt, one line per
# record: timestamp, level name padded to 8 characters, message.
import logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s %(levelname)-8s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='log_b4b.txt',
                   )

### Defining which account, which period 

- which account was used to provision the measurements
- what the location and timezone are
- the period: from which `first_day` to which `last_day`

In [None]:
# Location of the building: T-building, Windesheim, in Zwolle
# NOTE(review): these coordinates look like they lie well west of Zwolle
# (roughly 52.5 N, 6.1 E) -- please confirm they are the intended ones.
lat = 52.4350486
lon = 5.4040816

# Time zone names; presumably the first is the zone the database uses and the
# second the local zone of the building -- both are passed on to the queries below.
timezone_database = 'UTC'
timezone_homes = 'Europe/Amsterdam'

# Maximum period for data collection. Both boundaries are naive midnights that
# are localized to `timezone_homes`.
first_day, last_day = (
    pytz.timezone(timezone_homes).localize(naive_midnight)
    for naive_midnight in (datetime(2022, 10, 1), datetime(2022, 11, 2))
)

# All devices were provisioned by a single account
account = [820921]

## Getting accounts

In [None]:
%%time 
%autoreload 2
df = Measurements.get_accounts_devices(first_day, last_day,
                                       timezone_database, timezone_homes)

In [None]:
# Switch on the pandas display option `date_dayfirst`
pd.options.display.date_dayfirst = True

In [None]:
# Show the current value of the pandas display option `date_yearfirst`
pd.options.display.date_yearfirst

In [None]:
# Show the current value of the pandas display option `date_dayfirst`
pd.options.display.date_dayfirst

In [None]:
# Show the current value of the pandas display option `precision` (set to 2 in the imports cell)
pd.options.display.precision

In [None]:
# Display the DataFrame returned by Measurements.get_accounts_devices
df

## Getting a single property of raw data

In [None]:
# Smallest property selection: a single property name mapped to a dtype name
b4b_single_type_dict = {
    'roomTemp': 'float32',
}


In [None]:
# Property selection used for the query in the next cell: property name -> dtype
# name, grouped by dtype (insertion order: the float32 properties first, then
# the Int8 ones).
b4b_full_type_dict = {
    **dict.fromkeys(('roomTemp', 'CO2concentration', 'relativeHumidity'), 'float32'),
    **dict.fromkeys(('countPresence', 'heartbeat'), 'Int8'),
}


In [None]:
%%time 
%autoreload 2
df = Measurements.get_raw_properties(account,
                                     first_day, last_day,
                                     b4b_full_type_dict,
                                     timezone_database, timezone_homes)


In [None]:
# Display the DataFrame returned by Measurements.get_raw_properties
df

In [None]:
# Display summary statistics for the columns of `df`
df.describe()

### Plotting properties

In [None]:
# Plot every property (column of `df`) for every home: first the home's data as
# a whole, then one plot per (device_type, device_name) pair.
# NOTE(review): the device loops combine all device types with all device names
# found anywhere in `df`; presumably the `.loc` lookup fails for a pair that
# does not occur in a given home -- confirm.

for home_id in df.index.unique(level='home_id'):
    for prop in df.columns:
        Plot.temperature_and_power_one_home_plot(
            f'{prop} in {home_id}',
            df.loc[home_id],
            temp_plot_dict={prop: 'r'},
        )

        for device_type in df.index.unique(level='device_type'):
            for device_name in df.index.unique(level='device_name'):
                Plot.temperature_and_power_one_home_plot(
                    f'{device_name}:{prop} in {home_id}',
                    df.loc[home_id, :, device_type, device_name],
                    temp_plot_dict={prop: 'r'},
                )

In [None]:
%autoreload 2
for home_id in list(df.index.unique(level='home_id')):
    for prop in list(df.columns):

        Plot.temperature_and_power_one_home_plot(f'{prop} in {home_id}',
                                        df.loc[home_id],
                                        temp_plot_dict = {prop: 'r'}
                                       )
        
        for device_type in list(df.index.unique(level='device_type')):
            df_plot = df.loc[home_id,:,device_type].copy()
            df_plot = Preprocessor.filter_min_max(df_plot, prop, min=0, max=45)
            df_plot = Preprocessor.filter_static_outliers(df_plot, prop, n_sigma=3)
            Plot.temperature_and_power_one_home_plot(f'{device_type}:{prop} in {home_id}',
                                        df_plot,
                                        temp_plot_dict = {prop: 'r'}
                                       )

In [None]:
# Scratch cell for trying out other property filtering code.
# It selects the first property, first device type and first home found in `df`.

prop = list(df.columns)[0]
device_type = list(df.index.unique(level='device_type'))[0]
home_id = list(df.index.unique(level='home_id'))[0]
dfplot = df.loc[home_id, :, device_type]

# insert new filtering code here

Plot.temperature_and_power_one_home_plot(
    f'{device_type}:{prop} in {home_id}',
    dfplot,
    temp_plot_dict={prop: 'r'},
)

In [None]:
# Histogram with 200 bins of `dfplot`, the selection made in the experiment cell
dfplot.plot.hist(bins=200)

## Getting more properties and writing to a parquet file

In [None]:
# A more minimal set of properties, as done in earlier queries:
# property name -> dtype name
b4b_limited_type_dict = {
    'eMeterReadingReturnHigh': 'float32',
    'eMeterReadingReturnLow': 'float32',
    'eMeterReadingSupplyHigh': 'float32',
    'eMeterReadingSupplyLow': 'float32',
    'eMeterReadingTimestamp': 'str',
    'gMeterReadingSupply': 'float32',
    'gMeterReadingTimestamp': 'str',
    'roomTemp': 'float32',
    'roomSetpointTemp': 'float32',
}


In [None]:
# The full set of properties: property name -> dtype name.
# NOTE: this rebinds `b4b_full_type_dict`, which an earlier cell already defined
# with a smaller, different property set (e.g. this one has no 'countPresence'
# and uses other dtypes for 'heartbeat' and 'CO2concentration').
b4b_full_type_dict = {
    'heartbeat': 'Int16',
    'eMeterReadingReturnHigh': 'float32',
    'eMeterReadingReturnLow': 'float32',
    'eMeterReadingSupplyHigh': 'float32',
    'eMeterReadingSupplyLow': 'float32',
    'eMeterReadingTimestamp': 'str',
    'gMeterReadingSupply': 'float32',
    'gMeterReadingTimestamp': 'str',
    'listRSSI': 'str',
    'boilerTemp1': 'float32',
    'boilerTemp2': 'float32',
    'roomTemp': 'float32',
    'boilerSupplyTemp': 'float32',
    'isBoilerFlameOn': 'Int8',
    'isCentralHeatingModeOn': 'Int8',
    'isDomesticHotWaterModeOn': 'Int8',
    'maxModulationLevel': 'Int8',
    'roomSetpointTemp': 'float32',
    'minModulationLevel': 'Int8',
    'boilerReturnTemp': 'float32',
    'relativeModulationLevel': 'Int8',
    'boilerMaxSupplyTemp': 'float32',
    'CO2concentration': 'Int16',
    'relativeHumidity': 'float32',
    'roomTemp2': 'float32',
}


In [None]:
%%time 
df_data_homes = Measurements.get_raw_properties(homes_single,
                                                first_day, last_day,
                                                b4b_full_type_dict,
                                                timezone_database, timezone_homes)



In [None]:
# Print the pandas summary (index, columns, dtypes, memory use) of `df_data_homes`
df_data_homes.info()

In [None]:
# Display only the two gMeterReading* columns of `df_data_homes`
df_data_homes[['gMeterReadingSupply', 'gMeterReadingTimestamp']]

In [None]:
%%time 
df_data_homes = Measurements.get_raw_properties(homes_all,
                                                first_day, last_day,
                                                b4b_full_type_dict,
                                                timezone_database, timezone_homes)



In [None]:
%%time 
df_data_homes = Measurements.get_raw_properties(homes_3,
                                                first_day, last_day,
                                                b4b_limited_type_dict,
                                                timezone_database, timezone_homes)



In [None]:
%%time 
df_data_homes.to_parquet('homes_all_b4b_limited_type_dict.parquet', index=True, engine='pyarrow')

In [None]:
# Print the pandas summary (index, columns, dtypes, memory use) of `df_data_homes`
df_data_homes.info()

In [None]:
%%time 
df_data_homes.to_parquet('homes_all_b4b_full_type_dict.parquet', index=True, engine='pyarrow')

In [None]:
# Display `df_data_homes`
df_data_homes

In [None]:
# Display summary statistics for the columns of `df_data_homes`
df_data_homes.describe()

### Write individual data for individual homes to parquet files

In [None]:
%%time 
for home_id in tqdm(homes_all):
    filename = f'{home_id}-rawdata_{first_day.isoformat()}-{(last_day+timedelta(days=1)+ timedelta(hours=1)).isoformat()}.parquet'
    df_data_homes.loc[home_id].to_parquet(filename, index=True, engine='pyarrow')


### Write raw data to a CSV file

In [None]:
%%time 
%autoreload 2
df = Measurements.get_raw_measurements(homes_all,
                                     first_day, last_day,
                                     b4b_full_type_dict,
                                     timezone_database, timezone_homes)


In [None]:
%%time 
for home_id in tqdm(list(df.index.unique(level='home_id'))):
    filename = f'{home_id}-rawmeasurements_{first_day.isoformat()}-{(last_day+timedelta(days=1)+ timedelta(hours=1)).isoformat()}.zip'
    df_write = df.loc[[home_id]].copy(deep=True).reset_index([0,2,3])
    df_write['unix_time'] = df_write.index.map(pd.Timestamp.timestamp).astype(int)
    df_write = (df_write
                .sort_values('unix_time')
                .reset_index(drop=True)
                [['home_id', 'unix_time', 'device_type','property', 'value', 'unit']])
    df_write.index.name = '#'
    df_write.to_csv(filename)
    del(df_write)


In [None]:
%%time 
for home_id in tqdm(list(df.index.unique(level='home_id'))):
    filename = f'{home_id}-rawmeasurements_{first_day.isoformat()}-{(last_day+timedelta(days=1)+ timedelta(hours=1)).isoformat()}.parquet'
    df.loc[[home_id]].to_parquet(filename, index=True, engine='pyarrow')
