# Brains4Buildings data extraction and backup

This JupyterLab notebook can be used to download raw data from a Twomes database (see also [more information on how to set up a Twomes server](https://github.com/energietransitie/twomes-backoffice-configuration#jupyterlab)).

In particular, it has been set up to get data from the [Brains4Buildings data collection](https://www.energietransitiewindesheim.nl/brains4buildings2022/privacy/index.html).

Don't forget to install the requirements listed in [requirements.txt](../requirements.txt) first!

To complete data extraction, you also need to have downloaded [b4b-rawdata.zip from the source](https://liveadminwindesheim.sharepoint.com/:u:/r/sites/O365-Brains4Buildings/Gedeelde%20documenten/General/Windesheim%20as%20Living%20Lab/data-raw-anon/b4b-rawdata.zip?csf=1&web=1&e=M0NX1r) and saved it in the `../data/` folder.

## Setting the stage

First, several imports and variables need to be defined.


### Imports and generic settings

In [1]:
from datetime import datetime, timedelta
import pytz
import math

import pandas as pd
import numpy as np

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

import sys
sys.path.append('../data/')

%load_ext autoreload
import gc


from measurements import Measurements

from tqdm.notebook import tqdm


import logging
# Configure the root logger: records of level INFO and above go to the file
# log_b4b.txt, each line formatted as
# "<YYYY-MM-DD HH:MM:SS> <level name, left-aligned in 8 characters> <message>".
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s %(levelname)-8s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='log_b4b.txt',
                   )

SyntaxError: invalid syntax (measurements.py, line 97)

### Defining which account, which period 

- which account was used to provision the measurement devices?
- what are the location and timezone of the building?
- from which `first_day` to which `last_day` should data be collected?

In [None]:
# Location: T-building, Windesheim, in Zwolle.
# NOTE(review): Zwolle lies at roughly 52.5 N, 6.1 E, so these coordinates
# look too far to the south-west for that building - confirm before use.
lat = 52.4350486
lon = 5.4040816

# Timezone names: one for the database, one for the buildings.
timezone_database = 'UTC'
timezone_buildings = 'Europe/Amsterdam'

# The maximum period for data collection, as timezone-aware datetimes
# (midnight, localized to the buildings' timezone).
first_day, last_day = (
    pytz.timezone(timezone_buildings).localize(naive_day)
    for naive_day in (datetime(2022, 10, 1), datetime(2022, 11, 2))
)

# All devices were provisioned by a single account.
account = [820921]

# Maps a device name to the numeric id that replaces it in the index of the
# database measurements (see the `.rename(index=device_mapping)` call below).
device_mapping = {
    'TWOMES-979368': 999169,
    'TWOMES-9799B8': 900846,
    'TWOMES-ACDEF0': 948634,
    'TWOMES-ACEB08': 917810,
    'TWOMES-ACEB4C': 925038,
}

# Ids to process: every mapped device id (in mapping order) plus one id that
# has no entry in device_mapping.
# NOTE(review): 924038 differs by a single digit from 925038 (TWOMES-ACEB4C);
# presumably it is a distinct id that only occurs in the other data source -
# confirm it is not a typo.
rooms = [*device_mapping.values(), 924038]

# Property names as requested from the database -> property names used in
# this notebook.
property_rename = dict(
    CO2concentration='co2__ppm',
    countPresence='occupancy__p',
    relativeHumidity='rel_humidity__0',
    roomTemp='temp_in__degC',
)

# The database property names to request, in the order declared above.
b4b_db_properties = [*property_rename]

# Target dtype per property column, grouped by dtype.
property_types = {
    **dict.fromkeys(
        ['temp_in__degC', 'co2__ppm', 'rel_humidity__0', 'valve_frac__0'],
        'float32',
    ),
    **dict.fromkeys(
        ['door_open__bool', 'window_open__bool', 'occupancy__bool', 'occupancy__p'],
        'Int8',
    ),
}


## Getting measurements from sources

### Getting measurements from the database

In [None]:
%%time 
%autoreload 2
df_db_meas = (Measurements.get_raw_measurements(
    account,
    first_day, last_day,
    b4b_db_properties, property_rename,
    timezone_database, timezone_buildings)
           .loc[account[0]]
           .rename(index=device_mapping)
          )

df_db_meas.index.names = ['id', 'source', 'timestamp', 'property']
df_db_meas = df_db_meas.loc[[device_mapping[id] for id in device_mapping.keys()]]
df_db_meas = df_db_meas.sort_index()
df_db_meas.value = df_db_meas.value.astype('float')
mask_rh = df_db_meas.index.get_level_values('property') == 'rel_humidity__0'
df_db_meas.loc[mask_rh, 'value'] = df_db_meas.loc[mask_rh, 'value']/100
df_db_meas['unit'] = df_db_meas['unit'].cat.add_categories('0')
df_db_meas.loc[mask_rh, 'unit'] = '0'

In [None]:
# Print a concise summary of the database measurements (index, column dtypes,
# non-null counts, memory usage).
df_db_meas.info()

In [None]:
# Show the database measurements as the cell's output.
df_db_meas

### Get other measurements

In [None]:
%%time 
df = pd.read_csv('../data/b4b-rawdata.zip', parse_dates=['timestamp'], index_col=['timezone', 'timestamp']).sort_index(level='timestamp')


df_other_meas = pd.DataFrame()
for tz in df.index.unique(level='timezone'):
    df_other_meas = pd.concat([df_other_meas, df.loc[tz].tz_localize(tz, ambiguous='NaT')])


df_other_meas = df_other_meas.sort_index()

df_other_meas = df_other_meas.loc[df_other_meas.index.dropna()]

In [None]:
# Print a concise summary of the other measurements (index, column dtypes,
# non-null counts, memory usage).
df_other_meas.info()

In [None]:
# Show the other measurements as the cell's output.
df_other_meas

### Merge database and other measurements

In [None]:
# Stack the database measurements and the other measurements as flat tables;
# from the other measurements only the listed columns are taken.
df_meas = pd.concat(
    [
        df_db_meas.reset_index(),
        df_other_meas.reset_index().loc[:, ['id', 'source', 'timestamp', 'property', 'value']],
    ]
)

# Remove rows that are identical in every column, then restore the
# (id, source, timestamp, property) index and sort on it.
df_meas = df_meas.drop_duplicates()
df_meas = df_meas.set_index(['id', 'source', 'timestamp', 'property']).sort_index()

In [None]:
# Print a concise summary of the merged measurements (index, column dtypes,
# non-null counts, memory usage).
df_meas.info()

In [None]:
# Show the merged measurements as the cell's output.
df_meas

### Writing raw measurements to a parquet file

In [None]:
%%time 
# Write all merged raw measurements, including their index, to one parquet
# file in the current working directory.
df_meas.to_parquet('b4b_raw_measurements.parquet', index=True, engine='pyarrow')

### Writing raw measurements per room to parquet files

In [None]:
%%time 
for room_id in tqdm(list(df_meas.index.unique(level='id'))):
    df_meas.xs(room_id, drop_level=False).to_parquet(f'{room_id}_raw_measurements.parquet', index=True, engine='pyarrow')

## Put properties in separate columns, apply types and write parquet file(s)

In [None]:
# Unstacking might take a lot of memory, hence do it home by home
# (example: unstacking the entire Twomes dataset uses 32 GB of memory).
# Release the merged measurements first; the per-id parquet files are read
# back one at a time in the next step.
del df_meas
gc.collect()

### Writing raw properties per room to parquet files

In [None]:
%%time
%autoreload 2

df_prop = pd.DataFrame()

for room_id in tqdm(rooms):
    df_prop_room = Measurements.to_properties(
        pd.read_parquet(f'{room_id}_raw_measurements.parquet', engine='pyarrow'),
        property_types
    )
    df_prop_room.to_parquet(f'{room_id}_raw_properties.parquet', index=True, engine='pyarrow')
    df_prop = pd.concat([df_prop, df_prop_room]) 
    
if not df_prop.index.is_monotonic_increasing:
    df_prop = df_prop.sort_index()  

In [None]:
# Print a concise summary of the properties (index, column dtypes, non-null
# counts, memory usage).
df_prop.info()

In [None]:
# Show the properties as the cell's output.
df_prop

### Writing raw properties to a parquet file

In [None]:
%%time 
# Write the properties of all rooms, including their index, to one parquet
# file in the current working directory.
df_prop.to_parquet('b4b_raw_properties.parquet', index=True, engine='pyarrow')