# REDUCEDHEATCARB data extraction and backup

This JupyterLab notebook can be used to download raw data from a twomes_v2 database (see also [more information on how to set up a backoffice server](https://github.com/energietransitie/twomes-backoffice-configuration#jupyterlab)).
Don't forget to install the requirements listed in [requirements.txt](../requirements.txt) first!

Make sure you have an Excel file pseudonyms.xlsx in the same folder as this notebook file, with columns 'pseudonym' and 'account_id', which define the mapping.


## Set the stage

First, several imports and variables need to be defined.


### Imports and generic settings

In [None]:
from datetime import datetime, timedelta
import pytz
# NOTE(review): pylab is imported under the alias plt, but plt is not used in the visible cells of this notebook
import pylab as plt

import pandas as pd
import numpy as np

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

import sys
# make modules in ../data/ importable; presumably required for the `measurements` import below -- TODO confirm
sys.path.append('../data/')

# parquet file that receives the complete set of raw measurements (see 'Write raw measurements to a parquet file')
nfh_output_file_path='nfh_raw_meas.parquet'

# load the IPython autoreload extension; the retrieval cells below invoke `%autoreload 2`
%load_ext autoreload
import gc

# provides get_needforheat_measurements(), which the retrieval cells call; presumably a project-local module
from measurements import Measurements

# notebook progress bar, used when writing one parquet file per home
from tqdm.notebook import tqdm


import logging

# Send log records of level INFO and above to log.txt,
# each prefixed with a timestamp and the (padded) level name.
logging.basicConfig(
    filename='log.txt',
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

### Define which homes, which period 

- which `homes` 
- the location and timezone of those homes (currently, we only support one location and timezone for a batch of homes) 
- from which `start_day` to which `end_day` 

In [None]:
# Location used for weather interpolation: the center of the Assendorp neighbourhood in Zwolle.
# TODO: change this to a location per home (rounded to H3 coordinates at a high level)
lat, lon = 52.50655, 6.09961

# Timezones: one for the database, one for the homes
timezone_database = 'UTC'
timezone_homes = 'Europe/Amsterdam'

# Period: maximum period of data collection; both days are localized to the timezone of the homes
first_day, last_day = (
    pytz.timezone(timezone_homes).localize(naive_day)
    for naive_day in (datetime(2023, 12, 8), datetime(2024, 4, 1))
)

# Alternative period: shorter, with suitable weather and lots of homes with measurements.
# first_day, last_day = (
#     pytz.timezone(timezone_homes).localize(naive_day)
#     for naive_day in (datetime(2024, 2, 12), datetime(2024, 2, 25))
# )


# Each *_types dictionary maps a property name to the dtype name intended for its values;
# the matching list holds the same property names, in the same order.

# Properties: a single one
needforheat_single_property_type = {
    'co2__ppm': 'float32',
}
needforheat_single_property = list(needforheat_single_property_type)


# Properties: limited set
needforheat_limited_properties_types = {
    'temp_in__degC': 'float32',
    'co2__ppm': 'float32',
    'e_use_lo_cum__kWh': 'float64',
    'e_use_hi_cum__kWh': 'float64',
    'e_ret_lo_cum__kWh': 'float64',
    'e_ret_hi_cum__kWh': 'float64',
    'g_use_cum__m3': 'float64',
    'occupancy__p': 'Int8',
}
needforheat_limited_properties = list(needforheat_limited_properties_types)


# Properties: full set
needforheat_full_properties_types = {
    'temp_in__degC': 'float32',
    'co2__ppm': 'float32',
    'rel_humidity__0': 'float32',
    'battery_voltage__V': 'float32',
    'occupancy__p': 'Int8',
    'onboarded__p': 'Int8',
    'heartbeat': 'Int16',
    'e_use_lo_cum__kWh': 'float64',
    'e_use_hi_cum__kWh': 'float64',
    'e_ret_lo_cum__kWh': 'float64',
    'e_ret_hi_cum__kWh': 'float64',
    'g_use_cum__m3': 'float64',
    'meter_code__str': 'str',
    'dsmr_version__0': 'float32',
    'e_use_cum__kWh': 'float64',
    'e_ret_cum__kWh': 'float64',
}
needforheat_full_properties = list(needforheat_full_properties_types)

# Maps each measurement source (type) to its source category
map_source_category = {
    'twomes-p1-reader-firmware': 'device',
    'enelogic': 'cloud_feed',
    'twomes-co2-occupancy-scd41-m5coreink-firmware': 'device',
}


In [None]:
# Read the pseudonym <-> account_id mapping from pseudonyms.xlsx, which must be located
# in the same folder as this notebook and contain the columns 'pseudonym' and 'account_id'.
df_pseudonym_mapping = pd.read_excel('pseudonyms.xlsx')

# Forward lookup: pseudonym -> account_id
pseudonym_to_account_id = {
    pseudonym: account_id
    for pseudonym, account_id in zip(
        df_pseudonym_mapping['pseudonym'],
        df_pseudonym_mapping['account_id'],
    )
}

# Reverse lookup: account_id -> pseudonym
account_id_to_pseudonym = {
    account_id: pseudonym
    for account_id, pseudonym in zip(
        df_pseudonym_mapping['account_id'],
        df_pseudonym_mapping['pseudonym'],
    )
}


In [None]:
# Homes: full set of subjects that started and did not stop
homes_full = [
    401632, 403603, 404873, 410260, 412715, 424197,
    429011, 430062, 434931, 438708, 440152, 444964,
    449134, 450051, 450298, 456638, 458000, 458852,
    478667, 483173, 487126, 487289, 494233, 495906,
]

# Homes: subset that satisfy 6 criteria: 1_app_activated__bool, 2a_p1_activated__bool,
# 2b_woonkamermodule_activated__bool, 3b_completed_onboarding__bool, 4a_enelogic_auth__bool,
# 4b_enelogic_data_bool
homes_all = [
    401632, 403603, 404873, 410260, 412715,
    424197, 429011, 430062, 434931, 444964,
    449134, 450298, 456638, 458000, 458852,
    478667, 483173, 487126, 494233, 495906,
]

# Homes: 3 homes (for testing multi-home data retrieval)
homes_3 = [
    401632,
    410260,
    424197,
]

# Homes: single home (for testing purposes)
homes_single = [
    424197,
]


## Get measurements for 1 property, 1 home

In [None]:
# Selection for this run: the single test home and the single property
homes, properties, types = (
    homes_single,
    needforheat_single_property,
    needforheat_single_property_type,
)

In [None]:
%%time 
%autoreload 2
df_meas= Measurements.get_needforheat_measurements(
    [pseudonym_to_account_id[pseudonym] for pseudonym in homes],
    first_day, last_day,
    properties,
    timezone_database, timezone_homes)
df_meas.index = df_meas.index.set_levels(df_meas.index.levels[0].map(account_id_to_pseudonym), level=0)

In [None]:
# Print a concise summary of the retrieved DataFrame (index, columns, dtypes, non-null counts, memory usage)
df_meas.info()

In [None]:
# Display the retrieved measurements (last expression of the cell, so the notebook renders it)
df_meas

## Get measurements for more properties for 3 homes

In [None]:
# Selection for this run: three homes and the limited set of properties
homes, properties, types = (
    homes_3,
    needforheat_limited_properties,
    needforheat_limited_properties_types,
)

In [None]:
%%time 
%autoreload 2
df_meas= Measurements.get_needforheat_measurements(
    [pseudonym_to_account_id[pseudonym] for pseudonym in homes],
    first_day, last_day,
    properties,
    timezone_database, timezone_homes)
df_meas.index = df_meas.index.set_levels(df_meas.index.levels[0].map(account_id_to_pseudonym), level=0)

In [None]:
# Print a concise summary of the retrieved DataFrame (index, columns, dtypes, non-null counts, memory usage)
df_meas.info()

In [None]:
# Display the retrieved measurements (last expression of the cell, so the notebook renders it)
df_meas

### Transform the index to new format (id, source_category, source_type, timestamp, property)

In [None]:
# Rename index level 'source' to 'source_type' and remove the 'device_name' level
df_meas = df_meas.rename_axis(index={'source': 'source_type'}).droplevel('device_name')

# Derive 'source_category' from the 'source_type' index level; it is a regular column for now
df_meas['source_category'] = (
    df_meas.index
    .get_level_values('source_type')
    .map(map_source_category)
)

# Turn all index levels into columns, then rebuild the index with 'source_category' included
df_meas.reset_index(inplace=True)
df_meas.set_index(
    ['id', 'source_category', 'source_type', 'timestamp', 'property'],
    inplace=True,
)

In [None]:
# Display the measurements with the transformed index
df_meas

### Get measurements for all properties for the list homes_all

In [None]:
# Selection for this run: the homes in homes_all and the full set of properties
# (alternative for the homes: homes_full)
homes, properties, types = (
    homes_all,
    needforheat_full_properties,
    needforheat_full_properties_types,
)

In [None]:
# Number of homes selected for the retrieval below
len(homes)

In [None]:
%%time 
%autoreload 2
df_meas= Measurements.get_needforheat_measurements(
    [pseudonym_to_account_id[pseudonym] for pseudonym in homes] ,
    first_day, last_day,
    properties,
    timezone_database, timezone_homes)
df_meas.index = df_meas.index.set_levels(df_meas.index.levels[0].map(account_id_to_pseudonym), level=0)

In [None]:
# Rename index level 'source' to 'source_type' and remove the 'device_name' level
df_meas = df_meas.rename_axis(index={'source': 'source_type'}).droplevel('device_name')

# Derive 'source_category' from the 'source_type' index level; it is a regular column for now
df_meas['source_category'] = (
    df_meas.index
    .get_level_values('source_type')
    .map(map_source_category)
)

# Turn all index levels into columns, then rebuild the index with 'source_category' included;
# both steps work in place on df_meas
df_meas.reset_index(inplace=True)
df_meas.set_index(
    ['id', 'source_category', 'source_type', 'timestamp', 'property'],
    inplace=True,
)

In [None]:
# Print a concise summary of the transformed DataFrame (index, columns, dtypes, non-null counts, memory usage)
df_meas.info()

In [None]:
# Number of non-missing entries in the 'value' column
df_meas['value'].count()

In [None]:
# Display the measurements with the transformed index
df_meas

## Write to parquet file(s)

### Write raw measurements to a parquet file

In [None]:
%%time 
df_meas.to_parquet(nfh_output_file_path, index=True, engine='pyarrow')

### Write raw measurements per home to parquet files

In [None]:
%%time 
for home_id in tqdm(list(df_meas.index.unique(level='id'))):
    df_meas.xs(home_id, drop_level=False).to_parquet(f'{home_id}_raw_measurements.parquet', index=True, engine='pyarrow')