In [None]:
import os

from datetime import datetime, timedelta
import pytz
import pylab as plt

import pandas as pd
import numpy as np

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2
import sys
sys.path.append('../data/')

%load_ext autoreload
import gc

from measurements import Measurements

from tqdm.notebook import tqdm

# Input: the Remeha Parquet export to read (point this at your own file)
file_path = 'remeha_20231129-20240402.parquet'
# Output: where the converted Parquet file is written at the end of the notebook
file_output_path = 'remeha_export.parquet'



In [None]:
# Size of the input file on disk, in bytes
file_size_bytes = os.path.getsize(file_path)

# Derive the larger (1024-based) units from the byte count in one go
file_size_kb, file_size_mb, file_size_gb = (
    file_size_bytes / 1024 ** exponent for exponent in (1, 2, 3)
)

# Report the size in every unit
size_report = f"File Size: {file_size_bytes} bytes ({file_size_kb:.2f} KB, {file_size_mb:.2f} MB, {file_size_gb:.2f} GB)"
print(size_report)

In [None]:
%%time
# Read the Parquet file into `df` using the pyarrow engine and pandas'
# nullable (NA-aware) dtypes.
# A failure is reported and then re-raised: swallowing it would leave `df`
# undefined (or stale from an earlier run) for every cell that follows.
try:
    df = pd.read_parquet(
        file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
# Summary of the frame as loaded (see pandas DataFrame.info)
df.info()


In [None]:
# Memory usage of the frame as loaded (see pandas DataFrame.memory_usage)
df.memory_usage()

## Map measurement_names to properties and add source_category and source_type

In [None]:
# Translation from the measurement names in the input file to the property
# names used in the rest of this notebook.
translation_table = {
    'parHeFanRpmChMax': 'ch_set_fan_rotations_max__min_1',
    'parHeFanRpmMin': 'ch_set_fan_rotations_min__min_1',
    'parHePowerMax': 'power_ch_max__kW',
    'parHePowerMin': 'power_ch_min__kW',
    'parZoneTFlowSetpointMax': 'temp_ch_sup_max__degC',
    'varApChEnergyConsumption': 'g_use_ch_lhv_cum__kWh',
    'varApDhwEnergyConsumption': 'g_use_dhw_lhv_cum__kWh',
    'varApPowerActual': 'g_use_ch_and_dhw__kW0',
    'varApPumpSpeed': 'ch_water_pump_speed__0',
    'varApStatus': 'boiler_status__str',
    'varApTOutside': 'temp_outdoor__degC',
    'varApTflow': 'temp_flow__degC',
    'varApTreturn': 'temp_ret__degC',
    'varDhwFlowSpeed': 'dhw_flow__l_min_1',
    'varDhwOutTemp': 'dhw_temp__degC',
    'varHeFanRpm': 'fan_rotations__min_1',
    'varHeGasValve1': 'gas_valve__str',
    'varZoneRoomTemperatureMeasured': 'temp_indoor__degC',
    'varZoneTRoomSetpoint': 'temp_set__degC'
}

# Map the translation table to create the 'property' column
df['property'] = df['measurement_name'].map(translation_table)

# Series.map yields a missing value for every name that is not in the table.
# Report those names instead of letting the rows silently end up without a
# property (the frame itself is left unchanged by this check).
unmapped_names = df.loc[df['property'].isna(), 'measurement_name'].dropna().unique()
if len(unmapped_names) > 0:
    print(
        f"Warning: {len(unmapped_names)} measurement name(s) have no translation "
        f"and get a missing 'property': {sorted(unmapped_names)}"
    )


In [None]:
# Data type per property, for the full property set.
# TODO: be more specific for some data types
remeha_full_properties_types = {
    # configured limits / setpoints
    'ch_set_fan_rotations_max__min_1': 'float32',
    'ch_set_fan_rotations_min__min_1': 'float32',
    'power_ch_max__kW': 'float32',
    'power_ch_min__kW': 'float32',
    'temp_ch_sup_max__degC': 'float32',
    # energy / power / pump
    'g_use_ch_lhv_cum__kWh': 'float64',
    'g_use_dhw_lhv_cum__kWh': 'float64',
    'g_use_ch_and_dhw__kW0': 'float64',
    'ch_water_pump_speed__0': 'float64',
    # status and measured values
    'boiler_status__str': 'str',
    'temp_outdoor__degC': 'float32',
    'temp_flow__degC': 'float32',
    'temp_ret__degC': 'float32',
    'dhw_flow__l_min_1': 'float32',
    'dhw_temp__degC': 'float32',
    'fan_rotations__min_1': 'float32',
    'gas_valve__str': 'str',
    'temp_indoor__degC': 'float32',
    'temp_set__degC': 'float32',
}

# Short alias for the same mapping object
types = remeha_full_properties_types

# Property names in declaration order (iterating a dict yields its keys)
remeha_full_properties = [prop for prop in remeha_full_properties_types]



In [None]:
# Column renames applied to the loaded frame: 'pseudonym' becomes 'id'
column_renames = {'pseudonym': 'id'}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Drop the 'measurement_name' column (its content was mapped to 'property')
df.drop(columns=['measurement_name'], inplace=True)

# Constant values for the new columns
source_category_values = ['batch_import']
source_type_values = ['remeha']

# Assign the constants directly: a scalar is broadcast to every row, so there is
# no need to build a second full-length DataFrame and concatenate it (which
# copies the whole frame). Resulting columns and their order are unchanged.
df['source_category'] = source_category_values[0]
df['source_type'] = source_type_values[0]

In [None]:
# Show 25 randomly chosen rows as a spot check (no random_state given, so the
# rows differ between runs)
df.sample(25)

In [None]:
# Columns that together form the MultiIndex, in level order
index_columns = ['id', 'source_category', 'source_type', 'timestamp', 'property']
df.set_index(index_columns, inplace=True)


In [None]:
# Turn selected MultiIndex levels into 'category', one level at a time.
# Positions refer to the index built above: 1 = source_category,
# 2 = source_type, 4 = property.
for level_position in (1, 2, 4):
    categorical_level = df.index.levels[level_position].astype('category')
    df.index = df.index.set_levels(categorical_level, level=level_position)

In [None]:
# Summary of the frame again, now that the MultiIndex is in place
df.info()


In [None]:
# Memory usage again, for comparison with the figures shown right after loading
df.memory_usage()

In [None]:
# Names of the MultiIndex levels
level_names = df.index.names

# Dtype of each level, looked up by position
level_dtypes = []
for position in range(df.index.nlevels):
    level_dtypes.append(df.index.get_level_values(position).dtype)

# Print one line per level: position, name and dtype
for position, (name, dtype) in enumerate(zip(level_names, level_dtypes)):
    print(f"Level {position}: Name = {name}, Dtype = {dtype}")

In [None]:
# Convert the 'value' column to float (note: float, not string)
# NOTE(review): the cast presumably fails if 'value' holds non-numeric text
# (two properties are typed 'str' in remeha_full_properties_types) - confirm.
df['value'] = df['value'].astype(float)

In [None]:
%%time 
# Write the frame, including its index (index=True), to file_output_path
# using the pyarrow engine
df.to_parquet(file_output_path, index=True, engine='pyarrow')