# Import data

In [None]:
import pandas as pd
import numpy as np

# Input Parquet files, one per measurement source (each is read in its own section below)
nfh_input_file_path='nfh_raw_measurements.parquet'
enelogic_input_path='rhc_enelogic_monthly_export.parquet'
remeha_input_file_path='remeha_export.parquet'
# Output Parquet file that receives the merged measurements at the end of the notebook
rhc_output_file_path='rhc_raw_measurements.parquet'
# Home data, passed to the weather lookup together with the merged measurements
home_data_file_path = "home_data.parquet"

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

# Extend the module search path; presumably ../data/ and ../view/ hold the
# `measurements` and `plotter` modules imported below — TODO confirm
import sys
sys.path.append('../data/')
sys.path.append('../view/')

# Load the autoreload extension; the reload mode is selected later via `%autoreload 2`
%load_ext autoreload


# NOTE(review): two matplotlib backends are selected back to back; presumably the
# later 'widget' call takes effect — confirm whether 'inline' is still needed
%matplotlib inline
%matplotlib widget
import pylab as plt
import itertools
from plotter import Plot
from tqdm.notebook import tqdm
import historicdutchweather
from measurements import Measurements, WeatherMeasurements

from urllib.error import HTTPError  # Import HTTPError from urllib.error

## Read NeedForHeat data

In [None]:
%%time
# Load the NeedForHeat measurements from the Parquet file, using pandas'
# nullable dtypes (dtype_backend='numpy_nullable') for the resulting columns.
try:
    df_meas_nfh = pd.read_parquet(
        nfh_input_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise instead of continuing: without this DataFrame every later cell
    # would fail with a NameError, or silently reuse a stale variable left
    # over from an earlier run of the notebook.
    raise
else:
    # Only reached when the read succeeded
    print("File was successfully read without specifying compression codec.")


In [None]:
# Summarize the NeedForHeat frame: column dtypes, non-null counts and memory usage
df_meas_nfh.info()

In [None]:
# Show the dtype of each level of the MultiIndex
df_meas_nfh.index.dtypes

### Add unit to heartbeat property

In [None]:
# List the distinct values of the 'property' index level before 'heartbeat' is renamed
list(df_meas_nfh.index.get_level_values('property').unique())

In [None]:
%%time

# Turn the index levels into regular columns so that their values can be rewritten
df_meas_nfh.reset_index(inplace=True)

# Rename the bare 'heartbeat' property to 'heartbeat__0'; every other name is kept as it is
df_meas_nfh['property'] = df_meas_nfh['property'].map(
    lambda name: name if name != 'heartbeat' else 'heartbeat__0'
)

# Store the three descriptor columns as pandas categoricals
for _col in ('source_category', 'source_type', 'property'):
    df_meas_nfh[_col] = df_meas_nfh[_col].astype('category')

# Rebuild the MultiIndex from the five key columns
df_meas_nfh.set_index(
    ['id', 'source_category', 'source_type', 'timestamp', 'property'],
    inplace=True,
)

In [None]:
# List the distinct property names again to check the 'heartbeat' -> 'heartbeat__0' rename
list(df_meas_nfh.index.get_level_values('property').unique())

In [None]:
# Show the dtype of each index level after the index was rebuilt
df_meas_nfh.index.dtypes

In [None]:
# Display the NeedForHeat measurements
df_meas_nfh

## Read Enelogic data

In [None]:
%%time
# Load the Enelogic measurements from the Parquet file, using pandas'
# nullable dtypes (dtype_backend='numpy_nullable') for the resulting columns.
try:
    df_meas_enelogic = pd.read_parquet(
        enelogic_input_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise instead of continuing: without this DataFrame every later cell
    # would fail with a NameError, or silently reuse a stale variable left
    # over from an earlier run of the notebook.
    raise
else:
    # Only reached when the read succeeded
    print("File was successfully read without specifying compression codec.")


In [None]:
# Summarize the Enelogic frame: column dtypes, non-null counts and memory usage
df_meas_enelogic.info()

In [None]:
# Show the dtype of each level of the MultiIndex
df_meas_enelogic.index.dtypes

In [None]:
# Display the Enelogic measurements
df_meas_enelogic

In [None]:
%%time
# Cast the 'value' column to strings; presumably so all sources share one
# 'value' dtype before they are concatenated — TODO confirm.
# NOTE(review): astype(str) also turns missing values into literal text
# (e.g. '<NA>' or 'nan') — confirm this is intended.
df_meas_enelogic['value'] = df_meas_enelogic['value'].astype(str)

In [None]:
# Re-check the column dtypes after the conversion
df_meas_enelogic.info()

## Read Remeha data

In [None]:
%%time
# Load the Remeha measurements from the Parquet file, using pandas'
# nullable dtypes (dtype_backend='numpy_nullable') for the resulting columns.
try:
    df_meas_remeha = pd.read_parquet(
        remeha_input_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise instead of continuing: without this DataFrame every later cell
    # would fail with a NameError, or silently reuse a stale variable left
    # over from an earlier run of the notebook.
    raise
else:
    # Only reached when the read succeeded
    print("File was successfully read without specifying compression codec.")


In [None]:
# Summarize the Remeha frame: column dtypes, non-null counts and memory usage
df_meas_remeha.info()

In [None]:
# Show the dtype of each level of the MultiIndex
df_meas_remeha.index.dtypes

In [None]:
# Display the Remeha measurements
df_meas_remeha

In [None]:
%%time
# Cast the 'value' column to strings; presumably so all sources share one
# 'value' dtype before they are concatenated — TODO confirm.
# NOTE(review): astype(str) also turns missing values into literal text
# (e.g. '<NA>' or 'nan') — confirm this is intended.
df_meas_remeha['value'] = df_meas_remeha['value'].astype(str)

In [None]:
# Re-check the column dtypes after the conversion
df_meas_remeha.info()

## Merge measurements


In [None]:
%%time
# Stack the NeedForHeat, Enelogic and Remeha measurements row-wise into one frame
df_meas = pd.concat([df_meas_nfh, df_meas_enelogic, df_meas_remeha])

In [None]:
# Show the dtype of each index level of the merged frame
df_meas.index.dtypes

In [None]:
%%time
# Flatten the MultiIndex of the merged frame into ordinary columns
df_meas_reset = df_meas.reset_index()

# Re-encode the three descriptor columns as pandas categoricals
for _col in ('source_category', 'source_type', 'property'):
    df_meas_reset[_col] = df_meas_reset[_col].astype('category')

# Rebuild the MultiIndex from the five key columns
df_meas = df_meas_reset.set_index(
    ['id', 'source_category', 'source_type', 'timestamp', 'property']
)


In [None]:
# Show the index level dtypes after the descriptor levels were converted to categoricals
df_meas.index.dtypes

## Get and merge geospatially interpolated KNMI weather measurements

In [None]:
%%time
# Load the home data from the Parquet file, using pandas' nullable dtypes
# (dtype_backend='numpy_nullable') for the resulting columns.
try:
    df_homes = pd.read_parquet(
        home_data_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise instead of continuing: the weather lookup below needs df_homes,
    # and would otherwise fail with a NameError or silently reuse a stale
    # variable left over from an earlier run of the notebook.
    raise
else:
    # Only reached when the read succeeded
    print("File was successfully read without specifying compression codec.")


In [None]:
%%time
# Autoreload mode 2: reload modules before executing code, so edits to the
# project modules are picked up without restarting the kernel
%autoreload 2
# Obtain weather measurements from the merged measurements and the home data.
# NOTE(review): the section heading describes these as geospatially interpolated
# KNMI data; the helper's implementation is not visible here — confirm
df_meas_weather = WeatherMeasurements.get_nfh_weather_measurements(df_meas, df_homes)

In [None]:
# Summarize the weather frame: column dtypes, non-null counts and memory usage
df_meas_weather.info()

In [None]:
# Show the dtype of each level of the MultiIndex
df_meas_weather.index.dtypes

In [None]:
# Pivot the 'property' index level into columns and show summary statistics,
# transposed so that each property becomes one row
df_meas_weather['value'].unstack('property').describe().T

## Merge weather measurements


In [None]:
%%time
# Cast the weather values to strings, as was done for the Enelogic and Remeha
# values before their merge.
# NOTE(review): astype(str) also turns missing values into literal text
# (e.g. '<NA>' or 'nan') — confirm this is intended.
df_meas_weather['value'] = df_meas_weather['value'].astype(str)

In [None]:
%%time
# Append the weather rows to the merged measurements.
# NOTE(review): re-running this cell appends the weather rows again; the
# drop_duplicates step further below removes exact repeats.
df_meas = pd.concat([df_meas, df_meas_weather])

In [None]:
# Show the dtype of each index level after the weather merge
df_meas.index.dtypes

In [None]:
%%time
# Flatten the MultiIndex of the merged frame (now including weather) into ordinary columns
df_meas_reset = df_meas.reset_index()

# Re-encode the three descriptor columns as pandas categoricals
for _col in ('source_category', 'source_type', 'property'):
    df_meas_reset[_col] = df_meas_reset[_col].astype('category')

# Rebuild the MultiIndex from the five key columns
df_meas = df_meas_reset.set_index(
    ['id', 'source_category', 'source_type', 'timestamp', 'property']
)

In [None]:
# Show the index level dtypes after the descriptor levels were converted to categoricals
df_meas.index.dtypes

## Initial exploration: size, ids, start & stop times per id

In [None]:
# Summarize the merged frame: column dtypes, non-null counts and memory usage
df_meas.info()

In [None]:
# Show the dtype of each level of the MultiIndex
df_meas.index.dtypes

In [None]:
# Row count before deduplication ('_' is used as thousands separator)
print(f"len(df_meas): {len(df_meas):_}")

In [None]:
%%time
# Remove rows that are exact duplicates, then rebuild and sort the MultiIndex
df_meas = (
    df_meas
    .reset_index()  # index levels become columns, so they take part in the comparison
    .drop_duplicates()
    .set_index(['id', 'source_category', 'source_type', 'timestamp', 'property'])
    .sort_index()
)

In [None]:
# Row count after deduplication ('_' is used as thousands separator)
print(f"len(df_meas): {len(df_meas):_}")

In [None]:
# List the distinct values of the 'id' index level
list(df_meas.index.get_level_values('id').unique())

In [None]:
# Number of distinct ids
len(df_meas.index.get_level_values('id').unique())

In [None]:
%%time
# Earliest and latest timestamp per (id, source_type) combination;
# observed=True restricts the result to category combinations that actually occur
df_meas.reset_index().groupby(['id', 'source_type'], observed=True)['timestamp'].agg(['min', 'max'])

In [None]:
# Number of non-missing entries in the 'value' column
print(f"df_meas['value'].count(): {df_meas['value'].count():_}")

In [None]:
# Verify the deduplication. DataFrame.duplicated() compares column values only
# and ignores the index, so the index levels are turned into columns first;
# otherwise rows that merely share the same column values (but differ in id,
# timestamp, property, ...) would be reported as duplicates.
df_meas.reset_index().duplicated().any()

In [None]:
# Summarize the deduplicated frame: column dtypes, non-null counts and memory usage
df_meas.info()

In [None]:
# Summary statistics of the columns
df_meas.describe()

In [None]:
# Display the merged measurements
df_meas

In [None]:
# List the distinct values of the 'source_category' index level
list(df_meas.index.get_level_values('source_category').unique())

In [None]:
# List the distinct values of the 'source_type' index level
list(df_meas.index.get_level_values('source_type').unique())

In [None]:
# List the distinct values of the 'property' index level
list(df_meas.index.get_level_values('property').unique())

In [None]:
# Number of rows per source_type (only combinations that actually occur)
df_meas.groupby(['source_type'], observed=True).size()

In [None]:
# Number of rows per (source_category, source_type, property) combination that actually occurs
df_meas.groupby(['source_category', 'source_type', 'property'], observed=True).size()

In [None]:
# Total number of non-missing cells, summed over all columns
print(f"df_meas.count().sum(): {df_meas.count().sum():_}")

## Write to parquet file(s)

In [None]:
# Show the index level dtypes just before writing
df_meas.index.dtypes

In [None]:
%%time 
# Convert the 'value' column to string type
# NOTE(review): the per-source 'value' columns were already cast to str before
# merging; confirm whether this second cast is still needed
df_meas['value'] = df_meas['value'].astype(str)

In [None]:
%%time 
# Write the merged measurements, including the MultiIndex (index=True), to the output Parquet file
df_meas.to_parquet(rhc_output_file_path, index=True, engine='pyarrow')

In [None]:
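# # Optional: uncomment the lines below to also write one Parquet file per home id (then remove this line, because %%time must be the first line of its cell)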
# %%time 
# for home_id in tqdm(df_meas.index.get_level_values('id').unique()):
#     df_meas.xs(home_id, drop_level=False).to_parquet(f'{home_id}_raw_measurements.parquet', index=True, engine='pyarrow')