# Import data

In [None]:
import pandas as pd
import numpy as np

# Parquet input files for the three measurement sources, and the merged output file.
nfh_input_file_path = 'nfh_raw_meas.parquet'
enelogic_input_path = 'rhc_enelogic_monthly_export.parquet'
remeha_input_file_path = 'remeha_export.parquet'
rhc_output_file_path = 'rhc_raw_meas.parquet'

# Display-only setting: render DataFrames with two decimals
# (the stored values keep their full precision).
pd.set_option('display.precision', 2)

import sys

# Add the sibling source directories to the module search path, in this order.
# NOTE(review): presumably this is what makes local modules such as `plotter` importable — confirm.
sys.path.extend(['../data/', '../view/'])

%load_ext autoreload


%matplotlib inline
%matplotlib widget
import pylab as plt
import itertools
from plotter import Plot
from tqdm.notebook import tqdm

In [None]:
# Lookup table from unit identifier to a `$...$` math-text label.
# `property_types` is bound to the very same dict object; the alias is kept
# so that any code using either name keeps working.
units_to_mathtext = property_types = {
    'ppm': r'$ppm$',
    'kWh': r'$kWh$',
    'm3': r'$m^{3}$',
    'degC': r'$°C$',
    'W': r'$W$',
    'V': r'$V$',
    '0': r'$[-]$',
    'bool': r'$0 = False; 1 = True$',
    # Watt per square metre. `\cdot` needs a separating space: `\cdotm` would be
    # parsed as one (unknown) command. The exponent is -2, matching the key.
    'W_m_2': r'$W\cdot m^{-2}$',
}

## Read NeedForHeat data

In [None]:
%%time
# Read the NeedForHeat measurements from the Parquet file into `df_nfh`.
# NOTE(review): `use_nullable_dtypes` is deprecated since pandas 2.0 in favour of
# `dtype_backend`; kept because the pandas version in use is not visible here — confirm.
try:
    df_nfh = pd.read_parquet(
        nfh_input_file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise so the notebook stops here: otherwise `df_nfh` would be undefined
    # (or stale from an earlier run) when the following cells use it.
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
# Print a concise summary of the NeedForHeat frame (DataFrame.info).
df_nfh.info()

In [None]:
# Bare expression: the notebook renders the NeedForHeat frame as the cell output.
df_nfh

## Read Enelogic data

In [None]:
%%time
# Read the Enelogic export from the Parquet file into `df_enelogic`.
# NOTE(review): `use_nullable_dtypes` is deprecated since pandas 2.0 in favour of
# `dtype_backend`; kept because the pandas version in use is not visible here — confirm.
try:
    df_enelogic = pd.read_parquet(
        enelogic_input_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise so the notebook stops here: otherwise `df_enelogic` would be undefined
    # (or stale from an earlier run) when the following cells use it.
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
# Print a concise summary of the Enelogic frame (DataFrame.info).
df_enelogic.info()

In [None]:
# Bare expression: the notebook renders the Enelogic frame as the cell output.
df_enelogic

In [None]:
%%time
# Store the Enelogic values as text.
# NOTE(review): presumably so the column matches the other sources before merging — confirm.
# The nullable 'string' dtype is used instead of `str`: `astype(str)` turns missing
# values into literal text (e.g. '<NA>'), which would then count as real measurements.
df_enelogic['value'] = df_enelogic['value'].astype('string')

In [None]:
# Print the summary again to check the dtype of 'value' after the cast to text.
df_enelogic.info()

## Read Remeha data

In [None]:
%%time
# Read the Remeha export from the Parquet file into `df_remeha`.
# NOTE(review): `use_nullable_dtypes` is deprecated since pandas 2.0 in favour of
# `dtype_backend`; kept because the pandas version in use is not visible here — confirm.
try:
    df_remeha = pd.read_parquet(
        remeha_input_file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise so the notebook stops here: otherwise `df_remeha` would be undefined
    # (or stale from an earlier run) when the following cells use it.
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
# Print a concise summary of the Remeha frame (DataFrame.info).
df_remeha.info()

In [None]:
# Bare expression: the notebook renders the Remeha frame as the cell output.
df_remeha

In [None]:
%%time
# Store the Remeha values as text.
# NOTE(review): presumably so the column matches the other sources before merging — confirm.
# The nullable 'string' dtype is used instead of `str`: `astype(str)` turns missing
# values into literal text (e.g. '<NA>'), which would then count as real measurements.
df_remeha['value'] = df_remeha['value'].astype('string')

In [None]:
# Print the summary again to check the dtype of 'value' after the cast to text.
df_remeha.info()

## Merge

In [None]:
%%time
# Stack the NeedForHeat, Enelogic and Remeha frames row-wise into a single frame.
df = pd.concat(
    objs=[
        df_nfh,
        df_enelogic,
        df_remeha,
    ],
)

## Initial exploration: size, ids, start & stop times per id

In [None]:
# Print a concise summary of the merged frame (DataFrame.info).
df.info()

In [None]:
# Row count of the merged frame, before de-duplication.
len(df)

In [None]:
%%time
# De-duplicate the measurements, then rebuild and sort the five-level index.
df = (
    df.reset_index()  # index levels become columns, so they take part in the comparison
    .drop_duplicates()  # keep the first of any fully identical rows
    .set_index(['id', 'source_category', 'source_type', 'timestamp', 'property'])
    .sort_index()
)

In [None]:
# Row count after de-duplication.
len(df)

In [None]:
# Distinct values of the 'id' index level, shown as a plain list.
list(df.index.get_level_values('id').unique())

In [None]:
# Number of distinct values in the 'id' index level.
len(df.index.get_level_values('id').unique())

In [None]:
%%time
# Earliest and latest timestamp for every (id, source_type) combination.
(
    df.reset_index()
    .groupby(['id', 'source_type'])['timestamp']
    .agg(['min', 'max'])
)

In [None]:
# Number of non-missing entries in the 'value' column.
df['value'].count()

In [None]:
# Check that no fully duplicated measurement rows remain.
# DataFrame.duplicated() compares columns only and ignores the index, so the index
# levels are moved back into columns first; otherwise any two rows that merely share
# the same column values (at different ids/timestamps) would be reported as duplicates.
df.reset_index().duplicated().any()

In [None]:
# Print a concise summary of the de-duplicated, re-indexed frame (DataFrame.info).
df.info()

In [None]:
# Descriptive statistics of the frame's columns (DataFrame.describe).
df.describe()

In [None]:
# Bare expression: the notebook renders the merged frame as the cell output.
df

In [None]:
# Distinct values of the 'source_category' index level, shown as a plain list.
list(df.index.get_level_values('source_category').unique())

In [None]:
# Distinct values of the 'source_type' index level, shown as a plain list.
list(df.index.get_level_values('source_type').unique())

In [None]:
# Distinct values of the 'property' index level, shown as a plain list.
list(df.index.get_level_values('property').unique())

In [None]:
# Number of rows per source_type.
df.groupby(['source_type']).size()

In [None]:
# Number of rows per (source_category, source_type, property) combination.
df.groupby(['source_category', 'source_type', 'property']).size()

In [None]:
# Total number of non-missing cells over all columns, printed with '_' as thousands separator.
print(f"df.count().sum(): {df.count().sum():_}")

## Write to parquet file(s)

In [None]:
%%time 
# Persist the merged measurements, including the index levels, as one Parquet file.
df.to_parquet(
    rhc_output_file_path,
    engine='pyarrow',
    index=True,
)

In [None]:
# %%time 
# for home_id in tqdm(homes):
#     df.xs(home_id, drop_level=False).to_parquet(f'{home_id}_raw_measurements.parquet', index=True, engine='pyarrow')

## Write to csv file(s)

In [None]:
# Distinct values of the 'id' index level, as a plain list; the export loop iterates over these.
homes = df.index.get_level_values('id').unique().tolist()

### Write raw measurements per home to zipped .CSV files

In [None]:
%%time 
# Write the raw measurements of each home to its own zipped CSV file:
# `<home_id>_raw_measurements.zip`, containing `<home_id>_raw_measurements.csv`.
for home_id in tqdm(homes):
    zip_file_name = f'{home_id}_raw_measurements.zip'
    try:
        # drop_level=False keeps the selected index level in the exported rows.
        df.xs(home_id, drop_level=False).to_csv(
            zip_file_name,
            encoding='utf-8',
            compression=dict(
                method='zip',
                archive_name=f'{home_id}_raw_measurements.csv',
            ),
            # Date-time text format including the UTC offset (%z).
            date_format='%Y-%m-%dT%H:%M:%S%z',
        )
    except FileNotFoundError as e:
        # Best effort: report the file that could not be written (the zip, not a
        # parquet file as the earlier message claimed) and carry on with the next home.
        print(f"Error: {e}. Skipping file {zip_file_name}.")
