# Add gas quality to raw data

In [None]:
import pandas as pd
import numpy as np

# Parquet file holding the raw measurements; it is read below and overwritten
# at the end of this notebook with the gas quality rows added
rhc_file_path='rhc_raw_measurements.parquet'
rhc_gas_quality_file = 'rhc_gas_quality.parquet'            # produced by RHC_gas_quality.ipynb

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

# Add ../data/ and ../view/ (relative to the working directory) to the module search path.
# NOTE(review): no import from these directories is visible in this notebook — confirm they are still needed.
import sys
sys.path.append('../data/')
sys.path.append('../view/')

%load_ext autoreload


## Get measurements

In [None]:
%%time
# Load the raw measurements from the Parquet file into df_meas
try:
    df_meas = pd.read_parquet(
        rhc_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise instead of continuing: every later cell needs df_meas, and carrying on
    # could silently reuse a stale df_meas left in the kernel by an earlier run and
    # write it back to rhc_file_path at the end of the notebook.
    raise
else:
    print("File was successfully read without specifying compression codec.")


## Get and merge gas quality data

In [None]:
%%time
# Read gas quality data from the Parquet file produced by RHC_gas_quality.ipynb
try:
    df_gas_quality = pd.read_parquet(
        rhc_gas_quality_file,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise instead of continuing: the merge below needs df_gas_quality, and carrying
    # on could silently reuse a stale df_gas_quality left in the kernel by an earlier run.
    raise
else:
    print("File was successfully read without specifying compression codec.")

In [None]:
# Display the gas quality DataFrame for a visual check
df_gas_quality

In [None]:
# Show the dtype of the gas quality index (its entries are used as timestamps further down)
df_gas_quality.index.dtype

In [None]:
# Print a summary (columns, dtypes, memory usage) of the gas quality data
df_gas_quality.info()

In [None]:
# %%time
# df_gas_quality['actual_gas_std_hhv__J_m_3'] = df_gas_quality['actual_gas_std_hhv__J_m_3'].astype(str)

In [None]:
# Print the summary again; it only differs from the previous one when the
# (currently commented-out) string conversion cell above has been run
df_gas_quality.info()

In [None]:
# Print a summary of the measurements before the gas quality rows are added
df_meas.info()

In [None]:
# Show the dtype of each level of the measurements MultiIndex
df_meas.index.dtypes

In [None]:
# Distinct values of the 'id' index level of df_meas, in order of first appearance
unique_ids = df_meas.index.unique(level='id')

In [None]:
# Display the ids for which gas quality rows will be generated
unique_ids

In [None]:
# Index labels under which the gas quality values are stored alongside the measurements
source_category, source_type, property_name = (
    'batch_import',
    'EDSN',
    'actual_gas_std_hhv__J_m_3',
)

In [None]:
# Build an index for the gas quality rows that is compatible with df_meas:
# one entry per (id, gas quality timestamp) pair, ordered id by id
multiindex_tuples = [
    (_id, source_category, source_type, timestamp, property_name)
    for _id in unique_ids
    for timestamp in df_gas_quality.index
]

# Turn the tuples into a named MultiIndex
new_index = pd.MultiIndex.from_tuples(
    multiindex_tuples,
    names=['id', 'source_category', 'source_type', 'timestamp', 'property'],
)

In [None]:
# Replicate the gas quality data for each id.
# new_index is ordered id by id (all timestamps of the first id, then all timestamps of
# the second id, ...), so the complete series has to be repeated as a whole once per id.
# Repeating each element in place (`.repeat(n)`) yields v1, v1, ..., v2, v2, ... and
# attaches values to the wrong timestamps as soon as there is more than one id.
hhv_values = df_gas_quality['actual_gas_std_hhv__J_m_3'].values
tiled_positions = np.tile(np.arange(len(hhv_values)), len(unique_ids))
df_gas_quality_replicated = pd.DataFrame(
    # take() applies the tiled order while keeping the array type/dtype of the source column
    data={'value': hhv_values.take(tiled_positions)},
    index=new_index
)

In [None]:
%%time
# Append the replicated gas quality rows to the measurements (row-wise concatenation,
# all existing df_meas rows are kept) and sort the combined frame by its index
df_meas = (
    pd.concat([df_meas, df_gas_quality_replicated])
    .sort_index()
)

In [None]:
# Print a summary of the measurements after the gas quality rows were added
df_meas.info()

In [None]:
# Re-check the dtype of each index level after the concatenation
df_meas.index.dtypes

In [None]:
# List the distinct source categories; the value of source_category should now be among them
df_meas.index.get_level_values('source_category').unique()

In [None]:
# List the distinct source types; the value of source_type should now be among them
df_meas.index.get_level_values('source_type').unique()

In [None]:
# List the distinct properties; the value of property_name should now be among them
df_meas.index.get_level_values('property').unique()

## Write to parquet file(s)

In [None]:
# Final look at the index level dtypes before writing to Parquet
df_meas.index.dtypes

In [None]:
%%time 
# Convert the 'value' column to string type
# NOTE(review): astype(str) also stringifies missing values (e.g. NaN becomes 'nan') —
# confirm that readers of the written file expect that.
df_meas['value'] = df_meas['value'].astype(str)

In [None]:
%%time 
# Write the combined measurements to rhc_file_path, i.e. overwrite the file that was
# read at the start of this notebook.
# NOTE(review): re-running the whole notebook on the rewritten file would append the
# gas quality rows a second time — confirm this is only run once per input file.
df_meas.to_parquet(rhc_file_path, index=True, engine='pyarrow')

In [None]:
# # Uncomment this cell to also write one Parquet file per id (needs `from tqdm import tqdm`, which this notebook does not import)
# %%time 
# for home_id in tqdm(df_meas.index.get_level_values('id').unique()):
#     df_meas.xs(home_id, drop_level=False).to_parquet(f'{home_id}_raw_measurements.parquet', index=True, engine='pyarrow')