# Export measurements and preprocessed properties to zipped CSV files

In [None]:
import pandas as pd
from tqdm.notebook import tqdm

# Input Parquet file with the raw measurements (read into df_meas further down).
rhc_file_path='rhc_raw_measurements.parquet'
# Input Parquet file with the preprocessed properties (read into df_prep further down).
# NOTE(review): 'poperties' is a typo for 'properties'; the name is left as-is
# because the cell that reads this file refers to it by this exact name.
rhc_preprocessed_poperties_file='rhc_preprocessed_properties.parquet'


## Write to CSV file(s)

### Load Measured Data from parquet file

In [None]:
%%time
# Read the raw measurements from the Parquet file into df_meas.
# A failed read is reported and then re-raised: every later cell needs
# df_meas, so swallowing the error here would only resurface further down
# as an unrelated NameError.
try:
    df_meas = pd.read_parquet(
        rhc_file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    raise
else:
    # Success path kept out of the try body so only the read itself is guarded.
    print("File was successfully read without specifying compression codec.")

In [None]:
# Certain filters only perform well on a sorted index, so this guard checks
# whether df_meas is already sorted and sorts it (with a notice) only if not.
meas_index_sorted = df_meas.index.is_monotonic_increasing
if not meas_index_sorted:
    print('df_meas needed index sorting')
    df_meas = df_meas.sort_index()

In [None]:
# Print a concise summary of df_meas (index, columns and their dtypes).
df_meas.info()

In [None]:
# Show the dtype of each index level.
# NOTE(review): `.dtypes` (plural) presumably relies on df_meas having a
# MultiIndex (an 'id' level plus others) - confirm against the Parquet schema.
df_meas.index.dtypes

In [None]:
# Total number of non-null cells over all columns, printed with '_' as the
# thousands separator.
meas_non_null_total = df_meas.count().sum()
print(f"df_meas.count().sum(): {meas_non_null_total:_}")

In [None]:
# Bare expression as the last line of the cell: the notebook renders df_meas.
df_meas

### Write raw measurements per home to zipped .CSV files

In [None]:
%%time 
# Write one zip archive per home; each archive holds a single CSV file with
# the rows of df_meas that belong to that home.
meas_home_ids = df_meas.index.get_level_values('id').unique()
for home_id in tqdm(meas_home_ids):
    # drop_level=False keeps the selected index level in the exported rows.
    # NOTE(review): xs() without `level=` matches against the outermost index
    # level, so this presumably relies on 'id' being that level - confirm.
    home_rows = df_meas.xs(home_id, drop_level=False)
    zip_options = {
        'method': 'zip',
        'archive_name': f'{home_id}_raw_measurements.csv',
    }
    home_rows.to_csv(
        f'{home_id}_raw_measurements.zip',
        encoding='utf-8',
        compression=zip_options,
        date_format='%Y-%m-%dT%H:%M:%S%z',
    )

## Reading preprocessed interpolated properties from a parquet file

In [None]:
%%time
# Read the preprocessed properties from the Parquet file into df_prep.
# A failed read is reported and then re-raised: every later cell needs
# df_prep, so swallowing the error here would only resurface further down
# as an unrelated NameError.
try:
    df_prep = pd.read_parquet(
        rhc_preprocessed_poperties_file,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    raise
else:
    # Success path kept out of the try body so only the read itself is guarded.
    print("File was successfully read without specifying compression codec.")

In [None]:
# Certain filters only perform well on a sorted index, so this guard checks
# whether df_prep is already sorted and sorts it (with a notice) only if not.
prep_index_sorted = df_prep.index.is_monotonic_increasing
if not prep_index_sorted:
    print('df needed index sorting')
    df_prep = df_prep.sort_index()

In [None]:
# Print a concise summary of df_prep (index, columns and their dtypes).
df_prep.info()

In [None]:
# Show the dtype of each index level.
# NOTE(review): `.dtypes` (plural) presumably relies on df_prep having a
# MultiIndex (an 'id' level plus others) - confirm against the Parquet schema.
df_prep.index.dtypes

In [None]:
# Total number of non-null cells over all columns, printed with '_' as the
# thousands separator.
prep_non_null_total = df_prep.count().sum()
print(f"df_prep.count().sum(): {prep_non_null_total:_}")

In [None]:
# Bare expression as the last line of the cell: the notebook renders df_prep.
df_prep

### Write preprocessed properties per home to zipped .CSV files

In [None]:
%%time 
# Write one zip archive per home; each archive holds a single CSV file with
# the rows of df_prep that belong to that home.
prep_home_ids = df_prep.index.get_level_values('id').unique()
for home_id in tqdm(prep_home_ids):
    # drop_level=False keeps the selected index level in the exported rows.
    # NOTE(review): xs() without `level=` matches against the outermost index
    # level, so this presumably relies on 'id' being that level - confirm.
    home_rows = df_prep.xs(home_id, drop_level=False)
    zip_options = {
        'method': 'zip',
        'archive_name': f'{home_id}_preprocessed_properties.csv',
    }
    home_rows.to_csv(
        f'{home_id}_preprocessed_properties.zip',
        encoding='utf-8',
        compression=zip_options,
        date_format='%Y-%m-%dT%H:%M:%S%z',
    )
