# Export sanity-checked REDUCEDHEATCARB data for Data Science students

In [None]:
import pandas as pd

# Input: parquet file containing the sanity-checked measurements
file_path = 'reducedheatcarb_sane_measurements.parquet'

# Only rows whose 'property' index value is in this list are exported
properties_to_select = [
    'g_use_cum__m3',
    'temp_indoor__degC',
    'temp_set__degC',
    'co2_indoor__ppm',
    'rel_humidity__0',
    'e_ret_hi_cum__kWh',
    'e_ret_lo_cum__kWh',
    'e_use_hi_cum__kWh',
    'e_use_lo_cum__kWh',
    'e_ret_cum__kWh',
    'e_use_cum__kWh',
]

# Excel workbook with the id mapping (columns 'id' and 'random_id' are read from it)
pseudonym_id_student_mapping_file_path = 'pseudonym_id_student_mapping.xlsx'

# Output: parquet file the filtered and remapped data is written to
file_output_path = 'reducedheatcarb_sane_measurements_students_data_science.parquet'

In [None]:
# Load the source measurements into `df`.
# NOTE(review): `use_nullable_dtypes` is documented as deprecated since
# pandas 2.0 in favour of `dtype_backend`; it is kept here so the cell still
# runs on older pandas versions -- confirm against the installed version.
try:
    df = pd.read_parquet(
        file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    # Report the failure and re-raise: every later cell depends on `df`, so
    # carrying on would either fail further down with a NameError or silently
    # reuse a stale `df` left over from an earlier run of the notebook.
    print(f"Error reading file: {e}")
    raise
else:
    print("File was successfully read without specifying compression codec.")


## Remap the id

In [None]:
# Show the distinct values in the 'id' level of the index
df.index.get_level_values('id').unique()

In [None]:
# Print a concise summary of the DataFrame (index, columns, dtypes, memory usage)
df.info()

In [None]:
# Load the id mapping table from the Excel workbook at
# pseudonym_id_student_mapping_file_path into a DataFrame
mapping_df = pd.read_excel(pseudonym_id_student_mapping_file_path)

In [None]:
# Build a lookup table from the mapping DataFrame:
# key = value in column 'id', value = value in column 'random_id' of the same row
source_ids = mapping_df['id']
replacement_ids = mapping_df['random_id']
id_mapping_dict = dict(zip(source_ids, replacement_ids))

In [None]:
# Display the mapping dictionary for a visual check
id_mapping_dict

In [None]:
# Swap the values of the first index level (position 0) for their mapped
# counterparts, casting the result back to the level's original dtype
id_level = df.index.levels[0]
remapped_id_level = id_level.map(id_mapping_dict).astype(id_level.dtype)
df.index = df.index.set_levels(remapped_id_level, level=0)

In [None]:
# Show the distinct values in the 'id' level of the index
# (run after the remapping cell to inspect the replaced ids)
df.index.get_level_values('id').unique()

In [None]:
# Names of the MultiIndex levels, in level order
level_names = df.index.names

# Dtype of the values held by each level, in the same order as the names
level_dtypes = [
    df.index.get_level_values(position).dtype
    for position, _ in enumerate(level_names)
]

# Report one line per level: position, name and dtype
for i, (name, dtype) in enumerate(zip(level_names, level_dtypes)):
    print(f"Level {i}: Name = {name}, Dtype = {dtype}")

## Filter data for only a few properties

In [None]:
# List the distinct values in the 'property' level of the index
df.index.get_level_values('property').unique().to_list()

In [None]:
# Boolean mask: True for rows whose 'property' index value is in properties_to_select
is_selected_property = df.index.get_level_values('property').isin(properties_to_select)
df = df[is_selected_property]

In [None]:
# List the distinct values in the 'property' level of the index
# (run after the filter cell to confirm which properties remain)
df.index.get_level_values('property').unique().to_list()

In [None]:
# Print a concise summary of the DataFrame (index, columns, dtypes, memory usage)
df.info()

In [None]:
# Overview of the ids present in the index
distinct_ids = df.index.get_level_values('id').unique()
print('#ids: ', len(distinct_ids))
print('ids: ', list(distinct_ids))

# Distinct values of the remaining descriptive index levels
for label, level_name in (
    ('source categories: ', 'source_category'),
    ('source types: ', 'source_type'),
    ('properties: ', 'property'),
):
    print(label, list(df.index.get_level_values(level_name).unique()))

# Number of non-missing values, then row counts per grouping of index levels
print('values: ', df['value'].count())
for group_keys in (
    ['source_type'],
    ['source_category', 'source_type', 'property'],
):
    print('#values per: \n', df.groupby(group_keys).size())

In [None]:
# Display the DataFrame
df

In [None]:
# Names of the MultiIndex levels, in level order
level_names = df.index.names

# Dtype of the values held by each level, in the same order as the names
level_dtypes = [
    df.index.get_level_values(position).dtype
    for position, _ in enumerate(level_names)
]

# Report one line per level: position, name and dtype
for i, (name, dtype) in enumerate(zip(level_names, level_dtypes)):
    print(f"Level {i}: Name = {name}, Dtype = {dtype}")

## Convert index levels to category

In [None]:
# Turn the MultiIndex levels at positions 1, 2 and 4 into 'category' dtype,
# one level at a time
for level_position in (1, 2, 4):
    categorical_level = df.index.levels[level_position].astype('category')
    df.index = df.index.set_levels(categorical_level, level=level_position)

In [None]:
# Names of the MultiIndex levels, in level order
level_names = df.index.names

# Dtype of the values held by each level, in the same order as the names
level_dtypes = [
    df.index.get_level_values(position).dtype
    for position, _ in enumerate(level_names)
]

# Report one line per level: position, name and dtype
for i, (name, dtype) in enumerate(zip(level_names, level_dtypes)):
    print(f"Level {i}: Name = {name}, Dtype = {dtype}")

In [None]:
# Print a concise summary of the DataFrame (index, columns, dtypes, memory usage)
df.info()

## Convert values to floats
N.B. Only do this if you are sure that none of the selected measurements has a categorical or string value: the conversion to `float64` fails for values that cannot be interpreted as numbers.

In [None]:
# Convert the 'value' column to float64 and replace the column by name.
# Assigning through `df.loc[:, 'value'] = ...` is avoided on purpose: since
# pandas 2.0 that form first tries to write the new values into the existing
# column in place, which can leave the column's original dtype untouched.
# Plain column assignment always installs the float64 Series as the new column.
df['value'] = df['value'].astype('float64')


In [None]:
# Display the DataFrame
df

In [None]:
# List the distinct values in the 'property' level of the index
df.index.get_level_values('property').unique().to_list()

In [None]:
# Per-property summary statistics, printed with two decimals.
# The 'property' level values are looked up once and reused for every mask.
property_values = df.index.get_level_values('property')
two_decimals = "{:.2f}".format

for prop in property_values.unique().to_list():
    print('\n', prop)

    # describe() of only the rows that belong to this property
    stats_for_prop = df[property_values == prop].describe()

    # Render every statistic as a string with 2 decimal places, then print
    print(stats_for_prop.applymap(two_decimals))

## Write to parquet file

In [None]:
%%time 
# Write the DataFrame, including its index, to file_output_path using the pyarrow engine
df.to_parquet(file_output_path, index=True, engine='pyarrow')