In [None]:
import pandas as pd

# Parquet file that is read as the input of this notebook.
file_path = 'needforheat_raw_measurements.parquet'
# Parquet file that the processed frame is written to at the end.
file_output_path = 'needforheat_export_student.parquet'

In [None]:
# Lookup from a source type (value of the 'source_type' index level) to the
# source category that is stored alongside it.
map_source_category = dict([
    ('twomes-p1-reader-firmware', 'device'),
    ('enelogic', 'cloud_feed'),
    ('twomes-co2-occupancy-scd41-m5coreink-firmware', 'device'),
])

In [None]:
# Load the raw measurements into `df` using the pyarrow engine.
# NOTE(review): `use_nullable_dtypes` is deprecated since pandas 2.0 in favour
# of `dtype_backend='numpy_nullable'`; it is kept here so that older pandas
# versions keep working.
try:
    df = pd.read_parquet(
        file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Re-raise so the run stops here: swallowing the error would leave `df`
    # undefined (or stale from an earlier run) for every cell below.
    raise
else:
    print("File was successfully read without specifying compression codec.")


# Remap the id

In [None]:
# List the distinct values of the 'id' index level before they are remapped.
df.index.get_level_values('id').unique()

In [None]:
# Print a summary of the loaded frame.
df.info()

In [None]:
# Read the id mapping table from the Excel file; the 'id' and 'random_id'
# columns of this table are used to build the remapping dictionary.
mapping_df = pd.read_excel('pseudonym_id_student_mapping.xlsx')

In [None]:
# Build a lookup from each 'id' in the mapping table to its 'random_id'.
# When an 'id' occurs more than once, the last row wins.
id_mapping_dict = {
    original_id: random_id
    for original_id, random_id in zip(mapping_df['id'], mapping_df['random_id'])
}

In [None]:
# Show only how many ids are mapped. The full dictionary links every original
# id to its random replacement, so rendering it would store that key in the
# saved notebook output.
len(id_mapping_dict)

In [None]:
# Replace the values of the 'id' index level using the mapping dictionary.
# The level is looked up by name so the remap does not depend on 'id' being
# the first level of the MultiIndex.
id_level_position = df.index.names.index('id')
original_ids = df.index.levels[id_level_position]
remapped_ids = original_ids.map(id_mapping_dict)

# Index.map yields NaN for keys missing from the dictionary. Fail loudly in
# that case instead of writing NaN (or the string 'nan') into the id level.
unmapped_ids = original_ids[remapped_ids.isna()]
if len(unmapped_ids) > 0:
    raise KeyError(
        f"No random_id found in the mapping for id level value(s): {unmapped_ids.tolist()}"
    )

# Keep the dtype of the original level so downstream dtypes are unchanged.
df.index = df.index.set_levels(remapped_ids.astype(original_ids.dtype), level='id')

In [None]:
# List the distinct values of the 'id' index level after the remapping.
df.index.get_level_values('id').unique()

In [None]:
# Report the name and dtype of every level of the frame's MultiIndex.
level_names = df.index.names
level_dtypes = [
    df.index.get_level_values(position).dtype
    for position, _ in enumerate(level_names)
]

for position, (name, level_dtype) in enumerate(zip(level_names, level_dtypes)):
    print(f"Level {position}: Name = {name}, Dtype = {level_dtype}")

# Drop device_name, rename source to source_type, and add source_category

In [None]:
# Rename the 'source' index level to 'source_type'; the level values are untouched.
df = df.rename_axis(
    index={'source': 'source_type'},
)


In [None]:
# Show the index level names to confirm the rename of 'source' to 'source_type'.
df.index.names

In [None]:
# Display the frame after the index level rename.
df

In [None]:
# List the distinct source types; these are the keys looked up in map_source_category.
df.index.get_level_values('source_type').unique()

In [None]:
# Drop the 'device_name' level from the MultiIndex; the remaining levels and
# all rows are kept, so the index may no longer be unique afterwards.
df.index = df.index.droplevel('device_name')

In [None]:
# Display the frame after dropping the 'device_name' level.
df

In [None]:
# Derive a 'source_category' column by looking up each row's 'source_type'
# in map_source_category. Source types missing from that dictionary yield NaN.
source_types = df.index.get_level_values('source_type')
df['source_category'] = source_types.map(map_source_category)

In [None]:
# Display the frame with the new 'source_category' column.
df

In [None]:
# Move every index level back into ordinary columns so the index can be
# rebuilt with 'source_category' included.
df = df.reset_index()

In [None]:
# Rebuild the MultiIndex with 'source_category' placed between 'id' and 'source_type'.
index_columns = ['id', 'source_category', 'source_type', 'timestamp', 'property']
df = df.set_index(index_columns)

In [None]:
# Display the frame with the rebuilt MultiIndex.
df

In [None]:
# Per-row values of the two source-related index levels.
source_category_levels = df.index.get_level_values('source_category')
source_type_levels = df.index.get_level_values('source_type')

# Pair them up row by row, then keep each (category, type) pair once,
# renumbering the result from 0.
category_type_pairs = pd.DataFrame(
    {
        'source_category': source_category_levels,
        'source_type': source_type_levels,
    }
)
unique_combinations = category_type_pairs.drop_duplicates().reset_index(drop=True)

In [None]:
# Display the distinct (source_category, source_type) pairs found in the index.
unique_combinations

In [None]:
# Display the frame before counting rows per source and property.
df


In [None]:
# Number of rows for every (source_category, source_type, property) group.
group_keys = ['source_category', 'source_type', 'property']
counts = df.groupby(group_keys).size()

# Keep only the groups that actually contain rows, then print them.
counts_filtered = counts.loc[counts.ne(0)]
print(counts_filtered)

# Filter data for only a few properties

In [None]:
# List the distinct property names present in the index before filtering.
df.index.get_level_values('property').unique().to_list()

In [None]:
# Keep only the rows whose 'property' index value is one of the selected properties.
properties_to_keep = ['g_use_cum__m3', 'temp_indoor__degC']
df = df[df.index.get_level_values('property').isin(properties_to_keep)]

In [None]:
# Print a summary of the frame after filtering on 'property'.
df.info()

In [None]:
# Display the filtered frame.
df

In [None]:
# Report the name and dtype of every level of the frame's MultiIndex.
level_names = df.index.names
level_dtypes = [
    df.index.get_level_values(position).dtype
    for position, _ in enumerate(level_names)
]

for position, (name, level_dtype) in enumerate(zip(level_names, level_dtypes)):
    print(f"Level {position}: Name = {name}, Dtype = {level_dtype}")

In [None]:
# Convert the 'source_category', 'source_type' and 'property' index levels to
# 'category' dtype.
# Unused level values are pruned first: filtering rows does not shrink the
# levels of a MultiIndex, so values that no longer occur in the data would
# otherwise be carried along as categories.
df.index = df.index.remove_unused_levels()
for level_name in ('source_category', 'source_type', 'property'):
    # Address each level by name rather than by position so the cast does not
    # silently hit the wrong level if the index order ever changes.
    level_position = df.index.names.index(level_name)
    df.index = df.index.set_levels(
        df.index.levels[level_position].astype('category'),
        level=level_name,
    )

In [None]:
# Cast the 'value' column to float64.
# The column is replaced through astype on the frame rather than assigned via
# `df.loc[:, 'value'] = ...`: on pandas >= 2.0 that form writes the values into
# the existing column in place, which can leave the original dtype unchanged.
df = df.astype({'value': 'float64'})


In [None]:
# Print a summary of the frame after the dtype conversions.
df.info()

In [None]:
# Display the frame as it will be written to the output file.
df

In [None]:
%%time 
# Write the processed frame, including its index, to file_output_path using
# the pyarrow engine; the cell magic above reports how long the write takes.
df.to_parquet(file_output_path, index=True, engine='pyarrow')