# Remap ids for student analysis: Remeha data

In [None]:
import pandas as pd
# Source export that is read in, and the file the processed copy is written to
file_path = 'remeha_export.parquet'
file_output_path = 'remeha_export_student.parquet'


In [None]:
%%time
# Read the source Parquet file into `df`.
# NOTE(review): `use_nullable_dtypes` is documented as deprecated since
# pandas 2.0 (successor: `dtype_backend`); left unchanged so the notebook keeps
# working on older pandas -- confirm against the installed version.
try:
    df = pd.read_parquet(
        file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `df`, and
    # carrying on would either hit a NameError or silently reuse a stale `df`
    # left over from a previous run of the notebook.
    print(f"Error reading file: {e}")
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
# Show the DataFrame as loaded (last expression in the cell, so the notebook renders it)
df

In [None]:
# Load the mapping table from its Excel workbook; its 'id' and 'random_id'
# columns are used further down to build the remapping dictionary
mapping_path = 'pseudonym_id_student_mapping.xlsx'
mapping_df = pd.read_excel(mapping_path)

In [None]:
# Distinct values of the 'id' index level before any remapping is applied
df.index.get_level_values('id').unique()

In [None]:
# Build the lookup: original 'id' -> replacement 'random_id'
id_mapping_dict = dict(zip(mapping_df['id'], mapping_df['random_id']))

# Address the 'id' level by name (as the other cells do) instead of assuming
# it sits at position 0 of the MultiIndex.
id_levels = df.index.levels[df.index.names.index('id')]
remapped_ids = id_levels.map(id_mapping_dict)

# Ids absent from the mapping come back as NaN; stop here rather than letting
# the dtype cast fail cryptically or NaN ids end up in the exported file.
unmapped_ids = id_levels[remapped_ids.isna()]
if len(unmapped_ids) > 0:
    raise ValueError(
        f"No 'random_id' found in the mapping for {len(unmapped_ids)} id(s): "
        f"{unmapped_ids.tolist()}"
    )

# Swap the level values in place of the originals, keeping the level's dtype.
# Only the level labels change; the row-to-label codes are left untouched.
df.index = df.index.set_levels(
    remapped_ids.astype(id_levels.dtype),
    level='id',
)


In [None]:
# Distinct values of the 'id' index level after the remap, for comparison with the earlier listing
df.index.get_level_values('id').unique()

In [None]:
# Show the DataFrame again now that the 'id' level has been replaced
df

In [None]:
# Print the DataFrame summary (index, columns, dtypes, memory usage)
df.info()

In [None]:
# Names of the index levels, in positional order
level_names = df.index.names

# Dtype of each index level, looked up by position
level_dtypes = [
    df.index.get_level_values(position).dtype
    for position in range(df.index.nlevels)
]

# Print one line per level: position, name and dtype
for i, (name, dtype) in enumerate(zip(level_names, level_dtypes)):
    print(f"Level {i}: Name = {name}, Dtype = {dtype}")

In [None]:
# Number of rows for each (source_category, source_type, property) group
group_keys = ['source_category', 'source_type', 'property']
counts = df.groupby(group_keys).size()

# Keep only the groups that actually contain rows
counts_filtered = counts.loc[counts.ne(0)]

# Show the non-empty groups
print(counts_filtered)

# Filter data for only a few properties

In [None]:
# List the distinct values of the 'property' index level before filtering
df.index.get_level_values('property').unique().to_list()

In [None]:
# Keep only the rows whose 'property' index level is one of the selected
# properties. A single `isin` test reads the level values once instead of
# once per property and makes the selection easy to extend.
properties_to_keep = [
    'g_use_ch_inf_cum__kWh',
    'g_use_dhw_inf_cum__kWh',
    'temp_in__degC',
]
df = df[df.index.get_level_values('property').isin(properties_to_keep)]

In [None]:
# List the distinct 'property' values again to check the result of the filter
df.index.get_level_values('property').unique().to_list()

In [None]:
# Show the filtered DataFrame
df

In [None]:
# Show 25 randomly sampled rows as a spot check (no seed is set, so the rows differ per run)
df.sample(25)

In [None]:
# Print the summary of the filtered DataFrame before it is written out
df.info()

In [None]:
%%time 
# Write the processed DataFrame, index included, to the output Parquet file
df.to_parquet(
    file_output_path,
    engine='pyarrow',
    index=True,
)