# Import data

In [None]:
import pandas as pd
import os
import missingno as msno
from IPython.display import display

# Location of the Parquet export that the rest of the notebook loads.
file_path = 'remeha_export.parquet'

In [None]:
%%time
# Attempt to read the Parquet file into `df`.
# NOTE: `use_nullable_dtypes` is deprecated since pandas 2.0 in favour of
# `dtype_backend`; it is kept here so the cell still runs on pandas 1.x.
try:
    df = pd.read_parquet(
        file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Every later cell needs `df`; re-raise so the notebook stops here with
    # the real cause instead of failing later with a NameError on `df`.
    raise
else:
    # Only reached when the read succeeded.
    print("File was successfully read without specifying compression codec.")


In [None]:
# Print a concise summary of the frame: index, column dtypes, non-null counts
# and memory usage.
df.info()

In [None]:
# Memory usage in bytes per column (the index is included by default).
# Object-dtype columns are measured shallowly unless deep=True is passed.
df.memory_usage()

In [None]:
# Names of the index levels, in level order.
level_names = df.index.names

# Dtype of each index level, looked up by level position.
level_dtypes = [
    df.index.get_level_values(position).dtype
    for position in range(df.index.nlevels)
]

# Report one line per level: position, name and dtype.
for position, (level_name, level_dtype) in enumerate(zip(level_names, level_dtypes)):
    print(f"Level {position}: Name = {level_name}, Dtype = {level_dtype}")

In [None]:
# Render the loaded frame as the cell output.
df

In [None]:
# Number of non-NA entries in the 'value' column.
df['value'].count()

In [None]:
# Non-NA count of 'value' computed via a boolean mask; this is expected to
# equal df['value'].count().
df['value'].notna().sum()

In [None]:
# Row count for every (source_category, source_type, property) combination.
group_keys = ['source_category', 'source_type', 'property']
counts = df.groupby(group_keys).size()

# Drop the combinations whose size is 0.
counts_filtered = counts.loc[counts.ne(0)]

# Show the remaining combinations with their row counts.
print(counts_filtered)

In [None]:
# Number of rows per 'id' ('id' is used as an index level elsewhere in this
# notebook; groupby resolves it by name).
df.groupby('id').size()

In [None]:
# List every distinct value of the 'property' index level, one per line.
property_names = df.index.get_level_values('property').unique().tolist()
print('\n'.join(property_names))


# Get overview

In [None]:
# Show the names of the index levels before reshaping.
df.index.names

In [None]:
# Pivot the 'property' index level into the columns, so that each property
# becomes its own column and the remaining levels stay in the row index.
df_unstacked = df.unstack(level='property')

In [None]:
# Render the wide (unstacked) frame as the cell output.
df_unstacked

In [None]:
# Dtypes, non-null counts and memory usage of the wide frame.
df_unstacked.info()

In [None]:
# Summary statistics per column, computed over all ids together.
df_unstacked.describe()

In [None]:
# Show summary statistics separately for every id.
# groupby(level='id') splits the frame once instead of rebuilding a boolean
# mask over the whole index for each id. sort=False keeps the ids in order of
# first appearance, and observed=True skips unused categories if the level is
# categorical. The loop variable is deliberately not called `id`, so the
# builtin of that name stays usable in later cells.
for id_value, id_frame in df_unstacked.groupby(level='id', sort=False, observed=True):
    print(f"Description for id {id_value}:")
    display(id_frame.describe())


# Detect stuck values

In [None]:
# Get unique ids (kept as a variable in case other cells refer to it).
unique_ids = df_unstacked.index.get_level_values('id').unique()

# Variance of every column within each id; a variance of exactly 0 means the
# column never changed for that id.
# A single groupby replaces the per-id loop: it avoids shadowing the builtin
# `id`, avoids re-scanning the whole index for every id, and returns a numeric
# frame instead of an object-dtype frame filled row by row.
# sort=False keeps the ids in order of first appearance; observed=True skips
# unused categories if the level is categorical.
variance_df = df_unstacked.groupby(level='id', sort=False, observed=True).var()

# Cell-level style callback used for conditional formatting.
def highlight_zero_variance(val):
    """Return a yellow-background CSS rule when *val* is exactly zero.

    Missing values (NaN, None, pd.NA) and any non-zero value yield an empty
    string, i.e. no extra styling.
    """
    is_zero = pd.notna(val) and val == 0
    return 'background-color: yellow' if is_zero else ''

# Apply conditional formatting to the DataFrame, cell by cell.
# Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name is
# deprecated there, so use `map` when it exists and fall back to `applymap`
# on older pandas versions.
styler = variance_df.style
style_each_cell = getattr(styler, 'map', None) or styler.applymap
variance_df_styled = style_each_cell(highlight_zero_variance)

# Display the styled DataFrame
variance_df_styled


In [None]:
# Summary statistics per id.
# The result has to be passed to display(): a bare expression inside a for
# loop is not rendered by the notebook, so calling describe() on its own here
# computes the table and throws it away.
# The loop variable is not called `id`, so the builtin is not shadowed.
for id_value, id_frame in df_unstacked.groupby(level='id', sort=False, observed=True):
    print(f"Description for id {id_value}:")
    display(id_frame.describe())

In [None]:
# Bar chart of how complete (non-missing) each column of the wide frame is.
msno.bar(df_unstacked)

In [None]:
# Nullity matrix of the wide frame: one row per record in stored order, one
# column per frame column, showing where values are missing.
msno.matrix(df_unstacked)

In [None]:
# Nullity matrix with the 'timestamp' and 'id' index levels swapped.
# swaplevel() only exchanges the level positions; the rows keep their original
# order, and the matrix is drawn in stored row order. Without sort_index()
# this plot would be identical to the one for the unswapped frame.
msno.matrix(df_unstacked.swaplevel('timestamp', 'id', axis=0).sort_index())

In [None]:
# One nullity matrix per id.
# groupby(level='id') splits the frame once. The loop variable is not called
# `id`, so the builtin stays usable in later cells.
for id_value, id_frame in df_unstacked.groupby(level='id', sort=False, observed=True):
    ax = msno.matrix(id_frame)
    # Label each plot so the figures can be told apart. Older missingno
    # releases show the figure themselves and return None, hence the guard.
    if ax is not None:
        ax.set_ylabel(f"id {id_value}", fontsize=16)