In [None]:
import os

from datetime import datetime, timedelta
import pytz
import pylab as plt

import pandas as pd
import numpy as np
import json

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2
import sys
sys.path.append('../data/')

%load_ext autoreload
import gc

from measurements import Measurements

from tqdm.notebook import tqdm

# Input: Parquet file holding the schedules (adjust to the actual location of your file)
file_path = 'remeha_schedules_20231129-20240402.parquet'
# Output: Parquet file written by the export cell at the end of the notebook
file_output_path = 'remeha_schedules_export.parquet'



In [None]:
# Size of the input file on disk, in bytes
file_size_bytes = os.path.getsize(file_path)

# Derive the larger units directly from the byte count (factor 1024 per step)
file_size_kb = file_size_bytes / 1024
file_size_mb = file_size_bytes / 1024 ** 2
file_size_gb = file_size_bytes / 1024 ** 3

# Report the size in all four units
print(f"File Size: {file_size_bytes} bytes ({file_size_kb:.2f} KB, {file_size_mb:.2f} MB, {file_size_gb:.2f} GB)")

In [None]:
%%time
# Read the Parquet file into a DataFrame backed by pandas' nullable dtypes.
# A read failure is reported and then re-raised: swallowing it would leave
# `df_schedules` undefined and make every later cell fail with an unrelated NameError.
try:
    df_schedules = pd.read_parquet(
        file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
    print("File was successfully read without specifying compression codec.")
except Exception as e:
    print(f"Error reading file: {e}")
    raise


In [None]:
# Print a summary (columns, dtypes, non-null counts) of the freshly loaded frame
df_schedules.info()


In [None]:
# Show the memory footprint of the loaded frame, per column
df_schedules.memory_usage()

In [None]:
# Expose 'pseudonym' under the name 'id', then index the frame by id, zone type and zone name
df_schedules = (
    df_schedules
    .rename(columns={'pseudonym': 'id'})
    .set_index(['id', 'zone_type', 'zone_name'])
)

In [None]:
# Number of distinct values in the 'id' index level
unique_ids = df_schedules.index.get_level_values('id').unique()
len(unique_ids)

In [None]:
# Distinct values of the 'zone_type' index level, in order of first appearance
unique_zone_types = df_schedules.index.get_level_values('zone_type').unique()
list(unique_zone_types)

In [None]:
# Distinct (zone_type, zone_name) combinations present in the data, as a list of pairs
zone_pairs = df_schedules.reset_index()[['zone_type', 'zone_name']].drop_duplicates()
zone_pairs.values.tolist()

In [None]:
# Display the frame as indexed by (id, zone_type, zone_name)
df_schedules

## Delete duplicate rows

In [None]:
# DataFrame.drop_duplicates() compares column values only and ignores the index. With
# id / zone_type / zone_name stored in the index, identical schedules that belong to
# different ids or zones would be collapsed into a single row. Move the index levels back
# into columns for the comparison, then restore the same index.
index_levels = list(df_schedules.index.names)
df_schedules = (
    df_schedules
    .reset_index()
    .drop_duplicates()
    .set_index(index_levels)
)

In [None]:
# Print the frame summary again to see the effect of removing duplicate rows
df_schedules.info()

## Select only CH schedules

In [None]:
# Columns carried over into the CH-only frame
ch_columns = ['heating_program', 'heating_activities', 'valid_from', 'valid_to']

# Keep every id and zone name, but only rows whose zone_type level equals 'CH'
idx = pd.IndexSlice
ch_rows = idx[:, 'CH', :]
df_ch_schedules = df_schedules.loc[ch_rows, ch_columns]

In [None]:
# Display the CH-only selection
df_ch_schedules

In [None]:
 # Non-null entries per column for every (id, zone_type) combination
 rows_by_id_and_type = df_ch_schedules.groupby(['id', 'zone_type'])
 rows_by_id_and_type.count()

In [None]:
 # Non-null entries per column for every (id, zone_type, zone_name) combination
 rows_by_id_and_zone = df_ch_schedules.groupby(['id', 'zone_type', 'zone_name'])
 rows_by_id_and_zone.count()

## Select only analyzed ids

In [None]:
# Values of the 'id' index level that are selected for the analysis
analysis_ids = [
    401632, 403603, 404873, 410260, 412715,
    424197, 429011, 430062, 434931, 444964,
    449134, 450298, 456638, 458000, 458852,
    478667, 483173, 487126, 494233, 495906,
]

In [None]:
# Boolean mask marking the rows whose 'id' level is one of the analysed ids
in_analysis = df_ch_schedules.index.get_level_values('id').isin(analysis_ids)
df_ch_schedules_analysis = df_ch_schedules.loc[in_analysis]


In [None]:
# Count the rows per (id, zone_type, zone_name) and list the zones with the fewest programs first
zone_schedule_counts = df_ch_schedules_analysis.groupby(['id', 'zone_type', 'zone_name']).count()
zone_schedule_counts.sort_values(by='heating_program', ascending=True)

In [None]:
# Same count table as displayed above, written to an Excel workbook including its index
sorted_schedule_counts = (
    df_ch_schedules_analysis
    .groupby(['id', 'zone_type', 'zone_name'])
    .count()
    .sort_values(by='heating_program', ascending=True)
)
sorted_schedule_counts.to_excel('count_schedules.xlsx', index=True)

In [None]:
from deepdiff import DeepDiff

In [None]:
def compare_rows(row1, row2):
    """Diff the heating program and the heating activities of two schedule rows.

    Both rows must provide the keys 'heating_program' and 'heating_activities', each holding
    either a JSON string or a missing value (None / NaN / pd.NA). Missing values are compared
    as None, so a missing cell never reaches json.loads.

    Parameters
    ----------
    row1, row2 : mapping-like (e.g. pandas.Series)
        The two rows to compare.

    Returns
    -------
    tuple
        (program_diff, activities_diff): two DeepDiff results, each empty (falsy) when the
        corresponding cells decode to equal content.

    Raises
    ------
    json.JSONDecodeError
        If a non-missing cell does not contain valid JSON.
    """
    def _decode(cell):
        # Parse only cells that actually hold a value
        return json.loads(cell) if pd.notna(cell) else None

    return (
        DeepDiff(_decode(row1['heating_program']), _decode(row2['heating_program'])),
        DeepDiff(_decode(row1['heating_activities']), _decode(row2['heating_activities'])),
    )


In [None]:
# Report content differences between consecutive rows of one id that cover the same validity period
for id in analysis_ids:
    id_rows = df_ch_schedules.loc[df_ch_schedules.index.get_level_values('id') == id]
    # Walk over neighbouring row pairs (i-1, i) in their stored order
    for i in range(1, len(id_rows)):
        row1, row2 = id_rows.iloc[i - 1], id_rows.iloc[i]
        # Only pairs whose 'valid_from' AND 'valid_to' both match are compared
        if not (row1['valid_from'] == row2['valid_from'] and row1['valid_to'] == row2['valid_to']):
            continue
        diff_program, diff_activities = compare_rows(row1, row2)
        # Stay silent when both diffs are empty
        if not (diff_program or diff_activities):
            continue
        print(f"\nDifferences for id {id} between row {i-1} and row {i}, both valid from {row1['valid_from']} to {row1['valid_to']}:")
        print("- Heating Program Diff:", diff_program)
        print("- Heating Activities Diff:", diff_activities)

In [None]:
# Report content differences between consecutive rows of one id whose validity periods differ
for id in analysis_ids:
    id_rows = df_ch_schedules.loc[df_ch_schedules.index.get_level_values('id') == id]
    # Walk over neighbouring row pairs (i-1, i) in their stored order
    for i in range(1, len(id_rows)):
        row1, row2 = id_rows.iloc[i - 1], id_rows.iloc[i]
        # Only pairs where 'valid_from' AND 'valid_to' both differ are compared.
        # NOTE(review): pairs where just one of the two bounds differs are examined by neither
        # this loop nor the same-validity loop — confirm that this is intended.
        if not (row1['valid_from'] != row2['valid_from'] and row1['valid_to'] != row2['valid_to']):
            continue
        diff_program, diff_activities = compare_rows(row1, row2)
        # Stay silent when both diffs are empty
        if not (diff_program or diff_activities):
            continue
        print(f"\nDifferences for id {id} between row valid from {row1['valid_from']} to {row1['valid_to']} and {row2['valid_from']} to {row2['valid_to']}:")
        print("- Heating Program Diff:", diff_program)
        print("- Heating Activities Diff:", diff_activities)

In [None]:
 # De-duplicate once, outside the loop. DataFrame.drop_duplicates() ignores the index, so the
 # index levels are moved into columns first; otherwise a row of one id could be dropped just
 # because another id has a row with identical column values.
 index_levels = list(df_ch_schedules.index.names)
 df_ch_unique = df_ch_schedules.reset_index().drop_duplicates().set_index(index_levels)

 # Show the de-duplicated CH schedules of each analysed id
 for id in analysis_ids:
     display(df_ch_unique.loc[df_ch_unique.index.get_level_values('id') == id])

In [None]:
# Distinct zone names among the analysed CH schedules, sorted
analysis_zone_names = df_ch_schedules_analysis.index.get_level_values('zone_name').unique()
sorted(analysis_zone_names.to_list())

In [None]:
# Print the decoded heating program and heating activities of every CH row of each analysed id.
# Note: `heating_program` and `heating_activities` stay bound after the loop, holding the
# values of the last id in `analysis_ids`.
for id in analysis_ids:
    # Select this id's 'CH' rows; the items iterated below are treated as (zone, JSON string) pairs
    heating_program = df_ch_schedules_analysis.loc[(id, 'CH'),'heating_program']
    # Decode each JSON string and print the result
    for zone, program in heating_program.items():
        print(f"\nID: {id}, ZONE: {zone}, PROGRAM: {json.loads(program)}")

    # Same selection for the 'heating_activities' column
    heating_activities = df_ch_schedules_analysis.loc[(id, 'CH'),'heating_activities']
    # Decode each JSON string and print the result
    for zone, activities in heating_activities.items():
        print(f"\nID: {id}, ZONE: {zone}, ACTIVITIES: {json.loads(activities)}")

In [None]:
# For every CH schedule row of every analysed id, check that the temperatures embedded in the
# heating program agree with the temperatures defined for the referenced activities.
for id in analysis_ids:
    # Look up this id's OWN rows. Both Series are selected with the same key from the same
    # frame, so they hold the same rows in the same order and can be paired positionally.
    # Positional pairing also copes with several rows sharing one zone name.
    id_programs = df_ch_schedules_analysis.loc[(id, 'CH'), 'heating_program']
    id_activities = df_ch_schedules_analysis.loc[(id, 'CH'), 'heating_activities']

    for (zone, program_json), activities_json in zip(id_programs.items(), id_activities):
        # Nothing can be cross-checked when either JSON document is missing
        if pd.isna(program_json) or pd.isna(activities_json):
            print(f"Skipping id {id} and zone {zone}: heating program or heating activities missing")
            continue

        heating_program = json.loads(program_json)
        heating_activities = json.loads(activities_json)

        # Map ActivityNumber -> Temperature as defined in heating_activities
        activity_temps = {activity['ActivityNumber']: activity['Temperature'] for activity in heating_activities}

        # Becomes False as soon as one mismatch is found for this row
        all_consistent = True

        # Walk through program -> day -> entries and compare temperatures
        for program_name, days in heating_program.items():
            # Only dictionary values have the day -> entries structure handled below
            if not isinstance(days, dict):
                print(f"Skipping {program_name}: value is not a dictionary, but {type(days)}")
                continue

            for day, entries in days.items():
                for entry in entries:
                    activity_num = entry['ActivityNumber']
                    program_temp = entry.get('Temperature')  # temperature stated in the heating program
                    activity_temp = activity_temps.get(activity_num)  # temperature defined for that activity

                    # Entries lacking either temperature cannot be compared
                    if program_temp is None or activity_temp is None:
                        continue

                    if program_temp != activity_temp:
                        print(f"Inconsistency on {day} for {program_name} at {entry['StartTime']}: "
                              f"Program temperature is {program_temp}, but activity temperature is {activity_temp}.")
                        all_consistent = False
                    else:
                        print(f"Consistent on {day} for {program_name} at {entry['StartTime']}: "
                              f"Temperature is {program_temp}.")

        # Summarise the outcome for this id and zone
        if all_consistent:
            print(f"All temperatures in the heating program  for id {id} and zone {zone} are consistent with the activity temperatures.")
        else:
            print(f"There were inconsistencies found in the heating program for id {id} and zone {zone}")


In [None]:
%%time 
# NOTE(review): the original line exported `df`, a name that no visible cell defines, so it
# fails with a NameError on a fresh kernel. The analysed CH schedules are ASSUMED to be the
# intended export — confirm which frame should be written before relying on this file.
df_ch_schedules_analysis.to_parquet(file_output_path, index=True, engine='pyarrow')