# Preprocess schedules

In [None]:
import os

from datetime import datetime, timedelta
import pytz
import pylab as plt

import pandas as pd
import numpy as np
import json
from collections import Counter, defaultdict

import difflib
from IPython.display import display, HTML

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2
import sys
sys.path.append('../data/')

%load_ext autoreload
import gc

from measurements import Measurements

from tqdm.notebook import tqdm

# Input: the raw schedule export to preprocess (point this at your own copy of the file).
file_path = 'remeha_schedules_20231129-20240402.parquet'
# Output: where the preprocessed schedules are written at the end of this notebook.
file_output_path = 'remeha_schedules_export.parquet'



In [None]:
def extract_schedule_info(heating_program_json):
    """
    Flatten a single-zone heating program into a list of schedule entries.

    The JSON document must hold at most one top-level entry, keyed
    'HeatingProgram1', that maps each day to a list of events. Every event
    needs a 'StartTime' in HH:MM format and either a 'SetPoint' or a
    'Temperature' ('SetPoint' wins when both are present).

    Returns a list of dicts with the keys 'day_of_week', 'start_time'
    (a datetime.time) and 'setpoint', in the order found in the document.

    Raises Exception when the document has more than one top-level entry.
    """
    program = json.loads(heating_program_json)

    # Only single-zone programs are supported.
    if len(program) > 1:
        raise Exception("Multiple zones found in heating program.")

    def build_entry(day_name, event):
        # Read both raw fields before parsing, so a missing key is reported
        # ahead of a malformed time string.
        raw_start = event['StartTime']  # e.g. "07:00"
        target = event['SetPoint'] if 'SetPoint' in event else event['Temperature']
        return {
            'day_of_week': day_name,
            'start_time': datetime.strptime(raw_start, '%H:%M').time(),
            'setpoint': target,
        }

    return [
        build_entry(day_name, event)
        for day_name, day_events in program['HeatingProgram1'].items()
        for event in day_events
    ]

In [None]:
# Weekday ordering helper (used as a sort key for day names)
def iso_weekday(day):
    """
    Return the position of an English weekday name in Monday-first order.

    The result is zero-based ('Monday' -> 0 ... 'Sunday' -> 6), i.e. one less
    than the ISO weekday number. The name must match exactly (capitalised
    English); any other value raises ValueError.
    """
    monday_first = 'Monday Tuesday Wednesday Thursday Friday Saturday Sunday'.split()
    return monday_first.index(day)

def parse_heating_program(heating_program_json, heating_activities_json):
    """
    Resolve the active heating program into an ordered list of setpoint changes.

    heating_program_json: JSON object with an 'ActiveTimeProgram' number and
        'HeatingProgram<N>' entries; the active entry maps each day name to a
        list of events holding a 'StartTime' plus either an 'ActivityNumber'
        or an embedded 'Temperature'.
    heating_activities_json: JSON list of activity definitions, each expected
        to hold an 'ActivityNumber' and a 'Temperature'. Definitions without
        an 'ActivityNumber' are ignored; JSON null counts as an empty list.

    The temperature of an event is taken from the first activity definition
    with the same 'ActivityNumber'; if that yields nothing, the event's own
    'Temperature' is used. Events without a start time or temperature are
    skipped.

    Returns a list of {'day', 'start_time', 'temperature'} dicts ordered
    Monday-first and then by start time. Start times are compared as strings,
    so the order is chronological only for zero-padded HH:MM values. An empty
    list is returned when no active program is set.

    Raises ValueError when the same day/start time occurs with different
    temperatures, or when a day name is not an English weekday name.
    """
    heating_programs = json.loads(heating_program_json)
    # `or []` guards against a JSON null activity list.
    heating_activities = json.loads(heating_activities_json) or []

    active_program_number = heating_programs.get("ActiveTimeProgram")
    if not active_program_number:
        return []  # no active program configured

    # A missing or null program is treated as an empty schedule.
    active_schedule = heating_programs.get(f"HeatingProgram{active_program_number}") or {}

    # day -> {start_time -> temperature}; also used to detect conflicting duplicates
    schedule_dict = defaultdict(dict)

    for day, activities in active_schedule.items():
        for activity in activities or []:
            start_time = activity.get('StartTime')
            activity_number = activity.get('ActivityNumber')

            temperature = None
            if activity_number is not None:
                # .get() instead of ['ActivityNumber']: a definition lacking the
                # key must not abort the whole lookup with a KeyError.
                matching_activity = next(
                    (
                        a for a in heating_activities
                        if isinstance(a, dict) and a.get('ActivityNumber') == activity_number
                    ),
                    None,
                )
                if matching_activity:
                    temperature = matching_activity.get('Temperature')

            # Fall back to a temperature embedded in the event itself.
            if temperature is None:
                temperature = activity.get('Temperature')

            if temperature is None or start_time is None:
                continue

            day_schedule = schedule_dict[day]
            if start_time not in day_schedule:
                day_schedule[start_time] = temperature
            elif day_schedule[start_time] != temperature:
                raise ValueError(f"Conflicting temperatures for {day} at {start_time}: {day_schedule[start_time]} vs {temperature}")

    # Monday-first day order, then ascending start time within each day.
    return [
        {'day': day, 'start_time': start_time, 'temperature': schedule_dict[day][start_time]}
        for day in sorted(schedule_dict, key=iso_weekday)
        for start_time in sorted(schedule_dict[day])
    ]

In [None]:
# Function to generate HTML diff between two lists of dictionaries
def generate_diff(previous, current):
    """Generate an HTML diff between two lists of dictionaries.

    Both arguments are serialised with json.dumps(indent=4) and compared line
    by line with difflib.ndiff.

    Returns:
        - '' when both sides are empty or None (nothing to report);
        - a green "New: ..." span when only `current` has content;
        - a red "Removed: ..." span when only `previous` has content;
        - otherwise the diff lines concatenated without separators: added
          lines in a green span, removed lines in a red struck-through span,
          unchanged lines as plain text. Indentation is stripped from every
          line.
    """
    previous_json = json.dumps(previous, indent=4) if previous else ''
    current_json = json.dumps(current, indent=4) if current else ''

    if not previous_json and not current_json:
        return ''
    if not previous_json:  # there is no previous program
        return f"<span style='color: green;'>New: {current_json}</span>"
    if not current_json:  # there is no current program
        return f"<span style='color: red;'>Removed: {previous_json}</span>"

    html_diff = []
    for entry in difflib.ndiff(previous_json.splitlines(), current_json.splitlines()):
        # Every ndiff line starts with a two-character code ('+ ', '- ', '  '
        # or '? '). Split it off BEFORE stripping whitespace: stripping first
        # would eat the '  ' code of unchanged lines and make the slice below
        # cut into the actual content.
        code, content = entry[:2], entry[2:].strip()
        if code == '+ ':
            html_diff.append(f"<span style='color: green;'>{content}</span>")  # added
        elif code == '- ':
            html_diff.append(f"<span style='color: red; text-decoration: line-through;'>{content}</span>")  # removed
        elif code == '? ':
            continue  # intraline hint markers ('^', '+', '-') are not content
        else:
            html_diff.append(content)  # unchanged

    return ''.join(html_diff)  # joined without breaks for HTML display

## Read schedule file

In [None]:
# Report the on-disk size of the input file in bytes, KB, MB and GB
# (binary units: each step divides by 1024).
file_size_bytes = os.path.getsize(file_path)
file_size_kb, file_size_mb, file_size_gb = (
    file_size_bytes / 1024 ** exponent for exponent in (1, 2, 3)
)

print(f"File Size: {file_size_bytes} bytes ({file_size_kb:.2f} KB, {file_size_mb:.2f} MB, {file_size_gb:.2f} GB)")

In [None]:
%%time
# Read the Parquet file into `df_schedules`.
# On failure the error is reported and re-raised: swallowing it would leave
# `df_schedules` undefined and make every later cell fail with an unrelated
# NameError instead of the real cause.
try:
    df_schedules = pd.read_parquet(
        file_path,
        engine='pyarrow',
        dtype_backend='numpy_nullable',
    )
except Exception as e:
    print(f"Error reading file: {e}")
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
# Inspect the freshly loaded table (pandas' summary of the frame).
df_schedules.info()


In [None]:
# Memory usage of the loaded table, as reported by pandas.
df_schedules.memory_usage()

In [None]:
# Expose the pseudonym as `id`, then index the table by (id, zone_type, zone_name).
df_schedules = (
    df_schedules
    .rename(columns={'pseudonym': 'id'})
    .set_index(['id', 'zone_type', 'zone_name'])
)

In [None]:
# Number of distinct ids in the data.
len(list(df_schedules.index.get_level_values('id').unique()))

In [None]:
# Distinct zone types in the data.
list(df_schedules.index.get_level_values('zone_type').unique())

In [None]:
# Distinct (zone_type, zone_name) combinations in the data.
df_schedules.reset_index()[['zone_type', 'zone_name']].drop_duplicates().values.tolist()

## Delete duplicate rows

In [None]:
# Remove rows that are exact duplicates, index levels included.
# DataFrame.drop_duplicates()/duplicated() compare column values only and
# ignore the index, so the (id, zone_type, zone_name) levels are moved back
# into columns for the comparison. Otherwise identical schedules that belong
# to different ids or zones would be collapsed into a single row.
initial_count = df_schedules.shape[0]  # Count before deletion
is_duplicate_row = df_schedules.reset_index().duplicated().to_numpy()
df_schedules = df_schedules[~is_duplicate_row]
deleted_count = initial_count - df_schedules.shape[0]  # Count deleted rows
display(deleted_count)

In [None]:
# Inspect the table again after duplicate removal.
df_schedules.info()

## Select only CH schedules

In [None]:
# Keep only rows whose zone_type index level is 'CH', restricted to the
# program, activities and validity columns.
idx = pd.IndexSlice
df_ch_schedules = df_schedules.loc[
    idx[:, 'CH', :],
    ['heating_program', 'heating_activities', 'valid_from', 'valid_to'],
]

In [None]:
# Show the CH-only selection.
df_ch_schedules

In [None]:
# Per (id, zone_type): count of values in each column.
df_ch_schedules.groupby(['id', 'zone_type']).count()

In [None]:
# Same count, additionally split by zone_name.
df_ch_schedules.groupby(['id', 'zone_type', 'zone_name']).count()

## Select only analyzed ids

In [None]:
# ids to keep for the analysis (the frame is filtered on this list below)
analysis_ids = [
    401632, 403603, 404873, 410260, 412715,
    424197, 429011, 430062, 434931, 444964,
    449134, 450298, 456638, 458000, 458852,
    478667, 483173, 487126, 494233, 495906,
]

In [None]:
# Keep only the schedules of the ids under analysis.
# The explicit .copy() makes the result an independent frame: the following
# cells assign columns on it, which on a selection of `df_ch_schedules` would
# trigger pandas' SettingWithCopyWarning.
analysis_mask = df_ch_schedules.index.get_level_values('id').isin(analysis_ids)
df_ch_schedules_analysis = df_ch_schedules.loc[analysis_mask].copy()

In [None]:
# Sanity check: report schedules whose validity starts after it ends.
starts_after_end = df_ch_schedules_analysis['valid_from'] > df_ch_schedules_analysis['valid_to']
invalid_intervals = df_ch_schedules_analysis[starts_after_end]
if not invalid_intervals.empty:
    print("Invalid intervals found:")
    # Lift the column-width limit so the long JSON columns are shown in full.
    with pd.option_context('display.max_colwidth', None):
        display(invalid_intervals)


In [None]:
# Per (id, zone_type, zone_name): count of values in each column, fewest heating programs first.
df_ch_schedules_analysis.groupby(['id', 'zone_type', 'zone_name']).count().sort_values(by='heating_program', ascending=True)

In [None]:
# Parse every row's program into its active schedule. Activity definitions
# supply the temperature when it is not embedded in the program itself.
# Rows missing either JSON document get None.
df_ch_schedules_analysis.loc[:, 'active_schedule'] = df_ch_schedules_analysis.apply(
    lambda row: (
        None
        if pd.isna(row['heating_program']) or pd.isna(row['heating_activities'])
        else parse_heating_program(row['heating_program'], row['heating_activities'])
    ),
    axis=1,
)


In [None]:
# Step 3: Ensure valid_from and valid_to are both datetime.
# Plain column assignment (df[col] = ...) replaces the column together with
# its dtype. `df.loc[:, col] = ...` instead writes into the existing column
# and, in pandas >= 2.0, may keep the old dtype (e.g. object), so the
# conversion would not take effect.
df_ch_schedules_analysis['valid_from'] = pd.to_datetime(df_ch_schedules_analysis['valid_from'])
df_ch_schedules_analysis['valid_to'] = pd.to_datetime(df_ch_schedules_analysis['valid_to'])


In [None]:
# Preparation for filling missing valid_to values: turn the index levels into
# ordinary columns and order the rows by id and valid_from, so that each
# schedule is directly followed by the next schedule of the same id.
df_ch_schedules_analysis = df_ch_schedules_analysis.reset_index()
df_ch_schedules_analysis = df_ch_schedules_analysis.sort_values(by=['id', 'valid_from'])

In [None]:
# Close open-ended schedules: a missing valid_to becomes the valid_from of the
# next row, provided that row belongs to the same id. This relies on the rows
# already being ordered by id and valid_from.
# Done with shifted columns instead of a positional row-by-row loop.
next_id = df_ch_schedules_analysis['id'].shift(-1)
next_valid_from = df_ch_schedules_analysis['valid_from'].shift(-1)

# fillna(False): the comparison yields a missing value on the last row (no
# successor) when the id column uses a nullable dtype.
open_ended = (
    df_ch_schedules_analysis['valid_to'].isna()
    & (df_ch_schedules_analysis['id'] == next_id)
).fillna(False)

df_ch_schedules_analysis.loc[open_ended, 'valid_to'] = next_valid_from[open_ended]
for filled_value in next_valid_from[open_ended]:
    print(f"Filled NaT with the valid_from of the next row: {filled_value}")

In [None]:
# Step 4: schedules that still have no end get a far-future end date.
far_future = pd.Timestamp('2100-12-31').tz_localize('Europe/Amsterdam')
df_ch_schedules_analysis.loc[:, 'valid_to'] = df_ch_schedules_analysis['valid_to'].fillna(far_future)

In [None]:
# Step 5: Combine valid_from / valid_to into one interval per row.
# NOTE(review): if these columns are timezone-aware, `.values` hands over
# UTC-based values without timezone information - confirm that is intended.
# NOTE(review): from_arrays is called without `closed`, so pandas' default
# interval closure applies - confirm it matches the meaning of valid_to.
interval_starts = df_ch_schedules_analysis['valid_from'].values
interval_ends = df_ch_schedules_analysis['valid_to'].values
df_ch_schedules_analysis.loc[:, 'valid_interval'] = pd.IntervalIndex.from_arrays(
    interval_starts,
    interval_ends,
)

In [None]:
# Step 6: Replace the (sort-permuted) row labels by a fresh 0..n-1 index.
# drop=True discards the old labels instead of adding them as a stray 'index'
# column. reset_index already returns a new frame, so no explicit copy is needed.
df_ch_schedules_analysis = df_ch_schedules_analysis.reset_index(drop=True)

In [None]:
# Index the schedules by (id, valid_interval).
df_ch_schedules_analysis = df_ch_schedules_analysis.set_index(['id', 'valid_interval'])

In [None]:
# Reduce the frame to 'zone_name' and the parsed schedule; the latter is
# exposed under the name 'program'.
df_ch_schedules_analysis = df_ch_schedules_analysis[['zone_name', 'active_schedule']]
df_ch_schedules_analysis = df_ch_schedules_analysis.rename(columns={'active_schedule': 'program'})

In [None]:
# Remove rows that repeat an (id, valid_interval) index entry, keeping the
# first occurrence, and record how many rows were removed.
initial_count = len(df_ch_schedules_analysis)
is_repeated_entry = df_ch_schedules_analysis.index.duplicated(keep='first')
df_ch_schedules_analysis = df_ch_schedules_analysis[~is_repeated_entry]
deleted_count = initial_count - len(df_ch_schedules_analysis)

In [None]:
# Number of rows removed because their (id, valid_interval) index entry was repeated.
deleted_count

In [None]:
# Order the rows by their (id, valid_interval) index.
df_ch_schedules_analysis = df_ch_schedules_analysis.sort_index(
)

In [None]:
# zone_name is not needed any further; only 'program' remains as a column.
df_ch_schedules_analysis = df_ch_schedules_analysis.drop(
    columns='zone_name',
)

In [None]:
# Show the schedules with the column-width limit lifted, so programs are not truncated.
with pd.option_context('display.max_colwidth', None):
    display(df_ch_schedules_analysis)

In [None]:
# Add the helper columns filled in by the diff loop: the preceding program of
# the same id (None by default) and its HTML diff (empty by default).
df_ch_schedules_analysis = df_ch_schedules_analysis.assign(
    previous_program=None,
    diff='',
)

In [None]:
# Compare every schedule with the one on the row before it. When both rows
# belong to the same id, store the earlier program and the HTML diff on the
# later row; rows that start a new id keep their defaults.
row_keys = list(df_ch_schedules_analysis.index)  # (id, valid_interval) tuples
programs = list(df_ch_schedules_analysis['program'])

for previous_key, current_key, previous_program, current_program in zip(
    row_keys, row_keys[1:], programs, programs[1:]
):
    if previous_key[0] != current_key[0]:  # first element of the key is the id
        continue

    df_ch_schedules_analysis.at[current_key, 'previous_program'] = previous_program
    df_ch_schedules_analysis.at[current_key, 'diff'] = generate_diff(previous_program, current_program)



In [None]:
# Display the DataFrame with HTML rendering
with pd.option_context('display.max_colwidth', None):
    # Display only the relevant columns with HTML
    # (escape=False keeps the <span> markup of the 'diff' column as real HTML)
    display(HTML(df_ch_schedules_analysis[['diff']].to_html(escape=False)))

## Write schedules to parquet file

In [None]:
# The diff helper columns are for inspection only and are left out of the export.
inspection_only_columns = ['previous_program', 'diff']
df_ch_schedules_analysis_to_write = df_ch_schedules_analysis.drop(columns=inspection_only_columns)

In [None]:
# Final check of the frame about to be written, with untruncated columns.
with pd.option_context('display.max_colwidth', None):
    display(df_ch_schedules_analysis_to_write)

In [None]:
%%time 
# Write the preprocessed schedules to `file_output_path`, including the index (index=True).
df_ch_schedules_analysis_to_write.to_parquet(file_output_path, index=True, engine='pyarrow')