# Import data

In [None]:
import pandas as pd
import numpy as np

nfh_input_file_path='nfh_raw_meas.parquet'
enelogic_input_path='rhc_enelogic_monthly_export.parquet'
remeha_input_file_path='remeha_export.parquet'
rhc_output_file_path='rhc_raw_meas.parquet'
home_data_file_path = "home_data.parquet"

# usually, two decimals suffice for displaying DataFrames (NB internally, precision may be higher)
pd.options.display.precision = 2

import sys
sys.path.append('../data/')
sys.path.append('../view/')

%load_ext autoreload


%matplotlib inline
%matplotlib widget
import pylab as plt
import itertools
from plotter import Plot
from tqdm.notebook import tqdm
import historicdutchweather
from measurements import Measurements, WeatherMeasurements

from urllib.error import HTTPError  # Import HTTPError from urllib.error

In [None]:
# Map unit identifiers to matplotlib mathtext labels.
# The keys presumably match the unit suffix of property names such as
# 'temp_out__degC' or 'ghi__W_m_2' -- TODO confirm against the plotting code.
# `property_types` is a second name bound to the very same dict object.
units_to_mathtext = property_types = {
    'ppm': r'$ppm$',
    'kWh': r'$kWh$',
    'm3': r'$m^{3}$',
    'degC': r'$°C$',  # degree sign restored (was mis-encoded as 'Â°')
    'W': r'$W$',
    'V': r'$V$',
    '0': r'$[-]$',
    'bool': r'$0 = False; 1 = True$',
    # '\cdot' needs a separating space before 'm', and W/m^2 has exponent -2
    'W_m_2': r'$W\cdot m^{-2}$',
}

## Read NeedForHeat data

In [None]:
%%time
# Read the NeedForHeat measurements from their Parquet file.
# NOTE(review): `use_nullable_dtypes` is deprecated in pandas >= 2.0 in favour
# of `dtype_backend`; kept unchanged to stay compatible with the pandas
# version this notebook was written for.
try:
    df_meas_nfh = pd.read_parquet(
        nfh_input_file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Stop here: continuing would either fail later with a NameError or,
    # worse, silently reuse a stale df_meas_nfh from an earlier kernel run.
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
df_meas_nfh.info()

In [None]:
df_meas_nfh

## Read Enelogic data

In [None]:
%%time
# Read the Enelogic measurements from their Parquet file.
# NOTE(review): `use_nullable_dtypes` is deprecated in pandas >= 2.0 in favour
# of `dtype_backend`; kept unchanged to stay compatible with the pandas
# version this notebook was written for.
try:
    df_meas_enelogic = pd.read_parquet(
        enelogic_input_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Stop here: continuing would either fail later with a NameError or,
    # worse, silently reuse a stale df_meas_enelogic from an earlier kernel run.
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
df_meas_enelogic.info()

In [None]:
df_meas_enelogic

In [None]:
%%time
# Cast the Enelogic 'value' column to str.
# Presumably this aligns the column type with the other sources before they
# are concatenated -- TODO confirm.
# NOTE(review): astype(str) may turn missing values into literal strings;
# verify that the column holds no missing values at this point.
df_meas_enelogic['value'] = df_meas_enelogic['value'].astype(str)

In [None]:
df_meas_enelogic.info()

## Read Remeha data

In [None]:
%%time
# Read the Remeha measurements from their Parquet file.
# NOTE(review): `use_nullable_dtypes` is deprecated in pandas >= 2.0 in favour
# of `dtype_backend`; kept unchanged to stay compatible with the pandas
# version this notebook was written for.
try:
    df_meas_remeha = pd.read_parquet(
        remeha_input_file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Stop here: continuing would either fail later with a NameError or,
    # worse, silently reuse a stale df_meas_remeha from an earlier kernel run.
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
df_meas_remeha.info()

In [None]:
df_meas_remeha

In [None]:
%%time
# Cast the Remeha 'value' column to str.
# Presumably this aligns the column type with the other sources before they
# are concatenated -- TODO confirm.
# NOTE(review): astype(str) may turn missing values into literal strings;
# verify that the column holds no missing values at this point.
df_meas_remeha['value'] = df_meas_remeha['value'].astype(str)

In [None]:
df_meas_remeha.info()

## Merge measurements


In [None]:
%%time
# Stack the NeedForHeat, Enelogic and Remeha measurements row-wise into a
# single DataFrame; the three frames are assumed to share the same index
# levels and columns -- TODO confirm.
df_meas = pd.concat([df_meas_nfh, df_meas_enelogic, df_meas_remeha])

## Get and merge geospatially interpolated KNMI weather measurements

In [None]:
%%time
# Read the home data from its Parquet file.
# NOTE(review): `use_nullable_dtypes` is deprecated in pandas >= 2.0 in favour
# of `dtype_backend`; kept unchanged to stay compatible with the pandas
# version this notebook was written for.
try:
    df_homes = pd.read_parquet(
        home_data_file_path,
        engine='pyarrow',
        use_nullable_dtypes=True,
    )
except Exception as e:
    print(f"Error reading file: {e}")
    # Stop here: continuing would either fail later with a NameError or,
    # worse, silently reuse a stale df_homes from an earlier kernel run.
    raise
else:
    print("File was successfully read without specifying compression codec.")


In [None]:
%%time

# Determine the period for which weather data is needed.

# Step 1: leave out the rows that are Enelogic batch imports; they do not
# take part in determining the weather period.
index_source_category = df_meas.index.get_level_values('source_category')
index_source_type = df_meas.index.get_level_values('source_type')
is_enelogic_batch_import = (index_source_category == 'batch_import') & (index_source_type == 'enelogic')
mask = ~is_enelogic_batch_import
filtered_df = df_meas[mask]

# Step 2: earliest and latest timestamp over all remaining measurements
remaining_timestamps = filtered_df.index.get_level_values('timestamp')
weather_min_timestamp = remaining_timestamps.min()
weather_max_timestamp = remaining_timestamps.max()

# Both end points are part of the interval.
weather_interval = pd.Interval(
    left=weather_min_timestamp,
    right=weather_max_timestamp,
    closed='both',
)

# KNMI metric code -> (property name, factor).
# The factor is presumably the multiplier that converts the raw KNMI value to
# the unit named in the property -- TODO confirm in WeatherMeasurements.
metrics = {
    # T: temperature (in 0.1 degrees Celsius) at 1.50 m at the time of observation
    'T': ('temp_out__degC', 0.1),
    # FH: hourly mean wind speed (in 0.1 m/s)
    'FH': ('wind__m_s_1', 0.1),
    # Q: global radiation (in J/cm^2) during the hourly division;
    # 1 m^2 = 100 cm * 100 cm and 1 h = 60 min * 60 s
    'Q': ('ghi__W_m_2', (100 * 100) / (60 * 60)),
}

In [None]:
%%time
%autoreload 2
# Fetch the weather data for the interval determined above, passing the
# metric mapping (KNMI code -> property name and factor); the result is kept
# in df_weather.
df_weather = WeatherMeasurements.fetch_weather_data(weather_interval, metrics=metrics)

In [None]:
df_weather.info()


In [None]:
df_weather

In [None]:
df_weather.describe().T

In [None]:
%%time
%autoreload 2
# Derive per-home weather measurements from the fetched weather data and the
# home data. NOTE(review): presumably a geospatial interpolation to each
# home's weather cell -- confirm in WeatherMeasurements.interpolate_weather_data.
df_meas_weather = WeatherMeasurements.interpolate_weather_data(df_weather, df_homes)

In [None]:
df_meas_weather.info()

In [None]:
# Describe statistics across all properties
df_meas_weather['value'].unstack('property').describe().T

In [None]:
# Pivot the 'property' index level into columns, so that every weather
# property gets its own column of values.
df_meas_weather_unstacked = df_meas_weather['value'].unstack('property')

In [None]:
# Report how many wind speed values are negative, as a count and as a share
# of all non-missing wind speed values.
wind_speed = df_meas_weather_unstacked['wind__m_s_1']
negative_wind_count = wind_speed[wind_speed < 0].count()
print(f"wind__m_s_1 <0 : #{negative_wind_count} = {negative_wind_count/wind_speed.count():%}")

In [None]:
# Report how many global horizontal irradiance values are negative, as a
# count and as a share of all non-missing irradiance values.
ghi = df_meas_weather_unstacked['ghi__W_m_2']
negative_ghi_count = ghi[ghi < 0].count()
print(f"ghi__W_m_2 <0 : #{negative_ghi_count} = {negative_ghi_count/ghi.count():%}")

In [None]:
# Check whether all weather locations are interpolated

# Step 1: ids that have at least one row in which every weather metric is missing
all_metrics_missing = df_meas_weather_unstacked.isna().all(axis=1)
ids_with_missing_metrics = (
    df_meas_weather_unstacked[all_metrics_missing]
    .index.get_level_values('id')
    .unique()
)

# Step 2: look up the H3 weather cell of each of those ids in df_homes
homes_with_missing_metrics = df_homes.index.isin(ids_with_missing_metrics)
h3_cell_ids = df_homes.loc[homes_with_missing_metrics, 'weather_H3_cell_id'].tolist()
print(f"Weather cells without properly interpolated values: {h3_cell_ids}")

# Step 3: one marker per distinct location in df_weather
# (assumes df_weather carries 'lat__degN' and 'lon__degE')
marker_df = (
    df_weather.reset_index()[['lat__degN', 'lon__degE']]
    .drop_duplicates()
    # Fixed popup text; replace with actual station names or IDs if available
    .assign(popup_text='KNMI Station')
)

# Step 4: plot the affected H3 cells together with the markers
Plot.plot_h3_cells_and_markers(
    h3_cell_ids,
    marker_df,
    output_file="map_with_h3_cells.html",
)

## Merge weather measurements


In [None]:
%%time
# Cast the weather 'value' column to str.
# Presumably this aligns the column type with df_meas before the two are
# concatenated -- TODO confirm.
# NOTE(review): astype(str) may turn missing values into literal strings;
# rows with missing interpolated values would then no longer count as missing.
df_meas_weather['value'] = df_meas_weather['value'].astype(str)

In [None]:
%%time
# Append the per-home weather measurements row-wise to the merged measurements.
df_meas = pd.concat([df_meas, df_meas_weather])

## Initial exploration: size, ids, start & stop times per id

In [None]:
df_meas.info()

In [None]:
print(f"len(df_meas): {len(df_meas):_}")

In [None]:
%%time
# Deduplicate the measurements: move the index levels into columns so they
# take part in the comparison, drop rows that are identical in every column,
# then rebuild and sort the index.
# NOTE(review): rows with the same index but a different 'value' are kept.
df_meas = (
    df_meas
    .reset_index()
    .drop_duplicates()
    .set_index(['id', 'source_category', 'source_type', 'timestamp', 'property'])
    .sort_index()
)

In [None]:
print(f"len(df_meas): {len(df_meas):_}")

In [None]:
list(df_meas.index.get_level_values('id').unique())

In [None]:
len(df_meas.index.get_level_values('id').unique())

In [None]:
%%time
df_meas.reset_index().groupby(['id', 'source_type'])['timestamp'].agg(['min', 'max'])

In [None]:
print(f"df_meas['value'].count(): {df_meas['value'].count():_}")

In [None]:
df_meas.duplicated().any()

In [None]:
df_meas.info()

In [None]:
df_meas.describe()

In [None]:
df_meas

In [None]:
list(df_meas.index.get_level_values('source_category').unique())

In [None]:
list(df_meas.index.get_level_values('source_type').unique())

In [None]:
list(df_meas.index.get_level_values('property').unique())

In [None]:
df_meas.groupby(['source_type']).size()

In [None]:
df_meas.groupby(['source_category', 'source_type', 'property']).size()

In [None]:
print(f"df_meas.count().sum(): {df_meas.count().sum():_}")

## Write to parquet file(s)

In [None]:
%%time 
# Convert the 'value' column to string type before the DataFrame is written.
# NOTE(review): the Enelogic, Remeha and weather values were already cast with
# astype(str) earlier; this presumably covers the NeedForHeat values and
# guarantees one uniform column type in the output -- TODO confirm.
df_meas['value'] = df_meas['value'].astype(str)

In [None]:
%%time 
# Write the merged measurements, including the index, to the output Parquet
# file using the pyarrow engine.
df_meas.to_parquet(rhc_output_file_path, index=True, engine='pyarrow')

In [None]:
# %%time 
# for home_id in tqdm(df_meas.index.get_level_values('id').unique()):
#     df_meas.xs(home_id, drop_level=False).to_parquet(f'{home_id}_raw_measurements.parquet', index=True, engine='pyarrow')

## Write to csv file(s)

### Write raw measurements per home to zipped .CSV files

In [None]:
%%time 
# Write the raw measurements of every home to its own zipped CSV file:
# '<home_id>_raw_measurements.zip' containing '<home_id>_raw_measurements.csv'.
for home_id in tqdm(df_meas.index.get_level_values('id').unique()):
    zip_file_name = f'{home_id}_raw_measurements.zip'
    csv_file_name = f'{home_id}_raw_measurements.csv'
    try:
        # drop_level=False keeps the 'id' level, so it is written to the CSV as well
        df_meas.xs(home_id, drop_level=False).to_csv(
            zip_file_name,
            encoding='utf-8',
            compression=dict(method='zip', archive_name=csv_file_name),
            date_format='%Y-%m-%dT%H:%M:%S%z',
        )
    except FileNotFoundError as e:
        # Report the file that was actually being written; the loop then
        # simply moves on to the next home.
        print(f"Error: {e}. Skipping file {zip_file_name}.")
