In [1]:
from datetime import datetime, timedelta
import pytz
import math
import pylab as plt

import pandas as pd
import numpy as np

import sys
# Extend the module search path with sibling project directories so the
# project-local imports below can be resolved from this notebook's folder.
# NOTE(review): presumably plotter/filewriter live in ../view/, extractor in ../data/
# and inversegreyboxmodel in ../analysis/ — confirm against the repository layout.
sys.path.append('../data/')
sys.path.append('../view/')
sys.path.append('../analysis/')

# Load the autoreload extension; the cells that call project code activate it with `%autoreload 2`.
%load_ext autoreload

# Interactive (widget) matplotlib backend, so figures can be manipulated in the notebook.
%matplotlib widget
from plotter import Plot
from filewriter import ExcelWriter as ex

from extractor import WeatherExtractor, Extractor, Period

from inversegreyboxmodel import Learner

import logging
logger = logging.getLogger('Twomes data extraction')
# NOTSET: this logger does not filter by itself but defers to the effective level of its parent.
logger.setLevel(logging.NOTSET)



n_std_outliers = 3.0 # default multiplier of the standard deviation; values further out than this many times the std are removed as outliers during preprocessing
up_intv = '5min' # the default upsampling interval that is used before interpolation is done
gap_n_intv = 11 # the default maximum number of consecutive NaNs to fill (one for each upsampling interval), i.e. valid measurement values (11+1) * 5 min = 1 hour apart will be bridged by interpolation, but not more
sampling_interval = '15min' # the default interval on which interpolation will be done during preprocessing
moving_horizon_duration_d = 7 # default moving horizon duration handed to the learner; presumably in days (suffix `_d`) — confirm
# columns handed to Extractor.get_preprocessed_homes_data as the columns required for sanity
# NOTE(review): presumably a row only counts as sane when all of these are present — confirm in extractor
required_columns_for_sanity = ['home_id', 'outdoor_eff_temp_avg_C', 'irradiation_hor_avg_W_per_m2', 'indoor_temp_avg_C', 'gas_sup_avg_W', 'e_remaining_heat_avg_W', 'interval_s']
        

# default sanity fraction handed to the learner (as sanity_fraction_analysis)
sanity_fraction = 0.5

#location: center of Assendorp neighbourhood in Zwolle
lat, lon = 52.5065500000, 6.0996100000

#timezone: 
timezone_database = 'UTC'
timezone_homes = 'Europe/Amsterdam'

# TODO: get list of pseudonyms, (valid) dates and reference parameters from Excel file?  
# ultimately: min and max dates of the analysis
first_day = pytz.timezone(timezone_homes).localize(datetime(2021, 10, 25))
last_day = pytz.timezone(timezone_homes).localize(datetime(2022, 5, 8))
# first_day = pytz.timezone(timezone_homes).localize(datetime(2022, 1, 3))
# last_day = pytz.timezone(timezone_homes).localize(datetime(2022, 1, 24))

# homes = [803422, 805164, 809743, 811308, 815925, 817341, 822479, 829947, 830088, 831062, 839440, 845966, 845997, 846697, 857477, 864296, 873985, 879481, 881611, 886307, 895671, 897349, 899510]
# reversedhomes = [899510, 897349, 895671, 886307, 881611, 879481, 873985, 864296, 857477, 846697, 845997, 845966, 839440, 831062, 830088, 829947, 822479, 817341, 815925, 811308, 809743, 805164, 803422]
# Optionally: override homes and dates for which to get data
homes =  [803422]
# homes =  [817341, 886307, 873985, 803422, 805164]

#homes with problematic smart meter timestamps:
# homes =  [895671, 809743, 815925]
        


In [2]:
# Sanity check: show the first day together with its tzinfo to confirm it is timezone-aware.
print(f"{first_day} {first_day.tzinfo}")


2021-10-25 00:00:00+02:00 Europe/Amsterdam


In [3]:
# check whether the last_day datetime object is properly timezone-aware
# (show last_day's own tzinfo; printing first_day.tzinfo here would not verify last_day at all)
print(last_day, last_day.tzinfo)

2022-05-08 00:00:00+02:00 Europe/Amsterdam


In [4]:
%%time 
%autoreload 2
# get geospatially interpolated weather from KNMI
# for Twomes, the weather for all homes studied can be approximated by a single location
# get the dataframe only once for all homes to save time
tz_knmi='Europe/Amsterdam'

# the result is reused below: handed to the homes data extraction and to the weather plot
df_weather = WeatherExtractor.get_interpolated_weather_nl(first_day, last_day, lat, lon, tz_knmi, timezone_homes, sampling_interval)

weather_extractor_starttime:  2021-10-24 23:00:00+02:00
weather_extractor_endtime:  2022-05-09 01:00:00+02:00
CPU times: user 48.3 ms, sys: 6.65 ms, total: 54.9 ms
Wall time: 53.5 ms


In [5]:
# display the weather DataFrame (long frames are shown truncated: first and last rows only)
df_weather

Unnamed: 0,outdoor_temp_avg_C,wind_m_per_s_avg,irradiation_hor_avg_W_per_m2,outdoor_eff_temp_avg_C
2021-10-25 00:00:00+02:00,5.466645,2.604638,0.0,3.730219
2021-10-25 00:15:00+02:00,5.541461,2.625970,0.0,3.790814
2021-10-25 00:30:00+02:00,5.616278,2.647303,0.0,3.851410
2021-10-25 00:45:00+02:00,5.691095,2.668635,0.0,3.912005
2021-10-25 01:00:00+02:00,5.765911,2.689967,0.0,3.972600
...,...,...,...,...
2022-05-08 22:45:00+02:00,5.679318,1.235572,0.0,4.855603
2022-05-08 23:00:00+02:00,5.336775,1.170658,0.0,4.556336
2022-05-08 23:15:00+02:00,5.042697,1.170658,0.0,4.262258
2022-05-08 23:30:00+02:00,4.748619,1.170658,0.0,3.968181


In [6]:
# see more statistics about the weather data
df_weather.describe(include='all')

Unnamed: 0,outdoor_temp_avg_C,wind_m_per_s_avg,irradiation_hor_avg_W_per_m2,outdoor_eff_temp_avg_C
count,18816.0,18816.0,18816.0,18816.0
mean,6.776625,3.62836,82.295479,4.357718
std,4.26188,2.145596,159.753893,4.143159
min,-5.675577,-2e-323,0.0,-8.649752
25%,3.980206,2.090136,0.0,1.561753
50%,6.697973,3.259655,0.0,4.190069
75%,9.483935,4.768622,80.148505,6.968198
max,20.504261,16.82201,849.062087,18.101169


In [None]:
%%time 
# get interpolated data from the Twomes database and combine with weather data already obtained

logger.setLevel(logging.INFO)


df_data_homes = Extractor.get_preprocessed_homes_data(homes, first_day, last_day, timezone_database, timezone_homes,
                                                      up_intv, gap_n_intv, sampling_interval, 
                                                      required_columns_for_sanity,
                                                      df_weather)
logger.setLevel(logging.NOTSET)


  0%|          | 0/1 [00:00<?, ?it/s]

Retrieving data for home 803422 from 2021-10-24T21:00:00+00:00 to 2022-05-08T23:00:00+00:00 ...


In [None]:
# show the first index entry of the extracted homes data as a quick check of the index
print('df_data_homes.index[0]: ', df_data_homes.index[0])

In [None]:
# display the extracted and preprocessed homes data
df_data_homes

In [None]:
# see more statistics about the homes data
df_data_homes.describe(include='all')

In [None]:
# present some sanity metrics for the extracted data
# float(...) is required: when 'interval_s' has an integer dtype the pandas sum is a numpy
# integer, which datetime.timedelta rejects with a TypeError; for a float sum it changes nothing
total_measurement_time = timedelta(seconds = float(df_data_homes['interval_s'].sum()))
print('Total measurement time: ', total_measurement_time)
# mean of the per-row sanity fraction
# NOTE(review): scaling the total time by this mean is only exact when all intervals have equal length — confirm
sane_fraction = df_data_homes['sanity_frac'].mean()
print('Sane fraction measurement time: {:.2f}'.format(sane_fraction))
sane_measurement_time = total_measurement_time * sane_fraction
print('Sane  measurement time: ', sane_measurement_time)

In [None]:
%%time 
%autoreload 2

filename_prefix = datetime.now().astimezone(pytz.timezone('Europe/Amsterdam')).replace(microsecond=0).isoformat().replace(":","")
ex.write(df_data_homes, str('{0}-data_homes-{1}-{2}.xlsx'.format(filename_prefix, first_day.isoformat(),first_day.isoformat())))

In [None]:
# plot temperature data from the weather DataFrame obtained above
# make sure the notebook's logger is back at NOTSET (no level of its own) before plotting
logger.setLevel(logging.NOTSET)
Plot.weather_and_other_temperatures('Weather in Assendorp, Zwolle', df_weather)

# N.B. The resulting figure below can be manipulated interactively; hover with mouse for tips & tricks

In [None]:
# how to select data from a single home
# df_data_homes.loc[817341]

In [None]:
%%time 
%autoreload 2

# homes with most data
# homes_to_analyse = [886307, 873985, 817341]
# start_analysis_period = pytz.timezone(timezone_homes).localize(datetime(2021, 12, 20))
# end_analysis_period = pytz.timezone(timezone_homes).localize(datetime(2022, 1, 18))
# date and times of seemingly valid periods for short set of 3 
# 886307	2021-12-19 18:55	2022-03-17 09:37
# 873985	2021-12-20 12:54	2022-03-17 17:54
# 817341	2022-01-14 20:27	2022-03-17 18:15
    
sanity_fraction_analysis = sanity_fraction

# Use one of the lines below to set the moving horizon duration used for analysis 
# moving_horizon_duration_d_analysis = 4
moving_horizon_duration_d_analysis = moving_horizon_duration_d

# learn the model parameters and write rerults an intermediate results to excel files
logging.basicConfig(level=logging.INFO)
df_results = Learner.learn_home_parameter_moving_horizon(df_data_homes, 
                                                         n_std_outliers, up_intv, gap_n_intv, sampling_interval, 
                                                         moving_horizon_duration_d_analysis, sanity_fraction_analysis,
                                                         homes, first_day, first_day, 
                                                         showdetails=True, hint_A_m2=6.0)

logging.basicConfig(level=logging.NOTSET)


In [None]:
# show the results returned by the learner
df_results

In [None]:
# display the homes data once more, now after the learning step
# NOTE(review): presumably to inspect whether the learner changed the DataFrame — confirm, otherwise this repeats the earlier display
df_data_homes

In [None]:
# plot temperature data of multiple homes from an array 
# %autoreload 2
# Iterate over `homes`, the list that was extracted and analysed above. The name
# `homes_to_analyze` that was used here is not assigned in any visible cell of this notebook
# (only a commented-out `homes_to_analyse` exists), so it raised a NameError on a fresh kernel.
for home_id in homes:
    df_data_one_home = df_data_homes.loc[home_id]
    # NOTE(review): these column names differ from the '..._avg_C' names in required_columns_for_sanity — confirm they exist in df_data_homes
    Plot.weather_and_other_temperatures(home_id, df_data_one_home, [('indoor_temp_degC','r'),('indoor_setpoint_temp_degC','g')])

# # N.B. The resulting figure below can be manipulated interactively; hover with mouse for tips & tricks


In [None]:
#plot a series of weeks for all homes
# plt.ioff()

# for home_id in homes_to_analyze:
#     df_data_one_home = df_data_homes.loc[home_id]
#     for moving_horizon_start in pd.date_range(start=first_day, end=first_day, inclusive='left', freq='7D'):
#         moving_horizon_end = min(first_day, moving_horizon_start + timedelta(days=7))
#         df_moving_horizon = df_data_one_home[moving_horizon_start:moving_horizon_end]
#         Plot.weather_and_other_temperatures(home_id, df_moving_horizon, ['indoor_temp_degC'])

# plt.ion()  
    