In [7]:
import pandas as pd
import numpy as np

In [2]:
hourly_data_path = 'students_drahi_production_consumption_hourly.csv'
hourly_data = pd.read_csv(hourly_data_path)

In [3]:
hourly_data

Unnamed: 0,datetime,AirTemp,pres,rain,rh,wd,ws,Global_Solar_Flux,Diffuse_Solar_Flux,Direct_Solar_Flux,...,kw_heatingcoolingtotal_zone1,kw_heatingcoolingtotal_zone2,kw_lights_zone1,kw_lights_zone2,kw_total_zone1,kw_total_zone2,kw_ventilation_zone1,kw_ventilation_zone2,kw_water_heater_zone2,plugs_zone2
0,2022-01-01 00:00:00+00:00,10.279,1005.058,0.000,95.870,187.576,1.758,-0.271,-0.370,-0.110,...,-1.798,-2.037,-0.002,-0.222,2.091,2.308,-0.027,-0.030,-0.004,-0.156
1,2022-01-01 01:00:00+00:00,10.318,1004.790,0.000,95.335,218.280,2.401,-0.371,-0.499,-0.231,...,-1.794,-2.170,-0.002,-0.278,2.086,2.504,-0.027,-0.044,-0.004,-0.157
2,2022-01-01 02:00:00+00:00,9.901,1004.791,0.013,95.699,185.351,1.821,-0.846,-0.999,-0.235,...,-1.795,-2.202,-0.002,-0.238,2.085,2.501,-0.026,-0.043,-0.004,-0.157
3,2022-01-01 03:00:00+00:00,9.444,1004.554,0.000,95.881,193.537,1.760,-0.632,-0.755,-0.173,...,-1.811,-2.238,-0.002,-0.240,2.100,2.539,-0.026,-0.044,-0.004,-0.157
4,2022-01-01 04:00:00+00:00,8.587,1004.524,0.000,96.738,174.806,1.613,-0.395,-0.489,-0.110,...,-1.814,-2.287,-0.002,-0.280,2.102,2.952,-0.027,-0.044,-0.356,-0.156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17515,2023-12-31 19:00:00+00:00,7.523,983.610,0.000,77.079,248.157,5.871,-1.114,-1.283,-0.246,...,-1.058,-1.896,-0.130,-0.031,1.659,2.058,-0.031,-0.022,0.000,-0.206
17516,2023-12-31 20:00:00+00:00,7.021,984.488,0.000,81.753,246.199,5.223,-1.130,-1.270,0.178,...,-1.070,-1.886,-0.121,-0.019,1.661,2.038,-0.031,-0.023,0.000,-0.206
17517,2023-12-31 21:00:00+00:00,6.746,984.980,0.000,83.942,245.813,4.572,-0.944,-1.041,0.281,...,-7.507,-1.890,-0.120,-0.019,8.045,2.049,-0.031,-0.023,0.000,-0.215
17518,2023-12-31 22:00:00+00:00,6.703,985.161,0.000,83.873,241.623,4.568,-0.935,-1.029,0.203,...,-3.758,-1.868,-0.120,-0.019,4.361,2.021,-0.031,-0.023,0.000,-0.207


In [65]:
time = pd.to_datetime(hourly_data['datetime']).values.astype('datetime64[s]')
time

array(['2022-01-01T00:00:00', '2022-01-01T01:00:00',
       '2022-01-01T02:00:00', ..., '2023-12-31T21:00:00',
       '2023-12-31T22:00:00', '2023-12-31T23:00:00'],
      dtype='datetime64[s]')

In [67]:
power_consumption = hourly_data['kw_total_zone2'].values
power_consumption

array([2.308, 2.504, 2.501, ..., 2.049, 2.021, 2.021])

In [71]:
dec = [] # daily energy consumption
t_dec = []

In [72]:
for ti, t in enumerate(time):
    tmp_t = pd.Timestamp(t)

    if np.isclose(tmp_t.hour, 0) and np.isclose(tmp_t.minute, 0):

        day_end = np.datetime64(tmp_t + pd.Timedelta(days=1))
        ind = np.where((time > tmp_t) & (time < day_end), True, False)

        if len(time[ind]) > 0 and not np.isnan(power_consumption[ind]).any():
            t_dec.append(np.datetime64(tmp_t).astype('datetime64[s]'))
            dec.append(np.trapz(power_consumption[ind], time[ind].astype(int))/3600)

In [80]:
t_dec = np.array(t_dec)
dec = np.array(dec)

np.array([t_dec, dec]).T

array([[datetime.datetime(2022, 1, 1, 0, 0), 58.144000000000005],
       [datetime.datetime(2022, 1, 2, 0, 0), 58.63750000000001],
       [datetime.datetime(2022, 1, 3, 0, 0), 82.622],
       ...,
       [datetime.datetime(2023, 12, 29, 0, 0), 67.7715],
       [datetime.datetime(2023, 12, 30, 0, 0), 58.11099999999999],
       [datetime.datetime(2023, 12, 31, 0, 0), 63.33]], dtype=object)

In [93]:
N = 7 # N days of predictors beforehand
final_ind = []
final_hourly = []

predictor_window = pd.Timedelta(days=N)

In [120]:
for ti, t in enumerate(t_dec):
    tmp_t = pd.Timestamp(t)

    ind = np.where((time >= tmp_t - predictor_window) & (time < tmp_t), True, False) # finding indices within the N prior days

    bad_ind = np.isnan(hourly_data.iloc[ind, 1::].values)
    if len(time[ind]) >= 24 * N and not bad_ind.any(): # rejecting any data with NaNs; useful for the student dataset
        final_ind.append(ti)
        final_hourly.append(hourly_data.iloc[ind, 1::].values)

In [122]:
target_time = t_dec[final_ind]
targets = dec[final_ind]
predictors = np.array(final_hourly)

In [128]:
np.array([target_time, targets]).T

array([[datetime.datetime(2022, 1, 8, 0, 0), 45.964000000000006],
       [datetime.datetime(2022, 1, 9, 0, 0), 57.75049999999999],
       [datetime.datetime(2022, 1, 10, 0, 0), 111.27949999999998],
       [datetime.datetime(2022, 1, 11, 0, 0), 96.05150000000002],
       [datetime.datetime(2022, 1, 12, 0, 0), 101.8965],
       [datetime.datetime(2022, 1, 13, 0, 0), 99.7285],
       [datetime.datetime(2022, 1, 14, 0, 0), 94.511],
       [datetime.datetime(2022, 1, 15, 0, 0), 57.508],
       [datetime.datetime(2022, 1, 24, 0, 0), 136.92149999999998],
       [datetime.datetime(2022, 1, 25, 0, 0), 153.89149999999998],
       [datetime.datetime(2022, 1, 26, 0, 0), 133.4875],
       [datetime.datetime(2022, 1, 27, 0, 0), 135.09600000000003],
       [datetime.datetime(2022, 1, 28, 0, 0), 117.304],
       [datetime.datetime(2022, 1, 29, 0, 0), 84.843],
       [datetime.datetime(2022, 1, 30, 0, 0), 83.34049999999999],
       [datetime.datetime(2022, 1, 31, 0, 0), 131.41649999999998],
       [dat

In [146]:
len(set([s.split("T")[0] for s in time[[i not in final_ind for i in range(len(time))]].astype('str')]))

726