In [3]:
import pandas as pd, numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [4]:
# NB: opening the workbook requires openpyxl to be installed
xl = pd.ExcelFile("../data/Dataset.xlsx")
# keep only the sheets whose name starts with "Consumer"
consumer_sheets = list(
    filter(lambda sheet_name: sheet_name.startswith("Consumer"), xl.sheet_names)
)
# All consumer sheets are used. To work on a subset only (e.g. for a quicker
# run), uncomment the next line:
#consumer_sheets = consumer_sheets[:20]

In [5]:
# Build the mapping sheet name -> DataFrame with that consumer's tabular data.
# The unnamed first column and the "Total Consumption" column stored in the
# workbook are dropped right after parsing.
cons2df = {}
for sheet in tqdm(consumer_sheets):
    raw = xl.parse(sheet)
    cons2df[sheet] = raw.drop(columns=["Unnamed: 0", "Total Consumption"])

100%|██████████| 50/50 [04:22<00:00,  5.26s/it]


### Creation of dataset for task 1

In [6]:
# Sum over all columns except Periods (index 0).
# "Total Consumption" is excluded explicitly: this cell adds that column to
# every DataFrame in place, so on a second run it would otherwise be part of
# columns[1:] and the old total would be added into the new one (doubling it).
# With the exclusion the cell can be re-run safely.
sum_cols = [col for col in cons2df["Consumer1"].columns[1:]
            if col != "Total Consumption"]
for df in cons2df.values():
    df["Total Consumption"] = df[sum_cols].sum(axis=1)

In [25]:
# constants based on 15min intervals
day_length = 4 * 24            # number of samples in one day
week_window = day_length * 7   # number of samples in one input week


def _week_to_next_day_samples(data, day_length, n_input_days=7):
    """Build (week of daily totals -> next-day total) samples for one consumer.

    The series is reduced to one total per day; then a window of
    ``n_input_days + 1`` consecutive days is slid over it with a stride of one
    day. The first ``n_input_days`` days of a window are the input, the last
    day is the target.

    Parameters
    ----------
    data : 1-D array-like
        Consumption values, ``day_length`` values per day.
    day_length : int
        Number of values that make up one day.
    n_input_days : int
        Number of days used as input for each sample.

    Returns
    -------
    x : ndarray of shape (n_samples, n_input_days)
    y : ndarray of shape (n_samples,)
        with ``n_samples = n_days - n_input_days``.

    Raises
    ------
    ValueError
        If the series does not consist of whole days (some data would be left
        uncovered) or is too short to hold a single window.
    """
    data = np.asarray(data)
    n_days, leftover = divmod(len(data), day_length)
    if leftover:
        raise ValueError(
            f"series length {len(data)} is not a multiple of day_length="
            f"{day_length}; the windows would not cover all of the input data"
        )
    if n_days <= n_input_days:
        raise ValueError(
            f"need more than {n_input_days} days of data, got {n_days}"
        )
    # Sum every day exactly once. NB: NumPy's vectorised sum may differ from a
    # sequential Python sum in the last floating-point digits.
    daily = data.reshape(n_days, day_length).sum(axis=1)
    # Row k of the index matrix selects days k .. k + n_input_days.
    n_samples = n_days - n_input_days
    day_idx = np.arange(n_samples)[:, None] + np.arange(n_input_days + 1)
    windows = daily[day_idx]
    return windows[:, :n_input_days], windows[:, n_input_days]


# iterate over DataFrames and extract x = week -> y = next day pairs
per_consumer = [
    _week_to_next_day_samples(df["Total Consumption"].values, day_length,
                              n_input_days=week_window // day_length)
    for df in cons2df.values()
]
# stack along a new leading household dimension
xs = np.stack([x for x, _ in per_consumer])
ys = np.stack([y for _, y in per_consumer])
# swap day (=N) and household dimensions such that we can sample days for
# dataset creation
xs = xs.transpose(1, 0, 2)
ys = ys.transpose(1, 0)
xs.shape, ys.shape

((359, 50, 7), (359, 50))

In [26]:
# generate train/validation/test splits with 80/10/10 ratio
# (shuffle=False keeps the samples in their original order)
#
# The split runs along axis 0, which must be the day axis. This cell rebinds
# xs/ys to the household-first layout at the end, so running it a second time
# without re-running the dataset creation cell would silently split along the
# household axis instead. Fail loudly in that case.
n_households = len(cons2df)
assert xs.shape[1] == n_households and ys.shape[1] == n_households, (
    "xs/ys are not in (day, household, ...) layout - re-run the dataset "
    "creation cell before splitting again"
)
xtr, xvalte, ytr, yvalte = train_test_split(xs, ys, test_size=.2, shuffle=False)
xval, xte, yval, yte = train_test_split(xvalte, yvalte, test_size=.5, shuffle=False)
# swap the first two axes of every array: (day, household, ...) -> (household, day, ...)
xtr, xval, xte, xs = (arr.transpose(1, 0, 2) for arr in (xtr, xval, xte, xs))
ytr, yval, yte, ys = (arr.transpose(1, 0) for arr in (ytr, yval, yte, ys))

In [27]:
# write each split (and the full dataset) to its own .npz archive with the
# inputs stored under "x" and the targets under "y"
task1_outputs = [
    ("../data/task1_train", xtr, ytr),
    ("../data/task1_val", xval, yval),
    ("../data/task1_test", xte, yte),
    ("../data/task1_data", xs, ys),
]
for out_path, inputs, targets in task1_outputs:
    np.savez(out_path, x=inputs, y=targets)