In [None]:
# import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as md
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torchvision.utils as vutils
import csv
import gc
import math
from torchsummary import summary
from datetime import datetime
import random

In [None]:
def _normalise_window(values, window, min_energy):
  '''
  _normalise_window validates the readings of one window and rescales them to the [0, 1] range.

  :values: 1-D numpy array with the readings of a single window
  :window: the minimum number of readings the window must contain
  :min_energy: the minimum sum of the raw readings, windows below it are treated as noise

  :return: the min-max normalised absolute readings, or np.nan if the window has to be discarded
  '''
  # windows with fewer readings than requested are incomplete
  if len(values) < window:
    return np.nan
  # the noise check is applied to the raw (signed) readings, i.e. before abs() is taken
  if np.sum(values) < min_energy:
    return np.nan

  magnitudes = np.abs(values)
  lowest = np.min(magnitudes)
  spread = np.max(magnitudes) - lowest
  # a constant window has zero spread and a window with gaps contains NaN values,
  # both end up with NaN entries after the division and are rejected right below
  with np.errstate(divide='ignore', invalid='ignore'):
    scaled = (magnitudes - lowest) / spread
  if np.isnan(scaled).any():
    return np.nan
  return scaled


def preprocess_appliance_data( data, appliance, window, min_energy=20.0, sample_rate='5s'):
  '''
  preprocess_appliance_data preprocesses the input data of a specific appliance. The readings are
  resampled to a fixed rate, split into 6 hour windows (00-05, 06-11, 12-17, 18-23 of every day)
  and every valid window is stored as an individual min-max normalised timeseries.

  :data: the pandas dataframe containing the data for one appliance, it should have two columns, 'localminute' for the timestamp and the appliance's name column
  :appliance: the name of the current appliance
  :window: the minimum number of data points a window must have in order to be kept
  :min_energy: windows whose readings sum to less than this value are discarded as noise
  :sample_rate: the pandas offset alias used to resample all the datasets to the same sampling rate

  :return: a pandas dataframe with the columns 'date', 'hour' (window index 0-3) and 0 (a float32 numpy
           array with the normalised readings of the window), one row per window that was kept
  '''

  # work on a copy so that the dataframe of the caller is never modified
  data = data.loc[:, ['localminute', appliance]].copy()
  # timestamps are converted to UTC and the timezone information is then dropped
  data['localminute'] = pd.to_datetime(data['localminute'], utc=True).dt.tz_localize(None)
  data = data[data['localminute'].notna()]
  data = data.set_index('localminute')

  # without any valid timestamp there is nothing to resample or to group
  if data.empty:
    print('Number of samples: ', 0)
    return pd.DataFrame(columns=['date', 'hour', 0])

  # resample the timeseries in order to have all the datasets with the same sampling rate
  data = data.resample(sample_rate).ffill()

  # separate the timeseries data into 6 hour windows, numbered 0 to 3 within every day
  data['date'] = data.index.date
  data['hour'] = np.asarray(data.index.hour, dtype=np.int64) // 6

  group = data.groupby(['date','hour'])

  # create a vector with the datapoints of each window (the last datapoint of every window is left out),
  # this is done in order to create vectors for the GAN later on
  data_list = group.apply(lambda x: x.iloc[:-1, 0].to_numpy(dtype=np.float32)).to_frame().reset_index()

  # discard the windows that are too short, only contain noise or miss data points and
  # apply normalization using min max values to all the remaining timeseries
  values_column = data_list.columns[2]
  data_list[values_column] = data_list[values_column].apply(lambda x: _normalise_window(x, window, min_energy))

  data_list = data_list.dropna()

  # the final number of samples created for the specific appliance
  print('Number of samples: ', len(data_list))

  return data_list

Choose the list of files to be read and preprocessed


In [None]:
# you should put the file names in this list after you download the datasets, for example for dataport data:
files_to_open = ['dataport_electric_vehicle2_3000', 'dataport_electric_vehicle1222','dataport_electric_vehicle27', 'dataport_electric_vehicle5679', 'dataport_electric_vehicle3000', 'dataport_electric_vehicle9053','dataport_electric_vehicle3517','dataport_electric_vehicle5058']
files_path = 'path/to/files'
# name the appliance currently using
appliance =  'electric_vehicle'

# NOTE(review): window_size is not defined in any of the cells shown here, it has to be set before
# running this cell. preprocess_appliance_data leaves out the last reading of every window, so a full
# 6 hour window sampled every 5 seconds holds 4319 values; a larger window_size discards every window.

# preprocess every file with the same function and collect the windows that were created
windows_per_file = []
for i, files in enumerate(files_to_open):
  print(f'loading {i} , {files}')
  # the directory and the file name are joined with an explicit separator
  temp = pd.read_csv(f'{files_path}/{files}.csv', header=None, names=["localminute", appliance])
  windows_per_file.append(preprocess_appliance_data(temp, appliance, window_size))

# create the training set for one appliance, using all the available data from the dataset
# (a single concat at the end avoids copying the growing dataframe on every iteration)
train_pre = pd.concat(windows_per_file)

# print the created dataset
print(train_pre)


               localminute  car1
0      2019-05-01 16:39:18   NaN
1      2019-05-01 16:21:08   NaN
2      2019-05-01 16:39:19   NaN
3      2019-05-01 16:39:17   NaN
4      2019-05-01 16:39:20   NaN
...                    ...   ...
422579 2019-07-02 16:40:19   NaN
422580 2019-07-02 16:40:16   NaN
422581 2019-07-02 16:40:14   NaN
422582 2019-07-02 16:40:12   NaN
422583 2019-07-02 16:40:09   NaN

[422584 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
                     car1             datetem        date  hour
localminute                                                    
2019-05-01 05:00:00   NaN 2019-05-01 05:00:00  2019-05-01     0
2019-05-01 05:00:05   NaN 2019-05-01 05:00:05  2019-05-01     0
2019-05-01 05:00:10   NaN 2019-05-01 05:00:10  2019-05-01     0
2019-05-01 05:00:15   NaN 2019-05-01 05:00:15  2019-05-01     0
2019-05-01 05:00:20   NaN 2019-05-01 05:00:20  2019-05-01     0
...                   ...                 ...        

In [None]:
# save the created dataset in a pickle file, for example
pd.to_pickle(train_pre, "/path_to_saved/dataport_electric_vehicle_5s.pkl")