In [1]:
import pandas as pd
import numpy as np
from numpy import random
import datetime

In [2]:
# Seed NumPy's global random generator so that later draws made through
# np.random (e.g. the random start day picked in make_LSTM_data) are repeatable.
np.random.seed(123)

In [3]:
# Load the test split: the first CSV column becomes the index and the
# 'timestamp' column is parsed into datetimes.
test = pd.read_csv(
    '../data/weather1_education_test.csv',
    index_col=0,
    parse_dates=['timestamp'],
)
# Leave the frame as the last expression so the notebook renders it.
test

Unnamed: 0,area,building_name,electricity,primary_space_usage,timestamp,TemperatureC,month,year,date,hour,...,hour_22,hour_23,wkday_0,wkday_1,wkday_2,wkday_3,wkday_4,wkday_5,wkday_6,PSU_PrimClass
0,1201.0,PrimClass_Janelle,2.400000,PrimClass,2014-12-01 00:00:00,7.0,12,2014,1,0,...,0,0,1,0,0,0,0,0,0,1
1,1201.0,PrimClass_Janelle,2.000000,PrimClass,2014-12-01 01:00:00,5.0,12,2014,1,1,...,0,0,1,0,0,0,0,0,0,1
2,1201.0,PrimClass_Janelle,2.000000,PrimClass,2014-12-01 02:00:00,5.0,12,2014,1,2,...,0,0,1,0,0,0,0,0,0,1
3,1201.0,PrimClass_Janelle,2.400000,PrimClass,2014-12-01 03:00:00,6.0,12,2014,1,3,...,0,0,1,0,0,0,0,0,0,1
4,1201.0,PrimClass_Janelle,2.600000,PrimClass,2014-12-01 04:00:00,7.0,12,2014,1,4,...,0,0,1,0,0,0,0,0,0,1
5,1201.0,PrimClass_Janelle,2.600000,PrimClass,2014-12-01 05:00:00,7.0,12,2014,1,5,...,0,0,1,0,0,0,0,0,0,1
6,1201.0,PrimClass_Janelle,2.800000,PrimClass,2014-12-01 06:00:00,7.0,12,2014,1,6,...,0,0,1,0,0,0,0,0,0,1
7,1201.0,PrimClass_Janelle,18.300000,PrimClass,2014-12-01 07:00:00,8.0,12,2014,1,7,...,0,0,1,0,0,0,0,0,0,1
8,1201.0,PrimClass_Janelle,21.500000,PrimClass,2014-12-01 08:00:00,8.0,12,2014,1,8,...,0,0,1,0,0,0,0,0,0,1
9,1201.0,PrimClass_Janelle,24.900000,PrimClass,2014-12-01 09:00:00,8.0,12,2014,1,9,...,0,0,1,0,0,0,0,0,0,1


In [4]:
def make_LSTM_data(df):
    '''
    Split every building's rows into a fitting window and an evaluation window.

    For each building in df a start day is drawn at random (one call to
    np.random.randint per building, in order of first appearance).  Starting at
    that day's first timestamp, the next 24*90 rows ("three months") go to X and
    the following rows, up to num_obs rows in total, go to Y ("nine months").
    Windows that run past the end of a building's data wrap around to its
    beginning, which is why the frame is repeated before slicing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'building_name' and 'timestamp'.  The row
        arithmetic assumes one row per hour and the same number of rows for
        every building -- NOTE(review): not checked here, confirm upstream.
        df itself is not modified.

    Returns
    -------
    (X, Y) : tuple of pandas.DataFrame
        X holds the fitting windows and Y the evaluation windows, stacked in
        order of each building's first appearance in df.  Index labels are
        positions in the repeated, building-sorted frame, not labels of df.
        When df has no rows, two empty frames with df's columns are returned.

    Raises
    ------
    IndexError
        If a building has no row whose timestamp equals the drawn start day
        (for example because of gaps in its data).
    '''
    buildings = df['building_name'].unique()  # building names, in order of first appearance
    num_buildings = len(buildings)  # number of buildings
    if num_buildings == 0:
        # Nothing to split; also avoids dividing by zero below.
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()
    num_obs = len(df) // num_buildings  # number of observations per building
    three_months_hours = 24 * 90  # three months in hours

    # Repeat the frame so that a window starting late in a building's data can
    # wrap around to its beginning.  Two copies are enough when every building
    # has num_obs rows; four are kept (this is what doubling twice produced) so
    # the index labels of the returned frames stay the same as before.
    df = pd.concat([df] * 4, ignore_index=True)
    # Sort by building, then by position, so each building's copies are contiguous.
    df = df.rename_axis('index').sort_values(['building_name', 'index']).reset_index(drop=True)

    x_parts = []  # per-building fitting windows (three months)
    y_parts = []  # per-building evaluation windows (the remainder)
    for building in buildings:
        # Randomly pick how many days after the building's first timestamp to start.
        offset_days = datetime.timedelta(np.random.randint(low=0, high=num_obs//24))
        is_building = df['building_name'] == building
        first_day = df.loc[is_building, 'timestamp'].iloc[0]
        start_day = first_day + offset_days
        matches = df.index[is_building & (df['timestamp'] == start_day)]
        if len(matches) == 0:
            # Same exception type as the former bare `.index[0]`, but explains itself.
            raise IndexError(
                'no row with timestamp {} found for building {!r}'.format(start_day, building)
            )
        start_idx = matches[0]  # position of the start day (first copy of the building)
        split_idx = start_idx + three_months_hours
        x_parts.append(df.iloc[start_idx:split_idx])  # three months of data
        y_parts.append(df.iloc[split_idx:start_idx + num_obs])  # rest of the year
    # Concatenate once at the end instead of growing the frames inside the loop.
    X = pd.concat(x_parts, axis=0)
    Y = pd.concat(y_parts, axis=0)
    return X, Y

In [5]:
# Split the loaded test frame into per-building fitting windows (X) and
# evaluation windows (Y); the start day of each window is drawn at random.
X, Y = make_LSTM_data(test)

In [6]:
# Display the fitting windows (X) returned by make_LSTM_data.
X

Unnamed: 0,area,building_name,electricity,primary_space_usage,timestamp,TemperatureC,month,year,date,hour,...,hour_22,hour_23,wkday_0,wkday_1,wkday_2,wkday_3,wkday_4,wkday_5,wkday_6,PSU_PrimClass
7726,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-10-19 00:00:00,12.0,10,2015,19,0,...,0,0,1,0,0,0,0,0,0,1
7727,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-10-19 01:00:00,12.0,10,2015,19,1,...,0,0,1,0,0,0,0,0,0,1
7728,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-10-19 02:00:00,12.0,10,2015,19,2,...,0,0,1,0,0,0,0,0,0,1
7729,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-10-19 03:00:00,11.0,10,2015,19,3,...,0,0,1,0,0,0,0,0,0,1
7730,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-10-19 04:00:00,11.0,10,2015,19,4,...,0,0,1,0,0,0,0,0,0,1
7731,1201.0,PrimClass_Janelle,2.600000,PrimClass,2015-10-19 05:00:00,11.0,10,2015,19,5,...,0,0,1,0,0,0,0,0,0,1
7732,1201.0,PrimClass_Janelle,15.100000,PrimClass,2015-10-19 06:00:00,11.0,10,2015,19,6,...,0,0,1,0,0,0,0,0,0,1
7733,1201.0,PrimClass_Janelle,17.700000,PrimClass,2015-10-19 07:00:00,11.0,10,2015,19,7,...,0,0,1,0,0,0,0,0,0,1
7734,1201.0,PrimClass_Janelle,26.600000,PrimClass,2015-10-19 08:00:00,11.0,10,2015,19,8,...,0,0,1,0,0,0,0,0,0,1
7735,1201.0,PrimClass_Janelle,24.200000,PrimClass,2015-10-19 09:00:00,11.0,10,2015,19,9,...,0,0,1,0,0,0,0,0,0,1


In [7]:
# Display the evaluation windows (Y) returned by make_LSTM_data.
Y

Unnamed: 0,area,building_name,electricity,primary_space_usage,timestamp,TemperatureC,month,year,date,hour,...,hour_22,hour_23,wkday_0,wkday_1,wkday_2,wkday_3,wkday_4,wkday_5,wkday_6,PSU_PrimClass
9886,1201.0,PrimClass_Janelle,2.200000,PrimClass,2015-01-17 02:00:00,4.0,1,2015,17,2,...,0,0,0,0,0,0,0,1,0,1
9887,1201.0,PrimClass_Janelle,2.100000,PrimClass,2015-01-17 03:00:00,4.0,1,2015,17,3,...,0,0,0,0,0,0,0,1,0,1
9888,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-01-17 04:00:00,4.0,1,2015,17,4,...,0,0,0,0,0,0,0,1,0,1
9889,1201.0,PrimClass_Janelle,2.100000,PrimClass,2015-01-17 05:00:00,4.0,1,2015,17,5,...,0,0,0,0,0,0,0,1,0,1
9890,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-01-17 06:00:00,3.0,1,2015,17,6,...,0,0,0,0,0,0,0,1,0,1
9891,1201.0,PrimClass_Janelle,2.100000,PrimClass,2015-01-17 07:00:00,3.0,1,2015,17,7,...,0,0,0,0,0,0,0,1,0,1
9892,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-01-17 08:00:00,2.0,1,2015,17,8,...,0,0,0,0,0,0,0,1,0,1
9893,1201.0,PrimClass_Janelle,2.100000,PrimClass,2015-01-17 09:00:00,3.0,1,2015,17,9,...,0,0,0,0,0,0,0,1,0,1
9894,1201.0,PrimClass_Janelle,2.000000,PrimClass,2015-01-17 10:00:00,4.0,1,2015,17,10,...,0,0,0,0,0,0,0,1,0,1
9895,1201.0,PrimClass_Janelle,2.400000,PrimClass,2015-01-17 11:00:00,4.0,1,2015,17,11,...,0,0,0,0,0,0,0,1,0,1
