In [6]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np

from store_data import DataStore

## Feature Analysis and Engineering

In [7]:
# Load the Seattle dataset from CSV into `dataset`, the frame the following cells transform.
dataset = pd.read_csv('data1/dataset_seattle.csv')

In [8]:
# One-hot encode the 'Holiday' column and append the resulting indicator
# columns to the right of the existing columns.
dataset_expand_holidays = pd.get_dummies(dataset['Holiday'])
dataset = pd.concat(
    objs=[dataset, dataset_expand_holidays],
    axis='columns',
)
dataset.head()

Unnamed: 0.1,Unnamed: 0,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Electricity:Facility [kW](Monthly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Gas:Facility [kW](Monthly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),...,Christmas Day,Columbus Day,Independence Day,Labor Day,Martin Luther King Day,Memorial Day,New Years Day,Presidents Day,Thanksgiving,Veterans Day
0,1536,0.0,01/01 01:00:00,17.414973,,1.999148,64.870091,,0.0,64.870091,...,0,0,0,0,0,0,1,0,0,0
1,1537,0.0,01/01 02:00:00,16.984599,,1.568775,50.797116,,0.0,50.797116,...,0,0,0,0,0,0,1,0,0,0
2,1538,0.0,01/01 03:00:00,17.433579,,2.017755,65.585215,,0.0,65.585215,...,0,0,0,0,0,0,1,0,0,0
3,1539,0.0,01/01 04:00:00,16.901739,,1.485915,48.049542,,0.0,48.049542,...,0,0,0,0,0,0,1,0,0,0
4,1540,0.0,01/01 05:00:00,17.410047,,1.994222,64.77888,,0.0,64.77888,...,0,0,0,0,0,0,1,0,0,0


In [9]:
# Remove the original holiday columns from the dataset.
holiday_source_columns = ['Holiday_code', 'Holiday']
dataset = dataset.drop(holiday_source_columns, axis='columns')

# Load the per-date weekday features and preview the first rows.
weekday_features = pd.read_csv('data/weekday_features.csv')
weekday_features.head()

Unnamed: 0,Date,Day,is_weekend,is_holiday,Holiday_name
0,01/01,Thursday,0,1,New Years Day
1,01/02,Friday,0,0,
2,01/03,Saturday,1,0,
3,01/04,Sunday,1,0,
4,01/05,Monday,0,0,


In [10]:
# Inner-join the weekday features onto the dataset using the shared 'Date' column,
# then show the non-null count of every column.
dataset = dataset.merge(weekday_features, how='inner', on='Date')
dataset.count()

Unnamed: 0                                    140160
Cooling:Electricity [kW](Hourly)              140160
Date/Time                                     140160
Electricity:Facility [kW](Hourly)             140160
Electricity:Facility [kW](Monthly)                 0
Fans:Electricity [kW](Hourly)                 140160
Gas:Facility [kW](Hourly)                     140160
Gas:Facility [kW](Monthly)                         0
Heating:Electricity [kW](Hourly)              140160
Heating:Gas [kW](Hourly)                      140160
InteriorEquipment:Electricity [kW](Hourly)    140160
InteriorEquipment:Gas [kW](Hourly)             78840
InteriorLights:Electricity [kW](Hourly)       140160
Water Heater:WaterSystems:Gas [kW](Hourly)    113880
building_type                                 140160
location                                      140160
location_id                                   140160
building_id                                   140160
Date                                          

In [11]:
# Replace missing values with 0 in the columns that contain NaNs.
# The filled columns are assigned back to `dataset` rather than calling
# `dataset[col].fillna(0, inplace=True)`: that form mutates a column selection
# in place, which pandas deprecates and which leaves `dataset` unchanged when
# Copy-on-Write is enabled.
zero_fill_columns = [
    "Electricity:Facility [kW](Monthly)",
    "Gas:Facility [kW](Monthly)",
    "InteriorEquipment:Gas [kW](Hourly)",
    "Water Heater:WaterSystems:Gas [kW](Hourly)",
]
dataset[zero_fill_columns] = dataset[zero_fill_columns].fillna(0)
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Electricity:Facility [kW](Monthly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Gas:Facility [kW](Monthly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),...,Martin Luther King Day,Memorial Day,New Years Day,Presidents Day,Thanksgiving,Veterans Day,Day,is_weekend,is_holiday,Holiday_name
0,1536,0.0,01/01 01:00:00,17.414973,0.0,1.999148,64.870091,0.0,0.0,64.870091,...,0,0,1,0,0,0,Thursday,0,1,New Years Day
1,1537,0.0,01/01 02:00:00,16.984599,0.0,1.568775,50.797116,0.0,0.0,50.797116,...,0,0,1,0,0,0,Thursday,0,1,New Years Day
2,1538,0.0,01/01 03:00:00,17.433579,0.0,2.017755,65.585215,0.0,0.0,65.585215,...,0,0,1,0,0,0,Thursday,0,1,New Years Day
3,1539,0.0,01/01 04:00:00,16.901739,0.0,1.485915,48.049542,0.0,0.0,48.049542,...,0,0,1,0,0,0,Thursday,0,1,New Years Day
4,1540,0.0,01/01 05:00:00,17.410047,0.0,1.994222,64.77888,0.0,0.0,64.77888,...,0,0,1,0,0,0,Thursday,0,1,New Years Day


### One-Hot Encoding

In [12]:
# One-hot encode the 'Day' column (weekday names) and append the indicator columns.
day_column = dataset['Day']
dataset_expand_days = pd.get_dummies(day_column)
dataset = pd.concat([dataset, dataset_expand_days], axis='columns')
dataset.head()

Unnamed: 0.1,Unnamed: 0,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Electricity:Facility [kW](Monthly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Gas:Facility [kW](Monthly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),...,is_weekend,is_holiday,Holiday_name,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,1536,0.0,01/01 01:00:00,17.414973,0.0,1.999148,64.870091,0.0,0.0,64.870091,...,0,1,New Years Day,0,0,0,0,1,0,0
1,1537,0.0,01/01 02:00:00,16.984599,0.0,1.568775,50.797116,0.0,0.0,50.797116,...,0,1,New Years Day,0,0,0,0,1,0,0
2,1538,0.0,01/01 03:00:00,17.433579,0.0,2.017755,65.585215,0.0,0.0,65.585215,...,0,1,New Years Day,0,0,0,0,1,0,0
3,1539,0.0,01/01 04:00:00,16.901739,0.0,1.485915,48.049542,0.0,0.0,48.049542,...,0,1,New Years Day,0,0,0,0,1,0,0
4,1540,0.0,01/01 05:00:00,17.410047,0.0,1.994222,64.77888,0.0,0.0,64.77888,...,0,1,New Years Day,0,0,0,0,1,0,0


In [13]:
# Display the column labels of `dataset` at this point in the notebook.
dataset.columns

Index(['Unnamed: 0', 'Cooling:Electricity [kW](Hourly)', 'Date/Time',
       'Electricity:Facility [kW](Hourly)',
       'Electricity:Facility [kW](Monthly)', 'Fans:Electricity [kW](Hourly)',
       'Gas:Facility [kW](Hourly)', 'Gas:Facility [kW](Monthly)',
       'Heating:Electricity [kW](Hourly)', 'Heating:Gas [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)',
       'InteriorEquipment:Gas [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'Water Heater:WaterSystems:Gas [kW](Hourly)', 'building_type',
       'location', 'location_id', 'building_id', 'Date', 'time', 'sunriseTime',
       'sunsetTime', 'temperatureHigh', 'dewPoint', 'humidity', 'windSpeed',
       'cloudCover', 'Christmas Day', 'Columbus Day', 'Independence Day',
       'Labor Day', 'Martin Luther King Day', 'Memorial Day', 'New Years Day',
       'Presidents Day', 'Thanksgiving', 'Veterans Day', 'Day', 'is_weekend',
       'is_holiday', 'Holiday_name', 'Friday', 'Monday', 'Saturda

In [14]:
# One-hot encode 'building_type' and 'location'. Both indicator frames are built
# from the current dataset and appended in a single concat, building-type
# columns first and location columns after them.
dataset_expand_building_type = pd.get_dummies(dataset['building_type'])
dataset_expand_location = pd.get_dummies(dataset['location'])
dataset = pd.concat(
    [dataset, dataset_expand_building_type, dataset_expand_location],
    axis=1,
)
dataset.head()

Unnamed: 0.1,Unnamed: 0,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Electricity:Facility [kW](Monthly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Gas:Facility [kW](Monthly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),...,RefBldgPrimarySchoolNew,RefBldgQuickServiceRestaurantNew,RefBldgSecondarySchoolNew,RefBldgSmallHotelNew,RefBldgSmallOfficeNew,RefBldgStand-aloneRetailNew,RefBldgStripMallNew,RefBldgSuperMarketNew,RefBldgWarehouseNew,USA_WA_SEATTLE
0,1536,0.0,01/01 01:00:00,17.414973,0.0,1.999148,64.870091,0.0,0.0,64.870091,...,0,0,0,0,0,0,0,0,1,1
1,1537,0.0,01/01 02:00:00,16.984599,0.0,1.568775,50.797116,0.0,0.0,50.797116,...,0,0,0,0,0,0,0,0,1,1
2,1538,0.0,01/01 03:00:00,17.433579,0.0,2.017755,65.585215,0.0,0.0,65.585215,...,0,0,0,0,0,0,0,0,1,1
3,1539,0.0,01/01 04:00:00,16.901739,0.0,1.485915,48.049542,0.0,0.0,48.049542,...,0,0,0,0,0,0,0,0,1,1
4,1540,0.0,01/01 05:00:00,17.410047,0.0,1.994222,64.77888,0.0,0.0,64.77888,...,0,0,0,0,0,0,0,0,1,1


In [15]:
# Persist the engineered frame. index=False keeps the row index out of the file;
# writing it is what creates an extra 'Unnamed: 0' column the next time the file
# is loaded with read_csv.
# NOTE(review): this overwrites the same file the notebook loads at the start, so
# a second run starts from the already-transformed data (e.g. 'Holiday' has been
# dropped by then) — consider writing to a separate output path.
dataset.to_csv('data1/dataset_seattle.csv', index=False)

In [16]:
"""
data_store = DataStore('35.227.50.121')
data_store.connect_to_database()
data_store.store_data(energystats)
"""

"\ndata_store = DataStore('35.227.50.121')\ndata_store.connect_to_database()\ndata_store.store_data(energystats)\n"