In [19]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np

from store_data import DataStore

## Feature Analysis and Engineering

In [20]:
# Load the working dataset from CSV into a DataFrame.
# NOTE: a later cell writes the transformed frame back to this same path, so a
# second run of the notebook starts from the already-transformed file.
dataset = pd.read_csv('data1/dataset.csv')

In [21]:
# Count the missing (NaN) values in each column.
dataset.isna().sum()

Unnamed: 0                                          0
Unnamed: 0.1                                        0
Cooling:Electricity [kW](Hourly)                    0
Date/Time                                           0
Electricity:Facility [kW](Hourly)                   0
Fans:Electricity [kW](Hourly)                       0
Gas:Facility [kW](Hourly)                           0
Heating:Electricity [kW](Hourly)                    0
Heating:Gas [kW](Hourly)                            0
InteriorEquipment:Electricity [kW](Hourly)          0
InteriorEquipment:Gas [kW](Hourly)                  0
InteriorLights:Electricity [kW](Hourly)             0
Water Heater:WaterSystems:Gas [kW](Hourly)          0
building_type                                       0
location                                            0
location_id                                         0
building_id                                         0
Date                                                0
Holiday                     

In [22]:
# One-hot encode the 'Holiday' column: one indicator column per distinct value,
# appended to the right of the existing columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce bool (True/False) columns,
# which would also change what gets written to CSV later on.
dataset_expand_holidays = pd.get_dummies(dataset['Holiday'], dtype=np.uint8)
dataset = pd.concat([dataset, dataset_expand_holidays], axis=1)
dataset.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),...,Christmas Day,Columbus Day,Independence Day,Labor Day,Martin Luther King Day,Memorial Day,New Years Day,Presidents Day,Thanksgiving,Veterans Day
0,0,2328,0.0,01/01 01:00:00,80.931046,0.0,1.77125,0.0,0.0,6.812987,...,0,0,0,0,0,0,1,0,0,0
1,1,2329,0.0,01/01 02:00:00,80.384874,0.0,1.77125,0.0,0.0,6.812987,...,0,0,0,0,0,0,1,0,0,0
2,2,2330,0.0,01/01 03:00:00,79.658969,0.0,2.915533,0.0,0.0,6.812987,...,0,0,0,0,0,0,1,0,0,0
3,3,2331,0.0,01/01 04:00:00,79.056419,0.0,1.77125,0.0,0.0,6.812987,...,0,0,0,0,0,0,1,0,0,0
4,4,2332,0.0,01/01 05:00:00,85.446387,0.0,1.77125,0.0,0.0,6.812987,...,0,0,0,0,0,0,1,0,0,0


In [23]:
# Remove the raw holiday columns from the frame, then load the calendar
# features table and preview its first rows.
dataset = dataset.drop(['Holiday_code', 'Holiday'], axis=1)
weekday_features = pd.read_csv('data/weekday_features.csv')
weekday_features.head()

Unnamed: 0,Date,Day,is_weekend,is_holiday,Holiday_name
0,01/01,Thursday,0,1,New Years Day
1,01/02,Friday,0,0,
2,01/03,Saturday,1,0,
3,01/04,Sunday,1,0,
4,01/05,Monday,0,0,


In [24]:
# Attach the calendar features to the dataset by matching on 'Date'.
# An inner join keeps only rows whose 'Date' value is present in both frames.
dataset = dataset.merge(weekday_features, how='inner', on='Date')
# Non-null count per column, as a quick check of the merged frame's size.
dataset.count()

Unnamed: 0                                    1681344
Unnamed: 0.1                                  1681344
Cooling:Electricity [kW](Hourly)              1681344
Date/Time                                     1681344
Electricity:Facility [kW](Hourly)             1681344
Fans:Electricity [kW](Hourly)                 1681344
Gas:Facility [kW](Hourly)                     1681344
Heating:Electricity [kW](Hourly)              1681344
Heating:Gas [kW](Hourly)                      1681344
InteriorEquipment:Electricity [kW](Hourly)    1681344
InteriorEquipment:Gas [kW](Hourly)            1681344
InteriorLights:Electricity [kW](Hourly)       1681344
Water Heater:WaterSystems:Gas [kW](Hourly)    1681344
building_type                                 1681344
location                                      1681344
location_id                                   1681344
building_id                                   1681344
Date                                          1681344
time                        

### One-Hot Encoding

In [25]:
# One-hot encode the 'Day' column: one indicator column per distinct value,
# appended to the right of the existing columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce bool (True/False) columns.
dataset_expand_days = pd.get_dummies(dataset['Day'], dtype=np.uint8)
dataset = pd.concat([dataset, dataset_expand_days], axis=1)
dataset.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),...,is_weekend,is_holiday,Holiday_name,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0,2328,0.0,01/01 01:00:00,80.931046,0.0,1.77125,0.0,0.0,6.812987,...,0,1,New Years Day,0,0,0,0,1,0,0
1,1,2329,0.0,01/01 02:00:00,80.384874,0.0,1.77125,0.0,0.0,6.812987,...,0,1,New Years Day,0,0,0,0,1,0,0
2,2,2330,0.0,01/01 03:00:00,79.658969,0.0,2.915533,0.0,0.0,6.812987,...,0,1,New Years Day,0,0,0,0,1,0,0
3,3,2331,0.0,01/01 04:00:00,79.056419,0.0,1.77125,0.0,0.0,6.812987,...,0,1,New Years Day,0,0,0,0,1,0,0
4,4,2332,0.0,01/01 05:00:00,85.446387,0.0,1.77125,0.0,0.0,6.812987,...,0,1,New Years Day,0,0,0,0,1,0,0


In [26]:
# Display the current column labels of the frame.
dataset.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Cooling:Electricity [kW](Hourly)',
       'Date/Time', 'Electricity:Facility [kW](Hourly)',
       'Fans:Electricity [kW](Hourly)', 'Gas:Facility [kW](Hourly)',
       'Heating:Electricity [kW](Hourly)', 'Heating:Gas [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)',
       'InteriorEquipment:Gas [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'Water Heater:WaterSystems:Gas [kW](Hourly)', 'building_type',
       'location', 'location_id', 'building_id', 'Date', 'time', 'sunriseTime',
       'sunsetTime', 'temperatureHigh', 'dewPoint', 'humidity', 'windSpeed',
       'cloudCover', 'Electricity:Facility [kW](Hourly)_lag',
       'Electricity:Facility [kW](Hourly)_future',
       'Electricity:Facility_delta_current_lag', 'Christmas Day',
       'Columbus Day', 'Independence Day', 'Labor Day',
       'Martin Luther King Day', 'Memorial Day', 'New Years Day',
       'Presidents Day', 'Thanksgiving', 'Veterans Day

In [27]:
# One-hot encode the two remaining categorical columns, 'building_type' and
# 'location'. Each gets one indicator column per distinct value, appended to
# the right of the existing columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce bool (True/False) columns.
dataset_expand_building_type = pd.get_dummies(dataset['building_type'], dtype=np.uint8)
dataset = pd.concat([dataset, dataset_expand_building_type], axis=1)
dataset_expand_location = pd.get_dummies(dataset['location'], dtype=np.uint8)
dataset = pd.concat([dataset, dataset_expand_location], axis=1)
dataset.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),...,USA_CA_LOS_ANGELES,USA_CA_SAN_FRANCISCO,USA_CO_BOULDER,USA_GA_ATLANTA,USA_IL_CHICAGO-OHARE,USA_MD_BALTIMORE,USA_MN_MINNEAPOLIS,USA_NV_LAS_VEGAS,USA_TX_HOUSTON,USA_WA_SEATTLE
0,0,2328,0.0,01/01 01:00:00,80.931046,0.0,1.77125,0.0,0.0,6.812987,...,0,0,0,1,0,0,0,0,0,0
1,1,2329,0.0,01/01 02:00:00,80.384874,0.0,1.77125,0.0,0.0,6.812987,...,0,0,0,1,0,0,0,0,0,0
2,2,2330,0.0,01/01 03:00:00,79.658969,0.0,2.915533,0.0,0.0,6.812987,...,0,0,0,1,0,0,0,0,0,0
3,3,2331,0.0,01/01 04:00:00,79.056419,0.0,1.77125,0.0,0.0,6.812987,...,0,0,0,1,0,0,0,0,0,0
4,4,2332,0.0,01/01 05:00:00,85.446387,0.0,1.77125,0.0,0.0,6.812987,...,0,0,0,1,0,0,0,0,0,0


In [28]:
# Persist the engineered frame. index=False stops pandas from writing the row
# index as an extra unnamed column; without it, every write/read cycle of this
# file adds another 'Unnamed: N' column when it is loaded with read_csv.
# NOTE: this overwrites the same file the notebook loads at the start, so
# re-running the notebook operates on already-transformed data.
dataset.to_csv('data1/dataset.csv', index=False)

In [29]:
"""
data_store = DataStore('35.227.50.121')
data_store.connect_to_database()
data_store.store_data(energystats)
"""

"\ndata_store = DataStore('35.227.50.121')\ndata_store.connect_to_database()\ndata_store.store_data(energystats)\n"