In [9]:
import warnings
warnings.filterwarnings('ignore')
import os
import sys
# Resolve the directory two levels above the current working directory and
# put it on the import path (once), presumably so the `src` package imported
# in the next cell can be found -- TODO confirm against the repo layout.
two_levels_up = os.path.join(os.pardir, os.pardir)
module_path = os.path.abspath(two_levels_up)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np

In [10]:
from src.utilities import get_month
from src.store_data import DataStore

## Feature Analysis and Engineering

In [11]:
# Load the Seattle dataset from the processed-data folder.
# NOTE(review): the `to_csv` cell further down writes to this very same path,
# so a second run of the notebook reads its own previous output.
dataset = pd.read_csv(filepath_or_buffer='../../data/data1/processed/dataset_seattle.csv')

In [12]:
# Number of missing values in each column.
dataset.isna().sum(axis='index')

Unnamed: 0                                         0
Cooling:Electricity [kW](Hourly)                   0
Date/Time                                          0
Electricity:Facility [kW](Hourly)                  0
Fans:Electricity [kW](Hourly)                      0
Gas:Facility [kW](Hourly)                          0
Heating:Electricity [kW](Hourly)                   0
Heating:Gas [kW](Hourly)                           0
InteriorEquipment:Electricity [kW](Hourly)         0
InteriorEquipment:Gas [kW](Hourly)                 0
InteriorLights:Electricity [kW](Hourly)            0
Water Heater:WaterSystems:Gas [kW](Hourly)         0
building_type                                      0
location                                           0
location_id                                        0
building_id                                        0
Date                                               0
Holiday                                       136296
Holiday_code                                  

In [13]:
# One-hot encode the holiday label: one indicator column per distinct non-null
# value of `Holiday` (rows where `Holiday` is missing get zeros everywhere,
# since get_dummies ignores NaN by default).
dataset_expand_holidays = pd.get_dummies(dataset.loc[:, 'Holiday'])

# Append the indicator columns to the right of the existing columns.
dataset = pd.concat((dataset, dataset_expand_holidays), axis='columns')

# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),InteriorEquipment:Gas [kW](Hourly),...,Christmas Day,Columbus Day,Independence Day,Labor Day,Martin Luther King Day,Memorial Day,New Years Day,Presidents Day,Thanksgiving,Veterans Day
0,1560,0.0,01/01 01:00:00,85.494324,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,1,0,0,0
1,1561,0.0,01/01 02:00:00,84.794544,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,1,0,0,0
2,1562,0.0,01/01 03:00:00,84.201696,0.0,2.915564,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,1,0,0,0
3,1563,0.0,01/01 04:00:00,83.726241,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,1,0,0,0
4,1564,0.0,01/01 05:00:00,89.848974,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# Remove the original holiday label and its code column; the per-holiday
# indicator columns created in the previous cell stay in the frame.
dataset = dataset.drop(['Holiday_code', 'Holiday'], axis=1)

# Load the per-date calendar features and preview the first five rows.
weekday_features = pd.read_csv(filepath_or_buffer='../../data/data1/raw/weekday_features.csv')
weekday_features.head()

Unnamed: 0,Date,Day,is_weekend,is_holiday,Holiday_name
0,01/01,Thursday,0,1,New Years Day
1,01/02,Friday,0,0,
2,01/03,Saturday,1,0,
3,01/04,Sunday,1,0,
4,01/05,Monday,0,0,


In [15]:
# Attach the calendar features to every row by matching on `Date`.
# Inner join: rows whose `Date` has no counterpart in `weekday_features`
# are dropped.
dataset = dataset.merge(weekday_features, how='inner', on='Date')

# Non-null count per column after the join.
dataset.count()

Unnamed: 0                                    140112
Cooling:Electricity [kW](Hourly)              140112
Date/Time                                     140112
Electricity:Facility [kW](Hourly)             140112
Fans:Electricity [kW](Hourly)                 140112
Gas:Facility [kW](Hourly)                     140112
Heating:Electricity [kW](Hourly)              140112
Heating:Gas [kW](Hourly)                      140112
InteriorEquipment:Electricity [kW](Hourly)    140112
InteriorEquipment:Gas [kW](Hourly)            140112
InteriorLights:Electricity [kW](Hourly)       140112
Water Heater:WaterSystems:Gas [kW](Hourly)    140112
building_type                                 140112
location                                      140112
location_id                                   140112
building_id                                   140112
Date                                          140112
time                                          140112
sunriseTime                                   

### One Hot Encoding

In [16]:
# One indicator column per distinct value of `Day`.
dataset_expand_days = pd.get_dummies(dataset.loc[:, 'Day'])

# Keep a copy of the day label under the name `day_of_week`.
dataset['day_of_week'] = dataset.loc[:, 'Day']

# Append the indicator columns to the right of the existing columns.
dataset = pd.concat((dataset, dataset_expand_days), axis='columns')

# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),InteriorEquipment:Gas [kW](Hourly),...,is_holiday,Holiday_name,day_of_week,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,1560,0.0,01/01 01:00:00,85.494324,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,1,New Years Day,Thursday,0,0,0,0,1,0,0
1,1561,0.0,01/01 02:00:00,84.794544,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,1,New Years Day,Thursday,0,0,0,0,1,0,0
2,1562,0.0,01/01 03:00:00,84.201696,0.0,2.915564,0.0,0.0,6.812987,1.75125,...,1,New Years Day,Thursday,0,0,0,0,1,0,0
3,1563,0.0,01/01 04:00:00,83.726241,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,1,New Years Day,Thursday,0,0,0,0,1,0,0
4,1564,0.0,01/01 05:00:00,89.848974,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,1,New Years Day,Thursday,0,0,0,0,1,0,0


In [17]:
# List every column label currently in the frame, including the indicator
# columns appended by the cells above.
dataset.columns

Index(['Unnamed: 0', 'Cooling:Electricity [kW](Hourly)', 'Date/Time',
       'Electricity:Facility [kW](Hourly)', 'Fans:Electricity [kW](Hourly)',
       'Gas:Facility [kW](Hourly)', 'Heating:Electricity [kW](Hourly)',
       'Heating:Gas [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)',
       'InteriorEquipment:Gas [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'Water Heater:WaterSystems:Gas [kW](Hourly)', 'building_type',
       'location', 'location_id', 'building_id', 'Date', 'time', 'sunriseTime',
       'sunsetTime', 'temperatureHigh', 'dewPoint', 'humidity', 'windSpeed',
       'cloudCover', 'Electricity:Facility [kW](Hourly)_lag',
       'Electricity:Facility [kW](Hourly)_future',
       'Electricity:Facility_delta_current_lag', 'Christmas Day',
       'Columbus Day', 'Independence Day', 'Labor Day',
       'Martin Luther King Day', 'Memorial Day', 'New Years Day',
       'Presidents Day', 'Thanksgiving', 'Veterans Day', 'Day', 'is_we

In [18]:
# One indicator column per distinct `building_type` and per distinct `location`.
dataset_expand_building_type = pd.get_dummies(dataset.loc[:, 'building_type'])
dataset_expand_location = pd.get_dummies(dataset.loc[:, 'location'])

# Append both groups in one pass: building-type indicators first, then the
# location indicators, to the right of the existing columns.
dataset = pd.concat(
    (dataset, dataset_expand_building_type, dataset_expand_location),
    axis='columns',
)

# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Cooling:Electricity [kW](Hourly),Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Gas:Facility [kW](Hourly),Heating:Electricity [kW](Hourly),Heating:Gas [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),InteriorEquipment:Gas [kW](Hourly),...,RefBldgPrimarySchoolNew,RefBldgQuickServiceRestaurantNew,RefBldgSecondarySchoolNew,RefBldgSmallHotelNew,RefBldgSmallOfficeNew,RefBldgStand-aloneRetailNew,RefBldgStripMallNew,RefBldgSuperMarketNew,RefBldgWarehouseNew,USA_WA_SEATTLE
0,1560,0.0,01/01 01:00:00,85.494324,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,0,1,0,1
1,1561,0.0,01/01 02:00:00,84.794544,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,0,1,0,1
2,1562,0.0,01/01 03:00:00,84.201696,0.0,2.915564,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,0,1,0,1
3,1563,0.0,01/01 04:00:00,83.726241,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,0,1,0,1
4,1564,0.0,01/01 05:00:00,89.848974,0.0,1.77125,0.0,0.0,6.812987,1.75125,...,0,0,0,0,0,0,0,1,0,1


In [19]:
# Derive a `month` column by passing each `Date` value through the project
# helper `get_month` (imported from src.utilities).
date_values = dataset['Date']
dataset['month'] = date_values.apply(get_month)

In [20]:
# Persist the engineered feature table as CSV.
# index=False: the row index is written out as an extra unnamed leading column
# otherwise, which comes back as `Unnamed: 0` on the next `read_csv` -- the
# input loaded at the top of this notebook already carries one such column,
# and every further write/read cycle would add another.
# NOTE(review): this is the same path the notebook reads its input from, so the
# input is overwritten and a second run starts from already-engineered data
# (e.g. the `Holiday` column is gone by then). Consider a separate output file
# once downstream readers can be updated.
dataset.to_csv('../../data/data1/processed/dataset_seattle.csv', index=False)

In [21]:
# Show the facility electricity reading next to its `_lag`, `_future` and
# delta columns, together with the timestamp, for the first 50 rows.
electricity_preview_columns = [
    'Electricity:Facility [kW](Hourly)',
    'Electricity:Facility [kW](Hourly)_lag',
    'Electricity:Facility [kW](Hourly)_future',
    'Electricity:Facility_delta_current_lag',
    'Date/Time',
]
dataset.loc[:, electricity_preview_columns].head(n=50)

Unnamed: 0,Electricity:Facility [kW](Hourly),Electricity:Facility [kW](Hourly)_lag,Electricity:Facility [kW](Hourly)_future,Electricity:Facility_delta_current_lag,Date/Time
0,85.494324,17.414973,14.657581,68.079351,01/01 01:00:00
1,84.794544,16.984599,14.657581,67.809945,01/01 02:00:00
2,84.201696,17.433579,14.657581,66.768117,01/01 03:00:00
3,83.726241,16.901739,14.657581,66.824502,01/01 04:00:00
4,89.848974,17.410047,14.657581,72.438928,01/01 05:00:00
5,89.201522,16.899693,14.657581,72.30183,01/01 06:00:00
6,126.340442,17.407501,14.657581,108.93294,01/01 07:00:00
7,138.090475,16.927552,14.657581,121.162923,01/01 08:00:00
8,159.347215,8.489757,13.470692,150.857458,01/01 09:00:00
9,134.335479,7.90289,13.470692,126.432589,01/01 10:00:00


In [22]:
# NOTE(review): disabled DataStore snippet, kept as a bare string literal rather
# than as comments, so this cell only evaluates to (and echoes) the text below.
# `energystats` is not defined in any cell visible here and the DataStore host
# is hard-coded; both need attention before this is re-enabled.
"""
data_store = DataStore('35.227.50.121')
data_store.connect_to_database()
data_store.store_data(energystats)
"""

"\ndata_store = DataStore('35.227.50.121')\ndata_store.connect_to_database()\ndata_store.store_data(energystats)\n"