## Number of sites and Buildings

In [2]:
import pandas as pd
import numpy as np
import datetime as datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
import psycopg2
from psycopg2 import Error

import os
path = os.path.join(os.path.expanduser('~'), 'Documents', 'github.passwords', 'energy.predictor.2020.config')
print (path)

myfile = open(path,"rt")
config= myfile.read()

/home/douglas/Documents/github.passwords/energy.predictor.2020.config


In [14]:
connection = psycopg2.connect(config)

cursor = connection.cursor()

# Read PostgreSQL into a dataframe
sql =''' SELECT 
            COUNT(DISTINCT bm.building_id) AS num_buildings,
            COUNT(DISTINCT bm.site_id) AS num_sites
        FROM 
             building_metadata AS bm
            ;'''
num_sites_buildings = pd.read_sql_query(sql,connection)

if (connection):
        cursor.close()
        connection.close()
        print("PostgreSQL connection is closed")

PostgreSQL connection is closed


In [18]:
num_sites_buildings

Unnamed: 0,num_buildings,num_sites
0,1449,16


## Range of Values

In [24]:
connection = psycopg2.connect(config)

cursor = connection.cursor()

# Read PostgreSQL into a dataframe
sql =''' SELECT 
            MAX(t.timestamp) as max_time,
            MIN(t.timestamp) as min_time,
            MAX(w.air_temperature) as max_temp,
            MIN(w.air_temperature) as min_temp,
            MAX(w.dew_temperature) as max_dew_temp,
            MIN(w.dew_temperature) as min_dew_temp,
            MAX(w.sea_level_pressure) as max_sea_level,
            MIN(w.sea_level_pressure) as min_sea_level,
            MAX(w.wind_direction) as max_wind_d,
            MIN(w.wind_direction) as min_wind_d,
            MAX(w.wind_speed) as max_wind_speed,
            MIN(w.wind_speed) as min_wind_speed
        FROM 
             weather_train AS w
         INNER JOIN building_metadata AS bm
             ON (bm.site_id = w.site_id)
         INNER JOIN train AS t
             ON (t.timestamp = w.timestamp) AND (t.building_id = bm.building_id)
         WHERE 
             w.timestamp IS NOT NULL 
             AND w.air_temperature IS NOT NULL 
             AND w.dew_temperature IS NOT NULL
             AND w.sea_level_pressure IS NOT NULL 
             AND w.wind_direction IS NOT NULL 
             AND w.wind_speed IS NOT NULL
            ;'''
max_min = pd.read_sql_query(sql,connection)

if (connection):
        cursor.close()
        connection.close()
        print("PostgreSQL connection is closed")

PostgreSQL connection is closed


In [25]:
max_min

Unnamed: 0,max_time,min_time,max_temp,min_temp,max_dew_temp,min_dew_temp,max_sea_level,min_sea_level,max_wind_d,min_wind_d,max_wind_speed,min_wind_speed
0,2016-12-31 23:00:00,2016-01-01,47.2,-28.9,26.1,-35.0,1045.5,968.2,360.0,0.0,19.0,0.0


### temp in celsius

In [50]:
del max_min_readings

connection = psycopg2.connect(config)

cursor = connection.cursor()

# Read PostgreSQL into a dataframe
sql =''' SELECT 
            bm.site_id,
            t.meter,
            MAX(t.meter_reading) as max_reading,
            MIN(t.meter_reading) as min_reading
        FROM 
             weather_train AS w
         INNER JOIN building_metadata AS bm
             ON (bm.site_id = w.site_id)
         INNER JOIN train AS t
             ON (t.timestamp = w.timestamp) AND (t.building_id = bm.building_id)
         WHERE 
             w.timestamp IS NOT NULL 
             AND w.air_temperature IS NOT NULL 
             AND w.dew_temperature IS NOT NULL
             AND w.sea_level_pressure IS NOT NULL 
             AND w.wind_direction IS NOT NULL 
             AND w.wind_speed IS NOT NULL
         GROUP BY t.meter, bm.site_id
            ;'''
max_min_readings = pd.read_sql_query(sql,connection)

if (connection):
        cursor.close()
        connection.close()
        print("PostgreSQL connection is closed")

PostgreSQL connection is closed


In [51]:
max_min_readings.dtypes

site_id          int64
meter            int64
max_reading    float64
min_reading    float64
dtype: object

All values but site 0 in kWh

site 1 in ktbu 1 ktbu = 0.000293071 kwh

In [52]:
#max_min_readings.index.name = 'meter'
#max_min_readings.reset_index(inplace=True)

meterName = {0: 'electricity', 1: 'chilledwater', 2: 'steam', 3: 'hotwater'}
max_min_readings['meter'] = max_min_readings['meter'].map(meterName)

max_min_readings.dtypes

max_min_readings['max_reading'] = max_min_readings.apply(lambda x: x.max_reading * 0.000293071 if x.site_id == 0 else x.max_reading, axis =1)
max_min_readings['min_reading'] = max_min_readings.apply(lambda x: x.min_reading * 0.000293071 if x.site_id == 0 else x.min_reading, axis =1)

max_min_readings

Unnamed: 0,site_id,meter,max_reading,min_reading
0,0,electricity,1.324974,0.0
1,1,electricity,1482.19,0.0
2,2,electricity,3193.57,0.0
3,3,electricity,3095.44,0.0
4,4,electricity,2293.88,0.0
5,6,electricity,1296.94,0.0
6,7,electricity,17502.1,0.0
7,8,electricity,4373.17,0.0
8,9,electricity,79769.0,0.0
9,10,electricity,1076.02,0.0


Box plot distribution needed
Need to follow up with sf vs kWh, see if outliers

## Timestamp Analysis

In [5]:
connection = psycopg2.connect(config)

cursor = connection.cursor()

# Read PostgreSQL into a dataframe
sql =''' SELECT 
            t.building_id,
            t.meter,
            MAX(t.timestamp) as max_timestamp,
            MIN(t.timestamp) as min_timestamp
        FROM 
             weather_train AS w
         INNER JOIN building_metadata AS bm
             ON (bm.site_id = w.site_id)
         INNER JOIN train AS t
             ON (t.timestamp = w.timestamp) AND (t.building_id = bm.building_id)
         WHERE 
             w.timestamp IS NOT NULL 
             AND w.air_temperature IS NOT NULL 
             AND w.dew_temperature IS NOT NULL
             AND w.sea_level_pressure IS NOT NULL 
             AND w.wind_direction IS NOT NULL 
             AND w.wind_speed IS NOT NULL
         GROUP BY t.building_id,t.meter
            ;'''
timestamp_distribution = pd.read_sql_query(sql,connection)

if (connection):
        cursor.close()
        connection.close()
        print("PostgreSQL connection is closed")

PostgreSQL connection is closed


In [53]:
timestamp_distribution['length'] = timestamp_distribution['max_timestamp']-timestamp_distribution['min_timestamp'] 
timestamp_distribution

Unnamed: 0,max_timestamp,min_timestamp,length
0,2016-12-31 23:00:00,2016-01-01 00:00:00,365 days 23:00:00
1,2016-12-31 23:00:00,2016-01-01 00:00:00,365 days 23:00:00
2,2016-12-31 23:00:00,2016-01-01 00:00:00,365 days 23:00:00
3,2016-12-31 23:00:00,2016-01-01 00:00:00,365 days 23:00:00
4,2016-12-31 23:00:00,2016-01-01 00:00:00,365 days 23:00:00
...,...,...,...
2286,2016-12-31 23:00:00,2016-01-08 20:00:00,358 days 03:00:00
2287,2016-12-31 23:00:00,2016-01-08 20:00:00,358 days 03:00:00
2288,2016-12-31 23:00:00,2016-01-08 20:00:00,358 days 03:00:00
2289,2016-12-31 23:00:00,2016-01-08 20:00:00,358 days 03:00:00


If timestamp length all the same, will not help cleaning process