In [1]:
import numpy as np
from numpy import random
import math
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

%matplotlib inline

In [2]:
df = pd.read_csv('../data/raw/csvs/train.csv', parse_dates=['timestamp'])
# set indexes
df.set_index(['building_id','timestamp'], inplace=True)
# sort index in order to use .loc
df.sort_index(inplace=True)

<b>I. DataFrame overview</b>

train.csv

<ul>
    <li>building_id - Foreign key for the building metadata.</li>
    <li>meter - The meter id code. Read as {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}. Not every building has all meter types.</li>
    <li>timestamp - When the measurement was taken</li>
    <li>meter_reading - The target variable. Energy consumption in kWh (or equivalent). Note that this is real data with measurement error, which we expect will impose a baseline level of modeling error. UPDATE: as discussed here, the site 0 electric meter readings are in kBTU.</li>
</ul> 

In [3]:
df.shape

(20216100, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 20216100 entries, (0, 2016-01-01 00:00:00) to (1448, 2016-12-31 23:00:00)
Data columns (total 2 columns):
meter            int64
meter_reading    float64
dtypes: float64(1), int64(1)
memory usage: 385.7 MB


In [5]:
# Are there NaN values ? No.
df.isna().sum()

meter            0
meter_reading    0
dtype: int64

<b>II. Building_id</b>

In [6]:
building_ids = df.index.get_level_values('building_id')
print('n buildings : {}'.format(building_ids.nunique()))

n buildings : 1449


In [7]:
# for the moment we'll drop site_0 buildings

In [8]:
# Load building_metadata and set index
bdata = pd.read_csv('../data/raw/csvs/building_metadata.csv', index_col='building_id')
bdata.head()

Unnamed: 0_level_0,site_id,primary_use,square_feet,year_built,floor_count
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,Education,7432,2008.0,
1,0,Education,2720,2004.0,
2,0,Education,5376,1991.0,
3,0,Education,23685,2002.0,
4,0,Education,116607,1975.0,


In [9]:
print('Number of site 0 buildings in our subset : {}'.format(bdata['site_id'].value_counts()[0]))

Number of site 0 buildings in our subset : 105


In [10]:
# kick out site_ids == 0
bdata_w0 = bdata[bdata['site_id'] != 0]

<b>Randomly select a building</b>

In [12]:
# Randomly select a building
random.seed(2)
building_list = bdata.index.tolist()
sampled_id_list = random.choice(building_list, 10)
print(sampled_id_list)

[1192  527  493  299  466 1099  360 1287  674  433]


Get corresponding metadata, available meters, and n_timestamps by meters.

In [14]:
bdata.loc[sampled_id_list]

Unnamed: 0_level_0,site_id,primary_use,square_feet,year_built,floor_count
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1192,13,Office,43587,,
527,3,Public services,9392,,
493,3,Education,283000,2014.0,
299,3,Education,356000,2014.0,
466,3,Public services,22500,,
1099,13,Education,332884,,
360,3,Education,69600,1949.0,
1287,14,Education,60582,,
674,5,Entertainment/public assembly,149704,1976.0,8.0
433,3,Entertainment/public assembly,23055,,


In [15]:
df_cpdg_meter = df.loc[sampled_id_list].copy()

In [17]:
df_cpdg_meter.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,meter,meter_reading
building_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
299,2016-02-04 01:00:00,0,36.12
299,2016-02-04 02:00:00,0,35.34
299,2016-02-04 03:00:00,0,35.37
299,2016-02-04 04:00:00,0,35.52
299,2016-02-04 05:00:00,0,38.73


In [18]:
grouped = df_cpdg_meter.groupby(['building_id', 'meter']).count()
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,meter_reading
building_id,meter,Unnamed: 2_level_1
299,0,7965
360,0,8758
433,0,8782
466,0,8782
493,0,8782
527,0,8758
674,0,8784
1099,0,8782
1099,2,8784
1192,2,8784
