In [1]:
import numpy as np
import math
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

from time import localtime, strftime

%matplotlib inline

The aim is to build a training pipeline and apply it to a small number of (building, meter) pairs.

We won't use a scikit-learn pipeline, as it does not allow cleaning the data by deleting rows.

In [2]:
# Raw test set; 'timestamp' is parsed to datetime at load time.
test_df = pd.read_csv('../data/raw/csvs/test.csv', parse_dates=['timestamp'])

In [3]:
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01
1,1,1,0,2017-01-01
2,2,2,0,2017-01-01
3,3,3,0,2017-01-01
4,4,4,0,2017-01-01


In [5]:
# Building metadata restricted to the building -> site mapping.
# Indexed by building_id so it can be joined onto frames keyed by building.
bdata = pd.read_csv(
    '../data/raw/csvs/building_metadata.csv', 
    index_col='building_id', 
    usecols=['building_id', 'site_id']
)
bdata.head()

Unnamed: 0_level_0,site_id
building_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [6]:
# Attach the site of each building to the test rows (left join keeps every test row).
# NOTE(review): this rebinds test_df, so re-running the cell joins onto the already-joined frame.
test_df = test_df.join(bdata, on='building_id', how='left')

In [7]:
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id
0,0,0,0,2017-01-01,0
1,1,1,0,2017-01-01,0
2,2,2,0,2017-01-01,0
3,3,3,0,2017-01-01,0
4,4,4,0,2017-01-01,0


In [48]:
def load_and_prepare_site_data(site_id, data_folder_path,
                               timeframes=(24,),
                               features_to_avg=('air_temperature', 'dew_temperature'),
                               do_center=False):
    """Load the weather of one site and turn it into an hourly feature table.

    Steps: read ``weather_train.csv``, keep the rows of ``site_id``, drop the
    weather columns that are not used, reindex on a gap-free hourly range,
    linearly interpolate short gaps, add calendar features and rolling means,
    then drop every row that still contains a NaN.

    Parameters
    ----------
    site_id :
        Value of the ``site_id`` index level to select.
    data_folder_path : str
        Folder containing ``weather_train.csv``. It is concatenated as-is with
        the file name, so it must end with a path separator.
    timeframes : iterable of int, optional
        Rolling window sizes, in rows (hours). Default ``(24,)``.
    features_to_avg : iterable of str, optional
        Columns for which rolling means are computed.
    do_center : bool, optional
        Passed to ``rolling(center=...)``. Default ``False`` (trailing window).

    Returns
    -------
    pandas.DataFrame
        Indexed by hourly timestamps, NaN-free, with the kept weather columns,
        ``day_hour``, ``day_of_year`` and one ``<feature>_ma_<n>H`` column per
        (feature, timeframe) pair.

    Raises
    ------
    KeyError
        If ``site_id`` is absent from the file or an expected column is missing.
    """
    # Loads weather data
    raw_df_weather = pd.read_csv(data_folder_path + 'weather_train.csv',
                                 parse_dates=['timestamp'], index_col=['site_id', 'timestamp'])

    # keep only air_temperature and dew_temperature
    # Selecting on the first index level leaves a frame indexed by timestamp.
    # drop() without inplace returns a new frame, so the slice taken from the
    # raw data is never mutated (no SettingWithCopy hazard).
    b_df_weather = raw_df_weather.loc[(site_id,)].drop(
        columns=['precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
                 'wind_speed', 'cloud_coverage']
    )

    # Clean timestamps index: a gap-free hourly range between the first and last
    # observation. An offset object is used instead of the 'H' string alias,
    # which is deprecated in recent pandas releases.
    clean_index = pd.date_range(
        start=b_df_weather.index.min(),
        end=b_df_weather.index.max(),
        freq=pd.offsets.Hour()
    )
    b_df_weather = b_df_weather.reindex(index=clean_index)

    # Interpolate missing values. At most 3 consecutive NaNs are filled; what
    # remains of longer gaps stays NaN and is removed by the dropna below.
    b_df_weather = b_df_weather.interpolate(method='linear', limit=3)

    # Build time features
    b_df_weather['day_hour'] = b_df_weather.index.hour
    b_df_weather['day_of_year'] = b_df_weather.index.dayofyear

    # Builds averaged weather features.
    for c in features_to_avg:
        ts = b_df_weather[c]
        for timeframe in timeframes:
            new_col_name = '' + c + '_ma_' + str(timeframe) + 'H'
            b_df_weather[new_col_name] = ts.rolling(timeframe, center=do_center).mean()

    # Drops rows with NaNs (incomplete rolling windows and unfilled gaps).
    b_df_weather = b_df_weather.dropna(axis=0, how='any')

    print('shape={}'.format(b_df_weather.shape))

    return b_df_weather

In [66]:
# Loads meter_reading data
def load_meter_data(building_id, meter_id, data_folder_path):
    """Return the readings of one (building, meter) pair, indexed by timestamp.

    Reads ``train.csv`` from ``data_folder_path`` (concatenated as-is with the
    file name), keeps the rows matching ``building_id`` and ``meter_id``, removes
    those two key columns and returns the remaining columns sorted by timestamp.
    """
    all_readings = pd.read_csv(data_folder_path + 'train.csv', parse_dates=['timestamp'])

    is_building = all_readings['building_id'] == building_id
    is_meter = all_readings['meter'] == meter_id

    return (
        all_readings[is_building & is_meter]
        .drop(columns=['building_id', 'meter'])
        .set_index('timestamp')
        .sort_index()
    )

In [79]:
# Drops rows that are not in both dataframe indexes.
def prepare_meter_train_set(site_weather_df, building_meter_df):
    """Align weather features and meter readings on their shared index labels.

    Returns a tuple ``(X, Y)`` of independent copies: ``X`` holds the rows of
    ``site_weather_df`` and ``Y`` the rows of ``building_meter_df`` whose index
    label is present in both frames. The total NaN count of each result is
    printed as a sanity check.
    """
    shared_labels = site_weather_df.index.intersection(other=building_meter_df.index)

    features, target = (
        frame.loc[shared_labels].copy()
        for frame in (site_weather_df, building_meter_df)
    )

    print('isna():')
    for frame in (features, target):
        print(frame.isna().sum().sum())

    return (features, target)

In [16]:
bdata.head()

Unnamed: 0_level_0,site_id
building_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [19]:
# Distinct site identifiers present in the building metadata, as a plain Python list.
site_ids = bdata['site_id'].unique().tolist()

In [8]:
# Full raw training set; 'timestamp' is parsed to datetime at load time.
train_df = pd.read_csv('../data/raw/csvs/train.csv', parse_dates=['timestamp'])

In [10]:
# Number of non-null meter readings per (building_id, meter), largest first.
train_df_grouped = (
    train_df
    .groupby(['building_id', 'meter'])
    .count()
    .drop('timestamp', axis=1)
    .rename({'meter_reading' : 'n_meter_readings'}, axis=1)
    .sort_values(by='n_meter_readings', axis=0, ascending=False)
)

In [12]:
# Attach the site of each building to the per-meter counts.
# NOTE(review): this rebinds train_df_grouped, so the cell is not idempotent; running it a
# second time joins 'site_id' onto a frame that already has it (presumably a column-overlap error).
train_df_grouped = train_df_grouped.join(bdata, on='building_id', how='left')

In [22]:
train_df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_meter_readings,site_id
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,8784,0
685,0,8784,5
672,0,8784,5
673,0,8784,5
674,0,8784,5


In [None]:
# main
"""

for site in site_id:

    load_and_prepare_site_data()

    for building on this site:
        
        for meter in building_meter:
        
            load_meter_data()
            prepare_train_set()
            
            cross-validate()
            fit()
            
            save()



"""




In [31]:
train_df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2380 entries, (0, 0) to (403, 0)
Data columns (total 2 columns):
n_meter_readings    2380 non-null int64
site_id             2380 non-null int64
dtypes: int64(2)
memory usage: 55.6 KB


In [33]:
train_df_grouped.iloc[:2000].tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_meter_readings,site_id
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1
1003,3,8306,10
815,0,8255,8
810,0,8255,8
853,0,8208,8
1273,1,8194,14


In [40]:
# sub-sample train_df

# Fixed seed and a dedicated generator, so that "Restart & Run All" always
# selects the same (building, meter) pairs and the outputs below stay reproducible.
SUBSAMPLE_SEED = 42
subsample_rng = np.random.RandomState(SUBSAMPLE_SEED)

# sample among first 2000 meters (with the most observations by meters)
# Both sizes are clamped to the table length so the draw cannot fail on a smaller table.
n_candidate_meters = min(2000, len(train_df_grouped))
n_sampled_meters = min(10, n_candidate_meters)
subsample_indexes = subsample_rng.choice(n_candidate_meters, n_sampled_meters, replace=False)

subsample_building_meters = train_df_grouped.iloc[subsample_indexes]

In [41]:
subsample_building_meters

Unnamed: 0_level_0,Unnamed: 1_level_0,n_meter_readings,site_id
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1
1135,0,8784,13
1265,0,8694,14
962,2,8768,9
255,3,8783,2
1228,0,8784,14
1252,3,8784,14
989,0,8766,9
608,0,8783,4
1247,1,8784,14
973,2,8777,9


In [64]:
subsample_building_meters.loc[1265].index.tolist()

[0]

In [57]:
subsample_building_meters.index.get_level_values('building_id').unique().tolist()

[1135, 1265, 962, 255, 1228, 1252, 989, 608, 1247, 973]

In [43]:
# Sites covered by the sampled meters; the weather data is prepared once per site.
subsample_site_ids = subsample_building_meters['site_id'].unique()
subsample_site_ids

array([13, 14,  9,  2,  4])

In [80]:
# Driver loop: for every sampled site, prepare the weather features once, then
# build an aligned (x_train, y_train) pair for each sampled (building, meter) of that site.
# Only shapes are printed; no model is fitted here.
# Trailing slash required: the loaders concatenate this folder with the file name.
data_folder = '../data/raw/csvs/'


for site_id in subsample_site_ids:
    
    print('site {}'.format(site_id))
    
    # Weather features are shared by all the buildings of the site.
    site_weather_data = load_and_prepare_site_data(site_id, data_folder)
    
    # Sampled (building, meter) rows of this site, and the distinct buildings among them.
    site_building_meters = subsample_building_meters[subsample_building_meters['site_id']==site_id]
    site_buildings = site_building_meters.index.get_level_values('building_id').unique().tolist()
    #site_buildings.sort_values(by='building_id', axis='index', inplace=True)
    
    print(site_buildings)
    
    for building in site_buildings:
        
        # Selecting on the building level leaves the meter ids as the remaining index.
        building_meters = site_building_meters.loc[building].index.tolist()
        
        for building_meter in building_meters:
            
            print('(building, meter)=({}, {})'.format(building, building_meter))
            
            # NOTE: load_meter_data reads train.csv from disk on every call.
            meter_df = load_meter_data(building, building_meter, data_folder)
            
            print('meter_df.shape={}'.format(meter_df.shape))
            
            # Keep only the timestamps present in both the weather and the meter data.
            x_train, y_train = prepare_meter_train_set(site_weather_data, meter_df)
            
            print('x_train.shape={}'.format(x_train.shape))
            print('y_train.shape={}'.format(y_train.shape))
    
    print('--')

site 13
shape=(8761, 6)
isna():
air_temperature           0
dew_temperature           0
day_hour                  0
day_of_year               0
air_temperature_ma_24H    0
dew_temperature_ma_24H    0
dtype: int64
[1135]
(building, meter)=(1135, 0)
meter_df.shape=(8784, 1)
isna():
0
0
x_train.shape=(8761, 6)
y_train.shape=(8761, 1)
--
site 14
shape=(8733, 6)
isna():
air_temperature           0
dew_temperature           0
day_hour                  0
day_of_year               0
air_temperature_ma_24H    0
dew_temperature_ma_24H    0
dtype: int64
[1265, 1228, 1252, 1247]
(building, meter)=(1265, 0)
meter_df.shape=(8694, 1)
isna():
0
0
x_train.shape=(8643, 6)
y_train.shape=(8643, 1)
(building, meter)=(1228, 0)
meter_df.shape=(8784, 1)
isna():
0
0
x_train.shape=(8733, 6)
y_train.shape=(8733, 1)
(building, meter)=(1252, 3)
meter_df.shape=(8784, 1)
isna():
0
0
x_train.shape=(8733, 6)
y_train.shape=(8733, 1)
(building, meter)=(1247, 1)
meter_df.shape=(8784, 1)
isna():
0
0
x_train.shape=(8733, 6