In [2]:
import numpy as np
import math
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

from time import localtime, strftime

%matplotlib inline

In [6]:
# Run configuration for one training-set export.
# NOTE: build_and_save_train is defined in the cell below; that cell must be
# executed before this one (on a fresh kernel this call fails otherwise).

# Building whose meter readings are exported.
this_building = 1176

# {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}
this_meter = 2

# Accepted values: "additive", "multiplicative", "none"
# ("none" skips the seasonal decomposition of the meter readings).
this_seasonal_model = 'none'

# Folder paths are concatenated directly with file names by build_and_save_train,
# so each must end with a path separator.
data_folder = '../../data/raw/csvs/'
save_folder = '../../data/intermediate/experimentation_train_sets/'

# Builds the feature table and writes it as a timestamped CSV into save_folder.
build_and_save_train(this_building, this_meter, this_seasonal_model, data_folder, save_folder)

building is on site 13
selected_features = ['day_of_week', 'day_hour', 'dew_temperature_ma_24H', 'dew_temperature', 'air_temperature_ma_24H', 'air_temperature']
saved as ../../data/intermediate/experimentation_train_sets/train_b_1176_m_2_sm_n_t_20200205_152517.csv


In [5]:
def build_and_save_train(building_id: int, meter_id: int, seasonal_model: str,
                         data_folder_path: str, save_folder_path: str) -> None:
    """Build the training set of one (building, meter) pair and save it as a CSV file.

    Steps:
      1. Look up the site of the building in ``building_metadata.csv``.
      2. Load ``train.csv`` and keep the readings of the requested building/meter,
         indexed and sorted by timestamp.
      3. Optionally decompose the readings (trend / seasonal / residual) with
         ``seasonal_decompose`` and add a de-seasoned series.
      4. Load ``weather_train.csv`` for the site and left-join it on the timestamps.
      5. Add time features, time-shifted and rolling-mean weather features.
      6. Keep a fixed subset of features plus the target column(s), drop rows
         containing any NaN, and write the result to ``save_folder_path``.

    Parameters
    ----------
    building_id : int
        Value matched against the ``building_id`` column of the input files.
    meter_id : int
        Value matched against the ``meter`` column of ``train.csv``.
    seasonal_model : str
        One of ``'additive'``, ``'multiplicative'`` or ``'none'``. With ``'none'``
        no decomposition is computed and ``meter_reading`` is the only target.
    data_folder_path : str
        Folder holding the input CSV files. It is concatenated directly with the
        file names, so it must end with a path separator.
    save_folder_path : str
        Folder the output CSV is written to (same trailing-separator rule).

    Raises
    ------
    ValueError
        If ``seasonal_model`` is not one of the accepted values.
    KeyError
        If ``building_id`` is not present in ``building_metadata.csv``.

    Notes
    -----
    Prints the site id, the selected feature names and the path of the saved file.
    The output file name embeds the building id, the meter id, the seasonal model
    abbreviation and the local time of the call.
    """
    # Fail fast: validate the argument before loading any (potentially large) file.
    if seasonal_model not in ['additive', 'multiplicative', 'none']:
        raise ValueError('seasonal_model arg must be in {\"additive\", \"multiplicative\", \"none\"}')

    # Offset object instead of the '1H' string alias, which recent pandas deprecates.
    one_hour = pd.offsets.Hour(1)

    # Loads building_metadata to get site-building mapping.
    bdata = pd.read_csv(data_folder_path + 'building_metadata.csv',
                        index_col='building_id',
                        usecols=['building_id', 'site_id'])

    this_site = bdata.loc[building_id, 'site_id']
    print('building is on site {}'.format(this_site))

    # Loads meter_reading data and keeps only the requested building/meter.
    # The function arguments are used here (not notebook-level globals), so the
    # result depends only on what the caller passes in.
    raw_df_meters = pd.read_csv(data_folder_path + 'train.csv', parse_dates=['timestamp'])

    to_keep = (raw_df_meters['building_id'] == building_id) & (raw_df_meters['meter'] == meter_id)
    b_df_meters = (
        raw_df_meters[to_keep]
        .drop(columns=['building_id', 'meter'])
        .set_index('timestamp')
        .sort_index()
    )

    # Seasonal decomposition (optional).
    if seasonal_model != 'none':
        # The readings are resampled to a regular hourly grid (gaps forward-filled)
        # before being handed to seasonal_decompose.
        hourly_readings = b_df_meters['meter_reading'].asfreq(one_hour, method='ffill')
        mr_decomposition = seasonal_decompose(hourly_readings, model=seasonal_model, two_sided=True)

        # Column assignment aligns on the timestamp index, so only the timestamps
        # actually present in b_df_meters receive a value.
        b_df_meters['meter_reading_seasonal'] = mr_decomposition.seasonal
        b_df_meters['meter_reading_trend'] = mr_decomposition.trend
        if seasonal_model == 'additive':
            b_df_meters['meter_reading_deseasoned'] = (
                b_df_meters['meter_reading'] - b_df_meters['meter_reading_seasonal']
            )
        elif seasonal_model == 'multiplicative':
            b_df_meters['meter_reading_deseasoned'] = (
                b_df_meters['meter_reading'] / b_df_meters['meter_reading_seasonal']
            )
        b_df_meters['meter_reading_resid'] = mr_decomposition.resid

    # Loads weather data and keeps the rows of the building's site.
    raw_df_weather = pd.read_csv(data_folder_path + 'weather_train.csv',
                                 parse_dates=['timestamp'], index_col=['site_id', 'timestamp'])

    # Non in-place drop: returns a new frame instead of mutating a selection of
    # raw_df_weather (which can trigger SettingWithCopyWarning).
    b_df_weather = raw_df_weather.loc[(this_site,)].drop(columns='cloud_coverage')

    # Joins meter_reading and weather dfs (left join keeps every meter timestamp).
    df_features = b_df_meters.join(b_df_weather, on='timestamp', how='left')

    # Builds time features.
    df_features['day_hour'] = df_features.index.to_series().dt.hour
    df_features['day_of_week'] = df_features.index.to_series().dt.dayofweek

    # Builds shifted weather features: the value observed `shift` hours earlier.
    features_to_shift = b_df_weather.columns.to_list()
    shifts = [1, 2, 5, 10]

    for c in features_to_shift:
        ts = df_features[c]
        for shift in shifts:
            # Shifting with a freq moves the timestamps themselves; the assignment
            # below then re-aligns the shifted series on the original index.
            shifted_ts = ts.shift(periods=shift, freq=one_hour)
            new_col_name = c + '_shift_' + str(shift) + 'H'
            df_features[new_col_name] = shifted_ts

    # Builds averaged weather features (trailing rolling means).
    timeframes = [4, 24, 72]
    features_to_avg = features_to_shift.copy()
    features_to_avg.remove('wind_direction')
    do_center = False

    for c in features_to_avg:
        ts = df_features[c]
        for timeframe in timeframes:
            # Integer window => counted in rows, i.e. it spans `timeframe` hours
            # only where the meter timestamps are contiguous hourly readings.
            averaged_ts = ts.rolling(timeframe, center=do_center).mean()
            new_col_name = c + '_ma_' + str(timeframe) + 'H'
            df_features[new_col_name] = averaged_ts

    # Drops features with too many NaNs.
    # NaN counts observed during exploration (kept here as the rationale):
    #   precip_depth_1_hr_ma_72H         297
    #   sea_level_pressure_ma_4H         442
    #   sea_level_pressure_ma_24H       2014
    #   sea_level_pressure_ma_72H       4590
    #   wind_direction_shift_1H          221
    #   wind_direction_shift_2H          223
    #   wind_direction_shift_5H          229
    #   wind_direction_shift_10H         235
    #   air_temperature_ma_72H           215
    #   dew_temperature_ma_72H           215
    to_drop = [
        'precip_depth_1_hr_ma_72H',
        'sea_level_pressure_ma_4H',
        'sea_level_pressure_ma_24H',
        'sea_level_pressure_ma_72H',
        'wind_direction_shift_1H',
        'wind_direction_shift_2H',
        'wind_direction_shift_5H',
        'wind_direction_shift_10H',
        'air_temperature_ma_72H',
        'dew_temperature_ma_72H'
    ]

    df_features = df_features.drop(columns=to_drop)

    # Keeps only a selected subset of the generated features (for instance based on eda).
    selected_features = [
        'day_of_week',
        'day_hour',
        'dew_temperature_ma_24H',
        'dew_temperature',
        'air_temperature_ma_24H',
        'air_temperature',
        #'wind_speed_ma_24H',
        #'precip_depth_1_hr_ma_24H',
        #'precip_depth_1_hr_ma_4H'
        #'sea_level_pressure_shift_10H'
    ]

    # Fresh list on every call; extended with the decomposition columns when they exist.
    selected_targets = ['meter_reading']
    if seasonal_model != 'none':
        selected_targets = selected_targets + [
            'meter_reading_trend',
            'meter_reading_seasonal',
            'meter_reading_deseasoned',
            'meter_reading_resid'
        ]

    print('selected_features = {}'.format(selected_features))

    df_selected_features = df_features[selected_features + selected_targets].copy()

    # Drops rows with NaNs (e.g. the warm-up rows of the rolling means).
    df_selected_features = df_selected_features.dropna(axis=0, how='any')

    # Saves as csv. The file name encodes the arguments of this call and a timestamp.
    time_str = '_t_' + strftime('%Y%m%d_%H%M%S', localtime())
    seasonal_model_abbr = {'additive': 'add', 'multiplicative': 'mult', 'none': 'n'}
    sm_str = '_sm_' + seasonal_model_abbr[seasonal_model]

    filename = 'train_b_' + str(building_id) + '_m_' + str(meter_id) + sm_str + time_str + '.csv'
    filepath = save_folder_path + filename

    df_selected_features.to_csv(filepath)
    print('saved as {}'.format(filepath))
