In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import gc, math

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

# Preprocessing Test

In [10]:
X = {'building_id':[104],'meter':[0],'timestamp':['2017-06-02 12:00:00']}

In [2]:
def preprocess(X):

    weather_preproc = pd.read_csv('../raw_data/weather_preproc.csv', parse_dates=['timestamp'])

    building_preproc = pd.read_csv("../raw_data/building_preproc.csv")


    X = pd.DataFrame.from_dict(X)
    
    X['timestamp'] = pd.to_datetime(X.timestamp)
    row = X.merge(building_preproc.iloc[:,1:], on='building_id', how='left')

    row = row.merge(weather_preproc.iloc[:,1:], on=['site_id', 'timestamp'], how='left')

    beaufort = [(0, 0, 0.3), (1, 0.3, 1.6), (2, 1.6, 3.4), (3, 3.4, 5.5), (4, 5.5, 8), (5, 8, 10.8), (6, 10.8, 13.9), 
              (7, 13.9, 17.2), (8, 17.2, 20.8), (9, 20.8, 24.5), (10, 24.5, 28.5), (11, 28.5, 33), (12, 33, 200)]

    for item in beaufort:
        row.loc[(row['wind_speed']>=item[1]) & (row['wind_speed']<item[2]), 'beaufort_scale'] = item[0]

    row["timestamp"] = pd.to_datetime(row["timestamp"])

    def transform(df):
        df['hour'] = np.uint8(df['timestamp'].dt.hour)
        df['day'] = np.uint8(df['timestamp'].dt.day)
        df['weekday'] = np.uint8(df['timestamp'].dt.weekday)
        df['month'] = np.uint8(df['timestamp'].dt.month)
        df['year'] = np.uint8(df['timestamp'].dt.year-1900)

        df['square_feet'] = np.log(df['square_feet'])

        return df

    row = transform(row)

    def encode_cyclic_feature(df, col, max_vals):
        df[col + '_sin'] = np.sin(2 * np.pi * df[col]/max_vals)
    #     df[col + '_cos'] = np.cos(2 * np.pi * df[col]/max_vals)
        del df[col]
        return df

    dates_range = pd.date_range(start='2015-12-31', end='2019-01-01')
    us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

    row['is_holiday'] = (row['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)
    row['is_holiday'] = (row['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)

    row.loc[(row['weekday']) == 5 | (row['weekday'] == 6) , 'is_holiday'] = 1

    row = row.drop(['timestamp'], axis=1)

    row = encode_cyclic_feature(row, 'weekday', 7)
    row = encode_cyclic_feature(row, 'hour', 24)
    row = encode_cyclic_feature(row, 'day', 31)
    row = encode_cyclic_feature(row, 'month', 12)

    row = row.loc[:,['site_id', 'building_id', 'primary_use', 'meter', 'wind_direction',
           'is_holiday', 'square_feet', 'year_built', 'air_temperature',
           'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'floor_count',
           'beaufort_scale', 'weekday_sin', 'day_sin', 'hour_sin', 'month_sin']]
    return row

In [11]:
X = preprocess(X)

In [3]:
def pred(X):
    model1 = lgb.Booster(model_file='model1.txt')
    model2 = lgb.Booster(model_file='model2.txt')
    y_pred = (model1.predict(X)+model2.predict(X))/2
    
    #to_return = {'meter_reading':float(y_pred)}

    return y_pred

In [12]:
pred(X)

{'meter_reading': 5.780886692302947}

## Monthly Prediction

In [11]:
pd.concat([X1,X2,X3])

Unnamed: 0,site_id,building_id,primary_use,meter,wind_direction,is_holiday,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,floor_count,beaufort_scale,weekday_sin,day_sin,hour_sin,month_sin
0,0,104,6,0,120.0,0,10.721724,2003.0,26.7,7.0,23.9,-1.0,4.0,1.0,-0.433884,0.394356,1.224647e-16,1.224647e-16
0,0,104,6,0,240.0,0,10.721724,2003.0,28.3,6.666667,23.3,0.0,4.0,3.0,-0.433884,0.394356,-0.258819,1.224647e-16
0,0,104,6,0,295.0,0,10.721724,2003.0,29.4,6.333333,23.3,0.0,4.0,2.0,-0.433884,0.394356,-0.5,1.224647e-16


In [None]:
bu_id = 100
meter = 0
year = 2017
month = 1

In [None]:
first_timestamp = "2017-01-01 0:00"

last_timestamp = "2017-01-31 23:00"

In [36]:
dates = pd.date_range("2017-01-01 0:00", "2017-01-31 23:00", freq="1h")

input_values=pd.DataFrame(dates)
input_values['building_id'] = 100
input_values['meter'] = 0
input_values = input_values.iloc[:,[1,2,0]]
input_values = input_values.rename(columns={0:'timestamp'})

X = preprocess(input_values)

X

Unnamed: 0,site_id,building_id,primary_use,meter,wind_direction,is_holiday,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,floor_count,beaufort_scale,weekday_sin,day_sin,hour_sin,month_sin
0,0,100,4,0,140.0,0,10.104631,1968.0,17.2,0.0,13.3,0.0,4.0,2.0,-0.781831,2.012985e-01,0.000000,0.5
1,0,100,4,0,130.0,0,10.104631,1968.0,16.7,2.0,13.3,0.0,4.0,2.0,-0.781831,2.012985e-01,0.258819,0.5
2,0,100,4,0,130.0,0,10.104631,1968.0,15.6,2.0,12.8,0.0,4.0,2.0,-0.781831,2.012985e-01,0.500000,0.5
3,0,100,4,0,150.0,0,10.104631,1968.0,15.0,0.0,12.8,0.0,4.0,1.0,-0.781831,2.012985e-01,0.707107,0.5
4,0,100,4,0,0.0,0,10.104631,1968.0,15.0,2.0,13.3,0.0,4.0,0.0,-0.781831,2.012985e-01,0.866025,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,0,100,4,0,0.0,1,10.104631,1968.0,21.1,0.0,6.1,0.0,4.0,0.0,0.781831,-2.449294e-16,-0.965926,0.5
740,0,100,4,0,150.0,1,10.104631,1968.0,18.3,2.0,7.8,0.0,4.0,2.0,0.781831,-2.449294e-16,-0.866025,0.5
741,0,100,4,0,170.0,1,10.104631,1968.0,16.1,0.0,7.2,0.0,4.0,1.0,0.781831,-2.449294e-16,-0.707107,0.5
742,0,100,4,0,180.0,1,10.104631,1968.0,14.4,0.0,7.8,0.0,4.0,1.0,0.781831,-2.449294e-16,-0.500000,0.5


In [11]:
X

Unnamed: 0,site_id,building_id,primary_use,meter,wind_direction,is_holiday,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,floor_count,beaufort_scale,weekday_sin,day_sin,hour_sin,month_sin
0,0,100,4,0,140.0,0,10.104631,1968.0,17.2,0.0,13.3,0.0,4.0,2.0,-0.781831,2.012985e-01,0.000000,0.5
1,0,100,4,0,130.0,0,10.104631,1968.0,16.7,2.0,13.3,0.0,4.0,2.0,-0.781831,2.012985e-01,0.258819,0.5
2,0,100,4,0,130.0,0,10.104631,1968.0,15.6,2.0,12.8,0.0,4.0,2.0,-0.781831,2.012985e-01,0.500000,0.5
3,0,100,4,0,150.0,0,10.104631,1968.0,15.0,0.0,12.8,0.0,4.0,1.0,-0.781831,2.012985e-01,0.707107,0.5
4,0,100,4,0,0.0,0,10.104631,1968.0,15.0,2.0,13.3,0.0,4.0,0.0,-0.781831,2.012985e-01,0.866025,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,0,100,4,0,0.0,1,10.104631,1968.0,21.1,0.0,6.1,0.0,4.0,0.0,0.781831,-2.449294e-16,-0.965926,0.5
740,0,100,4,0,150.0,1,10.104631,1968.0,18.3,2.0,7.8,0.0,4.0,2.0,0.781831,-2.449294e-16,-0.866025,0.5
741,0,100,4,0,170.0,1,10.104631,1968.0,16.1,0.0,7.2,0.0,4.0,1.0,0.781831,-2.449294e-16,-0.707107,0.5
742,0,100,4,0,180.0,1,10.104631,1968.0,14.4,0.0,7.8,0.0,4.0,1.0,0.781831,-2.449294e-16,-0.500000,0.5


In [16]:
X.square_feet /= 2

In [19]:
X.primary_use = 1

In [30]:
y_pred = pred(X)

In [31]:
dates.columns

Index([0, 'building_id', 'meter'], dtype='object')

In [32]:
y_pred = pd.DataFrame(y_pred,columns=['Cons. (kwh)'],index = input_values.timestamp )

In [34]:
y_pred.to_json()

'{"Cons. (kwh)":{"1483228800000":4.8256610026,"1483232400000":4.8077429341,"1483236000000":4.855338636,"1483239600000":4.850941551,"1483243200000":4.8596946702,"1483246800000":4.8569553619,"1483250400000":4.8474871268,"1483254000000":4.8429583682,"1483257600000":4.8569942702,"1483261200000":4.8356056617,"1483264800000":4.841870767,"1483268400000":4.8287527103,"1483272000000":4.8956394595,"1483275600000":4.9468426518,"1483279200000":4.9786090664,"1483282800000":4.9971406579,"1483286400000":4.9925087577,"1483290000000":5.0052577673,"1483293600000":4.9925087577,"1483297200000":4.9884324519,"1483300800000":4.9883112142,"1483304400000":4.9050841032,"1483308000000":4.8807157878,"1483311600000":4.7751983588,"1483315200000":4.7410186768,"1483318800000":4.7699494994,"1483322400000":4.7642675018,"1483326000000":4.7726019952,"1483329600000":4.7837588595,"1483333200000":4.7875781471,"1483336800000":4.8070288886,"1483340400000":4.831337315,"1483344000000":4.8101743348,"1483347600000":4.8167683127,"