In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import gc, math

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

# Preprocessing Test

In [5]:
def preprocess(X):

    weather_preproc = pd.read_csv('../raw_data/weather_preproc.csv', parse_dates=['timestamp'])

    building_preproc = pd.read_csv("../raw_data/building_preproc.csv")


    X = pd.DataFrame.from_dict(X)
    
    X['timestamp'] = pd.to_datetime(X.timestamp)
    row = X.merge(building_preproc.iloc[:,1:], on='building_id', how='left')

    row = row.merge(weather_preproc.iloc[:,1:], on=['site_id', 'timestamp'], how='left')

    beaufort = [(0, 0, 0.3), (1, 0.3, 1.6), (2, 1.6, 3.4), (3, 3.4, 5.5), (4, 5.5, 8), (5, 8, 10.8), (6, 10.8, 13.9), 
              (7, 13.9, 17.2), (8, 17.2, 20.8), (9, 20.8, 24.5), (10, 24.5, 28.5), (11, 28.5, 33), (12, 33, 200)]

    for item in beaufort:
        row.loc[(row['wind_speed']>=item[1]) & (row['wind_speed']<item[2]), 'beaufort_scale'] = item[0]

    row["timestamp"] = pd.to_datetime(row["timestamp"])

    def transform(df):
        df['hour'] = np.uint8(df['timestamp'].dt.hour)
        df['day'] = np.uint8(df['timestamp'].dt.day)
        df['weekday'] = np.uint8(df['timestamp'].dt.weekday)
        df['month'] = np.uint8(df['timestamp'].dt.month)
        df['year'] = np.uint8(df['timestamp'].dt.year-1900)

        df['square_feet'] = np.log(df['square_feet'])

        return df

    row = transform(row)

    def encode_cyclic_feature(df, col, max_vals):
        df[col + '_sin'] = np.sin(2 * np.pi * df[col]/max_vals)
    #     df[col + '_cos'] = np.cos(2 * np.pi * df[col]/max_vals)
        del df[col]
        return df

    dates_range = pd.date_range(start='2015-12-31', end='2019-01-01')
    us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

    row['is_holiday'] = (row['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)
    row['is_holiday'] = (row['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)

    row.loc[(row['weekday']) == 5 | (row['weekday'] == 6) , 'is_holiday'] = 1

    row = row.drop(['timestamp'], axis=1)

    row = encode_cyclic_feature(row, 'weekday', 7)
    row = encode_cyclic_feature(row, 'hour', 24)
    row = encode_cyclic_feature(row, 'day', 31)
    row = encode_cyclic_feature(row, 'month', 12)

    row = row.loc[:,['site_id', 'building_id', 'primary_use', 'meter', 'wind_direction',
           'is_holiday', 'square_feet', 'year_built', 'air_temperature',
           'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'floor_count',
           'beaufort_scale', 'weekday_sin', 'day_sin', 'hour_sin', 'month_sin']]
    return row

In [11]:
X = preprocess(X)

In [3]:
def pred(X):
    model1 = lgb.Booster(model_file='model1.txt')
    model2 = lgb.Booster(model_file='model2.txt')
    y_pred = (model1.predict(X)+model2.predict(X))/2
    
    #to_return = {'meter_reading':float(y_pred)}

    return y_pred

## Monthly Prediction

In [11]:
def compare(building1: int,
            building2: int,
            meter: int,
            initial_date: str,
            final_date: str):

    initial_timestamp = initial_date +' '+"0:00"
    final_timestamp = final_date + ' '+ "23:00"

    dates = pd.date_range(initial_timestamp, final_timestamp, freq="1h")

    input_values1=pd.DataFrame(dates)
    input_values1['building_id'] = building1
    input_values1['meter'] = meter
    input_values1 = input_values1.iloc[:,[1,2,0]]
    input_values1 = input_values1.rename(columns={0:'timestamp'})

    input_values2=pd.DataFrame(dates)
    input_values2['building_id'] = building2
    input_values2['meter'] = meter
    input_values2 = input_values2.iloc[:,[1,2,0]]
    input_values2 = input_values2.rename(columns={0:'timestamp'})

    X1 = preprocess(input_values1)
    X2 = preprocess(input_values2)

    model1 = lgb.Booster(model_file='model1.txt')
    model2 = lgb.Booster(model_file='model2.txt')

    y_pred1 = (model1.predict(X1)+model2.predict(X1))/2

    y_pred2 = (model1.predict(X2)+model2.predict(X2))/2

    consq1 = y_pred1/X1.square_feet
    consq2= y_pred2/X2.square_feet
    print(type(consq1))
    print(type(consq2))
    print(consq1)
    print(consq2)

    y =  pd.DataFrame({'cons_kwh1':y_pred1,
                       'cons_square_feet1':list(consq1),
                       'cons_kwh2':y_pred2,
                       'cons_square_feet2':list(consq2)}, index = input_values1.timestamp)

    y_json = y.to_json()

    return y_json

In [12]:
y_json = compare(100,200,0,"2017-01-01","2017-01-01")

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
0     0.477569
1     0.475796
2     0.480506
3     0.480071
4     0.480937
5     0.480666
6     0.479729
7     0.479281
8     0.480670
9     0.478553
10    0.479173
11    0.477875
12    0.484495
13    0.489562
14    0.492706
15    0.494540
16    0.494081
17    0.495343
18    0.494081
19    0.493678
20    0.493666
21    0.485429
22    0.483018
23    0.472575
Name: square_feet, dtype: float64
0     0.422469
1     0.422609
2     0.420374
3     0.420245
4     0.419311
5     0.419267
6     0.419852
7     0.419826
8     0.419830
9     0.419418
10    0.420157
11    0.422187
12    0.422376
13    0.427316
14    0.439046
15    0.438482
16    0.440274
17    0.440079
18    0.439344
19    0.440709
20    0.440157
21    0.437900
22    0.437522
23    0.426779
Name: square_feet, dtype: float64


In [13]:
y_json
y_recovered = pd.read_json(y_json)



ps.line(y_recovered[['cons_square_feet1','cons_square_feet2']])

y_recovered[['cons_square_feet1','cons_square_feet2']].mean()

'{"cons_kwh1":{"1483228800000":4.8256610026,"1483232400000":4.8077429341,"1483236000000":4.855338636,"1483239600000":4.850941551,"1483243200000":4.8596946702,"1483246800000":4.8569553619,"1483250400000":4.8474871268,"1483254000000":4.8429583682,"1483257600000":4.8569942702,"1483261200000":4.8356056617,"1483264800000":4.841870767,"1483268400000":4.8287527103,"1483272000000":4.8956394595,"1483275600000":4.9468426518,"1483279200000":4.9786090664,"1483282800000":4.9971406579,"1483286400000":4.9925087577,"1483290000000":5.0052577673,"1483293600000":4.9925087577,"1483297200000":4.9884324519,"1483300800000":4.9883112142,"1483304400000":4.9050841032,"1483308000000":4.8807157878,"1483311600000":4.7751983588},"cons_square_feet1":{"1483228800000":0.4775692519,"1483232400000":0.4757959988,"1483236000000":0.4805062848,"1483239600000":0.4800711294,"1483243200000":0.4809373777,"1483246800000":0.4806662834,"1483250400000":0.479729264,"1483254000000":0.4792810775,"1483257600000":0.4806701339,"148326120