In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import gc, math

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

# Preprocessing Test

In [165]:
test_df = pd.read_csv('../raw_data/test.csv', parse_dates=['timestamp'])

In [10]:
X = {'building_id':[104],'meter':[0],'timestamp':['2017-06-02 12:00:00']}

In [2]:
def preprocess(X):

    weather_preproc = pd.read_csv('../raw_data/weather_preproc.csv', parse_dates=['timestamp'])

    building_preproc = pd.read_csv("../raw_data/building_preproc.csv")


    X = pd.DataFrame.from_dict(X)
    
    X['timestamp'] = pd.to_datetime(X.timestamp)
    row = X.merge(building_preproc.iloc[:,1:], on='building_id', how='left')

    row = row.merge(weather_preproc.iloc[:,1:], on=['site_id', 'timestamp'], how='left')

    beaufort = [(0, 0, 0.3), (1, 0.3, 1.6), (2, 1.6, 3.4), (3, 3.4, 5.5), (4, 5.5, 8), (5, 8, 10.8), (6, 10.8, 13.9), 
              (7, 13.9, 17.2), (8, 17.2, 20.8), (9, 20.8, 24.5), (10, 24.5, 28.5), (11, 28.5, 33), (12, 33, 200)]

    for item in beaufort:
        row.loc[(row['wind_speed']>=item[1]) & (row['wind_speed']<item[2]), 'beaufort_scale'] = item[0]

    row["timestamp"] = pd.to_datetime(row["timestamp"])

    def transform(df):
        df['hour'] = np.uint8(df['timestamp'].dt.hour)
        df['day'] = np.uint8(df['timestamp'].dt.day)
        df['weekday'] = np.uint8(df['timestamp'].dt.weekday)
        df['month'] = np.uint8(df['timestamp'].dt.month)
        df['year'] = np.uint8(df['timestamp'].dt.year-1900)

        df['square_feet'] = np.log(df['square_feet'])

        return df

    row = transform(row)

    def encode_cyclic_feature(df, col, max_vals):
        df[col + '_sin'] = np.sin(2 * np.pi * df[col]/max_vals)
    #     df[col + '_cos'] = np.cos(2 * np.pi * df[col]/max_vals)
        del df[col]
        return df

    dates_range = pd.date_range(start='2015-12-31', end='2019-01-01')
    us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

    row['is_holiday'] = (row['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)
    row['is_holiday'] = (row['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)

    row.loc[(row['weekday']) == 5 | (row['weekday'] == 6) , 'is_holiday'] = 1

    row = row.drop(['timestamp'], axis=1)

    row = encode_cyclic_feature(row, 'weekday', 7)
    row = encode_cyclic_feature(row, 'hour', 24)
    row = encode_cyclic_feature(row, 'day', 31)
    row = encode_cyclic_feature(row, 'month', 12)

    row = row.loc[:,['site_id', 'building_id', 'primary_use', 'meter', 'wind_direction',
           'is_holiday', 'square_feet', 'year_built', 'air_temperature',
           'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'floor_count',
           'beaufort_scale', 'weekday_sin', 'day_sin', 'hour_sin', 'month_sin']]
    return row

In [11]:
X = preprocess(X)

In [3]:
def pred(X):
    model1 = lgb.Booster(model_file='model1.txt')
    model2 = lgb.Booster(model_file='model2.txt')
    y_pred = (model1.predict(X)+model2.predict(X))/2
    
    to_return = {'meter_reading':float(y_pred)}

    return to_return

In [12]:
pred(X)

{'meter_reading': 5.780886692302947}

## Monthly Prediction

In [14]:
month="01"
year = "2017"

In [21]:
dates =[year+'-'+month+'-'+str(d)+" "+str(h) for d in range(31) for h in range(23) if d > 10 else dates =[year+'-'+month+'-'+str(d)+" "+"0"+str(h)  ]

SyntaxError: invalid syntax (2853297707.py, line 1)

In [20]:
dates

['2017-01-11 0',
 '2017-01-11 1',
 '2017-01-11 2',
 '2017-01-11 3',
 '2017-01-11 4',
 '2017-01-11 5',
 '2017-01-11 6',
 '2017-01-11 7',
 '2017-01-11 8',
 '2017-01-11 9',
 '2017-01-11 10',
 '2017-01-11 11',
 '2017-01-11 12',
 '2017-01-11 13',
 '2017-01-11 14',
 '2017-01-11 15',
 '2017-01-11 16',
 '2017-01-11 17',
 '2017-01-11 18',
 '2017-01-11 19',
 '2017-01-11 20',
 '2017-01-11 21',
 '2017-01-11 22',
 '2017-01-12 0',
 '2017-01-12 1',
 '2017-01-12 2',
 '2017-01-12 3',
 '2017-01-12 4',
 '2017-01-12 5',
 '2017-01-12 6',
 '2017-01-12 7',
 '2017-01-12 8',
 '2017-01-12 9',
 '2017-01-12 10',
 '2017-01-12 11',
 '2017-01-12 12',
 '2017-01-12 13',
 '2017-01-12 14',
 '2017-01-12 15',
 '2017-01-12 16',
 '2017-01-12 17',
 '2017-01-12 18',
 '2017-01-12 19',
 '2017-01-12 20',
 '2017-01-12 21',
 '2017-01-12 22',
 '2017-01-13 0',
 '2017-01-13 1',
 '2017-01-13 2',
 '2017-01-13 3',
 '2017-01-13 4',
 '2017-01-13 5',
 '2017-01-13 6',
 '2017-01-13 7',
 '2017-01-13 8',
 '2017-01-13 9',
 '2017-01-13 10',
 '20

In [6]:
X1 = {'building_id':[104],'meter':[0],'timestamp':['2017-06-02 12:00:00']}
X2 = {'building_id':[104],'meter':[0],'timestamp':['2017-06-02 13:00:00']}
X3 = {'building_id':[104],'meter':[0],'timestamp':['2017-06-02 14:00:00']}

In [7]:
X1 = preprocess(X1)
X2 = preprocess(X2)
X3 = preprocess(X3)

In [11]:
pd.concat([X1,X2,X3])

Unnamed: 0,site_id,building_id,primary_use,meter,wind_direction,is_holiday,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,floor_count,beaufort_scale,weekday_sin,day_sin,hour_sin,month_sin
0,0,104,6,0,120.0,0,10.721724,2003.0,26.7,7.0,23.9,-1.0,4.0,1.0,-0.433884,0.394356,1.224647e-16,1.224647e-16
0,0,104,6,0,240.0,0,10.721724,2003.0,28.3,6.666667,23.3,0.0,4.0,3.0,-0.433884,0.394356,-0.258819,1.224647e-16
0,0,104,6,0,295.0,0,10.721724,2003.0,29.4,6.333333,23.3,0.0,4.0,2.0,-0.433884,0.394356,-0.5,1.224647e-16
