# Overview

- This is a baseline model to establish initial performance.
- It uses gradient boosting via the `lightgbm` package.
- No data cleaning or feature engineering is applied.

In [58]:
# General Imports

import numpy as np
import pandas as pd
import pickle

import os, gc, sys, warnings, random, psycopg2

In [59]:
# Functions

def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Args:
        seed: Value handed to both ``random.seed`` and ``np.random.seed``.
            Defaults to 0.

    Returns:
        None. The only effect is resetting the two global generators.
    """
    # Same order as before: the stdlib generator first, then NumPy's.
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

In [60]:
# Notebook-wide configuration.
TARGET = 'meter_reading'  # column the model is trained to predict
LOCAL_TEST = False        # flag; not read by any cell visible in this section
SEED = 42                 # base seed for the global RNGs and the LightGBM params

# Seed the global random generators once, up front.
seed_everything(seed=SEED)

In [61]:
# Load the pickled train and test frames (paths are relative to the working directory).
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_df.pkl', 'test_df.pkl')
)

In [62]:
# Data check: list the column names, then print the dtype summary of each frame.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called as its own statement; passing it to print() would emit the report
# ahead of the label and then print a stray "None".
print("Train:", list(train_df))
train_df.info()
print("#" * 20)
print("Test:", list(test_df))
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17684654 entries, 0 to 17684653
Data columns (total 23 columns):
 #   Column              Dtype         
---  ------              -----         
 0   building_id         int16         
 1   site_id             int8          
 2   meter               int8          
 3   timestamp           datetime64[ns]
 4   meter_reading       float64       
 5   primary_use         int8          
 6   square_feet         int32         
 7   year_built          int8          
 8   floor_count         int8          
 9   air_temperature     float64       
 10  cloud_coverage      float16       
 11  dew_temperature     float64       
 12  precip_depth_1_hr   float16       
 13  sea_level_pressure  float64       
 14  wind_direction      float16       
 15  wind_speed          float64       
 16  DT_month            int8          
 17  DT_week_y           int8          
 18  DT_day_y            int8          
 19  DT_hour             int8          
 20  

In [63]:
# Check memory usage

In [67]:
# Model Parameters
import lightgbm as lgb

# LightGBM training parameters for the baseline run.
lgb_params = {
    # --- task ---
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'n_jobs': -1,
    # --- learning schedule and tree shape ---
    'learning_rate': 0.05,
    'num_leaves': 256,  # 2**8
    'max_depth': -1,
    'tree_learner': 'serial',
    # --- column / row subsampling ---
    'colsample_bytree': 0.7,
    'subsample_freq': 1,
    'subsample': 0.7,
    # --- boosting budget and binning ---
    'n_estimators': 600,
    'max_bin': 255,
    'verbose': -1,
    # --- reproducibility and stopping ---
    'seed': SEED,  # the training cell overwrites this entry before fitting
    'early_stopping_rounds': 200,
}

In [68]:
# Prepare training data.

# Everything except the target and the raw timestamp is used as a feature.
remove_columns = [TARGET, 'timestamp']
feature_columns = [name for name in train_df.columns if name not in remove_columns]

# The label is np.log1p of the target column, so the model is fitted on the
# log1p-transformed readings rather than the raw values.
tr_data = lgb.Dataset(
    data=train_df.loc[:, feature_columns],
    label=np.log1p(train_df[TARGET]),
)

# The training Dataset is also the only evaluation set.
eval_sets = [tr_data]

In [69]:
# Run the model.

# Re-seed the global RNGs and overwrite the seed stored in lgb_params
# (the shared dict is modified in place).
seed_everything(15)
lgb_params.update(seed=15)

# eval_sets holds only the training Dataset, so the rmse logged every 100
# rounds is a training score and the early-stopping rule in lgb_params is
# evaluated against the training data, not a hold-out set.
estimator = lgb.train(
    params=lgb_params,
    train_set=tr_data,
    valid_sets=eval_sets,
    verbose_eval=100,
)

Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 1.00401
[200]	training's rmse: 0.862542
[300]	training's rmse: 0.805057
[400]	training's rmse: 0.761733
[500]	training's rmse: 0.732619
[600]	training's rmse: 0.70861
Did not meet early stopping. Best iteration is:
[600]	training's rmse: 0.70861


In [33]:
# Display the list of feature column names built for the training Dataset.
feature_columns

['building_id',
 'site_id',
 'meter',
 'primary_use',
 'square_feet',
 'year_built',
 'floor_count',
 'air_temperature',
 'cloud_coverage',
 'dew_temperature',
 'precip_depth_1_hr',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed',
 'DT_month',
 'DT_week_y',
 'DT_day_y',
 'DT_hour',
 'DT_day_w',
 'DT_day_m',
 'DT_week_m']

In [37]:
# Inspect the feature columns for every row from position 15000000 onward.
train_df.iloc[15000000:].loc[:, feature_columns]


Unnamed: 0,building_id,site_id,meter,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,...,sea_level_pressure,wind_direction,wind_speed,DT_month,DT_week_y,DT_day_y,DT_hour,DT_day_w,DT_day_m,DT_week_m
15000000,228,2,3,4,49108,-97,0,19.4,0.0,12.2,...,1016.6,110.0,2.1,11,45,57,9,1,8,2
15000001,229,2,0,0,140092,0,0,19.4,0.0,12.2,...,1016.6,110.0,2.1,11,45,57,9,1,8,2
15000002,229,2,1,0,140092,0,0,19.4,0.0,12.2,...,1016.6,110.0,2.1,11,45,57,9,1,8,2
15000003,230,2,0,0,10334,0,0,19.4,0.0,12.2,...,1016.6,110.0,2.1,11,45,57,9,1,8,2
15000004,231,2,0,0,33739,-109,0,19.4,0.0,12.2,...,1016.6,110.0,2.1,11,45,57,9,1,8,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17684649,144,1,0,6,57673,0,5,3.8,,2.4,...,1020.9,240.0,3.1,1,53,1,0,4,1,1
17684650,144,1,3,6,57673,0,5,3.8,,2.4,...,1020.9,240.0,3.1,1,53,1,0,4,1,1
17684651,145,1,0,6,14219,0,4,3.8,,2.4,...,1020.9,240.0,3.1,1,53,1,0,4,1,1
17684652,145,1,3,6,14219,0,4,3.8,,2.4,...,1020.9,240.0,3.1,1,53,1,0,4,1,1


In [35]:
#Save model