# Overview

- this is a baseline model to determine immediate performance
- will use gradient boost modelling using package lightgbm
- no data cleaning feature engineering to be used

In [11]:
# General Imports

import numpy as np
import pandas as pd
import pickle

import os, gc, sys, warnings, random, psycopg2

In [12]:
# Functions

def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

In [13]:
SEED = 42
LOCAL_TEST = False
seed_everything(SEED)
TARGET = 'meter_reading'

In [14]:
# open pickle files

train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')
submission = pd.read_pickle('submission.pkl')

In [15]:
#data check#data check
print("Main:",list(train_df),train_df.info())
print("#" * 20)
print("Main:",list(test_df),test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 23 columns):
 #   Column              Dtype         
---  ------              -----         
 0   building_id         int16         
 1   site_id             int8          
 2   meter               int8          
 3   meter_reading       float64       
 4   timestamp           datetime64[ns]
 5   primary_use         int8          
 6   square_feet         int32         
 7   year_built          int8          
 8   floor_count         int8          
 9   air_temperature     float64       
 10  cloud_coverage      float16       
 11  dew_temperature     float64       
 12  precip_depth_1_hr   float16       
 13  sea_level_pressure  float64       
 14  wind_direction      float16       
 15  wind_speed          float64       
 16  DT_month            int8          
 17  DT_week_y           int8          
 18  DT_day_y            int8          
 19  DT_hour             int8          
 20  

In [16]:
# Check memory usage

In [17]:
# Model Parameters
import lightgbm as lgb

lgb_params = {
    
    'objective':'regression',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'n_jobs':-1,
    'learning_rate':0.05,
    'num_leaves': 2**8,
    'max_depth':-1,
    'tree_learner':'serial',
    'colsample_bytree': 0.7,
    'subsample_freq':1,
    'subsample':0.7,
    'n_estimators':1000,
    'max_bin':255,
    'verbose':-1,
    'seed': SEED,
    'early_stopping_rounds':200,
    
}

In [18]:
#prepare training data

#choose columns to remove
remove_columns = [TARGET,'timestamp']
#get feature columns using list comprehension to remove columns - list(train_df)
feature_columns = [col for col in list(train_df) if col not in remove_columns]

#use np.log1 for distribution
#label is target

tr_data = lgb.Dataset(train_df[feature_columns],label=np.log1p(train_df[TARGET]))
eval_sets = [tr_data]

In [31]:
#train model

seed_everything(15)
lgb_params['seed'] = 15

estimator_cv = lgb.cv(
            lgb_params,
            tr_data,
            nfold = 10,
            verbose_eval = 100,
            stratified = False
      
)



[100]	cv_agg's rmse: 1.00148 + 0.0014867
[200]	cv_agg's rmse: 0.861561 + 0.0023735
[300]	cv_agg's rmse: 0.798965 + 0.00314787
[400]	cv_agg's rmse: 0.761996 + 0.00258984
[500]	cv_agg's rmse: 0.733848 + 0.00331273
[600]	cv_agg's rmse: 0.71013 + 0.00287672
[700]	cv_agg's rmse: 0.69082 + 0.00285381
[800]	cv_agg's rmse: 0.676294 + 0.0024015
[900]	cv_agg's rmse: 0.663047 + 0.0022589
[1000]	cv_agg's rmse: 0.651802 + 0.00191639


In [28]:
# Model Predictor

test_df = test_df[feature_columns]

predictions = []

predictions = list(np.expm1(estimator.predict(test_df[feature_columns])))

#Add to submission file

submission = pd.read_pickle('submission.pkl')

In [29]:
predictions

[5.7621228334969254,
 5.851886278383959,
 5.833846157403639,
 1.1264426846983764,
 6.864555523582044,
 6.27349831530689,
 1.4420282535706104,
 3.3700439231813153,
 4.423529397257761,
 86.28022254593765,
 4.225022680697785,
 3.2037766596470214,
 2.9573838443839793,
 17.196908441344387,
 7.228616237622666,
 5.039741276352686,
 4.0930856352966245,
 16.286725181051125,
 4.3374076736778235,
 111.19643006598127,
 2.1390014723854356,
 73.73991134311281,
 10.644540276637091,
 2.764849076643498,
 8.760610817479984,
 4.464301175543088,
 4.468989284419207,
 4.137238856922552,
 2.498509796700569,
 6.73444831721429,
 5.8679439880263775,
 3.1987626618673315,
 3.917593318711716,
 3.0697304116307347,
 4.181248290969576,
 34.44342777876032,
 3.9020261349337337,
 9.15222642766382,
 3.914522560553954,
 38.84298973360393,
 5.088249818649487,
 135.40081904349998,
 4.520005865014807,
 6.018211565749571,
 4.640758957164312,
 3.906286909590333,
 2.530558280231575,
 2.530558280231575,
 7.249541877125922,
 2.87

In [30]:
submission['meter_reading'] = predictions

In [31]:
submission

Unnamed: 0,row_id,meter_reading
0,0,5.762123
1,1,5.851886
2,2,5.833846
3,3,1.126443
4,4,6.864556
...,...,...
41697595,41697595,1.152228
41697596,41697596,0.056984
41697597,41697597,107.679693
41697598,41697598,297.466910
