# Overview

- this is a baseline model to determine immediate performance
- will use gradient boost modelling using package XGboost
- no data cleaning feature engineering to be used

In [1]:
# General Imports

import numpy as np
import pandas as pd
import pickle

import os, gc, sys, warnings, random, psycopg2

In [2]:
# Functions

def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

In [3]:
SEED = 42
LOCAL_TEST = False
seed_everything(SEED)
TARGET = 'meter_reading'

In [4]:
# open pickle files

train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')
submission = pd.read_pickle('submission.pkl')

In [5]:
#data check
print("Main:",list(train_df),train_df.info())
print("#" * 20)
print("Main:",list(test_df),test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 23 columns):
 #   Column              Dtype         
---  ------              -----         
 0   building_id         int16         
 1   site_id             int8          
 2   meter               int8          
 3   meter_reading       float64       
 4   timestamp           datetime64[ns]
 5   primary_use         int8          
 6   square_feet         int32         
 7   year_built          int8          
 8   floor_count         int8          
 9   air_temperature     float64       
 10  cloud_coverage      float16       
 11  dew_temperature     float64       
 12  precip_depth_1_hr   float16       
 13  sea_level_pressure  float64       
 14  wind_direction      float16       
 15  wind_speed          float64       
 16  DT_month            int8          
 17  DT_week_y           int8          
 18  DT_day_y            int8          
 19  DT_hour             int8          
 20  

In [6]:
train_df.isnull().values.any()

True

In [7]:
train_df.isnull().sum().sum()

15595579

In [8]:
#count how many na's in each column

for col in train_df.columns:
    a = col
    b = str(train_df[col].isnull().sum().sum())
    print(a + " " + b) 

building_id 0
site_id 0
meter 0
meter_reading 0
timestamp 0
primary_use 0
square_feet 0
year_built 0
floor_count 0
air_temperature 96658
cloud_coverage 8825365
dew_temperature 100140
precip_depth_1_hr 3749023
sea_level_pressure 1231669
wind_direction 1449048
wind_speed 143676
DT_month 0
DT_week_y 0
DT_day_y 0
DT_hour 0
DT_day_w 0
DT_day_m 0
DT_week_m 0


In [9]:
train_df.dropna(axis = 0, how = 'any', inplace = True)

In [10]:
#check data
print("Main:",list(train_df),train_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8955945 entries, 2080 to 20214784
Data columns (total 23 columns):
 #   Column              Dtype         
---  ------              -----         
 0   building_id         int16         
 1   site_id             int8          
 2   meter               int8          
 3   meter_reading       float64       
 4   timestamp           datetime64[ns]
 5   primary_use         int8          
 6   square_feet         int32         
 7   year_built          int8          
 8   floor_count         int8          
 9   air_temperature     float64       
 10  cloud_coverage      float16       
 11  dew_temperature     float64       
 12  precip_depth_1_hr   float16       
 13  sea_level_pressure  float64       
 14  wind_direction      float16       
 15  wind_speed          float64       
 16  DT_month            int8          
 17  DT_week_y           int8          
 18  DT_day_y            int8          
 19  DT_hour             int8          
 20

In [11]:
#count how many na's in each column

for col in train_df.columns:
    a = col
    b = str(train_df[col].isnull().sum().sum())
    print(a + " " + b) 

building_id 0
site_id 0
meter 0
meter_reading 0
timestamp 0
primary_use 0
square_feet 0
year_built 0
floor_count 0
air_temperature 0
cloud_coverage 0
dew_temperature 0
precip_depth_1_hr 0
sea_level_pressure 0
wind_direction 0
wind_speed 0
DT_month 0
DT_week_y 0
DT_day_y 0
DT_hour 0
DT_day_w 0
DT_day_m 0
DT_week_m 0


In [12]:
#prepare training data
import xgboost as xgb

#choose columns to remove
remove_columns = [TARGET,'timestamp']
#get feature columns using list comprehension to remove columns - list(train_df)
feature_columns = [col for col in list(train_df) if col not in remove_columns]

#use np.log1 for distribution
#label is target

tr_data = xgb.DMatrix(np.log1p(train_df[feature_columns]),label=np.log1p(train_df[TARGET]))
#tr_data = xgb.DMatrix(train_df[feature_columns],label=train_df[TARGET])

In [13]:
# Model Parameters


xgb_param = {'max_depth': 25, 
         'eta': 0.3, 
         'objective': 'reg:squaredlogerror'} 
    
bst = xgb.train(xgb_param,tr_data,10)

In [14]:
tst_data = xgb.DMatrix(np.log1p(train_df[feature_columns]))

In [15]:
predictions = list(np.expm1(bst.predict(tst_data)))

In [16]:
results = pd.DataFrame(data = {'actual':train_df[TARGET],'predictions':predictions})

In [17]:
results.head(100)

Unnamed: 0,actual,predictions
2080,8.5000,3.835476
2081,0.5210,0.532797
2082,243.5000,59.217979
2083,79.4880,51.994385
2084,16.7500,15.047709
...,...,...
2264,324.2260,25.427666
2265,43.2803,5.533868
2266,13.1000,17.292320
2267,20.9333,13.752945


https://www.kaggle.com/volker48/xgboost-custom-rmsle-eval-function

In [18]:
xgb.cv(xgb_param,tr_data,num_boost_round=10,nfold=5,verbose_eval=True)

[0]	train-rmsle:1.11149+0.00004	test-rmsle:1.11257+0.00012
[1]	train-rmsle:0.97633+0.00006	test-rmsle:0.97860+0.00013
[2]	train-rmsle:0.85030+0.00007	test-rmsle:0.85378+0.00013
[3]	train-rmsle:nan+nan	test-rmsle:nan+nan
[4]	train-rmsle:nan+nan	test-rmsle:nan+nan
[5]	train-rmsle:nan+nan	test-rmsle:nan+nan
[6]	train-rmsle:nan+nan	test-rmsle:nan+nan
[7]	train-rmsle:nan+nan	test-rmsle:nan+nan
[8]	train-rmsle:nan+nan	test-rmsle:nan+nan
[9]	train-rmsle:nan+nan	test-rmsle:nan+nan


Unnamed: 0,train-rmsle-mean,train-rmsle-std,test-rmsle-mean,test-rmsle-std
0,1.111489,4.2e-05,1.112566,0.000123
1,0.976332,6.1e-05,0.978602,0.000129
2,0.850302,6.9e-05,0.853784,0.000135
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,
