# Overview

- this is a baseline model to determine immediate performance
- will use gradient boost modelling using package XGboost
- no data cleaning feature engineering to be used

In [2]:
# General Imports

import numpy as np
import pandas as pd
import pickle

import os, gc, sys, warnings, random, psycopg2

In [3]:
# Functions

def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

In [4]:
SEED = 42
LOCAL_TEST = False
seed_everything(SEED)
TARGET = 'meter_reading'

In [5]:
# open pickle files

train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')
submission = pd.read_pickle('submission.pkl')

In [6]:
#data check#data check
print("Main:",list(train_df),train_df.info())
print("#" * 20)
print("Main:",list(test_df),test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 23 columns):
 #   Column              Dtype         
---  ------              -----         
 0   building_id         int16         
 1   site_id             int8          
 2   meter               int8          
 3   meter_reading       float64       
 4   timestamp           datetime64[ns]
 5   primary_use         int8          
 6   square_feet         int32         
 7   year_built          int8          
 8   floor_count         int8          
 9   air_temperature     float64       
 10  cloud_coverage      float16       
 11  dew_temperature     float64       
 12  precip_depth_1_hr   float16       
 13  sea_level_pressure  float64       
 14  wind_direction      float16       
 15  wind_speed          float64       
 16  DT_month            int8          
 17  DT_week_y           int8          
 18  DT_day_y            int8          
 19  DT_hour             int8          
 20  

In [7]:
# Check memory usage

In [8]:
#prepare training data
import xgboost as xgb

#choose columns to remove
remove_columns = [TARGET,'timestamp']
#get feature columns using list comprehension to remove columns - list(train_df)
feature_columns = [col for col in list(train_df) if col not in remove_columns]

#label is target

tr_data = xgb.DMatrix(train_df[feature_columns],label=train_df[TARGET])

In [11]:
# Model Parameters


xgb_param = {'max_depth': 2, 
         'eta': 1, 
         'objective': 'reg:squaredlogerror',
         'eval_metric': 'rmse'} 
    
bst = xgb.train(xgb_param,tr_data,10)

In [15]:
tst_data = xgb.DMatrix(train_df[feature_columns])

In [18]:
predictions = list(bst.predict(tst_data))

In [19]:
predictions

[15.837857,
 11.009952,
 11.009952,
 11.009952,
 20.029297,
 31.491367,
 11.009952,
 20.029297,
 31.491367,
 31.491367,
 20.029297,
 41.735054,
 20.029297,
 20.029297,
 31.491367,
 31.491367,
 31.491367,
 31.424545,
 15.837857,
 31.491367,
 20.029297,
 31.491367,
 11.009952,
 11.009952,
 31.491367,
 31.491367,
 31.491367,
 20.029297,
 31.491367,
 20.029297,
 31.491367,
 31.491367,
 20.029297,
 20.029297,
 20.029297,
 20.029297,
 20.583996,
 20.583996,
 15.837857,
 31.491367,
 20.029297,
 31.491367,
 41.735054,
 31.491367,
 -8.377839,
 15.837857,
 15.161936,
 20.029297,
 20.029297,
 11.009952,
 41.735054,
 31.491367,
 31.491367,
 11.009952,
 20.029297,
 31.491367,
 31.491367,
 31.491367,
 31.491367,
 31.491367,
 20.029297,
 20.029297,
 20.029297,
 20.583996,
 20.583996,
 20.029297,
 20.029297,
 41.735054,
 41.735054,
 41.735054,
 41.735054,
 41.735054,
 41.735054,
 41.735054,
 31.491367,
 31.491367,
 20.029297,
 31.491367,
 20.029297,
 41.735054,
 11.009952,
 15.837857,
 -8.377839,
 11.

In [20]:
train_df.head()

Unnamed: 0,building_id,site_id,meter,meter_reading,timestamp,primary_use,square_feet,year_built,floor_count,air_temperature,...,sea_level_pressure,wind_direction,wind_speed,DT_month,DT_week_y,DT_day_y,DT_hour,DT_day_w,DT_day_m,DT_week_m
0,222,2,1,11.1132,2016-01-01,1,11935,-109,0,15.6,...,1015.3,270.0,3.6,1,53,1,0,4,1,1
1,0,0,0,0.0,2016-01-01,0,7432,-40,0,25.0,...,1019.7,0.0,0.0,1,53,1,0,4,1,1
2,1,0,0,0.0,2016-01-01,0,2720,-44,0,25.0,...,1019.7,0.0,0.0,1,53,1,0,4,1,1
3,2,0,0,0.0,2016-01-01,0,5376,-57,0,25.0,...,1019.7,0.0,0.0,1,53,1,0,4,1,1
4,3,0,0,0.0,2016-01-01,0,23685,-46,0,25.0,...,1019.7,0.0,0.0,1,53,1,0,4,1,1


In [23]:
# plot the two against each other

#bring two dataframes together

print(len(predictions))

20216100


In [24]:
print(train_df.shape[0])

20216100


In [26]:
pd.concat(train_df,predictions)

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

https://www.kaggle.com/volker48/xgboost-custom-rmsle-eval-function

In [None]:
xgb.cv(xgb_param,tr_data,num_boost_round=10,nfold=5,verbose_eval=True)