# Overview

- this is a baseline model to determine immediate performance
- will use gradient boost modelling using package XGboost
- no data cleaning feature engineering to be used

In [1]:
# General Imports

import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import os, gc, sys, warnings, random, psycopg2

In [2]:
# Functions

def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

In [3]:
SEED = 42
LOCAL_TEST = False
seed_everything(SEED)
TARGET = 'meter_reading'

In [4]:
# open pickle files

train_df = pd.read_pickle('train_df.pkl')

#choose columns to remove
remove_columns = [TARGET,'timestamp']
#get feature columns using list comprehension to remove columns - list(train_df)
feature_columns = [col for col in list(train_df) if col not in remove_columns]

X = train_df[feature_columns]
y = train_df[TARGET] 

# split to test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

In [15]:
#prepare training data
import xgboost as xgb



#label is target

tr_data = xgb.DMatrix(X_train,label=y_train)
tst_data = xgb.DMatrix(X_test, label=y_test)

In [16]:
#set paramteres

xgb_param = {'max_depth': 25, 
         'eta': 0.3, 
         'objective': 'reg:squarederror',
         'eval_metric': 'rmse'    
         } 

num_boost_round = 999

In [17]:
#use cv to tune the parameters and then update them for train
cv_results = xgb.cv(xgb_param,
       tr_data,
       num_boost_round=num_boost_round,
       nfold=5,
       seed=52,
       metrics={'rmse'},
       early_stopping_rounds=10,
       verbose_eval=True)

[0]	train-rmse:110386.58438+893.15215	test-rmse:111923.52500+4446.32541
[1]	train-rmse:80010.31562+618.22107	test-rmse:83180.77969+3613.68014


KeyboardInterrupt: 

In [None]:
# Model Parameters
#https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f



bst = xgb.train(xgb_param,
                tr_data,
                num_boost_round=num_boost_round,
                evals=[(tr_data,"Test")],
                early_stopping_rounds=10)

In [48]:
predictions = list(bst.predict(tst_data))
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(rmse)

26487.644298650834


In [49]:
results = pd.DataFrame(data = {'actual':y_test,'predictions':predictions})

In [50]:
results

Unnamed: 0,actual,predictions
8394036,113.0500,106.666237
16429056,234.2940,210.772049
8436046,70.8600,70.217094
19113226,195.0500,195.924042
15057294,8.5900,6.816153
...,...,...
2966663,10.6000,10.273479
17043557,61.2167,58.502518
11613824,55.0400,53.045090
3234452,156.4000,135.581802


In [42]:
# plot the two against each other

#bring two dataframes together

print(len(predictions))

20216100


In [43]:
print(train_df.shape[0])

20216100


https://www.kaggle.com/volker48/xgboost-custom-rmsle-eval-function