# Overview

- This is a baseline model to establish an initial level of performance.
- It uses gradient boosting via the XGBoost package.
- No data cleaning or feature engineering is applied.

In [35]:
# General Imports

import os, gc, sys, warnings, random, psycopg2
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [36]:
# Functions

def seed_everything(seed=0):
    """Seed the random number generators used by this notebook.

    Applies the same seed to Python's built-in ``random`` module and to
    NumPy's global generator.

    Parameters
    ----------
    seed : int, default 0
        Value passed to ``random.seed`` and ``np.random.seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [37]:
# Notebook-wide configuration.
TARGET = 'meter_reading'  # column used as the label (y) when the data is split
LOCAL_TEST = False        # flag; not read by any other cell shown in this notebook
SEED = 42                 # seed handed to seed_everything() below

# Seed Python's `random` module and NumPy's global generator.
seed_everything(SEED)

In [38]:
# Load the training frame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files you trust.
train_df = pd.read_pickle('train_df.pkl')

# Columns that are not used as model inputs: the label and 'timestamp'.
remove_columns = [TARGET, 'timestamp']

# Every other column, in its original order, is treated as a feature.
feature_columns = [name for name in train_df.columns if name not in remove_columns]

# Feature matrix and label vector.
X = train_df[feature_columns]
y = train_df[TARGET]

# Hold out 30% of the rows for testing.
# NOTE(review): the split uses its own fixed random_state (50) rather than SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=50,
)

In [39]:
# Prepare the training data: wrap the training features in an XGBoost DMatrix,
# attaching the target values as the label.
tr_data = xgb.DMatrix(data=X_train, label=y_train)

In [46]:
# Model parameters for the baseline booster.
# NOTE(review): max_depth=25 allows very deep trees; the cross-validation log
# further down in this notebook shows train RMSE far below test RMSE, which may
# indicate overfitting - confirm before tuning further.
xgb_param = {
    'max_depth': 25,
    'eta': 0.3,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
}

# Train for 10 boosting rounds on the training DMatrix.
bst = xgb.train(params=xgb_param, dtrain=tr_data, num_boost_round=10)

In [47]:
# Wrap the held-out features in a DMatrix (no label attached) for prediction.
tst_data = xgb.DMatrix(data=X_test)

In [48]:
# Predict on the held-out set and score with root-mean-squared error.
raw_predictions = bst.predict(tst_data)
predictions = list(raw_predictions)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(rmse)

26487.644298650834


In [49]:
# Side-by-side table of the true test values and the model's predictions.
results = pd.DataFrame({
    'actual': y_test,
    'predictions': predictions,
})

In [50]:
# Show the actual-vs-predictions table as the cell output.
results

Unnamed: 0,actual,predictions
8394036,113.0500,106.666237
16429056,234.2940,210.772049
8436046,70.8600,70.217094
19113226,195.0500,195.924042
15057294,8.5900,6.816153
...,...,...
2966663,10.6000,10.273479
17043557,61.2167,58.502518
11613824,55.0400,53.045090
3234452,156.4000,135.581802


In [42]:
# Intended next step (per the original notes): plot actual vs. predictions and
# bring the two frames together. For now this cell only prints how many
# predictions exist.
# NOTE(review): the recorded output equals the full row count of train_df even
# though predictions are made on the 30% test split - the saved output looks
# stale; re-run the notebook top to bottom to confirm.
n_predictions = len(predictions)
print(n_predictions)

20216100


In [43]:
# Number of rows in the full training frame.
print(len(train_df))

20216100


Reference - custom RMSLE evaluation function for XGBoost: https://www.kaggle.com/volker48/xgboost-custom-rmsle-eval-function

In [44]:
# 5-fold cross-validation of the same parameters over 20 boosting rounds.
# verbose_eval=True prints the per-round metrics; the returned table is the cell output.
xgb.cv(
    params=xgb_param,
    dtrain=tr_data,
    num_boost_round=20,
    nfold=5,
    verbose_eval=True,
)

[0]	train-rmse:109775.09687+477.38759	test-rmse:110495.32969+2434.74336
[1]	train-rmse:79306.97500+380.20961	test-rmse:81403.18281+2420.67661
[2]	train-rmse:57877.35156+270.59656	test-rmse:61459.84062+2292.99215
[3]	train-rmse:42751.29609+221.21715	test-rmse:48141.68750+2357.36038
[4]	train-rmse:31961.70898+131.69323	test-rmse:39338.06875+2268.06578
[5]	train-rmse:24217.41289+50.44417	test-rmse:33786.16602+2124.59191
[6]	train-rmse:18660.84258+122.62555	test-rmse:30427.97070+2076.16589
[7]	train-rmse:14631.93340+180.31027	test-rmse:28369.88555+2046.51516
[8]	train-rmse:11584.62656+242.45792	test-rmse:27191.40352+2050.81873
[9]	train-rmse:9390.55762+405.47545	test-rmse:26391.10430+1987.64158
[10]	train-rmse:7653.99336+392.41488	test-rmse:25912.57188+1939.75736
[11]	train-rmse:6327.53682+381.18544	test-rmse:25626.30273+1885.94231
[12]	train-rmse:5334.71729+363.70321	test-rmse:25442.33203+1845.11880
[13]	train-rmse:4585.52480+350.71768	test-rmse:25356.35937+1823.60817
[14]	train-rmse:3993

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,109775.096875,477.387593,110495.329688,2434.743363
1,79306.975,380.209608,81403.182813,2420.676608
2,57877.351563,270.596558,61459.840625,2292.992145
3,42751.296094,221.217154,48141.6875,2357.360378
4,31961.708984,131.693234,39338.06875,2268.065784
5,24217.41289,50.444175,33786.166015,2124.591911
6,18660.842578,122.625552,30427.970703,2076.165895
7,14631.933398,180.31027,28369.885547,2046.515163
8,11584.626562,242.457924,27191.403516,2050.81873
9,9390.557617,405.475446,26391.104297,1987.641579
