## XGBoost modelling




In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [4]:
# Read the train, test and validation CSV files; header=0 takes the column
# names from the first row of each file.
# NOTE(review): `valitade_df` looks like a misspelling of "validate"; the name is
# kept unchanged because other cells may refer to it.
train_df, test_df, valitade_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/validate.csv')
)

In [5]:
# We'll impute missing values using the median for numeric columns and the most
# common value for string columns.
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for DataFrames.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its median. The class defines only ``fit``
    and ``transform``; any other method comes from ``TransformerMixin``.
    """

    @staticmethod
    def _fill_value(column):
        """Return the value used to replace missing entries of one column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() drops NaN by default, so a column with no
            # non-missing values has an empty result; indexing [0] on it would
            # raise IndexError. Fall back to NaN, which leaves the column as-is.
            return counts.index[0] if len(counts) else np.nan
        return column.median()

    def fit(self, X, y=None):
        """Compute one fill value per column of ``X`` and store them in ``self.fill``.

        Args:
            X: DataFrame whose columns are inspected.
            y: Unused; accepted for compatibility with the transformer API.

        Returns:
            self, so calls can be chained.
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the fitted fill values.

        Args:
            X: DataFrame to impute; columns are matched to ``self.fill`` by label.
            y: Unused; accepted for compatibility with the transformer API.
        """
        return X.fillna(self.fill)

In [6]:
# Names of the columns used as model inputs
feature_columns_to_use = [
    'wind',
    'demand',
    'photo',
    'temp',
    'thermo',
    'hydro_disp',
    'hydro_prod',
]


In [7]:
# Join the features from train and test together before imputing missing values,
# in case their distribution is slightly different.
# pd.concat is used instead of DataFrame.append, which was removed in pandas 2.0.
# As with append, train rows come first and each frame keeps its own row labels.
big_X = pd.concat([train_df[feature_columns_to_use],
                   test_df[feature_columns_to_use]])
big_X_imputed = DataFrameImputer().fit_transform(big_X)

In [8]:
# Build the model inputs: the target comes straight from the training frame,
# and the imputed feature matrix is split back by row position (train rows
# were stacked first, test rows after them).
train_y = train_df['price']
train_X = big_X_imputed.iloc[:len(train_df)].values
test_X = big_X_imputed.iloc[len(train_df):].values

In [9]:
# Train an XGBoost regressor; no hyperparameters are passed, so the library
# defaults apply. Printing the estimator shows the resulting settings.
model = xgb.XGBRegressor()
model.fit(X=train_X, y=train_y)
print(model)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [12]:
# Predict prices for the test features. The matrix is passed positionally
# because the keyword name of this argument differs between xgboost releases
# (`data` in older versions, `X` in newer ones).
output = model.predict(test_X)
# Pair each prediction with the date of the corresponding test row
prediction = pd.DataFrame({'date': test_df['date'],'price': output })
print(prediction)

                     Date     Price
0     2018-01-29 13:00:00  6.448314
1     2016-10-13 15:00:00  5.334073
2     2015-09-20 16:00:00  4.488747
3     2014-08-25 03:00:00  4.712053
4     2017-09-06 00:00:00  4.546413
5     2017-07-30 00:00:00  4.568891
6     2018-02-24 15:00:00  5.884620
7     2015-12-21 17:00:00  6.091932
8     2017-03-14 04:00:00  2.754140
9     2014-03-24 13:00:00  3.661605
10    2018-04-23 05:00:00  4.120895
11    2014-06-06 22:00:00  3.616781
12    2017-05-20 00:00:00  4.560211
13    2014-12-07 22:00:00  4.511248
14    2017-02-23 15:00:00  6.033588
15    2015-08-05 06:00:00  6.007342
16    2016-06-18 23:00:00  3.760328
17    2014-07-31 02:00:00  4.530462
18    2014-02-25 16:00:00  2.034354
19    2017-07-10 00:00:00  4.271337
20    2017-02-20 15:00:00  5.525524
21    2014-06-16 11:00:00  4.997105
22    2016-11-09 00:00:00  3.949631
23    2015-09-01 00:00:00  5.083560
24    2014-03-28 22:00:00  2.123763
25    2015-11-25 09:00:00  5.127535
26    2017-09-03 03:00:00  4

In [13]:
# Save the predictions as a CSV file, omitting the DataFrame's row index
prediction.to_csv(path_or_buf="prediction.csv", index=False)