In [4]:
import pandas as pd
import numpy as np
import datetime
from copy import deepcopy

import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family'] = 'DejaVu Sans'
matplotlib.style.use('ggplot')
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_pickle('features.pkl')
data.head()

Unnamed: 0,price_open,price_high,price_low,price_close,volume_traded,trades_count,hour,price_open_-1,price_high_-1,price_low_-1,...,price_low_-58,price_close_-58,volume_traded_-58,trades_count_-58,price_open_-59,price_high_-59,price_low_-59,price_close_-59,volume_traded_-59,trades_count_-59
59,7154.97,7154.97,7154.97,7154.97,2.0,2,0,7163.3,7163.3,7163.3,...,7168.3,7168.3,1.0,2.0,7165.72,7165.72,7165.71,7165.71,0.021841,2.0
60,7161.2,7163.4,7161.2,7163.4,0.0158,2,0,7154.97,7154.97,7154.97,...,7170.5,7170.5,0.002,1.0,7168.3,7168.3,7168.3,7168.3,1.0,2.0
61,7154.98,7154.98,7154.97,7154.98,0.038357,3,0,7161.2,7163.4,7161.2,...,7169.2,7169.2,0.004,2.0,7170.5,7170.5,7170.5,7170.5,0.002,1.0
62,7154.98,7154.98,7154.98,7154.98,0.032201,1,0,7154.98,7154.98,7154.97,...,7169.2,7169.2,0.002,1.0,7169.2,7169.2,7169.2,7169.2,0.004,2.0
63,7154.97,7154.97,7154.97,7154.97,2.0,1,0,7154.98,7154.98,7154.98,...,7165.72,7169.2,0.075433,3.0,7169.2,7169.2,7169.2,7169.2,0.002,1.0


In [3]:
target = 'price_high'
features = list(data.columns)

for feat in features:
    scaler = MinMaxScaler()
    data[feat] = scaler.fit_transform(np.array(data[feat]).reshape(-1, 1))
    if feat == target:
        target_scaler = deepcopy(scaler)

features.remove(target)
        
data.head()

Unnamed: 0,price_open,price_high,price_low,price_close,volume_traded,trades_count,hour,price_open_-1,price_high_-1,price_low_-1,...,price_low_-58,price_close_-58,volume_traded_-58,trades_count_-58,price_open_-59,price_high_-59,price_low_-59,price_close_-59,volume_traded_-59,trades_count_-59
59,0.183576,0.183576,0.291301,0.131212,0.030762,0.011494,0.0,0.331323,0.331323,0.419554,...,0.496536,0.300203,0.015381,0.011494,0.374246,0.374246,0.456659,0.267368,0.000336,0.011494
60,0.294076,0.333097,0.387221,0.238083,0.000243,0.011494,0.0,0.183576,0.183576,0.291301,...,0.530408,0.328093,3.1e-05,0.0,0.420007,0.420007,0.496536,0.300203,0.015381,0.011494
61,0.183753,0.183753,0.291301,0.131339,0.00059,0.022989,0.0,0.294076,0.333097,0.387221,...,0.510393,0.311613,6.1e-05,0.011494,0.459028,0.459028,0.530408,0.328093,3.1e-05,0.0
62,0.183753,0.183753,0.291455,0.131339,0.000495,0.0,0.0,0.183753,0.183753,0.291301,...,0.510393,0.311613,3.1e-05,0.0,0.43597,0.43597,0.510393,0.311613,6.1e-05,0.011494
63,0.183576,0.183576,0.291301,0.131212,0.030762,0.0,0.0,0.183753,0.183753,0.291455,...,0.456813,0.311613,0.00116,0.022989,0.43597,0.43597,0.510393,0.311613,3.1e-05,0.0


In [5]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [6, 8, 10],
    'max_features': [2, 4, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [2, 4, 8, 12],
    'n_estimators': [100, 200, 300, 1000],
    'eval_metric': ['mae'],
    'min_child_weight': [1, 3, 5],
    'colsample_bytree': [1],
    'objective': ['reg:linear']
}

# Create a based model
model = xgb.XGBRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, 
                           cv = 3, n_jobs = -1, verbose = 2)

# train test split
train, test = train_test_split(data, test_size=1 / 3, random_state=99)  # test_size=0.2

In [None]:
%%time
# Fit the grid search to the data
grid_search.fit(train[features], train[target])

print('Best params: ', grid_search.best_params_)

best_grid = grid_search.best_estimator_

Fitting 3 folds for each of 1296 candidates, totalling 3888 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed: 24.8min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 42.5min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 65.3min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 91.5min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 120.2min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 154.4min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed: 191.1min
[Parallel(n_jobs=-1)]: Done 3888 out of 3888 | elapsed: 226.6min finished


Parameters: { bootstrap, max_features, min_samples_leaf, min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Best params:  {'bootstrap': True, 'colsample_bytree': 1, 'eval_metric': 'mae', 'max_depth': 8, 'max_features': 2, 'min_child_weight': 3, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 1000, 'objective': 'reg:linear'}
CPU times: user 1min 51s, sys: 2.5 s, total: 1min 53s
Wall time: 3h 46min 43s


In [8]:
model = xgb.XGBRegressor(bootstrap= True, colsample_bytree=1, eval_metric='mae', max_depth=8, 
                         max_features=2, min_child_weight=3, min_samples_leaf=3, min_samples_split=2, 
                         n_estimators=1000, objective='reg:linear')
model.fit(train[features], train[target])

Parameters: { bootstrap, max_features, min_samples_leaf, min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [16]:
predictions = model.predict(test[features])
predictions_unscaled = target_scaler.inverse_transform(np.array(predictions).reshape(-1, 1))
actuals = target_scaler.inverse_transform(np.array(test[target]).reshape(-1, 1))

mae = mean_absolute_error(actuals, predictions)

In [17]:
mae

7188.396244508733

In [11]:
target_scaler.inverse_transform(np.array(mae).reshape(-1, 1))[0]

array([7144.74836732])

In [13]:
predictions

array([0.99997187, 1.0010693 , 0.62430817, ..., 1.0000545 , 0.99997   ,
       0.17393327], dtype=float32)

In [10]:

mae = evaluate(best_grid, test[features], test[target])
print("MAE, original scale: ", target_scaler.inverse_transform(np.array(mae).reshape(-1, 1))[0])

NameError: name 'evaluate' is not defined