https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f

## Importing modules
### Seperated between libraries to keep track

In [6]:
import numpy as np

In [7]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)  

In [8]:
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
from xgboost import XGBRegressor

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

In [11]:
from hyperopt import hp
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Function Building

In [63]:
"""CORRELATED FEATURES"""
"""
train = pd.read_csv("../data/train_correlated.csv")

X_test = pd.read_csv("../data/test_correlated.csv")
X_test_id = pd.read_csv("../data/raw_test.csv")["Id"]

headers = X_test.columns

y_all = train["SalePrice"]
X_all = train.drop(columns=["SalePrice"])[headers]
"""

In [141]:
"""ALL FEATURES"""
#"""
train = pd.read_csv("../data/data_all_features.csv")

X_test = pd.read_csv("../data/test_all_features.csv")
X_test_id = pd.read_csv("../data/raw_test.csv")["Id"]

headers = X_test.columns

y_all = train["SalePrice"]
X_all = train.drop(columns=["SalePrice"])[headers]
#"""

In [142]:
X, X_hold, y, y_hold = train_test_split(X_all,y_all)

In [143]:
X_hold = X_hold.reset_index()
y_hold = y_hold.reset_index()

original_index = y_hold["index"]
X_hold = X_hold.drop(columns = ["index"])
y_hold = y_hold.drop(columns = ["index"])

In [144]:
def XGB_val_in_hyperopt(XGBModel, X, y, early_stop_num = 10):
    X_train, X_valid, y_train, y_valid = train_test_split(X,y)
    evaluation = [( X_train, y_train), ( X_valid, y_valid)]
    
    XGBModel.fit(X_train, y_train,
                eval_set=evaluation, 
                eval_metric="mae",
                early_stopping_rounds=early_stop_num,
                verbose=False
                )
    predict = XGBModel.predict(X_valid)
    accuracy = XGBModel.predict(X_valid)
    n_est_used = len(XGBModel.get_booster().get_dump())
    
    return accuracy, n_est_used

In [145]:
"""Linear Regressor"""
def linear_objective(space):
    clf=XGBRegressor(
                    n_estimators = int(space['n_estimators']), 
                    max_depth = int(space['max_depth']), 
                    gamma = space['gamma'],
                    reg_alpha = space['reg_alpha'],
                    reg_lambda = space['reg_lambda'],
                    min_child_weight= space['min_child_weight'],
                    colsample_bytree= space['colsample_bytree'],
    )
    
    
    acc = []
    n_est = []
    for i in range(5):
        accuracy, n_est_used = XGB_val_in_hyperopt(XGBModel = clf, X = X, y = y, early_stop_num = 10)
        acc.append(accuracy)
        n_est.append(n_est_used)
    
    accuracy = np.mean(acc)
    n_est_used = int(np.mean(n_est))
    
    trials.results[-1]["n_estimators"] = n_est_used
    
    trials.trials[-1]["misc"]["vals"]["n_estimators"] = [n_est_used]
    
    return {'loss': accuracy, 'status': STATUS_OK, "n_estimators" : n_est_used}

In [146]:
def XGB_optimised_model(X, y, X_test, num_search = 100):
    global trials
    trials = Trials()

    best_hyperparams = fmin(fn = linear_objective,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = num_search,
                            trials = trials)
    clf=XGBRegressor(
                n_estimators = int(best_hyperparams['n_estimators']), 
                max_depth = int(best_hyperparams['max_depth']), 
                gamma = best_hyperparams['gamma'],
                reg_alpha = best_hyperparams['reg_alpha'],
                reg_lambda = best_hyperparams['reg_lambda'],
                min_child_weight = best_hyperparams['min_child_weight'],
                colsample_bytree = best_hyperparams['colsample_bytree']
    )

    clf.fit(X, y, 
            eval_metric="mae",
            verbose=True
    )

    predict = clf.predict(X_test)

    return predict, best_hyperparams, clf

In [157]:
def XGB_optimised_ensamble(X, y, X_test, num_search = 10, num_models = 10):
    predictions = {}
    parameters = {}
    models = {}
    
    for i in range(num_models):
        predictions["model_"+str(i)], parameters["model_"+str(i)], models["model_"+str(i)] = XGB_optimised_model(X, y, X_test, num_search)
    
    return predictions, parameters, models

In [148]:
def test_ensamble(ensamble_models, X_test):
    n_models = len(ensamble_models)
    predictions = pd.DataFrame()
    for i in range(n_models):
        model = "model_"+str(i)
        clf = ensamble_models[model]
        predictions[model] = clf.predict(X_test)
    
    return predictions

In [149]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform('n_estimators', 999, 1000, 1),
        'seed': 0
    }

In [158]:
predictions, parameters, models = XGB_optimised_ensamble(X, y, X_test = X_hold, num_search = 10, num_models = 25)

100%|█████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.49s/trial, best loss: 173553.171875]
100%|████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.53s/trial, best loss: 175123.625]
100%|█████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.31s/trial, best loss: 175811.203125]
100%|█████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.89s/trial, best loss: 177568.796875]
100%|█████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.57s/trial, best loss: 176849.953125]
100%|████████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.42s/trial, best loss: 176450.375]
100%|███████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.44s/trial, best loss: 176179.8125]
100%|█████████████████████████████████████████████████████| 10/10 [00:17<00:00,  1.76s/trial, best loss: 176307.359375]
100%|███████████████████████████████████

In [159]:
model_predictions = pd.DataFrame(predictions)
comparison = pd.DataFrame({"Original Index":original_index,"Predictions":model_predictions.mean(axis=1), "SalePrice":y_hold["SalePrice"]})
comparison["Delta"] = comparison["Predictions"] - comparison["SalePrice"]
model_predictions

Unnamed: 0,model_0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9,model_10,model_11,model_12,model_13,model_14,model_15,model_16,model_17,model_18,model_19,model_20,model_21,model_22,model_23,model_24
0,116487.960938,103206.921875,104514.140625,104773.546875,115318.226562,121210.531250,119245.890625,111304.351562,111997.960938,105245.015625,112676.898438,119544.156250,123750.093750,124337.992188,110668.882812,123043.257812,95377.742188,114509.281250,94029.859375,110734.757812,103170.867188,109821.445312,118185.921875,123101.539062,105985.023438
1,125141.781250,123633.421875,121179.210938,126701.250000,124956.679688,121055.320312,128995.726562,126220.531250,120031.960938,122789.289062,132471.062500,124758.093750,129094.179688,126254.406250,123292.648438,120408.593750,119430.445312,122249.007812,123815.523438,122231.000000,121989.195312,118337.554688,119682.375000,120172.882812,122410.445312
2,206864.875000,194325.406250,197708.937500,198844.828125,199788.750000,191772.921875,198233.781250,201761.000000,199497.031250,195278.937500,199856.031250,198263.859375,185067.875000,190552.359375,196675.203125,196249.171875,198045.703125,207783.375000,195725.515625,203753.937500,201227.796875,195409.828125,198497.203125,194215.062500,204184.140625
3,103524.640625,105703.437500,102202.265625,101708.828125,102530.398438,96980.671875,101058.648438,113002.492188,105224.007812,100522.507812,112484.289062,94093.945312,109944.218750,88206.226562,115261.289062,103540.929688,111792.320312,113811.523438,93456.359375,91463.390625,102542.570312,112777.851562,102536.039062,106839.750000,103362.867188
4,367236.437500,359616.375000,379367.375000,345177.593750,363835.218750,384798.687500,340320.718750,382797.562500,344277.312500,327649.281250,357743.593750,359257.312500,350931.468750,390838.062500,385527.562500,401689.625000,367089.343750,396081.625000,351333.593750,327499.281250,363065.312500,371907.500000,333345.437500,385893.093750,343375.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,241804.046875,218173.296875,243411.296875,257302.546875,263500.937500,250308.468750,267905.125000,235453.265625,266252.781250,241446.406250,243405.812500,247906.468750,254020.015625,222104.531250,239914.296875,242079.906250,272071.281250,245664.359375,258678.406250,239325.656250,253382.703125,235220.453125,255307.703125,256572.593750,257676.109375
361,175195.390625,157460.328125,166826.312500,168547.000000,165184.765625,162054.187500,151419.625000,171104.437500,179571.046875,167579.812500,167879.640625,177027.171875,168607.343750,180323.890625,157733.187500,165965.406250,171723.875000,191083.218750,170732.031250,165274.843750,157996.156250,167958.343750,172594.437500,168776.062500,192969.578125
362,72386.804688,82526.000000,91518.929688,69293.210938,68400.070312,81781.156250,68182.796875,64176.578125,66230.046875,80587.414062,88980.812500,69772.250000,73704.835938,72111.929688,73005.367188,57042.308594,81397.976562,70975.726562,78824.765625,90162.906250,79641.406250,85809.562500,63457.332031,89176.148438,83478.296875
363,294154.375000,311966.156250,329933.687500,311088.906250,290998.750000,311651.562500,330733.968750,298421.500000,291216.281250,312810.062500,313812.656250,309046.625000,302599.406250,272451.125000,309589.812500,281290.375000,329881.125000,323429.187500,300391.250000,293026.937500,296490.593750,301367.687500,286511.468750,311767.406250,309960.531250


In [165]:
parameters

{'model_0': {'colsample_bytree': 0.7280589520464351,
  'gamma': 2.6794250756879725,
  'max_depth': 14.0,
  'min_child_weight': 1.0,
  'n_estimators': 38,
  'reg_alpha': 54.0,
  'reg_lambda': 0.4956620016168255},
 'model_1': {'colsample_bytree': 0.8527358634297997,
  'gamma': 6.135842656359645,
  'max_depth': 11.0,
  'min_child_weight': 4.0,
  'n_estimators': 36,
  'reg_alpha': 141.0,
  'reg_lambda': 0.44448047181369166},
 'model_2': {'colsample_bytree': 0.9109172944862352,
  'gamma': 7.600947941044651,
  'max_depth': 14.0,
  'min_child_weight': 10.0,
  'n_estimators': 36,
  'reg_alpha': 105.0,
  'reg_lambda': 0.6164205284659913},
 'model_3': {'colsample_bytree': 0.6977891404609307,
  'gamma': 1.5875290283488779,
  'max_depth': 15.0,
  'min_child_weight': 4.0,
  'n_estimators': 25,
  'reg_alpha': 51.0,
  'reg_lambda': 0.9739438097477965},
 'model_4': {'colsample_bytree': 0.6058852891913242,
  'gamma': 4.966853114516098,
  'max_depth': 7.0,
  'min_child_weight': 2.0,
  'n_estimators': 35

In [160]:
comparison["Delta"] = np.abs(comparison["Predictions"] - comparison["SalePrice"])
comparison["Error"] = comparison["Delta"]/comparison["SalePrice"]

In [161]:
comparison.sort_values("Error", ascending=False)

Unnamed: 0,Original Index,Predictions,SalePrice,Delta,Error
146,30,88968.148438,40000,48968.148438,1.224204
220,1324,307131.875000,147000,160131.875000,1.089332
226,916,72655.500000,35311,37344.500000,1.057588
8,812,102437.773438,55993,46444.773438,0.829475
42,874,106233.960938,66500,39733.960938,0.597503
...,...,...,...,...,...
276,1095,176274.687500,176432,157.312500,0.000892
0,202,112089.687500,112000,89.687500,0.000801
5,1295,138554.203125,138500,54.203125,0.000391
240,48,113030.203125,113000,30.203125,0.000267


In [162]:
comparison.describe()

Unnamed: 0,Original Index,Predictions,SalePrice,Delta,Error
count,365.0,365.0,365.0,365.0,365.0
mean,683.654795,184604.9375,184838.356164,15435.246458,0.092285
std,424.543816,71508.1875,76110.712572,17751.791667,0.127501
min,2.0,53950.28125,35311.0,30.203125,0.000247
25%,294.0,134676.859375,132000.0,4357.640625,0.028013
50%,677.0,169519.703125,172500.0,10986.21875,0.061414
75%,1055.0,220214.1875,226700.0,19096.4375,0.115415
max,1451.0,505777.53125,555000.0,160131.875,1.224204


In [163]:
ensamble_predictions = test_ensamble(ensamble_models = models, X_test = X_test).mean(axis=1)

In [164]:
submission = pd.DataFrame({"Id":X_test_id,"SalePrice":ensamble_predictions})
submission.to_csv("../skl-XGBoost/submission.csv", sep =",", index = False)