## Importing modules
### Seperated between libraries to keep track

In [4]:
import numpy as np

In [5]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)  

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from xgboost import XGBRegressor

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

In [9]:
from hyperopt import hp
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

## Functions

### A Single validated model

In [10]:
def XGB_val_in_hyperopt(XGBModel, X, y, early_stop_num = 10):
    
    X_train, X_valid, y_train, y_valid = train_test_split(X,y)
    evaluation = [( X_train, y_train), ( X_valid, y_valid)]
    
    XGBModel.fit(X_train, y_train,
                eval_set=evaluation, 
                eval_metric="mae",
                early_stopping_rounds=early_stop_num,
                verbose=False
                )
    
    predict = XGBModel.predict(X_valid)
    accuracy = XGBModel.predict(X_valid)
    n_est_used = len(XGBModel.get_booster().get_dump())
    
    return accuracy, n_est_used

### A fully cross validated model, packaged as an "objective" to optimise

In [11]:
"""Linear Regressor"""
def linear_objective(space):
    
    clf=XGBRegressor(
                    n_estimators = int(space['n_estimators']), 
                    max_depth = int(space['max_depth']), 
                    gamma = space['gamma'],
                    reg_alpha = space['reg_alpha'],
                    reg_lambda = space['reg_lambda'],
                    min_child_weight= space['min_child_weight'],
                    colsample_bytree= space['colsample_bytree'],
    )
    
    acc = []
    n_est = []
    
    for i in range(5):
        accuracy, n_est_used = XGB_val_in_hyperopt(XGBModel = clf, X = X, y = y, early_stop_num = 10)
        acc.append(accuracy)
        n_est.append(n_est_used)
    
    accuracy = np.mean(acc)
    n_est_used = int(np.mean(n_est))
    
    trials.results[-1]["n_estimators"] = n_est_used
    trials.results[-1]["loss"] = accuracy
    trials.trials[-1]["misc"]["vals"]["n_estimators"] = [n_est_used]
    
    return {'loss': accuracy, 'status': STATUS_OK, "n_estimators" : n_est_used}

### Single optimised model (best out of num_search models tested) 

In [12]:
def XGB_optimised_model(X, y, X_test, space, num_search = 5):
    
    global trials
    trials = Trials()
    
    best_hyperparams = fmin(fn = linear_objective,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = num_search,
                            trials = trials)
    
    clf=XGBRegressor(
                n_estimators = int(trials.best_trial["result"]["n_estimators"]), 
                max_depth = int(best_hyperparams['max_depth']), 
                gamma = best_hyperparams['gamma'],
                reg_alpha = best_hyperparams['reg_alpha'],
                reg_lambda = best_hyperparams['reg_lambda'],
                min_child_weight = best_hyperparams['min_child_weight'],
                colsample_bytree = best_hyperparams['colsample_bytree']
    )

    clf.fit(X, y, 
            eval_metric="mae",
            verbose=True
    )

    predict = clf.predict(X_test)
    params = best_hyperparams
    
    return predict, params, clf

### Ensamble of num_model optimised models 

In [13]:
def XGB_optimised_ensamble(X, y, X_test, num_search = 5, num_models = 2):
    predictions = {}
    parameters = {}
    models = {}
    
    for i in range(num_models):
        predictions["model_"+str(i)], parameters["model_"+str(i)], models["model_"+str(i)] = XGB_optimised_model(X, y, X_test, num_search)
    
    return predictions, parameters, models

In [14]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform ('gamma', 1,9),
    'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
    'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 999, 1000, 1),
    'seed': 0
}

# Function Building

In [15]:
"""CORRELATED FEATURES"""
#"""
train = pd.read_csv("../data/train_correlated.csv")

X_test = pd.read_csv("../data/test_correlated.csv")
X_test_id = pd.read_csv("../data/raw_test.csv")["Id"]

headers = X_test.columns

y_all = train["SalePrice"]
X_all = train.drop(columns=["SalePrice"])[headers]
#"""

In [16]:
X, X_hold, y, y_hold = train_test_split(X_all,y_all)

In [17]:
X_hold = X_hold.reset_index()
y_hold = y_hold.reset_index()

X_hold = X_hold.drop(columns = ["index"])
y_hold = y_hold.drop(columns = ["index"])

In [20]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform('n_estimators', 999, 1000, 1),
        'seed': 0
    }

In [22]:
trials = Trials()

best_hyperparams = fmin(fn = linear_objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 1,
                        trials = trials)

100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.56trial/s, best loss: 179435.453125]


In [23]:
XGB_optimised_ensamble(X, y, X_test = X_hold)

IndexError: list index out of range

In [24]:
pred, para, mod = XGB_optimised_ensamble(X, y, X_test = X_hold)

IndexError: list index out of range