## Importing modules
### Seperated between libraries to keep track

In [1]:
import numpy as np

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)  

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from xgboost import XGBRegressor

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

In [6]:
from hyperopt import hp
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# TESTING

In [7]:
"""CORRELATED FEATURES"""
#"""
train = pd.read_csv("../data/train_correlated.csv")

X_test = pd.read_csv("../data/test_correlated.csv")
X_test_id = pd.read_csv("../data/raw_test.csv")["Id"]

headers = X_test.columns

y_all = train["SalePrice"]
X_all = train.drop(columns=["SalePrice"])[headers]
#"""

In [8]:
"""ALL FEATURES"""
"""
train = pd.read_csv("../data/data_all_features.csv")

X_test = pd.read_csv("../data/test_all_features.csv")
X_test_id = pd.read_csv("../data/raw_test.csv")["Id"]

headers = X_test.columns

y_all = train["SalePrice"]
X_all = train.drop(columns=["SalePrice"])[headers]
"""

'\ntrain = pd.read_csv("../data/data_all_features.csv")\n\nX_test = pd.read_csv("../data/test_all_features.csv")\nX_test_id = pd.read_csv("../data/raw_test.csv")["Id"]\n\nheaders = X_test.columns\n\ny_all = train["SalePrice"]\nX_all = train.drop(columns=["SalePrice"])[headers]\n'

In [9]:
X, X_hold, y, y_hold = train_test_split(X_all,y_all)

In [10]:
y_hold

123     153900
1035     84000
299     158500
1458    142125
1233    142000
         ...  
94      204750
1020    176000
507     208300
884     100000
393     100000
Name: SalePrice, Length: 365, dtype: int64

In [17]:
"""Linear Regressor"""
def objective(space):
    clf=XGBRegressor(
                    n_estimators = int(space['n_estimators']), 
                    max_depth = int(space['max_depth']), 
                    gamma = space['gamma'],
                    reg_alpha = space['reg_alpha'],
                    reg_lambda = space["reg_lambda"],
                    min_child_weight=space['min_child_weight'],
                    colsample_bytree=space['colsample_bytree'],
    )
    
    acc = []
    n_est = []
    
    for i in range(5):
        X_train, X_valid, y_train, y_valid = train_test_split(X,y)

        evaluation = [( X_train, y_train), ( X_valid, y_valid)]   

        clf.fit(X_train, y_train,
                eval_set=evaluation, 
                eval_metric="mae",
                early_stopping_rounds=25,
                verbose=False
               )
        pred = clf.predict(X_valid)
        
        accuracy = mean_absolute_error(y_valid, pred)
        n_est_used = len(clf.get_booster().get_dump())
        
        acc.append(accuracy)
        n_est.append(n_est_used)

    accuracy = np.mean(acc)
    n_est_used = int(np.mean(n_est))

    #print ("MAE: "+ str(int(round(accuracy,0)))+"; n_estimators "+str(len(clf.get_booster().get_dump())))

    
    trials.results[-1]["n_estimators"] = n_est_used
    
    trials.trials[-1]["misc"]["vals"]["n_estimators"] = [n_est_used]
    
    return {'loss': accuracy, 'status': STATUS_OK, "n_estimators" : n_est_used}

In [None]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform('n_estimators', 999, 1000, 1),
        'seed': 0
    }

In [None]:
print("test")

In [18]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 5,
                        trials = trials)

100%|██████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.59trial/s, best loss: 18333.180189324816]


In [None]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
n_est_test = [2**i for i in range(0,13)]
print(n_est_test)

In [None]:
def test_n_ests(n_est):
    clf=XGBRegressor(
                    n_estimators = int(n_est), 
                    max_depth = int(best_hyperparams['max_depth']), 
                    gamma = best_hyperparams['gamma'],
                    reg_alpha = best_hyperparams['reg_alpha'],
                    min_child_weight = best_hyperparams['min_child_weight'],
                    colsample_bytree = best_hyperparams['colsample_bytree']
    )

    clf.fit(X, y, 
            eval_metric="mae",
            verbose=True
    )

    predict = clf.predict(X_hold)

    accuracy = mean_absolute_error(predict, y_hold)
    
    return accuracy

In [None]:
acc = []
for i in n_est_test:
    acc.append(test_n_ests(i))

In [None]:
print(acc)
plt.loglog(n_est_test, acc)
plt.plot([best_hyperparams['n_estimators'], best_hyperparams['n_estimators']], [0,1e6], color="red")
plt.ylim(10000, 30000)
plt.show()

In [None]:
clf=XGBRegressor(
                n_estimators = int(best_hyperparams['n_estimators']), 
                max_depth = int(best_hyperparams['max_depth']), 
                gamma = best_hyperparams['gamma'],
                reg_alpha = best_hyperparams['reg_alpha'],
                reg_lambda = best_hyperparams["reg_lambda"],
                min_child_weight = best_hyperparams['min_child_weight'],
                colsample_bytree = best_hyperparams['colsample_bytree']
)

clf.fit(X, y, 
        eval_metric="mae",
        verbose=True
)

predict = clf.predict(X_test)

In [None]:
submission = pd.DataFrame({"Id":X_test_id, "SalePrice":y_test})

submission.to_csv("../skl-XGBoost/submission.csv", sep =",", index = False)

# Function Building

In [7]:
"""CORRELATED FEATURES"""
#"""
train = pd.read_csv("../data/train_correlated.csv")

X_test = pd.read_csv("../data/test_correlated.csv")
X_test_id = pd.read_csv("../data/raw_test.csv")["Id"]

headers = X_test.columns

y_all = train["SalePrice"]
X_all = train.drop(columns=["SalePrice"])[headers]
#"""

In [8]:
X, X_hold, y, y_hold = train_test_split(X_all,y_all)

In [9]:
X_hold = X_hold.reset_index()
y_hold = y_hold.reset_index()

X_hold = X_hold.drop(columns = ["index"])
y_hold = y_hold.drop(columns = ["index"])

In [10]:
def XGB_val_in_hyperopt(XGBModel, X, y, early_stop_num = 10):
    X_train, X_valid, y_train, y_valid = train_test_split(X,y)
    evaluation = [( X_train, y_train), ( X_valid, y_valid)]
    
    XGBModel.fit(X_train, y_train,
                eval_set=evaluation, 
                eval_metric="mae",
                early_stopping_rounds=early_stop_num,
                verbose=False
                )
    predict = XGBModel.predict(X_valid)
    accuracy = XGBModel.predict(X_valid)
    n_est_used = len(XGBModel.get_booster().get_dump())
    
    return accuracy, n_est_used

In [11]:
"""Linear Regressor"""
def linear_objective(space):
    clf=XGBRegressor(
                    n_estimators = int(space['n_estimators']), 
                    max_depth = int(space['max_depth']), 
                    gamma = space['gamma'],
                    reg_alpha = space['reg_alpha'],
                    reg_lambda = space['reg_lambda'],
                    min_child_weight= space['min_child_weight'],
                    colsample_bytree= space['colsample_bytree'],
    )
    
    
    acc = []
    n_est = []
    for i in range(5):
        accuracy, n_est_used = XGB_val_in_hyperopt(XGBModel = clf, X = X, y = y, early_stop_num = 10)
        acc.append(accuracy)
        n_est.append(n_est_used)
    
    accuracy = np.mean(acc)
    n_est_used = int(np.mean(n_est))
    
    trials.results[-1]["n_estimators"] = n_est_used
    
    trials.trials[-1]["misc"]["vals"]["n_estimators"] = [n_est_used]
    
    return {'loss': accuracy, 'status': STATUS_OK, "n_estimators" : n_est_used}

In [12]:
def XGB_optimised_model(X, y, X_test, num_search = 100):
    global trials
    trials = Trials()

    best_hyperparams = fmin(fn = linear_objective,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = num_search,
                            trials = trials)
    clf=XGBRegressor(
                n_estimators = int(best_hyperparams['n_estimators']), 
                max_depth = int(best_hyperparams['max_depth']), 
                gamma = best_hyperparams['gamma'],
                reg_alpha = best_hyperparams['reg_alpha'],
                reg_lambda = best_hyperparams['reg_lambda'],
                min_child_weight = best_hyperparams['min_child_weight'],
                colsample_bytree = best_hyperparams['colsample_bytree']
    )

    clf.fit(X, y, 
            eval_metric="mae",
            verbose=True
    )

    predict = clf.predict(X_test)

    return predict

In [13]:
def XGB_optimised_ensamble(X, y, X_test, num_search = 20, num_models = 12):
    output = pd.DataFrame()
    
    for i in range(num_models):
        output["model_"+str(i)] = XGB_optimised_model(X, y, X_test, num_search)
    
    return output

In [14]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform('n_estimators', 999, 1000, 1),
        'seed': 0
    }

In [15]:
models = XGB_optimised_ensamble(X, y, X_test = X_hold, num_search = 10, num_models = 10)

100%|█████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.01trial/s, best loss: 177807.484375]
100%|██████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.15trial/s, best loss: 178009.71875]
100%|████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.46trial/s, best loss: 176339.875]
 40%|█████████████████████▌                                | 4/10 [00:03<00:05,  1.03trial/s, best loss: 178064.234375]


KeyboardInterrupt: 

In [None]:
models

In [None]:
output = pd.DataFrame({"Ensamble Model":models.mean(axis = 1), "Actual":y_hold["SalePrice"]})
output

In [None]:
output = pd.DataFrame({"Ensamble Model":models.mean(axis = 1), "Actual":y_hold["SalePrice"]})
output

In [None]:
mean_absolute_error(output["Ensamble Model"], output["Actual"])*365/np.sum(output["Actual"])

In [None]:
def XGB_evaluate_ensamble(ensamble_output, y_hold):
    ensamble = pd.DataFrame({"Ensamble Model":ensamble_output.mean(axis = 1), "Actual":y_hold["SalePrice"]})
    ensamble_mae = mean_absolute_error(ensamble["Ensamble Model"], ensamble["Actual"])
    return ensamble, ensamble_mae

In [None]:
searches = range(1,15)
maes = []

for i in searches:
    models = XGB_optimised_ensamble(X, y, X_test = X_hold, num_search = i, num_models = 5)
    _, ensamble_mae = XGB_evaluate_ensamble(ensamble_output = models, y_hold = y_hold)
    maes.append(ensamble_mae)

In [None]:
plt.plot(searches, maes)
plt.show()

In [None]:
n_models = range(1,25)
maes = []

for i in n_models:
    models = XGB_optimised_ensamble(X, y, X_test = X_hold, num_search = 10, num_models = i)
    _, ensamble_mae = XGB_evaluate_ensamble(ensamble_output = models, y_hold = y_hold)
    maes.append(ensamble_mae)

In [None]:
plt.plot(n_models, maes)
plt.show()