## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_log_error
from tensorflow.keras.models import Model, load_model
from tensorflow_addons.optimizers import AdamW, Lookahead

## Prepare data for model training

In [2]:
with open("../input/workation-price-prediction-preprocess-data-v4/Workation_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
train_df = processed_data['train_df']
test_df = processed_data['test_df']

train_df = train_df[~((train_df['Per Person Price'] < 7.3) | (np.exp(train_df['Per Person Price']) > 150000))].copy()

del processed_data
gc.collect()

53

In [3]:
Xtrain = train_df.loc[:, train_df.columns != 'Per Person Price'].values
Ytrain = train_df['Per Person Price'].values
Ytrain_strat = pd.qcut(train_df['Per Person Price'].values, q=10, labels=range(0,10))
Xtest = test_df.values

print("Xtrain: {} \nYtrain: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, Xtest.shape))

del train_df
del test_df
gc.collect()

Xtrain: (20989, 5752) 
Ytrain: (20989,) 
Xtest: (9000, 5752)


20

## Build and validate the model

In [4]:
FOLD = 10
NUM_SEED = 3

np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_xgb = np.zeros((Ytrain.shape[0], NUM_SEED))
y_pred_final_xgb = np.zeros((Xtest.shape[0], NUM_SEED))
counter = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain_strat)):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]

        model = XGBRegressor(
            objective='reg:squarederror',
            eval_metric='rmse',
            booster='gbtree',
            sample_type='uniform',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            num_round=8000,
            max_depth=11, 
            max_leaves=345,
            learning_rate=0.074,
            subsample=0.984,
            colsample_bytree=0.675,
            min_child_weight=7,
            reg_lambda=0.152,
            verbosity=0
        )

        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
                  early_stopping_rounds=200, verbose=50)

        y_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        y_pred_meta_xgb[val, sidx] += y_pred
        y_pred_final_xgb[:, sidx] += model.predict(Xtest, ntree_limit=model.best_ntree_limit)
        
        score = np.sqrt(mean_squared_log_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_final_xgb = y_pred_final_xgb / float(FOLD)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

[0]	validation_0-rmse:8.59273	validation_1-rmse:8.59313
[50]	validation_0-rmse:0.21597	validation_1-rmse:0.25382
[99]	validation_0-rmse:0.08976	validation_1-rmse:0.16547
Seed-24 | Fold-0 | OOF Score: 0.015294691749109109
[0]	validation_0-rmse:8.59292	validation_1-rmse:8.59036
[50]	validation_0-rmse:0.21649	validation_1-rmse:0.24226
[99]	validation_0-rmse:0.09082	validation_1-rmse:0.15793
Seed-24 | Fold-1 | OOF Score: 0.014632449050544484
[0]	validation_0-rmse:8.59268	validation_1-rmse:8.59314
[50]	validation_0-rmse:0.21663	validation_1-rmse:0.24899
[99]	validation_0-rmse:0.09107	validation_1-rmse:0.15817
Seed-24 | Fold-2 | OOF Score: 0.014605087881440219
[0]	validation_0-rmse:8.59283	validation_1-rmse:8.59181
[50]	validation_0-rmse:0.21667	validation_1-rmse:0.24711
[99]	validation_0-rmse:0.09063	validation_1-rmse:0.16488
Seed-24 | Fold-3 | OOF Score: 0.0152052713655116
[0]	validation_0-rmse:8.59264	validation_1-rmse:8.59341
[50]	validation_0-rmse:0.21657	validation_1-rmse:0.24560
[99]	

In [5]:
np.savez_compressed('./XGB_Meta_Features.npz',
                    y_pred_meta_xgb=y_pred_meta_xgb, 
                    oof_score=oof_score,
                    y_pred_final_xgb=y_pred_final_xgb)

In [6]:
df = pd.DataFrame(y_pred_final_xgb)
df.head()

Unnamed: 0,0,1,2
0,9.828232,9.819984,9.827808
1,9.322518,9.332821,9.329927
2,8.618342,8.61436,8.613281
3,8.870387,8.873793,8.867922
4,9.965712,9.965982,9.962759
