## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_log_error

## Prepare data for model training

In [2]:
with open("../input/workation-price-prediction-preprocess-data-v3/Workation_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
train_df = processed_data['train_df']
test_df = processed_data['test_df']

train_df = train_df[~((train_df['Per Person Price'] < 7.3) | (np.exp(train_df['Per Person Price']) > 150000))].copy()

del processed_data
gc.collect()

31

In [3]:
Xtrain = train_df.loc[:, train_df.columns != 'Per Person Price'].copy()
Ytrain = train_df['Per Person Price'].copy()
Ytrain_strat = pd.qcut(train_df['Per Person Price'].values, q=10, labels=range(0,10))
Xtest = test_df.copy()

print("Xtrain: {} \nYtrain: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, Xtest.shape))

del train_df
del test_df
gc.collect()

Xtrain: (20989, 4728) 
Ytrain: (20989,) 
Xtest: (9000, 4728)


0

## Build and validate the model

In [4]:
FOLD = 10
NUM_SEED = 3

np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_cb = np.zeros((Ytrain.shape[0], NUM_SEED))
y_pred_final_cb = np.zeros((Xtest.shape[0], NUM_SEED))
counter = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain.values, Ytrain_strat)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        model = CatBoostRegressor(
            objective='RMSE',
            eval_metric='RMSE',
            num_boost_round=8000,
            learning_rate=0.014,
            reg_lambda=0.0487,
            bootstrap_type='Poisson',
            subsample=0.922,
            max_depth=7, 
            grow_policy='Lossguide',
            min_data_in_leaf=6, 
            max_leaves=957,
            task_type='GPU',
            verbose=0
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  early_stopping_rounds=200, verbose=100)

        y_pred = model.predict(val_x)
        y_pred_meta_cb[val, sidx] += y_pred
        y_pred_final_cb[:, sidx] += model.predict(Xtest)
        
        score = np.sqrt(mean_squared_log_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_final_cb = y_pred_final_cb / float(FOLD)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

0:	learn: 0.5345537	test: 0.5298598	best: 0.5298598 (0)	total: 101ms	remaining: 13m 24s
100:	learn: 0.2280211	test: 0.2374128	best: 0.2374128 (100)	total: 7.54s	remaining: 9m 49s
200:	learn: 0.1612447	test: 0.1857439	best: 0.1857439 (200)	total: 14.9s	remaining: 9m 36s
300:	learn: 0.1408966	test: 0.1749595	best: 0.1749595 (300)	total: 21.2s	remaining: 9m 3s
400:	learn: 0.1316182	test: 0.1713658	best: 0.1713658 (400)	total: 26.8s	remaining: 8m 28s
500:	learn: 0.1261862	test: 0.1694335	best: 0.1694335 (500)	total: 31.1s	remaining: 7m 45s
600:	learn: 0.1218167	test: 0.1680566	best: 0.1680566 (600)	total: 35.7s	remaining: 7m 19s
700:	learn: 0.1182660	test: 0.1671111	best: 0.1671111 (700)	total: 39.6s	remaining: 6m 52s
800:	learn: 0.1153501	test: 0.1664407	best: 0.1664407 (800)	total: 43.7s	remaining: 6m 32s
900:	learn: 0.1126984	test: 0.1660094	best: 0.1660049 (899)	total: 47.5s	remaining: 6m 14s
1000:	learn: 0.1101166	test: 0.1656216	best: 0.1656216 (1000)	total: 51.4s	remaining: 5m 59s
1

In [5]:
np.savez_compressed('./CB_Meta_Features.npz',
                    y_pred_meta_cb=y_pred_meta_cb, 
                    oof_score=oof_score,
                    y_pred_final_cb=y_pred_final_cb)

In [6]:
df = pd.DataFrame(y_pred_final_cb)
df.head()

Unnamed: 0,0,1,2
0,9.860833,9.860577,9.861087
1,9.320124,9.32741,9.318704
2,8.632074,8.633557,8.634842
3,8.895919,8.896718,8.898894
4,9.973604,9.973912,9.972656
