## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_log_error

## Prepare data for model training

In [2]:
with open("../input/workation-price-prediction-preprocess-data/Workation_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
train_df = processed_data['train_df']
test_df = processed_data['test_df']

train_df = train_df[~((train_df['Per Person Price'] < 7.3) | (np.exp(train_df['Per Person Price']) > 150000))].copy()

del processed_data
gc.collect()

31

In [3]:
Xtrain = train_df.loc[:, train_df.columns != 'Per Person Price'].values
Ytrain = train_df['Per Person Price'].values
Ytrain_strat = pd.qcut(train_df['Per Person Price'].values, q=10, labels=range(0,10))
Xtest = test_df.values

print("Xtrain: {} \nYtrain: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, Xtest.shape))

del train_df
del test_df
gc.collect()

Xtrain: (20989, 4728) 
Ytrain: (20989,) 
Xtest: (9000, 4728)


0

## Build and validate the model

In [4]:
params = {}
params["objective"] = 'regression'
params["metric"] = 'rmse'
params["boosting"] = 'gbdt'
params["device_type"] = 'gpu'
params["learning_rate"] = 0.0204
params["lambda_l2"] = 0.00225
params["num_leaves"] = 71
params["max_depth"] = 10
params["feature_fraction"] = 0.7442
params["bagging_fraction"] = 0.89
params["bagging_freq"] = 10
params["min_data_in_leaf"] = 7
params["verbosity"] = -1
num_rounds = 8000

In [5]:
FOLD = 10
NUM_SEED = 2

np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_lgb = np.zeros((Ytrain.shape[0], NUM_SEED))
y_pred_final_lgb = np.zeros((Xtest.shape[0], NUM_SEED))
counter = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain_strat)):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]

        lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
        lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

        model = lgb.train(params, lgtrain, num_rounds, 
                          valid_sets=[lgtrain, lgvalidation], 
                          early_stopping_rounds=200, verbose_eval=200)

        y_pred = model.predict(val_x, num_iteration=model.best_iteration)
        y_pred_meta_lgb[val, sidx] = y_pred
        y_pred_final_lgb[:, sidx] += model.predict(Xtest, num_iteration=model.best_iteration)
        
        score = np.sqrt(mean_squared_log_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_final_lgb = y_pred_final_lgb / float(FOLD)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.143404	valid_1's rmse: 0.174682
[400]	training's rmse: 0.117762	valid_1's rmse: 0.165212
[600]	training's rmse: 0.106608	valid_1's rmse: 0.163124
[800]	training's rmse: 0.0976596	valid_1's rmse: 0.16224
[1000]	training's rmse: 0.090303	valid_1's rmse: 0.162211
[1200]	training's rmse: 0.0840269	valid_1's rmse: 0.162147
Early stopping, best iteration is:
[1058]	training's rmse: 0.0883027	valid_1's rmse: 0.162025
Seed-24 | Fold-0 | OOF Score: 0.014999144928142224
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.1443	valid_1's rmse: 0.166562
[400]	training's rmse: 0.119551	valid_1's rmse: 0.157067
[600]	training's rmse: 0.107629	valid_1's rmse: 0.155403
[800]	training's rmse: 0.0985487	valid_1's rmse: 0.155022
Early stopping, best iteration is:
[770]	training's rmse: 0.100321	valid_1's rmse: 0.154658
Seed-24 | Fold-1 | OOF Score: 0.01435420416880881
Training until va

In [6]:
np.savez_compressed('./LGB_Meta_Features.npz',
                    y_pred_meta_lgb=y_pred_meta_lgb, 
                    oof_score=oof_score,
                    y_pred_final_lgb=y_pred_final_lgb)

In [7]:
df = pd.DataFrame(y_pred_final_lgb)
df.head()

Unnamed: 0,0,1
0,9.860219,9.858452
1,9.36698,9.382491
2,8.629172,8.631675
3,8.863941,8.863142
4,9.96998,9.972205
