In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Import Packages

In [2]:
! pip install https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/xgboost-1.2.0_SNAPSHOT%2B4729458a363c64291e84da28b408a0ac8d7851fa-py3-none-manylinux2010_x86_64.whl

Collecting xgboost==1.2.0-SNAPSHOT+4729458a363c64291e84da28b408a0ac8d7851fa
[?25l  Downloading https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/xgboost-1.2.0_SNAPSHOT%2B4729458a363c64291e84da28b408a0ac8d7851fa-py3-none-manylinux2010_x86_64.whl (149.3MB)
[K     |████████████████████████████████| 149.3MB 81kB/s 
Installing collected packages: xgboost
  Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.2.0-SNAPSHOT


In [3]:
! pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/87/10/06b58f4120f26b603d905a594650440ea1fd74476b8b360dbf01e111469b/optuna-2.3.0.tar.gz (258kB)
[K     |█▎                              | 10kB 21.4MB/s eta 0:00:01[K     |██▌                             | 20kB 27.5MB/s eta 0:00:01[K     |███▉                            | 30kB 23.6MB/s eta 0:00:01[K     |█████                           | 40kB 18.1MB/s eta 0:00:01[K     |██████▍                         | 51kB 15.9MB/s eta 0:00:01[K     |███████▋                        | 61kB 17.1MB/s eta 0:00:01[K     |████████▉                       | 71kB 14.0MB/s eta 0:00:01[K     |██████████▏                     | 81kB 14.3MB/s eta 0:00:01[K     |███████████▍                    | 92kB 14.8MB/s eta 0:00:01[K     |████████████▊                   | 102kB 13.9MB/s eta 0:00:01[K     |██████████████                  | 112kB 13.9MB/s eta 0:00:01[K     |███████████████▏                | 122kB 13.9MB/s eta 0:0

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
import optuna
import pickle
import warnings
warnings.filterwarnings('ignore')

## Load processed dataset

In [5]:
with open("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Carnival_Wars_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
Xtrain = processed_data['Xtrain'].values
Ytrain = processed_data['Ytrain'].values
Xpredict = processed_data['Xpredict'].values

Ytrain = np.cbrt(Ytrain)

print("Xtrain shape: {}".format(Xtrain.shape))
print("Ytrain shape: {}".format(Ytrain.shape))
print("Xpredict shape: {}".format(Xpredict.shape))

Xtrain shape: (6313, 716)
Ytrain shape: (6313,)
Xpredict shape: (3430, 716)


## Hyperparameter search using Optuna

In [7]:
def objective(trial):
    
    # Define K-fold cross validation test harness
    kfold = KFold(n_splits=7, shuffle=True, random_state=10)
    counter = 0
    rmsle = 0

    for train, val in kfold.split(Xtrain, Ytrain):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]

        model = XGBRegressor(
            objective='reg:squaredlogerror',
            eval_metric='rmsle',
            booster='gbtree',
            sample_type='uniform',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            num_round=5000,
            max_depth=trial.suggest_int("max_depth", 6, 25), 
            max_leaves=trial.suggest_int("max_leaves", 40, 1500),
            learning_rate=trial.suggest_loguniform("learning_rate", 1e-2, 1e-1),
            subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 15),
            reg_lambda=trial.suggest_loguniform("reg_lambda", 1e-4, 1.0),
            verbosity=0
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  early_stopping_rounds=200, verbose=False)
        y_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        rmsle += np.sqrt(mean_squared_log_error(val_y, y_pred))
    
    score = rmsle / float(counter)
    return score

In [8]:
study = optuna.create_study()
study.optimize(objective, n_trials=250)

[32m[I 2020-12-01 10:41:19,634][0m A new study created in memory with name: no-name-a7410402-8423-46f9-ab1b-0a2914073f73[0m
[32m[I 2020-12-01 10:41:38,124][0m Trial 0 finished with value: 1.365730436628372 and parameters: {'max_depth': 23, 'max_leaves': 299, 'learning_rate': 0.016099611764171233, 'subsample': 0.8932557349467992, 'colsample_bytree': 0.5820546465043828, 'min_child_weight': 12, 'reg_lambda': 0.00026508141247835184}. Best is trial 0 with value: 1.365730436628372.[0m
[32m[I 2020-12-01 10:41:43,693][0m Trial 1 finished with value: 1.3210434158768154 and parameters: {'max_depth': 21, 'max_leaves': 121, 'learning_rate': 0.016897284297527137, 'subsample': 0.5533359827721653, 'colsample_bytree': 0.5654198607426001, 'min_child_weight': 2, 'reg_lambda': 0.009060014483351074}. Best is trial 1 with value: 1.3210434158768154.[0m
[32m[I 2020-12-01 10:41:47,668][0m Trial 2 finished with value: 0.2575745281098617 and parameters: {'max_depth': 8, 'max_leaves': 482, 'learning_r

In [9]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print(" Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))

Number of finished trials: 250
Best trial:
 Value: 0.032596680990013144
Params: 
 max_depth: 21
 max_leaves: 442
 learning_rate: 0.0999325081790808
 subsample: 0.998894987379509
 colsample_bytree: 0.6348458028905883
 min_child_weight: 1
 reg_lambda: 0.00013286971052919094


## Build and validate the model

In [10]:
# Set number of K-Folds and seeds
FOLD = 7
NUM_SEED = 5

# Set seeds for model training
np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_rmsle_score = 0
y_pred_meta_xgb = np.zeros((Ytrain.shape[0], NUM_SEED))
y_pred_final_xgb = 0
counter = 0


for sidx, seed in enumerate(seeds):
    seed_rmsle_score = 0
    
    # Define K-fold cross validation test harness
    kfold = KFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]

        model = XGBRegressor(
            objective='reg:squaredlogerror',
            eval_metric='rmsle',
            booster='gbtree',
            sample_type='uniform',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            num_round=8000,
            max_depth=21, 
            max_leaves=442,
            learning_rate=0.0999,
            subsample=0.999,
            colsample_bytree=0.63485,
            min_child_weight=1,
            reg_lambda=0.000133,
            verbosity=0
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  early_stopping_rounds=200, verbose=500)

        y_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        y_pred_meta_xgb[val, sidx] = y_pred
        y_pred_final_xgb += model.predict(Xpredict, ntree_limit=model.best_ntree_limit)
        
        score = np.sqrt(mean_squared_log_error(val_y, y_pred))
        oof_rmsle_score += score
        seed_rmsle_score += score
        print("Seed-{} | Fold-{} | RMSE Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate Log Loss: {}\n\n".format(seed, (seed_rmsle_score / FOLD)))

y_pred_final_xgb /= float(counter)
oof_rmsle_score /= float(counter)
print("Aggregate RMSE Score: {}".format(oof_rmsle_score))

[0]	validation_0-rmsle:2.30827
Will train until validation_0-rmsle hasn't improved in 200 rounds.
[99]	validation_0-rmsle:0.06442
Seed-24 | Fold-0 | RMSE Score: 0.06442229813818282
[0]	validation_0-rmsle:2.33372
Will train until validation_0-rmsle hasn't improved in 200 rounds.
[99]	validation_0-rmsle:0.02263
Seed-24 | Fold-1 | RMSE Score: 0.022633675062865177
[0]	validation_0-rmsle:2.33512
Will train until validation_0-rmsle hasn't improved in 200 rounds.
[99]	validation_0-rmsle:0.03190
Seed-24 | Fold-2 | RMSE Score: 0.03188666392474331
[0]	validation_0-rmsle:2.31618
Will train until validation_0-rmsle hasn't improved in 200 rounds.
[99]	validation_0-rmsle:0.02476
Seed-24 | Fold-3 | RMSE Score: 0.024761215918117446
[0]	validation_0-rmsle:2.33543
Will train until validation_0-rmsle hasn't improved in 200 rounds.
[99]	validation_0-rmsle:0.01952
Seed-24 | Fold-4 | RMSE Score: 0.01952345069534827
[0]	validation_0-rmsle:2.32525
Will train until validation_0-rmsle hasn't improved in 200 rou

## Create submission file

In [11]:
predict_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Dataset/test.csv")
submit_df = pd.DataFrame()
submit_df['Product_id'] = predict_df['Product_id']
submit_df['Selling_Price'] = y_pred_final_xgb ** 3
submit_df.head()

Unnamed: 0,Product_id,Selling_Price
0,SCHE4YSTDVPVZVXW,3241.26709
1,ACCEGCATKHNRXUHW,1985.520264
2,NKCE6GJ5XVJDXNNZ,11674.480469
3,NKCEB8BK3ZXDHDHM,9985.947266
4,TOPEFDXSAHRNPF94,5532.263184


In [12]:
submit_df.to_csv("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Predictions/prediction_v9_XGB.csv", index=False)