In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Download Packages

In [2]:
! pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/52/39/128fff65072c8327371e3c594f3c826d29c85b21cb6485980353b168e0e4/catboost-0.24.2-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.2MB 44kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.2


In [3]:
! pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/87/10/06b58f4120f26b603d905a594650440ea1fd74476b8b360dbf01e111469b/optuna-2.3.0.tar.gz (258kB)
[K     |█▎                              | 10kB 10.6MB/s eta 0:00:01[K     |██▌                             | 20kB 13.1MB/s eta 0:00:01[K     |███▉                            | 30kB 10.1MB/s eta 0:00:01[K     |█████                           | 40kB 11.0MB/s eta 0:00:01[K     |██████▍                         | 51kB 11.5MB/s eta 0:00:01[K     |███████▋                        | 61kB 12.4MB/s eta 0:00:01[K     |████████▉                       | 71kB 11.5MB/s eta 0:00:01[K     |██████████▏                     | 81kB 11.4MB/s eta 0:00:01[K     |███████████▍                    | 92kB 12.5MB/s eta 0:00:01[K     |████████████▊                   | 102kB 12.2MB/s eta 0:00:01[K     |██████████████                  | 112kB 12.2MB/s eta 0:00:01[K     |███████████████▏                | 122kB 12.2MB/s eta 0:0

## Import Packages

In [4]:
import pickle
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import optuna

## Load datasets from saved file

In [5]:
with open('/content/drive/My Drive/Colab Notebooks/The Great Hiring Challenge/TGHC_Dataset_CB.txt', 'rb') as handle: 
    data = handle.read()

processed_dataset = pickle.loads(data)
Xtrain, Ytrain = processed_dataset['Xtrain'], processed_dataset['Ytrain']
Xpredict = processed_dataset['Xpredict']

print("------------------------- Training Dataset -------------------------")
print("Xtrain shape: {}".format(Xtrain.shape))
print("Ytrain shape: {}".format(Ytrain.shape))

print("\n------------------------- Prediction Dataset -------------------------")
print("Xpredict shape: {}".format(Xpredict.shape))

------------------------- Training Dataset -------------------------
Xtrain shape: (284780, 512)
Ytrain shape: (284780, 1)

------------------------- Prediction Dataset -------------------------
Xpredict shape: (122049, 512)


In [6]:
'''
pt = PowerTransformer(method='yeo-johnson')
pt.fit(Ytrain)
tmp = pt.transform(Ytrain)
del tmp
'''

Ytrain = pd.DataFrame(Ytrain, columns=['UnitPrice'])
del processed_dataset
gc.collect()

0

## Split training data into train/test datasets

In [None]:
Xtrain_new, Xtest, Ytrain_new, Ytest = train_test_split(Xtrain, Ytrain, test_size=0.1, random_state=10)

print("------------------------- Training Dataset -------------------------")
print("Xtrain_new shape: {}".format(Xtrain_new.shape))
print("Ytrain_new shape: {}".format(Ytrain_new.shape))

print("\n------------------------- Test Dataset -------------------------")
print("Xtest shape: {}".format(Xtest.shape))
print("Ytest shape: {}".format(Ytest.shape))

------------------------- Training Dataset -------------------------
Xtrain_new shape: (256302, 550)
Ytrain_new shape: (256302, 1)

------------------------- Test Dataset -------------------------
Xtest shape: (28478, 550)
Ytest shape: (28478, 1)


## Hyperparameter search using Optuna

In [None]:
def objective(trial):
    
    model = CatBoostRegressor(
        objective='RMSE',
        eval_metric='RMSE',
        num_boost_round=8000,
        learning_rate=trial.suggest_loguniform("learning_rate", 1e-2, 1e-1),
        reg_lambda=trial.suggest_loguniform("reg_lambda", 1e-4, 1.0),
        bootstrap_type='Poisson',
        subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
        max_depth=trial.suggest_int("max_depth", 6, 10), 
        grow_policy='Lossguide',
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 15), 
        max_leaves=trial.suggest_int("max_leaves", 35, 750),
        task_type='GPU',
        verbose=0
    )

    model.fit(Xtrain_new, Ytrain_new, eval_set=[(Xtest, Ytest)], early_stopping_rounds=200, verbose=False)
    y_pred = model.predict(Xtest)
    rmse = np.sqrt(mean_squared_error(Ytest, y_pred))
    return rmse

In [None]:
study = optuna.create_study()
study.optimize(objective, n_trials=250)

[32m[I 2020-11-16 10:48:53,129][0m A new study created in memory with name: no-name-59f8c525-71ed-4e7a-8684-c6029255d560[0m


KeyboardInterrupt: ignored

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print(" Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))

Number of finished trials: 16
Best trial:
 Value: 0.07256989184159258
Params: 
 learning_rate: 0.028769157414438307
 reg_lambda: 0.0001777632682030053
 subsample: 0.6652237268267053
 max_depth: 10
 min_data_in_leaf: 4
 max_leaves: 664


In [None]:
del Xtrain_new
del Ytrain_new
del Xtest
del Ytest
gc.collect()

## Build and validate the model

In [7]:
categorical_columns_indices = [5, 6, 10, 11, 12, 13, 14, 15, 19, 20, 21, 22, 23, 
                               27, 28, 29, 30, 31, 32, 33, 37, 36, 38, 41, 45, 
                               46, 47, 48, 49, 500, 501, 502, 503, 504, 505, 
                               506, 507, 508, 509, 510, 511]

In [10]:
# Set number of K-Folds
FOLD = 5

# Set seeds for model training
np.random.seed(1)
seeds = np.random.randint(0, 100, size=3)

oof_rmse_score = 0
y_pred_final_cb = 0
counter = 0


for seed in seeds:
    seed_rmse_score = 0
    
    # Define K-fold cross validation test harness
    kfold = KFold(n_splits=FOLD, shuffle=True, random_state=1)
    
    for idx, (train, val) in enumerate(kfold.split(Xtrain.values, Ytrain.values)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        model = CatBoostRegressor(
            objective='RMSE',
            eval_metric='RMSE',
            num_boost_round=8000,
            learning_rate=0.0288,
            reg_lambda=0.00018,
            bootstrap_type='Poisson',
            subsample=0.665,
            max_depth=10, 
            grow_policy='Lossguide',
            min_data_in_leaf=4, 
            max_leaves=664,
            task_type='GPU',
            verbose=0
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  cat_features=categorical_columns_indices,
                  early_stopping_rounds=200, verbose=1000)

        y_pred = model.predict(val_x)
        y_pred_final_cb += model.predict(Xpredict)
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        seed_rmse_score += score
        oof_rmse_score += score
        print("Seed-{} | Fold-{} | RMSE Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate Log Loss: {}\n\n".format(seed, (seed_rmse_score / FOLD)))

y_pred_final_cb /= float(counter)
oof_rmse_score /= float(counter)
print("Aggregate RMSE Score: {}".format(oof_rmse_score))

0:	learn: 28.6105921	test: 165.2571740	best: 165.2571740 (0)	total: 289ms	remaining: 38m 32s
bestTest = 164.6776931
bestIteration = 106
Shrink model to first 107 iterations.
Seed-37 | Fold-0 | RMSE Score: 164.6776856600358
0:	learn: 86.0331273	test: 17.4177902	best: 17.4177902 (0)	total: 271ms	remaining: 36m 6s
bestTest = 16.78452644
bestIteration = 1
Shrink model to first 2 iterations.
Seed-37 | Fold-1 | RMSE Score: 16.784527294071953
0:	learn: 82.1191309	test: 46.0787760	best: 46.0787760 (0)	total: 284ms	remaining: 37m 55s
bestTest = 41.54656596
bestIteration = 34
Shrink model to first 35 iterations.
Seed-37 | Fold-2 | RMSE Score: 41.54656675007604
0:	learn: 85.4746373	test: 24.5793889	best: 24.5793889 (0)	total: 309ms	remaining: 41m 13s
bestTest = 17.9624328
bestIteration = 65
Shrink model to first 66 iterations.
Seed-37 | Fold-3 | RMSE Score: 17.962432458500302
0:	learn: 85.9148313	test: 17.8217702	best: 17.8217702 (0)	total: 268ms	remaining: 35m 47s
bestTest = 15.12289997
bestIter

## Create submission file

In [11]:
#tmp = np.array([y_pred_final_cb]).T
#y_pred_final = pt.inverse_transform(tmp)
submit_df = pd.DataFrame(y_pred_final_cb, columns=['UnitPrice'])
print(submit_df.shape)
submit_df.head()

(122049, 1)


Unnamed: 0,UnitPrice
0,2.600647
1,2.813788
2,3.783856
3,2.37343
4,8.657672


In [12]:
submit_df.to_csv('/content/drive/My Drive/Colab Notebooks/The Great Hiring Challenge/Experiment-2/Predictions/predictions_v11_CB.csv', index=False)