In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Download Packages

In [None]:
! pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/87/10/06b58f4120f26b603d905a594650440ea1fd74476b8b360dbf01e111469b/optuna-2.3.0.tar.gz (258kB)
[K     |█▎                              | 10kB 15.8MB/s eta 0:00:01[K     |██▌                             | 20kB 13.5MB/s eta 0:00:01[K     |███▉                            | 30kB 7.8MB/s eta 0:00:01[K     |█████                           | 40kB 3.8MB/s eta 0:00:01[K     |██████▍                         | 51kB 4.6MB/s eta 0:00:01[K     |███████▋                        | 61kB 5.0MB/s eta 0:00:01[K     |████████▉                       | 71kB 5.5MB/s eta 0:00:01[K     |██████████▏                     | 81kB 5.9MB/s eta 0:00:01[K     |███████████▍                    | 92kB 6.3MB/s eta 0:00:01[K     |████████████▊                   | 102kB 6.5MB/s eta 0:00:01[K     |██████████████                  | 112kB 6.5MB/s eta 0:00:01[K     |███████████████▏                | 122kB 6.5MB/s eta 0:00:01[K  

## Import Packages

In [3]:
import pickle
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import IsolationForest
from xgboost import XGBRegressor
#import optuna

## Load datasets from saved file

In [10]:
with open('/content/drive/My Drive/Colab Notebooks/The Great Hiring Challenge/TGHC_Dataset_CB.txt', 'rb') as handle: 
    data = handle.read()
        
processed_dataset = pickle.loads(data)
Xtrain, Ytrain = processed_dataset['Xtrain'], processed_dataset['Ytrain']
Xpredict = processed_dataset['Xpredict']

print("------------------------- Training Dataset -------------------------")
print("Xtrain shape: {}".format(Xtrain.shape))
print("Ytrain shape: {}".format(Ytrain.shape))

print("\n------------------------- Prediction Dataset -------------------------")
print("Xpredict shape: {}".format(Xpredict.shape))

------------------------- Training Dataset -------------------------
Xtrain shape: (284780, 512)
Ytrain shape: (284780, 1)

------------------------- Prediction Dataset -------------------------
Xpredict shape: (122049, 512)


In [11]:
'''
pt = PowerTransformer(method='yeo-johnson')
pt.fit(Ytrain)
tmp = pt.transform(Ytrain)
del tmp
'''

Ytrain = pd.DataFrame(Ytrain, columns=['UnitPrice'])
del processed_dataset
gc.collect()

375

## Remove Outliers

In [12]:
clf = IsolationForest(max_samples = 100, random_state = 42)
clf.fit(Xtrain)
y_noano = clf.predict(Xtrain)
y_noano = pd.DataFrame(y_noano, columns = ['Top'])
y_noano[y_noano['Top'] == 1].index.values

Xtrain = Xtrain.iloc[y_noano[y_noano['Top'] == 1].index.values]
Ytrain = Ytrain.iloc[y_noano[y_noano['Top'] == 1].index.values]
Xtrain.reset_index(drop = True, inplace = True)
print("Number of Outliers:", y_noano[y_noano['Top'] == -1].shape[0])
print("Number of rows without outliers:", Xtrain.shape[0])

Number of Outliers: 43572
Number of rows without outliers: 241208


In [13]:
print("------------------------- Training Dataset -------------------------")
print("Xtrain shape: {}".format(Xtrain.shape))
print("Ytrain shape: {}".format(Ytrain.shape))

print("\n------------------------- Prediction Dataset -------------------------")
print("Xpredict shape: {}".format(Xpredict.shape))

------------------------- Training Dataset -------------------------
Xtrain shape: (241208, 512)
Ytrain shape: (241208, 1)

------------------------- Prediction Dataset -------------------------
Xpredict shape: (122049, 512)


## Split training data into train/test datasets

In [None]:
Xtrain_new, Xtest, Ytrain_new, Ytest = train_test_split(Xtrain, Ytrain, test_size=0.1, random_state=10)

print("------------------------- Training Dataset -------------------------")
print("Xtrain_new shape: {}".format(Xtrain_new.shape))
print("Ytrain_new shape: {}".format(Ytrain_new.shape))

print("\n------------------------- Test Dataset -------------------------")
print("Xtest shape: {}".format(Xtest.shape))
print("Ytest shape: {}".format(Ytest.shape))

------------------------- Training Dataset -------------------------
Xtrain_new shape: (256302, 512)
Ytrain_new shape: (256302, 1)

------------------------- Test Dataset -------------------------
Xtest shape: (28478, 512)
Ytest shape: (28478, 1)


## Hyperparameter search using Optuna

In [None]:
def objective(trial):
    
    model = XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        booster='gbtree',
        sample_type='uniform',
        tree_method='gpu_hist',
        grow_policy='lossguide',
        num_round=8000,
        max_depth=trial.suggest_int("max_depth", 6, 15), 
        max_leaves=trial.suggest_int("max_leaves", 35, 500),
        learning_rate=trial.suggest_loguniform("learning_rate", 1e-2, 1e-1),
        subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 15),
        reg_lambda=trial.suggest_loguniform("reg_lambda", 1e-4, 1.0),
        verbosity=0
    )

    model.fit(Xtrain_new, Ytrain_new, eval_set=[(Xtest, Ytest)], 
              early_stopping_rounds=100, verbose=False)
    y_pred = model.predict(Xtest, ntree_limit=model.best_ntree_limit)
    rmse = np.sqrt(mean_squared_error(Ytest, y_pred))
    return rmse

In [None]:
study = optuna.create_study()
study.optimize(objective, n_trials=250)

[32m[I 2020-11-17 13:18:48,258][0m A new study created in memory with name: no-name-c4de4fbd-36b4-450e-a006-c81af8b49854[0m
[32m[I 2020-11-17 13:19:00,296][0m Trial 0 finished with value: 19.59581367423754 and parameters: {'max_depth': 6, 'max_leaves': 348, 'learning_rate': 0.01921075080036617, 'subsample': 0.8170524629179234, 'colsample_bytree': 0.5407562053838675, 'min_child_weight': 12, 'reg_lambda': 0.00040437763567270277}. Best is trial 0 with value: 19.59581367423754.[0m
[32m[I 2020-11-17 13:19:26,919][0m Trial 1 finished with value: 19.499448358865557 and parameters: {'max_depth': 13, 'max_leaves': 392, 'learning_rate': 0.011153830475172828, 'subsample': 0.5537354536542545, 'colsample_bytree': 0.5255700772629568, 'min_child_weight': 3, 'reg_lambda': 0.595921248917796}. Best is trial 1 with value: 19.499448358865557.[0m
[32m[I 2020-11-17 13:19:39,721][0m Trial 2 finished with value: 17.650722275539895 and parameters: {'max_depth': 8, 'max_leaves': 468, 'learning_rate':

KeyboardInterrupt: ignored

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print(" Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))

Number of finished trials: 135
Best trial:
 Value: 11.646402058460788
Params: 
 max_depth: 9
 max_leaves: 242
 learning_rate: 0.06694921652295704
 subsample: 0.9881858040427779
 colsample_bytree: 0.7823109089478195
 min_child_weight: 5
 reg_lambda: 0.0004414605964471387


In [None]:
del Xtrain_new
del Ytrain_new
del Xtest
del Ytest
gc.collect()

89

## Build and validate the model

In [15]:
# Set number of K-Folds
FOLD = 5

# Set seeds for model training
np.random.seed(3)
seeds = np.random.randint(0, 100, size=3)

oof_rmse_score = 0
y_pred_final_xgb = 0
counter = 0


for seed in seeds:
    seed_rmse_score = 0
    
    # Define K-fold cross validation test harness
    kfold = KFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain.values, Ytrain.values)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        model = XGBRegressor(
            objective='reg:squarederror',
            eval_metric='rmse',
            booster='gbtree',
            sample_type='uniform',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            num_round=8000,
            max_depth=9, 
            max_leaves=242,
            learning_rate=0.067,
            subsample=0.988,
            colsample_bytree=0.7823,
            min_child_weight=5,
            reg_lambda=0.00044,
            verbosity=0
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  early_stopping_rounds=100, verbose=1000)

        y_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        y_pred_final_xgb += model.predict(Xpredict, ntree_limit=model.best_ntree_limit)
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_rmse_score += score
        seed_rmse_score += score
        print("Seed-{} | Fold-{} | RMSE Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate Log Loss: {}\n\n".format(seed, (seed_rmse_score / FOLD)))

y_pred_final_xgb /= float(counter)
oof_rmse_score /= float(counter)
print("Aggregate RMSE Score: {}".format(oof_rmse_score))

[0]	validation_0-rmse:15.3559
Will train until validation_0-rmse hasn't improved in 100 rounds.
[99]	validation_0-rmse:31.7018
Seed-24 | Fold-0 | RMSE Score: 15.35591769464343
[0]	validation_0-rmse:17.0291
Will train until validation_0-rmse hasn't improved in 100 rounds.
[99]	validation_0-rmse:43.4344
Seed-24 | Fold-1 | RMSE Score: 17.029091600463058
[0]	validation_0-rmse:177.42
Will train until validation_0-rmse hasn't improved in 100 rounds.
[99]	validation_0-rmse:175.894
Seed-24 | Fold-2 | RMSE Score: 175.8906843788416
[0]	validation_0-rmse:39.5464
Will train until validation_0-rmse hasn't improved in 100 rounds.
[99]	validation_0-rmse:45.9569
Seed-24 | Fold-3 | RMSE Score: 39.54642898344322
[0]	validation_0-rmse:5.94059
Will train until validation_0-rmse hasn't improved in 100 rounds.
[99]	validation_0-rmse:35.9033
Seed-24 | Fold-4 | RMSE Score: 5.940591112104452

Seed: 24 | Aggregate Log Loss: 50.75254275389914


[0]	validation_0-rmse:181.291
Will train until validation_0-rmse has

## Create submission file

In [None]:
#tmp = np.array([y_pred_final_xgb]).T
#y_pred_final = pt.inverse_transform(tmp)
submit_df = pd.DataFrame(y_pred_final_xgb, columns=['UnitPrice'])
print(submit_df.shape)
submit_df.head()

(122049, 1)


Unnamed: 0,UnitPrice
0,1.558417
1,1.623244
2,2.496832
3,1.182177
4,6.54693


In [None]:
submit_df.to_csv('/content/drive/My Drive/Colab Notebooks/The Great Hiring Challenge/Experiment-2/Predictions/predictions_v10_XGB.csv', index=False)