In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Import Packages

In [2]:
! pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 43kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3


In [3]:
! pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/87/10/06b58f4120f26b603d905a594650440ea1fd74476b8b360dbf01e111469b/optuna-2.3.0.tar.gz (258kB)
[K     |█▎                              | 10kB 23.5MB/s eta 0:00:01[K     |██▌                             | 20kB 17.6MB/s eta 0:00:01[K     |███▉                            | 30kB 15.4MB/s eta 0:00:01[K     |█████                           | 40kB 14.7MB/s eta 0:00:01[K     |██████▍                         | 51kB 11.3MB/s eta 0:00:01[K     |███████▋                        | 61kB 11.4MB/s eta 0:00:01[K     |████████▉                       | 71kB 11.5MB/s eta 0:00:01[K     |██████████▏                     | 81kB 12.2MB/s eta 0:00:01[K     |███████████▍                    | 92kB 13.0MB/s eta 0:00:01[K     |████████████▊                   | 102kB 12.8MB/s eta 0:00:01[K     |██████████████                  | 112kB 12.8MB/s eta 0:00:01[K     |███████████████▏                | 122kB 12.8MB/s eta 0:0

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor
import optuna
import pickle
import warnings
warnings.filterwarnings('ignore')

## Load processed dataset

In [5]:
with open("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Carnival_Wars_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
Xtrain = processed_data['Xtrain']
Ytrain = processed_data['Ytrain']
Xpredict = processed_data['Xpredict']

Ytrain = np.cbrt(Ytrain)

print("Xtrain shape: {}".format(Xtrain.shape))
print("Ytrain shape: {}".format(Ytrain.shape))
print("Xpredict shape: {}".format(Xpredict.shape))

Xtrain shape: (6313, 716)
Ytrain shape: (6313,)
Xpredict shape: (3430, 716)


In [6]:
categorical_columns = ['Stall_no','Market_Category','Loyalty_customer',
                       'Product_Category','instock_time','Grade_Bin','Grade',
                       'Demand','Discount_avail','instock_date_year','charges_2 (%)',
                       'instock_date_quarter','Market_Bin','instock_date_month',
                       'instock_date_day_week','instock_date_day_weekend',
                       'clusters_k']
categorical_columns_indices = [Xtrain.columns.get_loc(col) for col in categorical_columns]
print(categorical_columns_indices)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


## Hyperparameter search using Optuna

In [7]:
def objective(trial):
    
    # Define K-fold cross validation test harness
    kfold = KFold(n_splits=7, shuffle=True, random_state=10)
    counter = 0
    rmsle = 0

    for train, val in kfold.split(Xtrain.values, Ytrain.values):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        model = CatBoostRegressor(
            objective='RMSE',
            eval_metric='RMSE',
            num_boost_round=5000,
            learning_rate=trial.suggest_loguniform("learning_rate", 1e-2, 1e-1),
            reg_lambda=trial.suggest_loguniform("reg_lambda", 1e-4, 1.0),
            bootstrap_type='Poisson',
            subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
            max_depth=trial.suggest_int("max_depth", 6, 15), 
            grow_policy='Lossguide',
            min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 15), 
            max_leaves=trial.suggest_int("max_leaves", 40, 1500),
            task_type='GPU',
            verbose=0
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  cat_features=categorical_columns_indices,
                  early_stopping_rounds=200, verbose=False)
        y_pred = model.predict(val_x)
        rmsle += np.sqrt(mean_squared_log_error(val_y, y_pred))
    
    score = rmsle / float(counter)
    return score

In [8]:
study = optuna.create_study()
study.optimize(objective, n_trials=250)

[32m[I 2020-12-01 08:14:58,146][0m A new study created in memory with name: no-name-899196c1-1b65-453b-b982-dc8916c75132[0m
[32m[I 2020-12-01 08:18:33,944][0m Trial 0 finished with value: 0.03154559920383443 and parameters: {'learning_rate': 0.021258939742412126, 'reg_lambda': 0.0011615221130014577, 'subsample': 0.5602265935953359, 'max_depth': 9, 'min_data_in_leaf': 9, 'max_leaves': 667}. Best is trial 0 with value: 0.03154559920383443.[0m
[32m[I 2020-12-01 08:28:02,846][0m Trial 1 finished with value: 0.03509306048268023 and parameters: {'learning_rate': 0.023524513386969894, 'reg_lambda': 0.0005252172702328468, 'subsample': 0.9592659720473466, 'max_depth': 8, 'min_data_in_leaf': 1, 'max_leaves': 1400}. Best is trial 0 with value: 0.03154559920383443.[0m
[32m[I 2020-12-01 08:28:59,505][0m Trial 2 finished with value: 0.03149660654238146 and parameters: {'learning_rate': 0.07148744280909876, 'reg_lambda': 0.18312665913254556, 'subsample': 0.7050094261736793, 'max_depth': 6,

KeyboardInterrupt: ignored

In [9]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print(" Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))

Number of finished trials: 19
Best trial:
 Value: 0.03125005566092428
Params: 
 learning_rate: 0.013264771870718988
 reg_lambda: 0.00017638097042118911
 subsample: 0.5065462449656796
 max_depth: 6
 min_data_in_leaf: 8
 max_leaves: 453


## Build and validate the model

In [10]:
# Set number of K-Folds and seeds
FOLD = 7
NUM_SEED = 5

# Set seeds for model training
np.random.seed(1)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_rmsle_score = 0
y_pred_meta_cb = np.zeros((Ytrain.shape[0], NUM_SEED))
y_pred_final_cb = 0
counter = 0


for sidx, seed in enumerate(seeds):
    seed_rmsle_score = 0
    
    # Define K-fold cross validation test harness
    kfold = KFold(n_splits=FOLD, shuffle=True, random_state=seed)
    
    for idx, (train, val) in enumerate(kfold.split(Xtrain.values, Ytrain.values)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        model = CatBoostRegressor(
            objective='RMSE',
            eval_metric='RMSE',
            num_boost_round=8000,
            learning_rate=0.013265,
            reg_lambda=0.0001764,
            bootstrap_type='Poisson',
            subsample=0.50655,
            max_depth=6, 
            grow_policy='Lossguide',
            min_data_in_leaf=8, 
            max_leaves=453,
            task_type='GPU',
            verbose=0
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
                  cat_features=categorical_columns_indices,
                  early_stopping_rounds=200, verbose=500)

        y_pred = model.predict(val_x)
        y_pred_meta_cb[val, sidx] = y_pred
        y_pred_final_cb += model.predict(Xpredict)
        
        score = np.sqrt(mean_squared_log_error(val_y, y_pred))
        seed_rmsle_score += score
        oof_rmsle_score += score
        print("Seed-{} | Fold-{} | RMSE Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate Log Loss: {}\n\n".format(seed, (seed_rmsle_score / FOLD)))

y_pred_final_cb /= float(counter)
oof_rmsle_score /= float(counter)
print("Aggregate RMSE Score: {}".format(oof_rmsle_score))

0:	learn: 3.9041867	test: 3.7495454	best: 3.7495454 (0)	total: 47.1ms	remaining: 6m 17s
500:	learn: 0.2275996	test: 0.4285438	best: 0.4279851 (459)	total: 12.7s	remaining: 3m 9s
bestTest = 0.4274603387
bestIteration = 782
Shrink model to first 783 iterations.
Seed-37 | Fold-0 | RMSE Score: 0.028050202102291453
0:	learn: 3.8812845	test: 3.8878992	best: 3.8878992 (0)	total: 37.4ms	remaining: 4m 59s
500:	learn: 0.2298898	test: 0.4388140	best: 0.4383449 (494)	total: 12.1s	remaining: 3m 1s
1000:	learn: 0.1869744	test: 0.4346938	best: 0.4343060 (872)	total: 20.1s	remaining: 2m 20s
bestTest = 0.4343059673
bestIteration = 872
Shrink model to first 873 iterations.
Seed-37 | Fold-1 | RMSE Score: 0.03408406124836107
0:	learn: 3.8566235	test: 4.0320213	best: 4.0320213 (0)	total: 37.1ms	remaining: 4m 56s
500:	learn: 0.2259235	test: 0.5010876	best: 0.5007385 (443)	total: 12.5s	remaining: 3m 7s
bestTest = 0.5003851107
bestIteration = 648
Shrink model to first 649 iterations.
Seed-37 | Fold-2 | RMSE S

## Create submission file

In [11]:
predict_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Dataset/test.csv")
submit_df = pd.DataFrame()
submit_df['Product_id'] = predict_df['Product_id']
submit_df['Selling_Price'] = y_pred_final_cb ** 3
submit_df.head()

Unnamed: 0,Product_id,Selling_Price
0,SCHE4YSTDVPVZVXW,3278.202048
1,ACCEGCATKHNRXUHW,2013.485979
2,NKCE6GJ5XVJDXNNZ,11836.869245
3,NKCEB8BK3ZXDHDHM,9432.953072
4,TOPEFDXSAHRNPF94,5698.348741


In [12]:
submit_df.to_csv("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Predictions/prediction_v9_CB.csv", index=False)