In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Import Packages

In [2]:
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd /content/LightGBM
!mkdir build
!cmake -DUSE_GPU=1
!make -j$(nproc)
%cd /content/LightGBM/python-package
!sudo python setup.py install --precompile

Cloning into 'LightGBM'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 20721 (delta 2), reused 4 (delta 1), pack-reused 20706[K
Receiving objects: 100% (20721/20721), 16.12 MiB | 28.86 MiB/s, done.
Resolving deltas: 100% (15123/15123), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'compute'
Cloning into '/content/LightGBM/compute'...
remote: Enumerating objects: 21728, done.        
remote: Total 21728 (delta 0), reused 0 (delta 0), pack-reused 21728        
Receiving objects: 100% (21728/21728), 8.51 MiB | 27.65 MiB/s, done.
Resolving deltas: 100% (17565/17565), done.
Submodule path 'compute': checked out '36c89134d4013b2e5e45bc55656a18bd6141995a'
/content/LightGBM
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working

In [3]:
! pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/87/10/06b58f4120f26b603d905a594650440ea1fd74476b8b360dbf01e111469b/optuna-2.3.0.tar.gz (258kB)
[K     |█▎                              | 10kB 27.5MB/s eta 0:00:01[K     |██▌                             | 20kB 14.2MB/s eta 0:00:01[K     |███▉                            | 30kB 13.5MB/s eta 0:00:01[K     |█████                           | 40kB 13.3MB/s eta 0:00:01[K     |██████▍                         | 51kB 10.6MB/s eta 0:00:01[K     |███████▋                        | 61kB 10.8MB/s eta 0:00:01[K     |████████▉                       | 71kB 11.0MB/s eta 0:00:01[K     |██████████▏                     | 81kB 11.2MB/s eta 0:00:01[K     |███████████▍                    | 92kB 12.0MB/s eta 0:00:01[K     |████████████▊                   | 102kB 11.8MB/s eta 0:00:01[K     |██████████████                  | 112kB 11.8MB/s eta 0:00:01[K     |███████████████▏                | 122kB 11.8MB/s eta 0:0

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import optuna
import pickle
import warnings
warnings.filterwarnings('ignore')

## Load processed dataset

In [5]:
with open("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Carnival_Wars_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
Xtrain = processed_data['Xtrain']
Ytrain = processed_data['Ytrain']
Xpredict = processed_data['Xpredict']

Ytrain = np.cbrt(Ytrain)

print("Xtrain shape: {}".format(Xtrain.shape))
print("Ytrain shape: {}".format(Ytrain.shape))
print("Xpredict shape: {}".format(Xpredict.shape))

Xtrain shape: (6313, 716)
Ytrain shape: (6313,)
Xpredict shape: (3430, 716)


In [6]:
categorical_columns = ['Stall_no','Market_Category','Loyalty_customer',
                       'Product_Category','instock_time','Grade_Bin','Grade',
                       'Demand','Discount_avail','instock_date_year','charges_2 (%)',
                       'instock_date_quarter','Market_Bin','instock_date_month',
                       'instock_date_day_week','instock_date_day_weekend',
                       'clusters_k']
categorical_columns_indices = [Xtrain.columns.get_loc(col) for col in categorical_columns]
print(categorical_columns_indices)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


## Hyperparameter search using Optuna

In [7]:
def objective(trial):
    
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "device_type": "gpu",
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 1e-1),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-4, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 40, 1500),
        "max_depth": trial.suggest_int("max_depth", 6, 25),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 5, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 15)
    }

    # Define K-fold cross validation test harness
    kfold = KFold(n_splits=7, shuffle=True, random_state=10)
    counter = 0
    rmsle = 0

    for train, val in kfold.split(Xtrain.values, Ytrain.values):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        lgtrain = lgb.Dataset(train_x, label=train_y)
        lgvalidation = lgb.Dataset(val_x, label=val_y)
        
        model = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgvalidation], 
                          categorical_feature=categorical_columns_indices,
                          num_boost_round=5000, early_stopping_rounds=200, 
                          verbose_eval=False)

        y_pred = model.predict(val_x, num_iteration=model.best_iteration)
        rmsle += np.sqrt(mean_squared_log_error(val_y, y_pred))
    
    score = rmsle / float(counter)
    return score

In [8]:
study = optuna.create_study()
study.optimize(objective, n_trials=250)

[32m[I 2020-12-01 08:20:31,362][0m A new study created in memory with name: no-name-a93d9929-4592-49e6-9238-b72a21daaa1d[0m
[32m[I 2020-12-01 08:28:51,037][0m Trial 0 finished with value: 0.031088383907620254 and parameters: {'learning_rate': 0.019657689826785583, 'lambda_l2': 0.00012549152755589123, 'num_leaves': 1338, 'max_depth': 15, 'feature_fraction': 0.77935268145205, 'bagging_fraction': 0.5657635729430122, 'bagging_freq': 10, 'min_child_samples': 10}. Best is trial 0 with value: 0.031088383907620254.[0m
[32m[I 2020-12-01 08:36:36,062][0m Trial 1 finished with value: 0.031215768960198998 and parameters: {'learning_rate': 0.0671430878359602, 'lambda_l2': 0.2112859536412777, 'num_leaves': 1182, 'max_depth': 23, 'feature_fraction': 0.9096831022275919, 'bagging_fraction': 0.6849057210168278, 'bagging_freq': 11, 'min_child_samples': 9}. Best is trial 0 with value: 0.031088383907620254.[0m
[32m[I 2020-12-01 08:42:33,841][0m Trial 2 finished with value: 0.031278922171531894 a

KeyboardInterrupt: ignored

In [9]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print(" Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))

Number of finished trials: 11
Best trial:
 Value: 0.030701466881611246
Params: 
 learning_rate: 0.012521380412919617
 lambda_l2: 0.4163606248009528
 num_leaves: 911
 max_depth: 22
 feature_fraction: 0.6850199687453369
 bagging_fraction: 0.6886580955674425
 bagging_freq: 10
 min_child_samples: 10


## Build and validate the model

In [10]:
# Define model hyperparameters
params = {}
params["objective"] = 'regression'
params["metric"] = 'rmse'
params["boosting"] = 'gbdt'
params["device_type"] = 'gpu'
params["learning_rate"] = 0.01252
params["lambda_l2"] = 0.416361
params["num_leaves"] = 911
params["max_depth"] = 22
params["feature_fraction"] = 0.685
params["bagging_fraction"] = 0.689
params["bagging_freq"] = 10
params["min_data_in_leaf"] = 10
params["verbosity"] = -1
num_rounds = 8000

In [11]:
# Set number of K-Folds and seeds
FOLD = 7
NUM_SEED = 5

# Set seeds for model training
np.random.seed(2)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_rmsle_score = 0
y_pred_meta_lgb = np.zeros((Ytrain.shape[0], NUM_SEED))
y_pred_final_lgb = 0
counter = 0


for sidx, seed in enumerate(seeds):
    seed_rmsle_score = 0
    
    # Define K-fold cross validation test harness
    kfold = KFold(n_splits=FOLD, shuffle=True, random_state=seed)
    
    for idx, (train, val) in enumerate(kfold.split(Xtrain.values, Ytrain.values)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        lgtrain = lgb.Dataset(train_x, label=train_y)
        lgvalidation = lgb.Dataset(val_x, label=val_y)

        model = lgb.train(params, lgtrain, num_rounds, 
                          valid_sets=[lgtrain, lgvalidation], 
                          categorical_feature=categorical_columns_indices,
                          early_stopping_rounds=100, verbose_eval=500)

        y_pred = model.predict(val_x, num_iteration=model.best_iteration)
        y_pred_meta_lgb[val, sidx] = y_pred
        y_pred_final_lgb += model.predict(Xpredict, num_iteration=model.best_iteration)

        score = np.sqrt(mean_squared_log_error(val_y, y_pred))
        oof_rmsle_score += score
        seed_rmsle_score += score
        print("Seed-{} | Fold-{} | RMSE Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate Log Loss: {}\n\n".format(seed, (seed_rmsle_score / FOLD)))

y_pred_final_lgb /= float(counter)
oof_rmsle_score /= float(counter)
print("Aggregate RMSE Score: {}".format(oof_rmsle_score))

Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.196474	valid_1's rmse: 0.454996
Early stopping, best iteration is:
[450]	training's rmse: 0.213583	valid_1's rmse: 0.453625
Seed-40 | Fold-0 | RMSE Score: 0.0319335310053383
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.195659	valid_1's rmse: 0.346443
Early stopping, best iteration is:
[546]	training's rmse: 0.185424	valid_1's rmse: 0.345572
Seed-40 | Fold-1 | RMSE Score: 0.021431238703266048
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.206242	valid_1's rmse: 0.282809
Early stopping, best iteration is:
[529]	training's rmse: 0.194683	valid_1's rmse: 0.281439
Seed-40 | Fold-2 | RMSE Score: 0.018668570077517994
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.2051	valid_1's rmse: 0.268729
Early stopping, best iteration is:
[450]	training's rmse: 0.217615	valid_1's rmse: 0.261699
Seed

## Create submission file

In [12]:
predict_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Dataset/test.csv")
submit_df = pd.DataFrame()
submit_df['Product_id'] = predict_df['Product_id']
submit_df['Selling_Price'] = y_pred_final_lgb ** 3
submit_df.head()

Unnamed: 0,Product_id,Selling_Price
0,SCHE4YSTDVPVZVXW,3336.652922
1,ACCEGCATKHNRXUHW,2024.848163
2,NKCE6GJ5XVJDXNNZ,11859.056076
3,NKCEB8BK3ZXDHDHM,9503.277093
4,TOPEFDXSAHRNPF94,5658.964038


In [13]:
submit_df.to_csv("/content/drive/My Drive/Colab Notebooks/Carnival Wars/Predictions/prediction_v9_LGB.csv", index=False)