In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GroupShuffleSplit, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import ndcg_score, make_scorer

# Modeling

In [2]:
pos = pd.read_csv('pos.csv')

In [3]:
train = pd.read_csv("cleaned_train.csv")
test = pd.read_csv("cleaned_test.csv")

In [4]:
cat_features = [ "comp1_rate", "comp1_inv", "comp2_rate", "comp2_inv", "comp3_rate", "comp3_inv", "comp4_rate", "comp4_inv",
                "comp5_rate", "comp5_inv", "comp6_rate", "comp6_inv", "comp7_rate", "comp7_inv", "comp8_rate", "comp8_inv",
                "weekday", "month"]
rest = ["srch_id", "site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"]

target = 'target_score' #click_bool in this case


In [14]:
# avg numeric features per srch_id
numeric_cols = ['prop_starrating','prop_review_score','prop_location_score1','prop_location_score2',
                'prop_log_historical_price','price_usd',]

for i in numeric_cols:
    name = i + '_avg'
    train[name] = train.groupby(['srch_id'])[i].transform('mean')    

In [16]:
for i in numeric_cols:
    name = i + '_avg'
    test[name] = test.groupby(['srch_id'])[i].transform('mean') 

In [17]:
# group split needed for both train-test split and CV
splitter = GroupShuffleSplit(test_size=0.1, n_splits=1, random_state = 7)
split = splitter.split(train, groups=train['srch_id'])
train_inds, test_inds = next(split)

train_df = train.iloc[train_inds]
test_df = train.iloc[test_inds]

X_train = train_df.drop([target], axis=1)
X_test = test_df.drop([target], axis=1)
y_train = train_df[target]
y_test = test_df[target]

In [5]:
gss = GroupShuffleSplit(n_splits=4, test_size=0.25)

In [29]:
custom_scorer = make_scorer(ndcg_score, k=5, greater_is_better=True)

### **) model with default parameters on validation data

In [7]:
model_2 = lgb.LGBMRegressor()


In [8]:
%%time
model_2.fit(X_train, y_train, categorical_feature=rest)




CPU times: user 3min 32s, sys: 32.9 s, total: 4min 5s
Wall time: 1min 40s


LGBMRegressor()

In [9]:
# %%time
y_pred_2 = model_2.predict(X_test)
y_pred_2

array([-0.06094215, -0.0292679 , -0.04246334, ..., -0.03407786,
       -0.05183003, -0.049999  ])

In [17]:
df = pd.concat([X_test["srch_id"], pos.iloc[test_inds]], axis=1)
df['predictions'] = -y_pred_2
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,position,predictions,predictions_n
119,12,25,0.060942,0.143452
120,12,28,0.029268,0.105455
121,12,24,0.042463,0.121284
122,12,13,0.046966,0.126686
123,12,18,0.067535,0.15136
124,12,3,0.02941,0.105626
125,12,14,0.029594,0.105847
126,12,4,0.048431,0.128443
127,12,22,0.031605,0.108258
128,12,15,0.054236,0.135407


In [28]:
scores = []
for i in df['srch_id'].unique():
    a1 = [df[df["srch_id"]==i]["position"].values]
    a2 = [df[df["srch_id"]==i]["predictions"].values] 
    scores.append(ndcg_score(a1, a2, k=5)) 
print(sum(scores)/len(scores)) #0.39059

0.3905902440826018


## 1) Pointwise LGBM regression (no tuning)

In [355]:
model1 = lgb.LGBMRegressor()

In [356]:
%%time

# fit the same model on whole train data
model1.fit(train.drop([target], axis=1), train[target], categorical_feature=rest)



CPU times: user 3min 27s, sys: 32.3 s, total: 3min 59s
Wall time: 1min 57s


LGBMRegressor()

In [357]:
%%time
y_pred1 = model1.predict(test)
y_pred1 # 0,333 NDCG on public leaderboard 

CPU times: user 3min 31s, sys: 26.5 s, total: 3min 57s
Wall time: 1min 35s


array([-0.04420504, -0.0808436 , -0.02696308, ..., -0.07440339,
       -0.05211163, -0.05319732])

## 5) Listwise: LGBMRanker with default parameters

In [18]:
y_train_ = -y_train
y_test_ = -y_test

In [47]:
model6 = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1)

In [48]:
%%time
model6.fit(X_train, y_train_, eval_set=[(X_train, y_train_), (X_test, y_test_)], eval_group=[X_train['srch_id'].value_counts(sort=False).sort_index(), X_test['srch_id'].value_counts(sort=False).sort_index()], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
            eval_at=5,categorical_feature=rest)


New categorical_feature is ['prop_country_id', 'prop_id', 'site_id', 'srch_destination_id', 'srch_id', 'visitor_location_country_id']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54305
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100




[1]	training's ndcg@5: 0.299461	valid_1's ndcg@5: 0.292636
[2]	training's ndcg@5: 0.339943	valid_1's ndcg@5: 0.325278
[3]	training's ndcg@5: 0.352922	valid_1's ndcg@5: 0.333621
[4]	training's ndcg@5: 0.36205	valid_1's ndcg@5: 0.340558
[5]	training's ndcg@5: 0.367696	valid_1's ndcg@5: 0.346121
[6]	training's ndcg@5: 0.372135	valid_1's ndcg@5: 0.348248
[7]	training's ndcg@5: 0.37652	valid_1's ndcg@5: 0.351114
[8]	training's ndcg@5: 0.379403	valid_1's ndcg@5: 0.352944
[9]	training's ndcg@5: 0.382513	valid_1's ndcg@5: 0.354109
[10]	training's ndcg@5: 0.384195	valid_1's ndcg@5: 0.356146
[11]	training's ndcg@5: 0.387085	valid_1's ndcg@5: 0.357555
[12]	training's ndcg@5: 0.389873	valid_1's ndcg@5: 0.359794
[13]	training's ndcg@5: 0.392433	valid_1's ndcg@5: 0.360516
[14]	training's ndcg@5: 0.393826	valid_1's ndcg@5: 0.360487
[15]	training's ndcg@5: 0.397919	valid_1's ndcg@5: 0.363979
[16]	training's ndcg@5: 0.39977	valid_1's ndcg@5: 0.365685
[17]	training's ndcg@5: 0.400871	valid_1's ndcg@5: 0

LGBMRanker(metric='ndcg', objective='lambdarank', verbose=1)

In [49]:
y_pred_6 = model6.predict(X_test)
y_pred_6

array([ 0.46175809, -0.7501605 , -0.54465082, ..., -0.26479785,
        0.05585781,  0.35997866])

In [72]:
df = pd.concat([X_test["srch_id"], pos.iloc[test_inds]], axis=1)
df['predictions'] = y_pred_6
# df.head(30)

In [52]:
scores = []
for i in df['srch_id'].unique():
    a1 = [df[df["srch_id"]==i]["position"].values]
    a2 = [df[df["srch_id"]==i]["predictions"].values]
    scores.append(ndcg_score(a1, a2, k=5))
print(sum(scores)/len(scores)) 

0.3724792233297772


In [19]:
y_pred6 = model6.predict(test)
y_pred6 #scored 0,36 on public leaderboard

array([ 0.46165524,  0.88704194, -0.1051241 , ...,  0.01261689,
        0.06631362,  0.17210367])

## Manual try parameters

In [19]:
model66 = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1, num_iterations=200)

In [20]:
%%time
model66.fit(X_train, y_train_, eval_set=[(X_train, y_train_), (X_test, y_test_)], eval_group=[X_train['srch_id'].value_counts(sort=False).sort_index(), X_test['srch_id'].value_counts(sort=False).sort_index()], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
            eval_at=5,categorical_feature=rest)


New categorical_feature is ['prop_country_id', 'prop_id', 'site_id', 'srch_destination_id', 'srch_id', 'visitor_location_country_id']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55835
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 106




[1]	training's ndcg@5: 0.296258	valid_1's ndcg@5: 0.291198
[2]	training's ndcg@5: 0.338184	valid_1's ndcg@5: 0.325807
[3]	training's ndcg@5: 0.352287	valid_1's ndcg@5: 0.335562
[4]	training's ndcg@5: 0.361074	valid_1's ndcg@5: 0.340977
[5]	training's ndcg@5: 0.366435	valid_1's ndcg@5: 0.343991
[6]	training's ndcg@5: 0.371134	valid_1's ndcg@5: 0.347469
[7]	training's ndcg@5: 0.376482	valid_1's ndcg@5: 0.349953
[8]	training's ndcg@5: 0.379578	valid_1's ndcg@5: 0.352844
[9]	training's ndcg@5: 0.382103	valid_1's ndcg@5: 0.35425
[10]	training's ndcg@5: 0.385151	valid_1's ndcg@5: 0.355526
[11]	training's ndcg@5: 0.386906	valid_1's ndcg@5: 0.35611
[12]	training's ndcg@5: 0.390398	valid_1's ndcg@5: 0.358514
[13]	training's ndcg@5: 0.39216	valid_1's ndcg@5: 0.358998
[14]	training's ndcg@5: 0.396545	valid_1's ndcg@5: 0.362194
[15]	training's ndcg@5: 0.398581	valid_1's ndcg@5: 0.363613
[16]	training's ndcg@5: 0.399811	valid_1's ndcg@5: 0.364139
[17]	training's ndcg@5: 0.401624	valid_1's ndcg@5: 0

[138]	training's ndcg@5: 0.495981	valid_1's ndcg@5: 0.388184
[139]	training's ndcg@5: 0.496477	valid_1's ndcg@5: 0.388029
[140]	training's ndcg@5: 0.497054	valid_1's ndcg@5: 0.388069
[141]	training's ndcg@5: 0.497778	valid_1's ndcg@5: 0.388444
[142]	training's ndcg@5: 0.498252	valid_1's ndcg@5: 0.388266
[143]	training's ndcg@5: 0.498628	valid_1's ndcg@5: 0.388389
[144]	training's ndcg@5: 0.499009	valid_1's ndcg@5: 0.388348
[145]	training's ndcg@5: 0.499445	valid_1's ndcg@5: 0.388362
[146]	training's ndcg@5: 0.49973	valid_1's ndcg@5: 0.388216
[147]	training's ndcg@5: 0.500163	valid_1's ndcg@5: 0.388521
[148]	training's ndcg@5: 0.50077	valid_1's ndcg@5: 0.388348
[149]	training's ndcg@5: 0.501114	valid_1's ndcg@5: 0.38834
[150]	training's ndcg@5: 0.501566	valid_1's ndcg@5: 0.388351
[151]	training's ndcg@5: 0.502122	valid_1's ndcg@5: 0.388455
[152]	training's ndcg@5: 0.502449	valid_1's ndcg@5: 0.388471
[153]	training's ndcg@5: 0.502723	valid_1's ndcg@5: 0.388168
[154]	training's ndcg@5: 0.

LGBMRanker(metric='ndcg', num_iterations=200, objective='lambdarank', verbose=1)

In [21]:
y_pred_66 = model66.predict(X_test)
y_pred_66

array([ 0.47242995, -0.90038898, -0.56001683, ..., -0.48964774,
        0.04473139,  0.50622928])

In [22]:
df = pd.concat([X_test["srch_id"], pos.iloc[test_inds]], axis=1)
df['predictions'] = y_pred_66
# df.head(30)

In [23]:
scores = []
for i in df['srch_id'].unique():
    a1 = [df[df["srch_id"]==i]["position"].values]
    a2 = [df[df["srch_id"]==i]["predictions"].values]
    scores.append(ndcg_score(a1, a2, k=5))
print(sum(scores)/len(scores)) #0.3733667894875733 previously

0.37352776977571583


In [24]:
model66_ = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1, num_iterations=200)

In [25]:
%%time
model66_.fit(train.drop([target], axis=1), -train[target], verbose=1,
             group=train['srch_id'].value_counts(sort=False).sort_index(),categorical_feature=rest)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53610
[LightGBM] [Info] Number of data points in the train set: 4958347, number of used features: 106
CPU times: user 9min 12s, sys: 54.1 s, total: 10min 6s
Wall time: 4min 19s


LGBMRanker(metric='ndcg', num_iterations=200, objective='lambdarank', verbose=1)

In [26]:
%%time
y_pred66 = model66_.predict(test)
y_pred66 

CPU times: user 6min 1s, sys: 39.9 s, total: 6min 41s
Wall time: 2min 36s


array([ 0.44716628,  0.91615011, -0.27887903, ...,  0.09441369,
        0.05691774,  0.24926985])

## 6) Listwise: LGBMRanker with tuned hyperparameters

In [32]:
gss = GroupShuffleSplit(n_splits=4, test_size=0.25)

In [35]:
custom_scorer = make_scorer(ndcg_score, k=5, greater_is_better=True)

In [36]:
lgb_11 = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1)

In [37]:

random_grid_params = {
    'learning_rate': [0.05, 0.1, 0.15], 
    'n_estimators': [80, 100, 110, 120], 
    'min_child_samples': [17, 20, 23],
    'num_leaves': [28, 31, 34],# large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type': ["gbdt", "dart", "goss"], # for better accuracy -> try dart
    'max_bin': [255, 300],#large max_bin helps improve accuracy but might slow down training progress
    'subsample': [1, 0.9],
    'random_state': [42],
    'verbose': [1]
}

random_search = RandomizedSearchCV(lgb_11, random_grid_params, n_iter=2, scoring=custom_scorer, cv=gss)

In [40]:
%%time 
random_search.fit(X_train, y_train_, groups=X_train['srch_id'], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
                  eval_at=5, categorical_feature=rest)


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/sklearn.py", line 977, in fit
    super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/sklearn.py", line 612, in fit
    self._Booster = train(params, train_set,
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/engine.py", line 231, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 2053, in __init__
    train_set.construct()
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 1321, in construct
    self._lazy_init(self.data, label=self.label,
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 1141, in _lazy

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55905
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 106
CPU times: user 9min 51s, sys: 4min 39s, total: 14min 31s
Wall time: 9min 41s


RandomizedSearchCV(cv=GroupShuffleSplit(n_splits=4, random_state=None, test_size=0.25,
         train_size=None),
                   estimator=LGBMRanker(metric='ndcg', objective='lambdarank',
                                        verbose=1),
                   n_iter=2,
                   param_distributions={'boosting_type': ['gbdt', 'dart',
                                                          'goss'],
                                        'learning_rate': [0.05, 0.1, 0.15],
                                        'max_bin': [255, 300],
                                        'min_child_samples': [17, 20, 23],
                                        'n_estimators': [80, 100, 110, 120],
                                        'num_leaves': [28, 31, 34],
                                        'random_state': [42],
                                        'subsample': [1, 0.9], 'verbose': [1]},
                   scoring=make_scorer(ndcg_score, k=5))

In [42]:
random_search.best_params_

{'verbose': 1,
 'subsample': 0.9,
 'random_state': 42,
 'num_leaves': 31,
 'n_estimators': 120,
 'min_child_samples': 20,
 'max_bin': 255,
 'learning_rate': 0.05,
 'boosting_type': 'gbdt'}

In [43]:
model7 = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1, subsample=0.9, learning_rate=0.05,
                        n_estimators=300)

In [44]:
%%time
model7.fit(X_train, y_train_, eval_set=[(X_train, y_train_), (X_test, y_test_)], eval_group=[X_train['srch_id'].value_counts(sort=False).sort_index(), X_test['srch_id'].value_counts(sort=False).sort_index()], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
            eval_at=5,categorical_feature=rest)


New categorical_feature is ['prop_country_id', 'prop_id', 'site_id', 'srch_destination_id', 'srch_id', 'visitor_location_country_id']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55835
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 106




[1]	training's ndcg@5: 0.296258	valid_1's ndcg@5: 0.291198
[2]	training's ndcg@5: 0.338347	valid_1's ndcg@5: 0.325639
[3]	training's ndcg@5: 0.350967	valid_1's ndcg@5: 0.335463
[4]	training's ndcg@5: 0.358338	valid_1's ndcg@5: 0.340253
[5]	training's ndcg@5: 0.363304	valid_1's ndcg@5: 0.343129
[6]	training's ndcg@5: 0.367693	valid_1's ndcg@5: 0.34661
[7]	training's ndcg@5: 0.371235	valid_1's ndcg@5: 0.34854
[8]	training's ndcg@5: 0.376208	valid_1's ndcg@5: 0.352454
[9]	training's ndcg@5: 0.378396	valid_1's ndcg@5: 0.35293
[10]	training's ndcg@5: 0.38013	valid_1's ndcg@5: 0.353548
[11]	training's ndcg@5: 0.381131	valid_1's ndcg@5: 0.354361
[12]	training's ndcg@5: 0.383741	valid_1's ndcg@5: 0.357238
[13]	training's ndcg@5: 0.384683	valid_1's ndcg@5: 0.358202
[14]	training's ndcg@5: 0.386477	valid_1's ndcg@5: 0.358803
[15]	training's ndcg@5: 0.388247	valid_1's ndcg@5: 0.360052
[16]	training's ndcg@5: 0.389814	valid_1's ndcg@5: 0.360573
[17]	training's ndcg@5: 0.392594	valid_1's ndcg@5: 0.

[138]	training's ndcg@5: 0.463035	valid_1's ndcg@5: 0.383341
[139]	training's ndcg@5: 0.463402	valid_1's ndcg@5: 0.383638
[140]	training's ndcg@5: 0.463887	valid_1's ndcg@5: 0.383705
[141]	training's ndcg@5: 0.464127	valid_1's ndcg@5: 0.383422
[142]	training's ndcg@5: 0.464507	valid_1's ndcg@5: 0.383377
[143]	training's ndcg@5: 0.464749	valid_1's ndcg@5: 0.383349
[144]	training's ndcg@5: 0.465072	valid_1's ndcg@5: 0.383566
[145]	training's ndcg@5: 0.465426	valid_1's ndcg@5: 0.383284
[146]	training's ndcg@5: 0.465676	valid_1's ndcg@5: 0.383337
[147]	training's ndcg@5: 0.466024	valid_1's ndcg@5: 0.383733
[148]	training's ndcg@5: 0.466291	valid_1's ndcg@5: 0.383427
[149]	training's ndcg@5: 0.46666	valid_1's ndcg@5: 0.383583
[150]	training's ndcg@5: 0.467077	valid_1's ndcg@5: 0.383805
[151]	training's ndcg@5: 0.467615	valid_1's ndcg@5: 0.384001
[152]	training's ndcg@5: 0.468035	valid_1's ndcg@5: 0.384107
[153]	training's ndcg@5: 0.468389	valid_1's ndcg@5: 0.384349
[154]	training's ndcg@5: 

[273]	training's ndcg@5: 0.502141	valid_1's ndcg@5: 0.388422
[274]	training's ndcg@5: 0.502367	valid_1's ndcg@5: 0.388224
[275]	training's ndcg@5: 0.502514	valid_1's ndcg@5: 0.388258
[276]	training's ndcg@5: 0.502728	valid_1's ndcg@5: 0.388299
[277]	training's ndcg@5: 0.502997	valid_1's ndcg@5: 0.388318
[278]	training's ndcg@5: 0.503246	valid_1's ndcg@5: 0.388656
[279]	training's ndcg@5: 0.503493	valid_1's ndcg@5: 0.388593
[280]	training's ndcg@5: 0.503694	valid_1's ndcg@5: 0.388832
[281]	training's ndcg@5: 0.503926	valid_1's ndcg@5: 0.388837
[282]	training's ndcg@5: 0.504135	valid_1's ndcg@5: 0.388724
[283]	training's ndcg@5: 0.504293	valid_1's ndcg@5: 0.388764
[284]	training's ndcg@5: 0.504441	valid_1's ndcg@5: 0.38872
[285]	training's ndcg@5: 0.504629	valid_1's ndcg@5: 0.388917
[286]	training's ndcg@5: 0.504818	valid_1's ndcg@5: 0.388812
[287]	training's ndcg@5: 0.504997	valid_1's ndcg@5: 0.388738
[288]	training's ndcg@5: 0.505277	valid_1's ndcg@5: 0.38886
[289]	training's ndcg@5: 0

LGBMRanker(learning_rate=0.05, metric='ndcg', n_estimators=300,
           objective='lambdarank', subsample=0.9, verbose=1)

In [45]:
y_pred_7 = model7.predict(X_test)
y_pred_7

array([ 0.47556197, -0.89544195, -0.67214764, ..., -0.36919047,
        0.06262034,  0.52796963])

In [46]:
df = pd.concat([X_test["srch_id"], pos.iloc[test_inds]], axis=1)
df['predictions'] = y_pred_7
# df.head(30)

In [47]:
scores = []
for i in df['srch_id'].unique():
    a1 = [df[df["srch_id"]==i]["position"].values]
    a2 = [df[df["srch_id"]==i]["predictions"].values]
    scores.append(ndcg_score(a1, a2, k=5))
print(sum(scores)/len(scores)) #should be better than 0,373

0.37192940880814823


In [48]:
y_pred7 = model7.predict(test)
y_pred7

KeyboardInterrupt: 

# Submission

- by target_score (click_bool)

In [27]:
r = pd.DataFrame(data = -y_pred66, columns=['target_score']) #replace with y_pred7
r.head(20)

In [28]:
r["srch_id"] = test['srch_id']
r["prop_id"] = test['prop_id']

In [29]:
result = r.sort_values(['srch_id','target_score'])[["srch_id","prop_id"]]

In [30]:
# result.head(10)

In [31]:
result.to_csv("sub14.csv", index=False)

-	CatBoost https://colab.research.google.com/drive/1cuFTgBFRVFD8dVP74QkhNZ_9v7sDgx_z 

https://www.kaggle.com/code/danofer/catboost-ranking-ncdg-expedia-search-queries 
-	TF listwise https://www.tensorflow.org/ranking 