In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GroupShuffleSplit, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import ndcg_score, make_scorer

# Modeling

In [2]:
pos = pd.read_csv('pos.csv')

In [4]:
train = pd.read_csv("cleaned_train.csv")
test = pd.read_csv("cleaned_test.csv")

In [5]:
cat_features = [ "comp1_rate", "comp1_inv", "comp2_rate", "comp2_inv", "comp3_rate", "comp3_inv", "comp4_rate", "comp4_inv",
                "comp5_rate", "comp5_inv", "comp6_rate", "comp6_inv", "comp7_rate", "comp7_inv", "comp8_rate", "comp8_inv",
                "weekday", "month"]
rest = ["srch_id", "site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"]

target = 'target_score' #click_bool in this case


In [6]:
# group split needed for both train-test split and CV
splitter = GroupShuffleSplit(test_size=0.1, n_splits=1, random_state = 7)
split = splitter.split(train, groups=train['srch_id'])
train_inds, test_inds = next(split)

train_df = train.iloc[train_inds]
test_df = train.iloc[test_inds]

X_train = train_df.drop([target], axis=1)
X_test = test_df.drop([target], axis=1)
y_train = train_df[target]
y_test = test_df[target]

### **) model with default parameters on validation data

In [7]:
model_2 = lgb.LGBMRegressor()


In [8]:
%%time
model_2.fit(X_train, y_train, categorical_feature=rest)




CPU times: user 3min 32s, sys: 32.9 s, total: 4min 5s
Wall time: 1min 40s


LGBMRegressor()

In [9]:
# %%time
y_pred_2 = model_2.predict(X_test)
y_pred_2

array([-0.06094215, -0.0292679 , -0.04246334, ..., -0.03407786,
       -0.05183003, -0.049999  ])

In [17]:
df = pd.concat([X_test["srch_id"], pos.iloc[test_inds]], axis=1)
df['predictions'] = -y_pred_2
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,position,predictions,predictions_n
119,12,25,0.060942,0.143452
120,12,28,0.029268,0.105455
121,12,24,0.042463,0.121284
122,12,13,0.046966,0.126686
123,12,18,0.067535,0.15136
124,12,3,0.02941,0.105626
125,12,14,0.029594,0.105847
126,12,4,0.048431,0.128443
127,12,22,0.031605,0.108258
128,12,15,0.054236,0.135407


In [28]:
scores = []
for i in df['srch_id'].unique():
    a1 = [df[df["srch_id"]==i]["position"].values]
    a2 = [df[df["srch_id"]==i]["predictions"].values] 
    scores.append(ndcg_score(a1, a2, k=5)) 
print(sum(scores)/len(scores)) #0.39059

0.3905902440826018


## 1) Pointwise LGBM regression (no tuning)

In [355]:
model1 = lgb.LGBMRegressor()

In [356]:
%%time

# fit the same model on whole train data
model1.fit(train.drop([target], axis=1), train[target], categorical_feature=rest)



CPU times: user 3min 27s, sys: 32.3 s, total: 3min 59s
Wall time: 1min 57s


LGBMRegressor()

In [357]:
%%time
y_pred1 = model1.predict(test)
y_pred1 # 0,333 NDCG on public leaderboard 

CPU times: user 3min 31s, sys: 26.5 s, total: 3min 57s
Wall time: 1min 35s


array([-0.04420504, -0.0808436 , -0.02696308, ..., -0.07440339,
       -0.05211163, -0.05319732])

## 2) Pointwise LGBM regression (hyperparameters tuned)

In [5]:
gss = GroupShuffleSplit(n_splits=4, test_size=0.25)

In [29]:
custom_scorer = make_scorer(ndcg_score, k=5, greater_is_better=True)

### 2.1) random search

In [11]:
# tune hyperparameters with groupKfold
lgb_1 = lgb.LGBMRegressor()

random_grid_params = {
    'learning_rate': [0.05, 0.1, 0.15], 
    'n_estimators': [80, 100, 110, 120], 
    'min_child_samples': [17, 20, 23],
    'num_leaves': [28, 31, 34],# large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type': ["gbdt", "dart", "goss"], # for better accuracy -> try dart
    'max_bin': [255, 300],#large max_bin helps improve accuracy but might slow down training progress
    'subsample': [1, 0.9],
    'random_state': [42],
    'verbose': [1]
}

random_search = RandomizedSearchCV(lgb_1, random_grid_params, n_iter=15, scoring=custom_scorer, cv=gss)


In [12]:
%%time 
random_search.fit(X_train, y_train, groups=X_train['srch_id'], categorical_feature=rest)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61533
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61230
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61152
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61960
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61648
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61645
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61960
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61648
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61645
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54800
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100
[LightGBM] [Info] Start training from score -0.044765
CPU times: user 5h 52min 36s, sys: 29min, total: 6h 21min 37s
Wall time: 2h 14min 14s


RandomizedSearchCV(cv=GroupShuffleSplit(n_splits=4, random_state=None, test_size=0.25,
         train_size=None),
                   estimator=LGBMRegressor(), n_iter=15,
                   param_distributions={'boosting_type': ['gbdt', 'dart',
                                                          'goss'],
                                        'learning_rate': [0.05, 0.1, 0.15],
                                        'max_bin': [255, 300],
                                        'min_child_samples': [17, 20, 23],
                                        'n_estimators': [80, 100, 110, 120],
                                        'num_leaves': [28, 31, 34],
                                        'random_state': [42],
                                        'subsample': [1, 0.9], 'verbose': [1]},
                   scoring=make_scorer(ndcg5))

In [13]:
print(random_search.best_params_)

{'verbose': 1, 'subsample': 1, 'random_state': 42, 'num_leaves': 34, 'n_estimators': 80, 'min_child_samples': 23, 'max_bin': 300, 'learning_rate': 0.1, 'boosting_type': 'dart'}


### 2.2) grid search

In [23]:
lgb_2 = lgb.LGBMRegressor()

grid_params = {
    'learning_rate': [0.1], 
    'n_estimators': [75, 80, 85], 
    'min_child_samples': [23, 25],
    'num_leaves': [34],
    'boosting_type': ['dart'],
    'max_bin': [300],
    'subsample': [1],
    'random_state': [42],
    'verbose': [1]
}

grid_search = GridSearchCV(lgb_2, grid_params, scoring=custom_scorer, cv=gss)


In [24]:
%%time 
grid_search.fit(X_train, y_train, groups=X_train['srch_id'], categorical_feature=rest)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61847
[LightGBM] [Info] Number of data points in the train set: 3344833, number of used features: 100
[LightGBM] [Info] Start training from score -0.044766
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61789
[LightGBM] [Info] Number of data points in the train set: 3346166, number of used features: 99
[LightGBM] [Info] Start training from score -0.044733
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61706
[LightGBM] [Info] Number of data points in the train set: 3346502, number of used features: 99
[LightGBM] [Info] Start training from score -0.044745
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61847
[LightGBM] [Info] Number of data points in the train set: 3344833, number of used features: 100
[LightGBM] [Info] Start training from score -0.044766
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61789
[LightGBM] [Info] Number of data points in the train set: 3346166, number of used features: 99
[LightGBM] [Info] Start training from score -0.044733
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61706
[LightGBM] [Info] Number of data points in the train set: 3346502, number of used features: 99
[LightGBM] [Info] Start training from score -0.044745
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

GridSearchCV(cv=GroupShuffleSplit(n_splits=4, random_state=None, test_size=0.25,
         train_size=None),
             estimator=LGBMRegressor(),
             param_grid={'boosting_type': ['dart'], 'learning_rate': [0.1],
                         'max_bin': [300], 'min_child_samples': [23, 25],
                         'n_estimators': [75, 80, 85], 'num_leaves': [34],
                         'random_state': [42], 'subsample': [1],
                         'verbose': [1]},
             scoring=make_scorer(ndcg5))

In [26]:
print(grid_search.best_params_)

{'boosting_type': 'dart', 'learning_rate': 0.1, 'max_bin': 300, 'min_child_samples': 23, 'n_estimators': 75, 'num_leaves': 34, 'random_state': 42, 'subsample': 1, 'verbose': 1}


### 2.3) model with best parameters on validation data

In [27]:
model2 = lgb.LGBMRegressor(boosting_type='dart', learning_rate= 0.1, max_bin= 300, min_child_samples= 23, n_estimators= 75, 
                            num_leaves= 34, random_state= 42, subsample= 1, verbose= 1)


In [28]:
%%time
model2.fit(X_train, y_train, categorical_feature=rest)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54800
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100
[LightGBM] [Info] Start training from score -0.044765
CPU times: user 4min 57s, sys: 27 s, total: 5min 24s
Wall time: 1min 47s


LGBMRegressor(boosting_type='dart', max_bin=300, min_child_samples=23,
              n_estimators=75, num_leaves=34, random_state=42, subsample=1,
              verbose=1)

In [29]:
# %%time
y_pred2 = model2.predict(X_test)
y_pred2

array([-0.05891092, -0.02470499, -0.03123527, ..., -0.03825708,
       -0.0381674 , -0.04850369])

In [51]:
df = pd.concat([X_test["srch_id"], -y_test], axis=1)
df['predictions'] = -y_pred2
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,target_score,predictions,predictions_n
119,12,0,0.058911,0.112061
120,12,0,0.024705,0.048015
121,12,0,0.031235,0.060242
122,12,0,0.033468,0.064423
123,12,0,0.055979,0.106572
124,12,0,0.024705,0.048015
125,12,0,0.024705,0.048015
126,12,0,0.036114,0.069378
127,12,0,0.025796,0.050058
128,12,0,0.037591,0.072143


In [52]:
scores = []
for i in df['srch_id'].unique():
#     #t1
#     a1 = [df[df["srch_id"]==i]["target_score"].values]
#     a2 = [df[df["srch_id"]==i]["predictions"].values]
#     scores.append(ndcg_score(a1, a2, k=5))

    #t2 - better so far 0,5
    a1 = df[df["srch_id"]==i]["target_score"].values
    a2 = df[df["srch_id"]==i]["predictions_n"].values
    scores.append(ndcg5(a1, a2))
print(sum(scores)/len(scores))

0.6502984277660459


### 2.4) Model with best parameters on test data trained on training and validation data

In [53]:
model3 = lgb.LGBMRegressor()

In [54]:
%%time
model3.fit(train.drop([target], axis=1), train[target], categorical_feature=rest)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 52285
[LightGBM] [Info] Number of data points in the train set: 4958347, number of used features: 100
[LightGBM] [Info] Start training from score -0.044749
CPU times: user 5min 9s, sys: 50.1 s, total: 5min 59s
Wall time: 2min 33s


LGBMRegressor(boosting_type='dart', max_bin=300, min_child_samples=23,
              n_estimators=75, num_leaves=34, random_state=42, subsample=1,
              verbose=1)

In [55]:
%%time
y_pred3 = model3.predict(test)
y_pred3 

CPU times: user 1min 32s, sys: 34.5 s, total: 2min 6s
Wall time: 1min 12s


array([-0.03317082, -0.07022708, -0.02995487, ..., -0.06802567,
       -0.04854744, -0.04361254])

# ===========

## 3) Pointwise LGBM regression (hyperparameters tuned manually)

In [30]:
gss = GroupShuffleSplit(n_splits=4, test_size=0.25)

In [31]:
custom_scorer = make_scorer(ndcg_score, k=5, greater_is_better=True)

In [39]:
%%time
it_sc = []
for train_idx, test_idx in gss.split(X_train, y_train, groups=X_train['srch_id']):
    lgb_1 = lgb.LGBMRegressor(learning_rate=0.05)
    lgb_1.fit(X_train.iloc[train_idx], y_train.iloc[train_idx], categorical_feature=rest)
    print("FITTED")
    pred = lgb_1.predict(X_train.iloc[test_idx])
    print(len(pred))
    df = pd.concat([X_train["srch_id"].iloc[test_idx], pos.iloc[test_idx]], axis=1)
    print("FAULT preds")
    df['predictions'] = -pred
    scores = []
    for i in df['srch_id'].unique():
        a1 = [df[df["srch_id"]==i]["position"].values]
        a2 = [df[df["srch_id"]==i]["predictions"].values] 
        scores.append(ndcg_score(a1, a2, k=5)) 
    print(sum(scores)/len(scores))
    it_sc.append(sum(scores)/len(scores))
print(it_sc)
print(sum(it_sc)/len(it_sc))



FITTED
1114548


KeyError: "None of [Int64Index([      0,       1,       2,       3,       4,       5,       6,\n                  7,       8,       9,\n            ...\n            4461196, 4461197, 4461198, 4461199, 4461200, 4461201, 4461202,\n            4461203, 4461204, 4461205],\n           dtype='int64', length=1114548)] are in the [columns]"

In [11]:
# tune hyperparameters with groupKfold
lgb_1 = lgb.LGBMRegressor()

random_grid_params = {
    'learning_rate': [0.05, 0.1, 0.15], 
    'n_estimators': [80, 100, 110, 120], 
    'min_child_samples': [17, 20, 23],
    'num_leaves': [28, 31, 34],# large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type': ["gbdt", "dart", "goss"], # for better accuracy -> try dart
    'max_bin': [255, 300],#large max_bin helps improve accuracy but might slow down training progress
    'subsample': [1, 0.9],
    'random_state': [42],
    'verbose': [1]
}

random_search = RandomizedSearchCV(lgb_1, random_grid_params, n_iter=15, scoring=custom_scorer, cv=gss)


In [12]:
%%time 
random_search.fit(X_train, y_train, groups=X_train['srch_id'], categorical_feature=rest)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61533
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61230
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61152
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61960
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61648
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61645
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61960
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61648
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61645
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54800
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100
[LightGBM] [Info] Start training from score -0.044765
CPU times: user 5h 52min 36s, sys: 29min, total: 6h 21min 37s
Wall time: 2h 14min 14s


RandomizedSearchCV(cv=GroupShuffleSplit(n_splits=4, random_state=None, test_size=0.25,
         train_size=None),
                   estimator=LGBMRegressor(), n_iter=15,
                   param_distributions={'boosting_type': ['gbdt', 'dart',
                                                          'goss'],
                                        'learning_rate': [0.05, 0.1, 0.15],
                                        'max_bin': [255, 300],
                                        'min_child_samples': [17, 20, 23],
                                        'n_estimators': [80, 100, 110, 120],
                                        'num_leaves': [28, 31, 34],
                                        'random_state': [42],
                                        'subsample': [1, 0.9], 'verbose': [1]},
                   scoring=make_scorer(ndcg5))

## 5) Listwise: LGBMRanker with default parameters

In [46]:
y_train_ = -y_train
y_test_ = -y_test

In [47]:
model6 = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1)

In [48]:
%%time
model6.fit(X_train, y_train_, eval_set=[(X_train, y_train_), (X_test, y_test_)], eval_group=[X_train['srch_id'].value_counts(sort=False).sort_index(), X_test['srch_id'].value_counts(sort=False).sort_index()], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
            eval_at=5,categorical_feature=rest)


New categorical_feature is ['prop_country_id', 'prop_id', 'site_id', 'srch_destination_id', 'srch_id', 'visitor_location_country_id']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54305
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100




[1]	training's ndcg@5: 0.299461	valid_1's ndcg@5: 0.292636
[2]	training's ndcg@5: 0.339943	valid_1's ndcg@5: 0.325278
[3]	training's ndcg@5: 0.352922	valid_1's ndcg@5: 0.333621
[4]	training's ndcg@5: 0.36205	valid_1's ndcg@5: 0.340558
[5]	training's ndcg@5: 0.367696	valid_1's ndcg@5: 0.346121
[6]	training's ndcg@5: 0.372135	valid_1's ndcg@5: 0.348248
[7]	training's ndcg@5: 0.37652	valid_1's ndcg@5: 0.351114
[8]	training's ndcg@5: 0.379403	valid_1's ndcg@5: 0.352944
[9]	training's ndcg@5: 0.382513	valid_1's ndcg@5: 0.354109
[10]	training's ndcg@5: 0.384195	valid_1's ndcg@5: 0.356146
[11]	training's ndcg@5: 0.387085	valid_1's ndcg@5: 0.357555
[12]	training's ndcg@5: 0.389873	valid_1's ndcg@5: 0.359794
[13]	training's ndcg@5: 0.392433	valid_1's ndcg@5: 0.360516
[14]	training's ndcg@5: 0.393826	valid_1's ndcg@5: 0.360487
[15]	training's ndcg@5: 0.397919	valid_1's ndcg@5: 0.363979
[16]	training's ndcg@5: 0.39977	valid_1's ndcg@5: 0.365685
[17]	training's ndcg@5: 0.400871	valid_1's ndcg@5: 0

LGBMRanker(metric='ndcg', objective='lambdarank', verbose=1)

In [49]:
y_pred_6 = model6.predict(X_test)
y_pred_6

array([ 0.46175809, -0.7501605 , -0.54465082, ..., -0.26479785,
        0.05585781,  0.35997866])

In [72]:
df = pd.concat([X_test["srch_id"], pos.iloc[test_inds]], axis=1)
df['predictions'] = y_pred_6
# df.head(30)

In [52]:
scores = []
for i in df['srch_id'].unique():
    a1 = [df[df["srch_id"]==i]["position"].values]
    a2 = [df[df["srch_id"]==i]["predictions"].values]
    scores.append(ndcg_score(a1, a2, k=5))
print(sum(scores)/len(scores)) 

0.3724792233297772


In [19]:
y_pred6 = model6.predict(test)
y_pred6 #scored 0,36 on public leaderboard

array([ 0.46165524,  0.88704194, -0.1051241 , ...,  0.01261689,
        0.06631362,  0.17210367])

## Manual try parameters

In [79]:
model66 = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1, num_iterations=200)

In [80]:
%%time
model66.fit(X_train, y_train_, eval_set=[(X_train, y_train_), (X_test, y_test_)], eval_group=[X_train['srch_id'].value_counts(sort=False).sort_index(), X_test['srch_id'].value_counts(sort=False).sort_index()], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
            eval_at=5,categorical_feature=rest)


New categorical_feature is ['prop_country_id', 'prop_id', 'site_id', 'srch_destination_id', 'srch_id', 'visitor_location_country_id']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54305
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100




[1]	training's ndcg@5: 0.299461	valid_1's ndcg@5: 0.292636
[2]	training's ndcg@5: 0.339943	valid_1's ndcg@5: 0.325278
[3]	training's ndcg@5: 0.352922	valid_1's ndcg@5: 0.333621
[4]	training's ndcg@5: 0.36205	valid_1's ndcg@5: 0.340558
[5]	training's ndcg@5: 0.367696	valid_1's ndcg@5: 0.346121
[6]	training's ndcg@5: 0.372135	valid_1's ndcg@5: 0.348248
[7]	training's ndcg@5: 0.37652	valid_1's ndcg@5: 0.351114
[8]	training's ndcg@5: 0.379403	valid_1's ndcg@5: 0.352944
[9]	training's ndcg@5: 0.382513	valid_1's ndcg@5: 0.354109
[10]	training's ndcg@5: 0.384195	valid_1's ndcg@5: 0.356146
[11]	training's ndcg@5: 0.387085	valid_1's ndcg@5: 0.357555
[12]	training's ndcg@5: 0.389873	valid_1's ndcg@5: 0.359794
[13]	training's ndcg@5: 0.392433	valid_1's ndcg@5: 0.360516
[14]	training's ndcg@5: 0.393826	valid_1's ndcg@5: 0.360487
[15]	training's ndcg@5: 0.397919	valid_1's ndcg@5: 0.363979
[16]	training's ndcg@5: 0.39977	valid_1's ndcg@5: 0.365685
[17]	training's ndcg@5: 0.400871	valid_1's ndcg@5: 0

[138]	training's ndcg@5: 0.495693	valid_1's ndcg@5: 0.386988
[139]	training's ndcg@5: 0.496066	valid_1's ndcg@5: 0.387077
[140]	training's ndcg@5: 0.496495	valid_1's ndcg@5: 0.387055
[141]	training's ndcg@5: 0.496934	valid_1's ndcg@5: 0.386898
[142]	training's ndcg@5: 0.497655	valid_1's ndcg@5: 0.387003
[143]	training's ndcg@5: 0.498067	valid_1's ndcg@5: 0.386671
[144]	training's ndcg@5: 0.498404	valid_1's ndcg@5: 0.386446
[145]	training's ndcg@5: 0.498834	valid_1's ndcg@5: 0.386529
[146]	training's ndcg@5: 0.499142	valid_1's ndcg@5: 0.386503
[147]	training's ndcg@5: 0.499526	valid_1's ndcg@5: 0.386362
[148]	training's ndcg@5: 0.499939	valid_1's ndcg@5: 0.386376
[149]	training's ndcg@5: 0.500508	valid_1's ndcg@5: 0.38647
[150]	training's ndcg@5: 0.50094	valid_1's ndcg@5: 0.386645
[151]	training's ndcg@5: 0.50128	valid_1's ndcg@5: 0.386563
[152]	training's ndcg@5: 0.501859	valid_1's ndcg@5: 0.386751
[153]	training's ndcg@5: 0.502314	valid_1's ndcg@5: 0.386871
[154]	training's ndcg@5: 0.

LGBMRanker(metric='ndcg', num_iterations=200, objective='lambdarank', verbose=1)

In [81]:
y_pred_66 = model66.predict(X_test)
y_pred_66

array([ 0.49892916, -0.77827307, -0.48718981, ..., -0.38726199,
        0.22071585,  0.40486471])

In [82]:
df = pd.concat([X_test["srch_id"], pos.iloc[test_inds]], axis=1)
df['predictions'] = y_pred_66
# df.head(30)

In [83]:
scores = []
for i in df['srch_id'].unique():
    a1 = [df[df["srch_id"]==i]["position"].values]
    a2 = [df[df["srch_id"]==i]["predictions"].values]
    scores.append(ndcg_score(a1, a2, k=5))
print(sum(scores)/len(scores)) #3733

0.3733667894875733


In [84]:
model66_ = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1, num_iterations=200)

In [90]:
%%time
model66_.fit(train.drop([target], axis=1), -train[target], verbose=1,
             group=train['srch_id'].value_counts(sort=False).sort_index(),categorical_feature=rest)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 52080
[LightGBM] [Info] Number of data points in the train set: 4958347, number of used features: 100
CPU times: user 8min 1s, sys: 55.6 s, total: 8min 57s
Wall time: 3min 36s


LGBMRanker(metric='ndcg', num_iterations=200, objective='lambdarank', verbose=1)

In [91]:
%%time
y_pred66 = model66_.predict(test)
y_pred66 

array([ 0.59142269,  1.12019842, -0.25803262, ...,  0.03295955,
       -0.05237822,  0.23207979])

## 6) Listwise: LGBMRanker with tuned hyperparameters

In [31]:
gss = GroupShuffleSplit(n_splits=4, test_size=0.25)

In [61]:
lgb_11 = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1)

In [62]:

random_grid_params = {
    'learning_rate': [0.05, 0.1, 0.15], 
    'n_estimators': [80, 100, 110, 120], 
    'min_child_samples': [17, 20, 23],
    'num_leaves': [28, 31, 34],# large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type': ["gbdt", "dart", "goss"], # for better accuracy -> try dart
    'max_bin': [255, 300],#large max_bin helps improve accuracy but might slow down training progress
    'subsample': [1, 0.9],
    'random_state': [42],
    'verbose': [1]
}

random_search = RandomizedSearchCV(lgb_11, random_grid_params, n_iter=2, scoring=custom_scorer, cv=gss)

In [65]:
%%time 
random_search.fit(X_train, y_train, groups=X_train['srch_id'], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
                  eval_at=5, categorical_feature=rest)


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/sklearn.py", line 977, in fit
    super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/sklearn.py", line 612, in fit
    self._Booster = train(params, train_set,
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/engine.py", line 231, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 2053, in __init__
    train_set.construct()
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 1321, in construct
    self._lazy_init(self.data, label=self.label,
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 1141, in _lazy

LightGBMError: Label should be non-negative (met -1.000000) for ranking task

In [9]:
%%time
model7.fit(X_train, y_train_, eval_set=[(X_train, y_train_), (X_test, y_test_)], eval_group=[X_train['srch_id'].value_counts(sort=False).sort_index(), X_test['srch_id'].value_counts(sort=False).sort_index()], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
            eval_at=5,categorical_feature=rest)


New categorical_feature is ['prop_country_id', 'prop_id', 'site_id', 'srch_destination_id', 'srch_id', 'visitor_location_country_id']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54305
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100




[1]	training's ndcg@5: 0.299461	valid_1's ndcg@5: 0.292636
[2]	training's ndcg@5: 0.339943	valid_1's ndcg@5: 0.325278
[3]	training's ndcg@5: 0.352922	valid_1's ndcg@5: 0.333621
[4]	training's ndcg@5: 0.36205	valid_1's ndcg@5: 0.340558
[5]	training's ndcg@5: 0.367696	valid_1's ndcg@5: 0.346121
[6]	training's ndcg@5: 0.372135	valid_1's ndcg@5: 0.348248
[7]	training's ndcg@5: 0.37652	valid_1's ndcg@5: 0.351114
[8]	training's ndcg@5: 0.379403	valid_1's ndcg@5: 0.352944
[9]	training's ndcg@5: 0.382513	valid_1's ndcg@5: 0.354109
[10]	training's ndcg@5: 0.384195	valid_1's ndcg@5: 0.356146
[11]	training's ndcg@5: 0.387085	valid_1's ndcg@5: 0.357555
[12]	training's ndcg@5: 0.389873	valid_1's ndcg@5: 0.359794
[13]	training's ndcg@5: 0.392433	valid_1's ndcg@5: 0.360516
[14]	training's ndcg@5: 0.393826	valid_1's ndcg@5: 0.360487
[15]	training's ndcg@5: 0.397919	valid_1's ndcg@5: 0.363979
[16]	training's ndcg@5: 0.39977	valid_1's ndcg@5: 0.365685
[17]	training's ndcg@5: 0.400871	valid_1's ndcg@5: 0

LGBMRanker(metric='ndcg', objective='lambdarank', verbose=1)

In [11]:
y_pred_7 = model7.predict(X_test)
y_pred_7

array([ 0.46175809, -0.7501605 , -0.54465082, ..., -0.26479785,
        0.05585781,  0.35997866])

In [15]:
df = pd.concat([X_test["srch_id"], y_test_], axis=1)
df['predictions'] = y_pred_7
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,target_score,predictions,predictions_n
119,12,0,0.461758,0.506286
120,12,0,-0.750161,0.3269
121,12,0,-0.544651,0.357319
122,12,0,-0.536805,0.35848
123,12,0,0.192074,0.466368
124,12,0,-0.639114,0.343336
125,12,0,-0.658146,0.340519
126,12,0,-0.320207,0.390541
127,12,0,-0.696013,0.334914
128,12,0,-0.215528,0.406035


In [18]:
scores = []
for i in df['srch_id'].unique():
#     #t1
#     a1 = [df[df["srch_id"]==i]["target_score"].values]
#     a2 = [df[df["srch_id"]==i]["predictions"].values]
#     scores.append(ndcg_score(a1, a2, k=5))

    #t2 - better so far 0,5
    a1 = df[df["srch_id"]==i]["target_score"].values
    a2 = df[df["srch_id"]==i]["predictions_n"].values
    scores.append(ndcg5(a1, a2))
print(sum(scores)/len(scores)) 

0.7919465816683966


In [19]:
y_pred7 = model7.predict(test)
y_pred7

array([ 0.46165524,  0.88704194, -0.1051241 , ...,  0.01261689,
        0.06631362,  0.17210367])

# Submission

- by target_score (click_bool)

In [92]:
r = pd.DataFrame(data = -y_pred66, columns=['target_score'])
# r.head(20)

In [93]:
r["srch_id"] = test['srch_id']
r["prop_id"] = test['prop_id']

In [94]:
result = r.sort_values(['srch_id','target_score'])[["srch_id","prop_id"]]

In [95]:
# result.head(10)

In [96]:
result.to_csv("sub13.csv", index=False)

-	CatBoost https://colab.research.google.com/drive/1cuFTgBFRVFD8dVP74QkhNZ_9v7sDgx_z 

https://www.kaggle.com/code/danofer/catboost-ranking-ncdg-expedia-search-queries 
-	TF listwise https://www.tensorflow.org/ranking 