In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GroupShuffleSplit, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import ndcg_score, make_scorer

# Modeling

In [2]:
train = pd.read_csv("cleaned_train.csv")
test = pd.read_csv("cleaned_test.csv")

In [3]:
cat_features = [ "comp1_rate", "comp1_inv", "comp2_rate", "comp2_inv", "comp3_rate", "comp3_inv", "comp4_rate", "comp4_inv",
                "comp5_rate", "comp5_inv", "comp6_rate", "comp6_inv", "comp7_rate", "comp7_inv", "comp8_rate", "comp8_inv",
                "weekday", "month"]
rest = ["srch_id", "site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"]

target = 'target_score' #click_bool in this case


In [6]:
# group split needed for both train-test split and CV
splitter = GroupShuffleSplit(test_size=0.1, n_splits=1, random_state = 7)
split = splitter.split(train, groups=train['srch_id'])
train_inds, test_inds = next(split)

train_df = train.iloc[train_inds]
test_df = train.iloc[test_inds]

X_train = train_df.drop([target], axis=1)
X_test = test_df.drop([target], axis=1)
y_train = train_df[target]
y_test = test_df[target]

## 1) Pointwise LGBM regression (no tuning)

In [355]:
model1 = lgb.LGBMRegressor()

In [356]:
%%time

# fit the same model on whole train data
model1.fit(train.drop([target], axis=1), train[target], categorical_feature=rest)



CPU times: user 3min 27s, sys: 32.3 s, total: 3min 59s
Wall time: 1min 57s


LGBMRegressor()

In [357]:
%%time
y_pred1 = model1.predict(test)
y_pred1 # 0,333 NDCG on public leaderboard when click_bool used + one-hot-encoding

CPU times: user 3min 31s, sys: 26.5 s, total: 3min 57s
Wall time: 1min 35s


array([-0.04420504, -0.0808436 , -0.02696308, ..., -0.07440339,
       -0.05211163, -0.05319732])

## 2) Pointwise LGBM regression (hyperparameters tuned)

In [5]:
gss = GroupShuffleSplit(n_splits=4, test_size=0.25)

In [22]:
def dcg_at_k(sorted_labels, k):
    if k > 0:
        k = min(sorted_labels.shape[0], k)
    else:
        k = sorted_labels.shape[0]
    denom = 1./np.log2(np.arange(k)+2.)
    nom = 2**sorted_labels-1.
    dcg = np.sum(nom[:k]*denom)
    return dcg

def ndcg5(scores, labels):
    sort_ind = np.argsort(scores)[::-1]
    sorted_labels = labels[sort_ind]
    ideal_labels = np.sort(labels)[::-1]
    return dcg_at_k(sorted_labels, 5) / dcg_at_k(ideal_labels, 5)

In [7]:
custom_scorer = make_scorer(ndcg5, greater_is_better=True)

### 2.1) random search

In [11]:
# tune hyperparameters with groupKfold
lgb_1 = lgb.LGBMRegressor()

random_grid_params = {
    'learning_rate': [0.05, 0.1, 0.15], 
    'n_estimators': [80, 100, 110, 120], 
    'min_child_samples': [17, 20, 23],
    'num_leaves': [28, 31, 34],# large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type': ["gbdt", "dart", "goss"], # for better accuracy -> try dart
    'max_bin': [255, 300],#large max_bin helps improve accuracy but might slow down training progress
    'subsample': [1, 0.9],
    'random_state': [42],
    'verbose': [1]
}

random_search = RandomizedSearchCV(lgb_1, random_grid_params, n_iter=15, scoring=custom_scorer, cv=gss)


In [12]:
%%time 
random_search.fit(X_train, y_train, groups=X_train['srch_id'], categorical_feature=rest)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61533
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61230
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61152
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61960
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61648
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61645
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61960
[LightGBM] [Info] Number of data points in the train set: 3346997, number of used features: 99
[LightGBM] [Info] Start training from score -0.044774
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61648
[LightGBM] [Info] Number of data points in the train set: 3345559, number of used features: 100
[LightGBM] [Info] Start training from score -0.044846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61645
[LightGBM] [Info] Number of data points in the train set: 3343685, number of used features: 99
[LightGBM] [Info] Start training from score -0.044803
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54800
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100
[LightGBM] [Info] Start training from score -0.044765
CPU times: user 5h 52min 36s, sys: 29min, total: 6h 21min 37s
Wall time: 2h 14min 14s


RandomizedSearchCV(cv=GroupShuffleSplit(n_splits=4, random_state=None, test_size=0.25,
         train_size=None),
                   estimator=LGBMRegressor(), n_iter=15,
                   param_distributions={'boosting_type': ['gbdt', 'dart',
                                                          'goss'],
                                        'learning_rate': [0.05, 0.1, 0.15],
                                        'max_bin': [255, 300],
                                        'min_child_samples': [17, 20, 23],
                                        'n_estimators': [80, 100, 110, 120],
                                        'num_leaves': [28, 31, 34],
                                        'random_state': [42],
                                        'subsample': [1, 0.9], 'verbose': [1]},
                   scoring=make_scorer(ndcg5))

In [13]:
print(random_search.best_params_)

{'verbose': 1, 'subsample': 1, 'random_state': 42, 'num_leaves': 34, 'n_estimators': 80, 'min_child_samples': 23, 'max_bin': 300, 'learning_rate': 0.1, 'boosting_type': 'dart'}


### 2.2) grid search

In [23]:
lgb_2 = lgb.LGBMRegressor()

grid_params = {
    'learning_rate': [0.1], 
    'n_estimators': [75, 80, 85], 
    'min_child_samples': [23, 25],
    'num_leaves': [34],
    'boosting_type': ['dart'],
    'max_bin': [300],
    'subsample': [1],
    'random_state': [42],
    'verbose': [1]
}

grid_search = GridSearchCV(lgb_2, grid_params, scoring=custom_scorer, cv=gss)


In [24]:
%%time 
grid_search.fit(X_train, y_train, groups=X_train['srch_id'], categorical_feature=rest)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61847
[LightGBM] [Info] Number of data points in the train set: 3344833, number of used features: 100
[LightGBM] [Info] Start training from score -0.044766
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61789
[LightGBM] [Info] Number of data points in the train set: 3346166, number of used features: 99
[LightGBM] [Info] Start training from score -0.044733
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61706
[LightGBM] [Info] Number of data points in the train set: 3346502, number of used features: 99
[LightGBM] [Info] Start training from score -0.044745
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61847
[LightGBM] [Info] Number of data points in the train set: 3344833, number of used features: 100
[LightGBM] [Info] Start training from score -0.044766
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61789
[LightGBM] [Info] Number of data points in the train set: 3346166, number of used features: 99
[LightGBM] [Info] Start training from score -0.044733
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 61706
[LightGBM] [Info] Number of data points in the train set: 3346502, number of used features: 99
[LightGBM] [Info] Start training from score -0.044745
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

GridSearchCV(cv=GroupShuffleSplit(n_splits=4, random_state=None, test_size=0.25,
         train_size=None),
             estimator=LGBMRegressor(),
             param_grid={'boosting_type': ['dart'], 'learning_rate': [0.1],
                         'max_bin': [300], 'min_child_samples': [23, 25],
                         'n_estimators': [75, 80, 85], 'num_leaves': [34],
                         'random_state': [42], 'subsample': [1],
                         'verbose': [1]},
             scoring=make_scorer(ndcg5))

In [26]:
print(grid_search.best_params_)

{'boosting_type': 'dart', 'learning_rate': 0.1, 'max_bin': 300, 'min_child_samples': 23, 'n_estimators': 75, 'num_leaves': 34, 'random_state': 42, 'subsample': 1, 'verbose': 1}


### 2.3) model with best parameters on validation data

In [27]:
model2 = lgb.LGBMRegressor(boosting_type='dart', learning_rate= 0.1, max_bin= 300, min_child_samples= 23, n_estimators= 75, 
                            num_leaves= 34, random_state= 42, subsample= 1, verbose= 1)


In [28]:
%%time
model2.fit(X_train, y_train, categorical_feature=rest)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54800
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100
[LightGBM] [Info] Start training from score -0.044765
CPU times: user 4min 57s, sys: 27 s, total: 5min 24s
Wall time: 1min 47s


LGBMRegressor(boosting_type='dart', max_bin=300, min_child_samples=23,
              n_estimators=75, num_leaves=34, random_state=42, subsample=1,
              verbose=1)

In [29]:
# %%time
y_pred2 = model2.predict(X_test)
y_pred2

array([-0.05891092, -0.02470499, -0.03123527, ..., -0.03825708,
       -0.0381674 , -0.04850369])

In [51]:
df = pd.concat([X_test["srch_id"], -y_test], axis=1)
df['predictions'] = -y_pred2
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,target_score,predictions,predictions_n
119,12,0,0.058911,0.112061
120,12,0,0.024705,0.048015
121,12,0,0.031235,0.060242
122,12,0,0.033468,0.064423
123,12,0,0.055979,0.106572
124,12,0,0.024705,0.048015
125,12,0,0.024705,0.048015
126,12,0,0.036114,0.069378
127,12,0,0.025796,0.050058
128,12,0,0.037591,0.072143


In [52]:
scores = []
for i in df['srch_id'].unique():
#     #t1
#     a1 = [df[df["srch_id"]==i]["target_score"].values]
#     a2 = [df[df["srch_id"]==i]["predictions"].values]
#     scores.append(ndcg_score(a1, a2, k=5))

    #t2 - better so far 0,5
    a1 = df[df["srch_id"]==i]["target_score"].values
    a2 = df[df["srch_id"]==i]["predictions_n"].values
    scores.append(ndcg5(a1, a2))
print(sum(scores)/len(scores))

0.6502984277660459


### **) model with default parameters on validation data

In [46]:
model_2 = lgb.LGBMRegressor()


In [47]:
%%time
model_2.fit(X_train, y_train, categorical_feature=rest)




CPU times: user 3min 27s, sys: 38.7 s, total: 4min 5s
Wall time: 1min 33s


LGBMRegressor()

In [48]:
# %%time
y_pred_2 = model_2.predict(X_test)
y_pred_2

array([-0.06094215, -0.0292679 , -0.04246334, ..., -0.03407786,
       -0.05183003, -0.049999  ])

In [49]:
df = pd.concat([X_test["srch_id"], -y_test], axis=1)
df['predictions'] = -y_pred_2
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,target_score,predictions,predictions_n
119,12,0,0.060942,0.143452
120,12,0,0.029268,0.105455
121,12,0,0.042463,0.121284
122,12,0,0.046966,0.126686
123,12,0,0.067535,0.15136
124,12,0,0.02941,0.105626
125,12,0,0.029594,0.105847
126,12,0,0.048431,0.128443
127,12,0,0.031605,0.108258
128,12,0,0.054236,0.135407


In [50]:
scores = []
for i in df['srch_id'].unique():
#     #t1
#     a1 = [df[df["srch_id"]==i]["target_score"].values]
#     a2 = [df[df["srch_id"]==i]["predictions"].values]
#     scores.append(ndcg_score(a1, a2, k=5))

    #t2 - better so far 0,5
    a1 = df[df["srch_id"]==i]["target_score"].values
    a2 = df[df["srch_id"]==i]["predictions_n"].values
    scores.append(ndcg5(a1, a2))
print(sum(scores)/len(scores))

0.7405544181973313


### 2.4) Model with best parameters on test data trained on training and validation data

In [53]:
model3 = lgb.LGBMRegressor()

In [54]:
%%time
model3.fit(train.drop([target], axis=1), train[target], categorical_feature=rest)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 52285
[LightGBM] [Info] Number of data points in the train set: 4958347, number of used features: 100
[LightGBM] [Info] Start training from score -0.044749
CPU times: user 5min 9s, sys: 50.1 s, total: 5min 59s
Wall time: 2min 33s


LGBMRegressor(boosting_type='dart', max_bin=300, min_child_samples=23,
              n_estimators=75, num_leaves=34, random_state=42, subsample=1,
              verbose=1)

In [55]:
%%time
y_pred3 = model3.predict(test)
y_pred3 

CPU times: user 1min 32s, sys: 34.5 s, total: 2min 6s
Wall time: 1min 12s


array([-0.03317082, -0.07022708, -0.02995487, ..., -0.06802567,
       -0.04854744, -0.04361254])

# ===========

## 3) Model with good parameters on test data trained on training and validation data

In [63]:
model4 = lgb.LGBMRegressor(boosting_type='dart', learning_rate= 0.1, max_bin= 260, random_state= 42, verbose= 1)


In [64]:
%%time
model4.fit(train.drop([target], axis=1), train[target], categorical_feature=rest)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 51907
[LightGBM] [Info] Number of data points in the train set: 4958347, number of used features: 100
[LightGBM] [Info] Start training from score -0.044749
CPU times: user 7min 28s, sys: 51.5 s, total: 8min 20s
Wall time: 3min 21s


LGBMRegressor(boosting_type='dart', max_bin=260, random_state=42, verbose=1)

In [78]:
%%time
y_pred4 = model4.predict(test)
y_pred4

CPU times: user 1min 53s, sys: 32.6 s, total: 2min 26s
Wall time: 1min 11s


array([-0.03488449, -0.0692179 , -0.03125838, ..., -0.07318662,
       -0.05010118, -0.04608794])

### **) model with good parameters on validation data

In [67]:
model_4 = lgb.LGBMRegressor(boosting_type='dart', learning_rate= 0.1, max_bin= 260, random_state= 42, verbose= 1)


In [68]:
%%time
model_4.fit(X_train, y_train, categorical_feature=rest)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54420
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100
[LightGBM] [Info] Start training from score -0.044765
CPU times: user 6min 54s, sys: 48.2 s, total: 7min 42s
Wall time: 3min 1s


LGBMRegressor(boosting_type='dart', max_bin=260, random_state=42, verbose=1)

In [69]:
# %%time
y_pred_4 = model_4.predict(X_test)
y_pred_4

array([-0.05791199, -0.02639941, -0.03381181, ..., -0.03913637,
       -0.03873836, -0.05061529])

In [71]:
df = pd.concat([X_test["srch_id"], -y_test], axis=1)
df['predictions'] = -y_pred_4
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,target_score,predictions,predictions_n
119,12,0,0.057912,0.100744
120,12,0,0.026399,0.045467
121,12,0,0.033812,0.058469
122,12,0,0.036452,0.063101
123,12,0,0.058651,0.10204
124,12,0,0.026399,0.045467
125,12,0,0.026399,0.045467
126,12,0,0.039499,0.068445
127,12,0,0.027546,0.047479
128,12,0,0.040725,0.070597


In [72]:
scores = []
for i in df['srch_id'].unique():
#     #t1
#     a1 = [df[df["srch_id"]==i]["target_score"].values]
#     a2 = [df[df["srch_id"]==i]["predictions"].values]
#     scores.append(ndcg_score(a1, a2, k=5))

    #t2 - better so far 0,5
    a1 = df[df["srch_id"]==i]["target_score"].values
    a2 = df[df["srch_id"]==i]["predictions_n"].values
    scores.append(ndcg5(a1, a2))
print(sum(scores)/len(scores))

0.6483025578095043


# ===========

### **) Manually tried models with good parameters on validation data
does not improve the ndcg

In [167]:
model_5 = lgb.LGBMRegressor(n_estimators = 105)

In [168]:
%%time
model_5.fit(X_train, y_train, categorical_feature=rest)




CPU times: user 3min 27s, sys: 46.3 s, total: 4min 13s
Wall time: 2min 4s


LGBMRegressor(n_estimators=105)

In [169]:
# %%time
y_pred_5 = model_5.predict(X_test)
# y_pred_5

In [170]:
df = pd.concat([X_test["srch_id"], -y_test], axis=1)
df['predictions'] = -y_pred_5
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,target_score,predictions,predictions_n
119,12,0,0.060821,0.1433
120,12,0,0.029147,0.105298
121,12,0,0.042735,0.121601
122,12,0,0.047238,0.127003
123,12,0,0.067728,0.151587
124,12,0,0.029289,0.105469
125,12,0,0.029473,0.10569
126,12,0,0.048792,0.128868
127,12,0,0.031484,0.108102
128,12,0,0.054612,0.135851


In [171]:
scores = []
for i in df['srch_id'].unique():
#     #t1
#     a1 = [df[df["srch_id"]==i]["target_score"].values]
#     a2 = [df[df["srch_id"]==i]["predictions"].values]
#     scores.append(ndcg_score(a1, a2, k=5))

    #t2 - better so far 0,5
    a1 = df[df["srch_id"]==i]["target_score"].values
    a2 = df[df["srch_id"]==i]["predictions_n"].values
    scores.append(ndcg5(a1, a2))
print(sum(scores)/len(scores)) 

0.7384794825200066


## 4) Model with good parameters on test data trained on training and validation data

In [172]:
model5 = lgb.LGBMRegressor(n_estimators = 105)


In [173]:
%%time
model5.fit(train.drop([target], axis=1), train[target], categorical_feature=rest)



CPU times: user 3min 36s, sys: 50.3 s, total: 4min 26s
Wall time: 2min 38s


LGBMRegressor(n_estimators=105)

In [174]:
%%time
y_pred5 = model5.predict(test)
y_pred5

CPU times: user 3min 50s, sys: 37.1 s, total: 4min 27s
Wall time: 2min 13s


array([-0.05101309, -0.080704  , -0.02689009, ..., -0.07455775,
       -0.05307171, -0.05305771])

## 5) Listwise: LGBMRanker with default parameters

In [13]:
y_train_ = -y_train
y_test_ = -y_test

In [8]:
model6 = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=1)

0          0
1          0
2          0
3          0
4          0
          ..
4958342    0
4958343    0
4958344    0
4958345   -1
4958346    0
Name: target_score, Length: 4461236, dtype: int64

In [14]:
%%time
model6.fit(X_train, y_train_, eval_set=[(X_train, y_train_), (X_test, y_test_)], eval_group=[X_train['srch_id'].value_counts(sort=False).sort_index(), X_test['srch_id'].value_counts(sort=False).sort_index()], group=X_train['srch_id'].value_counts(sort=False).sort_index(),
            eval_at=5,categorical_feature=rest)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54305
[LightGBM] [Info] Number of data points in the train set: 4461236, number of used features: 100




[1]	training's ndcg@5: 0.299461	valid_1's ndcg@5: 0.292636
[2]	training's ndcg@5: 0.339943	valid_1's ndcg@5: 0.325278
[3]	training's ndcg@5: 0.352922	valid_1's ndcg@5: 0.333621
[4]	training's ndcg@5: 0.36205	valid_1's ndcg@5: 0.340558
[5]	training's ndcg@5: 0.367696	valid_1's ndcg@5: 0.346121
[6]	training's ndcg@5: 0.372135	valid_1's ndcg@5: 0.348248
[7]	training's ndcg@5: 0.37652	valid_1's ndcg@5: 0.351114
[8]	training's ndcg@5: 0.379403	valid_1's ndcg@5: 0.352944
[9]	training's ndcg@5: 0.382513	valid_1's ndcg@5: 0.354109
[10]	training's ndcg@5: 0.384195	valid_1's ndcg@5: 0.356146
[11]	training's ndcg@5: 0.387085	valid_1's ndcg@5: 0.357555
[12]	training's ndcg@5: 0.389873	valid_1's ndcg@5: 0.359794
[13]	training's ndcg@5: 0.392433	valid_1's ndcg@5: 0.360516
[14]	training's ndcg@5: 0.393826	valid_1's ndcg@5: 0.360487
[15]	training's ndcg@5: 0.397919	valid_1's ndcg@5: 0.363979
[16]	training's ndcg@5: 0.39977	valid_1's ndcg@5: 0.365685
[17]	training's ndcg@5: 0.400871	valid_1's ndcg@5: 0

LGBMRanker(metric='ndcg', objective='lambdarank', verbose=1)

In [15]:
y_pred6 = model6.predict(X_test)
y_pred6

array([ 0.46175809, -0.7501605 , -0.54465082, ..., -0.26479785,
        0.05585781,  0.35997866])

In [20]:
df = pd.concat([X_test["srch_id"], y_test_], axis=1)
df['predictions'] = y_pred6
df['predictions_n'] = (df['predictions']-df['predictions'].min())/(df['predictions'].max()-df['predictions'].min())
df.head(30)

Unnamed: 0,srch_id,target_score,predictions,predictions_n
119,12,0,0.461758,0.506286
120,12,0,-0.750161,0.3269
121,12,0,-0.544651,0.357319
122,12,0,-0.536805,0.35848
123,12,0,0.192074,0.466368
124,12,0,-0.639114,0.343336
125,12,0,-0.658146,0.340519
126,12,0,-0.320207,0.390541
127,12,0,-0.696013,0.334914
128,12,0,-0.215528,0.406035


In [23]:
scores = []
for i in df['srch_id'].unique():
#     #t1
#     a1 = [df[df["srch_id"]==i]["target_score"].values]
#     a2 = [df[df["srch_id"]==i]["predictions"].values]
#     scores.append(ndcg_score(a1, a2, k=5))

    #t2 - better so far 0,5
    a1 = df[df["srch_id"]==i]["target_score"].values
    a2 = df[df["srch_id"]==i]["predictions_n"].values
    scores.append(ndcg5(a1, a2))
print(sum(scores)/len(scores)) 

0.7919465816684291


# Submission

- by target_score

In [24]:
r = pd.DataFrame(data = -y_pred6, columns=['target_score'])
r.head(20)

Unnamed: 0,target_score
0,-0.461758
1,0.750161
2,0.544651
3,0.536805
4,-0.192074
5,0.639114
6,0.658146
7,0.320207
8,0.696013
9,0.215528


In [25]:
r["srch_id"] = test['srch_id']
r["prop_id"] = test['prop_id']

In [30]:
result = r.sort_values(['srch_id','target_score'])[["srch_id","prop_id"]]

In [31]:
result.head(10)

Unnamed: 0,srch_id,prop_id
22,1,95031
13,1,63894
0,1,3180
14,1,72090
11,1,61632
26,1,128871
4,1,24194
9,1,54937
17,1,78599
23,1,99484


In [32]:
result.to_csv("sub9.csv", index=False)