In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import ndcg_score

In [103]:
def _dsg_numerator(rel, use_2pow):
    if use_2pow:
        return 2**rel-1
    else:
        return rel


def calc_ndcg(rel_true, rel_est, n=5, use_2pow=True):
    assert len(rel_est) == len(rel_true)
    rel_true = np.asarray(rel_true)
    rel_est = np.asarray(rel_est)

    discount = 1 / np.log(np.arange(2, len(rel_true) + 2))
    discount[n:] = 0

    dsg_N = discount.dot(_dsg_numerator(rel_est, use_2pow))
    idsg_N = discount.dot(_dsg_numerator(rel_true, use_2pow))

    return dsg_N/idsg_N


def ndcg_dmt(predicted_scores, n=5, use_2pow=True):
    ranked_scores = np.sort(predicted_scores)[::-1] # sorted descending, highest score first
    return calc_ndcg(ranked_scores, predicted_scores, n=n, use_2pow=use_2pow)


def ndcg_sklearn(predicted_scores, n=5):
    ranked_scores = np.sort(predicted_scores)[::-1] # sorted descending, highest score first
    return ndcg_score(np.asarray([ranked_scores]), np.asarray([predicted_scores]), k=n)

In [3]:
def print_missing(df, col):
    miss_cnt = df[col].isna().sum()
    if miss_cnt == 0:
        return
    total_cnt = len(df[col])
    perc_miss = 100 * miss_cnt / total_cnt
    print(f'{col} missing count {miss_cnt} out of {total_cnt} => {round(perc_miss)}% missing')

In [77]:
df = pd.read_csv("data/joined_all_features.csv.zip", index_col=0)
df.head()

Unnamed: 0,srch_id,prop_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,...,prop_location_score1_d_srch_destination_id,prop_location_score1_d_srch_destination_id__prop_starrating,prop_location_score1_d_srch_id,prop_location_score2_d_srch_destination_id,prop_location_score2_d_srch_destination_id__prop_starrating,prop_location_score2_d_srch_id,price_hist_logdiff_d_srch_id,price_usd_ld_srch_id__prop_starrating,booking_prob_per_prop_id_d_srch_id,click_prob_per_prop_id_d_srch_id
0,427,1,5,219,,,219,2,,1,...,1.2,1.04,0.56,,,,0.21886,0.125626,-0.015505,-0.011613
1,5762,1,5,219,3.5,147.02,219,2,,1,...,1.12,0.96,0.48,,,,,0.129212,-0.017857,-0.014857
2,8178,1,5,219,,,219,2,,1,...,0.85,0.69,0.21,,,,0.283944,0.198177,-0.015625,-0.010634
3,8465,1,5,219,,,219,2,,1,...,1.28,1.12,0.64,,,,0.269329,0.044171,-0.015625,-0.010634
4,10771,1,5,219,,,219,2,,1,...,1.28,1.12,0.64,,,,0.247003,0.450021,-0.016393,-0.014857


In [78]:
df_devel = df.loc[df.is_test==False, [c for c in df.columns if c != 'is_test']]
df_test = df.loc[df.is_test==True, [c for c in df.columns if c != 'is_test']]
df_test.srch_id -= df_devel.srch_id.max()
del df
import gc
gc.collect()

2509

In [79]:
for col in df_devel.columns:
    print_missing(df_devel, col)

visitor_hist_starrating missing count 4706481 out of 4958347 => 95% missing
visitor_hist_adr_usd missing count 4705359 out of 4958347 => 95% missing
prop_review_score missing count 240658 out of 4958347 => 5% missing
prop_location_score2 missing count 1090348 out of 4958347 => 22% missing
prop_log_historical_price missing count 713899 out of 4958347 => 14% missing
price_usd missing count 31 out of 4958347 => 0% missing
srch_query_affinity_score missing count 4640941 out of 4958347 => 94% missing
prop_starrating_w0 missing count 169572 out of 4958347 => 3% missing
price_hist_logdiff missing count 713927 out of 4958347 => 14% missing
price_usd_ld_srch_id missing count 31 out of 4958347 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 169572 out of 4958347 => 3% missing
prop_starrating_w0_d_srch_id missing count 169572 out of 4958347 => 3% missing
prop_review_score_d_srch_destination_id missing count 240658 out of 4958347 => 5% missing
prop_review_score_d_srch_destinat

In [80]:
# split srch_id into train and val
all_srch_ids = df_devel.srch_id.unique()

# randomly shuffle all_srch_ids using random seed 42
RANDOM_SEED = 123
rng = np.random.default_rng(RANDOM_SEED)
all_srch_ids.sort()
rng.shuffle(all_srch_ids)

VALIDATION_PROP = 0.1
val_start_idx = int(len(all_srch_ids)*(1-VALIDATION_PROP))
train_ids = all_srch_ids[:val_start_idx]
val_ids = all_srch_ids[val_start_idx:]

In [81]:
df_devel['target_cls'] = df_devel.booking_bool + df_devel.click_bool
df_devel['relevance_score'] = df_devel.booking_bool * 4 + df_devel.click_bool

columns_to_remove = [
    'visitor_location_country_id', # todo: visitor_location_country_id: combine with prop_country_id and keep the N most clicked/booked combinations,
    'visitor_hist_starrating', # todo: too few values, visitor_hist_starrating and visitor_hist_adr_usd standardize
    'visitor_hist_adr_usd', # todo:
    'prop_country_id', # todo:
    'prop_starrating', # normalized
    'prop_review_score', # normalized
    'prop_location_score1', # normalized
    'prop_location_score2', # normalized
    'prop_log_historical_price', # normalized
    'position', # todo: maybe mean/stdev_position_per_prop
    'price_usd', # normalized
    'srch_destination_id',
    'srch_query_affinity_score', # todo: normalize
    'prop_starrating_w0', # normalized
]
predictor_cols = [c for c in df_devel.columns if c not in ('position', 'booking_bool', 'click_bool', 'target_cls', 'relevance_score', 'prop_id', 'is_test') and c not in columns_to_remove]

def get_groups(df):
    grp = df.groupby('srch_id').srch_id.count()
    return grp.values

df_devel = df_devel.set_index('srch_id').sort_index().reset_index()

df_data_train = df_devel.loc[df_devel.srch_id.isin(train_ids), predictor_cols]
groups_train = get_groups(df_data_train)
# #df_data_train = df_data_train.dropna() # let's see what happens if we drop all NAs
df_train = df_devel.loc[df_data_train.index]
df_tg_train = df_devel['target_cls'][df_data_train.index]
df_tg_train = df_tg_train.astype(int)
df_data_train.drop(columns=['srch_id'], inplace=True)

df_data_val = df_devel.loc[df_devel.srch_id.isin(val_ids), predictor_cols]
groups_val = get_groups(df_data_val)
#df_data_val = df_data_val.dropna() # let's see what happens if we drop all NAs
df_val = df_devel.loc[df_data_val.index]
df_tg_val = df_devel['target_cls'][df_data_val.index]
df_tg_val = df_tg_val.astype(int)
df_data_val.drop(columns=['srch_id'], inplace=True)

df_test = df_test.set_index('srch_id').sort_index().reset_index()
df_data_test = df_test[predictor_cols]
groups_test = get_groups(df_data_test)
df_data_test.drop(columns=['srch_id'], inplace=True)
#df_data_test = df_data_test.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_test.drop(columns=['srch_id'], inplace=True)


In [82]:
for col in df_data_train.columns:
    print_missing(df_data_train, col)

price_hist_logdiff missing count 641509 out of 4461904 => 14% missing
price_usd_ld_srch_id missing count 30 out of 4461904 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 152763 out of 4461904 => 3% missing
prop_starrating_w0_d_srch_id missing count 152763 out of 4461904 => 3% missing
prop_review_score_d_srch_destination_id missing count 216622 out of 4461904 => 5% missing
prop_review_score_d_srch_destination_id__prop_starrating missing count 216622 out of 4461904 => 5% missing
prop_review_score_d_srch_id missing count 216622 out of 4461904 => 5% missing
prop_location_score2_d_srch_destination_id missing count 981192 out of 4461904 => 22% missing
prop_location_score2_d_srch_destination_id__prop_starrating missing count 981192 out of 4461904 => 22% missing
prop_location_score2_d_srch_id missing count 981192 out of 4461904 => 22% missing
price_hist_logdiff_d_srch_id missing count 641509 out of 4461904 => 14% missing
price_usd_ld_srch_id__prop_starrating missing count

In [83]:
df_data_train

Unnamed: 0,site_id,prop_brand_bool,promotion_flag,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,...,prop_location_score1_d_srch_destination_id,prop_location_score1_d_srch_destination_id__prop_starrating,prop_location_score1_d_srch_id,prop_location_score2_d_srch_destination_id,prop_location_score2_d_srch_destination_id__prop_starrating,prop_location_score2_d_srch_id,price_hist_logdiff_d_srch_id,price_usd_ld_srch_id__prop_starrating,booking_prob_per_prop_id_d_srch_id,click_prob_per_prop_id_d_srch_id
0,12,1,0,1,0,4,0,1,1,1,...,-0.20,-0.10,-0.10,,,,0.079837,0.317613,-0.002904,0.000321
1,12,0,0,1,0,4,0,1,1,1,...,-1.38,-1.28,-0.69,,,,0.219071,-0.204773,0.001445,-0.007234
2,12,1,0,1,0,4,0,1,1,1,...,0.94,0.60,0.47,0.21395,0.16435,0.10015,-0.013285,0.246623,0.003988,0.018636
3,12,1,0,1,0,4,0,1,1,1,...,-1.38,-1.28,-0.69,-0.03605,-0.03215,-0.02485,-0.106156,0.000000,-0.012760,-0.013064
4,12,1,0,1,0,4,0,1,1,1,...,0.94,0.60,0.47,0.20675,0.15715,0.09655,-0.321949,-0.282042,0.045751,0.063314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,5,1,0,1,21,3,0,1,0,0,...,-0.35,-0.51,-0.17,0.00995,0.00245,-0.00955,,0.126294,-0.019264,-0.022139
4958343,5,1,0,1,21,3,0,1,0,0,...,-1.37,-1.53,-0.68,,,,,-0.533517,0.021516,0.080698
4958344,5,1,0,1,21,3,0,1,0,0,...,-1.37,-1.53,-0.68,-0.05145,-0.05895,-0.04025,,-0.049271,-0.061817,-0.062713
4958345,5,1,0,1,21,3,0,1,0,0,...,0.83,0.67,0.42,,,,,0.046957,-0.026103,-0.050255


In [84]:
from lightgbm import early_stopping, log_evaluation, LGBMRanker

# Train the classifier with the best hyperparameters
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=1000,#2000,
    learning_rate=0.1,
    label_gain=[0, 1, 5],
    random_state=42,
    boosting='gbdt',
)

x_val = df_data_val
Y_val = df_tg_val

x_train = df_data_train
Y_train = df_tg_train

early_stopping_callback = early_stopping(stopping_rounds=150, first_metric_only=True)
log_evaluation_callback = log_evaluation(period=20)

ranker.fit(
    x_train,
    Y_train,
    eval_set=[(x_train, Y_train), (x_val, Y_val)],
    eval_group=[groups_train, groups_val],
    group=groups_train,
    eval_at=5,
    callbacks=[early_stopping_callback, log_evaluation_callback]
)


Training until validation scores don't improve for 150 rounds
[20]	training's ndcg@5: 0.45664	valid_1's ndcg@5: 0.451594
[40]	training's ndcg@5: 0.465216	valid_1's ndcg@5: 0.459242
[60]	training's ndcg@5: 0.470067	valid_1's ndcg@5: 0.46364
[80]	training's ndcg@5: 0.474142	valid_1's ndcg@5: 0.466325
[100]	training's ndcg@5: 0.477759	valid_1's ndcg@5: 0.468456
[120]	training's ndcg@5: 0.480206	valid_1's ndcg@5: 0.470185
[140]	training's ndcg@5: 0.482449	valid_1's ndcg@5: 0.470564
[160]	training's ndcg@5: 0.484623	valid_1's ndcg@5: 0.471188
[180]	training's ndcg@5: 0.486513	valid_1's ndcg@5: 0.471636
[200]	training's ndcg@5: 0.488213	valid_1's ndcg@5: 0.471644
[220]	training's ndcg@5: 0.489721	valid_1's ndcg@5: 0.471763
[240]	training's ndcg@5: 0.491385	valid_1's ndcg@5: 0.472442
[260]	training's ndcg@5: 0.492747	valid_1's ndcg@5: 0.472264
[280]	training's ndcg@5: 0.494199	valid_1's ndcg@5: 0.472022
[300]	training's ndcg@5: 0.495593	valid_1's ndcg@5: 0.472293
[320]	training's ndcg@5: 0.49

In [67]:
x_val.shape[0] == Y_val.shape[0]

True

In [115]:
def get_prediction_df(preds, df):
    df_prediction = df[["srch_id", "prop_id", 'relevance_score']].assign(predicted = preds)
    return df_prediction.sort_values("predicted", ascending=False).sort_values("srch_id", kind='stable')

In [116]:
df_pred_val = get_prediction_df(ranker.predict(df_data_val), df_val)
df_pred_val.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=False)).mean()

0.4727977490902627

In [120]:
df_pred_val

Unnamed: 0,srch_id,prop_id,relevance_score,predicted
264,28,75704,0.0,1.751911
272,28,18016,0.0,1.424686
241,28,5739,0.0,0.568670
270,28,72909,0.0,0.442064
252,28,35767,0.0,0.324610
...,...,...,...,...
4958133,332761,106737,0.0,-2.436698
4958138,332761,25234,0.0,-7.423028
4958120,332761,12711,0.0,-7.452619
4958122,332761,34080,0.0,-7.550448


In [117]:
df_pred_train = get_prediction_df(ranker.predict(df_data_train), df_train)
df_pred_train.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=False)).mean()

0.49913780026328347

In [118]:
preds_test = ranker.predict(df_data_test)

df_prediction = df_test[["srch_id", "prop_id"]].assign(predicted = preds_test)
df_prediction = df_prediction.sort_values("predicted", ascending=False).sort_values("srch_id", kind='stable')
df_prediction[["srch_id", "prop_id"]].to_csv("data/pred_robert_LGBMRanker.csv", index=None)

In [119]:
df_prediction

Unnamed: 0,srch_id,prop_id,predicted
17,1,28181,0.769946
15,1,99484,0.750483
12,1,54937,0.462135
3,1,34263,-0.096857
23,1,50162,-0.110247
...,...,...,...
4959179,332787,94437,-0.692529
4959181,332787,22854,-8.533068
4959180,332787,35240,-8.829210
4959178,332787,29018,-8.934683
