In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import ndcg_score

In [2]:
def _dsg_numerator(rel, use_2pow):
    if use_2pow:
        return 2**rel-1
    else:
        return rel


def calc_ndcg(rel_true, rel_est, n=5, use_2pow=True):
    assert len(rel_est) == len(rel_true)
    rel_true = np.asarray(rel_true)
    rel_est = np.asarray(rel_est)

    discount = 1 / np.log(np.arange(2, len(rel_true) + 2))
    discount[n:] = 0

    dsg_N = discount.dot(_dsg_numerator(rel_est, use_2pow))
    idsg_N = discount.dot(_dsg_numerator(rel_true, use_2pow))

    return dsg_N/idsg_N


def ndcg_dmt(predicted_scores, n=5, use_2pow=True):
    ranked_scores = np.sort(predicted_scores)[::-1] # sorted descending, highest score first
    return calc_ndcg(ranked_scores, predicted_scores, n=n, use_2pow=use_2pow)


def ndcg_sklearn(predicted_scores, n=5):
    ranked_scores = np.sort(predicted_scores)[::-1] # sorted descending, highest score first
    return ndcg_score(np.asarray([ranked_scores]), np.asarray([predicted_scores]), k=n)

In [3]:
def print_missing(df, col):
    miss_cnt = df[col].isna().sum()
    if miss_cnt == 0:
        return
    total_cnt = len(df[col])
    perc_miss = 100 * miss_cnt / total_cnt
    print(f'{col} missing count {miss_cnt} out of {total_cnt} => {round(perc_miss)}% missing')

In [4]:
df = pd.read_csv("data/joined_all_features.csv.zip", index_col=0)
df.head()

Unnamed: 0,srch_id,prop_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,...,prop_location_score1_d_srch_id,prop_location_score2_d_srch_destination_id,prop_location_score2_d_srch_destination_id__prop_starrating,prop_location_score2_d_srch_id,price_usd_ld_srch_id__prop_starrating,booking_prob_per_prop_id_d_srch_id,click_prob_per_prop_id_d_srch_id,price_hist_logdiff_d_srch_id,visitor_hist_adr_usd_logdiff_d_srch_id,srch_query_affinity_score_d_srch_id
0,427,1,5,219,,,219,2,,1,...,0.56,,,,0.125626,-0.015505,-0.011613,0.21886,,
1,5762,1,5,219,3.5,147.02,219,2,,1,...,0.48,,,,0.129212,-0.017857,-0.014857,,-0.11441,
2,8178,1,5,219,,,219,2,,1,...,0.21,,,,0.198177,-0.015625,-0.010634,0.283944,,
3,8465,1,5,219,,,219,2,,1,...,0.64,,,,0.044171,-0.015625,-0.010634,0.269329,,
4,10771,1,5,219,,,219,2,,1,...,0.64,,,,0.450021,-0.016393,-0.014857,0.247003,,


In [5]:
df_devel = df.loc[df.is_test==False, [c for c in df.columns if c != 'is_test']]
df_test = df.loc[df.is_test==True, [c for c in df.columns if c != 'is_test']]
df_test.srch_id -= df_devel.srch_id.max()
# del df
# import gc
# gc.collect()

In [6]:
for col in df_devel.columns:
    print_missing(df_devel, col)

visitor_hist_starrating missing count 4706481 out of 4958347 => 95% missing
visitor_hist_adr_usd missing count 4705398 out of 4958347 => 95% missing
prop_review_score missing count 240658 out of 4958347 => 5% missing
prop_location_score2 missing count 1090348 out of 4958347 => 22% missing
prop_log_historical_price missing count 713899 out of 4958347 => 14% missing
price_usd missing count 31 out of 4958347 => 0% missing
srch_query_affinity_score missing count 4640941 out of 4958347 => 94% missing
prop_starrating_w0 missing count 169572 out of 4958347 => 3% missing
comp_better_worse missing count 1716808 out of 4958347 => 35% missing
hist_starrating_diff missing count 4706481 out of 4958347 => 95% missing
price_hist_logdiff missing count 713927 out of 4958347 => 14% missing
visitor_hist_adr_usd_logdiff missing count 4705398 out of 4958347 => 95% missing
price_usd_ld_srch_id missing count 31 out of 4958347 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 169572 out of 

In [7]:
# split srch_id into train and val
all_srch_ids = df_devel.srch_id.unique()

# randomly shuffle all_srch_ids using random seed 42
RANDOM_SEED = 123
rng = np.random.default_rng(RANDOM_SEED)
all_srch_ids.sort()
rng.shuffle(all_srch_ids)

VALIDATION_PROP = 0.1
val_start_idx = int(len(all_srch_ids)*(1-VALIDATION_PROP))
train_ids = all_srch_ids[:val_start_idx]
val_ids = all_srch_ids[val_start_idx:]

In [81]:
df_devel['target_cls'] = df_devel.booking_bool + df_devel.click_bool
df_devel['relevance_score'] = df_devel.booking_bool * 4 + df_devel.click_bool

columns_to_remove = [
    'visitor_location_country_id', # todo: visitor_location_country_id: combine with prop_country_id and keep the N most clicked/booked combinations,
    'visitor_hist_starrating', # normalized
    'visitor_hist_adr_usd', # normalized
    'prop_country_id', # todo:
    #'prop_starrating', # normalized
    #'prop_review_score', # normalized
    #'prop_location_score1', # normalized
    #'prop_location_score2', # normalized
    'prop_log_historical_price', # normalized
    'position', # todo: maybe mean/stdev_position_per_prop
    'price_usd', # normalized
    'srch_destination_id',
    #'srch_query_affinity_score', # normalized
    'prop_starrating_w0', # normalized

    'prop_location_score1_d_srch_id',
    'prop_location_score1_d_srch_destination_id__prop_starrating',
    'prop_location_score2_d_srch_id',
    'prop_location_score2_d_srch_destination_id__prop_starrating',
    'booking_end_offset',

    #'booking_prob_per_prop_id_d_srch_id',
    #'click_prob_per_prop_id_d_srch_id',
    #'booking_prob_per_prop_id',
    #'click_prob_per_prop_id',
]
predictor_cols = [c for c in df_devel.columns if ((c not in ('position', 'booking_bool', 'click_bool', 'target_cls', 'relevance_score', 'prop_id', 'is_test')) and (c not in columns_to_remove))]

def get_groups(df):
    grp = df.groupby('srch_id').srch_id.count()
    return grp.values

df_devel = df_devel.set_index('srch_id').sort_index().reset_index()

df_data_train = df_devel.loc[df_devel.srch_id.isin(train_ids), predictor_cols]
groups_train = get_groups(df_data_train)
# #df_data_train = df_data_train.dropna() # let's see what happens if we drop all NAs
df_train = df_devel.loc[df_data_train.index]
df_tg_train = df_devel['target_cls'][df_data_train.index]
df_tg_train = df_tg_train.astype(int)
df_data_train.drop(columns=['srch_id'], inplace=True)

df_data_val = df_devel.loc[df_devel.srch_id.isin(val_ids), predictor_cols]
groups_val = get_groups(df_data_val)
#df_data_val = df_data_val.dropna() # let's see what happens if we drop all NAs
df_val = df_devel.loc[df_data_val.index]
df_tg_val = df_devel['target_cls'][df_data_val.index]
df_tg_val = df_tg_val.astype(int)
df_data_val.drop(columns=['srch_id'], inplace=True)

df_test = df_test.set_index('srch_id').sort_index().reset_index()
df_data_test = df_test[predictor_cols]
groups_test = get_groups(df_data_test)
df_data_test.drop(columns=['srch_id'], inplace=True)
#df_data_test = df_data_test.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_test.drop(columns=['srch_id'], inplace=True)


In [82]:
for col in df_data_train.columns:
    print_missing(df_data_train, col)

prop_review_score missing count 216622 out of 4461904 => 5% missing
prop_location_score2 missing count 981192 out of 4461904 => 22% missing
srch_query_affinity_score missing count 4175744 out of 4461904 => 94% missing
comp_better_worse missing count 1546315 out of 4461904 => 35% missing
hist_starrating_diff missing count 4234609 out of 4461904 => 95% missing
price_hist_logdiff missing count 641509 out of 4461904 => 14% missing
visitor_hist_adr_usd_logdiff missing count 4233562 out of 4461904 => 95% missing
price_usd_ld_srch_id missing count 30 out of 4461904 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 152763 out of 4461904 => 3% missing
prop_starrating_w0_d_srch_id missing count 152763 out of 4461904 => 3% missing
prop_review_score_d_srch_destination_id missing count 216622 out of 4461904 => 5% missing
prop_review_score_d_srch_destination_id__prop_starrating missing count 216622 out of 4461904 => 5% missing
prop_review_score_d_srch_id missing count 216622 out o

In [83]:
df_data_train

Unnamed: 0,site_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,promotion_flag,srch_length_of_stay,srch_booking_window,srch_adults_count,...,prop_review_score_d_srch_destination_id__prop_starrating,prop_review_score_d_srch_id,prop_location_score1_d_srch_destination_id,prop_location_score2_d_srch_destination_id,price_usd_ld_srch_id__prop_starrating,booking_prob_per_prop_id_d_srch_id,click_prob_per_prop_id_d_srch_id,price_hist_logdiff_d_srch_id,visitor_hist_adr_usd_logdiff_d_srch_id,srch_query_affinity_score_d_srch_id
0,12,3,,1,2.20,,0,1,0,4,...,,,-0.20,,0.317613,-0.002904,0.000321,0.079837,,
1,12,2,,0,1.61,,0,1,0,4,...,,,-1.38,,-0.204773,0.001445,-0.007234,0.219071,,
2,12,4,4.5,1,2.77,0.1302,0,1,0,4,...,1.25,0.75,0.94,0.21395,0.246623,0.003988,0.018636,-0.013285,,
3,12,2,3.5,1,1.61,0.0052,0,1,0,4,...,-0.25,-0.25,-1.38,-0.03605,0.000000,-0.012760,-0.013064,-0.106156,,
4,12,4,3.5,1,2.77,0.1266,0,1,0,4,...,-0.75,-0.25,0.94,0.20675,-0.282042,0.045751,0.063314,-0.321949,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,5,3,4.0,1,1.61,0.0471,0,1,21,3,...,0.25,0.25,-0.35,0.00995,0.126294,-0.019264,-0.022139,,,
4958343,5,3,2.5,1,1.10,,0,1,21,3,...,-2.75,-1.25,-1.37,,-0.533517,0.021516,0.080698,,,
4958344,5,3,3.5,1,1.10,0.0164,0,1,21,3,...,-0.75,-0.25,-1.37,-0.05145,-0.049271,-0.061817,-0.062713,,,
4958345,5,3,1.0,1,2.20,,0,1,21,3,...,-5.75,-2.75,0.83,,0.046957,-0.026103,-0.050255,,,


In [84]:
# define a custom objective (to be exactly the same as the one used in the competition)

# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def custom_ndcg(preds, train_data, *largs, **kwargs):
    print('not sure how to do this')
    return None


In [85]:
def monotone_constraints_from_tuples(tpls):
    pass

test = [
#('promotion_flag', 1),
('random_bool', -1),
#('comp_better_cnt', -1),
#('price_hist_logdiff', -1),
#('price_usd_ld_srch_id', -1),
#('prop_starrating_w0_d_srch_id', 1),
#('prop_location_score1_d_srch_id', 1),
#('click_prob_per_prop_id_d_srch_id', 1),
]

d = {col:i for i, col in enumerate(df_data_train.columns)}

mon = np.zeros(len(df_data_train.columns))
for t in test:
    mon[d[t[0]]]=t[1]

mon

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [86]:
from lightgbm import early_stopping, log_evaluation, LGBMRanker

# Train the classifier with the best hyperparameters
ranker = LGBMRanker(
    #objective="rank_xendcg",
    objective='lambdarank',
    #lambdarank_truncation_level=16,
    metric="ndcg",
    subsample_for_bin=200000*2,
    n_estimators=10000, # this is the max, early stopping will likely result in fewer
    # feature_fraction=0.8,
    learning_rate=0.1,
    label_gain=[0, 1, 31],
    random_state=42,
    boosting='goss',
    # top_rate=0.13, # goss
    # other_rate=0.10, # goss
    # --
    # boosting='dart',
    # boosting='gbdt',
    # bagging_fraction=0.9,
    # bagging_freq=5,
    # bagging_seed=123,
    # --
    #monotone_constraints=mon,
    #monotone_constraints_method='advanced'
    #extra_trees=True,
    lambda_l2=1e-2,
    #lambda_l1=1e-2,
    #min_data_in_leaf=40,
    max_depth=5,
    deterministic=True,
    num_threads=24
)

x_val = df_data_val
Y_val = df_tg_val

x_train = df_data_train
Y_train = df_tg_train

early_stopping_callback = early_stopping(stopping_rounds=300, first_metric_only=True)
log_evaluation_callback = log_evaluation(period=20)

ranker.fit(
    x_train,
    Y_train,
    eval_set=[(x_train, Y_train), (x_val, Y_val)],
    eval_group=[groups_train, groups_val],
    group=groups_train,
    eval_at=5,
    callbacks=[early_stopping_callback, log_evaluation_callback],
    categorical_feature=('site_id', 'srch_saturday_night_bool', 'prop_brand_bool', 'same_country'),
)


New categorical_feature is ['prop_brand_bool', 'same_country', 'site_id', 'srch_saturday_night_bool']


Training until validation scores don't improve for 300 rounds
[20]	training's ndcg@5: 0.452977	valid_1's ndcg@5: 0.44927
[40]	training's ndcg@5: 0.46428	valid_1's ndcg@5: 0.460044
[60]	training's ndcg@5: 0.471292	valid_1's ndcg@5: 0.466246
[80]	training's ndcg@5: 0.476263	valid_1's ndcg@5: 0.4705
[100]	training's ndcg@5: 0.479809	valid_1's ndcg@5: 0.471483
[120]	training's ndcg@5: 0.483104	valid_1's ndcg@5: 0.473529
[140]	training's ndcg@5: 0.486034	valid_1's ndcg@5: 0.475519
[160]	training's ndcg@5: 0.488262	valid_1's ndcg@5: 0.477597
[180]	training's ndcg@5: 0.489827	valid_1's ndcg@5: 0.478173
[200]	training's ndcg@5: 0.491677	valid_1's ndcg@5: 0.479018
[220]	training's ndcg@5: 0.493219	valid_1's ndcg@5: 0.479301
[240]	training's ndcg@5: 0.494628	valid_1's ndcg@5: 0.47966
[260]	training's ndcg@5: 0.49597	valid_1's ndcg@5: 0.479447
[280]	training's ndcg@5: 0.497507	valid_1's ndcg@5: 0.480513
[300]	training's ndcg@5: 0.499099	valid_1's ndcg@5: 0.480758
[320]	training's ndcg@5: 0.500453

In [87]:
{f:i for (f, i) in zip(ranker.feature_name_, ranker.feature_importances_)}

{'site_id': 2602,
 'prop_starrating': 218,
 'prop_review_score': 386,
 'prop_brand_bool': 154,
 'prop_location_score1': 1250,
 'prop_location_score2': 1813,
 'promotion_flag': 145,
 'srch_length_of_stay': 509,
 'srch_booking_window': 1050,
 'srch_adults_count': 248,
 'srch_children_count': 248,
 'srch_room_count': 115,
 'srch_saturday_night_bool': 107,
 'srch_query_affinity_score': 572,
 'random_bool': 502,
 'comp_known_cnt': 265,
 'comp_better_worse': 562,
 'booking_start': 693,
 'hist_starrating_diff': 541,
 'price_hist_logdiff': 1725,
 'visitor_hist_adr_usd_logdiff': 573,
 'same_country': 119,
 'booking_prob_per_prop_id': 1271,
 'click_prob_per_prop_id': 1334,
 'price_usd_ld_srch_id': 1676,
 'prop_starrating_w0_d_srch_destination_id': 494,
 'prop_starrating_w0_d_srch_id': 284,
 'prop_review_score_d_srch_destination_id': 528,
 'prop_review_score_d_srch_destination_id__prop_starrating': 600,
 'prop_review_score_d_srch_id': 280,
 'prop_location_score1_d_srch_destination_id': 1408,
 'pr

In [88]:
def get_prediction_df(preds, df):
    df_prediction = df[["srch_id", "prop_id", 'relevance_score']].assign(predicted = preds)
    return df_prediction.sort_values("predicted", ascending=False).sort_values("srch_id", kind='stable')

In [89]:
df_pred_val = get_prediction_df(ranker.predict(df_data_val), df_val)

In [90]:
df_pred_val.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=True)).mean()

0.4831237555028905

In [91]:
df_pred_val.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_sklearn(x)).mean()

0.35651243305766694

In [92]:
# 0.47409365112587953 - 2000 and [0, 1, 31]

In [93]:
df_pred_val

Unnamed: 0,srch_id,prop_id,relevance_score,predicted
264,28,75704,0.0,1.555946
272,28,18016,0.0,1.059619
241,28,5739,0.0,0.499059
270,28,72909,0.0,0.496768
252,28,35767,0.0,0.283327
...,...,...,...,...
4958133,332761,106737,0.0,-3.136569
4958120,332761,12711,0.0,-7.708453
4958138,332761,25234,0.0,-8.040435
4958122,332761,34080,0.0,-8.336368


In [94]:
df_pred_train = get_prediction_df(ranker.predict(df_data_train), df_train)
df_pred_train.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=False)).mean()

KeyboardInterrupt: 

In [95]:
preds_test = ranker.predict(df_data_test)

df_prediction = df_test[["srch_id", "prop_id"]].assign(predicted = preds_test)
df_prediction = df_prediction.sort_values("predicted", ascending=False).sort_values("srch_id", kind='stable')
df_prediction[["srch_id", "prop_id"]].to_csv("data/pred_robert_LGBMRanker_483.csv", index=None)

In [96]:
df_prediction

Unnamed: 0,srch_id,prop_id,predicted
17,1,28181,0.804102
15,1,99484,0.683826
12,1,54937,0.638853
23,1,50162,0.136617
18,1,24194,-0.071880
...,...,...,...
4959179,332787,94437,-1.505127
4959181,332787,22854,-8.586817
4959178,332787,29018,-9.359678
4959182,332787,32019,-9.672806


TODO: ok so, there is a problem with a big gap in the performance on the validation set and test set. I think this is due to several factors:
* bagging in combination with aggregating over properties and destination. For the validation set to be meaningful we need to ensure that a similar proportion of destination ids are missing from the validation set as the proportion of missing destination ids from the test set
* statistics should only be calculated on the training set (do not include the validation set)
* the score is more in line with sklearn.metrics.ndgc score, coincidence?

Extra features:
* calculate z score or quantile for booking day-of-year relative to property stddev or empirical distribution
* add df (number of observations)
* determine price-usd type depending on the visitor country id and/or site id
** figure out the conditions of when it is total amount of booking (e.g. when there is no correlation between room number and gross booking)
** check if outliers are correlated (maybe there is some currency conversion going on for some countries)
* add all comps