In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [51]:
def _dsg_numerator(rel, use_2pow):
    """Return the DCG gain for the relevance values ``rel``.

    With a truthy ``use_2pow`` the exponential gain ``2**rel - 1`` is used;
    otherwise ``rel`` is returned unchanged (linear gain).
    """
    if use_2pow:
        return 2**rel-1
    else:
        return rel


def calc_ndcg(rel_true, rel_est, n=5, use_2pow=True):
    """Compute NDCG@n for one ranked list.

    Parameters
    ----------
    rel_true : sequence of numbers
        Relevance values in the ideal order (the normaliser is computed
        from this order as given; it is not re-sorted here).
    rel_est : sequence of numbers
        Relevance values in the order produced by the ranking under test.
        Must have the same length as ``rel_true``.
    n : int, default 5
        Only the first ``n`` positions contribute to the score.
    use_2pow : bool, default True
        Use the exponential gain ``2**rel - 1`` instead of the raw relevance.

    Returns
    -------
    float
        DCG of ``rel_est`` divided by DCG of ``rel_true``. Returns ``0.0``
        when the normaliser is zero (for example an empty list, or no
        relevant item in the first ``n`` positions) instead of dividing
        zero by zero.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(rel_est) == len(rel_true)
    rel_true = np.asarray(rel_true)
    rel_est = np.asarray(rel_est)

    # Position discount 1/log(rank + 1). The natural log is used; the base
    # cancels out in the final ratio.
    discount = 1 / np.log(np.arange(2, len(rel_true) + 2))
    # Positions past the cut-off contribute nothing.
    discount[n:] = 0

    dsg_N = discount.dot(_dsg_numerator(rel_est, use_2pow))
    idsg_N = discount.dot(_dsg_numerator(rel_true, use_2pow))

    # Guard against 0/0, which would yield NaN and a RuntimeWarning.
    if idsg_N == 0:
        return 0.0

    return dsg_N/idsg_N


def ndcg_dmt(predicted_scores, n=5):
    """NDCG@n of a relevance list against its own best possible ordering.

    ``predicted_scores`` is forwarded to ``calc_ndcg`` as ``rel_est``, so
    despite its name it holds relevance values listed in the order the
    ranking produced. The ideal ordering is the same values sorted from
    highest to lowest.
    """
    ascending = np.sort(predicted_scores)
    ideal_order = ascending[::-1]  # highest relevance first
    return calc_ndcg(ideal_order, predicted_scores, n=n)

from sklearn.metrics import ndcg_score

def ndcg_sklearn(predicted_scores, n=5):
    """NDCG@n of a relevance list in its given order, computed by scikit-learn.

    ``predicted_scores`` holds relevance values listed in the order the
    ranking produced (same convention as ``ndcg_dmt``).

    ``ndcg_score`` ranks items by ``y_score`` and accumulates the matching
    ``y_true`` gains. The relevance values are therefore passed as ``y_true``
    and a strictly decreasing positional score as ``y_score``, so that
    scikit-learn evaluates exactly the order given. Passing the sorted
    relevances as ``y_true`` and the raw relevances as ``y_score`` would
    instead re-rank by relevance value and score a different ordering.
    """
    relevance = np.asarray(predicted_scores)
    # Position 0 gets the highest score, the last position the lowest.
    position_score = np.arange(len(relevance), 0, -1)
    return ndcg_score(np.asarray([relevance]), np.asarray([position_score]), k=n)

In [None]:
def print_missing(df, col):
    """Print a one-line missing-value summary for column ``col`` of ``df``.

    Nothing is printed when the column has no missing values. The
    percentage is rounded to a whole number. Always returns ``None``.
    """
    column = df[col]
    n_missing = column.isna().sum()
    if n_missing != 0:
        n_total = len(column)
        pct_missing = 100 * n_missing / n_total
        print(f'{col} missing count {n_missing} out of {n_total} => {round(pct_missing)}% missing')

In [136]:
# Load the pre-joined feature table; the first CSV column becomes the row index.
df = pd.read_csv("data/joined_all_features.csv.zip", index_col=0)
# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,srch_id,prop_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,...,prop_location_score1_d_srch_destination_id,prop_location_score1_d_srch_destination_id__prop_starrating,prop_location_score1_d_srch_id,prop_location_score2_d_srch_destination_id,prop_location_score2_d_srch_destination_id__prop_starrating,prop_location_score2_d_srch_id,price_hist_logdiff_d_srch_id,price_usd_ld_srch_id__prop_starrating,booking_prob_per_prop_id_d_srch_id,click_prob_per_prop_id_d_srch_id
0,427,1,5,219,,,219,2,,1,...,1.2,1.04,0.56,,,,0.21886,0.125626,-0.015505,-0.011613
1,5762,1,5,219,3.5,147.02,219,2,,1,...,1.12,0.96,0.48,,,,,0.129212,-0.017857,-0.014857
2,8178,1,5,219,,,219,2,,1,...,0.85,0.69,0.21,,,,0.283944,0.198177,-0.015625,-0.010634
3,8465,1,5,219,,,219,2,,1,...,1.28,1.12,0.64,,,,0.269329,0.044171,-0.015625,-0.010634
4,10771,1,5,219,,,219,2,,1,...,1.28,1.12,0.64,,,,0.247003,0.450021,-0.016393,-0.014857


In [138]:
# Split the joined frame into development rows (is_test False) and test rows
# (is_test True); the is_test column itself is dropped from both.
df_devel = df.loc[df.is_test==False, [c for c in df.columns if c != 'is_test']]
df_test = df.loc[df.is_test==True, [c for c in df.columns if c != 'is_test']]
# Shift the test search ids down by the largest development search id.
# NOTE(review): presumably this undoes an offset added when the two sets were
# joined into one file — confirm against the join step.
df_test.srch_id -= df_devel.srch_id.max()
# Drop the combined frame and force a collection so its memory can be released.
del df
import gc
gc.collect()

511

In [139]:
# Report every development-set column that contains missing values
# (print_missing stays silent for complete columns).
for col in df_devel.columns:
    print_missing(df_devel, col)

visitor_hist_starrating missing count 4706481 out of 4958347 => 95% missing
visitor_hist_adr_usd missing count 4705359 out of 4958347 => 95% missing
prop_review_score missing count 240658 out of 4958347 => 5% missing
prop_location_score2 missing count 1090348 out of 4958347 => 22% missing
prop_log_historical_price missing count 713899 out of 4958347 => 14% missing
price_usd missing count 31 out of 4958347 => 0% missing
srch_query_affinity_score missing count 4640941 out of 4958347 => 94% missing
prop_starrating_w0 missing count 169572 out of 4958347 => 3% missing
price_hist_logdiff missing count 713927 out of 4958347 => 14% missing
price_usd_ld_srch_id missing count 31 out of 4958347 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 169572 out of 4958347 => 3% missing
prop_starrating_w0_d_srch_id missing count 169572 out of 4958347 => 3% missing
prop_review_score_d_srch_destination_id missing count 240658 out of 4958347 => 5% missing
prop_review_score_d_srch_destinat

In [140]:
# split srch_id into train and val, so all rows of one search land in the same split
all_srch_ids = df_devel.srch_id.unique()

# randomly shuffle all_srch_ids with a generator seeded by RANDOM_SEED;
# sorting first fixes the input order, so the shuffle is reproducible
RANDOM_SEED = 123
rng = np.random.default_rng(RANDOM_SEED)
all_srch_ids.sort()
rng.shuffle(all_srch_ids)

# the last VALIDATION_PROP share of the shuffled ids is held out for validation
VALIDATION_PROP = 0.1
val_start_idx = int(len(all_srch_ids)*(1-VALIDATION_PROP))
train_ids = all_srch_ids[:val_start_idx]
val_ids = all_srch_ids[val_start_idx:]

In [154]:
# Targets derived from the two interaction flags (assuming both are 0/1):
#   target_cls      = booking_bool + click_bool      -> class label for the classifier
#   relevance_score = 4 * booking_bool + click_bool  -> graded relevance used for NDCG
df_devel['target_cls'] = df_devel.booking_bool + df_devel.click_bool
df_devel['relevance_score'] = df_devel.booking_bool * 4 + df_devel.click_bool

# Raw columns excluded from the predictors; the trailing notes record why.
columns_to_remove = [
    'visitor_location_country_id', # todo: visitor_location_country_id: combine with prop_country_id and keep the N most clicked/booked combinations,
    'visitor_hist_starrating', # todo: too few values, visitor_hist_starrating and visitor_hist_adr_usd standardize
    'visitor_hist_adr_usd', # todo:
    'prop_country_id', # todo:
    'prop_starrating', # normalized
    'prop_review_score', # normalized
    'prop_location_score1', # normalized
    'prop_location_score2', # normalized
    'prop_log_historical_price', # normalized
    'position', # todo: maybe mean/stdev_position_per_prop
    'price_usd', # normalized
    'srch_destination_id',
    'srch_query_affinity_score', # todo: normalize
    'prop_starrating_w0', # normalized
]
# Predictors = every column that is neither an identifier, a target/label
# column, nor listed in columns_to_remove.
predictor_cols = [c for c in df_devel.columns if c not in ('srch_id', 'position', 'booking_bool', 'click_bool', 'target_cls', 'relevance_score', 'prop_id', 'is_test') and c not in columns_to_remove]

# --- training split -------------------------------------------------------
df_data_train = df_devel.loc[df_devel.srch_id.isin(train_ids), predictor_cols]
#df_data_train = df_data_train.dropna() # let's see what happens if we drop all NAs
# Full rows and targets are re-selected through the predictor frame's index so
# they stay aligned with it even if rows are dropped above.
df_train = df_devel.loc[df_data_train.index]
df_tg_train = df_devel.loc[df_data_train.index, 'target_cls'].astype(int)

# --- validation split -----------------------------------------------------
df_data_val = df_devel.loc[df_devel.srch_id.isin(val_ids), predictor_cols]
#df_data_val = df_data_val.dropna() # let's see what happens if we drop all NAs
df_val = df_devel.loc[df_data_val.index]
# Validation targets must follow the validation index (the training index was
# used here before, which made df_tg_val a copy of the training targets).
df_tg_val = df_devel.loc[df_data_val.index, 'target_cls'].astype(int)

# --- test set -------------------------------------------------------------
df_data_test = df_test[predictor_cols]
#df_data_test = df_data_test.dropna()
df_test_filt = df_test.loc[df_data_test.index]

In [155]:
# Report the predictor columns of the training split that still contain
# missing values (print_missing stays silent for complete columns).
for col in df_data_train.columns:
    print_missing(df_data_train, col)

price_hist_logdiff missing count 641509 out of 4461904 => 14% missing
price_usd_ld_srch_id missing count 30 out of 4461904 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 152763 out of 4461904 => 3% missing
prop_starrating_w0_d_srch_id missing count 152763 out of 4461904 => 3% missing
prop_review_score_d_srch_destination_id missing count 216622 out of 4461904 => 5% missing
prop_review_score_d_srch_destination_id__prop_starrating missing count 216622 out of 4461904 => 5% missing
prop_review_score_d_srch_id missing count 216622 out of 4461904 => 5% missing
prop_location_score2_d_srch_destination_id missing count 981192 out of 4461904 => 22% missing
prop_location_score2_d_srch_destination_id__prop_starrating missing count 981192 out of 4461904 => 22% missing
prop_location_score2_d_srch_id missing count 981192 out of 4461904 => 22% missing
price_hist_logdiff_d_srch_id missing count 641509 out of 4461904 => 14% missing
price_usd_ld_srch_id__prop_starrating missing count

In [156]:
# Display the training predictor frame to eyeball the selected columns.
df_data_train

Unnamed: 0,site_id,prop_brand_bool,promotion_flag,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,...,prop_location_score1_d_srch_destination_id,prop_location_score1_d_srch_destination_id__prop_starrating,prop_location_score1_d_srch_id,prop_location_score2_d_srch_destination_id,prop_location_score2_d_srch_destination_id__prop_starrating,prop_location_score2_d_srch_id,price_hist_logdiff_d_srch_id,price_usd_ld_srch_id__prop_starrating,booking_prob_per_prop_id_d_srch_id,click_prob_per_prop_id_d_srch_id
0,5,1,0,1,10,2,0,1,1,1,...,1.20,1.040,0.56,,,,0.218860,0.125626,-0.015505,-0.011613
1,5,1,0,1,72,2,0,1,1,0,...,1.12,0.960,0.48,,,,,0.129212,-0.017857,-0.014857
2,5,1,0,1,0,2,0,1,1,0,...,0.85,0.690,0.21,,,,0.283944,0.198177,-0.015625,-0.010634
3,5,1,0,2,4,2,1,1,1,0,...,1.28,1.120,0.64,,,,0.269329,0.044171,-0.015625,-0.010634
4,5,1,0,1,4,1,0,1,0,0,...,1.28,1.120,0.64,,,,0.247003,0.450021,-0.016393,-0.014857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9917522,16,0,0,1,1,1,2,1,1,0,...,-2.20,-2.200,-1.10,-0.0315,-0.04955,-0.0192,-0.031993,-0.054630,0.142857,0.142857
9917523,16,0,0,1,0,1,0,1,1,0,...,-2.20,-2.200,-1.10,-0.0246,-0.04265,-0.0123,0.228085,-0.279281,0.142857,0.142857
9917524,16,0,0,1,17,2,0,1,1,1,...,-0.82,-0.820,-0.41,,,,-0.048378,-0.315089,0.142857,0.142857
9917525,16,0,0,7,2,1,0,1,0,1,...,-0.82,-1.075,-0.41,,,,-0.247723,-0.312395,0.142857,0.142857


In [178]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Train the classifier with the best hyperparameters
# max_iter is set to a very large cap (100 * 1000) together with
# n_iter_no_change=100.
# NOTE(review): presumably training is meant to end through early stopping
# long before the cap — confirm early stopping is active in the installed
# scikit-learn version.
# The five columns named in categorical_features are passed as categorical;
# all other predictors are left at the estimator's default handling.
clf = HistGradientBoostingClassifier(random_state=123,
                                     min_samples_leaf=20,
                                     max_iter=100 * 1000,
                                     max_leaf_nodes = 31,
                                     max_bins=63,
                                     n_iter_no_change=100,
                                     max_depth=40,
                                     categorical_features=('site_id', 'promotion_flag', 'prop_brand_bool', 'srch_saturday_night_bool', 'random_bool'))

# Disabled experiment kept for reference. NOTE(review): it refers to `df`,
# `df_data` and `df_tg`, none of which is defined at this point in the visible
# cells (`df` is deleted after the devel/test split), so it would need
# updating before being re-enabled.
# # downsample the training data for experimentation
# rng = np.random.default_rng(RANDOM_SEED)
# downsample_idx = np.arange(len(df))
# rng.shuffle(downsample_idx)
# downsample_idx = downsample_idx[:len(df)//10]
# len(downsample_idx), len(df)
#x = df_data.iloc[downsample_idx]
#Y = df_tg.iloc[downsample_idx]
# Fit on the full training split: predictors and the integer target_cls labels.
x = df_data_train
Y = df_tg_train

clf.fit(x, Y)

In [179]:
def get_avg_ndgc(df, df_data):
    """Return the mean per-search NDCG of the global ``clf`` on one split.

    ``df`` supplies ``srch_id``, ``prop_id``, ``relevance_score`` and
    ``target_cls``; ``df_data`` is the matching predictor frame handed to
    ``clf.predict_proba``. Rows are ordered by descending score inside each
    search, and ``ndcg_dmt`` is applied to the ``relevance_score`` values of
    every search in that order.

    NOTE(review): ``[:, 1:2]`` selects only column 1 of the probability
    matrix, which makes the following ``.sum(axis=1)`` a no-op. A wider
    slice may have been intended — confirm.
    """
    labelled = df[['srch_id', 'prop_id', 'relevance_score', 'target_cls']]
    class_proba = clf.predict_proba(df_data)
    scored = labelled.assign(score=class_proba[:, 1:2].sum(axis=1))

    # Best score first, then a stable sort on srch_id keeps that order
    # within each search.
    by_score = scored.sort_values(by='score', ascending=False)
    by_search = by_score.sort_values(by='srch_id', kind='stable')
    ranked = by_search[['srch_id', 'target_cls', 'score', 'relevance_score']]

    # ndcg_sklearn can be swapped in here for a cross-check.
    per_search_ndcg = ranked.groupby('srch_id')['relevance_score'].apply(ndcg_dmt)
    return per_search_ndcg.mean()

In [180]:
# Mean per-search NDCG on the held-out validation searches.
get_avg_ndgc(df_val, df_data_val)

0.35782641980240215

In [None]:
# Same metric on the training searches, for comparison with the validation value.
get_avg_ndgc(df_train, df_data_train)

In [160]:
def get_prediction(df, df_data):
    """Score every row with the global ``clf`` and return them ranked per search.

    ``df`` supplies ``srch_id`` and ``prop_id``; ``df_data`` is the matching
    predictor frame handed to ``clf.predict_proba``. The result has the
    columns ``srch_id``, ``prop_id`` and ``score``, grouped by ``srch_id``
    with the highest score first inside each search.

    NOTE(review): ``[:, 1:2]`` selects only column 1 of the probability
    matrix, which makes the following ``.sum(axis=1)`` a no-op. A wider
    slice may have been intended — confirm.
    """
    id_cols = df[['srch_id', 'prop_id']]
    class_proba = clf.predict_proba(df_data)
    scored = id_cols.assign(score=class_proba[:, 1:2].sum(axis=1))

    # Best score first, then a stable sort on srch_id keeps that order
    # within each search.
    by_score = scored.sort_values(by='score', ascending=False)
    by_search = by_score.sort_values(by='srch_id', kind='stable')
    return by_search[['srch_id', 'prop_id', 'score']]

In [161]:
# Rank the properties of every test search with the fitted classifier.
pred_test = get_prediction(df_test_filt, df_data_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred['score'] = clf.predict_proba(df_data)[:, 1:2].sum(axis=1)


In [162]:
# Write the ranked (srch_id, prop_id) pairs as the submission file. The row
# index is not part of the file, so it is simply omitted with index=False
# instead of being reset and dropped first.
pred_test[['srch_id', 'prop_id']].to_csv('data/submission_3.csv', index=False)

In [68]:
# Sanity check: NDCG@5 with linear gain when the two relevant items are swapped.
calc_ndcg([5, 1, 0, 0, 0], [1, 5, 0, 0, 0], n=5, use_2pow=False)

0.737826424707602

In [164]:
# Compare the two implementations on the same list. ndcg_dmt uses calc_ndcg's
# default exponential gain (use_2pow=True), so the two values are not expected
# to be equal; the second one matches calc_ndcg with use_2pow=False.
ndcg_dmt([1, 5, 0, 0, 0], n=5), ndcg_sklearn([1, 5, 0, 0, 0], n=5)



(0.6499594707105907, 0.737826424707602)