In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import ndcg_score

In [2]:
def _dsg_numerator(rel, use_2pow):
    if use_2pow:
        return 2**rel-1
    else:
        return rel


def calc_ndcg(rel_true, rel_est, n=5, use_2pow=True):
    assert len(rel_est) == len(rel_true)
    rel_true = np.asarray(rel_true)
    rel_est = np.asarray(rel_est)

    discount = 1 / np.log(np.arange(2, len(rel_true) + 2))
    discount[n:] = 0

    dsg_N = discount.dot(_dsg_numerator(rel_est, use_2pow))
    idsg_N = discount.dot(_dsg_numerator(rel_true, use_2pow))
    EPS = 1e-6

    return dsg_N/(idsg_N+EPS)


def ndcg_dmt(predicted_scores, n=5, use_2pow=True):
    ranked_scores = np.sort(predicted_scores)[::-1] # sorted descending, highest score first
    return calc_ndcg(ranked_scores, predicted_scores, n=n, use_2pow=use_2pow)


def ndcg_sklearn(predicted_scores, n=5):
    ranked_scores = np.sort(predicted_scores)[::-1] # sorted descending, highest score first
    return ndcg_score(np.asarray([ranked_scores]), np.asarray([predicted_scores]), k=n)

In [3]:
def print_missing(df, col):
    miss_cnt = df[col].isna().sum()
    if miss_cnt == 0:
        return
    total_cnt = len(df[col])
    perc_miss = 100 * miss_cnt / total_cnt
    print(f'{col} missing count {miss_cnt} out of {total_cnt} => {round(perc_miss)}% missing')

In [83]:
df_raw = pd.read_csv("data/joined_all_features.csv.zip", index_col=0)
#df = pd.read_csv("data/joined_all_features_revealed.csv.zip", index_col=0)
#df.drop(columns=['srch_id_match'], inplace=True)

In [84]:
df_raw['target_cls'] = df_raw.booking_bool + df_raw.click_bool
df_raw['relevance_score'] = df_raw.booking_bool * 4 + df_raw.click_bool
# a property id can appear in multiple srch_destination_id, to make the combination unique we need a new combined id
df_raw['prop_srch_dest_id']  = df_raw.prop_id * 100000 + df_raw.srch_destination_id
df_raw.head()

Unnamed: 0,srch_id,prop_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,...,prop_location_score2_d_srch_id,price_usd_ld_srch_id__prop_starrating,booking_prob_per_prop_id_d_srch_id,click_prob_per_prop_id_d_srch_id,price_hist_logdiff_d_srch_id,visitor_hist_adr_usd_logdiff_d_srch_id,srch_query_affinity_score_d_srch_id,target_cls,relevance_score,prop_srch_dest_id
0,427,1,5,219,,,219,2,,1,...,,0.125626,-0.015505,-0.011613,0.21886,,,0.0,0.0,106475
1,5762,1,5,219,3.5,147.02,219,2,,1,...,,0.129212,-0.017857,-0.014857,,-0.11441,,0.0,0.0,106475
2,8178,1,5,219,,,219,2,,1,...,,0.198177,-0.015625,-0.010634,0.283944,,,0.0,0.0,106475
3,8465,1,5,219,,,219,2,,1,...,,0.044171,-0.015625,-0.010634,0.269329,,,0.0,0.0,106475
4,10771,1,5,219,,,219,2,,1,...,,0.450021,-0.016393,-0.014857,0.247003,,,0.0,0.0,106475


In [85]:
TOP_NUM=5
# is the position in top
#df['top_bool'] = df['position']<=TOP_NUM

In [138]:
# number of search results - per search id
df_raw = df_raw.merge(df_raw.groupby('srch_id').agg(prop_count = ('srch_id', 'count')), on='srch_id')

In [139]:
# top percentage: out of all the times the property appeared, how many times was it in top
df = df_raw.merge(
    df.query("is_test==0").groupby('prop_srch_dest_id').agg(
        top_prob_srch_prop_id = ('position', lambda x: (x <= TOP_NUM).mean()),
        avg_position_srch_prop_id = ('position', 'mean'),
    ), on='prop_srch_dest_id'
)

In [140]:
# also add average position per property id
df = df.merge(
    df.query("is_test==0").groupby('prop_id').agg(
        top_prob_prop_id = ('position', lambda x: (x <= TOP_NUM).mean()),
        avg_position_prop_id = ('position', 'mean')
    ), on='prop_id'
)

In [141]:
df = df.merge(
    df.groupby('prop_srch_dest_id').agg(
        log_appearance_count_srch_prop_id=('prop_id', lambda x: np.log(len(x))),
        avg_res_len_srch_prop_id=('prop_count', 'mean'),
    ),
    on='prop_srch_dest_id')

df = df.merge(
    df.groupby('prop_id').agg(
        log_appearance_count_prop_id=('prop_id', lambda x: np.log(len(x))),
        avg_res_len_prop_id=('prop_count', 'mean'),
    ),
    on='prop_id')

In [142]:
# Aggregate competitor information - perhaps not

# add flag to indicate if any competitor has availability at a better rate
for i in range(1, 9):
    df[f'comp{i}_known'] = ~(df[f'comp{i}_rate'].isna() | df[f'comp{i}_inv'].isna())
    df[f'comp{i}_better'] = df[f'comp{i}_known'] & (df[f'comp{i}_rate']==-1) & (df[f'comp{i}_inv']<=0)
    df[f'comp{i}_worse'] = df[f'comp{i}_known'] & (df[f'comp{i}_rate']==1) & (df[f'comp{i}_inv']>=0)

df['comp_known_cnt'] = sum([df[f'comp{i}_known'].astype(int) for i in range(1, 9)])
df['comp_better_worse'] = \
    (sum([df[f'comp{i}_better'].astype(int) for i in range(1, 9)])
     -sum([df[f'comp{i}_worse'].astype(int) for i in range(1, 9)]))

comp_rate_cols = [f'comp{i}_rate' for i in range(1, 9)]
df['comp_rate_sum'] = df[comp_rate_cols].fillna(0).sum(axis=1)

comps = [f'comp{i}_' for i in range(1, 9)]
#df.drop(columns=[c for c in df.columns if c[:6] in comps and c not in ('comp2_rate_percent_diff', 'comp5_rate_percent_diff', 'comp8_rate_percent_diff')], inplace=True)
df.drop(columns=[c for c in df.columns if c[:6] in comps], inplace=True)

In [143]:
df

Unnamed: 0,srch_id,prop_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,...,avg_position_srch_prop_id,top_prob_prop_id,avg_position_prop_id,log_appearance_count_srch_prop_id,avg_res_len_srch_prop_id,log_appearance_count_prop_id,avg_res_len_prop_id,comp_known_cnt,comp_better_worse,comp_rate_sum
0,427,1,5,219,,,219,2,,1,...,29.365385,0.0,28.672131,4.60517,32.16,4.795791,31.735537,3,0,0.0
1,5762,1,5,219,3.50,147.02,219,2,,1,...,29.365385,0.0,28.672131,4.60517,32.16,4.795791,31.735537,0,0,0.0
2,8178,1,5,219,,,219,2,,1,...,29.365385,0.0,28.672131,4.60517,32.16,4.795791,31.735537,0,0,0.0
3,8465,1,5,219,,,219,2,,1,...,29.365385,0.0,28.672131,4.60517,32.16,4.795791,31.735537,3,0,0.0
4,10771,1,5,219,,,219,2,,1,...,29.365385,0.0,28.672131,4.60517,32.16,4.795791,31.735537,4,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9651972,110771,104543,32,220,3.85,138.19,220,4,,0,...,6.000000,0.0,6.000000,0.00000,5.00,0.000000,5.000000,0,0,0.0
9651973,110771,106966,32,220,3.85,138.19,220,4,2.5,0,...,3.000000,1.0,3.000000,0.00000,5.00,0.000000,5.000000,0,0,0.0
9651974,110771,115296,32,220,3.85,138.19,220,4,3.5,1,...,2.000000,1.0,2.000000,0.00000,5.00,0.000000,5.000000,0,0,0.0
9651975,110771,131902,32,220,3.85,138.19,220,4,4.5,0,...,1.000000,1.0,1.000000,0.00000,5.00,0.000000,5.000000,0,0,0.0


In [144]:
df_devel = df.loc[df.is_test==False, [c for c in df.columns if c != 'is_test']]
df_test = df.loc[df.is_test==True, [c for c in df.columns if c != 'is_test']]
df_test.srch_id -= df_devel.srch_id.max()
# del df
# import gc
# gc.collect()

In [145]:
for col in df_devel.columns:
    print_missing(df_devel, col)

visitor_hist_starrating missing count 4706481 out of 4958347 => 95% missing
visitor_hist_adr_usd missing count 4705398 out of 4958347 => 95% missing
prop_review_score missing count 240658 out of 4958347 => 5% missing
prop_location_score2 missing count 1090348 out of 4958347 => 22% missing
prop_log_historical_price missing count 713899 out of 4958347 => 14% missing
price_usd missing count 31 out of 4958347 => 0% missing
srch_query_affinity_score missing count 4640941 out of 4958347 => 94% missing
prop_starrating_w0 missing count 169572 out of 4958347 => 3% missing
hist_starrating_diff missing count 4706481 out of 4958347 => 95% missing
price_hist_logdiff missing count 713927 out of 4958347 => 14% missing
visitor_hist_adr_usd_logdiff missing count 4705398 out of 4958347 => 95% missing
price_usd_ld_srch_id missing count 31 out of 4958347 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 169572 out of 4958347 => 3% missing
prop_starrating_w0_d_srch_id missing count 16957

In [146]:
# split srch_id into train and val
all_srch_ids = df_devel.srch_id.unique()

# randomly shuffle all_srch_ids using random seed 42
RANDOM_SEED = 124
rng = np.random.default_rng(RANDOM_SEED)
all_srch_ids.sort()
rng.shuffle(all_srch_ids)

VALIDATION_PROP = 0.1
val_start_idx = int(len(all_srch_ids)*(1-VALIDATION_PROP))
train_ids = all_srch_ids[:val_start_idx]
val_ids = all_srch_ids[val_start_idx:]

In [147]:
columns_to_remove = [
    #'prop_id',
    #'srch_destination_id',
    #'prop_srch_dest_id',

    'visitor_location_country_id', # todo: visitor_location_country_id: combine with prop_country_id and keep the N most clicked/booked combinations,
    'visitor_hist_starrating', # normalized
    'visitor_hist_adr_usd', # normalized
    'prop_country_id', # todo:
    #'prop_starrating', # normalized
    #'prop_review_score', # normalized
    #'prop_location_score1', # normalized
    #'prop_location_score2', # normalized
    'prop_log_historical_price', # normalized
    'position', # todo: maybe mean/stdev_position_per_prop
    'price_usd', # normalized
    #'srch_query_affinity_score', # normalized
    'prop_starrating_w0', # normalized

    'prop_location_score1_d_srch_id',
    'prop_location_score1_d_srch_destination_id__prop_starrating',
    'prop_location_score2_d_srch_id',
    'prop_location_score2_d_srch_destination_id__prop_starrating',

    'booking_prob_per_prop_id_d_srch_id',
    'click_prob_per_prop_id_d_srch_id',
    'booking_prob_per_prop_id',
    'click_prob_per_prop_id',

    # 'site_id',
    # 'prop_starrating',
    # 'prop_review_score',
    # 'prop_brand_bool',
    # 'prop_location_score2',
    # 'promotion_flag',
    # 'srch_length_of_stay',
    # 'srch_booking_window',
    # 'srch_adults_count',
    # 'srch_children_count',
    #'srch_room_count',
    'srch_saturday_night_bool',
    # 'srch_query_affinity_score',
    # 'random_bool',
    # 'comp_known_cnt',
    # 'comp_better_worse',
    'comp_rate_sum',
    # 'hist_starrating_diff',
    # 'price_hist_logdiff',
    # 'visitor_hist_adr_usd_logdiff',
    # 'same_country',
    # 'booking_prob_per_prop_id',
    # 'click_prob_per_prop_id',
    # 'price_usd_ld_srch_id',
    #'prop_starrating_w0_d_srch_destination_id',
    # 'prop_starrating_w0_d_srch_id',
    #'prop_review_score_d_srch_destination_id',
    #'prop_review_score_d_srch_destination_id__prop_starrating',
    # 'prop_review_score_d_srch_id',
    #'prop_location_score1_d_srch_destination_id',
    #'prop_location_score2_d_srch_destination_id',
    # 'price_usd_ld_srch_id__prop_starrating',
    # 'booking_prob_per_prop_id_d_srch_id',
    # 'click_prob_per_prop_id_d_srch_id',
    # 'price_hist_logdiff_d_srch_id',
    # 'visitor_hist_adr_usd_logdiff_d_srch_id',

    #'midstay_week',
    'midstay_month',
    'midstay_dayofyear',
    #'midstay_dayofweek',
    'booking_week',
    'booking_month',
    'booking_dayofyear',
    #'booking_dayofweek' ,
    
    'prop_count',
    'top_prob_srch_prop_id',
    'avg_res_len_srch_prop_id',
    'avg_position_srch_prop_id',
    'log_appearance_count_srch_prop_id',
    #'top_prob_prop_id',
    #'avg_res_len_prop_id'
    #'avg_position_prop_id',
    #'log_appearance_count_prop_id',
]
predictor_cols = [c for c in df_devel.columns if ((c not in ('position', 'booking_bool', 'click_bool', 'target_cls', 'relevance_score', 'is_test')) and (c not in columns_to_remove))]

def get_groups(df):
    grp = df.groupby('srch_id').srch_id.count()
    return grp.values

df_devel = df_devel.set_index('srch_id').sort_index().reset_index()

df_data_train = df_devel.loc[df_devel.srch_id.isin(train_ids), predictor_cols]
groups_train = get_groups(df_data_train)
# #df_data_train = df_data_train.dropna() # let's see what happens if we drop all NAs
df_train = df_devel.loc[df_data_train.index]
df_tg_train = df_devel['target_cls'][df_data_train.index]
df_tg_train = df_tg_train.astype(int)
df_data_train.drop(columns=['srch_id'], inplace=True)

df_data_val = df_devel.loc[df_devel.srch_id.isin(val_ids), predictor_cols]
groups_val = get_groups(df_data_val)
#df_data_val = df_data_val.dropna() # let's see what happens if we drop all NAs
df_val = df_devel.loc[df_data_val.index]
df_tg_val = df_devel['target_cls'][df_data_val.index]
df_tg_val = df_tg_val.astype(int)
df_data_val.drop(columns=['srch_id'], inplace=True)

df_test = df_test.set_index('srch_id').sort_index().reset_index()
df_data_test = df_test[predictor_cols]
groups_test = get_groups(df_data_test)
# df_tg_test = df_test['target_cls']
# df_tg_test = df_tg_test.astype(int)
df_data_test.drop(columns=['srch_id'], inplace=True)

df_data_devel = df_devel[predictor_cols]
groups_devel = get_groups(df_data_devel)
df_tg_devel = df_devel['target_cls']
df_tg_devel = df_tg_devel.astype(int)
df_data_devel.drop(columns=['srch_id'], inplace=True)

#df_data_test = df_data_test.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_test.drop(columns=['srch_id'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_devel.drop(columns=['srch_id'], inplace=True)


In [148]:
# def set_prop_agg_to_NA(df, ids):
#     cols = [c for c in df.columns if '_prop_id' in c]
#     print(cols)
#     df.loc[df.prop_id.isin(ids), cols] = pd.NA
#
# def set_srch_dest_agg_to_NA(df, ids):
#     cols = [c for c in df.columns if '_srch_destination_id' in c]
#     #cols = 'avg_position_srch_prop_id'
#     print(cols)
#     df.loc[df.srch_destination_id.isin(ids), cols] = pd.NA
#
# def set_prop_srch_dest_id_agg_to_NA(df, ids):
#     cols = [c for c in df.columns if '_srch_prop_id' in c]
#     #cols = 'avg_position_srch_prop_id'
#     print(cols)
#     df.loc[df.prop_srch_dest_id.isin(ids), cols] = pd.NA

def set_avg_position_srch_prop_id_to_NA(df, ids):
    cols = [c for c in df.columns if c in ['avg_position_srch_prop_id', 'top_prob_srch_prop_id']]
    print(cols)
    df.loc[df.prop_srch_dest_id.isin(ids), cols] = pd.NA

def set_avg_position_prop_id_to_NA(df, ids):
    cols = [c for c in df.columns if c in ['avg_position_prop_id', 'top_prob_prop_id']]
    print(cols)
    df.loc[df.prop_id.isin(ids), cols] = pd.NA

def set_probs_to_NA(df, ids, negate=False):
    cols = [c for c in df.columns if '_prob_' in c or 'click' in c]
    print(cols)
    if len(cols)==0:
        return
    sel_idx = ~df.prop_srch_dest_id.isin(ids) if negate else df.prop_srch_dest_id.isin(ids)
    df.loc[sel_idx, cols] = pd.NA

# So first we need to remove the click/book probs for all ids in val only
def get_ids_in_one_but_not_the_other(df_in_this, df_not_in_this, col):
    in_this = set(df_in_this[col].unique())
    not_in_this = set(df_not_in_this[col].unique())
    missing = in_this - not_in_this
    print(len(in_this), len(not_in_this), len(missing))
    return missing

# set_prop_srch_dest_id_agg_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_val, df_train, 'prop_srch_dest_id'))
# set_prop_srch_dest_id_agg_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_test, df_train, 'prop_srch_dest_id'))
#
# set_srch_dest_agg_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_val, df_train, 'srch_destination_id'))
# set_srch_dest_agg_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_test, df_train, 'srch_destinationf_id'))
#
# set_prop_agg_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_data_val, df_data_train, 'prop_id'))
# set_prop_agg_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_data_test, df_data_train, 'prop_id'))

set_avg_position_srch_prop_id_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_data_test, df_data_train, 'prop_srch_dest_id'))
set_avg_position_srch_prop_id_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_data_val, df_data_train, 'prop_srch_dest_id'))

set_avg_position_prop_id_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_data_test, df_data_train, 'prop_id'))
set_avg_position_prop_id_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_data_val, df_data_train, 'prop_id'))

# set_probs_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_data_test, df_data_train, 'prop_id'))
# set_probs_to_NA(df_data_val, get_ids_in_one_but_not_the_other(df_data_val, df_data_train, 'prop_id'))


409514 583459 11338
[]
193941 583459 26916
[]
114620 127783 567
['top_prob_prop_id', 'avg_position_prop_id']
83083 127783 1330
['top_prob_prop_id', 'avg_position_prop_id']


In [149]:
df_data_val.drop(columns=['prop_srch_dest_id', 'srch_destination_id', 'prop_id'], inplace=True)
df_data_train.drop(columns=['prop_srch_dest_id', 'srch_destination_id', 'prop_id'], inplace=True)
df_data_test.drop(columns=['prop_srch_dest_id', 'srch_destination_id', 'prop_id'], inplace=True)
df_data_devel.drop(columns=['prop_srch_dest_id', 'srch_destination_id', 'prop_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_test.drop(columns=['prop_srch_dest_id', 'srch_destination_id', 'prop_id'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_devel.drop(columns=['prop_srch_dest_id', 'srch_destination_id', 'prop_id'], inplace=True)


In [150]:
for col in df_data_train.columns:
    print_missing(df_data_val, col)

prop_review_score missing count 24166 out of 496238 => 5% missing
prop_location_score2 missing count 108599 out of 496238 => 22% missing
srch_query_affinity_score missing count 464464 out of 496238 => 94% missing
hist_starrating_diff missing count 471743 out of 496238 => 95% missing
price_hist_logdiff missing count 70493 out of 496238 => 14% missing
visitor_hist_adr_usd_logdiff missing count 471669 out of 496238 => 95% missing
price_usd_ld_srch_id missing count 9 out of 496238 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 16769 out of 496238 => 3% missing
prop_starrating_w0_d_srch_id missing count 16769 out of 496238 => 3% missing
prop_review_score_d_srch_destination_id missing count 24166 out of 496238 => 5% missing
prop_review_score_d_srch_destination_id__prop_starrating missing count 24166 out of 496238 => 5% missing
prop_review_score_d_srch_id missing count 24166 out of 496238 => 5% missing
prop_location_score2_d_srch_destination_id missing count 108599 out o

In [151]:
# define a custom objective (to be exactly the same as the one used in the competition)

# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def custom_ndcg(preds, train_data, *largs, **kwargs):
    print('not sure how to do this')
    return None


In [152]:
def monotone_constraints_from_tuples(tpls):
    pass

test = [
#('promotion_flag', 1),
('random_bool', -1),
#('comp_better_cnt', -1),
#('price_hist_logdiff', -1),
#('price_usd_ld_srch_id', -1),
#('prop_starrating_w0_d_srch_id', 1),
#('prop_location_score1_d_srch_id', 1),
#('click_prob_per_prop_id_d_srch_id', 1),
]

d = {col:i for i, col in enumerate(df_data_train.columns)}

mon = np.zeros(len(df_data_train.columns))
for t in test:
    mon[d[t[0]]]=t[1]

mon

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
       -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [153]:
def get_categorical_column(df):
    categorical_features = [
        "day",
        "month",
        "prop_country_id",
        "site_id",
        "visitor_location_country_id",
        'srch_saturday_night_bool',
        'prop_brand_bool',
        'same_country',
        'random_bool',
        'promotion_flag',
        #'midstay_week',
        'midstay_month',
        'midstay_dayofyear',
        'midstay_dayofweek',
        'booking_week',
        'booking_month',
        'booking_dayofyear',
        'booking_dayofweek' ,
#        'prop_starrating',
    ]
    categorical_features = [c for c in categorical_features if c in df.columns.values]
    categorical_features_numbers = [df.columns.get_loc(x) for x in categorical_features]
    return categorical_features_numbers

In [154]:
from lightgbm import early_stopping, log_evaluation, LGBMRanker

# Train the classifier with the best hyperparameters
ranker = LGBMRanker(
    #objective="rank_xendcg",
    objective='lambdarank',
    #lambdarank_truncation_level=16,
    metric="ndcg",
    subsample_for_bin=200000*2,
    n_estimators=11500, # this is the max, early stopping will likely result in fewer
    # feature_fraction=0.8,
    learning_rate=0.1,
    #label_gain=[0, 1, 31],
    label_gain=[0, 1, 5],
    random_state=42,
    boosting='goss',
    # top_rate=0.13, # goss
    # other_rate=0.10, # goss
    # --
    # boosting='dart',
    #boosting='gbdt',
    # bagging_fraction=0.9,
    # bagging_freq=5,
    # bagging_seed=123,
    # --
    #monotone_constraints=mon,
    #monotone_constraints_method='advanced',
    #extra_trees=True,
    lambda_l2=1e-3,
    #lambda_l1=1e-2,
    #min_data_in_leaf=40,
    max_depth=5,
    max_position=5,
    deterministic=True,
    num_threads=24,
    lambdarank_truncation_level=5+3,
)

x_val = df_data_val
Y_val = df_tg_val
g_val = groups_val
# x_val = df_data_test
# Y_val = df_tg_test
# g_val = groups_test

x_train = df_data_train
Y_train = df_tg_train
g_train = groups_train
# x_train = df_data_devel
# Y_train = df_tg_devel
# g_train = groups_devel


early_stopping_callback = early_stopping(stopping_rounds=150, first_metric_only=True)
log_evaluation_callback = log_evaluation(period=20)

ranker.fit(
    x_train,
    Y_train,
    eval_set=[(x_train, Y_train), (x_val, Y_val)],
    eval_group=[g_train, g_val],
    group=g_train,
    eval_at=5,
    callbacks=[early_stopping_callback, log_evaluation_callback],
    categorical_feature=get_categorical_column(x_train),
)


New categorical_feature is [0, 3, 6, 13, 14, 16, 20]






Training until validation scores don't improve for 150 rounds
[20]	training's ndcg@5: 0.38574	valid_1's ndcg@5: 0.378665
[40]	training's ndcg@5: 0.392909	valid_1's ndcg@5: 0.383958
[60]	training's ndcg@5: 0.398335	valid_1's ndcg@5: 0.38896
[80]	training's ndcg@5: 0.402165	valid_1's ndcg@5: 0.391674
[100]	training's ndcg@5: 0.405071	valid_1's ndcg@5: 0.393445
[120]	training's ndcg@5: 0.408158	valid_1's ndcg@5: 0.394243
[140]	training's ndcg@5: 0.410555	valid_1's ndcg@5: 0.396091
[160]	training's ndcg@5: 0.412653	valid_1's ndcg@5: 0.39705
[180]	training's ndcg@5: 0.414569	valid_1's ndcg@5: 0.397865
[200]	training's ndcg@5: 0.41655	valid_1's ndcg@5: 0.398616
[220]	training's ndcg@5: 0.418534	valid_1's ndcg@5: 0.399243
[240]	training's ndcg@5: 0.420147	valid_1's ndcg@5: 0.399076
[260]	training's ndcg@5: 0.421765	valid_1's ndcg@5: 0.399733
[280]	training's ndcg@5: 0.423553	valid_1's ndcg@5: 0.400537
[300]	training's ndcg@5: 0.425003	valid_1's ndcg@5: 0.400772
[320]	training's ndcg@5: 0.4266

In [155]:
{f:i for (f, i) in zip(ranker.feature_name_, ranker.feature_importances_)}

{'site_id': 1944,
 'prop_starrating': 261,
 'prop_review_score': 341,
 'prop_brand_bool': 124,
 'prop_location_score1': 995,
 'prop_location_score2': 1370,
 'promotion_flag': 105,
 'srch_length_of_stay': 312,
 'srch_booking_window': 563,
 'srch_adults_count': 156,
 'srch_children_count': 146,
 'srch_room_count': 57,
 'srch_query_affinity_score': 296,
 'random_bool': 356,
 'booking_dayofweek': 216,
 'midstay_week': 354,
 'midstay_dayofweek': 265,
 'hist_starrating_diff': 249,
 'price_hist_logdiff': 1210,
 'visitor_hist_adr_usd_logdiff': 360,
 'same_country': 83,
 'price_usd_ld_srch_id': 1212,
 'prop_starrating_w0_d_srch_destination_id': 348,
 'prop_starrating_w0_d_srch_id': 206,
 'prop_review_score_d_srch_destination_id': 415,
 'prop_review_score_d_srch_destination_id__prop_starrating': 473,
 'prop_review_score_d_srch_id': 181,
 'prop_location_score1_d_srch_destination_id': 1011,
 'prop_location_score2_d_srch_destination_id': 1249,
 'price_usd_ld_srch_id__prop_starrating': 1011,
 'price

In [156]:
def get_prediction_df(preds, df):
    df_prediction = df[["srch_id", "prop_id", 'relevance_score']].assign(predicted = preds)
    return df_prediction.sort_values("predicted", ascending=False).sort_values("srch_id", kind='stable')

In [157]:
df_pred_val = get_prediction_df(ranker.predict(df_data_val), df_val)

In [158]:
df_pred_val.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=False)).mean()

0.4052536054473705

In [159]:
df_pred_val.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_sklearn(x)).mean()

0.30209086095081694

In [111]:
ref_df=pd.read_csv('data/test_revealed.csv')

In [160]:
ref_df['relevance_score'] = ref_df.booking_bool * 4.0 + ref_df.click_bool
#ref_df
df_pred_test = get_prediction_df(
    ranker.predict(df_data_test), 
    df_test[["srch_id", "prop_id"]].set_index(['srch_id', 'prop_id']).join(ref_df.set_index(['srch_id', 'prop_id'])[['relevance_score']], how='left').reset_index()
)

In [161]:
df_pred_test.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=False)).mean()

0.39860938178185

In [None]:
df_pred_test.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_sklearn(x)).mean()

In [None]:
# 0.47409365112587953 - 2000 and [0, 1, 31]

In [None]:
preds_test = ranker.predict(df_data_test)

df_prediction = df_test[["srch_id", "prop_id"]].assign(predicted = preds_test)
df_prediction = df_prediction.sort_values("predicted", ascending=False).sort_values("srch_id", kind='stable')
#df_prediction[["srch_id", "prop_id"]].to_csv("data/pred_robert_LGBMRanker_1150.csv.zip", index=None)

In [None]:
df_prediction

TODO: ok so, there is a problem with a big gap in the performance on the validation set and test set. I think this is due to several factors:
* bagging in combination with aggregating over properties and destination. For the validation set to be meaningful we need to ensure that a similar proportion of destination ids are missing from the validation set as the proportion of missing destination ids from the test set
* statistics should only be calculated on the training set (do not include the validation set)
* the score is more in line with sklearn.metrics.ndgc score, coincidence?

Extra features:
* calculate z score or quantile for booking day-of-year relative to property stddev or empirical distribution
* add df (number of observations)
* determine price-usd type depending on the visitor country id and/or site id
** figure out the conditions of when it is total amount of booking (e.g. when there is no correlation between room number and gross booking)
** check if outliers are correlated (maybe there is some currency conversion going on for some countries)
* add all comps