In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
def print_missing(df, col):
    miss_cnt = df[col].isna().sum()
    if miss_cnt == 0:
        return
    total_cnt = len(df[col])
    perc_miss = 100 * miss_cnt / total_cnt
    print(f'{col} missing count {miss_cnt} out of {total_cnt} => {round(perc_miss)}% missing')

In [4]:
df_raw = pd.read_csv("data/joined_all_features.csv.zip", index_col=0)

In [21]:
df_raw['target_cls'] = df_raw.booking_bool + df_raw.click_bool
df_raw['relevance_score'] = df_raw.booking_bool * 4 + df_raw.click_bool
# a property id can appear in multiple srch_destination_id, to make the combination unique we need a new combined id
df_raw['prop_srch_dest_id']  = df_raw.prop_id * 100000 + df_raw.srch_destination_id
df_raw.head()
df = df_raw

In [22]:
def add_rank(df, groupby, cols):
    return df.groupby(groupby)[cols].apply(lambda x: x.rank(method='min', na_option='keep', axis=0)).reset_index(groupby, drop=True).rename(columns=lambda x: f'{x}_rank')

df = df.join(add_rank(df, 'srch_id', ['price_usd']), how='left')

In [23]:
# add flag to indicate if any competitor has availability at a better rate
for i in range(1, 9):
    df[f'comp{i}_known'] = ~(df[f'comp{i}_rate'].isna() | df[f'comp{i}_inv'].isna())
    df[f'comp{i}_better'] = df[f'comp{i}_known'] & (df[f'comp{i}_rate'] == -1) & (df[f'comp{i}_inv'] <= 0)
    df[f'comp{i}_worse'] = df[f'comp{i}_known'] & (df[f'comp{i}_rate'] == 1) & (df[f'comp{i}_inv'] >= 0)

df['comp_known_cnt'] = sum([df[f'comp{i}_known'].astype(int) for i in range(1, 9)])
df['comp_better_worse'] = \
    (sum([df[f'comp{i}_better'].astype(int) for i in range(1, 9)])
     - sum([df[f'comp{i}_worse'].astype(int) for i in range(1, 9)]))

comp_rate_cols = [f'comp{i}_rate' for i in range(1, 9)]
df['comp_rate_sum'] = df[comp_rate_cols].fillna(0).sum(axis=1)

comps = [f'comp{i}_' for i in range(1, 9)]
#df.drop(columns=[c for c in df.columns if c[:6] in comps and c not in ('comp2_rate_percent_diff', 'comp5_rate_percent_diff', 'comp8_rate_percent_diff')], inplace=True)
df.drop(columns=[c for c in df.columns if c[:6] in comps], inplace=True)

KeyError: 'comp1_rate'

In [24]:
df_devel = df.loc[df.is_test==False, [c for c in df.columns if c != 'is_test']]
df_test = df.loc[df.is_test==True, [c for c in df.columns if c != 'is_test']]
df_test.srch_id -= df_devel.srch_id.max()
# del df
# import gc
# gc.collect()

In [26]:
for col in df_devel.columns:
    print_missing(df_devel, col)

visitor_hist_starrating missing count 4706481 out of 4958347 => 95% missing
visitor_hist_adr_usd missing count 4705398 out of 4958347 => 95% missing
prop_review_score missing count 240658 out of 4958347 => 5% missing
prop_location_score2 missing count 1090348 out of 4958347 => 22% missing
prop_log_historical_price missing count 713899 out of 4958347 => 14% missing
price_usd missing count 31 out of 4958347 => 0% missing
srch_query_affinity_score missing count 4640941 out of 4958347 => 94% missing
prop_starrating_w0 missing count 169572 out of 4958347 => 3% missing
hist_starrating_diff missing count 4706481 out of 4958347 => 95% missing
price_hist_logdiff missing count 713927 out of 4958347 => 14% missing
visitor_hist_adr_usd_logdiff missing count 4705398 out of 4958347 => 95% missing
price_usd_d_srch_id missing count 31 out of 4958347 => 0% missing
prop_starrating_w0_std_srch_destination_id missing count 230 out of 4958347 => 0% missing
prop_starrating_w0_med_srch_destination_id missing

In [29]:
def add_position_related_features(df):
    ret = {}
    tmp = df.query('random_bool==0')
    ret['prop_srch_dest_id'] = [
        tmp.groupby('prop_srch_dest_id').agg(
            #top_prob_srch_prop_id=('position', lambda x: (x <= TOP_NUM).mean()),
            tg_avg_position_srch_prop_id=('position', 'mean'),
            tg_std_position_srch_prop_id=('position', 'std'),
            #tg_click_prob_srch_prop_id=('click_bool', 'mean'),
            log_appearance_count_srch_prop_id=('position', lambda x: np.log(len(x))),
        ),
        df.groupby('prop_srch_dest_id').agg(
            avg_res_len_srch_prop_id=('srch_res_count', 'mean'),
        ),        
    ]
    # also add average position per property id
    ret['prop_id'] = [
        tmp.groupby('prop_id').agg(
            #top_prob_prop_id=('position', lambda x: (x <= TOP_NUM).mean()),
            tg_avg_position_prop_id=('position', 'mean'),
            tg_std_position_prop_id=('position', 'std'),
            #tg_click_prob_prop_id=('click_bool', 'mean'),
            log_appearance_count_prop_id=('position', lambda x: np.log(len(x))),
        ),
        df.groupby('prop_id').agg(
            avg_res_len_prop_id=('srch_res_count', 'mean'),
        )
    ]
    return ret

In [30]:
df_train = df_devel

for key, tbls in add_position_related_features(df_train).items():
    if type(key)==tuple:
        key = list(key)
    for tbl in tbls:
        df_train = df_train.set_index(key).join(tbl, how='left').reset_index()
        df_val = df_val.set_index(key).join(tbl, how='left').reset_index()
        df_test = df_test.set_index(key).join(tbl, how='left').reset_index()

In [34]:
def set_avg_position_srch_prop_id_to_NA(df, ids):
    #cols = [c for c in df.columns if (c.startswith('tg_') and c.endswith('_srch_prop_id'))]
    cols = [c for c in df.columns if c in ['tg_avg_position_srch_prop_id', 'tg_std_position_srch_prop_id', 'tg_click_prob_srch_prop_id', 'log_appearance_count_srch_prop_id',]]#'avg_res_len_srch_prop_id']]
    print(cols)
    df.loc[df.prop_srch_dest_id.isin(ids), cols] = pd.NA

def set_avg_position_prop_id_to_NA(df, ids):
    #cols = [c for c in df.columns if c.startswith('tg_') and c.endswith('_prop_id') and not c.endswith('_srch_prop_id') or (c.startswith('exp_') and c.endswith('__prop_id'))]
    cols = [c for c in df.columns if c in ['tg_avg_position_prop_id', 'tg_std_position_prop_id', 'tg_click_prob_prop_id', 'log_appearance_count_prop_id',]]#'avg_res_len_prop_id']]
    print(cols)
    df.loc[df.prop_id.isin(ids), cols] = pd.NA

# So first we need to remove the click/book probs for all ids in val only
def get_ids_in_one_but_not_the_other(df_in_this, df_not_in_this, col):
    in_this = set(df_in_this.set_index(col).index.unique())
    not_in_this = set(df_not_in_this.set_index(col).index.unique())
    missing = in_this - not_in_this
    print(len(in_this), len(not_in_this), len(missing))
    return missing


# set_avg_position_srch_prop_id_to_NA(df_val, get_ids_in_one_but_not_the_other(df_test, df_train, 'prop_srch_dest_id'))
# #set_avg_position_srch_prop_id_to_NA(df_val, get_ids_in_one_but_not_the_other(df_val, df_train, 'prop_srch_dest_id'))
# set_avg_position_prop_id_to_NA(df_val, get_ids_in_one_but_not_the_other(df_test, df_train, 'prop_id'))
# #set_avg_position_prop_id_to_NA(df_val, get_ids_in_one_but_not_the_other(df_val, df_train, 'prop_id'))
    
# #set_avg_position_srch_prop_id_to_NA(df_train, get_ids_in_one_but_not_the_other(df_train, df_val, 'prop_srch_dest_id'))
# #set_avg_position_prop_id_to_NA(df_train, get_ids_in_one_but_not_the_other(df_train, df_val, 'prop_id'))
set_avg_position_srch_prop_id_to_NA(df_train, get_ids_in_one_but_not_the_other(df_train, df_test, 'prop_srch_dest_id'))
set_avg_position_prop_id_to_NA(df_train, get_ids_in_one_but_not_the_other(df_train, df_test, 'prop_id'))

610375 612744 200861
['tg_avg_position_srch_prop_id', 'tg_std_position_srch_prop_id', 'log_appearance_count_srch_prop_id']
129113 129438 7448
['tg_avg_position_prop_id', 'tg_std_position_prop_id', 'log_appearance_count_prop_id']


In [37]:
df_train.fillna(np.NaN, inplace=True)

for col in df_train.columns:
    print_missing(df_train, col)

visitor_hist_starrating missing count 4706481 out of 4958347 => 95% missing
visitor_hist_adr_usd missing count 4705398 out of 4958347 => 95% missing
prop_review_score missing count 240658 out of 4958347 => 5% missing
prop_location_score2 missing count 1090348 out of 4958347 => 22% missing
prop_log_historical_price missing count 713899 out of 4958347 => 14% missing
price_usd missing count 31 out of 4958347 => 0% missing
srch_query_affinity_score missing count 4640941 out of 4958347 => 94% missing
prop_starrating_w0 missing count 169572 out of 4958347 => 3% missing
hist_starrating_diff missing count 4706481 out of 4958347 => 95% missing
price_hist_logdiff missing count 713927 out of 4958347 => 14% missing
visitor_hist_adr_usd_logdiff missing count 4705398 out of 4958347 => 95% missing
price_usd_d_srch_id missing count 31 out of 4958347 => 0% missing
prop_starrating_w0_std_srch_destination_id missing count 230 out of 4958347 => 0% missing
prop_starrating_w0_med_srch_destination_id missing

In [39]:
gc.collect()

columns_to_keep = [
    # 'prop_id',
    # 'srch_destination_id',
    # 'prop_srch_dest_id',
    # 'srch_id',

    'site_id',
    #'visitor_location_country_id',
    'visitor_hist_starrating',
    #'visitor_hist_adr_usd',
    'prop_country_id',
    'prop_starrating',
    'prop_review_score',
    'prop_brand_bool',
    'prop_location_score1',
    'prop_location_score2',
    'prop_log_historical_price',
    # 'position',
    #'price_usd',
    'price_usd_rank',
    'promotion_flag',
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_adults_count',
    'srch_children_count',
    #'srch_room_count',
    #'srch_saturday_night_bool',
    'srch_query_affinity_score',
    'random_bool',
    
    #'couple_bool',
    
    # 'click_bool',
    # 'booking_bool',
    # 'prop_starrating_w0',
    # 'booking_week',
    # 'booking_month',
    # 'booking_dayofyear',
    # 'booking_dayofweek',
    'midstay_week',
    #'midstay_month',
    # 'midstay_dayofyear',
    'midstay_dayofweek',
    'hist_starrating_diff',
    'price_hist_logdiff',
    'visitor_hist_adr_usd_logdiff',
    # 'same_country',

    #'srch_res_count',
    'prop_count_per_srch_dest',
    'srch_dest_count_per_prop',

    'price_usd_std_srch_id',
    'price_usd_med_srch_id',
    'price_usd_d_srch_id',
    # 'price_usd_std_srch_id__prop_starrating',
    # 'price_usd_med_srch_id__prop_starrating',
    'price_usd_d_srch_id__prop_starrating',
    # 'price_hist_logdiff_std_srch_id',
    # 'price_hist_logdiff_med_srch_id',
    'price_hist_logdiff_d_srch_id',
    'visitor_hist_adr_usd_logdiff_d_srch_id',

    'comp_known_cnt',
    'comp_better_worse',
    # 'comp_rate_sum',

    'prop_starrating_w0_std_srch_destination_id',
    #'prop_starrating_w0_med_srch_destination_id',
    'prop_starrating_w0_d_srch_destination_id',
    'prop_starrating_w0_std_srch_id',
    #'prop_starrating_w0_med_srch_id',
    'prop_starrating_w0_d_srch_id',
    #
    'prop_review_score_std_srch_destination_id',
    #'prop_review_score_med_srch_destination_id',
    'prop_review_score_d_srch_destination_id',
    # 'prop_review_score_std_srch_destination_id__prop_starrating',
    # 'prop_review_score_med_srch_destination_id__prop_starrating',
    # 'prop_review_score_d_srch_destination_id__prop_starrating',
    'prop_review_score_std_srch_id',
    #'prop_review_score_med_srch_id',
    'prop_review_score_d_srch_id',
    #
    'prop_location_score1_std_srch_destination_id',
    #'prop_location_score1_med_srch_destination_id',
    'prop_location_score1_d_srch_destination_id',
    'prop_location_score1_std_srch_destination_id__prop_starrating',
    'prop_location_score1_med_srch_destination_id__prop_starrating',
    'prop_location_score1_d_srch_destination_id__prop_starrating',
    'prop_location_score1_std_srch_id',
    #'prop_location_score1_med_srch_id',
    'prop_location_score1_d_srch_id',
    #
    #'prop_location_score2_std_srch_destination_id',
    #'prop_location_score2_med_srch_destination_id',
    'prop_location_score2_d_srch_destination_id',
    # 'prop_location_score2_std_srch_destination_id__prop_starrating',
    #'prop_location_score2_med_srch_destination_id__prop_starrating',
    'prop_location_score2_d_srch_destination_id__prop_starrating',
    'prop_location_score2_std_srch_id',
    #'prop_location_score2_med_srch_id',
    'prop_location_score2_d_srch_id',
    #
    # 'srch_query_affinity_score_std_srch_id',
    'srch_query_affinity_score_d_srch_id',
    # 'srch_res_count_med_prop_id',
    # 'srch_res_count_std_prop_id',
    # 'srch_res_count_z_prop_id',
    #
    # 'target_cls',
    # 'relevance_score',
    #
    #'tg_click_prob_srch_prop_id',
    #'tg_avg_position_srch_prop_id_rank',
    'tg_std_position_srch_prop_id',
    'tg_avg_position_srch_prop_id',
    'log_appearance_count_srch_prop_id',
    'avg_res_len_srch_prop_id',
    #'tg_click_prob_prop_id',
    #'tg_avg_position_prop_id_rank',
    #'tg_std_position_prop_id',
    'tg_avg_position_prop_id',
    'log_appearance_count_prop_id',
    'avg_res_len_prop_id',
    ]
predictor_cols = [c for c in df_train.columns if ((c not in ('position', 'booking_bool', 'click_bool', 'target_cls', 'relevance_score', 'is_test')) and (c in columns_to_keep))]

New categorical_feature is [0, 5, 9, 15]






Training until validation scores don't improve for 350 rounds
[20]	training's ndcg@5: 0.390848	valid_1's ndcg@5: 0.384895
[40]	training's ndcg@5: 0.400279	valid_1's ndcg@5: 0.392683
[60]	training's ndcg@5: 0.406248	valid_1's ndcg@5: 0.397981
[80]	training's ndcg@5: 0.410201	valid_1's ndcg@5: 0.400702
[100]	training's ndcg@5: 0.413651	valid_1's ndcg@5: 0.402651
[120]	training's ndcg@5: 0.416785	valid_1's ndcg@5: 0.404082
[140]	training's ndcg@5: 0.419265	valid_1's ndcg@5: 0.405395
[160]	training's ndcg@5: 0.421852	valid_1's ndcg@5: 0.406868
[180]	training's ndcg@5: 0.424103	valid_1's ndcg@5: 0.407369
[200]	training's ndcg@5: 0.42608	valid_1's ndcg@5: 0.408289
[220]	training's ndcg@5: 0.428128	valid_1's ndcg@5: 0.408684
[240]	training's ndcg@5: 0.430133	valid_1's ndcg@5: 0.409093
[260]	training's ndcg@5: 0.432157	valid_1's ndcg@5: 0.409684
[280]	training's ndcg@5: 0.433626	valid_1's ndcg@5: 0.409977
[300]	training's ndcg@5: 0.435655	valid_1's ndcg@5: 0.410386
[320]	training's ndcg@5: 0.4

In [53]:
df_train[predictor_cols + ['srch_id', 'prop_id', 'prop_srch_dest_id', 'click_bool', 'booking_bool']].to_csv('data/final_features_train.csv.zip')

In [52]:
df_test[predictor_cols + ['srch_id', 'prop_id', 'prop_srch_dest_id']].to_csv('data/final_features_test.csv.zip')