In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import ndcg_score

In [2]:
def _dsg_numerator(rel, use_2pow):
    if use_2pow:
        return 2**rel-1
    else:
        return rel


def calc_ndcg(rel_true, rel_est, n=5, use_2pow=True, use_log2=False):
    """Compute NDCG@n as DCG(rel_est) / DCG(rel_true).

    Both sequences are used in the order given; ``rel_true`` is NOT re-sorted
    here, so the caller must pass it in ideal (best-first) order.

    Args:
        rel_true: Relevance grades in ideal order (denominator).
        rel_est: Relevance grades in predicted rank order (numerator).
        n: Cut-off; positions from index ``n`` onwards get zero weight.
        use_2pow: Use the ``2**rel - 1`` gain instead of the linear gain.
        use_log2: Use a base-2 logarithmic discount instead of the natural log.

    Returns:
        The ratio of the two discounted sums. A small epsilon is added to the
        denominator, so an all-zero ``rel_true`` yields 0 rather than a
        division by zero.
    """
    assert len(rel_est) == len(rel_true)
    ideal = np.asarray(rel_true)
    ranked = np.asarray(rel_est)

    # Rank i (0-based) is discounted by 1 / log(i + 2).
    positions = np.arange(2, len(ideal) + 2)
    if use_log2:
        weights = 1 / np.log2(positions)
    else:
        weights = 1 / np.log(positions)
    weights[n:] = 0  # everything past the cut-off contributes nothing

    eps = 1e-6
    dcg = weights.dot(_dsg_numerator(ranked, use_2pow))
    ideal_dcg = weights.dot(_dsg_numerator(ideal, use_2pow))
    return dcg / (ideal_dcg + eps)


def ndcg_dmt(predicted_scores, n=5, use_2pow=True, use_log2=False):
    """NDCG@n of a relevance list that is already in predicted rank order.

    Despite its name, ``predicted_scores`` holds relevance grades ordered by
    the model's ranking (this is how the notebook calls it). The ideal
    ordering is obtained by sorting the same values in descending order.
    """
    ideal_order = np.sort(predicted_scores)[::-1]  # best-first ordering of the same grades
    return calc_ndcg(
        ideal_order,
        predicted_scores,
        n=n,
        use_2pow=use_2pow,
        use_log2=use_log2,
    )


def ndcg_sklearn(predicted_scores, n=5):
    """NDCG@n of a relevance list in predicted rank order, via scikit-learn.

    ``ndcg_score(y_true, y_score)`` ranks the items by ``y_score`` and reads
    their gains from ``y_true``. The relevance grades therefore go in as
    ``y_true`` and the given order is encoded as strictly decreasing
    pseudo-scores. (Passing the sorted grades as ``y_true`` and the unsorted
    grades as ``y_score``, as was done before, scores a different ranking.)

    Args:
        predicted_scores: Relevance grades ordered by the model's ranking.
        n: Cut-off passed to ``ndcg_score`` as ``k``.

    Returns:
        The NDCG@n value reported by scikit-learn.
    """
    relevance = np.asarray(predicted_scores)
    # First item gets the highest pseudo-score, last item the lowest.
    rank_scores = np.arange(len(relevance), 0, -1)
    return ndcg_score(np.asarray([relevance]), np.asarray([rank_scores]), k=n)

In [3]:
def print_missing(df, col):
    """Print a one-line missing-value summary for ``df[col]``.

    Nothing is printed when the column contains no missing values.
    """
    column = df[col]
    total_cnt = len(column)
    miss_cnt = column.isna().sum()
    if miss_cnt != 0:
        perc_miss = 100 * miss_cnt / total_cnt
        print(f'{col} missing count {miss_cnt} out of {total_cnt} => {round(perc_miss)}% missing')

In [4]:
# Load the feature table for the rows to be predicted; the first CSV column becomes the index.
df_to_pred = pd.read_csv("data/final_features_test.csv.zip", index_col=0)

In [5]:
# Load the labelled development feature table; the first CSV column becomes the index.
df_devel = pd.read_csv("data/final_features_train.csv.zip", index_col=0)
# Class label used for training: the sum of the two outcome flags.
df_devel['target_cls'] = df_devel.booking_bool + df_devel.click_bool
# Graded relevance used for NDCG evaluation: 4 points for a booking plus 1 for a click.
df_devel['relevance_score'] = df_devel.booking_bool * 4 + df_devel.click_bool
df_devel.head()

Unnamed: 0,site_id,visitor_hist_starrating,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,promotion_flag,...,tg_avg_position_prop_id,log_appearance_count_prop_id,avg_res_len_prop_id,srch_id,prop_id,prop_srch_dest_id,click_bool,booking_bool,target_cls,relevance_score
0,12,,219,4,4.5,1,2.77,0.1302,5.2,0,...,10.886756,6.25575,32.303266,1,30184,3018423246,0.0,0.0,0.0,0.0
1,12,,219,3,3.5,1,2.2,0.0356,4.81,0,...,25.63125,5.075174,33.004739,1,44147,4414723246,0.0,0.0,0.0,0.0
2,12,,219,4,4.0,1,2.08,0.015,5.28,0,...,13.072835,6.230481,31.867299,1,89073,8907323246,0.0,0.0,0.0,0.0
3,12,,219,2,,0,1.61,,4.14,0,...,29.170732,4.406719,32.105769,1,50984,5098423246,0.0,0.0,0.0,0.0
4,12,,219,3,,1,2.2,,5.08,0,...,25.28777,5.627621,32.5,1,59267,5926723246,0.0,0.0,0.0,0.0


In [6]:
# Report every column of the development set that contains missing values.
for col in df_devel.columns:
    print_missing(df_devel, col)

visitor_hist_starrating missing count 4706481 out of 4958347 => 95% missing
prop_review_score missing count 240658 out of 4958347 => 5% missing
prop_location_score2 missing count 1090348 out of 4958347 => 22% missing
prop_log_historical_price missing count 713899 out of 4958347 => 14% missing
srch_query_affinity_score missing count 4640941 out of 4958347 => 94% missing
hist_starrating_diff missing count 4706481 out of 4958347 => 95% missing
price_hist_logdiff missing count 713927 out of 4958347 => 14% missing
visitor_hist_adr_usd_logdiff missing count 4705398 out of 4958347 => 95% missing
price_usd_d_srch_id missing count 31 out of 4958347 => 0% missing
prop_starrating_w0_std_srch_destination_id missing count 230 out of 4958347 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 169572 out of 4958347 => 3% missing
prop_starrating_w0_std_srch_id missing count 1528 out of 4958347 => 0% missing
prop_starrating_w0_d_srch_id missing count 169572 out of 4958347 => 3% missing

In [7]:
# Split the searches (srch_id) into train / validation / test so that all rows
# of one search end up in the same split.
all_srch_ids = df_devel.srch_id.unique()

# Shuffle the ids reproducibly with RANDOM_SEED. Sorting first makes the
# permutation independent of the row order of df_devel.
RANDOM_SEED = 124
rng = np.random.default_rng(RANDOM_SEED)
all_srch_ids.sort()
rng.shuffle(all_srch_ids)

VALIDATION_PROP = 0.1
TEST_PROP = 0.05
TRAIN_PROP = 1 - (VALIDATION_PROP + TEST_PROP)

train_cnt = int(len(all_srch_ids)*(TRAIN_PROP))
train_ids = all_srch_ids[:train_cnt]

val_cnt = int(len(all_srch_ids)*(VALIDATION_PROP))
val_ids = all_srch_ids[train_cnt:train_cnt+val_cnt]

# Everything left over is the test split. Slicing from the front (instead of
# `[-test_cnt:]`) stays correct when test_cnt is 0, where `[-0:]` would select
# every id.
test_cnt = len(all_srch_ids)-val_cnt-train_cnt
test_ids = all_srch_ids[train_cnt+val_cnt:]

df_train = df_devel.loc[df_devel.srch_id.isin(train_ids)]
df_val = df_devel.loc[df_devel.srch_id.isin(val_ids)]
df_test = df_devel.loc[df_devel.srch_id.isin(test_ids)]

In [8]:
# Leftover (disabled) deletes of frames that are not defined in the cells above:
# del df
# del df_raw
# Run a garbage-collection pass; the cell output is the number gc.collect() returns.
import gc
gc.collect()

0

In [9]:
# So first we need to remove the click/book probs for all ids in val only
def get_ids_in_one_but_not_the_other(df_in_this, df_not_in_this, col):
    in_this = set(df_in_this.set_index(col).index.unique())
    not_in_this = set(df_not_in_this.set_index(col).index.unique())
    missing = in_this - not_in_this
    print(len(in_this), len(not_in_this), len(missing))
    return missing

def _set_cols_to_NA_for_ids_in_one_but_not_other(df_in_this, df_not_in_this, groupby, cols):
    ids = get_ids_in_one_but_not_the_other(df_in_this, df_not_in_this, groupby)
    df_in_this.set_index(groupby).loc[[id for id in ids], cols] = pd.NA

# Target related aggregate columns, grouped by the id column they are keyed on.
# ('avg_res_len_srch_prop_id' was considered at some point but is not included.)
_TG_COLS_BY_KEY = [
    (['prop_id'],
     ['tg_avg_position_prop_id', 'tg_std_position_prop_id', 'tg_click_prob_prop_id', 'log_appearance_count_prop_id']),
    (['prop_srch_dest_id'],
     ['tg_avg_position_srch_prop_id', 'tg_std_position_srch_prop_id', 'tg_click_prob_srch_prop_id', 'log_appearance_count_srch_prop_id']),
]

# (frame to blank, reference frame), in this order:
#  1-2. test / validation ids that never occur in the training set must lose
#       their target related values;
#  3-4. "improvement": training ids that do not occur in test / validation are
#       blanked as well, to align the missingness of all sets.
_FRAME_PAIRS = [
    (df_test, df_train),
    (df_val, df_train),
    (df_train, df_test),
    (df_train, df_val),
]

for _df_in_this, _df_not_in_this in _FRAME_PAIRS:
    for _key, _cols in _TG_COLS_BY_KEY:
        _set_cols_to_NA_for_ids_in_one_but_not_other(_df_in_this, _df_not_in_this, _key, _cols)

66063 127134 796
127098 569116 14041
82693 127134 1290
193371 569116 28036
127134 66063 61867
569116 127098 456059
127134 82693 45731
569116 193371 403781


In [10]:
# Order every split by srch_id so the rows of one search are contiguous (needed
# for the per-search group sizes below). The set_index / reset_index round trip
# also moves srch_id to the first column.
df_train = df_train.set_index('srch_id').sort_index().reset_index()
df_val = df_val.set_index('srch_id').sort_index().reset_index()
df_test = df_test.set_index('srch_id').sort_index().reset_index()

def get_groups(df):
    """Return the number of rows per ``srch_id`` as a NumPy array.

    The counts come out in ascending ``srch_id`` order (groupby sorts its
    keys), so they only line up with the rows when ``df`` itself is sorted by
    ``srch_id``.
    """
    per_search = df.groupby('srch_id')['srch_id'].count()
    return per_search.to_numpy()

# Per-search group sizes and integer class labels for each split, taken from
# the srch_id-sorted frames.
groups_train = get_groups(df_train)
df_tg_train = df_train['target_cls'].astype(int)

groups_val = get_groups(df_val)
df_tg_val = df_val['target_cls'].astype(int)

groups_test = get_groups(df_test)
df_tg_test = df_test['target_cls'].astype(int)

In [11]:
# Replace every missing-value marker with a plain float NaN so all splits use
# the same representation. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
df_val.fillna(np.nan, inplace=True)
df_train.fillna(np.nan, inplace=True)
df_test.fillna(np.nan, inplace=True)

# Report the columns of the training split that contain missing values.
for col in df_train.columns:
    print_missing(df_train, col)

visitor_hist_starrating missing count 4000934 out of 4215208 => 95% missing
prop_review_score missing count 204668 out of 4215208 => 5% missing
prop_location_score2 missing count 927364 out of 4215208 => 22% missing
prop_log_historical_price missing count 607253 out of 4215208 => 14% missing
srch_query_affinity_score missing count 3945585 out of 4215208 => 94% missing
hist_starrating_diff missing count 4000934 out of 4215208 => 95% missing
price_hist_logdiff missing count 607273 out of 4215208 => 14% missing
visitor_hist_adr_usd_logdiff missing count 3999963 out of 4215208 => 95% missing
price_usd_d_srch_id missing count 22 out of 4215208 => 0% missing
prop_starrating_w0_std_srch_destination_id missing count 212 out of 4215208 => 0% missing
prop_starrating_w0_d_srch_destination_id missing count 144143 out of 4215208 => 3% missing
prop_starrating_w0_std_srch_id missing count 1266 out of 4215208 => 0% missing
prop_starrating_w0_d_srch_id missing count 144143 out of 4215208 => 3% missing


In [13]:
from lightgbm import early_stopping, log_evaluation, reset_parameter, LGBMRanker

gc.collect()

# Identifier, outcome and label columns that must not be fed to the model.
_NON_PREDICTOR_COLS = {
    'prop_id', 'srch_id', 'prop_srch_dest_id',
    'position', 'booking_bool', 'click_bool',
    'target_cls', 'relevance_score',
}
# Every remaining column of the training frame, in its original column order.
predictor_cols = [name for name in df_train.columns if name not in _NON_PREDICTOR_COLS]

def get_categorical_column(df):
    """Return the positional indices of the categorical features found in ``df``.

    Only candidate names that are actually columns of ``df`` are used. The
    result follows the order of the candidate list, not the column order.
    """
    # Currently disabled candidates: prop_country_id, visitor_location_country_id,
    # midstay_week, midstay_dayofweek, booking_week, booking_dayofweek,
    # prop_starrating.
    candidates = (
        "day",
        "month",
        "site_id",
        'srch_saturday_night_bool',
        'prop_brand_bool',
        'same_country',
        'random_bool',
        'couple_bool',
        'promotion_flag',
        'midstay_month',
        'midstay_dayofyear',
        'booking_month',
        'booking_dayofyear',
    )
    available = set(df.columns.values)
    return [df.columns.get_loc(name) for name in candidates if name in available]

In [12]:
# Train the classifier with the best hyperparameters
# LambdaRank model fitted on the training split; the validation split drives
# early stopping. Labels are the integer target_cls values.
# NOTE(review): label_gain presumably maps label values 0/1/2 to gains 0/1/5 — confirm against the LightGBM docs.
ranker = LGBMRanker(
    #objective="rank_xendcg",
    objective='lambdarank',
    metric="ndcg",
    subsample_for_bin=400000,
    n_estimators=16000, # this is the max, early stopping will likely result in fewer
    learning_rate=0.1,
    #label_gain=[0, 1, 31],
    label_gain=[0, 1, 5],
    random_state=42,
    boosting='goss',
    # top_rate=0.13, # goss
    # other_rate=0.10, # goss
    # --
    # boosting='dart',
    # boosting='gbdt',
    #bagging_fraction=0.9,
    #bagging_freq=20,
    # bagging_seed=123,
    # --
    #monotone_constraints=mon,
    #monotone_constraints_method='advanced',
    #extra_trees=True,
    lambda_l2=1e-3,
    #lambda_l1=1e-3,
    min_data_in_leaf=60,
    max_depth=6,
    max_position=5,
    deterministic=True,
    num_threads=24,
    lambdarank_truncation_level=11,
    #path_smooth=3,
    #feature_fraction=0.3,
)

# Feature matrix, labels and per-search group sizes for the validation split.
x_val = df_val[predictor_cols]
Y_val = df_tg_val
g_val = groups_val

# Same for the training split.
x_train = df_train[predictor_cols]
Y_train = df_tg_train
g_train = groups_train

# Stop after 350 rounds without improvement; log the metric every 20 rounds.
early_stopping_callback = early_stopping(stopping_rounds=350, first_metric_only=True)
log_evaluation_callback = log_evaluation(period=20)

# Both the training and the validation split are evaluated (NDCG@5); the
# eval_group entries are given in the same order as eval_set.
ranker.fit(
    x_train,
    Y_train,
    eval_set=[(x_train, Y_train), (x_val, Y_val)],
    eval_group=[g_train, g_val],
    group=g_train,
    eval_at=5,
    callbacks=[early_stopping_callback, log_evaluation_callback, ],    
    categorical_feature=get_categorical_column(x_train),
)


New categorical_feature is [0, 5, 9, 15]






Training until validation scores don't improve for 350 rounds
[20]	training's ndcg@5: 0.391561	valid_1's ndcg@5: 0.388457
[40]	training's ndcg@5: 0.400577	valid_1's ndcg@5: 0.396132
[60]	training's ndcg@5: 0.406754	valid_1's ndcg@5: 0.399099
[80]	training's ndcg@5: 0.411404	valid_1's ndcg@5: 0.402424
[100]	training's ndcg@5: 0.415041	valid_1's ndcg@5: 0.404909
[120]	training's ndcg@5: 0.418378	valid_1's ndcg@5: 0.406276
[140]	training's ndcg@5: 0.421001	valid_1's ndcg@5: 0.407116
[160]	training's ndcg@5: 0.423438	valid_1's ndcg@5: 0.408503
[180]	training's ndcg@5: 0.42604	valid_1's ndcg@5: 0.40962
[200]	training's ndcg@5: 0.428409	valid_1's ndcg@5: 0.409825
[220]	training's ndcg@5: 0.430787	valid_1's ndcg@5: 0.410776
[240]	training's ndcg@5: 0.432699	valid_1's ndcg@5: 0.410841
[260]	training's ndcg@5: 0.434798	valid_1's ndcg@5: 0.41103
[280]	training's ndcg@5: 0.436914	valid_1's ndcg@5: 0.412041
[300]	training's ndcg@5: 0.438714	valid_1's ndcg@5: 0.411892
[320]	training's ndcg@5: 0.440

In [13]:
# Feature name -> importance reported by the fitted ranker.
dict(zip(ranker.feature_name_, ranker.feature_importances_))

{'site_id': 1695,
 'visitor_hist_starrating': 93,
 'prop_country_id': 202,
 'prop_starrating': 167,
 'prop_review_score': 255,
 'prop_brand_bool': 91,
 'prop_location_score1': 477,
 'prop_location_score2': 796,
 'prop_log_historical_price': 784,
 'promotion_flag': 83,
 'srch_length_of_stay': 213,
 'srch_booking_window': 457,
 'srch_adults_count': 118,
 'srch_children_count': 114,
 'srch_query_affinity_score': 187,
 'random_bool': 296,
 'midstay_week': 267,
 'midstay_dayofweek': 132,
 'hist_starrating_diff': 122,
 'price_hist_logdiff': 848,
 'visitor_hist_adr_usd_logdiff': 243,
 'prop_count_per_srch_dest': 311,
 'srch_dest_count_per_prop': 491,
 'price_usd_std_srch_id': 539,
 'price_usd_med_srch_id': 505,
 'price_usd_d_srch_id': 824,
 'prop_starrating_w0_std_srch_destination_id': 317,
 'prop_starrating_w0_d_srch_destination_id': 265,
 'prop_starrating_w0_std_srch_id': 433,
 'prop_starrating_w0_d_srch_id': 153,
 'prop_review_score_std_srch_destination_id': 302,
 'prop_review_score_d_srch

In [14]:
def get_prediction_df(preds, df):
    """Attach predictions to the id / relevance columns and rank within each search.

    Args:
        preds: One predicted score per row of ``df``.
        df: Frame with ``srch_id``, ``prop_id`` and ``relevance_score`` columns.

    Returns:
        A frame with those three columns plus ``predicted``, ordered by
        ``srch_id`` and, within a search, by descending ``predicted``.
    """
    scored = df[["srch_id", "prop_id", 'relevance_score']].assign(predicted=preds)
    by_score = scored.sort_values("predicted", ascending=False)
    # The stable sort keeps the score ordering inside each search intact.
    return by_score.sort_values("srch_id", kind='stable')

In [15]:
# Score the test split, rank the rows within each search by predicted score and
# average the per-search NDCG (linear gain on relevance_score, default cut-off n=5).
df_pred_test = get_prediction_df(ranker.predict(df_test[predictor_cols]), df_test)
df_pred_test.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=False, use_log2=False)).mean()

0.40948660388477953

In [16]:
# Same evaluation on the validation split.
# NOTE(review): predict_raw_score / predict_disable_shape_check are passed here but not in the test-split call — confirm the difference is intentional.
df_pred_val = get_prediction_df(ranker.predict(df_val[predictor_cols], predict_raw_score=True, predict_disable_shape_check=True), df_val)
df_pred_val.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=False, use_log2=False)).mean()

0.4140946800269657

# The validation NDCG@5 (0.414) is close to the test NDCG@5 (0.409), so from here on the model is trained with a validation set only and no separate test set.

In [17]:
### Split

# Re-shuffle the search ids with RANDOM_SEED. Sorting first makes the
# permutation independent of the current order of all_srch_ids.
RANDOM_SEED = 124
rng = np.random.default_rng(RANDOM_SEED)
all_srch_ids.sort()
rng.shuffle(all_srch_ids)

VALIDATION_PROP = 0.1
TRAIN_PROP = 1 - (VALIDATION_PROP)

train_cnt = int(len(all_srch_ids)*(TRAIN_PROP))
train_ids = all_srch_ids[:train_cnt]

# All remaining ids form the validation split. Computing val_cnt with a second
# int() truncation could leave the last id(s) in neither split.
val_ids = all_srch_ids[train_cnt:]
val_cnt = len(val_ids)

df_train = df_devel.loc[df_devel.srch_id.isin(train_ids)]
df_val = df_devel.loc[df_devel.srch_id.isin(val_ids)]

### Missingness

# Target related aggregate columns, grouped by the id column they are keyed on.
# ('avg_res_len_srch_prop_id' was considered at some point but is not included.)
_TG_COLS_BY_KEY_2 = [
    (['prop_id'],
     ['tg_avg_position_prop_id', 'tg_std_position_prop_id', 'tg_click_prob_prop_id', 'log_appearance_count_prop_id']),
    (['prop_srch_dest_id'],
     ['tg_avg_position_srch_prop_id', 'tg_std_position_srch_prop_id', 'tg_click_prob_srch_prop_id', 'log_appearance_count_srch_prop_id']),
]

# First blank validation ids that never occur in the training set, then
# ("improvement") blank training ids that do not occur in the validation set,
# to align the missingness of the sets.
for _df_in_this, _df_not_in_this in [(df_val, df_train), (df_train, df_val)]:
    for _key, _cols in _TG_COLS_BY_KEY_2:
        _set_cols_to_NA_for_ids_in_one_but_not_other(_df_in_this, _df_not_in_this, _key, _cols)

### Groups

# Sort by srch_id so the rows of one search are contiguous and line up with the
# group sizes returned by get_groups.
df_train = df_train.set_index('srch_id').sort_index().reset_index()
df_val = df_val.set_index('srch_id').sort_index().reset_index()

groups_train = get_groups(df_train)
df_tg_train = df_train['target_cls'].astype(int)

groups_val = get_groups(df_val)
df_tg_val = df_val['target_cls'].astype(int)

## NAs made the same

# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_val.fillna(np.nan, inplace=True)
df_train.fillna(np.nan, inplace=True)

### Training

# Train the classifier with the best hyperparameters
# Same settings as the earlier run, now fitted on the 90% training split with
# the 10% validation split driving early stopping.
ranker = LGBMRanker(
    objective='lambdarank',
    metric="ndcg",
    subsample_for_bin=400000,
    n_estimators=16000, # this is the max, early stopping will likely result in fewer
    learning_rate=0.1,
    label_gain=[0, 1, 5],
    random_state=42,
    boosting='goss',
    lambda_l2=1e-3,
    min_data_in_leaf=60,
    max_depth=6,
    max_position=5,
    deterministic=True,
    num_threads=24,
    lambdarank_truncation_level=11,
)

# Feature matrix, labels and per-search group sizes for the validation split.
x_val = df_val[predictor_cols]
Y_val = df_tg_val
g_val = groups_val

# Same for the training split.
x_train = df_train[predictor_cols]
Y_train = df_tg_train
g_train = groups_train

# Stop after 350 rounds without improvement; log the metric every 20 rounds.
early_stopping_callback = early_stopping(stopping_rounds=350, first_metric_only=True)
log_evaluation_callback = log_evaluation(period=20)

# Both splits are evaluated (NDCG@5); eval_group follows the order of eval_set.
ranker.fit(
    x_train,
    Y_train,
    eval_set=[(x_train, Y_train), (x_val, Y_val)],
    eval_group=[g_train, g_val],
    group=g_train,
    eval_at=5,
    callbacks=[early_stopping_callback, log_evaluation_callback, ],    
    categorical_feature=get_categorical_column(x_train),
)


83083 127783 1330
193941 583459 26916
127783 83083 46030
583459 193941 416434


New categorical_feature is [0, 5, 9, 15]






Training until validation scores don't improve for 350 rounds
[20]	training's ndcg@5: 0.391556	valid_1's ndcg@5: 0.386478
[40]	training's ndcg@5: 0.400053	valid_1's ndcg@5: 0.393166
[60]	training's ndcg@5: 0.406629	valid_1's ndcg@5: 0.397097
[80]	training's ndcg@5: 0.411329	valid_1's ndcg@5: 0.400508
[100]	training's ndcg@5: 0.414872	valid_1's ndcg@5: 0.401939
[120]	training's ndcg@5: 0.41783	valid_1's ndcg@5: 0.403522
[140]	training's ndcg@5: 0.420415	valid_1's ndcg@5: 0.405344
[160]	training's ndcg@5: 0.422774	valid_1's ndcg@5: 0.406206
[180]	training's ndcg@5: 0.425027	valid_1's ndcg@5: 0.407207
[200]	training's ndcg@5: 0.427495	valid_1's ndcg@5: 0.408253
[220]	training's ndcg@5: 0.429597	valid_1's ndcg@5: 0.408467
[240]	training's ndcg@5: 0.431458	valid_1's ndcg@5: 0.40885
[260]	training's ndcg@5: 0.433402	valid_1's ndcg@5: 0.409407
[280]	training's ndcg@5: 0.435508	valid_1's ndcg@5: 0.409938
[300]	training's ndcg@5: 0.437635	valid_1's ndcg@5: 0.410386
[320]	training's ndcg@5: 0.43

# The validation score increased, but so did the number of boosting rounds before early stopping. Next, the final model is trained on all development data, without a validation set.

In [18]:
# 942 and 1118 are hard-coded; presumably the boosting-round counts of the two
# runs above (85% and 90% training data) — TODO confirm, the logged output is truncated.
# NOTE(review): 1 - 942/1118 is the relative drop from 1118 to 942 (~15.7%), not
# the relative increase from 942 to 1118 (1118/942 - 1, ~18.7%) — check which was intended.
increase_for_5pct = 1 - 942 / 1118  # the stopping criteria increased this much for a 5 percent increase in training data
# so we double it for another 10% increase in the training data
# NOTE(review): the extrapolation starts from 942 rather than 1118 — confirm the intended base.
ESTIMATED_STEPS = 942 * (1 + 2 * increase_for_5pct)
ESTIMATED_STEPS

1238.5867620751342

In [19]:
### Split

### Missingness

# same for the validation set
# _set_cols_to_NA_for_ids_in_one_but_not_other(df_to_pred, df_devel,
#                                              ['prop_id'], 
#                                              ['tg_avg_position_prop_id', 'tg_std_position_prop_id', 'tg_click_prob_prop_id', 'log_appearance_count_prop_id'])#, 'avg_res_len_srch_prop_id'])
# _set_cols_to_NA_for_ids_in_one_but_not_other(df_to_pred, df_devel,
#                                              ['prop_srch_dest_id'], 
#                                              ['tg_avg_position_srch_prop_id', 'tg_std_position_srch_prop_id', 'tg_click_prob_srch_prop_id', 'log_appearance_count_srch_prop_id'])#, 'avg_res_len_srch_prop_id'])
# not necessary, but sanity check


# Improvement here: ids in the train set but not in the test or validation set should be removed, to align the data missingness of the test, validation and prediction set
# _set_cols_to_NA_for_ids_in_one_but_not_other(df_devel, df_to_pred, 
#                                              ['prop_id'], 
#                                              ['tg_avg_position_prop_id', 'tg_std_position_prop_id', 'tg_click_prob_prop_id', 'log_appearance_count_prop_id'])#, 'avg_res_len_srch_prop_id'])
# _set_cols_to_NA_for_ids_in_one_but_not_other(df_devel, df_to_pred, 
#                                              ['prop_srch_dest_id'], 
#                                              ['tg_avg_position_srch_prop_id', 'tg_std_position_srch_prop_id', 'tg_click_prob_srch_prop_id', 'log_appearance_count_srch_prop_id'])#, 'avg_res_len_srch_prop_id'])

### Groups

# Sort the full development set by srch_id so the rows of one search are
# contiguous and line up with the group sizes returned by get_groups.
df_devel = df_devel.set_index('srch_id').sort_index().reset_index()

groups_devel = get_groups(df_devel)
df_tg_devel = df_devel['target_cls'].astype(int)

## NAs made the same

# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_devel.fillna(np.nan, inplace=True)
df_to_pred.fillna(np.nan, inplace=True)

### Training

# Train the classifier with the best hyperparameters
# Final model: fitted on the whole development set for a fixed number of rounds
# (ESTIMATED_STEPS truncated to an int); there is no validation set and no
# early stopping in this run.
ranker = LGBMRanker(
    objective='lambdarank',
    metric="ndcg",
    subsample_for_bin=400000,
    n_estimators=int(ESTIMATED_STEPS),
    learning_rate=0.1,
    label_gain=[0, 1, 5],
    random_state=42,
    boosting='goss',
    lambda_l2=1e-3,
    min_data_in_leaf=60,
    max_depth=6,
    max_position=5,
    deterministic=True,
    num_threads=24,
    lambdarank_truncation_level=11,
)

# Feature matrix, labels and per-search group sizes of the full development set.
x_train = df_devel[predictor_cols]
Y_train = df_tg_devel
g_train = groups_devel

log_evaluation_callback = log_evaluation(period=20)

# The only eval set is the training data itself, so the logged NDCG@5 is a
# training score.
ranker.fit(
    x_train,
    Y_train,
    eval_set=[(x_train, Y_train)],
    eval_group=[g_train],
    group=g_train,
    eval_at=5,
    callbacks=[log_evaluation_callback, ],    
    categorical_feature=get_categorical_column(x_train),
)



[20]	training's ndcg@5: 0.390848
[40]	training's ndcg@5: 0.400279
[60]	training's ndcg@5: 0.406041
[80]	training's ndcg@5: 0.410439
[100]	training's ndcg@5: 0.413842
[120]	training's ndcg@5: 0.416627
[140]	training's ndcg@5: 0.419223
[160]	training's ndcg@5: 0.421468
[180]	training's ndcg@5: 0.423687
[200]	training's ndcg@5: 0.425967
[220]	training's ndcg@5: 0.428241
[240]	training's ndcg@5: 0.430029
[260]	training's ndcg@5: 0.432007
[280]	training's ndcg@5: 0.433617
[300]	training's ndcg@5: 0.435515
[320]	training's ndcg@5: 0.437261
[340]	training's ndcg@5: 0.438873
[360]	training's ndcg@5: 0.440599
[380]	training's ndcg@5: 0.442118
[400]	training's ndcg@5: 0.443595
[420]	training's ndcg@5: 0.445297
[440]	training's ndcg@5: 0.446825
[460]	training's ndcg@5: 0.448139
[480]	training's ndcg@5: 0.44983
[500]	training's ndcg@5: 0.451156
[520]	training's ndcg@5: 0.452406
[540]	training's ndcg@5: 0.453802
[560]	training's ndcg@5: 0.455373
[580]	training's ndcg@5: 0.456684
[600]	training's nd

In [20]:
# Feature name -> importance reported by the final ranker.
dict(zip(ranker.feature_name_, ranker.feature_importances_))

{'site_id': 2541,
 'visitor_hist_starrating': 159,
 'prop_country_id': 317,
 'prop_starrating': 180,
 'prop_review_score': 333,
 'prop_brand_bool': 114,
 'prop_location_score1': 741,
 'prop_location_score2': 1149,
 'prop_log_historical_price': 1187,
 'promotion_flag': 118,
 'srch_length_of_stay': 314,
 'srch_booking_window': 694,
 'srch_adults_count': 180,
 'srch_children_count': 170,
 'srch_query_affinity_score': 272,
 'random_bool': 364,
 'midstay_week': 478,
 'midstay_dayofweek': 194,
 'hist_starrating_diff': 203,
 'price_hist_logdiff': 1170,
 'visitor_hist_adr_usd_logdiff': 279,
 'prop_count_per_srch_dest': 465,
 'srch_dest_count_per_prop': 760,
 'price_usd_std_srch_id': 816,
 'price_usd_med_srch_id': 719,
 'price_usd_d_srch_id': 1188,
 'prop_starrating_w0_std_srch_destination_id': 492,
 'prop_starrating_w0_d_srch_destination_id': 380,
 'prop_starrating_w0_std_srch_id': 702,
 'prop_starrating_w0_d_srch_id': 228,
 'prop_review_score_std_srch_destination_id': 490,
 'prop_review_score

In [21]:
import pickle

# Save the model to a pickle file
# The 'models/' directory must already exist; open() does not create it.
with open('models/lgbm_ranker_best2_model.pkl', 'wb') as f:
    pickle.dump(ranker, f)

# Training the point-wise model on the same dataset

In [14]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Train the classifier with the best hyperparameters
# Point-wise alternative to the ranker: a classifier over the target_cls labels.
# NOTE(review): df_train / df_tg_train are whatever the most recently executed
# split cell left behind; the execution counts are out of order, so confirm
# which split this model was actually fitted on.
# NOTE(review): validation_fraction presumably holds out random rows rather
# than whole searches for the classifier's own early stopping — confirm.
clf = HistGradientBoostingClassifier(random_state=123,
                                     min_samples_leaf=20,
                                     max_iter=12000,
                                     max_leaf_nodes = 31,
                                     max_bins=255,
                                     l2_regularization=1e-3,
                                     n_iter_no_change=250,
                                     max_depth=6,
                                     categorical_features=get_categorical_column(df_train[predictor_cols]),
                                     validation_fraction=0.1,
                                    )

# Disabled experiment: down-sampling the training data to one tenth. It refers
# to names (df, df_data, df_tg) that are not defined in the visible cells.
# # downsample the training data for experimentation
# rng = np.random.default_rng(RANDOM_SEED)
# downsample_idx = np.arange(len(df))
# rng.shuffle(downsample_idx)
# downsample_idx = downsample_idx[:len(df)//10]
# len(downsample_idx), len(df)
#x = df_data.iloc[downsample_idx]
#Y = df_tg.iloc[downsample_idx]

clf.fit(df_train[predictor_cols], df_tg_train)

In [15]:
def get_avg_ndgc(df, df_data):
    """Mean per-search NDCG of the global classifier ``clf`` on ``df``.

    Rows are scored with the class probabilities weighted by ``[0, 1, 4]``,
    ranked within each ``srch_id`` by that score, and the NDCG of the resulting
    ``relevance_score`` order (linear gain, default cut-off) is averaged over
    all searches.

    NOTE(review): the weights are [0, 1, 4] while relevance_score is built as
    booking*4 + click elsewhere in the notebook — confirm the weights are intended.

    Args:
        df: Frame with ``srch_id``, ``prop_id``, ``relevance_score`` and
            ``target_cls`` columns.
        df_data: Feature matrix for the same rows, in the same order.
    """
    base = df[['srch_id', 'prop_id', 'relevance_score', 'target_cls']]
    class_weights = np.array([0, 1, 4])
    scored = base.assign(score=clf.predict_proba(df_data).dot(class_weights))
    by_score = scored.sort_values(by='score', ascending=False)
    # The stable sort keeps the score ordering inside each search intact.
    ranked = by_score.sort_values(by='srch_id', kind='stable')
    ranked = ranked[['srch_id', 'target_cls', 'score', 'relevance_score']]
    per_search = ranked.groupby('srch_id')['relevance_score'].apply(lambda x: ndcg_dmt(x, use_2pow=False))
    return per_search.mean()

def get_prediction(df, df_data, model=None):
    """Rank the properties of every search by the classifier's expected gain.

    The score is the class-probability vector weighted by ``[0, 1, 4]``, the
    same score that ``get_avg_ndgc`` evaluates. The previous
    ``[:, 1:2].sum(axis=1)`` used only the probability of class 1 and ignored
    the most relevant class entirely.

    Args:
        df: Frame with ``srch_id`` and ``prop_id`` columns.
        df_data: Feature matrix for the same rows, in the same order.
        model: Fitted classifier exposing ``predict_proba``; defaults to the
            notebook-level ``clf``. Its output is assumed to have three
            class columns — TODO confirm for other models.

    Returns:
        A frame with ``srch_id``, ``prop_id`` and ``score``, ordered by
        ``srch_id`` and, within a search, by descending ``score``.
    """
    model = clf if model is None else model
    score = model.predict_proba(df_data).dot(np.array([0, 1, 4]))
    pred = df[['srch_id', 'prop_id']].assign(score=score)
    # The stable second sort keeps the score ordering inside each search intact.
    pred = pred.sort_values(by='score', ascending=False).sort_values(by='srch_id', kind='stable')
    return pred[['srch_id', 'prop_id', 'score']]

In [16]:
# Mean per-search NDCG of the point-wise classifier on the test split.
get_avg_ndgc(df_test, df_test[predictor_cols])

0.4031710763995877

In [17]:
# Mean per-search NDCG of the point-wise classifier on the validation split.
get_avg_ndgc(df_val, df_val[predictor_cols])

0.4074453962462336

In [38]:
import pickle

# Persist the fitted HistGradientBoostingClassifier (clf) as a pickle file.
# The 'models/' directory must already exist; open() does not create it.
with open('models/hist_gradient_boosting_best.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [None]:
# Mean per-search NDCG of the point-wise classifier on the training split
# (this cell has no recorded execution).
get_avg_ndgc(df_train, df_train[predictor_cols])