In [334]:
import pandas as pd
import numpy as np
import time
import datetime
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GroupShuffleSplit, RandomizedSearchCV
from sklearn.metrics import ndcg_score, make_scorer

# Modeling

In [346]:
train = pd.read_csv("cleaned_train.csv")
test = pd.read_csv("cleaned_test.csv")

In [347]:
multi_categorical_features= ["srch_id", "site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id", 
                             "comp1_rate", "comp1_inv", "comp2_rate", "comp2_inv", "comp3_rate", "comp3_inv", "comp4_rate", "comp4_inv",
                            "comp5_rate", "comp5_inv", "comp6_rate", "comp6_inv", "comp7_rate", "comp7_inv", "comp8_rate", "comp8_inv",
                            "weekday", "month"]


In [348]:
cat_features = [ "comp1_rate", "comp1_inv", "comp2_rate", "comp2_inv", "comp3_rate", "comp3_inv", "comp4_rate", "comp4_inv",
                "comp5_rate", "comp5_inv", "comp6_rate", "comp6_inv", "comp7_rate", "comp7_inv", "comp8_rate", "comp8_inv",
                "weekday", "month"]
rest = ["srch_id", "site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"]


In [349]:
train_copy = train
test_copy = test

In [350]:
# one-hot encode
for i in cat_features:
    dummy = pd.get_dummies(test_copy[i], prefix=i+'_')
    test_copy = test_copy.join(dummy)  
test_copy = test_copy.drop(cat_features, axis=1)

for i in cat_features:
    dummy = pd.get_dummies(train_copy[i], prefix=i+'_')
    train_copy = train_copy.join(dummy)  
train_copy = train_copy.drop(cat_features, axis=1)

In [351]:
list_1 = list(train_copy.columns)
list_2 = list(test_copy.columns)
for i in list(set(list_2) - set(list_1)):
    print(i)
for i in list(set(list_1) - set(list_2)):
    print(i)

position
target_score


In [352]:
target = 'target_score'
train_copy = train_copy.drop(['position'], axis=1)

In [353]:
train_copy[target] = -train_copy[target] #inverts target to make it easier for submission sorting

In [354]:
train_copy.loc[train_copy["target_score"] == -6, "target_score"] = -1 
#considers only click_bool as target variable, withouh booking_bool

## 1) Pointwise LGBM regression on position (no tuning)

In [355]:
model1 = lgb.LGBMRegressor()

In [356]:
%%time

# fit the same model on whole train data
model1.fit(train_copy.drop([target], axis=1), train_copy[target], categorical_feature=rest)



CPU times: user 3min 27s, sys: 32.3 s, total: 3min 59s
Wall time: 1min 57s


LGBMRegressor()

In [357]:
%%time
y_pred1 = model1.predict(test_copy)
y_pred1 # 0,332 NDCG on public leaderboard when click_bool used

CPU times: user 3min 31s, sys: 26.5 s, total: 3min 57s
Wall time: 1min 35s


array([-0.04420504, -0.0808436 , -0.02696308, ..., -0.07440339,
       -0.05211163, -0.05319732])

## 2) Pointwise LGBM regression on position (hyperparameters tuned)

In [311]:
train = pd.read_csv("cleaned_train.csv")
test = pd.read_csv("cleaned_test.csv")

In [312]:
multi_categorical_features= ["srch_id", "site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id", 
                             "comp1_rate", "comp1_inv", "comp2_rate", "comp2_inv", "comp3_rate", "comp3_inv", "comp4_rate", "comp4_inv",
                            "comp5_rate", "comp5_inv", "comp6_rate", "comp6_inv", "comp7_rate", "comp7_inv", "comp8_rate", "comp8_inv",
                            "weekday", "month"]


In [313]:
target = 'target_score'
train = train.drop(['position'], axis=1)

In [314]:
train[target] = -train[target] #inverts target to make it easier for submission sorting

In [315]:
train.loc[train["target_score"] == -6, "target_score"] = -1 
#considers only click_bool as target variable, withouh booking_bool

In [335]:
# group split needed for both train-test split and CV
splitter = GroupShuffleSplit(test_size=0.1, n_splits=1, random_state = 7)
split = splitter.split(train, groups=train['srch_id'])
train_inds, test_inds = next(split)

train_df = train.iloc[train_inds]
test_df = train.iloc[test_inds]

X_train = train_df.drop([target], axis=1)
X_test = test_df.drop([target], axis=1)
y_train = train_df[target]
y_test = test_df[target]

In [336]:
gss = GroupShuffleSplit(n_splits=4, test_size=0.25)

In [342]:
custom_scorer = make_scorer(ndcg5, greater_is_better=True)

### 2.1) random search

In [343]:
# tune hyperparameters with groupKfold
lgb_1 = lgb.LGBMRegressor()

random_grid_params = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [16, 24],
    'verbose': [1]
}
random_search = RandomizedSearchCV(lgb_1, random_grid_params, n_iter=2, scoring=custom_scorer, cv=gss)


In [344]:
%%time 
random_search.fit(X_train, y_train, groups=X_train['srch_id'])


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4046
[LightGBM] [Info] Number of data points in the train set: 2228377, number of used features: 54
[LightGBM] [Info] Start training from score -0.044737
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4046
[LightGBM] [Info] Number of data points in the train set: 2228234, number of used features: 54
[LightGBM] [Info] Start training from score -0.044802
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4046
[LightGBM] [Info] Number of data points in the train set: 2228377, number of used features: 54
[LightGBM] [Info] Start training from score -0.044737
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough

RandomizedSearchCV(cv=GroupShuffleSplit(n_splits=2, random_state=None, test_size=0.5,
         train_size=None),
                   estimator=LGBMRegressor(), n_iter=2,
                   param_distributions={'learning_rate': [0.01, 0.1],
                                        'n_estimators': [16, 24],
                                        'verbose': [1]},
                   scoring=make_scorer(ndcg5))

In [345]:
print(random_search.best_params_)

{'verbose': 1, 'n_estimators': 16, 'learning_rate': 0.1}


### 2.2) grid search

### 2.3) model with best parameters on validation data

### 2.4) Model with best parameters on test data trained on training and validation data

In [142]:
model2 = lgb.LGBMRegressor()

In [171]:
%%time
model2.fit(X_train, y_train, categorical_feature=multi_categorical_features)




CPU times: user 2min 55s, sys: 14.4 s, total: 3min 10s
Wall time: 1min 5s


LGBMRegressor()

In [172]:
# %%time
y_pred2 = model2.predict(X_test)
y_pred2

array([0.10653852, 0.05060416, 0.08014264, ..., 0.1740704 , 0.21434546,
       0.31500774])

In [173]:
y_test

119        0
120        0
121        0
122        0
123        0
          ..
4958336    0
4958337    0
4958338    0
4958339    0
4958340    0
Name: target_score, Length: 497111, dtype: int64

In [256]:
min(y_test)

-6

In [341]:
def dcg_at_k(sorted_labels, k):
    if k > 0:
        k = min(sorted_labels.shape[0], k)
    else:
        k = sorted_labels.shape[0]
    denom = 1./np.log2(np.arange(k)+2.)
    nom = 2**sorted_labels-1.
    dcg = np.sum(nom[:k]*denom)
    return dcg

def ndcg5(scores, labels):
    sort_ind = np.argsort(scores)[::-1]
    sorted_labels = labels[sort_ind]
    ideal_labels = np.sort(labels)[::-1]
    return dcg_at_k(sorted_labels, 5) / dcg_at_k(ideal_labels, 5)

In [174]:
df = pd.concat([X_test["srch_id"], -y_test], axis=1)
df['predictions'] = y_pred2
df.head(4)

Unnamed: 0,srch_id,target_score,predictions
119,12,0,0.106539
120,12,0,0.050604
121,12,0,0.080143
122,12,0,0.10721


In [187]:
scores = []
for i in df['srch_id'].unique():
#     #t1
#     a1 = [df[df["srch_id"]==i]["target_score"].values]
#     a2 = [df[df["srch_id"]==i]["predictions"].values]
#     scores.append(ndcg_score(a1, a2, k=5)

    #t2 - better so far 0,5
    a1 = df[df["srch_id"]==i]["target_score"].values
    a2 = df[df["srch_id"]==i]["predictions"].values
    scores.append(ndcg5(a1, a2))
print(sum(scores)/len(scores))

0.5346345694217429


In [250]:
max(scores)

1.0

In [165]:
sum(scores)/len(scores)

2.294170538302306

In [179]:
print(min(scores))

-1.682370443375792


In [183]:
y_test.unique()

array([ 0, -1, -6])

# Submission

- by target score

In [358]:
r = pd.DataFrame(data = y_pred1, columns=['target_score'])
r.head(20)

Unnamed: 0,target_score
0,-0.044205
1,-0.080844
2,-0.026963
3,-0.030333
4,-0.057418
5,-0.087827
6,-0.02439
7,-0.035417
8,-0.051951
9,-0.120483


In [359]:
r["srch_id"] = test['srch_id']
r["prop_id"] = test['prop_id']

In [360]:
result = r.sort_values(['srch_id','target_score'])[["srch_id","prop_id"]]

In [361]:
result.head(10)

Unnamed: 0,srch_id,prop_id
9,1,54937
23,1,99484
12,1,61934
5,1,28181
1,1,5543
4,1,24194
13,1,63894
16,1,74045
17,1,78599
18,1,82231


In [362]:
result.to_csv("sub6.csv", index=False)