In [1]:
# ! pip install fastparquet
# ! pip install ipynb
# ! pip install lightgbm
# ! pip install xgboost
# ! pip install scikit-optimize
# ! pip install lightgbm --install-option=--gpu

In [2]:
# from ipynb.fs.full.data_merge_utils import reduce_mem_usage, merge_data


In [3]:
import pandas as pd
import swifter
import numpy as np
from glob import glob
import datetime, json
import gc
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
import os


In [4]:
import pyximport
pyximport.install(reload_support=True)
from mrr import mrr as mrr_cython

In [5]:
def check_folder(path, point_allowed_path=False):
    split_folder = os.path.split(path)
    if not point_allowed_path:
        if '.' in split_folder[1]:
            # path is a file
            path = split_folder[0]
    if not os.path.exists(path):
        print(f'{path} folder created')
        os.makedirs(path, exist_ok=True)

In [6]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
#         print(col)
        col_type = df[col].dtype
                
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [7]:
def merge_data(train_data, val_data, merge_data_path_list, done_files=[], merge_cols='', flag=0):
    compulsory_cols = ["query_id", "user_id", "session_id", "product_id"]
    for path in merge_data_path_list:
        print("Merging file...", path.split('/')[-1])
        prev_cols_train = set(train_data.columns.tolist())
        prev_cols_val = set(val_data.columns.tolist())
        d = pd.read_csv(path)
#         d = reduce_mem_usage(d)
        if 'is_click' in d.columns.tolist():
            d = d.drop('is_click', 1)
        if flag==0:
            merge_cols = [d.columns[0]]
            
        for col in d.columns.tolist():
            if col in train_data.columns.tolist() and col not in compulsory_cols and col not in merge_cols:
                d = d.drop(col, 1)
            
        train_data = train_data.merge(d, on=merge_cols, how='left')
        val_data = val_data.merge(d, on=merge_cols, how='left')
#         print("Train: ", train_data.shape, "Val: ", val_data.shape)
        done_files.append(path)
        del d
        gc.collect()
#     print("Train: ", train_data.shape, "Val: ", val_data.shape)
    
    return train_data, val_data


In [8]:
train = pd.read_parquet("../data_phase1/train.parquet")
sub = pd.read_csv("../data_phase1/validation_labelled.csv")
# test = pd.read_parquet("../data_phase2/test.parquet")
test = pd.read_csv("../data_phase2/test_labelled.csv")


In [9]:
t = pd.read_parquet("../data_phase2/test.parquet")
test = t.merge(test, on=["query_id", "product_id"], how="left")
test.head()

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,98e0eab61242fe3a3a1b89e86f8d039c2142d251715e4c...,9c4893579eaa3288b19ea38ee2a1b09c50bcf6055e654d...,a5c080817feadbe1f8b734de7298ecb1a9f25af7263d8f...,acaad5328549fd88eb9dd0307701d77685f58f581e857c...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,931b19eb9eb42f7d672a924ad811ceb9ba5bea5651db82...,product_id,377006e274eb6c69d6277c15c0fb1b5f6755cc4e3c1411...,0.001446,6,0,1
1,7a5977a6080a4d10ef00674a3c082be5e835f6120f4af0...,8850e62730045d14c6ce3dcdf724d7ec57279163e84b2b...,c5339b7bf4f2d00d594c2d142fa916dd560577c48a2c0b...,11545d1e007c4a46cbea3f7a4bcd385b9bf3ec5da3f2c4...,e5d360d8f63d7420e05f82f1ee4e6124dd5c466233de76...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,f319f71fd9b14532cf715e601fe3b5c2473b58d4302e88...,,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,0.000546,3,3,0
2,d358ea82df3f33cbdc43ab56b948870dbae79009818e8b...,43ab15c21ab524413ebd8f258e6d25090c6ed29262bc01...,ac3351e98b7a0379976b0fff69eee2c137678cd546f63f...,130cd96240984dffb489a0beff5f1a19c35aa3503a437b...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,14fc80e2d6821260d291ff47ce6d8d7534d4cb5aa4ab0d...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,f319f71fd9b14532cf715e601fe3b5c2473b58d4302e88...,product_id,be8709433a124cd0dd2b4bd94625c11fce002fb9a5a98b...,0.000345,4,0,0
3,47b19f66ae013d4e024affc87e764b1e2cc47a50a310e4...,7b15b0c5148da9c3059a9a029274313503bbd74b1a9b3e...,55457801ffab26697767a2a760391c5e267a1c15ba767c...,0b56fff48269b91b8c528c7be80fa0be72db26d2cde9b0...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,f4b2b5bae8b12dd4f61c448583dc1be3b8d313250bb54c...,0.000877,6,3,0
4,1b4b60c5901518ed9b3dfd874c3e2aed9438af26614c61...,869503954efbebf02deac4ac64c55dc18a5f6c3a89ce9b...,c46ee54191ca86da3d3034fddec1ea229f406af72f8ffa...,e30d9734d1d431babd5bab5bdd678900374d1ee26d4625...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,d4fc57514de1c54e45fe3bac161bfcae2459b1de087c53...,product_id,b91927a2b993d4faaa275f13fb5b81f8dbe706a4b58979...,0.000918,5,2,1


In [10]:
s = pd.read_parquet("../data_phase1/validation.parquet")
sub = s.merge(sub, on=["query_id", "product_id"], how="left")
sub.head()

Unnamed: 0,query_id,user_id,session_id,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,week,week_day,is_click
0,45dbde6284f13d59b04c5d2a5ab2513c896ba8f7a7dde0...,4d66a7c430e1f1f7da454f4e8c4bf3e7cf2435741329c0...,b677570c68f211d9543e96fe46750b66ebdfaa0fb2df0e...,d52833e4925f40de987bb732847a8dbc07c2ba1e33711a...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,95d5a1bd42a07143383299c081524278a16ef5ce55507c...,product_id,c57422a708323b02f078708e3f5b1e841bb07434bc2adc...,0.000453,8,0,0
1,6728f8b02604af603d65faa0a2a4e36307f2498c4bdd88...,9ff86157cc29dd17330d2e714318770884adf936794c66...,d1f1a8e13d81de690a968db95dd1960482143f59e6c69b...,ca58ed0a66cb8990221552d0d93de82713641b66465a86...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,c45f0825291fd1a087ef31abbcf9fd0ef10c915edf3041...,product_id,950d471ac609e50a31ecca7c731e9673a32588deb34da4...,0.614475,9,0,0
2,9d1d782fab18c98c8a8d4dab9cbc0cb3f786b5d5e5fd24...,b67c2b47e18777596747b94d0ae3e4a9f023a406fa5b5e...,d572b385d62d7eaaa4872e07ed2771b107d4db0b547212...,6374ef1b29e3046f9ae1607cd62cde1eb6305d4d9b1de5...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,14fc80e2d6821260d291ff47ce6d8d7534d4cb5aa4ab0d...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,58f6d97120664752ed0851301aa78457fe882f67453c58...,product_id,6209104dbe997824b92e3118504acb2ebc863ae3fc27e0...,0.001199,8,3,1
3,f23d6751b37c235047a64a20ffe732483f487743dc8812...,8e28240e0b0c5629959d76727906afa17ea5d89821cbac...,c5df0c490099d1e4a20173ecbb67880998bc4faf8210a9...,e74667d17676e39561716943bc7cf8ba8a94ce96a0dc41...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,bf2241c08d92d32a6782b4041a2c11ca58882ca88454b3...,702e4598004745673c0f6b50387bef9e1d5f503bd8c1c0...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,f319f71fd9b14532cf715e601fe3b5c2473b58d4302e88...,product_id,ce3d41b4ed9d444fdd9385ee6b1b1639abb497a13c9385...,0.000418,7,6,0
4,fe162740ab90f69597b14253e6f0fc6fb87ad8e2146cd8...,2173a7fa04d32abcb18cf8652e427f13a21156f7ff0bf5...,1e5f3aaf57b4863421d5019fe834f5e6979876f3d44e43...,a85298cf45012b8bfe8e5ec39108361e6083cad362dff3...,06a7f8e972f61aeb0e06335699518079a444e4450ff766...,c9f34437ce0e536fefd11a34b9a411b541d2dabfec872a...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d6538f13ace825448d0af4fa5e58d2d08fa2d0850e5e14...,d74a5cebc23c56af60a5768c22d44b52f598629d4011fa...,ea2f413bd8fda0b91a814a68aa520044b204796991a343...,product_id,05f1468cf95e2cb32ddeff813e8466e3bc7b798724e4e6...,0.000148,4,2,0


In [11]:
sub_labels = sub["is_click"].values
test_labels = test["is_click"].values

In [12]:
train = train.fillna(value={"context_type": "NA"})
sub = sub.fillna(value={"context_type": "NA"})
test = test.fillna(value={"context_type": "NA"})


In [13]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [14]:
train = reduce_mem_usage(train)
sub = reduce_mem_usage(sub)
test = reduce_mem_usage(test)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 360.08 MB
Decreased by 15.9%
Memory usage of dataframe is 89.13 MB
Memory usage after optimization is: 121.67 MB
Decreased by -36.5%
Memory usage of dataframe is 89.13 MB
Memory usage after optimization is: 120.70 MB
Decreased by -35.4%


In [15]:
gc.collect()

93

In [16]:
train.shape, sub.shape, test.shape

((3507990, 16), (687192, 16), (687192, 16))

In [17]:
BASE_PATH = "../preprocessed_data/"

In [18]:
pc_sim_train = pd.read_csv(BASE_PATH+"train_product_context_sim.csv")
pc_sim_sub = pd.read_csv(BASE_PATH+"val_product_context_sim.csv")
pc_sim_test = pd.read_csv(BASE_PATH+"test_product_context_sim.csv")
pc_sim_sub = pc_sim_sub.fillna(0.0)
pc_sim_test = pc_sim_test.fillna(0.0)


In [19]:
pc_sim_test.isnull().sum()

query_id                      0
session_id                    0
user_id                       0
product_id                    0
product_context_similarity    0
dtype: int64

In [20]:
train = train.merge(pc_sim_train, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(pc_sim_sub, on=["query_id", "user_id", "session_id", "product_id"], how="left")
test = test.merge(pc_sim_test, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [21]:
sub["product_context_similarity"].describe()

count    687192.000000
mean          0.725715
std           0.213147
min           0.000000
25%           0.650830
50%           0.772573
75%           0.870120
max           1.000000
Name: product_context_similarity, dtype: float64

In [22]:
test["product_context_similarity"].describe()

count    687192.000000
mean          0.594677
std           0.339222
min           0.000000
25%           0.456010
50%           0.725984
75%           0.845484
max           1.000000
Name: product_context_similarity, dtype: float64

In [23]:
# # sub = sub[["query_id", "product_id", "product_context_similarity", "is_click"]]
# sub = test[["query_id", "product_id", "product_context_similarity", "is_click"]]


In [24]:
# sub = sub[["query_id", "product_id", "product_context_similarity", "is_click"]]
sub = test[["query_id", "product_id", "product_context_similarity", "is_click"]]


In [25]:
temp = pd.read_csv("../submission/LGBClassifier_29_6_21_best_params_v2_test.csv")
# temp = pd.read_csv("../submission/XGBClassifier_27_6_21_best_params.csv")
temp = temp.rename(columns={"rank": "model_rank"})
temp.head(2)

Unnamed: 0,query_id,product_id,preds,model_rank
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.5,6
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.5,1


In [26]:
sub = sub.merge(temp, on=["query_id", "product_id"], how="left")
sub = sub.sort_values("query_id")
sub.head()

# test = test.merge(temp, on=["query_id", "product_id"], how="left")
# test = test.sort_values("query_id")
# test.head()

Unnamed: 0,query_id,product_id,product_context_similarity,is_click,preds,model_rank
85679,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.922885,0,0.5,6
189379,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.76691,0,0.5,1
384222,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.845101,1,0.5,5
616173,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.761883,0,0.5,2
131171,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.921114,0,0.5,4


In [27]:
# sub.to_csv("ensemble_submission.csv", index=False)

In [28]:
max_df = pd.DataFrame(sub.groupby(["query_id"])["product_context_similarity"].max()).reset_index()
# max_df = pd.DataFrame(test.groupby(["query_id"])["product_context_similarity"].max()).reset_index()
max_df = max_df.rename(columns={"product_context_similarity": "max_product_context_similarity"})
max_df.head()

Unnamed: 0,query_id,max_product_context_similarity
0,00000996d07006b045bc134b757f2825b0c79870d41521...,0.999976
1,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,0.0
2,000168140d358496df8ebc627efe3bdd586b4c56339cf3...,0.922838
3,0001a2283d6f0ccad5f1dab18451a63d0f8c49bc62b337...,0.922239
4,0002fd07dd0c3bc94d0ae9f39a816d34aff440548fadd7...,0.893655


In [29]:
sub = sub.merge(max_df, on=["query_id"], how="left")
sub.head()
# test = test.merge(max_df, on=["query_id"], how="left")
# test.head()


Unnamed: 0,query_id,product_id,product_context_similarity,is_click,preds,model_rank,max_product_context_similarity
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.922885,0,0.5,6,0.999976
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.76691,0,0.5,1,0.999976
2,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.845101,1,0.5,5,0.999976
3,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.761883,0,0.5,2,0.999976
4,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.921114,0,0.5,4,0.999976


In [30]:
def apply_rank(row, col):
    return (6-row[col])+1


In [31]:
sub["pcs_rank"] = sub.groupby('query_id')['product_context_similarity'].rank(method='first').apply(int)
sub["pcs_rank"] = sub.swifter.apply(lambda x: apply_rank(x, "pcs_rank"), 1)
sub.head(6)
# test["pcs_rank"] = test.groupby('query_id')['product_context_similarity'].rank(method='first').apply(int)
# test["pcs_rank"] = test.swifter.apply(lambda x: apply_rank(x, "pcs_rank"), 1)
# test.head(6)

Unnamed: 0,query_id,product_id,product_context_similarity,is_click,preds,model_rank,max_product_context_similarity,pcs_rank
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.922885,0,0.5,6,0.999976,2
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.76691,0,0.5,1,0.999976,5
2,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.845101,1,0.5,5,0.999976,4
3,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.761883,0,0.5,2,0.999976,6
4,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.921114,0,0.5,4,0.999976,3
5,00000996d07006b045bc134b757f2825b0c79870d41521...,009fbcce12d627c870fc0b262c5ddd4b67e897d34928bf...,0.999976,0,0.5,3,0.999976,1


In [32]:
def get_final_rank(row, threshold=0.52):
    pcs_rank = row["pcs_rank"]
    model_rank = row["model_rank"]
    return pcs_rank if row["max_product_context_similarity"]>threshold else model_rank


In [33]:
def get_final_preds(row, threshold=0.52):
    pcs_rank = row["pcs_rank"]
    model_rank = row["model_rank"]
    return row["product_context_similarity"] if row["max_product_context_similarity"]>threshold else row["preds"]


In [34]:
sub["final_preds"] = sub.swifter.apply(lambda x: get_final_preds(x), 1)
sub["final_rank"] = sub.swifter.apply(lambda x: get_final_rank(x), 1)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))




In [35]:
def compute_mean_reciprocal_rank(rs):
    '''
    rs: 2d array
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    '''

    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


In [36]:
def evaluate(val_df, col):
    grouped_val = val_df.groupby('query_id')
    rss = []
    for _, group in grouped_val:

        scores = group[col]
        sorted_arg = np.flip(np.argsort(scores))
        rss.append( group['is_click'].values[sorted_arg])
        
    mrr = compute_mean_reciprocal_rank(rss)
    print("MRR", mrr)
    return mrr



In [37]:
sub.columns

Index(['query_id', 'product_id', 'product_context_similarity', 'is_click',
       'preds', 'model_rank', 'max_product_context_similarity', 'pcs_rank',
       'final_preds', 'final_rank'],
      dtype='object')

In [38]:
# evaluate(sub, "preds")

In [39]:
import numpy as np

thresholds = np.arange(0.3, 0.8, 0.03)


In [40]:
# sub["final_preds"] = sub.swifter.apply(lambda x: get_final_preds(x, threshold=0.77), 1)
# evaluate(sub, "final_preds")

In [41]:
for threshold in thresholds:
    sub["final_preds"] = sub.swifter.apply(lambda x: get_final_preds(x, threshold=threshold), 1)
    print("Threshold", threshold)
    evaluate(sub, "final_preds")
    
    

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.3
MRR 0.47605778297768314


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.32999999999999996
MRR 0.4760633127277384


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.35999999999999993
MRR 0.47606185753035535


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.3899999999999999
MRR 0.4760627306487852


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.4199999999999999
MRR 0.47605472706317886


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.44999999999999984
MRR 0.47603871989196606


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.4799999999999998
MRR 0.47603362670112576


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.5099999999999998
MRR 0.4760186381680811


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.5399999999999998
MRR 0.47596028475302393


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.5699999999999997
MRR 0.4758769019429796


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.5999999999999996
MRR 0.4757590309549587


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.6299999999999997
MRR 0.4755384230317001


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.6599999999999997
MRR 0.47508149105344644


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.6899999999999996
MRR 0.47452385941629127


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.7199999999999995
MRR 0.4730727365859906


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.7499999999999996
MRR 0.4712322029360062


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.7799999999999996
MRR 0.4684505931384533


In [37]:
# for threshold in thresholds:
#     sub["final_preds"] = sub.swifter.apply(lambda x: get_final_preds(x, threshold=threshold), 1)
#     print("Threshold", threshold)
#     evaluate(sub, "final_preds")
    
    

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.4
MRR 0.4760627306487852


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.43000000000000005
MRR 0.4760532718657958


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.4600000000000001
MRR 0.4760289700695002


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.4900000000000001
MRR 0.4760327535826959


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.5200000000000001
MRR 0.4760214030431087


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.5500000000000002
MRR 0.4759624675490984


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.5800000000000002
MRR 0.4758037055146159


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.6100000000000002
MRR 0.47569645746749084


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.6400000000000002
MRR 0.4754466000768344


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.6700000000000003
MRR 0.4749627469469959


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.7000000000000003
MRR 0.4739544406803339


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.7300000000000003
MRR 0.47244249059942484


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.7600000000000003
MRR 0.4708602544849183


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))


Threshold 0.7900000000000004
MRR 0.46724583522508994


In [32]:
test["product_context_similarity"].describe()

count    687192.000000
mean          0.594677
std           0.339222
min           0.000000
25%           0.456010
50%           0.725984
75%           0.845484
max           1.000000
Name: product_context_similarity, dtype: float64

In [1]:
# thresholds = np.arange(0.25, 0.9, 0.01).tolist()

# for t in thresholds:
#     sub["final_preds"] = sub.swifter.apply(lambda x: get_final_preds(x, t), 1)
#     m = evaluate(sub)
#     print(t, '->', m)


In [27]:
sub = sub[["query_id", "product_id", "final_rank"]]
sub = sub.rename(columns={"final_rank": "rank"})
sub.head(12)
# test = test[["query_id", "product_id", "final_rank"]]
# test = test.rename(columns={"final_rank": "rank"})
# test.head(12)

Unnamed: 0,query_id,product_id,rank
0,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,d8a205200d4c2151fd2cf070ff7f44999d7a184faad30f...,3
1,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,3d7014b28491366ce149339689c90331bd2c42e251713a...,6
2,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,5ac7d91135ed2a6d44ec48ec586ceacbab3fc67d4bf319...,2
3,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,998800d85572455b61c0e3eea04141a09e8793893aec9c...,5
4,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,1455bc965260200d68107d43cf6282cbae75de2072a4aa...,4
5,00004e487e05ff29ac41a930f9ed972fe4cbf6a7a17794...,021ec483f69638ebc7d84b65d9bec6bb71f6ae809c87f0...,1
6,0000bd4862d5d5d470bebfe7a7b8049e25a5294d5325e7...,5e3c6a8a2a646fe07ecf653032bce7dca6a10061e74431...,1
7,0000bd4862d5d5d470bebfe7a7b8049e25a5294d5325e7...,26644354812bb4f72309900f74c272241933b5ac7e1645...,5
8,0000bd4862d5d5d470bebfe7a7b8049e25a5294d5325e7...,470af2db4a53e14583dc150d4d1efe800a13bca17d6d6e...,4
9,0000bd4862d5d5d470bebfe7a7b8049e25a5294d5325e7...,00c8a71376217298e89988e8b23a9ae0c197a596a3a8bf...,6


In [28]:
# sub.to_csv("heuristic_XGB_30_6_21_v1.csv", index=False)