In [1]:
# ! pip install fastparquet
# ! pip install ipynb
# ! pip install lightgbm
# ! pip install xgboost
# ! pip install scikit-optimize
# ! pip install lightgbm --install-option=--gpu

In [2]:
# from ipynb.fs.full.data_merge_utils import reduce_mem_usage, merge_data


In [3]:
import pandas as pd
import swifter
import numpy as np
from glob import glob
import datetime, json
import gc
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
import os


In [4]:
import pyximport
pyximport.install(reload_support=True)
from mrr import mrr as mrr_cython

In [5]:
def check_folder(path, point_allowed_path=False):
    split_folder = os.path.split(path)
    if not point_allowed_path:
        if '.' in split_folder[1]:
            # path is a file
            path = split_folder[0]
    if not os.path.exists(path):
        print(f'{path} folder created')
        os.makedirs(path, exist_ok=True)

In [6]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
#         print(col)
        col_type = df[col].dtype
                
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [7]:
def merge_data(train_data, val_data, merge_data_path_list, done_files=[], merge_cols='', flag=0):
    compulsory_cols = ["query_id", "user_id", "session_id", "product_id"]
    for path in merge_data_path_list:
        print("Merging file...", path.split('/')[-1])
        prev_cols_train = set(train_data.columns.tolist())
        prev_cols_val = set(val_data.columns.tolist())
        d = pd.read_csv(path)
#         d = reduce_mem_usage(d)
        if 'is_click' in d.columns.tolist():
            d = d.drop('is_click', 1)
        if flag==0:
            merge_cols = [d.columns[0]]
            
        for col in d.columns.tolist():
            if col in train_data.columns.tolist() and col not in compulsory_cols and col not in merge_cols:
                d = d.drop(col, 1)
            
        train_data = train_data.merge(d, on=merge_cols, how='left')
        val_data = val_data.merge(d, on=merge_cols, how='left')
#         print("Train: ", train_data.shape, "Val: ", val_data.shape)
        done_files.append(path)
        del d
        gc.collect()
#     print("Train: ", train_data.shape, "Val: ", val_data.shape)
    
    return train_data, val_data


In [8]:
train = pd.read_parquet("../data_phase1/train.parquet")
sub = pd.read_parquet("../data_phase1/validation.parquet")
test = pd.read_parquet("../data_phase2/test.parquet")


In [9]:
train = train.fillna(value={"context_type": "NA"})
sub = sub.fillna(value={"context_type": "NA"})
test = test.fillna(value={"context_type": "NA"})


In [10]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [11]:
train = reduce_mem_usage(train)
sub = reduce_mem_usage(sub)
test = reduce_mem_usage(test)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 360.08 MB
Decreased by 15.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 115.77 MB
Decreased by -47.2%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 114.80 MB
Decreased by -46.0%


In [12]:
gc.collect()

93

In [13]:
train.shape, sub.shape

((3507990, 16), (687192, 15))

In [14]:
BASE_PATH = "../preprocessed_data/"

In [15]:
pc_sim_train = pd.read_csv(BASE_PATH+"train_product_context_sim.csv")
pc_sim_sub = pd.read_csv(BASE_PATH+"val_product_context_sim.csv")
pc_sim_test = pd.read_csv(BASE_PATH+"test_product_context_sim.csv")
pc_sim_sub = pc_sim_sub.fillna(0.0)
pc_sim_test = pc_sim_test.fillna(0.0)


In [16]:
pc_sim_test.isnull().sum()

query_id                      0
session_id                    0
user_id                       0
product_id                    0
product_context_similarity    0
dtype: int64

In [17]:
train = train.merge(pc_sim_train, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(pc_sim_sub, on=["query_id", "user_id", "session_id", "product_id"], how="left")
test = test.merge(pc_sim_test, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [18]:
sub = test[["query_id", "product_id", "product_context_similarity"]]

In [19]:
temp = pd.read_csv("../submission/XGBClassifier_test_final.csv")
temp = temp.rename(columns={"rank": "model_rank"})
# temp = temp.drop("rank", 1)
temp.head(2)

Unnamed: 0,query_id,product_id,preds,model_rank
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.505402,6
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.560951,4


In [20]:
# temp1 = pd.read_csv("../submission/XGBClassifier_27_6_21_best_params.csv")
# temp1 = temp1.rename(columns={"preds": "preds2"})
# temp1 = temp1.drop("rank", 1)
# temp1.head(2)

In [21]:
def apply_rank(row, col):
    return (6-row[col])+1


In [22]:
# temp = temp.merge(temp1, on=["query_id", "product_id"], how="left")
# temp["preds"] = (temp["preds1"]+temp["preds2"])/2
# temp["model_rank"] = temp.groupby('query_id')['preds'].rank(method='first').apply(int)
temp["model_rank"] = temp.swifter.apply(lambda x: apply_rank(x, "model_rank"), 1)
# temp = temp.drop(["preds1", "preds2"], 1)
temp.head()

Unnamed: 0,query_id,product_id,preds,model_rank
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.505402,1
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.560951,3
2,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.522968,2
3,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.566412,5
4,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.564779,4


In [23]:
sub = sub.merge(temp, on=["query_id", "product_id"], how="left")
sub = sub.sort_values("query_id")
sub.head()

Unnamed: 0,query_id,product_id,product_context_similarity,preds,model_rank
85679,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.922885,0.505402,1
189379,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.76691,0.560951,3
384222,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.845101,0.522968,2
616173,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.761883,0.566412,5
131171,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.921114,0.564779,4


In [24]:
max_df = pd.DataFrame(sub.groupby(["query_id"])["product_context_similarity"].max()).reset_index()
max_df = max_df.rename(columns={"product_context_similarity": "max_product_context_similarity"})
max_df.head()

Unnamed: 0,query_id,max_product_context_similarity
0,00000996d07006b045bc134b757f2825b0c79870d41521...,0.999976
1,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,0.0
2,000168140d358496df8ebc627efe3bdd586b4c56339cf3...,0.922838
3,0001a2283d6f0ccad5f1dab18451a63d0f8c49bc62b337...,0.922239
4,0002fd07dd0c3bc94d0ae9f39a816d34aff440548fadd7...,0.893655


In [25]:
sub = sub.merge(max_df, on=["query_id"], how="left")
sub.head()

Unnamed: 0,query_id,product_id,product_context_similarity,preds,model_rank,max_product_context_similarity
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.922885,0.505402,1,0.999976
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.76691,0.560951,3,0.999976
2,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.845101,0.522968,2,0.999976
3,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.761883,0.566412,5,0.999976
4,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.921114,0.564779,4,0.999976


In [26]:
sub["pcs_rank"] = sub.groupby('query_id')['product_context_similarity'].rank(method='first').apply(int)
sub["pcs_rank"] = sub.swifter.apply(lambda x: apply_rank(x, "pcs_rank"), 1)
sub.head(6)

Unnamed: 0,query_id,product_id,product_context_similarity,preds,model_rank,max_product_context_similarity,pcs_rank
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.922885,0.505402,1,0.999976,2
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.76691,0.560951,3,0.999976,5
2,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.845101,0.522968,2,0.999976,4
3,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.761883,0.566412,5,0.999976,6
4,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.921114,0.564779,4,0.999976,3
5,00000996d07006b045bc134b757f2825b0c79870d41521...,009fbcce12d627c870fc0b262c5ddd4b67e897d34928bf...,0.999976,0.58349,6,0.999976,1


In [27]:
set(sub.query_id.tolist()) - set(test.query_id.tolist())

set()

In [28]:
def get_final_rank(row, threshold=0.66):
    pcs_rank = row["pcs_rank"]
    model_rank = row["model_rank"]
    return pcs_rank if row["max_product_context_similarity"]>threshold else model_rank


In [29]:
sub["final_rank"] = sub.swifter.apply(lambda x: get_final_rank(x), 1)
# sub["final_rank"] = sub.swifter.apply(lambda x: apply_rank(x, "final_rank"), 1)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=687192.0), HTML(value='')))




In [30]:
sub = sub[["query_id", "product_id", "final_rank"]]
sub = sub.rename(columns={"final_rank": "rank"})
sub.head(12)

Unnamed: 0,query_id,product_id,rank
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,2
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,5
2,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,4
3,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,6
4,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,3
5,00000996d07006b045bc134b757f2825b0c79870d41521...,009fbcce12d627c870fc0b262c5ddd4b67e897d34928bf...,1
6,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,8a45dc566200a885fa5c9246fd0d2a008fdc760e4a3ba9...,6
7,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,3cb32bcf6e87883cfad9579d9c30f5161024f8fbf4c063...,5
8,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,f452b8c917747e477f4d28e0c34f63172f5f0f46a88daf...,3
9,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,ac5a5054a1e66360c95bec079fbbcaa4bc0e7eaa6133b5...,1


In [31]:
sub.to_csv("heuristic_XGB_01_07_21_v3.csv", index=False)

In [32]:
sub.shape

(687192, 3)