In [1]:
# ! pip install fastparquet
# ! pip install ipynb
# ! pip install lightgbm
# ! pip install xgboost
# ! pip install scikit-optimize
# ! pip install lightgbm --install-option=--gpu

In [2]:
# from ipynb.fs.full.data_merge_utils import reduce_mem_usage, merge_data


In [3]:
import pandas as pd
import swifter
import numpy as np
from glob import glob
import datetime, json
import gc
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
import os


In [4]:
import pyximport
pyximport.install(reload_support=True)
from mrr import mrr as mrr_cython

In [5]:
def check_folder(path, point_allowed_path=False):
    split_folder = os.path.split(path)
    if not point_allowed_path:
        if '.' in split_folder[1]:
            # path is a file
            path = split_folder[0]
    if not os.path.exists(path):
        print(f'{path} folder created')
        os.makedirs(path, exist_ok=True)

In [6]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
#         print(col)
        col_type = df[col].dtype
                
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [7]:
def merge_data(train_data, val_data, merge_data_path_list, done_files=[], merge_cols='', flag=0):
    compulsory_cols = ["query_id", "user_id", "session_id", "product_id"]
    for path in merge_data_path_list:
        print("Merging file...", path.split('/')[-1])
        prev_cols_train = set(train_data.columns.tolist())
        prev_cols_val = set(val_data.columns.tolist())
        d = pd.read_csv(path)
#         d = reduce_mem_usage(d)
        if 'is_click' in d.columns.tolist():
            d = d.drop('is_click', 1)
        if flag==0:
            merge_cols = [d.columns[0]]
            
        for col in d.columns.tolist():
            if col in train_data.columns.tolist() and col not in compulsory_cols and col not in merge_cols:
                d = d.drop(col, 1)
            
        train_data = train_data.merge(d, on=merge_cols, how='left')
        val_data = val_data.merge(d, on=merge_cols, how='left')
#         print("Train: ", train_data.shape, "Val: ", val_data.shape)
        done_files.append(path)
        del d
        gc.collect()
#     print("Train: ", train_data.shape, "Val: ", val_data.shape)
    
    return train_data, val_data


In [8]:
train = pd.read_parquet("../data_phase1/train.parquet")
sub = pd.read_parquet("../data_phase1/validation.parquet")


In [9]:
train = train.fillna(value={"context_type": "NA"})
sub = sub.fillna(value={"context_type": "NA"})


In [10]:
train = reduce_mem_usage(train)
sub = reduce_mem_usage(sub)


Memory usage of dataframe is 428.22 MB
Memory usage after optimization is: 360.08 MB
Decreased by 15.9%
Memory usage of dataframe is 78.64 MB
Memory usage after optimization is: 115.77 MB
Decreased by -47.2%


In [11]:
gc.collect()

93

In [12]:
train.shape, sub.shape

((3507990, 16), (687192, 15))

# Data Merge

In [13]:
BASE_PATH = "../preprocessed_data/"

In [14]:
pc_sim_train = pd.read_csv(BASE_PATH+"train_product_context_sim.csv")
pc_sim_sub = pd.read_csv(BASE_PATH+"val_product_context_sim.csv")


In [15]:
train = train.merge(pc_sim_train, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(pc_sim_sub, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [16]:
train = train[["query_id", "product_context_similarity", "is_click"]]
train = train.fillna(0)


In [17]:
train.shape, sub.shape

((3507990, 3), (687192, 16))

In [18]:
def get_pred(row, threshold):
    if row["product_context_similarity"]>=threshold:
        return 1
    return 0


In [19]:
dic = {}

In [20]:
def get_metrics(data):
    threshold_range = np.arange(0.7, 0.9, 0.01).tolist()[1:]
    for threshold in threshold_range:
        data["click_pred"] = data.swifter.apply(lambda x: get_pred(x, threshold), 1)
        y_true = data["is_click"].values
        y_pred = data["click_pred"].values
        precision = precision_score(y_true, y_pred, average='micro')
        recall = recall_score(y_true, y_pred, average='micro')
        f1 = f1_score(y_true, y_pred, average='micro')
        print(f"------------THRESHOLD = {threshold}----------------")
        print("PRECISION: ", precision)
        print("RECALL: ", recall)
        print("F1-Score: ", f1)
        dic[threshold] = {"precision": precision, "recall": recall, "F1": f1}
               

In [21]:
get_metrics(train)

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.71----------------
PRECISION:  0.483003087237991
RECALL:  0.483003087237991
F1-Score:  0.483003087237991


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.72----------------
PRECISION:  0.5009603790204648
RECALL:  0.5009603790204648
F1-Score:  0.5009603790204648


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.73----------------
PRECISION:  0.518387737707348
RECALL:  0.518387737707348
F1-Score:  0.518387737707348


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.74----------------
PRECISION:  0.5306021396868292
RECALL:  0.5306021396868292
F1-Score:  0.5306021396868292


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.75----------------
PRECISION:  0.540317104666775
RECALL:  0.540317104666775
F1-Score:  0.540317104666775


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.76----------------
PRECISION:  0.5462615343829372
RECALL:  0.5462615343829372
F1-Score:  0.5462615343829372


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.77----------------
PRECISION:  0.5749434861558899
RECALL:  0.5749434861558899
F1-Score:  0.5749434861558899


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.78----------------
PRECISION:  0.5801333527176531
RECALL:  0.5801333527176531
F1-Score:  0.5801333527176531


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.79----------------
PRECISION:  0.5950404647675734
RECALL:  0.5950404647675734
F1-Score:  0.5950404647675734


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8----------------
PRECISION:  0.6146226186505663
RECALL:  0.6146226186505663
F1-Score:  0.6146226186505663


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.81----------------
PRECISION:  0.6389234290861718
RECALL:  0.6389234290861718
F1-Score:  0.6389234290861718


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8200000000000001----------------
PRECISION:  0.6513701578396746
RECALL:  0.6513701578396746
F1-Score:  0.6513701578396746


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8300000000000001----------------
PRECISION:  0.6617062762436609
RECALL:  0.6617062762436609
F1-Score:  0.6617062762436609


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8400000000000001----------------
PRECISION:  0.6676170684637072
RECALL:  0.6676170684637072
F1-Score:  0.6676170684637072


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8500000000000001----------------
PRECISION:  0.6955324844141517
RECALL:  0.6955324844141517
F1-Score:  0.6955324844141517


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8600000000000001----------------
PRECISION:  0.6996981177255351
RECALL:  0.6996981177255351
F1-Score:  0.6996981177255351


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8700000000000001----------------
PRECISION:  0.7098740304276808
RECALL:  0.7098740304276808
F1-Score:  0.7098740304276808


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8800000000000001----------------
PRECISION:  0.7240394071818905
RECALL:  0.7240394071818905
F1-Score:  0.7240394071818905


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.8900000000000001----------------
PRECISION:  0.7450693987155037
RECALL:  0.7450693987155037
F1-Score:  0.7450693987155037


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=3507990.0), HTML(value='')))


------------THRESHOLD = 0.9000000000000001----------------
PRECISION:  0.7593567826590155
RECALL:  0.7593567826590155
F1-Score:  0.7593567826590155


In [22]:
json.dump(dic, open("../heuristic_dict_v2.json", "w"))

In [23]:
import matplotlib.pyplot as plt

In [24]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
temp = train[train["is_click"]==0]
temp['binned'] = pd.cut(temp['product_context_similarity'], bins)
temp.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,query_id,product_context_similarity,is_click,click_pred,binned
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,0.0,0,0,
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,0.793411,0,0,"(0.6, 0.8]"
2,263ea1e38126fe0c7bfbff24a33b1a09f4dac4f8cd4bb4...,0.874803,0,0,"(0.8, 1.0]"
3,3727580d84ce2fbe42ff8bc6f732331f65ea659864a04c...,0.760519,0,0,"(0.6, 0.8]"
4,1fcf5d263785455311cecf2f864eaa2eeca4da488383d9...,0.687951,0,0,"(0.6, 0.8]"


In [25]:
temp["binned"].value_counts()

(0.6, 0.8]    1089490
(0.8, 1.0]    1023144
(0.4, 0.6]     320817
(0.2, 0.4]     103846
(0.0, 0.2]       6271
Name: binned, dtype: int64