In [6]:
import pandas as pd
import swifter
import numpy as np
from glob import glob
import datetime, json
import gc
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from sklearn.impute import SimpleImputer
import xgboost as xgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import matplotlib.pyplot as plt
import os, multiprocessing


In [7]:
import pyximport
pyximport.install(reload_support=True)
from mrr import mrr as mrr_cython

In [None]:
pd.options.display.max_rows=None
pd.options.display.max_columns=None


In [8]:
def check_folder(path, point_allowed_path=False):
    split_folder = os.path.split(path)
    if not point_allowed_path:
        if '.' in split_folder[1]:
            # path is a file
            path = split_folder[0]
    if not os.path.exists(path):
        print(f'{path} folder created')
        os.makedirs(path, exist_ok=True)

In [9]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
#         print(col)
        col_type = df[col].dtype
                
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [10]:
def merge_data(data, merge_data_path_list, done_files=[], merge_cols='', flag=0):
    compulsory_cols = ["query_id", "user_id", "session_id", "product_id"]
    for path in merge_data_path_list:
        print("Merging file...", path.split('/')[-1])
        prev_cols_train = set(data.columns.tolist())
#         prev_cols_val = set(val_data.columns.tolist())
        d = pd.read_csv(path)
#         d = reduce_mem_usage(d)
        if 'is_click' in d.columns.tolist():
            d = d.drop('is_click', 1)
        if flag==0:
            merge_cols = [d.columns[0]]
            
        for col in d.columns.tolist():
            if col in data.columns.tolist() and col not in compulsory_cols and col not in merge_cols:
                d = d.drop(col, 1)
            
        data = data.merge(d, on=merge_cols, how='left')
#         val_data = val_data.merge(d, on=merge_cols, how='left')
#         print("Train: ", train_data.shape, "Val: ", val_data.shape)
        done_files.append(path)
        del d
        gc.collect()
#     print("Train: ", train_data.shape, "Val: ", val_data.shape)
    
    return data


In [11]:
train = pd.read_parquet("../data_phase1/train.parquet")
sub = pd.read_parquet("../data_phase1/validation.parquet")
t = pd.read_parquet("../data_phase2/test.parquet")
test = pd.read_csv("../data_phase2/test_labelled.csv")
test = t.merge(test, on=["query_id", "product_id"], how="left")


In [12]:
train = train.fillna(value={"context_type": "NA"})
sub = sub.fillna(value={"context_type": "NA"})
test = test.fillna(value={"context_type": "NA"})


In [13]:
attribute_df = pd.read_parquet("../data_phase1/attributes.parquet")

In [14]:
train = train.merge(attribute_df, on='product_id', how='left')
sub = sub.merge(attribute_df, on='product_id', how='left')
test = test.merge(attribute_df, on='product_id', how='left')


In [16]:
train = reduce_mem_usage(train)
sub = reduce_mem_usage(sub)
test = reduce_mem_usage(test)


Memory usage of dataframe is 802.92 MB
Memory usage after optimization is: 475.06 MB
Decreased by 40.8%
Memory usage of dataframe is 152.04 MB
Memory usage after optimization is: 146.25 MB
Decreased by 3.8%
Memory usage of dataframe is 157.29 MB
Memory usage after optimization is: 145.92 MB
Decreased by 7.2%


In [17]:
gc.collect()

94

In [19]:
train['days_elapsed'] = (train['week'].astype(int)-1)*7 + train['week_day'].astype(int)
sub['days_elapsed'] = (sub['week'].astype(int)-1)*7 + sub['week_day'].astype(int)
test['days_elapsed'] = (test['week'].astype(int)-1)*7 + test['week_day'].astype(int)


# Data Merge

In [20]:
BASE_PATH = "../preprocessed_data/"
merge_paths = list(set(glob(BASE_PATH+"*.csv")) - \
                   set([BASE_PATH+"session_wise_product_attribute_frequency_agg_features.csv", \
                        BASE_PATH+"product_click_features_by_session.csv", \
                        BASE_PATH+"product_material_attribute_one-hot.csv", \
                        BASE_PATH+"train_product_context_sim.csv", BASE_PATH+"val_product_context_sim.csv",
                        BASE_PATH+"test_product_context_sim.csv",
                        BASE_PATH+"user_click_percentage_features.csv"]))



In [24]:
train = merge_data(train, merge_paths)
sub = merge_data(sub, merge_paths)
test = merge_data(test, merge_paths)


Merging file... user_tier_features.csv
Merging file... category_id_l2_freq_data.csv
Merging file... user_country_features.csv
Merging file... main_colour_popularity_df.csv
Merging file... query_wise_start_online_date_features.csv
Merging file... category_id_l2_popularity_df.csv
Merging file... second_colour_frequency.csv
Merging file... collection_popularity_df.csv
Merging file... category_id_l1_freq_data.csv
Merging file... season_year_freq_data.csv
Merging file... brand_id_freq_data.csv
Merging file... user_features.csv
Merging file... global_product_frequency.csv
Merging file... main_colour_frequency.csv
Merging file... unique_user_interactions_per_product.csv
Merging file... season_frequency.csv
Merging file... collection_freq_data.csv
Merging file... user_time_features.csv
Merging file... gender_frequency.csv
Merging file... category_id_l1_popularity_df.csv
Merging file... query_wise_price_features.csv
Merging file... brand_popularity_df.csv
Merging file... second_colour_popularit

In [26]:
train = merge_data(train, [BASE_PATH+"session_wise_product_attribute_frequency_agg_features.csv", \
                                     BASE_PATH+"product_click_features_by_session.csv"], merge_cols=["session_id", "product_id"], flag=1)

sub = merge_data(sub, [BASE_PATH+"session_wise_product_attribute_frequency_agg_features.csv", \
                                     BASE_PATH+"product_click_features_by_session.csv"], merge_cols=["session_id", "product_id"], flag=1)

test = merge_data(test, [BASE_PATH+"session_wise_product_attribute_frequency_agg_features.csv", \
                                     BASE_PATH+"product_click_features_by_session.csv"], merge_cols=["session_id", "product_id"], flag=1)



Merging file... session_wise_product_attribute_frequency_agg_features.csv
Merging file... product_click_features_by_session.csv
Merging file... session_wise_product_attribute_frequency_agg_features.csv
Merging file... product_click_features_by_session.csv
Merging file... session_wise_product_attribute_frequency_agg_features.csv
Merging file... product_click_features_by_session.csv


In [27]:
pc_sim_train = pd.read_csv(BASE_PATH+"train_product_context_sim.csv", error_bad_lines=False)
pc_sim_sub = pd.read_csv(BASE_PATH+"val_product_context_sim.csv")
pc_sim_test = pd.read_csv(BASE_PATH+"test_product_context_sim.csv")

pc_sim_train = pc_sim_train.fillna(0)
pc_sim_sub = pc_sim_sub.fillna(0)
pc_sim_test = pc_sim_test.fillna(0)


In [28]:
pc_sim_train.head()

Unnamed: 0,query_id,session_id,user_id,product_id,product_context_similarity
0,92d4dd491a874a2cf92c8d311a44a42b597c64a5ede23d...,2740b0d77b4e6fafd75321f7d0794210afa8bd650955e7...,e5e4c71b1b9456dafece1338762d4ee3db698cf32c384c...,bf056e3841dd3a358c6aacb1f9e74e4c7c4adc62e33b45...,0.0
1,541a93bd95c3f4127a53e6b0d4b41db55ad9cb9e19d34a...,d5feab37634fd140e85b8f98dcb909a8779b4f0417c73a...,fca847f7eb5a5a21991421354b0f26afb4a517e540541f...,c6513ec49c8e04c265c907933799ff76f24c075c6308c4...,0.793411
2,263ea1e38126fe0c7bfbff24a33b1a09f4dac4f8cd4bb4...,18ae37a1a05faa7fd54818794a1a8e44073e00a56fa05f...,90ea15d8d96a9d3e7ab463d990e5f4565cac9477498d37...,b4d5e28da10318aa7776b364528dc92f83ba45326018b5...,0.874803
3,3727580d84ce2fbe42ff8bc6f732331f65ea659864a04c...,10667a5a6047aa173d13997cdcd996cbdaf9b0149f9655...,8f88d89f2a71e2adf42f885fa6adedd09bf039843b535c...,5a36f600d3c01763c28e2dafc53119fba7bcc6a867ab8f...,0.760519
4,1fcf5d263785455311cecf2f864eaa2eeca4da488383d9...,8b39f00bfc9d45ed6f64dd39a72548936a157f9f03e7f4...,6b21688d90dfd9677fa7979dfc6da5b963c0e1e3d68a38...,314d20e9e9ec3e97d1867c8bf8c6feb0c23d918021e175...,0.687951


In [29]:
train = train.merge(pc_sim_train, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(pc_sim_sub, on=["query_id", "user_id", "session_id", "product_id"], how="left")
test = test.merge(pc_sim_test, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [31]:
train["days_elapsed_since_first_user_action"] = train["days_elapsed"] - train["first_user_action_days_elapsed"]
sub["days_elapsed_since_first_user_action"] = sub["days_elapsed"] - sub["first_user_action_days_elapsed"]
test["days_elapsed_since_first_user_action"] = test["days_elapsed"] - test["first_user_action_days_elapsed"]


In [32]:
train["diff_prod_price_from_user_tier_mean"] = train["product_price"] - train["user_tier_mean_product_price"]
train["diff_prod_start_online_date_from_user_tier_mean"] = train["product_price"] - train["user_tier_mean_product_start_online_date"]

sub["diff_prod_price_from_user_tier_mean"] = sub["product_price"] - sub["user_tier_mean_product_price"]
sub["diff_prod_start_online_date_from_user_tier_mean"] = sub["product_price"] - sub["user_tier_mean_product_start_online_date"]

test["diff_prod_price_from_user_tier_mean"] = test["product_price"] - test["user_tier_mean_product_price"]
test["diff_prod_start_online_date_from_user_tier_mean"] = test["product_price"] - test["user_tier_mean_product_start_online_date"]


In [33]:
train["diff_start_online_date_from_user_tier_clicked_mean"] = train["start_online_date"] - train["user_tier_mean_click_product_price"]
train["diff_start_online_date_from_user_tier_clicked_mean"] = train["start_online_date"] - train["user_tier_mean_click_product_start_online_date"]

sub["diff_start_online_date_from_user_tier_clicked_mean"] = sub["start_online_date"] - sub["user_tier_mean_click_product_price"]
sub["diff_start_online_date_from_user_tier_clicked_mean"] = sub["start_online_date"] - sub["user_tier_mean_click_product_start_online_date"]

test["diff_start_online_date_from_user_tier_clicked_mean"] = test["start_online_date"] - test["user_tier_mean_click_product_price"]
test["diff_start_online_date_from_user_tier_clicked_mean"] = test["start_online_date"] - test["user_tier_mean_click_product_start_online_date"]


In [34]:
train["diff_prod_price_from_session_mean"] = train["product_price"] - train["mean_session_product_price"]
train["diff_prod_price_from_query_mean"] = train["product_price"] - train["max_query_price"]

sub["diff_prod_price_from_session_mean"] = sub["product_price"] - sub["mean_session_product_price"]
sub["diff_prod_price_from_query_mean"] = sub["product_price"] - sub["max_query_price"]

test["diff_prod_price_from_session_mean"] = test["product_price"] - test["mean_session_product_price"]
test["diff_prod_price_from_query_mean"] = test["product_price"] - test["max_query_price"]


In [35]:
dic = json.load(open("../preprocessed_data/average_click_values.json", "r"))


In [37]:
train["diff_prod_price_from_click_mean"] = train["product_price"] - dic["mean_click_price"]
sub["diff_prod_price_from_click_mean"] = sub["product_price"] - dic["mean_click_price"]
test["diff_prod_price_from_click_mean"] = test["product_price"] - dic["mean_click_price"]


In [38]:
train["diff_start_online_date_from_session_mean"] = train["start_online_date"] - train["session_start_online_date_mean"]
train["diff_start_online_date_from_query_mean"] = train["start_online_date"] - train["mean_query_start_online_date"]

sub["diff_start_online_date_from_session_mean"] = sub["start_online_date"] - sub["session_start_online_date_mean"]
sub["diff_start_online_date_from_query_mean"] = sub["start_online_date"] - sub["mean_query_start_online_date"]

test["diff_start_online_date_from_session_mean"] = test["start_online_date"] - test["session_start_online_date_mean"]
test["diff_start_online_date_from_query_mean"] = test["start_online_date"] - test["mean_query_start_online_date"]


In [39]:
train["diff_start_online_date_from_click_mean"] = train["start_online_date"] - dic["mean_click_start_online_date"]
sub["diff_start_online_date_from_click_mean"] = sub["start_online_date"] - dic["mean_click_start_online_date"]
test["diff_start_online_date_from_click_mean"] = test["start_online_date"] - dic["mean_click_start_online_date"]


In [40]:
del attribute_df
gc.collect()


44

In [41]:
train_rank_features = pd.read_csv("../preprocessed_data/rank_features/train_rank_features_and_query_prod_list.csv")


In [42]:
sub_rank_features = pd.read_csv("../preprocessed_data/rank_features/val_rank_features_and_query_prod_list.csv")


In [43]:
test_rank_features = pd.read_csv("../preprocessed_data/rank_features/test_rank_features_and_query_prod_list.csv")


In [44]:
train_rank_features = train_rank_features[['query_id', 'user_id', 'session_id', 'product_id', 'price_rank', \
                                           'start_online_date_rank', 'user_step']]
sub_rank_features = sub_rank_features[['query_id', 'user_id', 'session_id', 'product_id', 'price_rank', \
                                           'start_online_date_rank', 'user_step']]
test_rank_features = test_rank_features[['query_id', 'user_id', 'session_id', 'product_id', 'price_rank', \
                                           'start_online_date_rank', 'user_step']]


In [45]:
train = train.merge(train_rank_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(sub_rank_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")
test = test.merge(test_rank_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [46]:
user_features = pd.read_csv("../preprocessed_data/user_click_percentage_features.csv")
# sub_user_features = pd.read_csv("../preprocessed_data/user_click_percentage_features.csv")


In [47]:
train = train.merge(user_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(user_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")
test = test.merge(user_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [51]:
BASE_PATH = "../preprocessed_data/clickout_features/"

In [52]:
merge_paths = glob(BASE_PATH+"*.csv")
done_files = []


In [53]:
train = merge_data(train, merge_paths)
sub = merge_data(sub, merge_paths)
test = merge_data(test, merge_paths)


Merging file... product_wise_last_clickout_days_elapsed.csv
Merging file... category_id_l2_click_percentage.csv
Merging file... clicked_product_frequency_by_week.csv
Merging file... brand_id_click_percentage.csv
Merging file... product_wise_last_clickout_product_price.csv
Merging file... clicked_product_prices_by_week.csv
Merging file... category_id_l3_click_percentage.csv
Merging file... season_year_click_percentage.csv
Merging file... weekly_mean_product_start_online_date_clicked_out.csv
Merging file... weekly_min_product_start_online_date_clicked_out.csv
Merging file... unique_user_clicks_per_product.csv
Merging file... weekly_max_product_start_online_date_clicked_out.csv
Merging file... main_colour_click_percentage.csv
Merging file... category_id_l1_click_percentage.csv
Merging file... second_colour_click_percentage.csv
Merging file... product_wise_last_clickout_days_elapsed.csv
Merging file... category_id_l2_click_percentage.csv
Merging file... clicked_product_frequency_by_week.cs

In [54]:
is_click = train.is_click.values
train = train.drop(["is_click"], 1)
train["is_click"] = is_click

In [57]:
train = train.sort_values(["query_id"])
sub = sub.sort_values(["query_id"])
test = test.sort_values(["query_id"])


In [58]:
# train_groups = train.groupby("query_id").count().reset_index()["user_id"].values
sub_groups = sub.groupby("query_id").count().reset_index()["user_id"].values
sub_groups.shape

(114532,)

In [59]:
val = train.iloc[-150000:, :]
val_groups = val.groupby("query_id").count().reset_index()["user_id"].values

# train = train.iloc[:3357990]
train_groups = train.groupby("query_id").count().reset_index()["user_id"].values


In [60]:
X=train.copy()
X_val = val.copy()
X_test = test.copy()

In [65]:
# Convert back category to object
for col in sub.columns.tolist():
    if X.dtypes[col]=="O" or "int" in str(X.dtypes[col]) or "float" in str(X.dtypes[col]):
        continue
    if sub.dtypes[col]=="O" or "int" in str(sub.dtypes[col]) or "float" in str(sub.dtypes[col]):
        continue
    if X_val.dtypes[col]=="O" or "int" in str(X_val.dtypes[col]) or "float" in str(X_val.dtypes[col]):
        continue
    if test.dtypes[col]=="O" or "int" in str(test.dtypes[col]) or "float" in str(test.dtypes[col]):
        continue
    if X.dtypes[col]=="category":
        X[col] = X[col].astype(str)
    if sub.dtypes[col]=="category":
        sub[col] = sub[col].astype(str)
    if X_val.dtypes[col]=="category":
        X_val[col] = X_val[col].astype(str)
    if test.dtypes[col]=="category":
        test[col] = test[col].astype(str)
        

In [67]:
full_data = pd.concat([X, X_val, sub], 0)

In [68]:
nan_df = pd.DataFrame(full_data.isnull().sum()).reset_index()
nan_df = nan_df.rename(columns={"index": "feature"})
nan_df["nan_perc"] = (nan_df[0]/len(full_data))*100.0
remove_cols = nan_df[nan_df["nan_perc"]>=30]["feature"].values.tolist()
nan_df[nan_df["nan_perc"]>=30]

Unnamed: 0,feature,0,nan_perc
137,user_brand_click_percentage,3927802,90.394418
138,user_category_id_l1_click_percentage,3814531,87.7876
139,user_category_id_l2_click_percentage,3875324,89.18669
140,user_category_id_l3_click_percentage,3851299,88.633779
141,user_season_click_percentage,3830605,88.157527
164,start_online_date_mean_clicked_out_week_1,3225048,74.221241
165,start_online_date_mean_clicked_out_week_2,1461699,33.639535
171,start_online_date_mean_clicked_out_week_8,1762452,40.561063
172,start_online_date_min_clicked_out_week_1,3225048,74.221241
173,start_online_date_min_clicked_out_week_2,1461699,33.639535


In [69]:
remove_cols = remove_cols + ["week", "week_day", "material_values", "attribute_values"]
remove_cols = list(set(remove_cols) - set(["is_click"]))
remove_cols

['start_online_date_mean_clicked_out_week_1',
 'attribute_values',
 'start_online_date_min_clicked_out_week_8',
 'user_category_id_l2_click_percentage',
 'start_online_date_max_clicked_out_week_1',
 'start_online_date_max_clicked_out_week_2',
 'start_online_date_max_clicked_out_week_8',
 'user_category_id_l3_click_percentage',
 'start_online_date_mean_clicked_out_week_2',
 'start_online_date_mean_clicked_out_week_8',
 'user_brand_click_percentage',
 'material_values',
 'week_day',
 'user_category_id_l1_click_percentage',
 'user_season_click_percentage',
 'start_online_date_min_clicked_out_week_2',
 'week',
 'start_online_date_min_clicked_out_week_1']

# Training Transformations and Code

## Categorical Encoding

In [72]:
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
    

In [73]:
# cat_cols = train.select_dtypes('object').columns.tolist()[4:]

cols = train.columns.tolist()
cat_cols = cols[3:12] + cols[16:25]
cat_cols = cat_cols+['gender']
cat_cols

['product_id',
 'page_type',
 'previous_page_type',
 'device_category',
 'device_platform',
 'user_tier',
 'user_country',
 'context_type',
 'context_value',
 'main_colour',
 'second_colour',
 'season',
 'collection',
 'category_id_l1',
 'category_id_l2',
 'category_id_l3',
 'brand_id',
 'season_year',
 'gender']

In [74]:
cat_encoder = MultiColumnLabelEncoder(columns = cat_cols)


In [75]:
X = cat_encoder.fit_transform(X)

In [76]:
X_val = cat_encoder.transform(X_val)

In [77]:
X_sub = cat_encoder.transform(sub)

In [78]:
X_test = cat_encoder.transform(test)

In [79]:
cat_encoder1 = MultiColumnLabelEncoder(columns = ['gender'])
X = cat_encoder1.fit_transform(X)
X_val = cat_encoder1.transform(X_val)
X_sub = cat_encoder1.transform(X_sub)
X_test = cat_encoder1.transform(X_test)


In [80]:
X_train_ids = X["query_id"].values

In [81]:
remove_cols = remove_cols + ["query_id", "session_id", "user_id"]

In [82]:
# X=train.copy()
y = X['is_click'].values
X = X.drop(remove_cols+['is_click'], 1)

In [83]:
X.head(2)

Unnamed: 0,product_id,page_type,previous_page_type,device_category,device_platform,user_tier,user_country,context_type,context_value,product_price,gender,main_colour,second_colour,season,collection,category_id_l1,category_id_l2,category_id_l3,brand_id,season_year,start_online_date,days_elapsed,user_tier_mean_product_price,user_tier_max_product_price,user_tier_min_product_price,user_tier_mean_product_start_online_date,user_tier_max_product_start_online_date,user_tier_min_product_start_online_date,user_tier_mean_click_product_price,user_tier_max_click_product_price,user_tier_min_click_product_price,user_tier_mean_click_product_start_online_date,user_tier_max_click_product_start_online_date,user_tier_min_click_product_start_online_date,category_id_l2_perc,user_country_mean_product_price,user_country_max_product_price,user_country_min_product_price,user_country_mean_product_start_online_date,user_country_max_product_start_online_date,user_country_min_product_start_online_date,user_country_mean_click_product_price,user_country_max_click_product_price,user_country_min_click_product_price,user_country_mean_click_product_start_online_date,user_country_max_click_product_start_online_date,user_country_min_click_product_start_online_date,main_colour_impression_count,main_colour_click_count,main_colour_popularity_score,mean_query_start_online_date,max_query_start_online_date,min_query_start_online_date,category_id_l2_impression_count,category_id_l2_click_count,category_id_l2_popularity_score,second_colour_perc,collection_impression_count,collection_click_count,collection_popularity_score,category_id_l1_perc,season_year_perc,brand_id_perc,user_mean_interacted_product_price,user_max_interacted_product_price,user_min_interacted_product_price,user_country_mean_interacted_product_start_online_date,user_max_interacted_product_start_online_date,user_min_interacted_product_start_online_date,user_mean_clicked_product_price,user_max_clicked_product_price,user_min_clicked_product_price,user_mean_clicked_product_start_online_date,user_max_clicked_product_start_online_date,user_min_clicked_product_start_online_date,global_product_frequency,global_product_percentage,#impressions_for_product,main_colour_perc,#unique_users_interacted,season_perc,collection_perc,first_user_action_days_elapsed,single_session_user_flag,gender_perc,category_id_l1_impression_count,category_id_l1_click_count,category_id_l1_popularity_score,mean_query_price,max_query_price,min_query_price,brand_impression_count,brand_click_count,brand_popularity_score,second_colour_impression_count,second_colour_click_count,second_colour_popularity_score,session_season_frequency,session_collection_frequency,session_category_id_l1_frequency,session_category_id_l2_frequency,session_brand_id_frequency,main_colour_freq_in session,second_colour_freq_in session,gender_freq_in session,session_start_online_date_mean,session_start_online_date_max,session_start_online_date_min,mean_session_start_online_date,count_session_items,product_session_frequency,product_session_click_proportion,session_impression_count,mean_session_product_price,max_session_product_price,min_session_product_price,product_context_similarity,days_elapsed_since_first_user_action,diff_prod_price_from_user_tier_mean,diff_prod_start_online_date_from_user_tier_mean,diff_start_online_date_from_user_tier_clicked_mean,diff_prod_price_from_session_mean,diff_prod_price_from_query_mean,diff_prod_price_from_click_mean,diff_start_online_date_from_session_mean,diff_start_online_date_from_query_mean,diff_start_online_date_from_click_mean,price_rank,start_online_date_rank,user_step,last_clickout_days_elapsed,percentage_category_id_l2_click,click_frequency_week_1,click_frequency_week_2,click_frequency_week_3,click_frequency_week_4,click_frequency_week_5,click_frequency_week_6,click_frequency_week_7,click_frequency_week_8,percentage_brand_id_click,last_clickout_product_price,clickout_product_mean_price_week_1,clickout_product_mean_price_week_2,clickout_product_mean_price_week_3,clickout_product_mean_price_week_4,clickout_product_mean_price_week_5,clickout_product_mean_price_week_6,clickout_product_mean_price_week_7,clickout_product_mean_price_week_8,percentage_category_id_l3_click,percentage_season_year_click,start_online_date_mean_clicked_out_week_3,start_online_date_mean_clicked_out_week_4,start_online_date_mean_clicked_out_week_5,start_online_date_mean_clicked_out_week_6,start_online_date_mean_clicked_out_week_7,start_online_date_min_clicked_out_week_3,start_online_date_min_clicked_out_week_4,start_online_date_min_clicked_out_week_5,start_online_date_min_clicked_out_week_6,start_online_date_min_clicked_out_week_7,#unique_users_clicked,start_online_date_max_clicked_out_week_3,start_online_date_max_clicked_out_week_4,start_online_date_max_clicked_out_week_5,start_online_date_max_clicked_out_week_6,start_online_date_max_clicked_out_week_7,percentage_main_colour_click,percentage_category_id_l1_click,percentage_second_colour_click
1362479,49382,0,16,1,0,4,177,4,123289,0.000466,3,9,15,4,3,24,149,462,127,15,573.0,41,0.001904,1.0,0.0,425.663298,3214.041667,0.0,0.001987,0.934,3e-06,426.511957,3214.041667,120.0,0.752292,0.001128,0.934,3e-06,414.871604,3214.041667,0.0,0.001143,0.934,3e-06,413.281893,3214.041667,125.0,82970,11843,0.142738,310.194444,573.041667,133.0,31560,4424.0,0.140177,84.251267,1574209,227540,0.144542,4.653481,18.76822,0.259917,0.000721,0.001765,0.000165,334.546875,1365.041667,133.0,0.000719,0.001238,0.00036,270.229167,477.0,156.0,4,9.5e-05,4,1.977745,4,63.919515,37.524212,36,1,55.442219,195222,27131,0.138975,0.00099,0.001765,0.000465,10904,1640.0,0.150404,3534494,506813.0,0.143391,12,12,12,12,12,1,12,12,314.947917,573.041667,133.0,314.947917,12,1,0.0,2,0.001093,0.001816,0.000465,0.377172,5,-0.001438,-425.662832,146.488043,-0.000627,-0.001299,-0.001322,258.052083,262.805556,158.25,2,6,3,31.0,0.726028,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.268729,0.000508,0.0,0.0,0.000509,0.0,0.000508,0.000466,0.0,0.0,50.157327,18.639563,573.041667,,573.041667,573.041667,,573.041667,,573.041667,573.041667,,1,573.041667,,573.041667,573.041667,,1.964344,4.531199,84.426809
2993879,156947,0,16,1,0,4,177,4,123289,0.001068,3,2,15,4,1,20,145,536,565,19,262.0,41,0.001904,1.0,0.0,425.663298,3214.041667,0.0,0.001987,0.934,3e-06,426.511957,3214.041667,120.0,5.117633,0.001128,0.934,3e-06,414.871604,3214.041667,0.0,0.001143,0.934,3e-06,413.281893,3214.041667,125.0,429952,62540,0.145458,310.194444,573.041667,133.0,214694,32820.0,0.152869,84.251267,2070924,296997,0.143413,18.875868,68.638977,0.213125,0.000721,0.001765,0.000165,334.546875,1365.041667,133.0,0.000719,0.001238,0.00036,270.229167,477.0,156.0,4,9.5e-05,4,10.248709,3,63.919515,49.364342,36,1,55.442219,791877,119682,0.151137,0.00099,0.001765,0.000465,8941,1302.0,0.145621,3534494,506813.0,0.143391,12,12,12,12,12,1,12,12,314.947917,573.041667,133.0,314.947917,12,1,0.0,2,0.001093,0.001816,0.000465,0.807692,5,-0.000836,-425.66223,-164.511957,-2.5e-05,-0.000697,-0.00072,-52.947917,-48.194444,-152.75,4,3,3,41.0,5.409223,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.216791,0.001068,0.0,0.0,0.0,0.0,0.0,0.001068,0.001068,0.0,2.390571,69.095123,,,,262.041667,262.041667,,,,262.041667,262.041667,1,,,,262.041667,262.041667,10.356073,19.85114,84.426809


In [84]:
nan_df_filter = pd.DataFrame(X.isnull().sum()).reset_index()
nan_df_filter = nan_df_filter.rename(columns={"index": "feature"})
nan_df_filter["nan_perc"] = (nan_df_filter[0]/len(X))*100.0
nan_df_filter = nan_df_filter[nan_df_filter["nan_perc"]>0]
filter_nan_cols = nan_df_filter["feature"].values.tolist()


In [86]:
dic = {}

for col in filter_nan_cols:
    if col=='product_context_similarity':
        X[col] = X[col].astype(float)
        X[col] = X[col].fillna(0)    
    else:
        X[col] = X[col].astype(float)
        X[col] = X[col].fillna(X[col].mean())
        dic[col] = X[col].mean()
    

In [87]:
# json.dump(dic, open("../train_meta/column_mean_imputation.json", "w"))

In [88]:
X=reduce_mem_usage(X)
X_val=reduce_mem_usage(X_val)
X_sub=reduce_mem_usage(X_sub)
X_test=reduce_mem_usage(X_test)


Memory usage of dataframe is 4523.09 MB
Memory usage after optimization is: 1184.30 MB
Decreased by 73.8%
Memory usage of dataframe is 215.58 MB
Memory usage after optimization is: 77.00 MB
Decreased by 64.3%
Memory usage of dataframe is 986.97 MB
Memory usage after optimization is: 326.13 MB
Decreased by 67.0%
Memory usage of dataframe is 987.62 MB
Memory usage after optimization is: 381.17 MB
Decreased by 61.4%


In [89]:
# X_val = val.copy()
X_val_ids = X_val["query_id"].values
y_val = X_val['is_click'].values
X_val = X_val.drop(remove_cols+['is_click'], 1)
for col in filter_nan_cols:
    if col=='product_context_similarity':
        X_val[col] = X_val[col].astype(float)
        X_val[col] = X_val[col].fillna(0)   
    else:
        X_val[col] = X_val[col].astype(float)
        X_val[col] = X_val[col].fillna(dic[col])


In [90]:
# X_sub = sub.copy()
X_sub_ids = X_sub["query_id"].values
X_sub = X_sub.drop(remove_cols, 1)
for col in filter_nan_cols:
    if col=='product_context_similarity':
        X_sub[col] = X_sub[col].astype(float)
        X_sub[col] = X_sub[col].fillna(0)   
    else:
        X_sub[col] = X_sub[col].astype(float)
        X_sub[col] = X_sub[col].fillna(dic[col])


In [91]:
y_test = X_test['is_click'].values

X_test_ids = X_test["query_id"].values
X_test = X_test.drop(remove_cols, 1)
for col in filter_nan_cols:
    if col=='product_context_similarity':
        X_test[col] = X_test[col].astype(float)
        X_test[col] = X_test[col].fillna(0)   
    else:
        X_test[col] = X_test[col].astype(float)
        X_test[col] = X_test[col].fillna(dic[col])


In [99]:
X_test = X_test.drop(["is_click"], 1)

In [93]:
FIXED_PARAMS={'boosting_type':'gbdt',
             'objective': 'binary:logistic',
             'metric': 'ndcg',
             'is_unbalance':True,
             'bagging_freq':10,
             'boosting':'gbdt',
             'num_boost_round':300,
             'early_stopping_rounds':30,
#              'scale_pos_weight': sample_pos_weight,
             'categorical_feature': cat_cols,
             'importance_type': 'split',
             'random_state': None,
#              'device_type': 'gpu',
             'n_jobs': -1,
             'silent': False,
             'print_every': 100000,
             'first_only': True}



In [94]:
_best_so_far = 0

def callbak(obj):
    global _best_so_far
    print("$$", obj)
    if obj.evaluation_result_list[0][1] > _best_so_far:
        _best_so_far = obj.evaluation_result_list[0][1]
#         if _best_so_far > 0.6765:
        print('xgboost iteration {} mrr is {}'. format(obj.iteration, _best_so_far))
        

In [95]:
def compute_mean_reciprocal_rank(rs):
    '''
    rs: 2d array
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    '''

    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


In [96]:
def evaluate(val_df, clf):
#     val_df['scores'] = clf.predict(d_val)
    grouped_val = val_df.groupby('query_id')
    rss = []
    for _, group in grouped_val:

        scores = group.scores
        sorted_arg = np.flip(np.argsort(scores))
        rss.append( group['label'].values[sorted_arg])
        
    mrr = compute_mean_reciprocal_rank(rss)
    return mrr



In [101]:
d_train = xgb.DMatrix(data=X, label=y, silent=True, nthread=-1, feature_names=X.columns.tolist())
d_val = xgb.DMatrix(data=X_val, label=y_val, silent=True, nthread=-1, feature_names=X.columns.tolist())
d_sub = xgb.DMatrix(X_sub, nthread=-1, feature_names=X.columns.tolist())
d_test = xgb.DMatrix(X_test, nthread=-1, feature_names=X_test.columns.tolist())


In [102]:
# params_dict = {'learning_rate': 0.05, 
#                'booster': 'gbtree', 
#                'tree_method': 'gpu_hist', 
#                'max_leaves': 425, 
#                'max_depth': 15, 
#                'nthread': 11, 
#                'subsample': 0.6540872693621809, 
#                'colsample_bytree': 0.6268987054552499, 
#                'colsample_bylevel': 0.5047724070285782, 
#                'min_child_weight': 13, 
#                'reg_alpha': 2.7949851499014, 
#                'reg_lambda': 7.104006290199863, 
#                'scale_pos_weight': 25, 
#                'objective': 'binary:logistic', 
#                'eval_metric': 'logloss', 
#                'random_state': 5478, 
#                'verbosity': 0}


params_dict = {'learning_rate': 0.001, 
               'booster': 'gbtree', 
               'tree_method': 'gpu_hist', 
               'max_leaves': 50, 
               'max_depth': 192, 
               'metrics': ['logloss', 'auc', 'error', 'rmse'], 
               'nthread': 11, 
               'subsample': 0.7127215313565193, 
               'colsample_bytree': 0.8438805661596704, 
               'colsample_bylevel': 0.36748400898771283, 
               'min_child_weight': 6, 
               'reg_alpha': 100.0, 
               'reg_lambda': 51.71977329460361, 
               'scale_pos_weight': 35, 
               'objective': 'binary:logistic', 
               'eval_metric': ['logloss', 'auc', 'error', 'rmse'], 
               'random_state': 5478, 
               'verbosity': 0}


In [104]:
clf = xgb.Booster()
clf.load_model("../model_meta/XGBClassifier/full_val/2021-07-09_18:50_0.4107/XGBClassifier.txt")


In [105]:
# val_preds = clf.predict(d_val)
test_preds = clf.predict(d_test)

In [106]:
# train_preds = clf.predict(d_train)
# temp_val = pd.DataFrame({"query_id": X_val_ids, "scores": val_preds, "label": y_val})
temp_test = pd.DataFrame({"query_id": X_test_ids, "scores": test_preds, "label": y_test})
evaluate(temp_test, clf)



0.40123167906494833

In [108]:
sub_preds = clf.predict(d_test)

In [111]:
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [112]:
sub_preds = [sigmoid(each) for each in sub_preds.tolist()]

In [114]:
# sub['preds'] = sub_preds
test['preds'] = sub_preds

In [116]:
# submission = sub[["query_id", "product_id", "product_context_similarity", "preds"]]
submission = test[["query_id", "product_id", "preds"]]
submission.head(2)

Unnamed: 0,query_id,product_id,preds
85679,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.506083
189379,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.595668


In [117]:
submission["rank"] = submission.groupby('query_id')['preds'].rank(method='first').apply(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [118]:
# submission = submission.drop(["preds"], 1).reset_index()
submission.head(100).reset_index(drop=True)

Unnamed: 0,query_id,product_id,preds,rank
0,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.506083,1
1,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.595668,3
2,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.522683,2
3,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.598409,5
4,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.605272,6
5,00000996d07006b045bc134b757f2825b0c79870d41521...,009fbcce12d627c870fc0b262c5ddd4b67e897d34928bf...,0.597356,4
6,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,8a45dc566200a885fa5c9246fd0d2a008fdc760e4a3ba9...,0.595362,5
7,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,3cb32bcf6e87883cfad9579d9c30f5161024f8fbf4c063...,0.595901,6
8,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,f452b8c917747e477f4d28e0c34f63172f5f0f46a88daf...,0.587538,3
9,00009794cfc2ea322c5b4969570ea541e1f63c0ec91e84...,ac5a5054a1e66360c95bec079fbbcaa4bc0e7eaa6133b5...,0.504301,1


In [119]:
# submission = submission.drop(["preds"], 1)

In [120]:
def apply_rank(row):
    return (6-row['rank'])+1

submission["rank"] = submission.swifter.apply(lambda x: apply_rank(x), 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [121]:
submission.head(6)

Unnamed: 0,query_id,product_id,preds,rank
85679,00000996d07006b045bc134b757f2825b0c79870d41521...,eec511e70b3423d23bf07b10f7710f7d3d6bd24f2d00ee...,0.506083,6
189379,00000996d07006b045bc134b757f2825b0c79870d41521...,b7ce11578ac78da80b8afbe384dd5a90ed269e83ecb325...,0.595668,4
384222,00000996d07006b045bc134b757f2825b0c79870d41521...,d5a043852bc53331c656b9102334e90f2e5238f638062b...,0.522683,5
616173,00000996d07006b045bc134b757f2825b0c79870d41521...,3eebef491448c1c1e38d696538b3e38245a63a6896f717...,0.598409,2
131171,00000996d07006b045bc134b757f2825b0c79870d41521...,cc6ce8a7f2a3c3216597d7813a221cabf0f531690c8147...,0.605272,1
12390,00000996d07006b045bc134b757f2825b0c79870d41521...,009fbcce12d627c870fc0b262c5ddd4b67e897d34928bf...,0.597356,3


In [122]:
submission.to_csv("../submission/XGBClassifier_test_full_val_10_07_21.csv", index=False)