In [None]:
import pandas as pd
import swifter
import numpy as np
from glob import glob
import datetime, json
import gc
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from sklearn.impute import SimpleImputer
import xgboost as xgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import os, multiprocessing
import random


In [None]:
import pyximport
pyximport.install(reload_support=True)
from mrr import mrr as mrr_cython

In [None]:
# Number of parameter settings the Bayesian search samples (used as n_iter of BayesSearchCV).
ITERATIONS = 5000

In [None]:
def check_folder(path, point_allowed_path=False):
    """Create the directory that ``path`` refers to if it does not exist yet.

    Args:
        path: A directory path, or a file path whose parent directory is needed.
        point_allowed_path: When False (default), a last path component that
            contains a '.' is treated as a file name and only its parent
            directory is created. When True, ``path`` is always treated as a
            directory, even if its last component contains a dot.

    Returns:
        None. A message is printed when a directory is created.
    """
    if not point_allowed_path:
        parent, leaf = os.path.split(path)
        if '.' in leaf:
            # path is a file
            path = parent
    if not path:
        # A bare file name has no directory part: it lives in the current
        # working directory, and os.makedirs('') would raise.
        return
    if not os.path.exists(path):
        print(f'{path} folder created')
        os.makedirs(path, exist_ok=True)

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Shrink a DataFrame in place by downcasting its columns.

    Handling per column dtype:
      * signed / unsigned NumPy integers -> smallest integer type of the same
        signedness whose range contains the column's min and max;
      * NumPy floats -> float16 (if ``use_float16``) or float32 when the
        column's range fits strictly inside that type's range;
      * object / string columns -> ``category``;
      * everything else (bool, datetime, existing categoricals, nullable
        extension dtypes) is left untouched.

    Args:
        df: DataFrame to optimise. It is modified in place.
        use_float16: Allow float16 as a target type. float16 keeps only about
            three significant decimal digits, so pass False when that loss of
            precision matters.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # deep=True on both measurements so object columns are counted before and after.
    start_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if isinstance(col_type, pd.CategoricalDtype):
            # Already compact; min()/max() would raise on unordered categories.
            continue
        if not isinstance(col_type, np.dtype):
            # pandas extension dtype: only string-like ones are converted.
            if pd.api.types.is_string_dtype(col_type):
                df[col] = df[col].astype('category')
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in 'iu':
            c_min, c_max = df[col].min(), df[col].max()
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type limit still fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min, c_max = df[col].min(), df[col].max()
            for candidate in float_types:
                bounds = np.finfo(candidate)
                # Strict bounds keep +/-inf and all-NaN columns at their current width.
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        # bool, datetime, timedelta, complex, ...: nothing to downcast.

    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [None]:
def merge_data(train_data, val_data, merge_data_path_list, done_files=None, merge_cols='', flag=0):
    """Left-join a list of CSV feature files onto the train and validation frames.

    For every file: an ``is_click`` column is discarded, columns that already
    exist in ``train_data`` are dropped from the file (unless they are one of
    the id columns or a join key), and the remainder is merged onto both frames.

    Args:
        train_data: Training DataFrame.
        val_data: Validation DataFrame.
        merge_data_path_list: Paths of the CSV files to merge, in order.
        done_files: Optional list that receives each path after it is merged.
            A fresh list is used when omitted.
        merge_cols: Join key(s), used only when ``flag`` is non-zero.
        flag: 0 (default) joins every file on its own first column and ignores
            ``merge_cols``; any other value joins on ``merge_cols``.

    Returns:
        Tuple ``(train_data, val_data)`` with the new feature columns attached.
    """
    if done_files is None:
        # A literal [] default would be shared by every call of the function.
        done_files = []
    compulsory_cols = ["query_id", "user_id", "session_id", "product_id"]
    for path in merge_data_path_list:
        print("Merging file...", path.split('/')[-1])
        d = pd.read_csv(path)
        if 'is_click' in d.columns:
            d = d.drop(columns='is_click')
        if flag == 0:
            merge_cols = [d.columns[0]]

        # Normalise to a list so membership is by column name, not substring.
        key_cols = [merge_cols] if isinstance(merge_cols, str) else list(merge_cols)
        protected = set(compulsory_cols) | set(key_cols)
        existing = set(train_data.columns)
        duplicated = [col for col in d.columns if col in existing and col not in protected]
        if duplicated:
            d = d.drop(columns=duplicated)

        train_data = train_data.merge(d, on=merge_cols, how='left')
        val_data = val_data.merge(d, on=merge_cols, how='left')
        done_files.append(path)
        del d
        gc.collect()

    return train_data, val_data


In [None]:
# Show every row and column when a frame is displayed (no truncation).
pd.options.display.max_rows=None
pd.options.display.max_columns=None

In [None]:
# Load the phase-1 training split; the validation split is kept under the name `sub`.
train = pd.read_parquet("../data_phase1/train.parquet")
sub = pd.read_parquet("../data_phase1/validation.parquet")


In [None]:
# Replace missing context_type values with the literal string "NA".
train = train.fillna(value={"context_type": "NA"})
sub = sub.fillna(value={"context_type": "NA"})


In [None]:
# Attribute table; it is joined on product_id in the next cell.
attribute_df = pd.read_parquet("../data_phase1/attributes.parquet")

In [None]:
# Left joins keep every train/sub row even when a product has no attribute record.
train = train.merge(attribute_df, on='product_id', how='left')
sub = sub.merge(attribute_df, on='product_id', how='left')


In [None]:
train.head(2)

In [None]:
# Downcast numeric columns and convert object columns to categoricals.
train = reduce_mem_usage(train)
sub = reduce_mem_usage(sub)


In [None]:
gc.collect()

In [None]:
train.shape, sub.shape

In [None]:
# Day index derived from week and week_day: (week - 1) * 7 + week_day.
train['days_elapsed'] = (train['week'].astype(int)-1)*7 + train['week_day'].astype(int)
sub['days_elapsed'] = (sub['week'].astype(int)-1)*7 + sub['week_day'].astype(int)


# Data Merge

In [None]:
BASE_PATH = "../preprocessed_data/"

# Files that must not go through the bulk "join on first column" merge.
excluded_paths = {
    BASE_PATH+"session_wise_product_attribute_frequency_agg_features.csv",
    BASE_PATH+"product_click_features_by_session.csv",
    BASE_PATH+"product_material_attribute_one-hot.csv",
    BASE_PATH+"train_product_context_sim.csv",
    BASE_PATH+"val_product_context_sim.csv",
    BASE_PATH+"test_product_context_sim.csv",
    BASE_PATH+"user_click_percentage_features.csv",
}

# Sorted so the merge order, and therefore the resulting column order, is the
# same on every run; a set difference and glob both yield an unspecified order.
merge_paths = sorted(set(glob(BASE_PATH+"*.csv")) - excluded_paths)



In [None]:
# Merge every file in merge_paths; with the default flag=0 each file is joined on its first column.
train, sub = merge_data(train, sub, merge_paths)


In [None]:
# Deep memory footprint of both frames, in GiB.
train.memory_usage(deep=True).sum()/(1024*1024*1024), sub.memory_usage(deep=True).sum()/(1024*1024*1024)

In [None]:
# These two files are joined on the (session_id, product_id) pair instead of their first column.
train, sub = merge_data(train, sub, [BASE_PATH+"session_wise_product_attribute_frequency_agg_features.csv", \
                                     BASE_PATH+"product_click_features_by_session.csv"], merge_cols=["session_id", "product_id"], flag=1)



In [None]:
# Product/context similarity tables, one file per split.
# NOTE(review): error_bad_lines=False is passed for the train file only, and newer pandas
# releases replace this argument with on_bad_lines="skip" — confirm against the pandas version in use.
pc_sim_train = pd.read_csv(BASE_PATH+"train_product_context_sim.csv", error_bad_lines=False)
pc_sim_sub = pd.read_csv(BASE_PATH+"val_product_context_sim.csv")

# Fill every missing value in the similarity tables with 0.
pc_sim_train = pc_sim_train.fillna(0)
pc_sim_sub = pc_sim_sub.fillna(0)


In [None]:
pc_sim_train.head()

In [None]:
# Join on the full (query_id, user_id, session_id, product_id) key.
train = train.merge(pc_sim_train, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(pc_sim_sub, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [None]:
# Difference between the row's days_elapsed and first_user_action_days_elapsed.
train["days_elapsed_since_first_user_action"] = train["days_elapsed"] - train["first_user_action_days_elapsed"]
sub["days_elapsed_since_first_user_action"] = sub["days_elapsed"] - sub["first_user_action_days_elapsed"]


In [None]:
# Gap between each product's own value and the matching user_tier_mean_* column.
# The date feature starts from start_online_date; it used to subtract the
# tier's mean date from product_price, mixing two unrelated quantities.
for frame in (train, sub):
    frame["diff_prod_price_from_user_tier_mean"] = frame["product_price"] - frame["user_tier_mean_product_price"]
    frame["diff_prod_start_online_date_from_user_tier_mean"] = frame["start_online_date"] - frame["user_tier_mean_product_start_online_date"]


In [None]:
# Gap between each product's own value and the matching user_tier_mean_click_* column.
# The price-based feature used to be written to the same column name as the
# date-based one and was overwritten immediately; it now has its own column,
# computed from product_price to mirror the user_tier_mean features.
for frame in (train, sub):
    frame["diff_prod_price_from_user_tier_clicked_mean"] = frame["product_price"] - frame["user_tier_mean_click_product_price"]
    frame["diff_start_online_date_from_user_tier_clicked_mean"] = frame["start_online_date"] - frame["user_tier_mean_click_product_start_online_date"]


In [None]:
# Price gap versus the session mean price and versus a per-query reference price.
# NOTE(review): the "..._from_query_mean" feature subtracts max_query_price, not a mean —
# confirm whether a mean-price column was intended.
train["diff_prod_price_from_session_mean"] = train["product_price"] - train["mean_session_product_price"]
train["diff_prod_price_from_query_mean"] = train["product_price"] - train["max_query_price"]

sub["diff_prod_price_from_session_mean"] = sub["product_price"] - sub["mean_session_product_price"]
sub["diff_prod_price_from_query_mean"] = sub["product_price"] - sub["max_query_price"]


In [None]:
# Averages loaded from average_click_values.json; the context manager closes
# the file handle, which a bare open() inside json.load() never did.
with open("../preprocessed_data/average_click_values.json", "r") as average_click_file:
    dic = json.load(average_click_file)


In [None]:
# Price gap versus the single scalar stored under "mean_click_price" in dic.
train["diff_prod_price_from_click_mean"] = train["product_price"] - dic["mean_click_price"]
sub["diff_prod_price_from_click_mean"] = sub["product_price"] - dic["mean_click_price"]


In [None]:
# start_online_date gap versus the session mean and the query mean.
train["diff_start_online_date_from_session_mean"] = train["start_online_date"] - train["session_start_online_date_mean"]
train["diff_start_online_date_from_query_mean"] = train["start_online_date"] - train["mean_query_start_online_date"]

sub["diff_start_online_date_from_session_mean"] = sub["start_online_date"] - sub["session_start_online_date_mean"]
sub["diff_start_online_date_from_query_mean"] = sub["start_online_date"] - sub["mean_query_start_online_date"]


In [None]:
# start_online_date gap versus the scalar stored under "mean_click_start_online_date" in dic.
train["diff_start_online_date_from_click_mean"] = train["start_online_date"] - dic["mean_click_start_online_date"]
sub["diff_start_online_date_from_click_mean"] = sub["start_online_date"] - dic["mean_click_start_online_date"]


In [None]:
# The attribute table has already been merged into train/sub; release it.
del attribute_df
gc.collect()


In [None]:
train_rank_features = pd.read_csv("../preprocessed_data/rank_features/train_rank_features_and_query_prod_list.csv")


In [None]:
sub_rank_features = pd.read_csv("../preprocessed_data/rank_features/val_rank_features_and_query_prod_list.csv")


In [None]:
# Keep only the join key and the three feature columns that are merged below.
train_rank_features = train_rank_features[['query_id', 'user_id', 'session_id', 'product_id', 'price_rank', \
                                           'start_online_date_rank', 'user_step']]
sub_rank_features = sub_rank_features[['query_id', 'user_id', 'session_id', 'product_id', 'price_rank', \
                                           'start_online_date_rank', 'user_step']]


In [None]:
train = train.merge(train_rank_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(sub_rank_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [None]:
# NOTE(review): both frames are read from the same CSV — confirm that this single
# file is meant to serve the training and the validation rows.
train_user_features = pd.read_csv("../preprocessed_data/user_click_percentage_features.csv")
sub_user_features = pd.read_csv("../preprocessed_data/user_click_percentage_features.csv")


In [None]:
train = train.merge(train_user_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")
sub = sub.merge(sub_user_features, on=["query_id", "user_id", "session_id", "product_id"], how="left")


In [None]:
# From here on BASE_PATH points at the clickout feature directory.
BASE_PATH = "../preprocessed_data/clickout_features/"

In [None]:
# Sorted because glob returns files in an unspecified order; a fixed merge
# order keeps the resulting column order identical across runs.
merge_paths = sorted(glob(BASE_PATH+"*.csv"))
done_files = []


In [None]:
# Join every clickout feature file on its first column (flag defaults to 0).
# NOTE(review): done_files is not passed here, so the list created in the previous cell stays empty.
train, sub = merge_data(train, sub, merge_paths)


In [None]:
# Move the label to the last column position. `columns=` replaces the
# positional axis argument, which pandas 2 no longer accepts.
is_click = train["is_click"].values
train = train.drop(columns=["is_click"])
train["is_click"] = is_click

In [None]:
# Deep memory footprint of both frames in whole GiB (floor division).
train.memory_usage(deep=True).sum()//(1024*1024*1024), sub.memory_usage(deep=True).sum()//(1024*1024*1024)

In [None]:
# Order rows by query_id so that the rows of one query are contiguous.
train = train.sort_values(["query_id"])
sub = sub.sort_values(["query_id"])


In [None]:
# Number of rows per query, in groupby (sorted query_id) order. size() counts
# rows directly; count()["user_id"] would skip rows with a missing user_id and
# aggregates every column of the frame just to read one of them.
train_groups = train.groupby("query_id").size().values
sub_groups = sub.groupby("query_id").size().values
sub_groups.shape

In [None]:
# train_groups = train_sample.groupby("query_id").count().reset_index()["user_id"].values


In [None]:
# Work on a copy; the original `train` frame is deleted a few cells later.
X=train.copy()


In [None]:
# Convert category columns back to plain strings in both frames.
# Each frame is handled on its own: the previous loop skipped a column as soon
# as either frame held it as object, which left the other frame's categorical
# version unconverted. Missing values become the string "nan", as before.
for frame in (X, sub):
    for col in frame.select_dtypes(include="category").columns:
        frame[col] = frame[col].astype(str)
        

In [None]:
# Stack train and validation rows; axis is passed by keyword because pandas 2
# accepts only `objs` positionally.
full_data = pd.concat([X, sub], axis=0)

In [None]:
# Percentage of missing values per column over the stacked train + validation rows.
nan_df = pd.DataFrame(full_data.isnull().sum()).reset_index()
nan_df = nan_df.rename(columns={"index": "feature"})
nan_df["nan_perc"] = (nan_df[0]/len(full_data))*100.0
# Columns with at least 30% missing values are scheduled for removal.
remove_cols = nan_df[nan_df["nan_perc"]>=30]["feature"].values.tolist()
nan_df[nan_df["nan_perc"]>=30]

In [None]:
# Also remove week / week_day (already combined into days_elapsed) and two raw value columns.
remove_cols = remove_cols + ["week", "week_day", "material_values", "attribute_values"]
remove_cols

In [None]:
X.memory_usage(deep=True).sum()/(1024*1024*1024)

In [None]:
# X holds a full copy, so the original frame can be released.
del train

In [None]:
gc.collect()

# Training Transformations and Code

## Categorical Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns with codes learned once in ``fit``.

    ``fit`` stores the sorted distinct values of every target column and
    ``transform`` maps values onto their position in that list, so the same
    label receives the same integer in every frame that is transformed.
    Labels that were not present during ``fit`` are encoded as -1. Missing
    values get their own code (one past the last class) when they occurred
    during ``fit`` and -1 otherwise.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode
        self.encoders_ = {}  # column name -> (sorted classes, NaN seen during fit)

    @staticmethod
    def _learn_classes(series):
        """Return ``(sorted distinct non-null values, whether NaN occurred)``."""
        classes = np.unique(np.asarray(series.dropna()))
        return classes, bool(series.isna().any())

    @staticmethod
    def _encode(series, classes, nan_seen):
        """Map ``series`` onto integer codes defined by ``classes``."""
        # Categorical assigns -1 to anything outside `categories`, NaN included.
        codes = pd.Categorical(series, categories=classes).codes.astype(np.int64)
        if nan_seen:
            codes[series.isna().to_numpy()] = len(classes)
        return codes

    def _target_columns(self, X):
        """Columns to encode: ``self.columns`` or, if unset, every column of X."""
        return list(X.columns) if self.columns is None else list(self.columns)

    def fit(self,X,y=None):
        """Learn the label -> code mapping of every target column from ``X``."""
        self.encoders_ = {
            col: self._learn_classes(X[col]) for col in self._target_columns(X)
        }
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using the
        mappings learned in fit(). If no columns specified, transforms all
        columns in X. Returns an encoded copy; X itself is not modified.

        A column without a fitted mapping (transform() called before fit())
        is encoded from its own values, which is what every call used to do.
        '''
        output = X.copy()
        fitted = getattr(self, "encoders_", {})
        for col in self._target_columns(output):
            spec = fitted.get(col)
            if spec is None:
                spec = self._learn_classes(output[col])
            output[col] = self._encode(output[col], *spec)
        return output

    def fit_transform(self,X,y=None):
        """Fit on ``X`` and return its encoded copy."""
        return self.fit(X,y).transform(X)
    

In [None]:
# cat_cols = train.select_dtypes('object').columns.tolist()[4:]

# Categorical columns are selected by position (columns 3-11 and 16-24), plus 'gender'.
# NOTE(review): positional slices pick different columns if the column order of X changes —
# verify the list displayed below.
cols = X.columns.tolist()
cat_cols = cols[3:12] + cols[16:25]
cat_cols = cat_cols+['gender']
cat_cols

In [None]:
cat_encoder = MultiColumnLabelEncoder(columns = cat_cols)


In [None]:
X = cat_encoder.fit_transform(X)

In [None]:
X_sub = cat_encoder.transform(sub)

In [None]:
# 'gender' is already part of cat_cols, so this encodes it a second time.
cat_encoder1 = MultiColumnLabelEncoder(columns = ['gender'])
X = cat_encoder1.fit_transform(X)
X_sub = cat_encoder1.transform(X_sub)


In [None]:
# Identifier columns are dropped together with the high-NaN columns.
remove_cols = remove_cols + ["query_id", "session_id", "user_id"]

In [None]:
# X=train.copy()
# Separate the label from the features. `columns=` replaces the positional
# axis argument, which pandas 2 no longer accepts.
y = X['is_click'].values
X = X.drop(columns=remove_cols + ['is_click'])

In [None]:
X.head(2)

In [None]:
# Columns of X that still contain at least one missing value.
nan_df_filter = pd.DataFrame(X.isnull().sum()).reset_index()
nan_df_filter = nan_df_filter.rename(columns={"index": "feature"})
nan_df_filter["nan_perc"] = (nan_df_filter[0]/len(X))*100.0
nan_df_filter = nan_df_filter[nan_df_filter["nan_perc"]>0]
filter_nan_cols = nan_df_filter["feature"].values.tolist()


In [None]:
# Column name -> mean used for imputation.
# NOTE(review): this rebinds `dic`, which earlier held the values loaded from
# average_click_values.json; re-running the cells that read those keys will fail afterwards.
dic = {}

# product_context_similarity is filled with 0; every other column is filled with its own mean.
for col in filter_nan_cols:
    if col=='product_context_similarity':
        X[col] = X[col].astype(float)
        X[col] = X[col].fillna(0)    
    else:
        X[col] = X[col].astype(float)
        X[col] = X[col].fillna(X[col].mean())
        # Computed after the fill; filling with the mean leaves the column mean unchanged.
        dic[col] = X[col].mean()
    

In [None]:
# json.dump(dic, open("../train_meta/column_mean_imputation.json", "w"))

In [None]:
# NOTE(review): X_sub has not had remove_cols dropped or NaNs imputed in the cells shown here —
# confirm it is aligned with X before it is used for prediction.
X=reduce_mem_usage(X)
X_sub=reduce_mem_usage(X_sub)


In [None]:
# Sanity check: the group sizes should add up to the number of rows in X.
X.shape, y.shape, sum(train_groups)

In [None]:
def compute_mean_reciprocal_rank(rs):
    '''
    Mean reciprocal rank over a collection of relevance lists.

    Each element of ``rs`` is one ranked list of relevance flags. Its score is
    1 / (1-based position of the first non-zero flag), or 0 when the list has
    no non-zero flag. The result is the mean of those scores.

    rs: iterable of 1d sequences (rows may have different lengths)
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> compute_mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> compute_mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> compute_mean_reciprocal_rank(rs)
    0.75
    '''
    reciprocal_ranks = []
    for relevance in rs:
        # Positions of all relevant items in this ranking, in ascending order.
        hit_positions = np.asarray(relevance).nonzero()[0]
        if hit_positions.size:
            reciprocal_ranks.append(1. / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0.)
    return np.mean(reciprocal_ranks)


In [None]:
# Bayesian hyperparameter search over an XGBoost binary classifier.
# Candidates are scored by ROC-AUC on a shuffled, stratified 3-fold split,
# and ITERATIONS parameter settings are sampled.
bayes_cv_tuner = BayesSearchCV(
    # One thread per model; parallelism comes from the n_jobs = 3 of the search itself.
    # NOTE(review): `silent` is replaced by `verbosity` in newer XGBoost releases — confirm for the installed version.
    estimator = xgb.XGBClassifier(
        n_jobs = 1,
        objective = 'binary:logistic',
        eval_metric = 'auc',
        silent=1,
        tree_method='approx'
    ),

    # Two-element tuples are (low, high) bounds; three-element tuples add the prior.
    search_spaces = {
        'learning_rate': (0.001, 0.4, 'log-uniform'),
        'min_child_weight': (0, 15),
        'max_depth': (10, 200),
        'max_leaves': (10, 200),
        'subsample': (0.6, 1.0, 'uniform'),
        'colsample_bytree': (0.3, 1.0, 'uniform'),
        'colsample_bylevel': (0.3, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'scale_pos_weight': (1, 35)
    },    
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 3,
    n_iter = ITERATIONS,   
    verbose = 0,
    # Refit the best configuration on the full data once the search ends.
    refit = True,
    random_state = 42
)

In [None]:
def status_print(optim_result):
    """Status callback during Bayesian hyperparameter search.

    Prints the number of models evaluated so far together with the best score
    and parameters, then rewrites the CSV holding all cross-validation results.
    Reads the module-level ``bayes_cv_tuner``.

    Args:
        optim_result: Value handed over by the search when it invokes the
            callback; it is not used here.
    """
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # Report progress and the best configuration found so far
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

    # Save all model results (the file is overwritten on every call)
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name+"_cv_results.csv")

In [None]:
# Run the search on the training features; status_print is passed as the fit callback.
result = bayes_cv_tuner.fit(X, y, callback=status_print)