# My script in Instacart Market Basket Analysis

In [None]:
import jnius_config
jnius_config.set_classpath('.', 'JavaLibrary2/build/classes/')

import gc
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as plt
import timeit
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load


# Seed NumPy's global random number generator so repeated runs draw the same numbers.
np.random.seed(98)

def load_data(path_data):
    """Read the seven Instacart CSV files located under ``path_data``.

    ``path_data`` is prepended verbatim to every file name, so it has to be
    either empty (current directory) or end with a path separator.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(priors, train, orders, products, aisles, departments,
        sample_submission)``, in that order.
    """
    # Order line items.  The prior and train files share the same columns,
    # read with compact integer dtypes.
    line_item_dtypes = {
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8,
    }
    priors = pd.read_csv(path_data + 'order_products__prior.csv',
                         dtype=line_item_dtypes)
    train = pd.read_csv(path_data + 'order_products__train.csv',
                        dtype=line_item_dtypes)

    # Order headers: eval_set tells which set (prior, train, test) an order
    # belongs to, and order_number is its position in the user's history.
    order_dtypes = {
        'order_id': np.int32,
        'user_id': np.int64,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32,
    }
    orders = pd.read_csv(path_data + 'orders.csv', dtype=order_dtypes)

    # Product catalogue: only the id columns are loaded.
    product_dtypes = {
        'product_id': np.uint32,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8,
    }
    product_columns = ['product_id', 'aisle_id', 'department_id']
    products = pd.read_csv(path_data + 'products.csv',
                           dtype=product_dtypes,
                           usecols=product_columns)

    # Small lookup tables and the submission template use default dtypes.
    aisles = pd.read_csv(path_data + "aisles.csv")
    departments = pd.read_csv(path_data + "departments.csv")
    sample_submission = pd.read_csv(path_data + "sample_submission.csv")

    return (priors, train, orders, products,
            aisles, departments, sample_submission)

# Load every input table; the empty prefix means the CSV files are read from
# the current working directory.
path_data = ''
priors, train, orders, products, aisles, departments, sample_submission = load_data(path_data)

# Report the size of each table as a quick sanity check.
print('priors:',priors.shape)
print('train:',train.shape)
print('orders:',orders.shape)
print('products:',products.shape)
print('aisles:',aisles.shape)
print('departments:',departments.shape)
print('sample submission',sample_submission.shape)

# Replace missing gaps with 60 days.
# NOTE(review): presumably the gap is missing on each user's first order -- confirm.
orders.days_since_prior_order = orders.days_since_prior_order.fillna(60)

### Select part or all of the data for feature engineering

In [None]:
# Attach user_id to the train line items; `orders` maps each order_id to its user.
train = train.merge(right=orders[['order_id','user_id']],on='order_id')

# user_id is attached to priors only so that the optional sub-sampling
# variants (commented out below) can filter rows by user.

priors = priors.merge(right=orders[['order_id','user_id']],on='order_id')

# Full-data run: no user filter is applied, so the helper column is dropped again.

priors.drop('user_id',axis=1,inplace=True)
priors.head()

# This is for performance


# orders=orders[orders.user_id>186000]
# train=train[train.user_id>186000]
# priors=priors[priors.user_id>186000]
# priors.drop('user_id',axis=1,inplace=True)
# priors.head()

# This is for feature engineering study

# orders=orders[orders.user_id>196000]
# train=train[train.user_id>196000]
# priors=priors[priors.user_id>196000]
# priors.drop('user_id',axis=1,inplace=True)
# priors.head()


# sample_users = orders[orders.eval_set!='prior'].user_id
# sample_users = sample_users.sample(frac=0.03,random_state=98)
# sample_users = pd.DataFrame(sample_users)

# orders = orders[orders.isin({'user_id': sample_users.user_id.tolist()}).user_id==True]
# priors = priors[priors.isin({'user_id': sample_users.user_id.tolist()}).user_id==True]
# train = train[train.isin({'user_id': sample_users.user_id.tolist()}).user_id==True]

# priors.drop('user_id',axis=1,inplace=True)
# priors.head()


In [None]:
class tick_tock:
    """Context manager that announces a step and prints how long it took.

    Parameters
    ----------
    process_name : str
        Label used in the begin/end messages.
    verbose : int or bool
        When falsy, nothing is printed and no start time is recorded.
    """

    def __init__(self, process_name, verbose=1):
        self.verbose = verbose
        self.process_name = process_name

    def __enter__(self):
        # Silent mode: skip both the message and the timestamp.
        if not self.verbose:
            return
        print(self.process_name + " begin ......")
        self.begin_time = time.time()

    def __exit__(self, type, value, traceback):
        # Nothing was recorded on entry in silent mode, so nothing to report.
        if not self.verbose:
            return
        finished_at = time.time()
        print(self.process_name + " end ......")
        elapsed = finished_at - self.begin_time
        print('time lapsing {0} s \n'.format(elapsed))
            
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    '''Group ``df`` by one or more columns and aggregate several columns at once.

       Parameters
       ----------
       df: pandas dataframe
          Features matrix
       group_columns_list: list
          Columns to group by; may contain several columns
       agg_dict: python dictionary
          Aggregation spec passed straight to ``groupby(...).agg``, e.g.
          ``{real_column_name: {your_new_column_name: method}}`` or
          ``{real_column_name: [method, ...]}``
       only_new_feature: bool
          If true, return only the group keys plus the new statistics;
          otherwise left-merge the statistics back onto a copy of ``df``

       Return
       ------
       new pandas dataframe (``df`` itself is never modified)

       Raises
       ------
       TypeError
          If ``group_columns_list`` is not a list

       Example
       -------
       agg_dict = {'user_id':{'prod_tot_cnts':'count'},
                   'reordered':{'reorder_tot_cnts_of_this_prod':'sum'},
                   'user_buy_product_times': {'prod_order_once':lambda x: sum(x==1),
                                              'prod_order_more_than_once':lambda x: sum(x==2)}}
       ka_add_groupby_features_1_vs_n(train, ['product_id'], agg_dict)

       NOTE(review): the nested ``{column: {new_name: method}}`` form relies on
       pandas' nested-renamer aggregation, which newer pandas releases reject --
       confirm the pandas version this script is pinned to.
    '''
    # Validate before starting the timer so a bad call fails fast with a
    # TypeError that names the offending argument.
    if not isinstance(group_columns_list, list):
        raise TypeError("group_columns_list should be a list")

    with tick_tock("add stats features"):
        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped.agg(agg_dict)
        # The aggregated columns are two-level (source column, statistic name);
        # keep only the statistic-name level.
        the_stats.columns = the_stats.columns.droplevel(0)
        the_stats.reset_index(inplace=True)
        if only_new_feature:
            df_new = the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')

    return df_new

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
    '''Group ``df`` by one or more columns and compute several statistics of one column.

       Parameters
       ----------
       df: pandas dataframe
          Features matrix
       group_columns_list: list
          Columns to group by; may contain several columns
       target_columns_list: list
          Column to compute statistics on; a list with exactly one element
       methods_list: list
          Aggregation methods accepted by pandas ``groupby(...).agg``
       keep_only_stats: bool
          If true, return only the group keys plus the statistics; otherwise
          left-merge the statistics back onto a copy of ``df``
       verbose: int or bool
          Forwarded to ``tick_tock``; falsy silences the timing messages

       Return
       ------
       new pandas dataframe; each statistic column is named
       ``_<joined group columns>_<method>_by_<target column>``

       Raises
       ------
       TypeError
          If any of the three ``*_list`` arguments is not a list

       Example
       -------
       ka_add_groupby_features_n_vs_1(train, group_columns_list=['x0'],
                                      target_columns_list=['x10'], methods_list=['mean'])
    '''
    with tick_tock("add stats features", verbose):
        list_arguments = {"group_columns_list": group_columns_list,
                          "target_columns_list": target_columns_list,
                          "methods_list": methods_list}
        for argument_name, argument_value in list_arguments.items():
            if not isinstance(argument_value, list):
                raise TypeError(argument_name + " should be a list")

        grouped_name = ''.join(group_columns_list)
        target_name = ''.join(target_columns_list)
        stat_columns = ['_%s_%s_by_%s' % (grouped_name, method_name, target_name)
                        for method_name in methods_list]

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped[target_name].agg(methods_list).reset_index()
        # reset_index yields one column per grouping key, so the keys keep
        # their own names: this matches the column count for multi-key
        # groupings and keeps the merge keys below available.
        the_stats.columns = list(group_columns_list) + stat_columns
        if keep_only_stats:
            return the_stats
        return pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')

# Engineer product specific features :

In [None]:
# Join every prior line item with the order-level columns of `orders`.
priors_orders_detail = orders.merge(right=priors, how='inner', on='order_id')
# NOTE(review): orders.days_since_prior_order is already filled with 60 right
# after loading, so this second fill looks redundant -- confirm before removing.
priors_orders_detail.days_since_prior_order = priors_orders_detail.days_since_prior_order.fillna(60)

# Preview the first rows of the joined table.
print(priors_orders_detail[:5])

In [None]:
# Largest add_to_cart_order value of each order, stored as max_cart.
temp = priors_orders_detail.groupby('order_id').agg({'add_to_cart_order' : 'max'}).reset_index()
temp.rename(index=str,columns={'add_to_cart_order':'max_cart'},inplace=True)
print(temp[:5])
priors_orders_detail = priors_orders_detail.merge(temp,on='order_id',how='left')
# relative_cart: cart position divided by the order's largest cart position.
priors_orders_detail['relative_cart']=priors_orders_detail.add_to_cart_order/priors_orders_detail.max_cart
# max_cart was only needed to compute the ratio.
priors_orders_detail.drop('max_cart',axis=1,inplace=True)
priors_orders_detail.head()

In [None]:

# Product-level features ------------------------------------------------------

# _user_buy_product_times: 1-based running count of the rows seen so far for
# each (user_id, product_id) pair, following the current row order.
priors_orders_detail.loc[:,'_user_buy_product_times'] = priors_orders_detail.groupby(['user_id', 'product_id']).cumcount() + 1

# Aggregation spec, written as {source_column: {new_feature_name: aggregation}}:
# _prod_tot_cnts: number of line items containing the product
# _prod_unique_users: number of distinct users who bought the product
# _prod_reorder_tot_cnts / _prod_average_reorder: sum / mean of the reordered flag
# _prod_average_reorder_excl_first: reordered rows divided by rows whose order_number > 1
# _prod_buy_first_time_total_cnt: rows where _user_buy_product_times == 1
# _prod_buy_second_time_total_cnt: rows where _user_buy_product_times == 2
# NOTE(review): _prod_buy_more_than_once_total_cnt counts x >= 1, which is every
# row; the name suggests x >= 2 was intended. Left unchanged because
# _prod_conversion below is derived from it -- confirm the intent.
agg_dict = {'user_id':{'_prod_tot_cnts':'count',
                       '_prod_unique_users': lambda x: x.nunique()},
            'reordered':{'_prod_reorder_tot_cnts':'sum',
                         '_prod_average_reorder':'mean',
                         # x.index holds row labels of priors_orders_detail (the helper
                         # groups a copy with the same index), so label-based .loc is
                         # the direct replacement for the deprecated .ix indexer.
                         '_prod_average_reorder_excl_first':
                         lambda x: sum(priors_orders_detail.loc[x.index,'reordered']==1)/
                         (sum(priors_orders_detail.loc[x.index,'order_number'] > 1))},
            'days_since_prior_order':{'_prod_average_need_time':'mean'},
            'order_dow':{'_prod_average_order_dow':'mean',
                         '_prod_std_order_dow':'std'},
            'order_number':{'_prod_average_order_number':'mean'},
            'order_hour_of_day':{'_prod_average_order_hour_of_day':'mean',
                                 '_prod_std_order_hour_of_day':'std'},
            'add_to_cart_order':{'_prod_average_add_to_cart_order':'mean',
                                 '_prod_std_add_to_cart_order':'std'},
            'relative_cart': {'_prod_relative_cart':'mean'},
            '_user_buy_product_times': {'_prod_average_user_buy_product_times':'mean',
                                        '_prod_std_user_buy_product_times':'std',
                                        '_prod_buy_first_time_total_cnt':lambda x: sum(x==1),
                                        '_prod_buy_second_time_total_cnt':lambda x: sum(x==2),
                                        '_prod_buy_more_than_once_total_cnt':lambda x: sum(x>=1)}}
prd = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id'], agg_dict)

# Ratios derived from the aggregates above:
# _prod_reorder_prob: second-time purchases / first-time purchases
# _prod_reorder_ratio: reordered line items / all line items
# _prod_reorder_times: 1 + reordered line items / first-time purchases
# _prod_conversion: first-time purchases / _prod_buy_more_than_once_total_cnt
prd['_prod_reorder_prob'] = prd._prod_buy_second_time_total_cnt / prd._prod_buy_first_time_total_cnt
prd['_prod_reorder_ratio'] = prd._prod_reorder_tot_cnts / prd._prod_tot_cnts
prd['_prod_reorder_times'] = 1 + prd._prod_reorder_tot_cnts / prd._prod_buy_first_time_total_cnt
prd['_prod_conversion'] = prd._prod_buy_first_time_total_cnt / prd._prod_buy_more_than_once_total_cnt

print(prd.shape)
print(prd.head())


In [None]:
print(priors_orders_detail.shape)
# Largest order_number per user; the helper returns it in a column named 'max'.
agg_dict ={'order_number':['max']}
priors_orders_detail_tmp = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict)
priors_orders_detail_tmp.rename(index=str,columns={'max':'us_last_order_number'},inplace=True)
# Repeat the per-user value on every line item of that user.
priors_orders_detail = priors_orders_detail.merge(right=priors_orders_detail_tmp,on='user_id')


In [None]:
# Add aisle_id and department_id to the product features; the shape is
# printed before and after to show the effect of the inner join.
print(prd.shape)
prd = prd.merge(right=products, how='inner', on='product_id')
print(prd.shape)
prd.head()

In [None]:
# Add word2vec features:
product_vector_df = pd.read_csv('product_vectors_25.csv')
product_vector_df.head()

prd = prd.merge(right=product_vector_df,on=['product_id'],how='left')
print(prd.shape)
prd.head()
del product_vector_df

# User features

In [None]:
# Order-level user features, computed on the prior rows of `orders`:
# _user_total_orders: largest order_number among the user's prior orders
# _user_average_order_dow / _user_std_order_dow: mean / std of the order day of week
# _user_average_order_hour_of_day / _user_std_order_hour_of_day: same for the order hour
# _user_sum_days_since_prior_order: sum of the gaps between orders; this is
#   computed on `orders` because priors_orders_detail has several rows per order
# _user_average_days_since_prior_order: (sum of gaps + 1) / (number of prior orders - 1)
agg_dict_2 = {'order_number':{'_user_total_orders':'max'},
              'order_dow':{'_user_average_order_dow':'mean',
                           '_user_std_order_dow':'std'},
              'order_hour_of_day':{'_user_average_order_hour_of_day':'mean',
                                   '_user_std_order_hour_of_day':'std'},
              'days_since_prior_order':{'_user_sum_days_since_prior_order':'sum',
                                        '_user_average_days_since_prior_order':
                                        lambda x: (sum(x)+1)/(len(x)-1)}}

users = ka_add_groupby_features_1_vs_n(orders[orders.eval_set == 'prior'], ['user_id'], agg_dict_2)

# Line-item-level user features, computed on priors_orders_detail:
# _user_reorder_ratio: reordered rows divided by rows whose order_number > 1
# _user_average_user_buy_product_times: mean of _user_buy_product_times
# _user_total_products: number of line items bought by the user
# _user_distinct_products: number of distinct products bought by the user
agg_dict_3 = {'reordered':
              # x.index holds row labels of priors_orders_detail, so label-based
              # .loc is the direct replacement for the deprecated .ix indexer.
              {'_user_reorder_ratio':
               lambda x: sum(priors_orders_detail.loc[x.index,'reordered']==1)/
                         sum(priors_orders_detail.loc[x.index,'order_number'] > 1)},
              '_user_buy_product_times': {'_user_average_user_buy_product_times':'mean'},
              'product_id':{'_user_total_products':'count',
                            '_user_distinct_products': lambda x: x.nunique()}}


us = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict_3)
# No explicit key: the merge uses the columns both frames share.
users = users.merge(us, how='inner')

# _user_average_basket: average number of line items per order.
users['_user_average_basket'] = users._user_total_products / users._user_total_orders

# Each user's non-prior (train/test) order and the gap preceding it.
us = orders[orders.eval_set != "prior"][['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
# Assign the renamed frame instead of renaming in place, because `us` is a
# selection taken from `orders`.
us = us.rename(index=str, columns={'days_since_prior_order': 'time_since_last_order'})

users = users.merge(us, how='inner')

print(users.shape)
users[:5]

In [None]:
print(orders[orders.eval_set!='prior'].shape)
# Add day of week, hour and preceding gap of each user's non-prior (train/test)
# order; the renamed columns mark them as attributes of the order being predicted.
users = users.merge(right=orders[orders.eval_set!='prior'][['user_id',
                                  'order_dow','order_hour_of_day',
                                  'days_since_prior_order']],on='user_id',how='left')
users.rename(columns={'order_dow':'user_predicted_order_dow',
                          'order_hour_of_day':'user_predicted_order_hour_of_day',
                          'days_since_prior_order':'user_predicted_days_since_prior_order'}, inplace=True)


print(users.shape)

users.head()

In [None]:
# Add word2vec features:
user_vector_df = pd.read_csv('user_vectors_20.csv')
print(user_vector_df[:5])

users = users.merge(right=user_vector_df,on=['user_id'],how='left')
print(users.shape)
users.head()
del user_vector_df

In [None]:
# order_trend: per user, the slope of a linear fit of days_since_prior_order
# against order position, skipping the user's first order. The sign tells
# whether the gaps between the user's orders are growing or shrinking.
# All rows of `orders` are used (no eval_set filter is applied here).
from scipy.stats import linregress
temp2 = orders.sort_values(['user_id','order_number'])

# linregress(...)[0] is the slope; x[1:] drops the first order of each user.
temp2 = temp2.groupby('user_id').agg({'days_since_prior_order':
                                      lambda x:linregress(list(range(len(x[1:]))),x[1:])[0]})

temp2.rename(index=str,columns={'days_since_prior_order':'order_trend'},inplace=True)
temp2.reset_index(inplace=True)
# The rename above turned the user_id index into strings; convert it back for the merge.
temp2.user_id = temp2.user_id.astype(int)
users = users.merge(right=temp2,on='user_id',how='left')
del temp2
print(users.columns)
users.head()

In [None]:
# user_none_rate: share of a user's prior orders (order_number 1 excluded)
# in which no line item was a reorder.
# Step 1: per (user, order), the max of `reordered` is 1 if any item was reordered.
temp5 = priors_orders_detail[priors_orders_detail.order_number!=1].groupby(['user_id','order_id']).agg({'reordered': 'max'}).reset_index()
# Step 2: per user, one minus the mean of that flag.
temp5 =  temp5.groupby(['user_id']).agg({'reordered':lambda x: 1-np.mean(x)})
temp5.reset_index(inplace=True)
temp5.rename(index=str,columns={'reordered':'user_none_rate'},inplace=True)
users = users.merge(right=temp5,on=['user_id'],how='left')
users.head()
del temp5

In [None]:
# Plot the distribution of user_none_rate as a 20-bin histogram.
users['user_none_rate'].hist(bins=20)

In [None]:
# Compare every user's vector with the average vector of the users whose
# user_none_rate exceeds 0.3.
# NOTE(review): scipy's cosine() returns a distance (1 - cosine similarity),
# although the resulting column is named *_sim.

from scipy.spatial.distance import cosine
from scipy.spatial.distance import euclidean

start_time = timeit.default_timer()

# Names of the twenty user-vector columns, uv_1 .. uv_20.
uv_columns = ['uv_%d' % position for position in range(1, 21)]

# Users above the none-rate threshold, saved to disk for inspection.
none_users = users[users.user_none_rate > 0.3]
none_users.to_csv('none_users.csv')

# Column-wise mean vector of those users.
none_user_means = none_users[uv_columns].mean(axis=0)
print(none_user_means.tolist())
print(type(none_user_means))

# Each row arrives as a raw array: position 0 is user_id, 1..20 the vector.
none_user_vector = none_user_means.tolist()
temp2 = pd.DataFrame(
    users[['user_id'] + uv_columns].apply(
        lambda row: cosine(row[1:21], none_user_vector), axis=1, raw=True),
    columns=['user_none_user_sim'])

temp2.head()
users['user_none_user_sim'] = temp2['user_none_user_sim']
del temp2

# Database

In [None]:
# User x product features, one row per (user_id, product_id) pair seen in the priors:
# _up_order_count: number of line items of this product for this user
# _up_first_order_number / _up_last_order_number: smallest / largest
#   order_number in which the user bought the product
# _up_average_cart_position: mean add_to_cart_order of the product
# the remaining entries are means of the named source column
agg_dict_4 = {'order_number':{'_up_order_count': 'count',
                              '_up_first_order_number': 'min',
                              '_up_last_order_number':'max'},
              'order_dow':{'_up_average_order_dow':'mean'},
              'reordered':{'_up_average_reordered':'mean'},
              'order_hour_of_day':{'_up_average_order_hour_of_day':'mean'},
              'add_to_cart_order':{'_up_average_cart_position': 'mean'},
              'relative_cart':{'_up_relative_cart': 'mean'},
              'days_since_prior_order':{'_up_average_days_since_prior_order':
                                        'mean'}}

data = ka_add_groupby_features_1_vs_n(df=priors_orders_detail,
                                      group_columns_list=['user_id', 'product_id'],
                                      agg_dict=agg_dict_4,only_new_feature=True)

print(priors_orders_detail.shape)
print(data.shape)

# Attach the product-level and user-level feature tables.
data = data.merge(prd, how='inner', on='product_id').merge(users, how='inner', on='user_id')


# _up_order_rate: purchases of the product / the user's total orders
# _up_order_since_last_order: orders placed since the product was last bought
# _up_order_rate_since_first_order: purchases of the product / orders placed
#   from the first purchase of the product onwards
data['_up_order_rate'] = data._up_order_count / data._user_total_orders
data['_up_order_since_last_order'] = data._user_total_orders - data._up_last_order_number
data['_up_order_rate_since_first_order'] = data._up_order_count / (data._user_total_orders - data._up_first_order_number + 1)


print(data[:5])
print(data.columns)

# Prediction label: `reordered` from the train line items. Pairs that do not
# appear in the train set get 0.
data = data.merge(train[['user_id', 'product_id', 'reordered']], on=['user_id', 'product_id'], how='left')
# Assign the filled column explicitly; an in-place fillna on an
# attribute-accessed column is not guaranteed to write back into `data`.
data['reordered'] = data['reordered'].fillna(0)
# release Memory



In [None]:
from scipy.spatial.distance import cosine
from scipy.spatial.distance import euclidean

start_time = timeit.default_timer()

# Names of the twenty user-vector columns, uv_1 .. uv_20, and of their
# suffixed versions produced by the merge below.
uv_columns = ['uv_%d' % position for position in range(1, 21)]
uv_user_columns = [name + '_x' for name in uv_columns]
uv_typical_columns = [name + '_y' for name in uv_columns]

# "Typical user" of a product: mean vector of the users whose
# _up_order_rate for that product exceeds 0.65.
typical_user_means = data[data._up_order_rate>0.65].groupby('product_id').agg(
    {name: 'mean' for name in uv_columns})
typical_user_means.reset_index(inplace=True)
print(type(typical_user_means))
print(typical_user_means.shape)

# Put each row's own user vector (_x) next to the typical-user vector of its
# product (_y), in a fixed column order so the positional slices below hold.
temp = data[['user_id','product_id'] + uv_columns].merge(right = typical_user_means,on='product_id',how='left')
temp = temp[['user_id','product_id'] + uv_user_columns + uv_typical_columns]
print(temp.shape)
print(temp[:5])

# Raw row layout: 0-1 ids, 2-21 the user's vector, 22-41 the typical vector.
# NOTE(review): scipy's cosine() returns a distance (1 - cosine similarity),
# although the resulting column is named *_sim.
typical_user_distance = temp.apply(lambda x: cosine(x[2:22],x[22:42]),axis=1,raw=True)

# Rows without a result (e.g. products with no typical user) get 1. The filled
# values are assigned explicitly rather than through an in-place fillna on an
# attribute-accessed column, which is not guaranteed to write back.
data['user_typical_user_sim'] = typical_user_distance.fillna(1)


print(timeit.default_timer() - start_time )

del typical_user_distance,temp

In [None]:
# Add the order_streak feature read from order_streaks.csv, keyed by
# (user_id, product_id).
order_streaks = pd.read_csv('order_streaks.csv')
order_streaks.sort_values(['user_id','product_id'],inplace=True)
# Spot check of a single user's rows (notebook display only).
order_streaks[order_streaks.user_id==196002].head()
# Left join: pairs without a streak entry keep their row.
data = data.merge(right=order_streaks[['user_id','product_id','order_streak']],on=['user_id','product_id'],how='left')

del order_streaks
print(data.shape)
data.head()

In [None]:
def hour_diff(month1, month2):
    """Return the circular distance between two hours on a 24-hour clock.

    Despite their names, ``month1`` and ``month2`` are treated as hours of
    the day: a direct difference above 12 is replaced by the shorter way
    round through midnight.
    """
    earlier = min(month1, month2)
    later = max(month1, month2)
    direct = later - earlier
    if direct <= 12:
        return direct
    # Wrapping through midnight is the shorter route.
    return earlier + 24 - later

# def day_diff(day1, day2):
#     m_min = min(day1, day2)
#     m_max = max(day1, day2)
#     diff = m_max - m_min
#     return diff if diff <= 12 else m_min + 24 - m_max

def day_diff(month1, month2):
    """Return the circular distance between two days on a 7-day week.

    Despite their names, ``month1`` and ``month2`` are treated as day-of-week
    values; the result is the smallest of the direct difference and the two
    differences that wrap around the week.
    """
    forward = month1 - month2
    candidates = (abs(forward), forward + 7, month2 - month1 + 7)
    return min(candidates)

def time_diff(day1,hour1, day2,hour2):
    """Return the circular distance in hours between two (day, hour) points.

    Each point is converted to hours since the start of the week
    (``day * 24 + hour``); the result is the smallest of the direct difference
    and the two differences that wrap around the 168-hour week.
    """
    first = day1*24+hour1
    second = day2*24+hour2
    forward = first - second
    candidates = (abs(forward), forward + 168, second - first + 168)
    return min(candidates)


# Manual check: day 4 at 16h and day 5 at 23h are 31 hours apart.
time_diff(4,16,5,23)

In [None]:
# How far is the order being predicted (its day of week and hour) from the
# usual ordering time of the product, of the user-product pair and of the user?
start_time = timeit.default_timer()

# (new column, source columns, distance function), listed in creation order.
# The source columns are passed to the function positionally.
diff_feature_specs = [
    ('user_product_hour_diff',
     ['_prod_average_order_hour_of_day', 'user_predicted_order_hour_of_day'],
     hour_diff),
    ('user_product_day_diff',
     ['_prod_average_order_dow', 'user_predicted_order_dow'],
     day_diff),
    ('user_product_time_diff',
     ['_prod_average_order_dow', '_prod_average_order_hour_of_day',
      'user_predicted_order_dow', 'user_predicted_order_hour_of_day'],
     time_diff),
    ('up_predicted_order_hour_diff',
     ['_up_average_order_hour_of_day', 'user_predicted_order_hour_of_day'],
     hour_diff),
    ('up_predicted_order_day_diff',
     ['_up_average_order_dow', 'user_predicted_order_dow'],
     day_diff),
    ('up_predicted_order_time_diff',
     ['_up_average_order_dow', '_up_average_order_hour_of_day',
      'user_predicted_order_dow', 'user_predicted_order_hour_of_day'],
     time_diff),
    ('user_predicted_hour_diff',
     ['_user_average_order_hour_of_day', 'user_predicted_order_hour_of_day'],
     hour_diff),
    ('user_predicted_day_diff',
     ['_user_average_order_dow', 'user_predicted_order_dow'],
     day_diff),
    ('user_predicted_time_diff',
     ['_user_average_order_dow', '_user_average_order_hour_of_day',
      'user_predicted_order_dow', 'user_predicted_order_hour_of_day'],
     time_diff),
]

for new_column, source_columns, distance in diff_feature_specs:
    # raw=True hands each row over as a plain array, unpacked into the arguments.
    data[new_column] = data[source_columns].apply(
        lambda row, distance=distance: distance(*row), axis=1, raw=True)

print(timeit.default_timer() - start_time )

# data[['user_id','product_id','up_cos_similarity','product_predicted_order_sim']].to_csv('user_product_order_similarities.csv')

In [None]:
# User-product purchase histories, the input of the trend features

from scipy.stats import linregress

# Per user: list of days_since_prior_order over the prior orders, in order_number order.
tmp1 = orders[orders.eval_set=='prior'].sort_values(['user_id','order_number'])
tmp1 = tmp1.groupby('user_id').agg({'days_since_prior_order': lambda x : x.tolist()})
tmp1.reset_index(inplace=True)


def _order_presence(group):
    """Return a 0/1 list over order numbers 1..us_last_order_number of one
    (user, product) group: 1 where the product appears in that order."""
    # Build the lookup once per group instead of once per order number.
    ordered_numbers = set(group.order_number.tolist())
    last_order_number = group.us_last_order_number.iloc[0]
    return [1 if number in ordered_numbers else 0
            for number in range(1, last_order_number + 1)]


tmp2 =  priors_orders_detail.sort_values(['user_id','order_number'])
tmp2 =  tmp2.groupby(['user_id', 'product_id']).apply(_order_presence)
tmp2= pd.DataFrame(tmp2).reset_index()
# Name the columns by assignment; writing into `columns.values` mutates the
# index buffer in place and is not a supported way to rename a column.
tmp2.columns = ['user_id', 'product_id', 'order_binary']

# One row per (user, product): presence list plus the user's list of gaps.
temp3 = tmp2.merge(right=tmp1,on='user_id',how='left')


In [None]:
def sumgaps(binarylist,gaplist):
    """Return the accumulated gap preceding every purchase.

    Walks ``binarylist`` (1 = bought in that order, 0 = not bought) and
    ``gaplist`` (gap attached to that order) in lockstep. Gaps of skipped
    orders accumulate; at each purchase the accumulated total plus the
    purchase's own gap is emitted and the accumulator restarts at zero.
    Gaps after the last purchase are not emitted.
    """
    emitted = []
    pending = 0
    for bought, gap in zip(binarylist, gaplist):
        if bought != 0:
            emitted.append(pending + gap)
            pending = 0
            continue
        pending += gap
    return emitted

# Manual check: three skipped orders then a purchase give -1+2+3+5, so this prints [9].
print('test_sumgaps:',sumgaps([0,0,0,1,0,0],[-1,2,3,5,4,4]))
#del tmp1,tmp2

def lastgap(binarylist,gaplist):
    """Return the gap accumulated since the last purchase.

    Walks ``binarylist`` (1 = bought in that order, 0 = not bought) and
    ``gaplist`` (gap attached to that order) in lockstep, summing the gaps of
    skipped orders and restarting at zero on every purchase. The result is 0
    when the last order contains a purchase or when the inputs are empty.
    """
    accumulated = 0
    for binary, gap in zip(binarylist, gaplist):
        if binary == 0:
            accumulated += gap
        else:
            # A purchase restarts the count.
            accumulated = 0
    return accumulated

# Manual check: only the final skipped order follows the last purchase, so this prints 7.
print('test_lastgap:',lastgap([0,0,0,1,1,0],[-1,2,3,5,6,7]))


# up_gaps: accumulated gap preceding each purchase of the product by the user.
temp3['up_gaps'] = temp3[['order_binary','days_since_prior_order']].apply(lambda x: sumgaps(x[0],x[1]),axis=1)
# up_average_need_time: mean of those gaps, leaving out the first one.
temp3['up_average_need_time'] = temp3['up_gaps'].apply(lambda x: np.mean(x[1:]))
# up_trend: slope of a linear fit over the gaps after the first one; NaN unless
# the list holds more than two gaps.
temp3['up_trend'] = temp3['up_gaps'].apply(lambda x:linregress(list(range(len(x)-1)),x[1:])[0] if len(x)>2 else float('nan'))
# up_time_first_order: the first gap, i.e. the gaps accumulated up to the first purchase.
temp3['up_time_first_order'] = temp3['up_gaps'].apply(lambda x: x[0])
# up_ordered_since: gaps accumulated after the last purchase.
temp3['up_ordered_since'] = temp3[['order_binary','days_since_prior_order']].apply(lambda x: lastgap(x[0],x[1]),axis=1)



# Attach the new user-product columns to the main table.
data = data.merge(right=temp3[['user_id','product_id','up_trend','up_time_first_order','up_ordered_since','up_average_need_time']],on=['user_id','product_id'],how='left')


In [None]:
# Inject recursivity from binary labels

def window_transform_series(series,window_size):
    """Cut a sequence into sliding-window input/output pairs.

    Returns
    -------
    X : ndarray
        One row per window: ``series[i:i + window_size]``.
    y : ndarray of shape (number of windows, 1)
        The element that follows each window.
    z : ndarray with a single row
        The last ``window_size`` elements of the series.
    """
    window_count = len(series) - window_size
    starts = range(0, window_count)
    windows = [series[start:(start + window_size)] for start in starts]
    followers = [series[start + window_size] for start in starts]
    tail = [series[-window_size:]]

    # Same target shapes as an in-place shape assignment: the first two
    # dimensions for X and z, a single column for y.
    X = np.asarray(windows)
    X = X.reshape(np.shape(X)[0:2])
    y = np.asarray(followers)
    y = y.reshape((len(y), 1))
    z = np.asarray(tail)
    z = z.reshape(np.shape(z)[0:2])

    return X,y,z

from sklearn.linear_model import LinearRegression

def calculate_recursive_prob(binary_list,window):
    """Fit a linear autoregression on a 0/1 history and score the latest window.

    The history is cut into sliding windows of length ``window``; a
    LinearRegression is fitted to predict the element following each window and
    is then applied to the final window.

    Returns log10 of that prediction. A prediction <= 0 therefore yields
    -inf or NaN (with a numpy warning) rather than raising.
    """
    x,y,z = window_transform_series(binary_list,window)
    # `normalize` is not passed: it was removed from LinearRegression in
    # scikit-learn 1.2 and its old default (False) is what was requested here.
    lg = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=-1)
    lg.fit(x,y)

    return np.log10(lg.predict(z)[0][0])

# Autoregressive features with window sizes 2 and 3; histories that are too short
# (length <= 2, respectively <= 4) get NaN.
temp3['recursive_prob_2'] = temp3['order_binary'].apply(lambda x: calculate_recursive_prob(x,2) if len(x)>2 else float('nan'))
temp3['recursive_prob_3'] = temp3['order_binary'].apply(lambda x: calculate_recursive_prob(x,3) if len(x)>4 else float('nan'))


# Attach both features to the main feature table.
data = data.merge(right=temp3[['user_id','product_id','recursive_prob_2','recursive_prob_3']],on=['user_id','product_id'],how='left')
#temp3

In [None]:
#Product trends

# Product-level aggregates: mean of the user-product gap features over all rows of
# temp3 that share a product_id.
temp4 = temp3.groupby('product_id').agg({'up_average_need_time':'mean',
                                         'up_trend':'mean',
                                         'up_time_first_order':'mean'})
temp4.reset_index(inplace=True)
temp4.rename(index=str,columns={'up_average_need_time':'product_specific_need_time',
                                'up_trend':'product_trend',
                                'up_time_first_order':'product_time_first_order'},inplace=True)

data = data.merge(right=temp4[['product_id',
                               'product_trend',
                               'product_specific_need_time',
                               'product_time_first_order']],on=['product_id'],how='left')

# 1 when the user's _user_sum_days_since_prior_order exceeds the product's mean
# time-to-first-order, else 0.
data['up_late']=(data._user_sum_days_since_prior_order>data.product_time_first_order)*1
data.head()

# Disabled export of the trend features, kept for reference:
# data[['user_id',
#       'product_id',
#       'up_trend',
#       'time_first_order',
#       'up_trend_binary',
#       'product_trend',
#       'product_time_first_order',
#       'product_trend_binary',
#       'up_late']].to_csv('user_product_trends.csv')
# Drop the intermediate frames; they are not used after this point in this cell.
del temp3,temp4,tmp1,tmp2

In [None]:
# Per-user variety: number of distinct aisles / departments relative to the number
# of distinct products among the user's rows in `data`.
temp = data.groupby('user_id').agg({'aisle_id': 'nunique',
                                    'department_id': 'nunique',
                                    'product_id': 'nunique'}).reset_index().rename(index=str,columns={'aisle_id':'unique_aisles',
                                                                    'department_id':'unique_departments',
                                                                    'product_id':'unique_products'})
# The numerators were previously swapped (the aisle ratio used departments and vice
# versa). The misspelled column names are kept because later cells and the saved CSVs
# may depend on them.
temp['user_uniuqe_aisle_ratio'] = temp.unique_aisles/temp.unique_products
temp['user_uniuqe_depart_ratio'] = temp.unique_departments/temp.unique_products
data = data.merge(right=temp[['user_id','user_uniuqe_aisle_ratio','user_uniuqe_depart_ratio']],on='user_id',how='left')
data.head()

In [None]:
# Same variety ratios as above, restricted to each user's last prior order.
# Prior order lines are first enriched with aisle/department ids from `prd`.
temp = priors_orders_detail.merge(right=prd[['product_id',
                                                 'aisle_id',
                                                 'department_id']],on='product_id',how='left')

# Keep only rows whose order_number equals us_last_order_number, then count distinct
# aisles / departments / products per user.
temp2 = temp[temp.order_number==temp.us_last_order_number].groupby('user_id').agg({'aisle_id': lambda x :x.nunique(),
                            'department_id':lambda x :x.nunique(),
                           'product_id': lambda x :x.nunique()}).reset_index().rename(index=str,columns={'aisle_id':'unique_aisles',
                                                                    'department_id':'unique_departments',
                                                                    'product_id':'unique_products'})
temp2['user_last_unique_aisle_ratio'] = temp2.unique_aisles/temp2.unique_products
temp2['user_last_unique_depart_ratio'] = temp2.unique_departments/temp2.unique_products

data = data.merge(right=temp2[['user_id','user_last_unique_aisle_ratio','user_last_unique_depart_ratio']],on='user_id',how='left')
del temp,temp2
data.head()

In [None]:
# Compare each row's 20-d product word vector (wv_1..wv_20) with the mean word vector
# over all rows of the same user.
# NOTE(review): `cosine` is imported elsewhere; if it is scipy.spatial.distance.cosine the
# value is a distance (1 - similarity) despite the "_sim" column name - confirm.
wv_cols = ['wv_%d' % i for i in range(1, 21)]

# Average only the wv_* columns per user. product_id must stay out of this mean:
# otherwise the merge below produces a 'product_id_y' column at position 22 and the
# positional slice x[22:42] would cover [mean product_id, wv_1..wv_19] instead of
# the user's mean wv_1..wv_20.
temp = data[['user_id'] + wv_cols].groupby('user_id').mean().reset_index()

temp = data[['user_id','product_id'] + wv_cols].merge(right=temp,on='user_id',how='left')

# Column layout after the merge: user_id (0), product_id (1), the row's wv_* (2..21),
# the user's mean wv_* (22..41).
temp_2 = pd.DataFrame(temp.apply(lambda x: cosine(x[2:22],x[22:42]),axis=1,raw=True),columns=['prod_typical_prod_sim'])

# Assignment aligns on the index; assumes `data` carries the default 0..n-1 index
# that the merge result has - TODO confirm.
data['prod_typical_prod_sim'] = temp_2['prod_typical_prod_sim']
del temp,temp_2,wv_cols

In [None]:
# Rows of the feature table whose label is 0 (not reordered).
temp = data[data.reordered==0]
# Rebinds the name `plt`, which the first cell of the notebook bound to the top-level
# `matplotlib` package, to the pyplot module that the plotting calls below need.
import matplotlib.pyplot as plt
%matplotlib inline
# Normalised histogram of _prod_tot_cnts for the non-reordered rows, x-axis cut at 10000.
# NOTE(review): `normed` is the legacy spelling of `density`; newer matplotlib
# releases no longer accept it - confirm against the installed version.
plt.hist(temp['_prod_tot_cnts'], normed=True, bins=100)
plt.ylabel('prob');
axes = plt.gca()
axes.set_xlim([0,10000])

### Separate training and testing data and write to file

In [None]:
# Free the raw tables that are no longer needed before splitting the feature table.
del priors_orders_detail,orders,prd,users

# Split the feature table by the eval_set marker, then drop the combined frame.
all_train_X = data.loc[data.eval_set == "train",:]
test_X = data.loc[data.eval_set == "test",:]
del data

In [None]:
# Persist both splits. The index is written too, which is why the loading cells
# drop an 'Unnamed: 0' column after reading files produced this way.
all_train_X.to_csv('all_train_X.csv')
test_X.to_csv('test_X.csv')

## Load Test and Train Data From File

In [2]:
# Reload the saved feature tables and drop the index column that to_csv() wrote.
all_train_X = pd.read_csv('all_train_X_AH_old.csv')
all_train_X.drop('Unnamed: 0',axis=1,inplace=True)
print(all_train_X.shape)
test_X = pd.read_csv('test_X_AH_old.csv')
test_X.drop('Unnamed: 0',axis=1,inplace=True)
print(test_X.shape)

(8474661, 139)
(4833292, 139)


In [None]:
# Combine the base feature tables with the extra "AH" feature files (the original
# note calls these the unique sh1ng features) and save the merged result.
all_train_X = pd.read_csv('all_train_X_old.csv')
all_train_X.drop('Unnamed: 0',axis=1,inplace=True)
test_X = pd.read_csv('test_X_old.csv')
test_X.drop('Unnamed: 0',axis=1,inplace=True)
print(all_train_X.shape)
print(test_X.shape)

ah_train_features = pd.read_csv('AH_train_features_old.csv')
ah_test_features = pd.read_csv('AH_test_features_old.csv')
ah_train_features.drop('Unnamed: 0',axis=1,inplace=True)
ah_test_features.drop('Unnamed: 0',axis=1,inplace=True)

print(ah_train_features.shape)
print(ah_test_features.shape)
ah_train_features.head()
ah_test_features.head()

# Left joins keep every base row; rows without extra features get NaN in the new columns.
all_train_X = all_train_X.merge(ah_train_features,on=['user_id','product_id'],how='left')
test_X = test_X.merge(ah_test_features,on=['user_id','product_id'],how='left')
del ah_train_features,ah_test_features
print(all_train_X.shape)
print(test_X.shape)

# These are the files the "Load Test and Train Data From File" cell reads back.
all_train_X.to_csv('all_train_X_AH_old.csv')
test_X.to_csv('test_X_AH_old.csv')

In [None]:
# Audible "finished" signal for long-running cells (winsound is Windows-only).
import winsound

# 'SystemExit' is a system sound alias, not the path of a WAV file, so it has to be
# played with SND_ALIAS; with SND_FILENAME the name is looked up as a file instead.
winsound.PlaySound('SystemExit', winsound.SND_ALIAS)

# XGBoost training starts here:

In [None]:
# Shrink the training table: take the first of two user-grouped folds and keep only
# its held-out part, so a user's rows are either all kept or all dropped.
train_index, valid_index = list(GroupKFold(n_splits=2).split(all_train_X.drop(['eval_set', 
                                                                               'user_id', 
                                                                               'product_id', 
                                                                               'order_id',
                                                                               'reordered'], axis=1),
                                                  all_train_X.reordered,groups=all_train_X.user_id))[0]
all_train_X = all_train_X.iloc[valid_index,:]

In [None]:

# Train/validation split: first fold of a 5-fold split grouped by user_id, so no
# user appears on both sides.
train_index, valid_index = list(GroupKFold(n_splits=5).split(all_train_X.drop(['eval_set', 
                                                                               'user_id', 
                                                                               'product_id', 
                                                                               'order_id',
                                                                               'reordered'], axis=1),
                                                  all_train_X.reordered,groups=all_train_X.user_id))[0]

    
train_X = all_train_X.iloc[train_index,:]
valid_X = all_train_X.iloc[valid_index,:]

print(train_X.shape)
print(valid_X.shape)

In [None]:
# Mean of the label column in the training split, i.e. the fraction of rows with reordered == 1.
print('the class imbalance:')
print(np.mean(train_X.reordered))


In [None]:
# XGBoost matrices: identifier columns and the label are removed from the features;
# `reordered` is passed as the label.
d_train = xgb.DMatrix(train_X.drop(['eval_set', 'user_id', 
                                        'product_id', 'order_id',
                                        'reordered'], axis=1), train_X.reordered)

d_valid = xgb.DMatrix(valid_X.drop(['eval_set', 'user_id', 
                                        'product_id', 'order_id',
                                        'reordered'], axis=1), valid_X.reordered)

In [None]:
# Train the per-(user, product) reorder model and time the run.
start_time = timeit.default_timer()

xgb_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"
    ,"eta"              : 0.1
    ,"max_depth"        : 7
    ,"min_child_weight" : 6 
    ,"gamma"            :0.1
    ,"subsample"        :0.76
    ,"colsample_bytree" :0.95
    ,"alpha"            :2e-05
    ,"lambda"           :20
}


# Only the validation set is watched, so early stopping is driven by validation logloss.
watchlist= [(d_valid, 'valid')]

bst = xgb.train(params=xgb_params, 
                    dtrain=d_train, 
                    num_boost_round=800, evals=watchlist, 
                    early_stopping_rounds=20, 
                    verbose_eval=10)


# Validation predictions; `valid_probs` is read later by the prediction cells.
valid_probs = bst.predict(d_valid)


print(timeit.default_timer() - start_time )

In [None]:
%matplotlib inline
# Gain-based feature importance plot for the reorder model.
import seaborn as sns
sns.set(context='poster',font_scale = 0.5)
xgb.plot_importance(bst,height=0.3,importance_type='gain')

# Training for None orders prediction:

In [None]:

# Build one-row-per-user tables for the "None order" model.
# None_order is True when no row of the user has reordered == 1.
data_tmp = train_X.groupby('user_id').agg({'reordered' : lambda x: 1!=max(x) }).reset_index()
data_tmp.rename(index=str,columns={'reordered':'None_order'},inplace=True)
# Attach the flag, then collapse each user's rows to their column means.
# NOTE(review): mean() over the non-numeric eval_set column relies on pandas silently
# dropping it - confirm on the pandas version in use.
train_none_X = train_X.merge(right=data_tmp,on='user_id',how='left').groupby('user_id').mean().reset_index(inplace=False)

# Same construction for the validation split.
data_tmp = valid_X.groupby('user_id').agg({'reordered' : lambda x: 1!=max(x) }).reset_index()
data_tmp.rename(index=str,columns={'reordered':'None_order'},inplace=True)
valid_none_X = valid_X.merge(right=data_tmp,on='user_id',how='left').groupby('user_id').mean().reset_index(inplace=False)


del data_tmp 
print('Percent None orders')
print(np.mean(train_none_X.None_order))

print('train_none_X size', train_none_X.shape)


In [None]:
# XGBoost matrices for the None model: ids, the averaged `reordered` column and the
# target itself are removed from the features; None_order is the label.
d_train_none = xgb.DMatrix(train_none_X.drop([ 'user_id', 
                                    'None_order','order_id','reordered'], axis=1), train_none_X.None_order)

d_valid_none = xgb.DMatrix(valid_none_X.drop(['user_id', 
                                        'None_order','order_id','reordered'], axis=1), valid_none_X.None_order)


In [None]:
# Train the per-user "None order" model, report its validation F1 at a 0.20
# threshold and time the run.
# f1_score is imported here because this cell uses it; the only other import of it
# is in the following cell, so running the notebook top to bottom raised NameError.
from sklearn.metrics import f1_score

start_time = timeit.default_timer()


xgb_none_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"
    ,"eta"              : 0.05
    ,"max_depth"        : 5
    ,"min_child_weight" :14
    ,"gamma"            :0.1
    ,"subsample"        :0.85
    ,"colsample_bytree" :0.75
    ,"alpha"            :2e-05
    ,"lambda"           :10
}


# Both sets are reported during training.
watchlist= [(d_train_none, 'train'),(d_valid_none, 'valid')]

bst_none = xgb.train(params=xgb_none_params, 
                    dtrain=d_train_none, 
                    num_boost_round=811, evals=watchlist, 
                    early_stopping_rounds=5, 
                    verbose_eval=10)

# F1 of the thresholded predictions against the true None flags.
print(f1_score(valid_none_X.None_order,bst_none.predict(d_valid_none)>0.20))

print(timeit.default_timer() - start_time )

In [None]:
from sklearn.metrics import f1_score

# Validation F1 of the None model at a 0.21 probability threshold.
print(f1_score(valid_none_X.None_order,bst_none.predict(d_valid_none)>0.21))

In [None]:
%matplotlib inline
# Gain-based feature importance plot for the None model.
import seaborn as sns
sns.set(context='poster',font_scale = 0.5)
xgb.plot_importance(bst_none,height=0.3, importance_type='gain')

# Light GBM training

In [None]:
# LightGBM alternative to the XGBoost reorder model, trained on the same split.
import lightgbm as lgb

start_time = timeit.default_timer()

#X_new = SelectKBest(chi2, k=2).fit_transform(X, y)


# Same feature/label layout as the XGBoost matrices: ids and label dropped from the features.
lgb_train_data = lgb.Dataset(train_X.drop(['eval_set', 'user_id', 
                                        'product_id', 'order_id',
                                        'reordered'], axis=1), train_X.reordered)

lgb_valid_data = lgb.Dataset(valid_X.drop(['eval_set', 'user_id', 
                                        'product_id', 'order_id',
                                        'reordered'], axis=1), valid_X.reordered)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
     'early_stopping_rounds':10,
    'metric': {'binary_logloss'},
    'num_leaves': 256,
    'min_sum_hessian_in_leaf': 20,
    'max_depth': -1,
    'learning_rate': 0.05,
    'feature_fraction': 0.6,
    # 'bagging_fraction': 0.9,
    # 'bagging_freq': 3,
    #'evals_result': {'train':lgb_train_data, 'valid':lgb_valid_data},
    'verbose': 1
    
}



num_round = 200
lgb_bst = lgb.train(params, lgb_train_data, num_round, valid_sets=[lgb_valid_data],verbose_eval=20)

# NOTE: this overwrites the `valid_probs` produced by the XGBoost training cell, so the
# later prediction cells use whichever of the two cells ran last.
valid_probs = lgb_bst.predict(valid_X.drop(['eval_set', 'user_id', 
                                        'product_id', 'order_id',
                                        'reordered'], axis=1))

print(timeit.default_timer() - start_time )

In [None]:
# Gain-based feature importance plot for the LightGBM model on a 40x40 figure.
ax = lgb.plot_importance(lgb_bst, height=0.2, xlim=None, ylim=None, title='Feature importance', 
                         xlabel='Feature importance', ylabel='Features', importance_type='gain', 
                         max_num_features=None, ignore_zero=True, figsize=(40,40), grid=False)

# NOTE(review): needs `plt` to be matplotlib.pyplot; the notebook's first cell binds
# `plt` to the top-level matplotlib package, so a pyplot import must have run first.
plt.show()

# XGBoost predictions

In [None]:
# Reload the raw train order lines (needed as ground truth for scoring) with compact dtypes.
train = pd.read_csv('order_products__train.csv', 
                dtype={
                        'order_id': np.int32,
                        'product_id': np.uint16,
                        'add_to_cart_order': np.int16,
                        'reordered': np.int8})

orders = pd.read_csv('orders.csv', 
                     dtype={
                            'order_id': np.int32,
                            'user_id': np.int64,
                            'eval_set': 'category',
                            'order_number': np.int16,
                            'order_dow': np.int8,
                            'order_hour_of_day': np.int8,
                            'days_since_prior_order': np.float32})

# Only user_id is needed from the orders table; attach it and drop the table.
train = train.merge(right=orders[['order_id','user_id']],on='order_id')
del orders

In [None]:
# Prepare None predictions: threshold the None model's validation output at 0.21 and
# index the 0/1 flags by order_id so they can be aligned with the submission frame.
# NOTE(review): order_id here comes from the per-user mean table valid_none_X; this
# assumes each user has a single order_id in valid_X - confirm.

valid_predicted_none_X = valid_none_X[['user_id','order_id']].copy()
valid_predicted_none_X.loc[:,'predicted_none'] = (bst_none.predict(d_valid_none) > 0.21).astype(int)
valid_predicted_none_X.sort_values('order_id',inplace=True)
valid_predicted_none_X.set_index('order_id', inplace=True)
print(valid_predicted_none_X.shape)
valid_predicted_none_X.head()


In [None]:
# Build the validation "submission": one space-separated product string per order.
from jnius import autoclass

# Java helper class loaded through pyjnius (class path is configured in the first cell).
FScore = autoclass('FScore')

fscore = FScore()

# f1 score based on maximum f1 score optimization

valid_predicted_X = valid_X[['user_id','order_id','product_id']].copy()
# Replace the true label by the model probability for every candidate row.
valid_predicted_X.loc[:,'reordered'] = valid_probs
# Predictions with catboost
#valid_predicted_X.loc[:,'reordered'] = cb_model.predict(valid_XX,prediction_type='RawFormulaVal')

# Per order: hand the probabilities to the Java routine; its result is used below as
# one label per product (a product is kept when its label == 1).
a = valid_predicted_X.groupby('order_id').apply(lambda x: fscore.max_expected_fscore_preds_cube(x.reordered.tolist(),1))
a = pd.DataFrame(a,columns=['labels'])
# Per order: product ids in the same row order as the probabilities above.
b = pd.DataFrame(valid_predicted_X.groupby('order_id').apply(lambda x: x.product_id.tolist()),columns=['products'])
b['labels'] = a['labels']
valid_submit_pros = pd.DataFrame(b.apply(lambda x: ' '.join([str(item) for item,label in zip(x['products'],x['labels']) if label==1]),axis=1),columns=['products'])
valid_submit_pros.index = valid_submit_pros.index.astype(int)
valid_submit_pros.head()

# Orders whose highest predicted probability is exactly 0 get an empty product string.
valid_submit_nones = valid_predicted_X.groupby('order_id').agg({'reordered':'max','product_id':'min'})
valid_submit_nones = valid_submit_nones[valid_submit_nones.reordered==0]
valid_submit_nones.drop('reordered',axis=1,inplace=True)
valid_submit_nones.rename(index=str, columns={'product_id':'products'},inplace=True)
valid_submit_nones.index = valid_submit_nones.index.astype(int)
valid_submit_nones.products=''

valid_submit = pd.concat([valid_submit_pros,valid_submit_nones])
valid_submit.sort_index(inplace=True)
valid_submit.products = valid_submit.products.astype(str)


# Add extra none order prediction: prefix 'None ' for orders the None model flagged.

valid_submit.loc[valid_predicted_none_X['predicted_none']==1,'products'] = 'None ' + valid_submit['products']


# Orders left with no product at all are submitted as a bare 'None'.
valid_submit.loc[valid_submit['products']=='','products'] = 'None'

valid_submit['products'] = valid_submit['products'].str.strip()

print(valid_submit.shape)
print(valid_submit[:20])

In [None]:
#PROCESS GROUND TRUTH
# Build df_valid_gt: one row per order (index order_id) with the space-separated ids
# of the products that were really reordered, or 'None' when there are none.

# Train order lines of the users that ended up in the validation split.
valid_ordered_products = train[train.user_id.isin(valid_X.user_id)]

try:
    # Use a cached ground-truth file when one exists. The result is stored in
    # df_valid_gt (it used to go to an unused df_train_gt, leaving df_valid_gt
    # undefined or stale whenever the file was present).
    df_valid_gt = pd.read_csv('valid.csv', index_col='order_id')
    print('reading_from_file:')
except (OSError, ValueError):
    # Missing or unusable cache file: rebuild from the raw order lines.
    print('regenerating')
    train_gtl = []

    for uid, subset in valid_ordered_products.groupby('user_id'):
        subset1 = subset[subset.reordered == 1]
        oid = subset.order_id.values[0]

        if len(subset1) == 0:
            train_gtl.append((oid, 'None'))
            continue

        ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
        # .strip is needed because join can have a padding space at the end
        train_gtl.append((oid, ostr.strip()))

    print(len(train_gtl))
    df_valid_gt = pd.DataFrame(train_gtl)

    df_valid_gt.columns = ['order_id', 'products']
    df_valid_gt.set_index('order_id', inplace=True)
    df_valid_gt.sort_index(inplace=True)
    
    #df_valid_gt.to_csv('train.csv')
print(df_valid_gt.shape)
df_valid_gt.sort_index(inplace=True)
df_valid_gt.products = df_valid_gt.products.astype(str)
df_valid_gt.head()

In [None]:
# Mean per-order F1 between ground truth and predictions.
# The two frames are paired by position after sorting on the index, so both must
# contain exactly the same set of orders.
f1 = []
for gt, pred in zip(df_valid_gt.sort_index().products, valid_submit.sort_index().products):
    # 'None' is scored like an ordinary item by mapping it to the pseudo id -1.
    lgt = gt.replace("None", "-1").split(' ')
    lpred = pred.replace("None", "-1").split(' ')
    
    rr = (np.intersect1d(lgt, lpred))
    # builtin float replaces the np.float alias, which was removed in NumPy 1.24
    precision = float(len(rr)) / len(lpred)
    recall = float(len(rr)) / len(lgt)

    denom = precision + recall
    # no overlap at all -> F1 of 0 instead of a division by zero
    f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)

print(np.mean(f1))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Distribution of the per-order F1 scores.
# NOTE(review): `normed` is the legacy spelling of `density`; newer matplotlib
# releases no longer accept it - confirm against the installed version.
plt.hist(f1, normed=True, bins=40)
plt.ylabel('prob');
# The filter keeps every score above -1.
print(np.mean([item for item in f1 if item>-1]))

# F1 score in training data

In [None]:
# Same pipeline as the validation scoring, applied to the training split to measure
# the F1 on data the models were fitted on. The valid_* variable names are reused.
# Prepare None predictions (threshold 0.19 here).

valid_predicted_none_X = train_none_X.copy()
valid_predicted_none_X.loc[:,'predicted_none'] = (bst_none.predict(d_train_none) > 0.19).astype(int)
valid_predicted_none_X.sort_values('order_id',inplace=True)
valid_predicted_none_X.set_index('order_id', inplace=True)
print(valid_predicted_none_X.shape)
#valid_predicted_none_X.head()


from jnius import autoclass

# Java helper class loaded through pyjnius.
FScore = autoclass('FScore')

fscore = FScore()

# f1 score based on maximum f1 score optimization

valid_predicted_X = train_X.copy()
# Replace the true label by the reorder model's probability on the training rows.
valid_predicted_X.loc[:,'reordered'] = bst.predict(d_train)

# Per order: labels from the Java routine, then the products whose label == 1.
a = valid_predicted_X.groupby('order_id').apply(lambda x: fscore.max_expected_fscore_preds_cube(x.reordered.tolist(),1))
a = pd.DataFrame(a,columns=['labels'])
b = pd.DataFrame(valid_predicted_X.groupby('order_id').apply(lambda x: x.product_id.tolist()),columns=['products'])
b['labels'] = a['labels']
valid_submit_pros = pd.DataFrame(b.apply(lambda x: ' '.join([str(item) for item,label in zip(x['products'],x['labels']) if label==1]),axis=1),columns=['products'])
valid_submit_pros.index = valid_submit_pros.index.astype(int)
valid_submit_pros.head()

# Orders whose highest predicted probability is exactly 0 get an empty product string.
valid_submit_nones = valid_predicted_X.groupby('order_id').agg({'reordered':'max','product_id':'min'})
valid_submit_nones = valid_submit_nones[valid_submit_nones.reordered==0]
valid_submit_nones.drop('reordered',axis=1,inplace=True)
valid_submit_nones.rename(index=str, columns={'product_id':'products'},inplace=True)
valid_submit_nones.index = valid_submit_nones.index.astype(int)
valid_submit_nones.products=''

valid_submit = pd.concat([valid_submit_pros,valid_submit_nones])
valid_submit.sort_index(inplace=True)
valid_submit.products = valid_submit.products.astype(str)


# Add extra none order prediction: prefix 'None ' for orders the None model flagged.

valid_submit.loc[valid_predicted_none_X['predicted_none']==1,'products'] = 'None ' + valid_submit['products']


# Orders left with no product at all are submitted as a bare 'None'.
valid_submit.loc[valid_submit['products']=='','products'] = 'None'

valid_submit['products'] = valid_submit['products'].str.strip()

#print(valid_submit.shape)
#print(valid_submit[:20])

In [None]:
#PROCESS GROUND TRUTH
# Build df_valid_gt for the training-split check: one row per order (index order_id)
# with the space-separated ids of the really reordered products, or 'None'.

# NOTE(review): `train_users` is not defined in the cells shown around here; the
# validation version of this cell uses valid_X.user_id - confirm the intended source.
valid_ordered_products = train[train.user_id.isin(train_users.user_id)]

try:
    # Use a cached ground-truth file when one exists. The result is stored in
    # df_valid_gt (it used to go to an unused df_train_gt, leaving df_valid_gt
    # stale from the validation cell whenever the file was present).
    # NOTE(review): 'valid.csv' is the same cache name the validation cell reads -
    # confirm a separate file is not wanted for the training split.
    df_valid_gt = pd.read_csv('valid.csv', index_col='order_id')
    print('reading_from_file:')
except (OSError, ValueError):
    # Missing or unusable cache file: rebuild from the raw order lines.
    print('regenerating')
    train_gtl = []

    for uid, subset in valid_ordered_products.groupby('user_id'):
        subset1 = subset[subset.reordered == 1]
        oid = subset.order_id.values[0]

        if len(subset1) == 0:
            train_gtl.append((oid, 'None'))
            continue

        ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
        # .strip is needed because join can have a padding space at the end
        train_gtl.append((oid, ostr.strip()))

    print(len(train_gtl))
    df_valid_gt = pd.DataFrame(train_gtl)

    df_valid_gt.columns = ['order_id', 'products']
    df_valid_gt.set_index('order_id', inplace=True)
    df_valid_gt.sort_index(inplace=True)
    
    #df_valid_gt.to_csv('train.csv')
print(df_valid_gt.shape)
df_valid_gt.sort_index(inplace=True)
df_valid_gt.products = df_valid_gt.products.astype(str)
#df_valid_gt.head()

In [None]:
# Mean per-order F1 between ground truth and predictions.
# The two frames are paired by position after sorting on the index, so both must
# contain exactly the same set of orders.
f1 = []
for gt, pred in zip(df_valid_gt.sort_index().products, valid_submit.sort_index().products):
    # 'None' is scored like an ordinary item by mapping it to the pseudo id -1.
    lgt = gt.replace("None", "-1").split(' ')
    lpred = pred.replace("None", "-1").split(' ')
    
    rr = (np.intersect1d(lgt, lpred))
    # builtin float replaces the np.float alias, which was removed in NumPy 1.24
    precision = float(len(rr)) / len(lpred)
    recall = float(len(rr)) / len(lgt)

    denom = precision + recall
    # no overlap at all -> F1 of 0 instead of a division by zero
    f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)

print(np.mean(f1))

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Distribution of the per-order F1 scores.
# NOTE(review): `normed` is the legacy spelling of `density`; newer matplotlib
# releases no longer accept it - confirm against the installed version.
plt.hist(f1, normed=True, bins=40)
plt.ylabel('prob');
# The filter keeps every score above -1.
print(np.mean([item for item in f1 if item>-1]))

# Prepare test data

In [None]:
# Test data for XGBoost: build the final submission with F1-maximising product
# selection plus the extra None predictions (None threshold 0.16 here).
# NOTE(review): test_none_X and d_test_none are not defined in the cells shown
# around here - confirm they are built before this cell runs.
valid_predicted_none_X = test_none_X.copy()
valid_predicted_none_X.loc[:,'predicted_none'] = (bst_none.predict(d_test_none) > 0.16).astype(int)
valid_predicted_none_X.sort_values('order_id',inplace=True)
valid_predicted_none_X.set_index('order_id', inplace=True)
print(valid_predicted_none_X.shape)



# The test split's `reordered` column is passed as label only to mirror the training
# matrices; predict() below does not use it.
d_test = xgb.DMatrix(test_X.drop(['eval_set', 'user_id', 
                                        'product_id', 'order_id',
                                        'reordered'], axis=1), test_X.reordered)


valid_predicted_X = test_X.copy()
valid_predicted_X.loc[:,'reordered'] = bst.predict(d_test)

# Per order: labels from the Java routine, then the products whose label == 1.
# `fscore` is the FScore instance created in an earlier cell.
a = valid_predicted_X.groupby('order_id').apply(lambda x: fscore.max_expected_fscore_preds_cube(x.reordered.tolist(),1))
a = pd.DataFrame(a,columns=['labels'])
b = pd.DataFrame(valid_predicted_X.groupby('order_id').apply(lambda x: x.product_id.tolist()),columns=['products'])
b['labels'] = a['labels']
valid_submit_pros = pd.DataFrame(b.apply(lambda x: ' '.join([str(item) for item,label in zip(x['products'],x['labels']) if label==1]),axis=1),columns=['products'])
valid_submit_pros.index = valid_submit_pros.index.astype(int)
valid_submit_pros.head()



# Orders whose highest predicted probability is exactly 0 get an empty product string.
valid_submit_nones = valid_predicted_X.groupby('order_id').agg({'reordered':'max','product_id':'min'})
valid_submit_nones = valid_submit_nones[valid_submit_nones.reordered==0]
valid_submit_nones.drop('reordered',axis=1,inplace=True)
valid_submit_nones.rename(index=str, columns={'product_id':'products'},inplace=True)
valid_submit_nones.index = valid_submit_nones.index.astype(int)
valid_submit_nones.products=''

valid_submit = pd.concat([valid_submit_pros,valid_submit_nones])
valid_submit.sort_index(inplace=True)
valid_submit.products = valid_submit.products.astype(str)


# Add extra none order prediction: prefix 'None ' for orders the None model flagged.

valid_submit.loc[valid_predicted_none_X['predicted_none']==1,'products'] = 'None ' + valid_submit['products']

# Orders left with no product at all are submitted as a bare 'None'.
valid_submit.loc[valid_submit['products']=='','products'] = 'None'

valid_submit['products'] = valid_submit['products'].str.strip()

print(valid_submit.shape)
# Written with order_id as the index column.
valid_submit.to_csv('test_submit_single.csv')
print(valid_submit[:20])

#  XGB Optimization

In [None]:
# Optimization STEP 1: determine the number of boosting rounds (the number of
# estimators) with xgb.cv. The number obtained was very large, probably because of
# the leak; more than 500 rounds is likely overkill. In this competition this step
# cannot be trusted because the cv folds share user ids between samples.
from xgboost.sklearn import XGBClassifier
# NOTE(review): sklearn.cross_validation and sklearn.grid_search are legacy modules
# that were removed in scikit-learn 0.20 - these imports only work on older releases.
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search


# The classifier object is only used as a convenient holder of the parameter set.
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=200,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=6,
 scale_pos_weight=1,
 seed=27)



xgb_param = xgb1.get_xgb_params()
# No folds are passed, so xgb.cv builds its own row-wise folds (not grouped by user).
cvresult = xgb.cv(xgb_param, 
                  d_train, 
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  metrics='logloss',
                  early_stopping_rounds=50, 
                  verbose_eval=100)

cvresult.tail(5)

In [None]:
# Feature matrix and label vector for the grid searches below.
# `train_X.sample.drop(...)` looked up `.drop` on the bound `sample` method and raised
# AttributeError. The full frame is used so that the rows stay aligned with
# train_X.reordered and with the train_X.user_id groups passed to GroupKFold.
train_X_opt = train_X.drop(['eval_set', 'user_id', 'product_id', 'order_id',
                                        'reordered'], axis=1)
train_Y_opt = train_X.reordered

In [None]:
start_time = timeit.default_timer()




# OPTIMIZATION STEP 2:
# Tune max_depth and min_child_weight, try to increase these next time and maybe try a finer round
from sklearn.model_selection import GroupKFold
# NOTE(review): sklearn.grid_search is the legacy module (removed in scikit-learn 0.20);
# `iid` and `grid_scores_` below belong to that legacy API.
from sklearn.grid_search import GridSearchCV
from xgboost.sklearn import XGBClassifier

# Precomputed user-grouped folds, so no user is split between fit and score parts.
gkf = list(GroupKFold(n_splits=3).split(train_X_opt,train_Y_opt,train_X.user_id))


param_test1 = {
 'max_depth':[5,6,7,8],
 'min_child_weight':[6,8,10,12,14]
}

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=123, max_depth=5,
 min_child_weight=10, gamma=0.70, subsample=0.76, colsample_bytree=0.95,
 objective= 'binary:logistic', nthread=-1,scale_pos_weight=1, reg_lambda=20,seed=27), 
 param_grid = param_test1, scoring='neg_log_loss',n_jobs=-1,iid=False, cv=gkf)

gsearch1.fit(train_X_opt,train_Y_opt)

print(timeit.default_timer() - start_time )

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



In [None]:
# OPTIONAL OPTIMIZATION STEP 2b: finer determination of tree depth and min_child_weight

In [None]:
# Step 3: Tune gamma
# Now tune gamma using the parameters already tuned above. The grid below checks
# 15 values, 0.0 to 1.4 in steps of 0.1; a finer grid can follow.

start_time = timeit.default_timer()




from sklearn.model_selection import GroupKFold
# NOTE(review): sklearn.grid_search is the legacy module (removed in scikit-learn 0.20).
from sklearn.grid_search import GridSearchCV
from xgboost.sklearn import XGBClassifier

# Precomputed user-grouped folds, so no user is split between fit and score parts.
gkf = list(GroupKFold(n_splits=3).split(train_X_opt,train_Y_opt,train_X.user_id))


param_test1 = {
    'gamma':[i/10.0 for i in range(0,15)]
}

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=91, max_depth=7,
 min_child_weight=6, gamma=0.70, subsample=0.76, colsample_bytree=0.95,
 objective= 'binary:logistic', nthread=-1,scale_pos_weight=1, reg_lambda=20,seed=27), 
 param_grid = param_test1, scoring='neg_log_loss',n_jobs=-1,iid=False, cv=gkf)

gsearch1.fit(train_X_opt,train_Y_opt)

print(timeit.default_timer() - start_time )

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



In [None]:
# This shows that our original value of gamma, i.e. 0 is the optimum one. 
# Before proceeding, a good idea would be to re-calibrate the number of boosting rounds for the updated parameters.

# The classifier object is only used as a convenient holder of the parameter set.
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=3,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=6,
 scale_pos_weight=1,
 seed=27)

xgb_param = xgb1.get_xgb_params()
# NOTE(review): d_train_opt is not defined in the cells shown around here (the earlier
# cv cell uses d_train) - confirm it exists before running.
cvresult = xgb.cv(xgb_param, 
                  d_train_opt, 
                  num_boost_round=xgb1.get_params()['n_estimators'], 
                  nfold=5,
                  metrics='logloss',
                  early_stopping_rounds=50, 
                  verbose_eval=100)

cvresult.tail(5)

In [None]:
# The next step is to try different subsample and colsample_bytree values.
# This is done in two stages; this first stage takes 0.6, 0.7, 0.8, 0.9 for both.
# BE CAREFUL ABOUT MAX AND MIN ALLOWED LEVELS OF EACH OF THE PARAMETERS

start_time = timeit.default_timer()

from sklearn.model_selection import GroupKFold
# NOTE(review): sklearn.grid_search is the legacy module (removed in scikit-learn 0.20).
from sklearn.grid_search import GridSearchCV
from xgboost.sklearn import XGBClassifier

# Precomputed user-grouped folds, so no user is split between fit and score parts.
gkf = list(GroupKFold(n_splits=3).split(train_X_opt,train_Y_opt,train_X.user_id))



param_test1 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=126, max_depth=7,
 min_child_weight=6, gamma=0.1, subsample=0.76, colsample_bytree=0.95,
 objective= 'binary:logistic', nthread=-1,scale_pos_weight=1, reg_lambda=20,seed=27), 
 param_grid = param_test1, scoring='neg_log_loss',n_jobs=-1,iid=False, cv=gkf)

gsearch1.fit(train_X_opt,train_Y_opt)

print(timeit.default_timer() - start_time )

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



In [None]:
# Here, we found 0.8 as the optimum value for both subsample and colsample_bytree. 
# Now we should try values in 0.05 interval around these (0.75, 0.80, 0.85).
# NOTE(review): unlike the searches above, this one uses cv=3 (no user-grouped folds)
# and scoring='roc_auc' - confirm that the difference is intended.

param_test5 = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=5,
 min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
 param_grid = param_test5, scoring='roc_auc',n_jobs=-1,iid=False, cv=3)
gsearch5.fit(train_X_opt,train_Y_opt)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Coarse search over the L1 regularisation term reg_alpha (several orders of magnitude).
# NOTE(review): cv=5 builds plain folds here, not the user-grouped folds used above.
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch6.fit(train_X_opt,train_Y_opt)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

In [None]:
# Finer search over reg_alpha in the range 0 to 0.05.
# NOTE(review): cv=5 builds plain folds here, not the user-grouped folds used above.
param_test7 = {
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}
gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch7.fit(train_X_opt,train_Y_opt)
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

# FINAL RESULT:
# Once the optimal parameters are found, lower the learning rate and add more trees.

# None prediction tuning

In [None]:
start_time = timeit.default_timer()




# None model, OPTIMIZATION STEP 2:
# Tune max_depth and min_child_weight, try to increase these next time and maybe try a finer round
from sklearn.model_selection import GroupKFold
# NOTE(review): sklearn.grid_search is the legacy module (removed in scikit-learn 0.20).
from sklearn.grid_search import GridSearchCV
from xgboost.sklearn import XGBClassifier


# Folds grouped by user_id over the per-user None table; the feature columns match
# those used for d_train_none.
gkf = list(GroupKFold(n_splits=3).split(train_none_X.drop([ 'user_id', 
                                        'None_order','order_id','reordered'], axis=1), 
                                        train_none_X.None_order,
                                        train_none_X.user_id))



param_test1 = {
 'max_depth':[5,6,7,8,9],
 'min_child_weight':[6,8,10,12,14]
}

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=158, max_depth=5,
 min_child_weight=14, gamma=0.1, subsample=0.76, colsample_bytree=0.95,
 objective= 'binary:logistic', nthread=-1,scale_pos_weight=1, reg_lambda=10,seed=27), 
 param_grid = param_test1, scoring='neg_log_loss',n_jobs=-1,iid=False, cv=gkf)

gsearch1.fit(train_none_X.drop([ 'user_id', 
                                        'None_order','order_id','reordered'], axis=1), 
                                        train_none_X.None_order)

print(timeit.default_timer() - start_time )

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



In [None]:
# Grid search over gamma for the "None order" classifier; every other
# hyper-parameter is held fixed. Elapsed time is printed at the end.
start_time = timeit.default_timer()


# OPTIMIZATION STEP:
# Tune gamma (the grid below covers 0.0 to 1.4 in steps of 0.1)
from sklearn.model_selection import GroupKFold
# NOTE(review): sklearn.grid_search is the legacy location of GridSearchCV and
# is absent from recent scikit-learn releases; confirm the installed version.
from sklearn.grid_search import GridSearchCV
from xgboost.sklearn import XGBClassifier


# 3 splits grouped by user_id, materialised as a list of index pairs.
gkf = list(GroupKFold(n_splits=3).split(train_none_X.drop([ 'user_id', 
                                        'None_order','order_id','reordered'], axis=1), 
                                        train_none_X.None_order,
                                        train_none_X.user_id))



# 15 candidate values: 0.0, 0.1, ..., 1.4
param_test1 = {
    'gamma':[i/10.0 for i in range(0,15)]
}

# Candidates are ranked by negative log loss on the grouped splits.
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=158, max_depth=5,
 min_child_weight=14, gamma=0.1, subsample=0.76, colsample_bytree=0.95,
 objective= 'binary:logistic', nthread=-1,scale_pos_weight=1, reg_lambda=10,seed=27), 
 param_grid = param_test1, scoring='neg_log_loss',n_jobs=-1,iid=False, cv=gkf)

# Identifier and label columns are dropped so only feature columns are fitted.
gsearch1.fit(train_none_X.drop([ 'user_id', 
                                        'None_order','order_id','reordered'], axis=1), 
                                        train_none_X.None_order)

# Elapsed seconds for the whole search.
print(timeit.default_timer() - start_time )

# Last expression of the cell: displayed by the notebook.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Grid search over colsample_bytree for the "None order" classifier; every
# other hyper-parameter is held fixed. Elapsed time is printed at the end.
start_time = timeit.default_timer()


# OPTIMIZATION STEP:
# Tune colsample_bytree (the grid below covers 0.55, 0.65 and 0.75)
from sklearn.model_selection import GroupKFold
# NOTE(review): sklearn.grid_search is the legacy location of GridSearchCV and
# is absent from recent scikit-learn releases; confirm the installed version.
from sklearn.grid_search import GridSearchCV
from xgboost.sklearn import XGBClassifier


# 3 splits grouped by user_id, materialised as a list of index pairs.
gkf = list(GroupKFold(n_splits=3).split(train_none_X.drop([ 'user_id', 
                                        'None_order','order_id','reordered'], axis=1), 
                                        train_none_X.None_order,
                                        train_none_X.user_id))



# 3 candidate values: 0.55, 0.65, 0.75
param_test1 = {
     'colsample_bytree':[i/100.0 for i in range(55,85,10)]
}

# Candidates are ranked by negative log loss on the grouped splits.
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=158, max_depth=5,
 min_child_weight=14, gamma=0.1, subsample=0.85, colsample_bytree=0.75,
 objective= 'binary:logistic', nthread=-1,scale_pos_weight=1, reg_lambda=10,seed=27), 
 param_grid = param_test1, scoring='neg_log_loss',n_jobs=-1,iid=False, cv=gkf)

# Identifier and label columns are dropped so only feature columns are fitted.
gsearch1.fit(train_none_X.drop([ 'user_id', 
                                        'None_order','order_id','reordered'], axis=1), 
                                        train_none_X.None_order)

# Elapsed seconds for the whole search.
print(timeit.default_timer() - start_time )

# Last expression of the cell: displayed by the notebook.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

# Light GBM optimization

In [None]:
# Load the engineered training features and discard the 'Unnamed: 0' column
# (presumably the index that was written out together with the frame).
all_train_X = pd.read_csv('all_train_X_AH.csv').drop('Unnamed: 0', axis=1)
#test_X = pd.read_csv('test_X_AH.csv')
#test_X.drop('Unnamed: 0',axis=1,inplace=True)
print(all_train_X.shape)
#print(test_X.shape)

In [None]:
# Shrink the data for hyper-parameter search: take the first of 4 user-grouped
# folds and keep only its validation part (roughly a quarter of the rows, with
# whole users kept together).
train_index, valid_index = next(
    GroupKFold(n_splits=4).split(
        all_train_X.drop(
            ['eval_set', 'user_id', 'product_id', 'order_id', 'reordered'],
            axis=1,
        ),
        all_train_X.reordered,
        groups=all_train_X.user_id,
    )
)
all_train_X = all_train_X.iloc[valid_index, :]

In [None]:
# Grid search over num_leaves, max_depth and min_child_weight for the
# LightGBM reorder classifier. Elapsed time is printed at the end.
start_time = timeit.default_timer()

from lightgbm import LGBMClassifier
from sklearn.model_selection import GroupKFold
# NOTE(review): sklearn.grid_search is the legacy location of GridSearchCV and
# is absent from recent scikit-learn releases; confirm the installed version.
from sklearn.grid_search import GridSearchCV 

# 3 splits grouped by user_id, materialised as a list of index pairs.
gkf = list(GroupKFold(n_splits=3).split(all_train_X.drop(['eval_set', 
                                                            'user_id', 
                                                            'product_id', 
                                                            'order_id',
                                                            'reordered'],axis=1), all_train_X.reordered,groups=all_train_X.user_id))
           
           
# 5 x 4 x 3 = 60 candidate combinations.
param_test={'num_leaves':[216, 236, 256, 276,296],
              'max_depth':[-1, 10, 12, 14],
              'min_child_weight':[ 15, 20, 25]}
           

# Candidates are ranked by negative log loss; the remaining parameters are fixed.
gsearch1 = GridSearchCV(estimator = LGBMClassifier( boosting_type='gbdt', num_leaves=256, feature_fraction=0.6,
                                                   max_depth=12, learning_rate=0.1, 
n_estimators=260, max_bin=255, subsample_for_bin=50000, objective='binary', min_split_gain=0, min_child_weight=20, 
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1, reg_alpha=0, reg_lambda=0, seed=0, 
                                              nthread=-1, silent=True), 
param_grid = param_test, scoring='neg_log_loss',n_jobs=-1,iid=False, cv=gkf)

# Identifier and label columns are dropped so only feature columns are fitted.
gsearch1.fit(all_train_X.drop(['eval_set', 'user_id', 'product_id', 'order_id','reordered'],axis=1), all_train_X.reordered)

# Elapsed seconds for the whole search.
print(timeit.default_timer() - start_time )
# Last expression of the cell: displayed by the notebook.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

# 5 fold CV Functions

Once the features are engineered, we can do 5-fold CV using the functions below

In [3]:
# -*- coding: utf-8 -*-
"""
@author: Faron
"""
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from datetime import datetime

'''
This kernel implements the O(n²) F1-Score expectation maximization algorithm presented in
"Ye, N., Chai, K., Lee, W., and Chieu, H.  Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012."

It solves argmax_(0 <= k <= n,[[None]]) E[F1(P,k,[[None]])]
with [[None]] being the indicator for predicting label "None"
given posteriors P = [p_1, p_2, ... , p_n], where p_1 > p_2 > ... > p_n
under label independence assumption by means of dynamic programming in O(n²).
'''

class F1Optimizer():
    """Pick how many top-ranked labels to predict so that the expected F1 is maximal.

    All methods are static; the class only serves as a namespace.
    """

    def __init__(self):
        pass

    @staticmethod
    def get_expectations(P, pNone=None):
        """Compute the expected F1 of every "predict the top k labels" choice.

        Parameters
        ----------
        P : array-like of float
            Label probabilities in any order; they are sorted in descending
            order internally.
        pNone : float, optional
            Probability that no label is positive. When omitted it is taken
            as the product of ``(1 - p)`` over all entries of ``P``.

        Returns
        -------
        numpy.ndarray of shape ``(2, n + 1)`` where ``n = len(P)``
            Column ``k`` describes predicting the ``k`` most probable labels.
            Row 0 holds the expectation when "None" is predicted as well,
            row 1 the expectation without "None".
        """
        expectations = []
        P = np.sort(P)[::-1]

        n = np.array(P).shape[0]
        # DP_C[i, j] follows the recurrence "label j is positive and i - 1 of
        # the first j - 1 are" + "label j is negative and i of the first j - 1
        # are", i.e. it accumulates the probability that exactly i of the
        # first j labels are positive.
        DP_C = np.zeros((n + 2, n + 1))
        if pNone is None:
            pNone = (1.0 - P).prod()

        DP_C[0][0] = 1.0
        # Stops at n - 1, so DP_C[0][n] stays 0. Row 0 is only ever read
        # multiplied by k1 == 0 in the summation below, so this has no effect
        # on the result.
        for j in range(1, n):
            DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]

        for i in range(1, n + 1):
            DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
            for j in range(i + 1, n + 1):
                DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]

        # DP_S / DP_SNone start as 1/i and 1/(i + 1); the extra 1 in the
        # denominator of the "None" variant accounts for the additional
        # predicted "None" label.
        DP_S = np.zeros((2 * n + 1,))
        DP_SNone = np.zeros((2 * n + 1,))
        for i in range(1, 2 * n + 1):
            DP_S[i] = 1. / (1. * i)
            DP_SNone[i] = 1. / (1. * i + 1)
        # Walk k from n down to 0; after each k the two helper vectors are
        # updated in place with the probability of label k before moving on.
        for k in range(n + 1)[::-1]:
            f1 = 0
            f1None = 0
            for k1 in range(n + 1):
                f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
                f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
            for i in range(1, 2 * k - 1):
                DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
                DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
            # The second summand is the contribution of the case in which
            # "None" is the correct answer.
            expectations.append([f1None + 2 * pNone / (2 + k), f1])

        # Collected for k = n .. 0; reverse to k = 0 .. n and put k on the columns.
        return np.array(expectations[::-1]).T

    @staticmethod
    def maximize_expectation(P, pNone=None):
        """Return the choice with the highest expected F1.

        Returns
        -------
        tuple ``(best_k, predNone, max_f1)``
            ``best_k`` is the number of top-ranked labels to predict,
            ``predNone`` says whether "None" should be predicted as well and
            ``max_f1`` is the corresponding expected F1.
        """
        expectations = F1Optimizer.get_expectations(P, pNone)

        # Position of the maximum as (row, column) = (None variant, k).
        ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
        max_f1 = expectations[ix_max]

        # Row 0 of the expectation matrix is the "with None" variant.
        predNone = True if ix_max[0] == 0 else False
        best_k = ix_max[1]

        return best_k, predNone, max_f1

    @staticmethod
    def _F1(tp, fp, fn):
        """Return F1 from true positive, false positive and false negative counts."""
        return 2 * tp / (2 * tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        """Return F-beta from the confusion counts; ``beta=1.0`` gives F1."""
        beta_squared = beta ** 2
        return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn)


def print_best_prediction(P, pNone=None):
    """Print the label set with the highest expected F1 for the posteriors ``P``.

    The probabilities are ranked in descending order and named L1, L2, ...
    in that order. When ``pNone`` is omitted it is estimated as the product
    of ``(1 - p)`` over all posteriors. Nothing is returned.
    """
    print("Maximize F1-Expectation")
    print("=" * 23)

    ranked = np.sort(P)[::-1]
    count = ranked.shape[0]
    labels = ['L{}'.format(rank) for rank in range(1, count + 1)]

    if pNone is None:
        print("Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)")
        pNone = (1.0 - ranked).prod()

    posterior_strs = []
    for label, prob in zip(labels, ranked):
        posterior_strs.append('p({}|x)={}'.format(label, prob))
    print("Posteriors: {} (n={})".format(posterior_strs, count))
    print("p(None|x)={}".format(pNone))

    best_k, pred_none, f1_max = F1Optimizer.maximize_expectation(ranked, pNone)
    # "None" goes first when it is part of the optimum, followed by the top-k labels.
    best_prediction = (['None'] if pred_none else []) + labels[:best_k]

    print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max))



In [4]:

def prepare_valid_preds_faron(valid_X,valid_probs,valid_predicted_none_X,threshold):
    """Build the per-order submission strings using the Python F1 optimiser.

    Parameters
    ----------
    valid_X : pandas.DataFrame
        One row per (order, candidate product); must contain ``order_id``
        and ``product_id``.
    valid_probs : array-like
        Reorder probability for every row of ``valid_X``, in row order.
    valid_predicted_none_X : pandas.DataFrame
        Must contain the columns ``order_id`` and ``predicted_none``
        (1 when the separate None classifier flags the order).
    threshold : float
        Probabilities at or below this value are passed to the optimiser as 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by integer ``order_id`` with one column ``products``: the
        space-separated product ids, prefixed by ``None`` for flagged orders,
        or just ``None`` when nothing is predicted.
    """
    valid_predicted_X = valid_X.copy()
    valid_predicted_X.loc[:,'reordered'] = valid_probs

    valid_predicted_X = valid_predicted_X.merge(valid_predicted_none_X[['order_id','predicted_none']],on='order_id',how='left')
    # Highest probability first inside each order, so the first k rows are the top k.
    valid_predicted_X.sort_values(['order_id','reordered'],ascending=False,inplace=True)

    order_ids = []
    product_strings = []
    for order_id, group in valid_predicted_X.groupby('order_id'):
        probs = [prob if prob>threshold else 0 for prob in group.reordered.tolist()]
        # Run the O(n^2) optimiser once per order; only the basket size is used.
        # The optimiser's own "None" decision is not used; "None" comes from
        # the dedicated None classifier instead.
        top_k = F1Optimizer.maximize_expectation(probs)[0]
        chosen = [str(item) for item in group.product_id.tolist()[:top_k]]

        # Read the None flag from the column merged in above. It is keyed by
        # order_id, so it does not depend on how valid_predicted_none_X
        # happens to be indexed.
        if (group.predicted_none == 1).any():
            chosen.insert(0, 'None')

        order_ids.append(order_id)
        # An order with nothing predicted must be submitted as 'None', never
        # as an empty string.
        product_strings.append(' '.join(chosen) if chosen else 'None')

    valid_submit = pd.DataFrame({'products': product_strings},
                                index=pd.Index(order_ids, name='order_id').astype(int))

    print('valid_submit:',valid_submit.shape)
    print('valid_submit:',valid_submit[:20])

    return valid_submit


In [5]:
def prepare_none_data(train_X,valid_X,test_X):
    """Collapse three product-level frames into one row per user with a None label.

    For each input frame a ``None_order`` flag is computed per user (true when
    the largest ``reordered`` value of that user is not 1), merged back, and
    every column is then averaged per ``user_id``.

    Returns the tuple ``(train_none_X, valid_none_X, test_none_X)``.
    """
    def _collapse_to_users(frame):
        # One flag per user: no row of that user has reordered == 1.
        none_flags = frame.groupby('user_id').agg({'reordered' : lambda x: 1!=max(x) }).reset_index()
        none_flags.rename(index=str,columns={'reordered':'None_order'},inplace=True)
        with_flag = frame.merge(right=none_flags,on='user_id',how='left')
        # Averaging turns the product-level rows into a single row per user.
        return with_flag.groupby('user_id').mean().reset_index(inplace=False)

    train_none_X, valid_none_X, test_none_X = (
        _collapse_to_users(part) for part in (train_X, valid_X, test_X)
    )

    print('Percent None orders')
    print(np.mean(train_none_X.None_order))

    for label, frame in (('train_none_X size', train_none_X),
                         ('valid_none_X size', valid_none_X),
                         ('test_none_X size', test_none_X)):
        print(label, frame.shape)

    return train_none_X,valid_none_X,test_none_X

In [6]:
def prepare_none_data_test(test_X):
    """Collapse the product-level test frame into one row per user.

    A ``None_order`` flag is computed per user (true when the largest
    ``reordered`` value of that user is not 1), merged back, and every column
    is averaged per ``user_id``. Returns the resulting frame.
    """
    per_user = test_X.groupby('user_id')
    none_flags = per_user.agg({'reordered' : lambda x: 1!=max(x) }).reset_index()
    none_flags.rename(index=str,columns={'reordered':'None_order'},inplace=True)

    with_flag = test_X.merge(right=none_flags,on='user_id',how='left')
    # Averaging turns the product-level rows into a single row per user.
    test_none_X = with_flag.groupby('user_id').mean().reset_index(inplace=False)

    print('test_none_X size',test_none_X.shape)

    return test_none_X

In [7]:
from jnius import autoclass


def prepare_valid_preds(valid_X,valid_probs,valid_predicted_none_X,threshold):
    """Build the per-order submission strings using the Java ``FScore`` optimiser.

    Parameters
    ----------
    valid_X : pandas.DataFrame
        One row per (order, candidate product); ``user_id``, ``order_id``
        and ``product_id`` are read.
    valid_probs : array-like
        Reorder probability for every row of ``valid_X``, in row order.
    valid_predicted_none_X : pandas.DataFrame
        Provides the ``predicted_none`` column used as a boolean mask on the
        submission; presumably indexed by ``order_id`` so that the mask
        aligns (verify against the caller).
    threshold : float
        Probabilities at or below this value are passed to the optimiser as 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by integer ``order_id`` with one string column ``products``.
    """

    

    # Java class loaded through pyjnius; it has to be on the JVM classpath.
    FScore = autoclass('FScore')

    fscore = FScore()

    # f1 score based on maximum f1 score optimization

    valid_predicted_X = valid_X[['user_id','order_id','product_id']].copy()
    valid_predicted_X.loc[:,'reordered'] = valid_probs
    # Predictions with catboost
    #valid_predicted_X.loc[:,'reordered'] = cb_model.predict(valid_XX,prediction_type='RawFormulaVal')

    # One optimiser call per order. The result is consumed below as one label
    # per product, with 1 meaning "include in the basket"
    # (presumed from the zip/label==1 usage; TODO confirm against the Java source).
    a = valid_predicted_X.groupby('order_id').apply(lambda x: 
                                                    fscore.max_expected_fscore_preds_cube([prob if prob>threshold else 0 for prob in x.reordered.tolist()],1))
    a = pd.DataFrame(a,columns=['labels'])
    b = pd.DataFrame(valid_predicted_X.groupby('order_id').apply(lambda x: x.product_id.tolist()),columns=['products'])
    b['labels'] = a['labels']
    # Keep the products whose label is 1 and join them with single spaces.
    valid_submit_pros = pd.DataFrame(b.apply(lambda x: ' '.join([str(item) for item,label in zip(x['products'],x['labels']) if label==1]),axis=1),columns=['products'])
    valid_submit_pros.index = valid_submit_pros.index.astype(int)
    valid_submit_pros.head()

    #ADD NONE TO ORDER WITH NO PREDICTED PRODUCTS
    # Orders whose largest (unthresholded) probability is exactly 0 get an
    # extra row with an empty product string.
    # NOTE(review): these orders are also present in valid_submit_pros, so the
    # concat below appears to yield duplicate index entries for them - confirm.
    valid_submit_nones = valid_predicted_X.groupby('order_id').agg({'reordered':'max','product_id':'min'})
    valid_submit_nones = valid_submit_nones[valid_submit_nones.reordered==0]
    valid_submit_nones.drop('reordered',axis=1,inplace=True)
    valid_submit_nones.rename(index=str, columns={'product_id':'products'},inplace=True)
    valid_submit_nones.index = valid_submit_nones.index.astype(int)
    valid_submit_nones.products=''

    valid_submit = pd.concat([valid_submit_pros,valid_submit_nones])
    valid_submit.sort_index(inplace=True)
    valid_submit.products = valid_submit.products.astype(str)


    # Add extra none order prediction

    # Prefix 'None ' where the None classifier fired for the order.
    valid_submit.loc[valid_predicted_none_X['predicted_none']==1,'products'] = 'None ' + valid_submit['products']


    # Orders that still have no product at all are submitted as plain 'None'.
    valid_submit.loc[valid_submit['products']=='','products'] = 'None'

    # Removes the trailing blank left by the 'None ' prefix on empty baskets.
    valid_submit['products'] = valid_submit['products'].str.strip()

    print('valid_submit:',valid_submit.shape)
    print('valid_submit:',valid_submit[:20])
    
    return valid_submit

In [8]:
def prepare_ground_truth_simple(valid_ordered_products):
    """Build the ground-truth submission frame from labelled product rows.

    Parameters
    ----------
    valid_ordered_products : pandas.DataFrame
        Must contain ``user_id``, ``order_id``, ``product_id`` and
        ``reordered``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``order_id`` (sorted), one string column ``products``
        holding the space-separated ids of the rows with ``reordered == 1``,
        or ``'None'`` when a user has no such row. The order id is taken from
        the first row of each user. An empty input yields an empty frame.
    """
    #PROCESS GROUND TRUTH

    print('Reordered distribution:')
    print(np.mean(valid_ordered_products.reordered))
    train_gtl = []

    for uid, subset in valid_ordered_products.groupby('user_id'):
        oid = subset.order_id.values[0]
        reordered_ids = subset.loc[subset.reordered == 1, 'product_id'].values

        if len(reordered_ids) == 0:
            train_gtl.append((oid, 'None'))
            continue

        train_gtl.append((oid, ' '.join(str(int(e)) for e in reordered_ids)))

    print(len(train_gtl))
    # Naming the columns in the constructor also works when train_gtl is
    # empty; assigning .columns afterwards would raise a length mismatch.
    df_valid_gt = pd.DataFrame(train_gtl, columns=['order_id', 'products'])
    df_valid_gt.set_index('order_id', inplace=True)
    df_valid_gt.sort_index(inplace=True)

    print(df_valid_gt.shape)
    df_valid_gt.products = df_valid_gt.products.astype(str)

    print('df_valid_gt:',df_valid_gt.shape)
    print('df_valid_gt:',df_valid_gt[:5])

    return df_valid_gt

In [9]:
def prepare_ground_truth(train,valid_X):
    """Build the ground-truth submission frame for the users present in ``valid_X``.

    Parameters
    ----------
    train : pandas.DataFrame
        Labelled product rows; must contain ``user_id``, ``order_id``,
        ``product_id`` and ``reordered``.
    valid_X : pandas.DataFrame
        Only its ``user_id`` column is read; it selects the users to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``order_id`` (sorted), one string column ``products``
        holding the space-separated ids of the rows with ``reordered == 1``,
        or ``'None'`` when a user has no such row. When no user matches, an
        empty frame is returned.
    """
    #PROCESS GROUND TRUTH

    # Membership test on the column itself; no Python list and no
    # frame-wide boolean mask are needed.
    valid_ordered_products = train[train.user_id.isin(valid_X.user_id.unique())]

    print('regenerating')
    train_gtl = []

    for uid, subset in valid_ordered_products.groupby('user_id'):
        oid = subset.order_id.values[0]
        reordered_ids = subset.loc[subset.reordered == 1, 'product_id'].values

        if len(reordered_ids) == 0:
            train_gtl.append((oid, 'None'))
            continue

        train_gtl.append((oid, ' '.join(str(int(e)) for e in reordered_ids)))

    print(len(train_gtl))
    # Naming the columns in the constructor also works when train_gtl is
    # empty; assigning .columns afterwards would raise a length mismatch.
    df_valid_gt = pd.DataFrame(train_gtl, columns=['order_id', 'products'])
    df_valid_gt.set_index('order_id', inplace=True)
    df_valid_gt.sort_index(inplace=True)

    print(df_valid_gt.shape)
    df_valid_gt.products = df_valid_gt.products.astype(str)

    print('df_valid_gt:',df_valid_gt.shape)
    print('df_valid_gt:',df_valid_gt[:5])

    return df_valid_gt

In [10]:
def print_F1_score(valid_submit,df_valid_gt):
    """Print the mean per-order F1 between predictions and ground truth.

    Both frames need a string column ``products`` with space-separated ids
    (``None`` counts as an item of its own). Rows are paired positionally
    after sorting both frames by index, so both are expected to cover the
    same orders. Nothing is returned.
    """
    f1 = []
    for gt, pred in zip(df_valid_gt.sort_index().products, valid_submit.sort_index().products):
        lgt = gt.replace("None", "-1").split(' ')
        lpred = pred.replace("None", "-1").split(' ')

        hits = len(np.intersect1d(lgt, lpred))
        # Built-in float: the np.float alias no longer exists in current NumPy.
        precision = float(hits) / len(lpred)
        recall = float(hits) / len(lgt)

        denom = precision + recall
        # No overlap at all gives F1 = 0 instead of a division by zero.
        f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)

    valid_score = np.mean(f1)
    print('validation score:',valid_score)

    return

In [11]:
# Prepare None predictions

def prepare_None_predictions(valid_none_X,valid_none_probs,none_threshold):
    """Turn None-order probabilities into 0/1 flags indexed by ``order_id``.

    A flag is 1 where the probability is strictly greater than
    ``none_threshold``. The returned frame is sorted and indexed by
    ``order_id`` and keeps the columns ``user_id`` and ``predicted_none``.
    """
    flags = (valid_none_probs > none_threshold).astype(int)

    valid_predicted_none_X = valid_none_X[['user_id','order_id']].copy()
    valid_predicted_none_X.loc[:,'predicted_none'] = flags
    valid_predicted_none_X = valid_predicted_none_X.sort_values('order_id').set_index('order_id')

    print('valid_predicted_none_X.shape:',valid_predicted_none_X.shape)

    return valid_predicted_none_X
    

In [12]:
# Prepare None predictions

def prepare_None_predictions_faron(valid_none_X,valid_none_probs,none_threshold):
    """Turn None-order probabilities into 0/1 flags, keeping ``order_id`` as a column.

    A flag is 1 where the probability is strictly greater than
    ``none_threshold``. The returned frame is sorted by ``order_id`` and has
    the columns ``user_id``, ``order_id`` and ``predicted_none``; the original
    row labels are kept as the index.
    """
    flags = (valid_none_probs > none_threshold).astype(int)

    valid_predicted_none_X = valid_none_X[['user_id','order_id']].copy()
    valid_predicted_none_X.loc[:,'predicted_none'] = flags
    valid_predicted_none_X = valid_predicted_none_X.sort_values('order_id')
    #valid_predicted_none_X.set_index('order_id', inplace=True)

    print('valid_predicted_none_X.shape:',valid_predicted_none_X.shape)

    return valid_predicted_none_X

In [13]:
# TEST DATA FOR xgboost
# F1 SCORE MAXIMIZATION THRESHOLD PREDICTIONS
# None PREDICTIONS

from jnius import autoclass

 

def prepare_test_predictions(test_X,text_none_X,test_probs,test_none_probs,none_threshold,threshold):
    """Build the test submission and write it to ``test_submit.csv``.

    Parameters
    ----------
    test_X : pandas.DataFrame
        One row per (order, candidate product); ``user_id``, ``order_id``
        and ``product_id`` are read.
    text_none_X : pandas.DataFrame
        User-level frame of the None model; must contain ``order_id``.
        (The parameter name is kept as it was for compatibility with
        existing callers.)
    test_probs : array-like
        Reorder probability for every row of ``test_X``, in row order.
    test_none_probs : array-like
        None-order probability for every row of ``text_none_X``.
    none_threshold : float
        An order is flagged as None when its probability is strictly greater.
    threshold : float
        Reorder probabilities at or below this value are passed to the
        optimiser as 0.

    Returns
    -------
    None
        The submission is printed (shape and first rows) and written to
        ``test_submit.csv`` in the working directory.
    """
    # Use the frame that was passed in. The body used to read a global named
    # test_none_X and silently ignored this argument.
    test_none_X = text_none_X

    valid_predicted_none_X = test_none_X.copy()
    valid_predicted_none_X.loc[:,'predicted_none'] = ( test_none_probs> none_threshold).astype(int)
    valid_predicted_none_X.sort_values('order_id',inplace=True)
    valid_predicted_none_X.set_index('order_id', inplace=True)
    print(valid_predicted_none_X.shape)

    # Java class loaded through pyjnius; it has to be on the JVM classpath.
    FScore = autoclass('FScore')

    fscore = FScore()

    valid_predicted_X = test_X[['user_id','order_id','product_id']].copy()
    valid_predicted_X.loc[:,'reordered'] = test_probs

    # One optimiser call per order. The result is consumed below as one label
    # per product, with 1 meaning "include in the basket"
    # (presumed from the zip/label==1 usage; TODO confirm against the Java source).
    a = valid_predicted_X.groupby('order_id').apply(lambda x: 
                                                    fscore.max_expected_fscore_preds_cube([prob if prob>threshold else 0 for prob in x.reordered.tolist()],1))
    a = pd.DataFrame(a,columns=['labels'])
    b = pd.DataFrame(valid_predicted_X.groupby('order_id').apply(lambda x: x.product_id.tolist()),columns=['products'])
    b['labels'] = a['labels']
    valid_submit_pros = pd.DataFrame(b.apply(lambda x: ' '.join([str(item) for item,label in zip(x['products'],x['labels']) if label==1]),axis=1),columns=['products'])
    valid_submit_pros.index = valid_submit_pros.index.astype(int)

    #ADD NONE TO ORDER WITH NO PREDICTED PRODUCTS
    # Orders whose largest (unthresholded) probability is exactly 0 get an
    # extra row with an empty product string.
    valid_submit_nones = valid_predicted_X.groupby('order_id').agg({'reordered':'max','product_id':'min'})
    valid_submit_nones = valid_submit_nones[valid_submit_nones.reordered==0]
    valid_submit_nones.drop('reordered',axis=1,inplace=True)
    valid_submit_nones.rename(index=str, columns={'product_id':'products'},inplace=True)
    valid_submit_nones.index = valid_submit_nones.index.astype(int)
    valid_submit_nones.products=''

    valid_submit = pd.concat([valid_submit_pros,valid_submit_nones])
    valid_submit.sort_index(inplace=True)
    valid_submit.products = valid_submit.products.astype(str)


    # Add extra none order prediction

    # Prefix 'None ' where the None classifier fired for the order.
    valid_submit.loc[valid_predicted_none_X['predicted_none']==1,'products'] = 'None ' + valid_submit['products']

    # Orders that still have no product at all are submitted as plain 'None'.
    valid_submit.loc[valid_submit['products']=='','products'] = 'None'

    # Removes the trailing blank left by the 'None ' prefix on empty baskets.
    valid_submit['products'] = valid_submit['products'].str.strip()

    print(valid_submit.shape)
    valid_submit.to_csv('test_submit.csv')
    print(valid_submit[:5])

# Predictions with lightGBM

In [None]:
# Shrink the data: take the first of 3 user-grouped folds and keep only its
# validation part (roughly a third of the rows, with whole users kept together).
train_index, valid_index = next(
    GroupKFold(n_splits=3).split(
        all_train_X.drop(
            ['eval_set', 'user_id', 'product_id', 'order_id', 'reordered'],
            axis=1,
        ),
        all_train_X.reordered,
        groups=all_train_X.user_id,
    )
)
all_train_X = all_train_X.iloc[valid_index, :]

In [None]:
#

import lightgbm as lgb

#################################

# Parameters of the XGBoost model that predicts whether an order is a
# "None" order (one row per user).
xgb_none_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"
    ,"eta"              : 0.05
    ,"max_depth"        : 5
    ,"min_child_weight" :14
    ,"gamma"            :0.05
    ,"subsample"        :0.85
    ,"colsample_bytree" :0.75
    ,"alpha"            :2e-05
    ,"lambda"           :10
}

# LightGBM boosting rounds and the per-round learning rate schedule:
# 0.05 for the first 350 rounds, 0.01 for the remaining ones.
num_round = 620
learning_rates= [0.05 if x<350 else 0.01 for x in range(num_round)]

# Parameters of the LightGBM model that predicts per-product reorder probability.
light_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
     'early_stopping_rounds':10,
    'metric': {'binary_logloss'},
    'num_leaves': 256,
    'min_sum_hessian_in_leaf': 20,
    'max_depth': -1,
    'feature_fraction': 0.50,
    # 'bagging_fraction': 0.9,
    # 'bagging_freq': 3,
    'verbose': 1,
    'min_gain_to_split':0,
    'max_bin':255
}


# An order is flagged as "None" when its predicted probability exceeds this value.
none_threshold = 0.18

n_splits = 4
group_kfold = GroupKFold(n_splits=n_splits)

# One row of test predictions per fold. np.ndarray only allocates: rows that
# are never assigned keep arbitrary values.
test_probs = np.ndarray(shape=(n_splits,test_X.shape[0]))
# The None model works per user, hence one column per distinct user_id.
test_none_probs = np.ndarray(shape=(n_splits,test_X.groupby('user_id').size().shape[0]))


# Number of folds completed so far; also the row to fill next.
split_index = 0
#stopping_rounds = [400,500,400,500]

# Cross-validation over user-grouped folds. Each fold trains the XGBoost None
# model and the LightGBM reorder model, scores them on the validation part and
# stores the fold's test predictions in row `split_index` of the two arrays.
for train_index, valid_index in group_kfold.split(all_train_X.drop(['eval_set', 'user_id', 
                                            'product_id', 'order_id',
                                            'reordered'], axis=1),
                                                  all_train_X.reordered,groups=all_train_X.user_id):
    
    #train_X = all_train_X.iloc[train_index,:]
    valid_X = all_train_X.iloc[valid_index,:]
    
    # User-level frames (one row per user) carrying the None_order label.
    train_none_X,valid_none_X,test_none_X = prepare_none_data(all_train_X.iloc[train_index,:],valid_X,test_X)
    
    #####################################################################
    #Start training for none_orders
    d_train_none = xgb.DMatrix(train_none_X.drop([ 'user_id', 
                                        'None_order','order_id','reordered'], axis=1), train_none_X.None_order)

    d_valid_none = xgb.DMatrix(valid_none_X.drop(['user_id', 
                                            'None_order','order_id','reordered'], axis=1), valid_none_X.None_order)

    # The validation fold is watched so that early stopping can act on it.
    watchlist= [(d_valid_none, 'valid')]

    bst_none = xgb.train(params=xgb_none_params, 
                        dtrain=d_train_none, 
                        num_boost_round=811, evals=watchlist, 
                        early_stopping_rounds=10, 
                        verbose_eval=100)
    
    d_test_none = xgb.DMatrix(test_none_X.drop(['user_id', 
                                        'None_order','order_id','reordered'], axis=1))
    
    valid_none_probs = bst_none.predict(d_valid_none)
    
    ####################################################################
    # Start training for LightGBM
    lgb_train_data = lgb.Dataset(all_train_X.iloc[train_index,:].drop(['eval_set', 'user_id', 
                                            'product_id', 'order_id',
                                            'reordered'], axis=1), label=all_train_X.iloc[train_index,:].reordered)

    lgb_valid_data = lgb.Dataset(valid_X.drop(['eval_set', 'user_id', 
                                            'product_id', 'order_id',
                                            'reordered'], axis=1), label=valid_X.reordered)

    lgb_bst = lgb.train(light_params, 
                        lgb_train_data, 
                        num_round, 
                        valid_sets=[lgb_valid_data],
                        verbose_eval=20,learning_rates=learning_rates)

    valid_probs = lgb_bst.predict(valid_X.drop(['eval_set', 'user_id', 
                                            'product_id', 'order_id',
                                            'reordered'], axis=1))

    ####################################################################################
    
    # Validation F1 of this fold (threshold 0: no probability is zeroed out).
    valid_predicted_none_X = prepare_None_predictions(valid_none_X,valid_none_probs,none_threshold)
    valid_submit = prepare_valid_preds(valid_X,valid_probs,valid_predicted_none_X,0)
    df_valid_gt = prepare_ground_truth_simple(valid_X)
    print_F1_score(valid_submit,df_valid_gt)
    
    test_probs[split_index,:] = lgb_bst.predict(test_X.drop(['eval_set', 'user_id', 
                                            'product_id', 'order_id',
                                            'reordered'], axis=1))
    test_none_probs[split_index,:] = bst_none.predict(d_test_none)
    # Checkpoint after every fold; the file always holds the full
    # (n_splits, n) arrays, so rows of folds not yet run are not meaningful.
    np.savez('10CV_light_probs.npz',  test_probs=test_probs,test_none_probs=test_none_probs)

    print('finished '+str(split_index)+' split')
    print()
    
    split_index+=1
    
    # Only the first fold is run for now.
    if split_index==1:
        break

# Average over the folds that actually ran. Slicing with split_index leaves
# out the never-filled rows and keeps the result a 1-D vector with one entry
# per test row; averaging an already 1-D array over axis 0 would collapse
# everything into a single number.
test_none_probs = np.mean(test_none_probs[:split_index],axis=0)
test_probs = np.mean(test_probs[:split_index],axis=0)
    
#prepare_test_predictions(test_X,test_none_X,test_probs,test_none_probs,none_threshold) 

In [16]:
# Predict with a different None threshold
# Reload the checkpointed fold predictions so that a different None threshold
# can be tried without retraining.
probs_dict = np.load('10CV_light_probs.npz')
test_probs=probs_dict['test_probs']
test_none_probs=probs_dict['test_none_probs']
# Row 0 holds the predictions of the first fold.
test_probs = test_probs[0]
test_none_probs = test_none_probs[0]
none_threshold = 0.19
# The last argument (0.005) is the reorder-probability threshold; the call
# writes test_submit.csv.
prepare_test_predictions(test_X,test_none_X,test_probs,test_none_probs,none_threshold,0.005) 

(75000, 139)
(75000, 1)
                                                   products
order_id                                                   
17                      21709 47766 38777 21463 26429 13107
34        13176 47766 47792 21137 43504 39180 39475 1608...
137                 41787 24852 5134 38689 25890 2326 23794
182       47209 11520 39275 13629 47672 5479 33000 41149...
257       49235 24852 27966 37646 21137 24838 27104 4501...


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the gain-based feature importance of the last trained LightGBM booster;
# features with zero importance are left out and all remaining ones are shown.
ax = lgb.plot_importance(lgb_bst, height=0.2, xlim=None, ylim=None, title='Feature importance', 
                         xlabel='Feature importance', ylabel='Features', importance_type='gain', 
                         max_num_features=None, ignore_zero=True, figsize=(20,20), grid=False)

plt.show()