<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

https://www.kaggle.com/paulantoine/light-gbm-benchmark-0-3692

This scripts considered all the products a user has ordered.

We train a model computing the probability of reorder on the "train" data.

For the submission, we keep the orders that have a probability of re-order higher than a threshold.

In [1]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
IDIR = os.path.join('..', 'all/')


print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('products {}: {}'.format(products.shape, ', '.join(products.columns)))
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

loading prior
loading train
loading orders
loading products
products (49688, 3): product_id, aisle_id, department_id
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [2]:
def compute_product_features():
    prods = pd.DataFrame()
    prods['orders'] = priors.groupby('product_id').size().astype(np.int32)
    prods['reorders'] = priors['reordered'].groupby(priors['product_id']).sum().astype(np.float32)
    prods['reorder_rate'] = prods['reorders'] / prods['orders'].astype(np.float32)
    return prods


prods = compute_product_features()
prods.head()

Unnamed: 0_level_0,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1852,1136.0,0.613391
2,90,12.0,0.133333
3,277,203.0,0.732852
4,329,147.0,0.446809
5,15,9.0,0.6


In [3]:
# add product's historical performance (calculated from the prior table)
# to the product table as additional features
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods
products.head()

Unnamed: 0_level_0,product_id,aisle_id,department_id,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,61,19,1852.0,1136.0,0.613391
2,2,104,13,90.0,12.0,0.133333
3,3,94,7,277.0,203.0,0.732852
4,4,38,1,329.0,147.0,0.446809
5,5,5,13,15.0,9.0,0.6


In [4]:
print('add order info to priors')

# join with the orders table to get order-level features
# for each order id
orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)
priors.head()

add order info to priors


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [5]:
def compute_user_features():
    usr = pd.DataFrame()
    usr['average_days_between_orders'] = (orders.
                                          groupby('user_id')['days_since_prior_order'].
                                          mean().
                                          astype(np.float32))
    usr['nb_orders'] = orders.groupby('user_id').size().astype(np.int16)

    # note that the priors table has already been joined with the order table,
    # hence we can perform aggregation on the user_id to get user-product level
    # statistics per user
    users = pd.DataFrame()
    users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
    users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
    users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

    # i assume we can do a concatenation instead of a join here
    # since the index user is already sorted
    users = users.join(usr)
    del usr
    users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
    return users


users = compute_user_features()
print('dimension: ', users.shape)
users.head()

dimension:  (206209, 6)


Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.0,11,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15,13.0
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.0,13,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.0,6,3.0
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.5,5,7.4


In [6]:
priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [7]:
userXproduct = priors.copy()
userXproduct['user_product'] = userXproduct.product_id + userXproduct.user_id * 100000

userXproduct = userXproduct.sort_values('order_number')
userXproduct = userXproduct \
    .groupby('user_product', sort=False) \
    .agg({'order_id': ['size', 'last'], 'add_to_cart_order': 'sum'})
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
userXproduct.astype(
    {'nb_orders': np.int16, 'last_order_id': np.int32, 'sum_pos_in_cart': np.int16}, 
    inplace=True)

userXproduct.head()

Unnamed: 0_level_0,nb_orders,last_order_id,sum_pos_in_cart
user_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8623906075,1,1520399,14
15429341329,1,2049062,3
15429323081,3,1489630,8
15429321527,3,2251505,14
15429335050,4,2251505,18


In [8]:
userXproduct.loc[6075 + 86239 * 100000]

nb_orders                1
last_order_id      1520399
sum_pos_in_cart         14
Name: 8623906075, dtype: int64

In [9]:
userXproduct.loc[41329 + 154293 * 100000]

nb_orders                1
last_order_id      2049062
sum_pos_in_cart          3
Name: 15429341329, dtype: int64

In [10]:
del priors

### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

# train contains the order id and product id of the next purchase,
# 
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)
train.head()

split orders : train, test


Unnamed: 0_level_0,Unnamed: 1_level_0,order_id,product_id,add_to_cart_order,reordered
order_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,49302,1,49302,1,1
1,11109,1,11109,2,1
1,10246,1,10246,3,0
1,49683,1,49683,4,0
1,43633,1,43633,5,1


In [11]:
train_orders.head()

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1187899,1187899,1,train,11,4,8,14.0
1492625,1492625,2,train,15,1,11,30.0
2196797,2196797,5,train,5,0,11,6.0
525192,525192,7,train,21,2,11,6.0
880375,880375,8,train,4,1,14,10.0


In [12]:
users.head()

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.0,11,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15,13.0
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.0,13,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.0,6,3.0
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.5,5,7.4


In [13]:
order_list = []
product_list = []
labels = []

train_index = set(train.index)     

labels_given = True
for row in train_orders.itertuples():
    user_id = row.user_id
    order_id = row.order_id

    # all the products a user has bought
    # from the prior table
    user_prods = users['all_products'][user_id]
    product_list += user_prods
    order_list += [order_id] * len(user_prods)

    if labels_given:
        # the label is whether the item is in the train data, since train
        # data records the products that were actually bought on the next purchase
        labels += [(order_id, prod) in train_index for prod in user_prods]

In [14]:
labels[:5]

[False, True, True, True, False]

In [15]:
labels = np.array(labels, dtype=np.int8)
df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
print('dimension: ', df.shape)
df.head()

dimension:  (8474661, 2)


Unnamed: 0,order_id,product_id
0,1187899,17122
1,1187899,196
2,1187899,26405
3,1187899,46149
4,1187899,14084


In [16]:
print('user related features')

# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.map.html
df['user_id'] = df.order_id.map(orders.user_id)
df['user_total_orders'] = df.user_id.map(users.nb_orders)
df['user_total_items'] = df.user_id.map(users.total_items)
df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
df['user_average_basket'] =  df.user_id.map(users.average_basket)
df.head()

user related features


Unnamed: 0,order_id,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket
0,1187899,17122,1,11,59,18,19.0,5.363636
1,1187899,196,1,11,59,18,19.0,5.363636
2,1187899,26405,1,11,59,18,19.0,5.363636
3,1187899,46149,1,11,59,18,19.0,5.363636
4,1187899,14084,1,11,59,18,19.0,5.363636


In [17]:
print('order related features')
df['order_dow'] = df.order_id.map(orders.order_dow)
df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)

# create an additional feature
df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
df.head()

order related features


Unnamed: 0,order_id,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_dow,order_hour_of_day,days_since_prior_order,days_since_ratio
0,1187899,17122,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842
1,1187899,196,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842
2,1187899,26405,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842
3,1187899,46149,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842
4,1187899,14084,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842


In [18]:
print('product related features')
df['aisle_id'] = df.product_id.map(products.aisle_id)
df['department_id'] = df.product_id.map(products.department_id)
df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
df['product_reorders'] = df.product_id.map(products.reorders)
df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
df.head()

product related features


Unnamed: 0,order_id,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_dow,order_hour_of_day,days_since_prior_order,days_since_ratio,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate
0,1187899,17122,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842,24,4,13880,9377.0,0.675576
1,1187899,196,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842,77,7,35791,27791.0,0.77648
2,1187899,26405,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842,54,17,1214,536.0,0.441516
3,1187899,46149,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842,77,7,8558,6953.0,0.812456
4,1187899,14084,1,11,59,18,19.0,5.363636,4,8,14.0,0.736842,91,16,15935,12923.0,0.810982


In [19]:
print('user_X_product related features')

df['z'] = df.user_id * 100000 + df.product_id
df.drop(['user_id'], axis=1, inplace=True)
df['UP_orders'] = df.z.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
                                             df.order_id.map(orders.order_dow)

df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
df.head()

user_X_product related features


Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_dow,order_hour_of_day,days_since_prior_order,...,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,UP_same_dow_as_last_order
0,1187899,17122,11,59,18,19.0,5.363636,4,8,14.0,...,13880,9377.0,0.675576,1,0.090909,6.0,0.090909,6,7,True
1,1187899,196,11,59,18,19.0,5.363636,4,8,14.0,...,35791,27791.0,0.77648,10,0.909091,1.4,0.909091,1,0,True
2,1187899,26405,11,59,18,19.0,5.363636,4,8,14.0,...,1214,536.0,0.441516,2,0.181818,5.0,0.181818,7,1,True
3,1187899,46149,11,59,18,19.0,5.363636,4,8,14.0,...,8558,6953.0,0.812456,3,0.272727,3.0,0.272727,1,0,True
4,1187899,14084,11,59,18,19.0,5.363636,4,8,14.0,...,15935,12923.0,0.810982,1,0.090909,2.0,0.090909,10,0,False


In [20]:
def features(selected_orders, labels_given=False):
    order_list = []
    product_list = []
    labels = []
    
    if labels_given:
        train_index = set(train.index)

    for row in selected_orders.itertuples():
        user_id = row.user_id
        order_id = row.order_id

        # all the products a user has bought
        # from the prior table
        user_prods = users['all_products'][user_id]
        product_list += user_prods
        order_list += [order_id] * len(user_prods)

        if labels_given:
            # the label is whether the item is in the train data, since train
            # data records the products that were actually bought on the next purchase
            labels += [(order_id, prod) in train_index for prod in user_prods]

    labels = np.array(labels, dtype=np.int8)
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)

    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    
    print('order related features')
    df['order_dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)

    # create an additional feature
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    
    print('user_X_product related features')
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
                                                 df.order_id.map(orders.order_dow)
    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    
    categorical_feature = ['aisle_id', 'department_id',
                           'order_hour_of_day', 'order_dow', 'UP_same_dow_as_last_order']
    for cat_col in categorical_feature:
        df[cat_col] = df[cat_col].astype('category')
    
    return df, labels

In [22]:
import time

start = time.time()

df_train, labels = features(train_orders, labels_given=True)

end = time.time()
print('elapsed: ', end - start)

print('label distribution: ', np.bincount(labels) / labels.shape[0])
print('dimension: ', df_train.shape)
df_train.head()

user related features
order related features
product related features
user_X_product related features
elapsed:  21.84188222885132
label distribution:  [0.90219975 0.09780025]
dimension:  (8474661, 23)


Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_dow,order_hour_of_day,days_since_prior_order,...,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,UP_same_dow_as_last_order
0,1187899,17122,11,59,18,19.0,5.363636,4,8,14.0,...,13880,9377.0,0.675576,1,0.090909,6.0,0.090909,6,7,True
1,1187899,196,11,59,18,19.0,5.363636,4,8,14.0,...,35791,27791.0,0.77648,10,0.909091,1.4,0.909091,1,0,True
2,1187899,26405,11,59,18,19.0,5.363636,4,8,14.0,...,1214,536.0,0.441516,2,0.181818,5.0,0.181818,7,1,True
3,1187899,46149,11,59,18,19.0,5.363636,4,8,14.0,...,8558,6953.0,0.812456,3,0.272727,3.0,0.272727,1,0,True
4,1187899,14084,11,59,18,19.0,5.363636,4,8,14.0,...,15935,12923.0,0.810982,1,0.090909,2.0,0.090909,10,0,False


In [23]:
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders',# 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last', 'order_dow', 'UP_same_dow_as_last_order']

print('number of final features: ', len(f_to_use))



print('formating for lgb')
d_train = lgb.Dataset(df_train[f_to_use], label=labels)

# del df_train

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'verbose': 1,
}
ROUNDS = 100

print('light GBM train :-)')

bst = lgb.train(params, d_train, ROUNDS)

number of final features:  20
formating for lgb
light GBM train :-)


In [24]:
feature_imp = bst.feature_importance('gain')
arg_sorted = np.argsort(feature_imp)[::-1]
print(feature_imp[arg_sorted])
print(np.array(f_to_use)[arg_sorted])

[2181625.90942001 1908937.72046661  687739.74162865  250505.82088947
  203600.18598557  108355.45936966  100741.03770065   70315.23092842
   56114.06551266   40990.41271114   38200.10739231   33637.72092247
   26051.79660988   22296.93989182   22155.05239201   17856.54462624
    9717.20280647    9603.16687202    8040.00509834    6261.17100811]
['UP_orders_since_last' 'UP_reorder_rate' 'UP_orders'
 'product_reorder_rate' 'aisle_id' 'days_since_prior_order'
 'user_total_orders' 'days_since_ratio' 'total_distinct_items'
 'user_total_items' 'product_reorders' 'user_average_basket'
 'product_orders' 'user_average_days_between_orders'
 'UP_delta_hour_vs_last' 'order_hour_of_day' 'UP_average_pos_in_cart'
 'department_id' 'UP_same_dow_as_last_order' 'order_dow']


In [None]:
hi

In [None]:
import time

start = time.time()

df_test, _ = features(test_orders)
for cat_col in categorical_feature:
    df_test[cat_col] = df_test[cat_col].astype('category')

end = time.time()
end - start

In [None]:
print('light GBM predict')
start = time.time()

preds = bst.predict(df_test[f_to_use])
df_test['pred'] = preds

end = time.time()
end - start

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

# change default style figure and font size
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

plt.hist(preds)
plt.show()

In [None]:
TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

start = time.time()
d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub.csv', index=False)

end = time.time()
end - start

In [None]:
sub.head()