<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

https://www.kaggle.com/rshally/instacart-lb-392-runs-on-kaggle-with-2nd-clf

Instead of choosing 1 single cutoff-threshold for every order, it relaxes this and repeats the prediction for multiple thresholds (e.g. instead of a global threshold at 0.21, it generates prediction at threshold 0.17, 0.21, 0.25) and a second classifier is applied to train on the prediction at different threshold to determine the best threshold for each order. The features are the second-level classifiers are ones such as min, max, mean of the predicted probability for each of three threshold (std or len could also work).

In [1]:
import os
import pandas as pd
import numpy as np
# import gc
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt

myfolder = os.path.join('..', 'all/')
print('loading files ...')

prior = pd.read_csv(myfolder + 'order_products__prior.csv', dtype={'order_id': np.uint32,
           'product_id': np.uint16, 'reordered': np.uint8, 'add_to_cart_order': np.uint8})

train_orders = pd.read_csv(myfolder + 'order_products__train.csv', dtype={'order_id': np.uint32,
           'product_id': np.uint16, 'reordered': np.int8, 'add_to_cart_order': np.uint8 })

orders = pd.read_csv(myfolder + 'orders.csv', dtype={'order_hour_of_day': np.uint8,
           'order_number': np.uint8, 'order_id': np.uint32, 'user_id': np.uint32,
           'order_dow': np.uint8, 'days_since_prior_order': np.float16})

orders.eval_set = orders.eval_set.replace({'prior': 0, 'train': 1, 'test':2}).astype(np.uint8)

# replace a user's first order's days_since_prior_order with 30 ...
orders.days_since_prior_order = orders.days_since_prior_order.fillna(30).astype(np.uint8)

products = pd.read_csv(myfolder + 'products.csv', dtype={'product_id': np.uint16,
            'aisle_id': np.uint8, 'department_id': np.uint8},
             usecols=['product_id', 'aisle_id', 'department_id'])

print('done loading')

loading files ...
done loading


In [2]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,0,1,2,8,30
1,2398795,1,0,2,3,7,15
2,473747,1,0,3,3,12,21
3,2254736,1,0,4,4,7,29
4,431534,1,0,5,4,15,28


In [3]:
train_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [4]:
print('merge prior and orders and keep train separate ...')

orders_products = orders.merge(prior, how = 'inner', on = 'order_id')
train_orders = train_orders.merge(
    orders[['user_id','order_id']], left_on = 'order_id', right_on = 'order_id', how = 'inner')

del prior

merge prior and orders and keep train separate ...


In [5]:
orders_products.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,0,1,2,8,30,196,1,0
1,2539329,1,0,1,2,8,30,14084,2,0
2,2539329,1,0,1,2,8,30,12427,3,0
3,2539329,1,0,1,2,8,30,26088,4,0
4,2539329,1,0,1,2,8,30,26405,5,0


In [6]:
print('Creating features I ...')

# sort orders and products to get the rank or the reorder frequency
prdss = orders_products.sort_values(['user_id', 'order_number', 'product_id'], ascending=True)
prdss['product_time'] = prdss.groupby(['user_id', 'product_id']).cumcount()+1

# getting products ordered first and second times to calculate probability later
sub1 = prdss[prdss['product_time'] == 1].groupby('product_id').size().to_frame('prod_first_orders')
sub2 = prdss[prdss['product_time'] == 2].groupby('product_id').size().to_frame('prod_second_orders')
sub1['prod_orders'] = prdss.groupby('product_id')['product_id'].size()
sub1['prod_reorders'] = prdss.groupby('product_id')['reordered'].sum()
sub2 = sub2.reset_index().merge(sub1.reset_index())
sub2['prod_reorder_probability'] = sub2['prod_second_orders']/sub2['prod_first_orders']
sub2['prod_reorder_ratio'] = sub2['prod_reorders']/sub2['prod_orders']
prd = sub2[['product_id', 'prod_orders','prod_reorder_probability', 'prod_reorder_ratio']]

del sub1, sub2, prdss

Creating features I ...


In [7]:
print('Creating features II ...')

# extracting prior information (features) by user
users = orders[orders['eval_set'] == 0].groupby(['user_id'])['order_number'].max().to_frame('user_orders')
users['user_period'] = orders[orders['eval_set'] == 0].groupby(['user_id'])['days_since_prior_order'].sum()
users['user_mean_days_since_prior'] = orders[orders['eval_set'] == 0].groupby(['user_id'])['days_since_prior_order'].mean()

# merging features about users and orders into one dataset
us = orders_products.groupby('user_id').size().to_frame('user_total_products')
us['eq_1'] = orders_products[orders_products['reordered'] == 1].groupby('user_id')['product_id'].size()
us['gt_1'] = orders_products[orders_products['order_number'] > 1].groupby('user_id')['product_id'].size()
us['user_reorder_ratio'] = us['eq_1'] / us['gt_1']
us.drop(['eq_1', 'gt_1'], axis = 1, inplace = True)
us['user_distinct_products'] = orders_products.groupby(['user_id'])['product_id'].nunique()

# the average basket size of the user
users = users.reset_index().merge(us.reset_index())
users['user_average_basket'] = users['user_total_products'] / users['user_orders']

us = orders[orders['eval_set'] != 0]
us = us[['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
users = users.merge(us)

del us

Creating features II ...


In [8]:
print('Finalizing features and the main data file  ...')
# merging orders and products and grouping by user and product and calculating features for the user/product combination
data = orders_products.groupby(['user_id', 'product_id']).size().to_frame('up_orders')
data['up_first_order'] = orders_products.groupby(['user_id', 'product_id'])['order_number'].min()
data['up_last_order'] = orders_products.groupby(['user_id', 'product_id'])['order_number'].max()
data['up_average_cart_position'] = orders_products.groupby(['user_id', 'product_id'])['add_to_cart_order'].mean()
data = data.reset_index()

#merging previous data with users
data = data.merge(prd, on = 'product_id')
data = data.merge(users, on = 'user_id')

#user/product combination features about the particular order
data['up_order_rate'] = data['up_orders'] / data['user_orders']
data['up_orders_since_last_order'] = data['user_orders'] - data['up_last_order']
data = data.merge(train_orders[['user_id', 'product_id', 'reordered']], 
                  how = 'left', on = ['user_id', 'product_id'])
data = data.merge(products, on = 'product_id')

del orders_products     #, orders, train_orders

Finalizing features and the main data file  ...


In [9]:
print(' Training and test data for later use in F1 optimization and training  ...')

#save the actual reordered products of the train set in a list format and then delete the original frames
train_orders = train_orders[train_orders['reordered']==1].drop('reordered',axis=1)
orders.set_index('order_id', drop=False, inplace=True)
train1=orders[['order_id','eval_set']].loc[orders['eval_set']==1]
train1['actual'] = train_orders.groupby('order_id').aggregate({'product_id':lambda x: list(x)})
train1['actual']=train1['actual'].fillna('')
n_actual = train1['actual'].apply(lambda x: len(x)).mean()   # this is the average cart size

test1=orders[['order_id','eval_set']].loc[orders['eval_set']==2]
test1['actual']=' '
traintest1=pd.concat([train1,test1])
traintest1.set_index('order_id', drop=False, inplace=True)

del orders, train_orders, train1, test1

 Training and test data for later use in F1 optimization and training  ...


In [10]:
print('setting dtypes for data ...')

#reduce the size by setting data types
data = data.astype(dtype= {'user_id' : np.uint32, 'product_id'  : np.uint16,
            'up_orders'  : np.uint8, 'up_first_order' : np.uint8, 'up_last_order' : np.uint8,
            'up_average_cart_position' : np.uint8, 'prod_orders' : np.uint16, 
            'prod_reorder_probability' : np.float16,   
            'prod_reorder_ratio' : np.float16, 'user_orders' : np.uint8,
            'user_period' : np.uint8, 'user_mean_days_since_prior' : np.uint8,
            'user_total_products' : np.uint8, 'user_reorder_ratio' : np.float16, 
            'user_distinct_products' : np.uint8, 'user_average_basket' : np.uint8,
            'order_id'  : np.uint32, 'eval_set' : np.uint8, 
            'days_since_prior_order' : np.uint8, 'up_order_rate' : np.float16, 
            'up_orders_since_last_order':np.uint8,
            'aisle_id': np.uint8, 'department_id': np.uint8})

data['reordered'].fillna(0, inplace=True)  # replace NaN with zeros (not reordered) 
data['reordered']=data['reordered'].astype(np.uint8)
data.head()

setting dtypes for data ...


Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,prod_reorder_probability,prod_reorder_ratio,user_orders,...,user_distinct_products,user_average_basket,order_id,eval_set,days_since_prior_order,up_order_rate,up_orders_since_last_order,reordered,aisle_id,department_id
0,1,196,10,1,10,1,35791,0.58252,0.776367,10,...,18,5,1187899,1,14,1.0,0,1,77,7
1,15,196,5,15,22,2,35791,0.58252,0.776367,22,...,13,3,2161313,2,7,0.227295,0,0,77,7
2,19,196,3,2,7,6,35791,0.58252,0.776367,9,...,133,22,1735923,2,8,0.333252,2,0,77,7
3,21,196,1,10,10,2,35791,0.58252,0.776367,33,...,102,6,1854765,1,28,0.030304,23,0,77,7
4,31,196,2,10,17,15,35791,0.58252,0.776367,20,...,190,14,280888,2,18,0.099976,3,0,77,7


In [11]:
print('Preparing Train and Test sets ...')

# filter by eval_set (train=1, test=2) and dropp the id's columns (not part of training features) 
# but keep prod_id and user_id in test

train = data[data['eval_set'] == 1].drop(['eval_set', 'user_id', 'product_id', 'order_id'], axis = 1)
test =  data[data['eval_set'] == 2].drop(['eval_set', 'user_id', 'reordered'], axis = 1)

check =  data.drop(['eval_set', 'user_id', 'reordered'], axis = 1)

# del data

Preparing Train and Test sets ...


In [12]:
print('preparing X,y for LightGBM ...')

X_train, X_eval, y_train, y_eval = train_test_split(
    train[train.columns.difference(['reordered'])], train['reordered'], test_size=0.1, random_state=2)

del train

preparing X,y for LightGBM ...


In [13]:
print('formatting and training LightGBM ...')

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_eval, y_eval, reference = lgb_train)

# there is some room to change the parameters and improve - I have not done it systematically

params = {'task': 'train', 'boosting_type': 'gbdt',   'objective': 'binary', 'metric': {'binary_logloss', 'auc'},
    'num_iterations' : 100, 'max_bin' : 100, 'num_leaves': 512, 'feature_fraction': 0.8,  'bagging_fraction': 0.95,
    'bagging_freq': 5, 'min_data_in_leaf' : 200, 'learning_rate' : 0.05}

# set lower num_boost_round (I used 300 instead of 50 at home) to avoid time-out on Kaggle
# , num_boost_round = 50
lgb_model = lgb.train(params, lgb_train, valid_sets = lgb_eval, early_stopping_rounds=10)

del lgb_train, X_train, y_train

formatting and training LightGBM ...




[1]	valid_0's auc: 0.815095	valid_0's binary_logloss: 0.658636
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.825001	valid_0's binary_logloss: 0.627166
[3]	valid_0's auc: 0.824316	valid_0's binary_logloss: 0.598799
[4]	valid_0's auc: 0.826171	valid_0's binary_logloss: 0.572733
[5]	valid_0's auc: 0.827208	valid_0's binary_logloss: 0.548881
[6]	valid_0's auc: 0.827635	valid_0's binary_logloss: 0.527107
[7]	valid_0's auc: 0.828114	valid_0's binary_logloss: 0.507093
[8]	valid_0's auc: 0.828224	valid_0's binary_logloss: 0.488705
[9]	valid_0's auc: 0.828617	valid_0's binary_logloss: 0.471676
[10]	valid_0's auc: 0.828754	valid_0's binary_logloss: 0.456009
[11]	valid_0's auc: 0.828649	valid_0's binary_logloss: 0.441633
[12]	valid_0's auc: 0.828876	valid_0's binary_logloss: 0.428181
[13]	valid_0's auc: 0.829105	valid_0's binary_logloss: 0.415715
[14]	valid_0's auc: 0.829211	valid_0's binary_logloss: 0.404147
[15]	valid_0's auc: 0.829336	valid_0's binary_loglo

In [16]:
data.head()

Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,prod_reorder_probability,prod_reorder_ratio,user_orders,...,user_distinct_products,user_average_basket,order_id,eval_set,days_since_prior_order,up_order_rate,up_orders_since_last_order,reordered,aisle_id,department_id
0,1,196,10,1,10,1,35791,0.58252,0.776367,10,...,18,5,1187899,1,14,1.0,0,1,77,7
1,15,196,5,15,22,2,35791,0.58252,0.776367,22,...,13,3,2161313,2,7,0.227295,0,0,77,7
2,19,196,3,2,7,6,35791,0.58252,0.776367,9,...,133,22,1735923,2,8,0.333252,2,0,77,7
3,21,196,1,10,10,2,35791,0.58252,0.776367,33,...,102,6,1854765,1,28,0.030304,23,0,77,7
4,31,196,2,10,17,15,35791,0.58252,0.776367,20,...,190,14,280888,2,18,0.099976,3,0,77,7


In [18]:
check['reordered'] = lgb_model.predict(check[check.columns.difference(
    ['order_id', 'product_id'])], num_iteration = lgb_model.best_iteration)
check.head()

Unnamed: 0,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,prod_reorder_probability,prod_reorder_ratio,user_orders,user_period,...,user_reorder_ratio,user_distinct_products,user_average_basket,order_id,days_since_prior_order,up_order_rate,up_orders_since_last_order,aisle_id,department_id,reordered
0,196,10,1,10,1,35791,0.58252,0.776367,10,206,...,0.759277,18,5,1187899,14,1.0,0,77,7,0.861507
1,196,5,15,22,2,35791,0.58252,0.776367,22,1,...,0.867676,13,3,2161313,7,0.227295,0,77,7,0.602386
2,196,3,2,7,6,35791,0.58252,0.776367,9,106,...,0.412842,133,22,1735923,8,0.333252,2,77,7,0.290349
3,196,1,10,10,2,35791,0.58252,0.776367,33,91,...,0.515137,102,6,1854765,28,0.030304,23,77,7,0.020682
4,196,2,10,17,15,35791,0.58252,0.776367,20,123,...,0.381104,190,14,280888,18,0.099976,3,77,7,0.09855


In [21]:
def combi(z,df):
    
    prd_bag = dict()
    z_bag = dict()
    for row in df.itertuples():
        if row.reordered > z:   
            try:
                prd_bag[row.order_id] += ' ' + str(row.product_id)
                z_bag[row.order_id]+= ' ' + str(int(100*row.reordered))
            except:
                prd_bag[row.order_id] = str(row.product_id)
                z_bag[row.order_id]= str(int(100*row.reordered))

    for order in df.order_id:
        if order not in prd_bag:
            prd_bag[order] = ' '
            z_bag[order] = ' '

    return prd_bag,z_bag 

In [23]:
z = 0.21
prd_bag, z_bag = combi(z,check)

AttributeError: 'dict' object has no attribute 'head'

In [25]:
ptemp = pd.DataFrame.from_dict(prd_bag, orient='index')
ptemp.reset_index(inplace=True)
ztemp = pd.DataFrame.from_dict(z_bag, orient='index')
ztemp.reset_index(inplace=True)
ptemp.columns = ['order_id', 'products']
ztemp.columns = ['order_id', 'zs']

In [26]:
ptemp.head()

Unnamed: 0,order_id,products
0,1187899,196 10258 12427 13032 25133 35951 38928 39657 ...
1,2161313,196 12427 10441 11266 14715 27839 37710
2,1735923,196 2192 12108 15131 17008 31487 34690 35123
3,2757217,196 13176 39657 6184 18023 47402
4,859654,196 13176 30450 38928 49235 43352 13575 11759 ...


In [27]:
ztemp.head()

Unnamed: 0,order_id,zs
0,1187899,86 80 80 32 73 33 37 34 63 30
1,2161313,60 34 41 23 32 28 29
2,1735923,29 26 30 32 65 36 32 27
3,2757217,66 55 38 35 34 27
4,859654,53 75 27 50 62 24 76 23 64 28


In [28]:
traintest1.head()

Unnamed: 0_level_0,order_id,eval_set,actual
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1187899,1187899,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1492625,1492625,1,"[22963, 7963, 16589, 32792, 41787, 22825, 2485..."
2196797,2196797,1,"[15349, 21413, 40706, 21616]"
525192,525192,1,"[47272, 37999, 13198, 43967, 40852, 17638, 298..."
880375,880375,1,"[15937, 23165, 21903, 41540]"


In [29]:
tt = traintest1.copy()
ptemp['list_prod'] = ptemp['products'].apply(lambda x: list(map(int, x.split())))
ztemp['list_z'] = ztemp['zs'].apply(lambda x: list(map(int, x.split())))
n_cart = ptemp['products'].apply(lambda x: len(x.split())).mean()
tt = tt.merge(ptemp,on='order_id',how='inner')
tt = tt.merge(ztemp,on='order_id',how='inner')
tt.drop(['products','zs'],axis=1,inplace=True)
tt.head()

Defaulting to column, but this will raise an ambiguity error in a future version
  """


Unnamed: 0,order_id,eval_set,actual,list_prod,list_z
0,1187899,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032...","[196, 10258, 12427, 13032, 25133, 35951, 38928...","[86, 80, 80, 32, 73, 33, 37, 34, 63, 30]"
1,1492625,1,"[22963, 7963, 16589, 32792, 41787, 22825, 2485...","[18523, 24852, 33754, 47209, 21709, 7781, 1559...","[39, 49, 37, 48, 27, 24, 39, 37, 34, 27]"
2,2196797,1,"[15349, 21413, 40706, 21616]","[11777, 26604, 27344, 8518, 40706, 24535, 4369...","[59, 60, 25, 31, 43, 49, 53, 26]"
3,525192,1,"[47272, 37999, 13198, 43967, 40852, 17638, 298...","[21137, 4920, 31683, 15592, 37999, 42803, 3217...","[59, 23, 33, 26, 22, 21, 31, 34, 57, 25, 29, 3..."
4,880375,1,"[15937, 23165, 21903, 41540]","[9839, 17794, 21903, 28985, 33640, 8193, 18531...","[27, 42, 34, 27, 23, 22, 34, 33, 33, 22, 48, 2..."


In [30]:
def f1_score_single(x):                 #from LiLi but modified to get 1 for both empty

    y_true = x.actual
    y_pred = x.list_prod
    if y_true == '' and y_pred ==[] : return 1.
    y_true = set(y_true)
    y_pred = set(y_pred)
    cross_size = len(y_true & y_pred)
    if cross_size == 0: return 0.
    p = 1. * cross_size / len(y_pred)
    r = 1. * cross_size / len(y_true)
    return 2 * p * r / (p + r)

In [31]:
tt['zavg'] = tt['list_z'].apply(lambda x: 0.01*np.mean(x) if x!=[] else 0.).astype(np.float16)
tt['zmax'] = tt['list_z'].apply(lambda x: 0.01*np.max(x) if x!=[] else 0.).astype(np.float16)
tt['zmin'] = tt['list_z'].apply(lambda x: 0.01*np.min(x) if x!=[] else 0.).astype(np.float16)
tt['f1'] = tt.apply(f1_score_single,axis=1).astype(np.float16)

In [32]:
tt.head()

Unnamed: 0,order_id,eval_set,actual,list_prod,list_z,zavg,zmax,zmin,f1
0,1187899,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032...","[196, 10258, 12427, 13032, 25133, 35951, 38928...","[86, 80, 80, 32, 73, 33, 37, 34, 63, 30]",0.547852,0.859863,0.300049,0.799805
1,1492625,1,"[22963, 7963, 16589, 32792, 41787, 22825, 2485...","[18523, 24852, 33754, 47209, 21709, 7781, 1559...","[39, 49, 37, 48, 27, 24, 39, 37, 34, 27]",0.361084,0.48999,0.23999,0.272705
2,2196797,1,"[15349, 21413, 40706, 21616]","[11777, 26604, 27344, 8518, 40706, 24535, 4369...","[59, 60, 25, 31, 43, 49, 53, 26]",0.432617,0.600098,0.25,0.333252
3,525192,1,"[47272, 37999, 13198, 43967, 40852, 17638, 298...","[21137, 4920, 31683, 15592, 37999, 42803, 3217...","[59, 23, 33, 26, 22, 21, 31, 34, 57, 25, 29, 3...",0.331299,0.589844,0.209961,0.434814
4,880375,1,"[15937, 23165, 21903, 41540]","[9839, 17794, 21903, 28985, 33640, 8193, 18531...","[27, 42, 34, 27, 23, 22, 34, 33, 33, 22, 48, 2...",0.297119,0.47998,0.219971,0.333252


In [34]:
tt = traintest1.copy()

i=0
for z in [0.17, 0.21, 0.25]:
    prd_bag,z_bag = combi(z, check)
    ptemp = pd.DataFrame.from_dict(prd_bag, orient='index')
    ptemp.reset_index(inplace=True)
    ztemp = pd.DataFrame.from_dict(z_bag, orient='index')
    ztemp.reset_index(inplace=True)
    ptemp.columns = ['order_id', 'products']
    ztemp.columns = ['order_id', 'zs']

    ptemp['list_prod'] = ptemp['products'].apply(lambda x: list(map(int, x.split())))
    ztemp['list_z'] = ztemp['zs'].apply(lambda x: list(map(int, x.split())))
    n_cart = ptemp['products'].apply(lambda x: len(x.split())).mean()
    tt = tt.merge(ptemp, on='order_id',how='inner')
    tt = tt.merge(ztemp, on='order_id',how='inner')

    tt.drop(['products','zs'],axis=1,inplace=True)
    tt['zavg'] = tt['list_z'].apply(lambda x: 0.01*np.mean(x) if x!=[] else 0.).astype(np.float16)
    tt['zmax'] = tt['list_z'].apply(lambda x: 0.01*np.max(x) if x!=[] else 0.).astype(np.float16)
    tt['zmin'] = tt['list_z'].apply(lambda x: 0.01*np.min(x) if x!=[] else 0.).astype(np.float16)
    tt['f1'] = tt.apply(f1_score_single,axis=1).astype(np.float16)
    F1 = tt['f1'].loc[tt['eval_set']==1].mean()
    tt = tt.rename(columns={'list_prod': 'prod'+str(i), 'f1': 'f1'+str(i), 'list_z': 'z'+str(i),
                'zavg': 'zavg'+str(i), 'zmax': 'zmax'+str(i),  'zmin': 'zmin'+str(i)})
    print(' z,F1,n_actual,n_cart :  ', z,F1,n_actual,n_cart)
    i=i+1


tt['fm'] = tt[['f10', 'f11', 'f12']].idxmax(axis=1)
tt['f1'] = tt[['f10', 'f11', 'f12']].max(axis=1)
tt['fm'] = tt.fm.replace({'f10': 0,'f11': 1, 'f12':2}).astype(np.uint8)
print(' f1 maximized ', tt['f1'].loc[tt['eval_set']==1].mean())

Defaulting to column, but this will raise an ambiguity error in a future version
  app.launch_new_instance()


 z,F1,n_actual,n_cart :   0.17 0.3806 6.316822778925226 10.87684339674796
 z,F1,n_actual,n_cart :   0.21 0.383 6.316822778925226 8.429282911997051
 z,F1,n_actual,n_cart :   0.25 0.3774 6.316822778925226 6.703490148344641
 f1 maximized  0.4387


In [36]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

print('Fitting the second classifier for F1 ...')

X=tt[[ 'zavg0', 'zmax0','zmin0', 'zavg1', 'zmax1', 'zmin1', 'zavg2', 'zmax2', 'zmin2']].loc[tt['eval_set']==1]
y=tt['fm'].loc[tt['eval_set']==1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

clf = GradientBoostingClassifier().fit(X_train, y_train)
print('GB Accuracy on training set: {:.2f}' .format(clf.score(X_train, y_train)))
print('Accuracy on test set: {:.2f}' .format(clf.score(X_test, y_test)))
#pd.DataFrame(clf.feature_importances_, index=X_train.columns, columns=["Importance"]).plot(kind='bar')
#plt.show()

final=tt[['order_id','prod0','prod1','prod2','zavg0']].loc[tt['eval_set']==2]
df_test=tt[[ 'zavg0', 'zmax0','zmin0', 'zavg1', 'zmax1', 'zmin1', 'zavg2', 'zmax2', 'zmin2']].loc[tt['eval_set']==2]
final['fit']= clf.predict(df_test)
final['best'] = final.apply(lambda row: row['prod0'] if row['fit']==0 else 
                                 ( row['prod1'] if row['fit']==1 else  row['prod2'] )  , axis=1)


Fitting the second classifier for F1 ...
GB Accuracy on training set: 0.60
Accuracy on test set: 0.60


In [37]:
def mylist(x):
    prodids = x.best
    zavg = x.zavg0
    if prodids == []: return 'None'            
#     if zavg < 0.5:
#         if len(prodids) == 1: return  str(prodids[0])+' None'
#         if len(prodids) == 2: return  str(prodids[0])+ ' '+ str(prodids[1]) +' None'
    return ' '.join(str(i) for i in prodids)

final['products']=final.apply(mylist,axis=1)

final[['order_id','products']].to_csv('final_submission1.csv', index=False)