In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import gc
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

import lightgbm as lgbm
from sklearn.metrics import log_loss, auc, roc_curve

# Any results you write to the current directory are saved as output.

In [2]:
def load_data(path_data):
    '''
    --------------------------------order_product--------------------------------
    * Unique in order_id + product_id
    '''
    priors = pd.read_csv(path_data + 'order_products__prior.csv', 
                     dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    train = pd.read_csv(path_data + 'order_products__train.csv', 
                    dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    '''
    --------------------------------order--------------------------------
    * This file tells us which set (prior, train, test) an order belongs
    * Unique in order_id
    * order_id in train, prior, test has no intersection
    * this is the #order_number order of this user
    '''
    orders = pd.read_csv(path_data + 'orders.csv', 
                         dtype={
                                'order_id': np.int32,
                                'user_id': np.int64,
                                'eval_set': 'category',
                                'order_number': np.int16,
                                'order_dow': np.int8,
                                'order_hour_of_day': np.int8,
                                'days_since_prior_order': np.float32})

    #  order in prior, train, test has no duplicate
    #  order_ids_pri = priors.order_id.unique()
    #  order_ids_trn = train.order_id.unique()
    #  order_ids_tst = orders[orders.eval_set == 'test']['order_id'].unique()
    #  print(set(order_ids_pri).intersection(set(order_ids_trn)))
    #  print(set(order_ids_pri).intersection(set(order_ids_tst)))
    #  print(set(order_ids_trn).intersection(set(order_ids_tst)))

    '''
    --------------------------------product--------------------------------
    * Unique in product_id
    '''
    products = pd.read_csv(path_data + 'products.csv')
    aisles = pd.read_csv(path_data + "aisles.csv")
    departments = pd.read_csv(path_data + "departments.csv")
    sample_submission = pd.read_csv(path_data + "sample_submission.csv")
    
    return priors, train, orders, products, aisles, departments, sample_submission

class tick_tock:
    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
    def __enter__(self):
        if self.verbose:
            print(self.process_name + " begin ......")
            self.begin_time = time.time()
    def __exit__(self, type, value, traceback):
        if self.verbose:
            end_time = time.time()
            print(self.process_name + " end ......")
            print('time lapsing {0} s \n'.format(end_time - self.begin_time))
            
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    '''Create statistical columns, group by [N columns] and compute stats on [N column]

       Parameters
       ----------
       df: pandas dataframe
          Features matrix
       group_columns_list: list_like
          List of columns you want to group with, could be multiple columns
       agg_dict: python dictionary

       Return
       ------
       new pandas dataframe with original columns and new added columns

       Example
       -------
       {real_column_name: {your_specified_new_column_name : method}}
       agg_dict = {'user_id':{'prod_tot_cnts':'count'},
                   'reordered':{'reorder_tot_cnts_of_this_prod':'sum'},
                   'user_buy_product_times': {'prod_order_once':lambda x: sum(x==1),
                                              'prod_order_more_than_once':lambda x: sum(x==2)}}
       ka_add_stats_features_1_vs_n(train, ['product_id'], agg_dict)
    '''
    with tick_tock("add stats features"):
        try:
            if type(group_columns_list) == list:
                pass
            else:
                raise TypeError(k + "should be a list")
        except TypeError as e:
            print(e)
            raise

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped.agg(agg_dict)
        the_stats.columns = the_stats.columns.droplevel(0)
        the_stats.reset_index(inplace=True)
        if only_new_feature:
            df_new = the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')

    return df_new

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
    '''Create statistical columns, group by [N columns] and compute stats on [1 column]

       Parameters
       ----------
       df: pandas dataframe
          Features matrix
       group_columns_list: list_like
          List of columns you want to group with, could be multiple columns
       target_columns_list: list_like
          column you want to compute stats, need to be a list with only one element
       methods_list: list_like
          methods that you want to use, all methods that supported by groupby in Pandas

       Return
       ------
       new pandas dataframe with original columns and new added columns

       Example
       -------
       ka_add_stats_features_n_vs_1(train, group_columns_list=['x0'], target_columns_list=['x10'])
    '''
    with tick_tock("add stats features", verbose):
        dicts = {"group_columns_list": group_columns_list , "target_columns_list": target_columns_list, "methods_list" :methods_list}

        for k, v in dicts.items():
            try:
                if type(v) == list:
                    pass
                else:
                    raise TypeError(k + "should be a list")
            except TypeError as e:
                print(e)
                raise

        grouped_name = ''.join(group_columns_list)
        target_name = ''.join(target_columns_list)
        combine_name = [[grouped_name] + [method_name] + [target_name] for method_name in methods_list]

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped[target_name].agg(methods_list).reset_index()
        the_stats.columns = [grouped_name] + \
                            ['_%s_%s_by_%s' % (grouped_name, method_name, target_name) \
                             for (grouped_name, method_name, target_name) in combine_name]
        if keep_only_stats:
            return the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')
        return df_new

In [3]:
path_data = 'data/'
priors, train, orders, products, aisles, departments, sample_submission = load_data(path_data)

In [4]:
# Products information ----------------------------------------------------------------
# add order information to priors set
priors_orders_detail = orders.merge(right=priors, how='inner', on='order_id')

# create new variables
# _user_buy_product_times: 用户是第几次购买该商品
priors_orders_detail.loc[:,'_user_buy_product_times'] = priors_orders_detail.groupby(['user_id', 'product_id']).cumcount() + 1
# _prod_tot_cnts: 该商品被购买的总次数,表明被喜欢的程度
# _reorder_tot_cnts_of_this_prod: 这件商品被再次购买的总次数
### 我觉得下面两个很不好理解，考虑改变++++++++++++++++++++++++++
# _prod_order_once: 该商品被购买一次的总次数
# _prod_order_more_than_once: 该商品被购买一次以上的总次数
agg_dict = {'user_id':{'_prod_tot_cnts':'count'}, 
            'reordered':{'_prod_reorder_tot_cnts':'sum'}, 
            '_user_buy_product_times': {'_prod_buy_first_time_total_cnt':lambda x: sum(x==1),
                                        '_prod_buy_second_time_total_cnt':lambda x: sum(x==2)}}
prd = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id'], agg_dict)

# _prod_reorder_prob: 这个指标不好理解
# _prod_reorder_ratio: 商品复购率
prd['_prod_reorder_prob'] = prd._prod_buy_second_time_total_cnt / prd._prod_buy_first_time_total_cnt
prd['_prod_reorder_ratio'] = prd._prod_reorder_tot_cnts / prd._prod_tot_cnts
prd['_prod_reorder_times'] = 1 + prd._prod_reorder_tot_cnts / prd._prod_buy_first_time_total_cnt

add stats features begin ......


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


add stats features end ......
time lapsing 134.66227793693542 s 



In [5]:
prd.head()


Unnamed: 0,product_id,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_buy_first_time_total_cnt,_prod_buy_second_time_total_cnt,_prod_reorder_prob,_prod_reorder_ratio,_prod_reorder_times
0,1,1852,1136.0,716,276,0.385475,0.613391,2.586592
1,2,90,12.0,78,8,0.102564,0.133333,1.153846
2,3,277,203.0,74,36,0.486486,0.732852,3.743243
3,4,329,147.0,182,64,0.351648,0.446809,1.807692
4,5,15,9.0,6,4,0.666667,0.6,2.5


In [6]:
# _user_total_orders: 用户的总订单数
# 可以考虑加入其它统计指标++++++++++++++++++++++++++
# _user_sum_days_since_prior_order: 距离上次购买时间(和),这个只能在orders表里面计算，priors_orders_detail不是在order level上面unique
# _user_mean_days_since_prior_order: 距离上次购买时间(均值)
agg_dict_2 = {'order_number':{'_user_total_orders':'max'},
              'days_since_prior_order':{'_user_sum_days_since_prior_order':'sum', 
                                        '_user_mean_days_since_prior_order': 'mean'}}
users = ka_add_groupby_features_1_vs_n(orders[orders.eval_set == 'prior'], ['user_id'], agg_dict_2)

# _user_reorder_ratio: reorder的总次数 / 第一单后买后的总次数
# _user_total_products: 用户购买的总商品数
# _user_distinct_products: 用户购买的unique商品数
agg_dict_3 = {'reordered':
              {'_user_reorder_ratio': 
               lambda x: sum(priors_orders_detail.ix[x.index,'reordered']==1)/
                         sum(priors_orders_detail.ix[x.index,'order_number'] > 1)},
              'product_id':{'_user_total_products':'count', 
                            '_user_distinct_products': lambda x: x.nunique()}}
us = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict_3)
users = users.merge(us, how='inner')

# 平均每单的商品数
# 每单中最多的商品数，最少的商品数++++++++++++++
users['_user_average_basket'] = users._user_total_products / users._user_total_orders

us = orders[orders.eval_set != "prior"][['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
us.rename(index=str, columns={'days_since_prior_order': 'time_since_last_order'}, inplace=True)

users = users.merge(us, how='inner')

add stats features begin ......


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


add stats features end ......
time lapsing 0.459503173828125 s 

add stats features begin ......


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  from ipykernel import kernelapp as app


add stats features end ......
time lapsing 1039.9600129127502 s 



In [7]:
# 这里应该还有很多变量可以被添加
# _up_order_count: 用户购买该商品的次数
# _up_first_order_number: 用户第一次购买该商品所处的订单数
# _up_last_order_number: 用户最后一次购买该商品所处的订单数
# _up_average_cart_position: 该商品被添加到购物篮中的平均位置
agg_dict_4 = {'order_number':{'_up_order_count': 'count', 
                              '_up_first_order_number': 'min', 
                              '_up_last_order_number':'max'}, 
              'add_to_cart_order':{'_up_average_cart_position': 'mean'}}

data = ka_add_groupby_features_1_vs_n(df=priors_orders_detail, 
                                                      group_columns_list=['user_id', 'product_id'], 
                                                      agg_dict=agg_dict_4)

data = data.merge(prd, how='inner', on='product_id').merge(users, how='inner', on='user_id')
# 该商品购买次数 / 总的订单数
# 最近一次购买商品 - 最后一次购买该商品
# 该商品购买次数 / 第一次购买该商品到最后一次购买商品的的订单数
data['_up_order_rate'] = data._up_order_count / data._user_total_orders
data['_up_order_since_last_order'] = data._user_total_orders - data._up_last_order_number
data['_up_order_rate_since_first_order'] = data._up_order_count / (data._user_total_orders - data._up_first_order_number + 1)

# add user_id to train set
train = train.merge(right=orders[['order_id', 'user_id']], how='left', on='order_id')
data = data.merge(train[['user_id', 'product_id', 'reordered']], on=['user_id', 'product_id'], how='left')

# release Memory
# del train, prd, users
# gc.collect()
# release Memory
del priors_orders_detail, orders
gc.collect()

add stats features begin ......


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


add stats features end ......
time lapsing 19.971490144729614 s 



211

In [11]:
X_train = data.loc[data.eval_set == "train",:]
X_train.to_csv('public_train.csv', index=False)

In [12]:
X_test = data.loc[data.eval_set == "test",:]
X_test.to_csv('public_test.csv', index=False)

In [13]:
X_train[:4]

Unnamed: 0,user_id,product_id,_up_order_count,_up_first_order_number,_up_last_order_number,_up_average_cart_position,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_buy_first_time_total_cnt,_prod_buy_second_time_total_cnt,...,_user_total_products,_user_distinct_products,_user_average_basket,order_id,eval_set,time_since_last_order,_up_order_rate,_up_order_since_last_order,_up_order_rate_since_first_order,reordered
0,1,196,10,1,10,1.4,35791,27791.0,8000,4660,...,59,18,5.9,1187899,train,14.0,1.0,0,1.0,1.0
1,1,10258,9,2,10,3.333333,1946,1389.0,557,308,...,59,18,5.9,1187899,train,14.0,0.9,0,1.0,1.0
2,1,10326,1,5,5,5.0,5526,3603.0,1923,1003,...,59,18,5.9,1187899,train,14.0,0.1,5,0.166667,
3,1,12427,10,1,10,3.3,6476,4797.0,1679,889,...,59,18,5.9,1187899,train,14.0,1.0,0,1.0,


In [14]:
X_test[:4]

Unnamed: 0,user_id,product_id,_up_order_count,_up_first_order_number,_up_last_order_number,_up_average_cart_position,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_buy_first_time_total_cnt,_prod_buy_second_time_total_cnt,...,_user_total_products,_user_distinct_products,_user_average_basket,order_id,eval_set,time_since_last_order,_up_order_rate,_up_order_since_last_order,_up_order_rate_since_first_order,reordered
18,15,196,5,15,22,2.2,35791,27791.0,8000,4660,...,72,13,3.272727,2161313,test,7.0,0.227273,0,0.625,
19,15,12427,10,1,20,2.1,6476,4797.0,1679,889,...,72,13,3.272727,2161313,test,7.0,0.454545,2,0.454545,
20,15,1747,4,8,19,3.5,1448,886.0,562,221,...,72,13,3.272727,2161313,test,7.0,0.181818,3,0.266667,
21,15,10441,8,1,22,2.375,2909,2042.0,867,465,...,72,13,3.272727,2161313,test,7.0,0.363636,0,0.363636,


In [65]:
X_train = data.loc[data.eval_set == "train",:]

from sklearn.model_selection import GroupKFold
group_kfold = GroupKFold(n_splits=4)

folds = []
for itr, ite in group_kfold.split(X_train, y_train, X_train.user_id):
    folds += [[itr, ite]]
    
X_train.index = range(len(X_train))
X = X_train[['eval_set',  'product_id', 'order_id', 'user_id']]  
X_train.drop(['eval_set',  'product_id', 'order_id', 'user_id'], axis=1, inplace=True)
X_train.loc[:, 'reordered'] = X_train.reordered.fillna(0)
y_train = pd.DataFrame(X_train['reordered'])
del X_train['reordered']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [56]:
X.shape

(8474661, 4)

In [35]:
y_train.shape

(8474661, 1)

In [28]:
from sklearn.model_selection import GroupKFold
group_kfold = GroupKFold(n_splits=4)

folds = []
for itr, ite in group_kfold.split(X_train, y_train, X_train.user_id):
    folds += [[itr, ite]]

In [14]:
folds

[[array([      0,       1,       2, ..., 8474658, 8474659, 8474660]),
  array([    650,     651,     652, ..., 8474645, 8474646, 8474647])],
 [array([     18,      19,      20, ..., 8474655, 8474657, 8474660]),
  array([      0,       1,       2, ..., 8474656, 8474658, 8474659])],
 [array([      0,       1,       2, ..., 8474657, 8474658, 8474659]),
  array([    208,     209,     210, ..., 8474653, 8474654, 8474660])],
 [array([      0,       1,       2, ..., 8474658, 8474659, 8474660]),
  array([     18,      19,      20, ..., 8474648, 8474655, 8474657])]]

In [50]:
#LGBM
params = {
    #'task': 'train',
    #'boosting_type': 'gbdt',
    #'objective': 'regression',
    #'metric': {'l2', 'auc'},
    #'num_leaves': 31,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    #'subsample': 0.8,
    #'colsample_bytree': 0.7,
    'objective': 'binary',
    'nthread': -1,
    'learning_rate': 0.1,
    #'min_child_weight': 50,
    #'max_depth': 7
    #'num_class': 3
    
}


pred_train = np.zeros(len(y_train))
pred_train_value = np.zeros(len(y_train))

xgbs = []
sc,sc_mean = [],[]

for itr, ite in folds:
    ypred = []
    lgb = lgbm.sklearn.LGBMClassifier(n_estimators=100, seed=0, **params)
    lgb.fit(X_train.ix[itr, :], y_train.ix[itr])
    ypred = lgb.predict_proba(X_train.ix[ite, :])[:, 1]

    pred_train[ite] = ypred
    pred_train_value[ite] = (ypred > 0.2)*1.0
    
    
    sc.append(log_loss(y_train.ix[ite, :], pred_train[ite]))

    
print('XGB: {:.6f} +- {:.6f}'.format(np.mean(sc), np.std(sc)))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGB: 0.245127 +- 0.000304


In [38]:
def f1_score_single(y_true, y_pred):
    y_true = set(y_true)
    y_pred = set(y_pred)
    cross_size = len(y_true & y_pred)
    if cross_size == 0: return 0.
    p = 1. * cross_size / len(y_pred)
    r = 1. * cross_size / len(y_true)
    return 2 * p * r / (p + r)
    
def f1_score(y_true, y_pred):
    return np.mean([f1_score_single(x, y) for x, y in zip(y_true, y_pred)])

In [57]:
len(X_train)

8474661

In [66]:
X_train['pred'] = pred_train
X_train['pred'] = X_train['pred'] > 0.21
X_train = pd.concat([X_train, X], axis=1)
tr_y = pd.DataFrame(train.groupby('order_id')['product_id'].aggregate(lambda x: tuple(x)))
tr_y['order_id'] = tr_y.index
tmp = pd.DataFrame(tr_y['order_id'])

tr_y = pd.DataFrame(train[train.reordered==1].groupby('order_id')['product_id'].aggregate(lambda x: tuple(x)))
tr_y['product_id'] = tr_y['product_id'].apply(lambda x: list(x))

res = pd.DataFrame(X_train[X_train.pred].groupby('order_id')['product_id'].aggregate(lambda x: tuple(x)))
res['product_id'] = res['product_id'].apply(lambda x: list(x))

res['order_id'] = res.index
tr_y['order_id'] = tr_y.index

res = pd.merge(tmp, res, on=['order_id'], how='left')
res.index = tmp.index
res.columns = ['order_id','products']
res = res.fillna('')

tr_y = pd.merge(tmp, tr_y, on=['order_id'], how='left')
tr_y.index = tmp.index
tr_y.columns = ['order_id','products']
tr_y = tr_y.fillna('')

tmp = []
for i in res['products']:
    if i == '':
        tmp += [[]]
    else:
        tmp += [i]
res['products'] = tmp


tmp = []
for i in tr_y['products']:
    if i == '':
        tmp += [[]]
    else:
        tmp += [i]
tr_y['products'] = tmp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [67]:
f1_score(tr_y.ix[res.index.values, 'products'].values, res.ix[:, 'products'].values)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


0.36256484558678492

In [55]:
X_test = data.loc[data.eval_set == "test",:]

In [65]:
X_train[:3]

Unnamed: 0,user_id,product_id,_up_order_count,_up_first_order_number,_up_last_order_number,_up_average_cart_position,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_buy_first_time_total_cnt,_prod_buy_second_time_total_cnt,...,_user_reorder_ratio,_user_total_products,_user_distinct_products,_user_average_basket,order_id,time_since_last_order,_up_order_rate,_up_order_since_last_order,_up_order_rate_since_first_order,pred
0,1,196,10,1,10,1.4,35791,27791.0,8000,4660,...,0.759259,59,18,5.9,1187899,14.0,1.0,0,1.0,True
1,1,10258,9,2,10,3.333333,1946,1389.0,557,308,...,0.759259,59,18,5.9,1187899,14.0,0.9,0,1.0,True
2,1,10326,1,5,5,5.0,5526,3603.0,1923,1003,...,0.759259,59,18,5.9,1187899,14.0,0.1,5,0.166667,False


In [60]:
X_test[:3]

Unnamed: 0,user_id,product_id,_up_order_count,_up_first_order_number,_up_last_order_number,_up_average_cart_position,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_buy_first_time_total_cnt,_prod_buy_second_time_total_cnt,...,_user_reorder_ratio,_user_total_products,_user_distinct_products,_user_average_basket,order_id,eval_set,time_since_last_order,_up_order_rate,_up_order_since_last_order,_up_order_rate_since_first_order
18,15,196,5,15,22,2.2,35791,27791.0,8000,4660,...,0.867647,72,13,3.272727,2161313,test,7.0,0.227273,0,0.625
19,15,12427,10,1,20,2.1,6476,4797.0,1679,889,...,0.867647,72,13,3.272727,2161313,test,7.0,0.454545,2,0.454545
20,15,1747,4,8,19,3.5,1448,886.0,562,221,...,0.867647,72,13,3.272727,2161313,test,7.0,0.181818,3,0.266667


In [58]:
del X_test['reordered']

In [59]:
test_pred = lgb.predict_proba(X_test.ix[:, :])[:, 1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


ValueError: train and valid dataset categorical_feature do not match.

In [53]:
X_test['pred'] = test_pred
X_test['pred'] = X_test['pred'] > 0.2

res = pd.DataFrame(X_test[X_test.pred].groupby('order_id')['product_id'].aggregate(lambda x: tuple(x)))
res['product_id'] = res['product_id'].apply(lambda x: list(x))

res['order_id'] = res.index

res = pd.merge(sample_submission, res, on=['order_id'], how='left')
#res.index = tmp.index
res.columns = ['order_id','products']
res = res.fillna('')


tmp = []
for i in res['products']:
    if i == '':
        tmp += [[]]
    else:
        tmp += [i]
res['products'] = tmp

NameError: name 'test_pred' is not defined

In [None]:
0.31467985459259384
0.31460959539974664