


# 数据准备
## 工具包 & 数据导入
### 工具包导入

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
%matplotlib inline 

### 数据导入
- 训练集测试集的导入

In [2]:
path = '/data/Data_JieZhang/Inscart/'
aisles = pd.read_csv(path + 'aisles.csv')
departments = pd.read_csv(path + 'departments.csv')
products = pd.read_csv(path + 'products.csv')
orders = pd.read_csv(path + 'orders.csv')
order_products__train = pd.read_csv(path + 'order_products__train.csv')
order_products__prior = pd.read_csv(path + 'order_products__prior.csv')

# 特征工程

目前很多比赛都还是特征为王的比赛，针对许多传统的比赛，因为存在很多封装好的工具包,例如XGBoost,LightGBM等工具包,所以很多做数据分析建模或者数据竞赛的人的任务往往就会相对简单，这个时候因为模型变化不大,那么我们的任务往往就会演变为两大模块,特征工程+集成，而特征工程又很难找到非常好的书籍进行系统的学习,所以很多时候网上也都将特征工程称之为艺术。

闲话不多说,下面我们还是以推荐问题为例,介绍基本的推荐算法的特征工程。

我们将该赛题的特征工程粗分为下面几个模块。

1. 用户特征;
2. 产品特征;
3. 用户产品特征;


## 用户特征 

In [3]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


### 用户购物的总次数

In [4]:
user_fea = pd.DataFrame()
user_fea['user_id'] = orders['user_id'].unique()
user_fea = user_fea[['user_id']].sort_values('user_id')

In [5]:
user_fea['user_orderid_count'] = orders.groupby('user_id')['order_id'].count().values 

### 用户购物频繁度

1. 如果用户平均相隔天数少,往往说明用户是爱频繁购物的，那么他很有可能会不断地去消费某些产品;
2. 如果用户距离上次的购物最大天数都很小,那么往往说明客户是非常爱购物的,和均值特征是类似的一种.
3. 如果用户距离上次购物的时间的方差较小,说明用户购物有较强的周期性.
4. 同样的,用户距离上一次购买产品的众数也是此类特征之一.

In [6]:
user_fea['user_days_since_prior_order_mean'] = orders.groupby('user_id')['days_since_prior_order'].mean().values
user_fea['user_days_since_prior_order_max'] = orders.groupby('user_id')['days_since_prior_order'].max().values
user_fea['user_days_since_prior_order_std'] = orders.groupby('user_id')['days_since_prior_order'].std().values

In [7]:
user_fea['user_days_since_prior_order_mode'] = orders.groupby('user_id')['days_since_prior_order'].apply(lambda x: x.mode()[0]).values

### 用户时间喜好
- 不同用户购物的时间往往不同,很多用户喜欢深夜购物,有的用户则往往喜欢周五购物,这些不同的时间点往往也是一个非常重要的选择,比如有些用户喜欢周五买香蕉什么的生活习惯,可以对预测带来非常大的帮助。有人则喜欢深夜购物等等。

In [8]:
user_fea['user_order_dow_mode']         = orders.groupby('user_id')['order_dow'].apply(lambda x: x.mode()[0]).values
user_fea['user_order_hour_of_day_mode'] = orders.groupby('user_id')['order_hour_of_day'].apply(lambda x: x.mode()[0]).values

- 补充上一个二阶的用户时间喜好特征,用来表示用户喜欢在每周晚上几点开始购物.

In [9]:
orders['dow_hour'] = orders['order_dow'].values * 25 + orders['order_hour_of_day'].values
user_fea['user_dow_hour_mode'] = orders.groupby('user_id')['dow_hour'].apply(lambda x: x.mode()[0]).values

### 用户商品喜好特征
- 注意此处我们以priors的数据集进行商品特征的提取,**可能很多人会认为为什么不结合train一起提取特征**,这样数据量更大,这主要是考虑到test中并无类似的特征,而且test中的用户又未出现在train中,所以这样提取的特征是有些问题的,因为训练集中的特征来源于priors & train,但是test却只能来源于prior,所以可能造成偏差。

此处我们用如下的几类特征来表示用户商品的特征.
1. 用户购买不同产品的数量
2. 用户购买最多的三种产品,如果用户只购买了两类产品,那么最后一种产品我们用-1来表示.
3. 用户购买所有产品的数量
4. 用户平均每一单购买的产品数量

In [10]:
order_products__prior_ = order_products__prior.merge(orders, on='order_id', how = 'left')

In [11]:
user_fea['user_product_nunique'] = order_products__prior_.groupby('user_id')['product_id'].nunique().sort_index().values

In [12]:
def mode_N(x,i):
    m = x.value_counts().index
    if len(m) > i: 
        return m[i] 
    return -1
user_fea['user_product_mode'] = order_products__prior_.groupby('user_id')['product_id'].apply(lambda x: x.mode()[0]).sort_index().values
user_fea['user_product_mode_1'] = order_products__prior_.groupby('user_id')['product_id'].apply(lambda x: mode_N(x,1)).sort_index().values 
user_fea['user_product_mode_2'] = order_products__prior_.groupby('user_id')['product_id'].apply(lambda x: mode_N(x,2)).sort_index().values

In [13]:
user_fea['user_product_count'] = order_products__prior_.groupby('user_id')['product_id'].count().sort_index().values

In [14]:
user_fea['user_product_orderid_ratio'] = user_fea['user_product_count'] / user_fea['user_orderid_count']

### 用户购物疯狂性特征
1. 用户订单中的商品最多数
2. 用户每次平均购买的商品量
3. 用户购买商品的1/4,3/4位数

In [15]:
tmp = order_products__prior_.groupby(['user_id','order_id'])['add_to_cart_order'].max().reset_index()

In [16]:
user_fea['user_add_to_cart_order_max'] = tmp.groupby('user_id')['add_to_cart_order'].max().sort_index().values
user_fea['user_add_to_cart_order_mean'] = tmp.groupby('user_id')['add_to_cart_order'].mean().sort_index().values
user_fea['user_add_to_cart_order_quantile_25'] = tmp.groupby('user_id')['add_to_cart_order'].quantile().sort_index().values
user_fea['user_add_to_cart_order_quantile_75'] = tmp.groupby('user_id')['add_to_cart_order'].quantile(0.75).sort_index().values

In [17]:
del tmp
gc.collect()

61

### 用户购买商品的重构率

我们用用户购买的所有商品中的重新购买的和以及均值来表示

In [18]:
user_fea['user_reordered_sum'] = order_products__prior_.groupby('user_id')['reordered'].sum().sort_index().values
user_fea['user_reordered_mean'] = order_products__prior_.groupby('user_id')['reordered'].mean().sort_index().values

## 基于产品的特征

上面我们对用户特征进行了简单的提取,分别针对用户的购买时间偏好,用户对商品的喜好,重构的情况等等出发进行特征的构建。

下面我们从商品的特征出发进行商品特征的构建。虽然下面的特征都可以认为是产品的热度&受欢迎度特征等,但我们还是尽可能将其分开解释。

### 产品的热度
1. 产品被买多少次
2. 产品被多少不同用户所购买
3. 产品被添加到篮子中的顺序的均值 & 标准差

In [19]:
product_fea = pd.DataFrame()
product_fea['product_id'] = order_products__prior_['product_id'].unique() 
product_fea= product_fea.sort_values('product_id')

In [20]:
product_fea['product_count'] = order_products__prior_.groupby('product_id')['user_id'].count().sort_index().values

In [21]:
product_fea['product_order_nunqiue'] = order_products__prior_.groupby('product_id')['order_id'].nunique().sort_index().values
product_fea['product_user_nunqiue']  = order_products__prior_.groupby('product_id')['user_id'].nunique().sort_index().values

In [22]:
product_fea['product_add_to_cart_order_mean'] = order_products__prior_.groupby('product_id')['add_to_cart_order'].mean().sort_index().values 
product_fea['product_add_to_cart_order_std']  = order_products__prior_.groupby('product_id')['add_to_cart_order'].std().sort_index().values 

### 产品受欢迎的时间段
1. 产品被购买最多的两天
2. 产品被购买最多的两个小时段
3. 产品被购买距离上次购买最多的两个时间段
4. 产品距离上次被购买的均值以及均方差

In [23]:
product_fea['product_dow_mode'] = order_products__prior_.groupby('product_id')['order_dow'].apply(lambda x: x.mode()[0]).sort_index().values
product_fea['product_dow_mode_1'] = order_products__prior_.groupby('product_id')['order_dow'].apply(lambda x: mode_N(x,1)).sort_index().values

In [24]:
product_fea['product_hour_of_day_mode'] = order_products__prior_.groupby('product_id')['order_hour_of_day'].apply(lambda x: x.mode()[0]).sort_index().values
product_fea['product_hour_of_day_mode_1'] = order_products__prior_.groupby('product_id')['order_hour_of_day'].apply(lambda x: mode_N(x,1)).sort_index().values

In [25]:
product_fea['product_days_since_prior_order_mode'] = order_products__prior_.groupby('product_id')['days_since_prior_order'].apply(lambda x: mode_N(x,0)).sort_index().values
product_fea['product_days_since_prior_order_mode_1'] = order_products__prior_.groupby('product_id')['days_since_prior_order'].apply(lambda x: mode_N(x,1)).sort_index().values

In [26]:
product_fea['product_dow_hour_mode'] = order_products__prior_.groupby('product_id')['dow_hour'].apply(lambda x: mode_N(x,0)).sort_index().values
product_fea['product_dow_hour_mode_1'] = order_products__prior_.groupby('product_id')['dow_hour'].apply(lambda x: mode_N(x,1)).sort_index().values

In [27]:
product_fea['product_days_since_prior_order_mean'] = order_products__prior_.groupby('product_id')['days_since_prior_order'].mean().sort_index().values 
product_fea['product_days_since_prior_order_std'] = order_products__prior_.groupby('product_id')['days_since_prior_order'].std().sort_index().values 

### 产品的被重购率

产品的重构率,标准差以及和。

In [28]:
product_fea['product_reordered_mean'] = order_products__prior_.groupby('product_id')['reordered'].mean().sort_index().values 
product_fea['product_reordered_std'] = order_products__prior_.groupby('product_id')['reordered'].std().sort_index().values 
product_fea['product_reordered_sum'] = order_products__prior_.groupby('product_id')['reordered'].sum().sort_index().values 

## 基于用户 + 产品的交叉特征

In [29]:
order_products__prior_['user_product'] = order_products__prior_['user_id'].values * 10**5 + order_products__prior_['product_id'].values

In [30]:
userXproduct_fea = pd.DataFrame()
userXproduct_fea['user_product'] = order_products__prior_['user_product'].unique() 
userXproduct_fea = userXproduct_fea[['user_product']].sort_values('user_product')

**此处的用户和产品的交叉特征,我们就用下面几个简单特征进行表述**:
1. 同一用户对某一个产品的重构率;
2. 同一用户对某一个产品的加入篮子的顺序的和以及均值;
3. 同一用户购买某一个产品的次数

In [31]:
userXproduct_fea['user_product_reordered_sum'] = order_products__prior_.groupby('user_product')['reordered'].sum().sort_index().values 

In [32]:
userXproduct_fea['user_product_add_to_cart_order_sum']   = order_products__prior_.groupby('user_product')['add_to_cart_order'].sum().sort_index().values 
userXproduct_fea['user_product_add_to_cart_order_mean']   = order_products__prior_.groupby('user_product')['add_to_cart_order'].mean().sort_index().values 

In [33]:
userXproduct_fea['user_product_order_nunique']   = order_products__prior_.groupby('user_product')['order_id'].nunique().sort_index().values 

In [34]:
userXproduct_fea['user_product_last_order_num'] = order_products__prior_.groupby('user_product')['order_number'].max().sort_index().values 

In [35]:
userXproduct_fea.head()

Unnamed: 0,user_product,user_product_reordered_sum,user_product_add_to_cart_order_sum,user_product_add_to_cart_order_mean,user_product_order_nunique,user_product_last_order_num
3105781,100196,9,14,1.4,10,10
3105783,110258,8,30,3.333333,9,10
3105785,110326,0,5,5.0,1,5
3105782,112427,9,33,3.3,10,10
3767345,113032,2,19,6.333333,3,10


# 构建训练集 & 测试集
具体的构建原理此处就不再阐述,具体的可以参见前一篇文章的内容:Instacart Market Basket Analysis：数据探索分析&Baseline(0.2428)

In [36]:
orders_prior_data = orders.loc[orders.eval_set == 'prior']
orders_train_data = orders.loc[orders.eval_set == 'train'] 
orders_test_data  = orders.loc[orders.eval_set == 'test' ] 

priors = order_products__prior.merge(orders_prior_data, on =['order_id'], how='left')
trains = order_products__train.merge(orders_train_data, on =['order_id'], how='left')

In [37]:
user_product = order_products__prior_[['user_id','product_id']].copy()
user_product['user_X_product'] = user_product['user_id'].values* 10**5  + user_product['product_id'].values
train_user_X_product = trains['user_id'].values* 10**5 + trains['product_id'].values

In [38]:
user_product = user_product.drop_duplicates(subset=['user_X_product'], keep = 'last') 

In [39]:
test_user  = orders_test_data['user_id']
train_user = orders_train_data['user_id']

In [40]:
user_product['label'] = 0
train_data = user_product.loc[user_product.user_id.isin(train_user)]
train_data.loc[train_data.user_X_product.isin(train_user_X_product), 'label'] = 1 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [41]:
train_data['label'].sum()

828824

In [42]:
train_data['label'].mean()

0.09780025419305857

In [43]:
train_data = train_data.merge(orders_train_data,on ='user_id', how='left')

In [44]:
test_data  = user_product.loc[user_product.user_id.isin(test_user)]
test_data = test_data.merge(orders_test_data,on ='user_id', how='left')

## 特征拼接

- orders的数据合并

In [45]:
train_data = train_data.merge(user_fea, on='user_id', how='left')
train_data = train_data.merge(product_fea, on='product_id', how='left')
train_data = train_data.merge(userXproduct_fea, left_on='user_X_product', right_on='user_product', how='left')
train_data = train_data.merge(products, on='product_id', how= 'left')

In [46]:
pd.set_option('display.max_columns',50)
train_data.head()

Unnamed: 0,user_id,product_id,user_X_product,label,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,dow_hour,user_orderid_count,user_days_since_prior_order_mean,user_days_since_prior_order_max,user_days_since_prior_order_std,user_days_since_prior_order_mode,user_order_dow_mode,user_order_hour_of_day_mode,user_dow_hour_mode,user_product_nunique,user_product_mode,user_product_mode_1,user_product_mode_2,user_product_count,user_product_orderid_ratio,...,product_user_nunqiue,product_add_to_cart_order_mean,product_add_to_cart_order_std,product_dow_mode,product_dow_mode_1,product_hour_of_day_mode,product_hour_of_day_mode_1,product_days_since_prior_order_mode,product_days_since_prior_order_mode_1,product_dow_hour_mode,product_dow_hour_mode_1,product_days_since_prior_order_mean,product_days_since_prior_order_std,product_reordered_mean,product_reordered_std,product_reordered_sum,user_product,user_product_reordered_sum,user_product_add_to_cart_order_sum,user_product_add_to_cart_order_mean,user_product_order_nunique,user_product_last_order_num,product_name,aisle_id,department_id
0,202279,9327,20227909327,0,1050357,train,9,1,14,30.0,39,9,22.625,30.0,9.455724,30.0,5,9,134,47,17794,1003,28985,90,10.0,...,5304,9.943166,7.853449,0,6,14,13,30.0,7.0,12,13,11.432298,9.184589,0.157962,0.364734,995,20227909327,0,3,3.0,1,3,Garlic Powder,104,13
1,156122,47209,15612247209,0,1277092,train,53,1,11,5.0,36,53,6.673077,18.0,2.874643,6.0,4,14,114,154,13176,13176,48370,967,18.245283,...,43453,6.775011,5.813507,0,1,15,14,7.0,6.0,14,15,10.205608,8.319953,0.796553,0.402563,170131,15612247209,0,22,22.0,1,42,Organic Hass Avocado,24,4
2,156122,46522,15612246522,0,1277092,train,53,1,11,5.0,36,53,6.673077,18.0,2.874643,6.0,4,14,114,154,13176,13176,48370,967,18.245283,...,4361,10.115824,7.885046,0,1,11,10,30.0,7.0,11,12,12.547652,9.160984,0.452754,0.497794,3608,15612246522,0,23,23.0,1,42,Spaghetti Pasta,131,9
3,156122,38693,15612238693,0,1277092,train,53,1,11,5.0,36,53,6.673077,18.0,2.874643,6.0,4,14,114,154,13176,13176,48370,967,18.245283,...,2546,8.862221,6.974303,0,1,12,14,7.0,30.0,12,11,10.664021,8.701408,0.437845,0.496176,1983,15612238693,0,24,24.0,1,42,Meyer Lemon,24,4
4,156122,48825,15612248825,0,1277092,train,53,1,11,5.0,36,53,6.673077,18.0,2.874643,6.0,4,14,114,154,13176,13176,48370,967,18.245283,...,959,9.408513,7.249891,0,6,12,13,7.0,4.0,11,12,9.690413,8.01344,0.390724,0.488068,615,15612248825,0,25,25.0,1,42,Bag of Organic Lemons,24,4


In [47]:
test_data = test_data.merge(user_fea, on='user_id', how='left')
test_data = test_data.merge(product_fea, on='product_id', how='left')
test_data = test_data.merge(userXproduct_fea, left_on='user_X_product', right_on='user_product', how='left')
test_data  =  test_data.merge(products, on='product_id', how= 'left')

In [48]:
train_data.head()

Unnamed: 0,user_id,product_id,user_X_product,label,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,dow_hour,user_orderid_count,user_days_since_prior_order_mean,user_days_since_prior_order_max,user_days_since_prior_order_std,user_days_since_prior_order_mode,user_order_dow_mode,user_order_hour_of_day_mode,user_dow_hour_mode,user_product_nunique,user_product_mode,user_product_mode_1,user_product_mode_2,user_product_count,user_product_orderid_ratio,...,product_user_nunqiue,product_add_to_cart_order_mean,product_add_to_cart_order_std,product_dow_mode,product_dow_mode_1,product_hour_of_day_mode,product_hour_of_day_mode_1,product_days_since_prior_order_mode,product_days_since_prior_order_mode_1,product_dow_hour_mode,product_dow_hour_mode_1,product_days_since_prior_order_mean,product_days_since_prior_order_std,product_reordered_mean,product_reordered_std,product_reordered_sum,user_product,user_product_reordered_sum,user_product_add_to_cart_order_sum,user_product_add_to_cart_order_mean,user_product_order_nunique,user_product_last_order_num,product_name,aisle_id,department_id
0,202279,9327,20227909327,0,1050357,train,9,1,14,30.0,39,9,22.625,30.0,9.455724,30.0,5,9,134,47,17794,1003,28985,90,10.0,...,5304,9.943166,7.853449,0,6,14,13,30.0,7.0,12,13,11.432298,9.184589,0.157962,0.364734,995,20227909327,0,3,3.0,1,3,Garlic Powder,104,13
1,156122,47209,15612247209,0,1277092,train,53,1,11,5.0,36,53,6.673077,18.0,2.874643,6.0,4,14,114,154,13176,13176,48370,967,18.245283,...,43453,6.775011,5.813507,0,1,15,14,7.0,6.0,14,15,10.205608,8.319953,0.796553,0.402563,170131,15612247209,0,22,22.0,1,42,Organic Hass Avocado,24,4
2,156122,46522,15612246522,0,1277092,train,53,1,11,5.0,36,53,6.673077,18.0,2.874643,6.0,4,14,114,154,13176,13176,48370,967,18.245283,...,4361,10.115824,7.885046,0,1,11,10,30.0,7.0,11,12,12.547652,9.160984,0.452754,0.497794,3608,15612246522,0,23,23.0,1,42,Spaghetti Pasta,131,9
3,156122,38693,15612238693,0,1277092,train,53,1,11,5.0,36,53,6.673077,18.0,2.874643,6.0,4,14,114,154,13176,13176,48370,967,18.245283,...,2546,8.862221,6.974303,0,1,12,14,7.0,30.0,12,11,10.664021,8.701408,0.437845,0.496176,1983,15612238693,0,24,24.0,1,42,Meyer Lemon,24,4
4,156122,48825,15612248825,0,1277092,train,53,1,11,5.0,36,53,6.673077,18.0,2.874643,6.0,4,14,114,154,13176,13176,48370,967,18.245283,...,959,9.408513,7.249891,0,6,12,13,7.0,4.0,11,12,9.690413,8.01344,0.390724,0.488068,615,15612248825,0,25,25.0,1,42,Bag of Organic Lemons,24,4


In [59]:
train_data['user_product_count_percent']  = train_data['user_product_order_nunique'] * 1.0 / train_data['user_orderid_count']
train_data['user_product_last_to_now']   = train_data['user_orderid_count']    - train_data['user_product_last_order_num']

test_data['user_product_count_percent']   = test_data['user_product_order_nunique'] * 1.0  / test_data['user_orderid_count']
test_data['user_product_last_to_now']   = test_data['user_orderid_count']    - test_data['user_product_last_order_num']

In [50]:
fea_not_need = ['user_id','user_X_product','order_id','eval_set','product_name','user_product_last_order_num','label']
feature_cols = [col for col in train_data.columns if col not in fea_not_need]
label_cols = 'label'

In [51]:
def validation_sample(order_ids, frac = 0.2):
    import random
    sample_number = int(frac * len(order_ids))
    sample_val_order = random.sample( order_ids , sample_number) 
    sample_train_order = list(set(order_ids) - set(sample_val_order))
    return sample_train_order,sample_val_order
sample_train_order,sample_val_order = validation_sample(list(train_data['order_id'].unique()))

In [73]:
train = train_data.loc[train_data.order_id.isin(sample_train_order)]
val   = train_data.loc[train_data.order_id.isin(sample_val_order)]

## 模型验证
### 近似的评价指标

In [74]:
import lightgbm as lgb

from sklearn.metrics import f1_score
def lgb_f1_score(y_hat, data):
    y_true = data.get_label() 
    y_hat = np.round(y_hat >= 0.2) # scikits f1 doesn't like probabilities 
    return 'f1', f1_score(y_true, y_hat), True

### 模型验证 
此处是近似的验证,我们仅仅作为参考(未加入None等),在测试集上应该会有波动(我们的训练集&测试集的比例和线上不是一致的,为了方便我们此处随机采样了20%)

<font color=red>注意: 此处我们的F1并不是Mean F1 Score,具体的Mean F1 Score的函数我们会在下一次给出,所以这边的线下的F1 Score和线上的Mean F1 Score基本是不符的。 </font>

In [75]:
d_train = lgb.Dataset(train[feature_cols], label=train[label_cols].values)   
d_val   = lgb.Dataset(val[feature_cols], label=val[label_cols].values)    

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 2 ** 5,
    'max_depth': 10,
    'learning_rate':0.5,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 5
}
ROUNDS = 200 
print('light GBM train :-)')
bst = lgb.train(params, d_train, ROUNDS, valid_sets=[d_train,d_val], feval =lgb_f1_score,verbose_eval=10)
del d_train 

light GBM train :-)
[10]	training's binary_logloss: 0.247852	training's f1: 0.432576	valid_1's binary_logloss: 0.249092	valid_1's f1: 0.432067
[20]	training's binary_logloss: 0.246291	training's f1: 0.436238	valid_1's binary_logloss: 0.247785	valid_1's f1: 0.43522
[30]	training's binary_logloss: 0.245689	training's f1: 0.437799	valid_1's binary_logloss: 0.247615	valid_1's f1: 0.436006
[40]	training's binary_logloss: 0.24537	training's f1: 0.438867	valid_1's binary_logloss: 0.248081	valid_1's f1: 0.435775
[50]	training's binary_logloss: 0.24519	training's f1: 0.439777	valid_1's binary_logloss: 0.248307	valid_1's f1: 0.435959
[60]	training's binary_logloss: 0.245059	training's f1: 0.440385	valid_1's binary_logloss: 0.248371	valid_1's f1: 0.435946
[70]	training's binary_logloss: 0.245853	training's f1: 0.440425	valid_1's binary_logloss: 0.252598	valid_1's f1: 0.435574
[80]	training's binary_logloss: 0.245137	training's f1: 0.440693	valid_1's binary_logloss: 0.251947	valid_1's f1: 0.435473

## 模型训练&提交
### 模型训练

In [55]:
import lightgbm as lgb
d_train = lgb.Dataset(train_data[feature_cols], label=train_data[label_cols].values)    

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 255,
    'max_depth': 10,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 120 
print('light GBM train :-)')
bst = lgb.train(params, d_train, ROUNDS, valid_sets=[d_train], feval =lgb_f1_score,verbose_eval=10)

light GBM train :-)
[10]	training's binary_logloss: 0.257502	training's f1: 0.433764
[20]	training's binary_logloss: 0.248057	training's f1: 0.439856
[30]	training's binary_logloss: 0.245456	training's f1: 0.440926
[40]	training's binary_logloss: 0.244276	training's f1: 0.442424
[50]	training's binary_logloss: 0.243474	training's f1: 0.443839
[60]	training's binary_logloss: 0.242821	training's f1: 0.445005
[70]	training's binary_logloss: 0.242281	training's f1: 0.446259
[80]	training's binary_logloss: 0.241743	training's f1: 0.447373
[90]	training's binary_logloss: 0.241224	training's f1: 0.448543
[100]	training's binary_logloss: 0.240748	training's f1: 0.449529
[110]	training's binary_logloss: 0.24033	training's f1: 0.450389
[120]	training's binary_logloss: 0.239913	training's f1: 0.451297


 ### 模型提交:0.3787363（1411/2623）

In [60]:
pred = bst.predict(test_data[feature_cols])
test_data['pred'] = pred 
test_data['product_id'] = test_data['product_id'].astype(str)

In [71]:
order_product = {}
for order_id, val, product_id in test_data[['order_id','pred','product_id']].values:
    if order_id in order_product.keys():
        if val >= 0.21:
            if order_product[order_id] == '':
                order_product[order_id] = str(product_id )
            else:
                order_product[order_id] += ' ' + str(product_id )
    else:
        order_product[order_id] = ''
        if val >= 0.21:
            order_product[order_id] = str(product_id ) 

sub = pd.DataFrame.from_dict(order_product, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.loc[sub.products =='', 'products'] = 'None'
sub.to_csv('simple_fe_0.21.csv',index = None)