# 项目简介

商家有时会在特定日期，例如Boxing-day，黑色星期五或是双十一（11月11日）开展大型促销活动或者发放优惠券以吸引消费者，然而很多被吸引来的买家都是一次性消费者，这些促销活动可能对销售业绩的增长并没有长远帮助，因此为解决这个问题，商家需要识别出哪类消费者可以转化为重复购买者。通过对这些潜在的忠诚客户进行定位，商家可以大大降低促销成本，提高投资回报率（Return on Investment, ROI）。众所周知的是，在线投放广告时精准定位客户是件比较难的事情，尤其是针对新消费者的定位。不过，利用天猫长期积累的用户行为日志，我们或许可以解决这个问题。

In [None]:
# train_format1.csv
# user_id             购物者的唯一ID编码
# merchant_id         商家的唯一ID编码
# label               给定客户是给定商家的重复购买者的概率，取值在[0, 1]

# train_format2.csv
# user_id             购物者的唯一ID编码
# age_range           用户年龄范围。
#                     <18岁为1；[18,24]为2； [25,29]为3； 
#                     [30,34]为4；[35,39]为5；[40,49]为6； > = 50时为7和8; 
#                     0和NULL表示未知
# gender              0表示女性，1表示男性，2和NULL表示未知
# merchant_id         商家的唯一ID编码
# label               取值集合为{0, 1, -1, NULL}。取1表示'userid'是'merchantid'的重复买家，
#                     取0则反之。取-1表示'user_id'不是给定商家的新客户，因此不在我们预测范围内，
#                     但这些记录可能会提供额外信息。测试集这一部分需要预测，因此为NULL。
# activity_log        {userid, merchantid}之间的每组交易中都记录有itemid, categoryid, brand_id, time，
#                     用#分隔。记录不按任何特定顺序排序。

# user_log_format1.csv
# user_id               购物者的唯一ID编码
# item_id               商品的唯一编码
# cat_id                商品所属品类的唯一编码
# seller_id             商家的唯一ID编码
# brand_id              商品品牌的唯一编码
# time_stamp            购买时间（格式：mmdd）
# action_type           包含{0, 1, 2, 3}，0表示单击，1表示添加到购物车，2表示购买，3表示添加到收藏夹

# user_info_format1.csv
# user_id             购物者的唯一ID编码
# age_range           用户年龄范围。
#                     <18岁为1；[18,24]为2； [25,29]为3； 
#                     [30,34]为4；[35,39]为5；[40,49]为6； > = 50时为7和8; 
#                     0和NULL表示未知
# gender              0表示女性，1表示男性，2和NULL表示未知

# test_format1.csv
# user_id              购物者的唯一ID编码
# merchant_id          商家的唯一ID编码
# prob                 包含{0, 1}，1表示重复买家，0表示非重复买家。测试集这一部分需要预测，因此为空。

# 导入函数工具箱

In [2]:
import gc
import pandas as pd 
import numpy as np

#导入分析库
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import lightgbm as lgb
import xgboost as xgb
import catboost as cat

from sklearn.metrics import roc_auc_score, roc_curve, auc
# LOCAL_QUICK = True
LOCAL_QUICK = False
sample_percent = 0.1

MORE_FE = False
# MORE_FE = True
FE_V1 = False if MORE_FE else True



# 数据读取

In [3]:
%%time
# 加载数据
# 用户行为，使用format1进行加载
user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./data_format1/train_format1.csv')
sub_data = pd.read_csv('./data_format1/test_format1.csv')
data_train = pd.read_csv('./data_format2/train_format2.csv') 



CPU times: user 33.7 s, sys: 11.9 s, total: 45.6 s
Wall time: 49 s


## 采样测试

In [4]:
%%time
if LOCAL_QUICK:
    print('Local quick test: {}, sample_percent {}'.format(  
        LOCAL_QUICK, sample_percent))
    data = user_log.sample(int(len(user_log) * sample_percent))
    data1 = user_info.sample(int(len(user_info) * sample_percent))
    data2 = train_data1.sample(int(len(train_data1) * sample_percent))
    # submission = sub_data.sample(int(len(sub_data) * sample_percent))
    submission = sub_data.copy()
else:
    print('全数据训练')
    data = user_log.copy()
    data1 = user_info.copy()
    data2 = train_data1.copy()
    submission = sub_data.copy() 
    del user_log, user_info, train_data1, sub_data
print('---data shape---')     
for df in [data, data1, data2, submission, data_train]:
    print(df.shape)  

全数据训练
---data shape---
(54925330, 7)
(424170, 3)
(260864, 3)
(261477, 3)
(7030723, 6)
CPU times: user 1.79 s, sys: 5.02 s, total: 6.8 s
Wall time: 8.55 s


## 数据合并格式化

In [5]:
data2['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([data2, submission], ignore_index=True, sort=False)
matrix.drop(['prob'], axis=1, inplace=True)
# 连接user_info表，通过user_id关联
matrix = matrix.merge(data1, on='user_id', how='left')
# 使用merchant_id（原列名seller_id）
data.rename(columns={'seller_id':'merchant_id'}, inplace=True)

In [6]:
%%time
# 格式化
data['user_id'] = data['user_id'].astype('int32')
data['merchant_id'] = data['merchant_id'].astype('int32')
data['item_id'] = data['item_id'].astype('int32')
data['cat_id'] = data['cat_id'].astype('int32')
data['brand_id'].fillna(0, inplace=True)
data['brand_id'] = data['brand_id'].astype('int32')
data['time_stamp'] = pd.to_datetime(data['time_stamp'], format='%H%M')
# 缺失值填充
matrix['age_range'].fillna(0, inplace=True)
matrix['gender'].fillna(2, inplace=True)

# # gender用众数填充 表现更差
# matrix['gender'].fillna(matrix['gender'].mode()[0],inplace=True)
# # 年龄用中位数填充
# matrix['age_range'].fillna(matrix['age_range'].median(),inplace=True)

matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')

del data1, data2
gc.collect()

CPU times: user 9.19 s, sys: 5.35 s, total: 14.5 s
Wall time: 15.1 s


0

## 特征处理

In [7]:
%%time


##### User特征处理
groups = data.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值个数 item_id, cat_id, merchant_id, brand_id
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计操作类型为0，1，2，3的个数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(  
    columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

del temp
gc.collect()

CPU times: user 1min 43s, sys: 7.82 s, total: 1min 51s
Wall time: 1min 52s


11

In [8]:
%%time

##### 商家特征处理
groups = data.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={
    'user_id':'m2',
    'item_id':'m3', 
    'cat_id':'m4', 
    'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(  
    columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

del temp
gc.collect()



CPU times: user 1min 32s, sys: 9.14 s, total: 1min 42s
Wall time: 1min 43s


0

In [9]:
# 按照merchant_id 统计随机负采样的个数
temp = data_train[data_train['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [10]:
%%time
##### 用户+商户特征
groups = data.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={
    'item_id':'um2',
    'cat_id':'um3',
    'brand_id':'um4'
})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={
    0:'um5',
    1:'um6',
    2:'um7',
    3:'um8'
})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('frist', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['frist']).dt.seconds/3600
temp.drop(['frist', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

del temp
gc.collect()



CPU times: user 3min 23s, sys: 28 s, total: 3min 51s
Wall time: 3min 57s


11

In [11]:

matrix['r1'] = matrix['u9']/matrix['u7'] # 用户购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] # 商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5'] #不同用户不同商家购买点击比

In [12]:
matrix.fillna(0, inplace=True)

In [13]:
%%time
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

del temp
gc.collect()

CPU times: user 444 ms, sys: 158 ms, total: 602 ms
Wall time: 679 ms


60

In [14]:

lbe_action_type={0:1,1:2,2:3,3:4}
data['action_type']=data['action_type'].map(lbe_action_type)
# 用户行为sequence
# 把data里同user的这些数据合并成一个list
temp=pd.DataFrame(data.groupby('user_id')['merchant_id','action_type'].agg(lambda x:list(x)))
# 列名称改成hist_merchant_id 和 hist_action_type 
temp.columns=['hist_merchant_id','hist_action_type']

matrix = matrix.merge(temp, on=['user_id'], how='left') #统计时间间隔


del temp
gc.collect()


  temp=pd.DataFrame(data.groupby('user_id')['merchant_id','action_type'].agg(lambda x:list(x)))


0

In [15]:

# 截取，不缺到定长M个
M=500
for feature in ['hist_merchant_id','hist_action_type']:
    matrix[feature]=matrix[feature].map(lambda x:np.array(x+[0]*(M-len(x)))[:M])


In [16]:
%%time
# train、test-setdata
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)

if not LOCAL_QUICK:
    if FE_V1:
        train_data.to_csv('train_data.csv')
        test_data.to_csv('test_data.csv')
    if MORE_FE:
        train_data.to_csv('train_data_moreFE.csv')
        test_data.to_csv('test_data_moreFE.csv')


CPU times: user 482 ms, sys: 400 ms, total: 882 ms
Wall time: 907 ms


# 模型训练与预测

In [17]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')  

# FeatureSelect_QUICK = True # Feature Select
FeatureSelect_QUICK = False 
if FeatureSelect_QUICK: # 使用部分样本进行快速特征选择
    train_data = train_data.sample(int(len(train_data) * sample_percent))

# train_data = train_data[train_col]
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

del train_data

X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2, random_state=42) # test_size=.3

In [18]:
print(X_train.shape)

(208691, 48)


In [19]:
train_y = train_y.astype('float')
train_y

0         0.0
1         0.0
2         1.0
3         0.0
4         0.0
         ... 
260859    0.0
260860    0.0
260861    0.0
260862    0.0
260863    0.0
Name: label, Length: 260864, dtype: float64

## XGB Model

In [0]:
%%time
def xgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=1000,
        min_child_weight=300, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42        
    )

    model_xgb.fit(
        X_train, 
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=verbose,
        early_stopping_rounds=10 # 早停法，如果auc在10epoch没有进步就stop
    )
    print(model_xgb.best_score)
    return model_xgb

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 13.4 µs


In [0]:
model_xgb = xgb_train(X_train, y_train, X_valid, y_valid, verbose=False)

0.689278


In [0]:
%%time
prob = model_xgb.predict_proba(test_data)

submission['prob'] = pd.Series(prob[:,1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_xgb.csv', index=False)

CPU times: user 4.44 s, sys: 11.2 ms, total: 4.45 s
Wall time: 4.48 s


## LGB Model

In [0]:
############DEF:lgb_train################
def lgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_lgb = lgb.LGBMClassifier(
        max_depth=10, # 8
        n_estimators=1000,
        min_child_weight=200, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42        
    )

    model_lgb.fit(
        X_train, 
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=verbose,
        early_stopping_rounds=10
    )

    print(model_lgb.best_score_['valid_1']['auc'])
    return model_lgb

In [0]:
model_lgb = lgb_train(X_train, y_train, X_valid, y_valid, verbose=False)

0.6882175357469706


In [0]:
%%time
prob = model_lgb.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:,1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_lgb.csv', index=False)

CPU times: user 4.29 s, sys: 14.5 ms, total: 4.3 s
Wall time: 5.55 s


## Cat Model

In [0]:
def cat_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_cat = cat.CatBoostClassifier(learning_rate=0.02, iterations=5000, eval_metric='AUC', od_wait=50,
                                od_type='Iter', random_state=10, thread_count=8, l2_leaf_reg=1, verbose=verbose)
    model_cat.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50,
            use_best_model=True)

    print(model_cat.best_score_['validation']['AUC'])
    return model_cat

In [0]:
model_cat = cat_train(X_train, y_train, X_valid, y_valid, verbose=False)

0.6888438643692378


In [0]:
%%time
prob = model_cat.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:,1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_cat.csv', index=False)

CPU times: user 1.36 s, sys: 18.5 ms, total: 1.38 s
Wall time: 2.26 s


# StratifiedKFold

In [0]:
# 构造训练集和测试集
def get_train_testDF(train_df,label_df):
    skv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    trainX = []
    trainY = []
    testX = []
    testY = []
    for train_index, test_index in skv.split(X=train_df, y=label_df):
        train_x, train_y, test_x, test_y = train_df.iloc[train_index, :], label_df.iloc[train_index], \
                                            train_df.iloc[test_index, :], label_df.iloc[test_index]

        trainX.append(train_x)
        trainY.append(train_y)
        testX.append(test_x)
        testY.append(test_y)
    return trainX, testX, trainY, testY

## lightgbm

In [0]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')  

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

del train_data

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [0]:
# 将训练数据集划分分别训练5个lgbm,xgboost和catboost 模型
# lightgbm模型

pred_lgbms = []
for i in range(5):
    print('\n============================LGB training use Data {}/5============================\n'.format(i+1))
    model_lgb = lgb.LGBMClassifier(
        max_depth=10, # 8
        n_estimators=1000,
        min_child_weight=200, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42
    )

    model_lgb.fit(
        X_train[i], 
        y_train[i],
        eval_metric='auc',
        eval_set=[(X_train[i], y_train[i]), (X_valid[i], y_valid[i])],
        verbose=False,
        early_stopping_rounds=10
    )

    print(model_lgb.best_score_['valid_1']['auc'])

    pred = model_lgb.predict_proba(test_data)
    pred = pd.DataFrame(pred[:,1])
    pred_lgbms.append(pred)
pred_lgbms = pd.concat(pred_lgbms, axis=1)
print(pred_lgbms)

submission['prob'] = pred_lgbms.mean(axis=1)
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_lgb.csv', index=False)

####0.6784



0.6814285066081079


0.678075817616207


0.6769034528266582


0.678130625463615


0.6776812517271715
               0         0         0         0         0
0       0.172597  0.139495  0.174839  0.202953  0.120382
1       0.077612  0.068260  0.091190  0.073946  0.055343
2       0.085269  0.091514  0.093136  0.093108  0.082051
3       0.085579  0.094960  0.102522  0.106962  0.098253
4       0.049972  0.054350  0.054020  0.068434  0.044841
...          ...       ...       ...       ...       ...
260859  0.049012  0.062446  0.071548  0.066920  0.082217
260860  0.039762  0.029111  0.031687  0.033621  0.027339
260861  0.019739  0.020501  0.021444  0.015976  0.015180
260862  0.037017  0.034872  0.044972  0.033323  0.033951
260863  0.028490  0.035417  0.025533  0.027674  0.028257

[260864 rows x 5 columns]


## catgbm

In [0]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')  

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

del train_data

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [0]:
# 将训练数据集划分分别训练5个lgbm,xgboost和catboost 模型
# catgbm模型

pred_cats = []
for i in range(5):
    print('\n============================CAT training use Data {}/5============================\n'.format(i+1))
    model_cat = cat.CatBoostClassifier(learning_rate=0.02, iterations=5000, eval_metric='AUC', od_wait=50,
                                od_type='Iter', random_state=10, thread_count=8, l2_leaf_reg=1, verbose=False)
    model_cat.fit(X_train[i], y_train[i], eval_set=[(X_valid[i], y_valid[i])], early_stopping_rounds=50,
            use_best_model=True)
    # print(model_cat.evals_result_)
    print(model_cat.best_score_['validation']['AUC'])

    pred = model_cat.predict_proba(test_data)
    pred = pd.DataFrame(pred[:,1])
    pred_cats.append(pred)
pred_cats = pd.concat(pred_cats, axis=1)

submission['prob'] = pred_cats.mean(axis=1)
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_cat.csv', index=False)


#### 0.68001



0.6824405044370522


0.6802216199760176


0.6778579794359316


0.6791038275100539


0.6804717208509453


## xgboost

In [0]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')  

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

del train_data

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [0]:
# 将训练数据集划分分别训练5个lgbm,xgboost和catboost 模型
# xgboost模型

pred_xgbs = []
for i in range(5):
    print('\n============================XGB training use Data {}/5============================\n'.format(i+1))
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=1000,
        min_child_weight=300, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42        
    )

    model_xgb.fit(
        X_train[i], 
        y_train[i],
        eval_metric='auc',
        eval_set=[(X_train[i], y_train[i]), (X_valid[i], y_valid[i])],
        verbose=False,
        early_stopping_rounds=10 # 早停法，如果auc在10epoch没有进步就stop
    )    

    print(model_xgb.best_score)

    pred = model_xgb.predict_proba(test_data)
    pred = pd.DataFrame(pred[:,1])
    pred_xgbs.append(pred)
pred_xgbs = pd.concat(pred_xgbs, axis=1)

# make submission
submission['prob'] = pred_xgbs.mean(axis=1)
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_xgb.csv', index=False)

#### 0.6803



0.682694


0.680635


0.677834


0.681198


0.67939


In [0]:
"""
xgb:0.689278, ##KFold## 0.6784
lgb:0.688217, ##KFold## 0.6800
cat:0.688843, ##KFold## 0.6803
"""

## 模型融合

In [0]:
lgb6812 = pd.read_csv("submission_lgb0.6812968.csv")
xgb6787 = pd.read_csv("submission_xgb0.6787.csv")
cat6777 = pd.read_csv("submission_cat-val0.6827785215-onling0.6777246.csv")

In [7]:
# 先构造一个矩阵
df = np.array([lgb6812.prob, xgb6787.prob, cat6777.prob])
# 计算协方差矩阵
np.corrcoef(df)

array([[1.        , 0.94866069, 0.9108549 ],
       [0.94866069, 1.        , 0.9113983 ],
       [0.9108549 , 0.9113983 , 1.        ]])

In [0]:
sub = lgb6812.copy()

sub.prob = 0.6*lgb6812.prob + 0.4*cat6777.prob # Online test score:0.6830807
sub.to_csv('./sub_blended11.csv', index=False)
####################################0.6833209################################
sub.prob = 0.5*lgb6812.prob + 0.3*cat6777.prob + 0.2*xgb6787.prob# Online test 0.6833209
sub.to_csv('./sub_blended12.csv', index=False)

sub.prob = 0.45*lgb6812.prob + 0.3*cat6777.prob + 0.25*xgb6787.prob# Online test 0.6832934
sub.to_csv('./sub_blended13.csv', index=False)
####################################0.6833171################################
sub.prob = 0.45*lgb6812.prob + 0.35*cat6777.prob + 0.2*xgb6787.prob# Online test 0.6833171
sub.to_csv('./sub_blended14.csv', index=False)

# 使用DIN模型

In [None]:
import gc
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 用户行为，使用format1进行加载
# 加载全量样本
"""
user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./data_format1/train_format1.csv')
submission = pd.read_csv('./data_format1/test_format1.csv')
"""
# 加载小样本
user_log = pd.read_csv('./data_format1_small/sample_user_log.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1_small/sample_user_info.csv')
train_data1 = pd.read_csv('./data_format1_small/train.csv')
submission = pd.read_csv('./data_format1_small/test.csv')
train_data = pd.read_csv('./data_format2/train_format2.csv')

In [53]:

train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)
#print(matrix)

In [54]:

# 使用merchant_id（原列名seller_id）
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)
# 格式化
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')

In [55]:

# 对离散特征做LabelEncoder
lbe_merchant_id=LabelEncoder()
lbe_merchant_id.fit(np.r_[0,user_log['merchant_id'].values])
user_log['merchant_id']=lbe_merchant_id.transform(user_log['merchant_id'])
matrix['merchant_id']=lbe_merchant_id.transform(matrix['merchant_id'])


In [56]:

lbe_user_id=LabelEncoder()
user_log['user_id']=lbe_user_id.fit_transform(user_log['user_id'])
user_info['user_id']=lbe_user_id.transform(user_info['user_id'])
matrix['user_id']=lbe_user_id.transform(matrix['user_id'])


In [57]:

lbe_item_id=LabelEncoder()
user_log['item_id']=lbe_item_id.fit_transform(user_log['item_id'])
lbe_cat_id=LabelEncoder()
user_log['cat_id']=lbe_cat_id.fit_transform(user_log['cat_id'])
lbe_brand_id=LabelEncoder()
user_log['brand_id']=lbe_brand_id.fit_transform(user_log['brand_id'])

user_log['merchant_id'].max(),user_log['user_id'].max()
matrix = matrix.merge(user_info, on='user_id', how='left')


In [58]:

# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
del user_info, train_data1
gc.collect()
print(matrix)

       user_id  merchant_id label origin  prob  age_range  gender
0        16497         1203   0.0  train   NaN          0       1
1         1950          946   0.0  train   NaN          2       0
2        10829         2278   0.0  train   NaN          3       0
3         7974          951   0.0  train   NaN          0       1
4        14604         1892   0.0  train   NaN          7       0
...        ...          ...   ...    ...   ...        ...     ...
23888     2157         1748   nan   test   0.0          0       0
23889     2673          798   nan   test   0.0          3       0
23890    11847          639   nan   test   0.0          2       1
23891    11847         3953   nan   test   0.0          2       1
23892    19079         2954   nan   test   0.0          4       0

[23893 rows x 7 columns]


In [59]:

# User特征处理
groups = user_log.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值的个数 item_id, cat_id, merchant_id, brand_id
#temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id':'u4', 'brand_id':'u5'})
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')


In [60]:

# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计action_type为0，1，2，3的个数（原始操作，没有补0）
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')
#print(matrix)

In [61]:

# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
#print(matrix)


  temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})


In [62]:

# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
print(temp)
print('-'*100)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔
#print(matrix)

  temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数


        user_id  merchant_id       um9
0             0          186  0.883333
1             0          202  0.000000
2             0          523  0.000000
3             0          774  0.000000
4             0          956  0.000000
...         ...          ...       ...
635221    19111         2874  0.000000
635222    19111         3833  0.000000
635223    19111         4480  0.000000
635224    19111         4522  0.000000
635225    19111         4950  0.000000

[635226 rows x 3 columns]
----------------------------------------------------------------------------------------------------


In [63]:

#用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] 
#商家购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] 
#不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']
matrix.fillna(0, inplace=True)
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)
#print(matrix)

In [64]:

lbe_action_type={0:1,1:2,2:3,3:4}
user_log['action_type']=user_log['action_type'].map(lbe_action_type)
# 用户行为sequence
# 把user_log里同user的这些数据合并成一个list
temp=pd.DataFrame(user_log.groupby('user_id')['merchant_id','action_type'].agg(lambda x:list(x)))
# 列名称改成hist_merchant_id 和 hist_action_type 
temp.columns=['hist_merchant_id','hist_action_type']
#print(temp)
matrix = matrix.merge(temp, on=['user_id'], how='left') #统计时间间隔
print(matrix)

  temp=pd.DataFrame(user_log.groupby('user_id')['merchant_id','action_type'].agg(lambda x:list(x)))


       user_id  merchant_id label origin  prob    u1   u2  u3  u4  u5  \
0        16497         1203   0.0  train   0.0    46   29  12  16  16   
1         1950          946   0.0  train   0.0   365  198  46  46  45   
2        10829         2278   0.0  train   0.0    47   31  14  15  17   
3         7974          951   0.0  train   0.0   234  105  23  35  36   
4        14604         1892   0.0  train   0.0   186  106  34  40  39   
...        ...          ...   ...    ...   ...   ...  ...  ..  ..  ..   
23888     2157         1748   nan   test   0.0   128   97  28  39  40   
23889     2673          798   nan   test   0.0  1286  540  55  93  96   
23890    11847          639   nan   test   0.0     9    8   7   7   7   
23891    11847         3953   nan   test   0.0     9    8   7   7   7   
23892    19079         2954   nan   test   0.0   197   85  36  39  40   

             u6      u7   u8    u9   u10    m1   m2   m3  m4  m5      m6  \
0      4.933333    45.0  0.0   1.0   0.0  1915 

In [65]:

# 截取，不缺到定长M个
M=500
for feature in ['hist_merchant_id','hist_action_type']:
    matrix[feature]=matrix[feature].map(lambda x:np.array(x+[0]*(M-len(x)))[:M])


In [66]:

# 分割训练数据和测试数据
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
print(train_X)

       user_id  merchant_id  prob   u1   u2  u3  u4  u5        u6     u7   u8  \
0        16497         1203   0.0   46   29  12  16  16  4.933333   45.0  0.0   
1         1950          946   0.0  365  198  46  46  45  5.866667  313.0  0.0   
2        10829         2278   0.0   47   31  14  15  17  5.833333   42.0  0.0   
3         7974          951   0.0  234  105  23  35  36  5.833333  177.0  0.0   
4        14604         1892   0.0  186  106  34  40  39  5.866667  147.0  0.0   
...        ...          ...   ...  ...  ...  ..  ..  ..       ...    ...  ...   
17832    18547         3825   0.0   40   24  13  14  15  4.916667   27.0  0.0   
17833    18693         3443   0.0   15   12  10  10  10  6.000000   10.0  0.0   
17834     4034         4172   0.0   16   14   8  11  12  5.833333   15.0  0.0   
17835    16017          993   0.0   33   25  12  15  15  4.916667   31.0  0.0   
17836     7308         4723   0.0   94   50  18  24  22  5.766667   80.0  0.0   

         u9   u10     m1   

In [78]:

print(len(train_X))

17837


In [67]:

# 使用DIN模型
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from deepctr.feature_column import DenseFeat,VarLenSparseFeat,SparseFeat,get_feature_names
from deepctr.models import DIN, DIEN, DSIN
from sklearn.metrics import classification_report

train_X['action_type']=3
feature_columns = []
for column in train_X.columns:
    if column != 'hist_merchant_id' and column != 'hist_action_type':
        print(column)
        num = train_X[column].nunique()
        if num > 10000:
            dim = 10
        else:
            if num > 1000:
                dim = 8
            else:
                dim = 4
        print(num)
        if column  == 'user_id':
            feature_columns += [SparseFeat(column, 19111+1, embedding_dim=dim)]
        elif column  == 'merchant_id':
            feature_columns += [SparseFeat(column, 4994+1, embedding_dim=dim)]
        elif column  == 'action_type':
            feature_columns += [SparseFeat(column, 4+1, embedding_dim=dim)]
        else:
            feature_columns += [DenseFeat(column, 1)]

#print(train_X['hist_merchant_id'].shape)
#M = len(train_X['hist_merchant_id'])

user_id
14488
merchant_id
1856
prob
1
u1
825
u2
539
u3
124
u4
246
u5
237
u6
184
u7
772
u8
17
u9
79
u10
161
m1
805
m2
406
m3
292
m4
56
m5
35
m6
757
m7
23
m8
208
m9
163
m10
1294
um1
170
um2
94
um3
22
um4
12
um5
166
um6
8
um7
10
um8
26
um9
184
r1
3265
r2
1413
r3
394
age_0
2
age_2
2
age_3
2
age_4
2
age_5
2
age_6
2
age_7
2
age_8
2
g_0
2
g_1
2
g_2
2
action_type
1


In [68]:

# maxlen为历史信息的长度，vocabulary_size为onehot的长度
feature_columns += [VarLenSparseFeat(SparseFeat('hist_merchant_id', vocabulary_size=19111+1, embedding_dim=8, embedding_name='merchant_id'),maxlen=M), 
                   VarLenSparseFeat(SparseFeat('hist_action_type', vocabulary_size=4+1, embedding_dim=4, embedding_name='action_type'), maxlen=M)]
hist_features=['merchant_id','action_type']
print(feature_columns)


[SparseFeat(name='user_id', vocabulary_size=19112, embedding_dim=10, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7faeedfde190>, embedding_name='user_id', group_name='default_group', trainable=True), SparseFeat(name='merchant_id', vocabulary_size=4995, embedding_dim=8, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fae46234490>, embedding_name='merchant_id', group_name='default_group', trainable=True), DenseFeat(name='prob', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u1', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u2', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u3', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u4', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u5', dimension=1, dtype='float32', transform_f

In [69]:

# 使用DIN模型
model=DIN(feature_columns, hist_features)
# 使用Adam优化器，二分类的交叉熵
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])


The following Variables were used a Lambda layer's call (lambda_2), but
are not present in its tracked objects:
  <tf.Variable 'attention_sequence_pooling_layer_3/local_activation_unit_3/kernel:0' shape=(40, 1) dtype=float32>
  <tf.Variable 'attention_sequence_pooling_layer_3/local_activation_unit_3/bias:0' shape=(1,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


In [70]:
train_y = train_y.astype('float')
train_y

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
17832    0.0
17833    0.0
17834    0.0
17835    0.0
17836    0.0
Name: label, Length: 17837, dtype: float64

In [71]:

# 组装train_model_input，得到feature names，将train_X转换为字典格式
feature_names=list(train_X.columns)
train_model_input = {name:train_X[name].values for name in feature_names}
# histroy输入必须是二维数组
from tqdm import tqdm
for fea in ['hist_merchant_id','hist_action_type']:
    l = []
    for i in tqdm(train_model_input[fea]):
        l.append(i)
    train_model_input[fea]=np.array(l)
history = model.fit(train_model_input, train_y, verbose=True, epochs=10, validation_split=0.2,batch_size=512)


100%|██████████| 17837/17837 [00:00<00:00, 1799923.02it/s]
100%|██████████| 17837/17837 [00:00<00:00, 1568061.88it/s]


Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [73]:

# 转换test__model_input
test_data['action_type']=3
test_model_input = {name:test_data[name].values for name in feature_names}
from tqdm import tqdm
for fea in ['hist_merchant_id','hist_action_type']:
    l = []
    for i in tqdm(test_model_input[fea]):
        l.append(i)
    test_model_input[fea]=np.array(l)

100%|██████████| 6056/6056 [00:00<00:00, 907006.07it/s]
100%|██████████| 6056/6056 [00:00<00:00, 1235142.48it/s]


In [74]:

# 得到预测结果
prob = model.predict(test_model_input)
submission['prob'] = prob
submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('prediction.csv', index=False)

In [1]:
prob

NameError: name 'prob' is not defined