In [2]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split

In [3]:
user_log = pd.read_csv('D:/TianChi/RepeatBuyer/data/data_format1/user_log_format1.csv')
user_info = pd.read_csv('D:/TianChi/RepeatBuyer/data/data_format1/user_info_format1.csv')
train_format1 = pd.read_csv('D:/TianChi/RepeatBuyer/data/data_format1/train_format1.csv')
test_format1 = pd.read_csv('D:/TianChi/RepeatBuyer/data/data_format1/test_format1.csv')
train_format2 = pd.read_csv('D:/TianChi/RepeatBuyer/data/data_format2/train_format2.csv')

In [5]:
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [6]:
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [7]:
train_format1.head()

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0


In [3]:
# 将train和test拼接在一起，方便同步添加特征,后续添加特征只需要一步一步用merge函数加入matrix就可以
train_format1['origin'] = 'train'
test_format1['origin'] = 'test'
matrix = pd.concat([train_format1, test_format1], ignore_index=True, sort=False)
matrix.drop('prob', axis=1, inplace=True)
matrix = matrix.merge(user_info, on='user_id', how='left')

In [4]:
# 把age_range和gender的异常值都转为nan，然后再向前填充
matrix['age_range'][matrix['age_range'] == 0] = np.nan
matrix['gender'][matrix['gender'] == 2] = np.nan
matrix['age_range'].fillna(method='ffill', inplace=True)
matrix['gender'].fillna(method='ffill', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = user_log['time_stamp'].astype('str')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%m%d')
user_log['month'] = pd.to_datetime(user_log['time_stamp'], format='%Y%m')
user_log['month'] = user_log['time_stamp'].dt.month

In [6]:
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')

### 用户user_id特征构建

In [7]:
# groups是按user_id分的组，所以后续计算值都是在user_id组内
groups = user_log.groupby(['user_id'])
# 每个user_id的数量（交互的次数）
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 每个user_id组内不同item_id的个数
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 每个user_id组内不同cat_id的个数
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 每个user_id组内不同merchant_id的个数
temp = groups['merchant_id'].agg([('u4','nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 每个user_id组内不同brand_id的个数
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 每个user_id购物的天数(不同的time_stamp的个数)
temp = groups['time_stamp'].agg([('u11', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 每个user_id购物的月数(不同的month的个数)
temp = groups['month'].agg([('u12', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 每个user_id组内最后一次购物和第一次购物的时间差
temp = groups['time_stamp'].agg([('first_time', 'min'), ('last_time', 'max')]).reset_index()
temp['u6'] = (temp['last_time'] - temp['first_time']).dt.days
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 每个user_id组内各个action_type的次数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

### 商家merchant_id特征构建

In [8]:
groups = user_log.groupby(['merchant_id'])
# 每个merchant_id的数量
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 每个merchant_id组内user_id,item_id,cat_id,brand_id的数量
temp = groups[['user_id', 'item_id', 'cat_id', 'brand_id']].nunique().reset_index()
temp.rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'}, inplace=True)
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 每个merchant_id有人光顾的天数(不同的time_stamp的个数)
temp = groups['time_stamp'].agg([('m13', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 每个merchant_id有人光顾的月数(不同的month的个数)
# temp = groups['month'].agg([('m14', 'nunique')]).reset_index()
# matrix = matrix.merge(temp, on='merchant_id', how='left')
# 每个merchant_id组内各个action_type的次数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 从train_format2里面挖掘merchant_id特征
# 每个merchant_id中不是该merchant_id的新客户的数量
temp = train_format2[train_format2['label'] == -1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 每个merchant_id中的重复买家的数量
temp = train_format2[train_format2['label'] == 1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m11'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 每个merchant_id中的非重复买家的数量
temp = train_format2[train_format2['label'] == 0].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m12'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

### 添加user_id和merchant_id共有的特征

In [9]:
# 添加user_id和merchant_id共有的特征，groupby(['user_id', 'merchant_id'])
groups = user_log.groupby(['user_id', 'merchant_id'])
# 每个（user_id, mercant_id）组内的数量
temp = groups.size().reset_index().rename(columns={0:'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 每个（user_id, mercant_id）组内的item_id，cat_id，brand_id的数量
temp = groups[['item_id', 'cat_id', 'brand_id']].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 每个(user_id, merchant_id)的time_stamp的数量，表示每个用户在同一家店交互的天数
temp = groups['time_stamp'].agg([('um10', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 每个(user_id, merchant_id)的month的数量(不同的month的个数)
# temp = groups['month'].agg([('um11', 'nunique')]).reset_index()
# matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 每个（user_id, mercant_id）组内的action_type的数量
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 每个（user_id, mercant_id）组内的最后一次购物和第一次购物的时间差
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.days
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

### 添加比值特征

In [10]:
# 每个user_id的购买和点击次数之比，相当于一个购买的转化率
matrix['r1'] = matrix['u9']/matrix['u7']
# 每个merchant_id的购买和点击次数之比
matrix['r2'] = matrix['m8']/matrix['m6']
# 每个（user_id, merchant_id）的购买和点击次数之比
matrix['r3'] = matrix['um7']/matrix['um5']

In [11]:
# 将nan填充为0
matrix.fillna(0, inplace=True)

In [12]:
# 将age_range, gender类别变量进行one-hot编码
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

In [13]:
matrix

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u11,...,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,g_0,g_1
0,34176,3906,0.0,train,451,256,45,109,108,47,...,0,0,0,0,0,1,0,0,1,0
1,34176,121,0.0,train,451,256,45,109,108,47,...,0,0,0,0,0,1,0,0,1,0
2,34176,4356,1.0,train,451,256,45,109,108,47,...,0,0,0,0,0,1,0,0,1,0
3,34176,2217,0.0,train,451,256,45,109,108,47,...,0,0,0,0,0,1,0,0,1,0
4,230784,4818,0.0,train,54,31,17,20,19,16,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,,test,2004,1173,71,278,282,110,...,0,0,0,0,0,1,0,0,1,0
522337,97919,2341,,test,55,29,14,17,17,8,...,0,0,0,0,0,0,0,1,0,1
522338,97919,3971,,test,55,29,14,17,17,8,...,0,0,0,0,0,0,0,1,0,1
522339,32639,3536,,test,72,46,24,33,35,15,...,0,0,0,0,0,0,0,1,1,0


In [14]:
# 划分训练集
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['origin', 'label'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.2, random_state=3)

### 模型建立与训练

In [22]:
model_lightgbm = lightgbm.LGBMClassifier(num_leaves=50, n_estimators=1000, objective='binary', learning_rate=0.018, max_depth=8, subsample=0.8, colsample_bytree=0.8, min_child_weight=300)
model_lightgbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, eval_metric='auc')

[1]	valid_0's auc: 0.680938	valid_0's binary_logloss: 0.228246
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.682034	valid_0's binary_logloss: 0.227861
[3]	valid_0's auc: 0.683013	valid_0's binary_logloss: 0.227609
[4]	valid_0's auc: 0.690394	valid_0's binary_logloss: 0.227183
[5]	valid_0's auc: 0.688262	valid_0's binary_logloss: 0.226949
[6]	valid_0's auc: 0.687361	valid_0's binary_logloss: 0.226742
[7]	valid_0's auc: 0.692675	valid_0's binary_logloss: 0.226348
[8]	valid_0's auc: 0.695154	valid_0's binary_logloss: 0.225968
[9]	valid_0's auc: 0.698548	valid_0's binary_logloss: 0.225592
[10]	valid_0's auc: 0.698675	valid_0's binary_logloss: 0.225267
[11]	valid_0's auc: 0.700376	valid_0's binary_logloss: 0.224924
[12]	valid_0's auc: 0.700699	valid_0's binary_logloss: 0.224607
[13]	valid_0's auc: 0.702246	valid_0's binary_logloss: 0.224275
[14]	valid_0's auc: 0.702838	valid_0's binary_logloss: 0.223971
[15]	valid_0's auc: 0.702607	valid_0's binary_loglos

LGBMClassifier(colsample_bytree=0.8, learning_rate=0.018, max_depth=8,
               min_child_weight=300, n_estimators=1000, num_leaves=50,
               objective='binary', subsample=0.8)

In [23]:
prob = model_lightgbm.predict_proba(test_data)
test_format1['prob'] = pd.Series(prob[:,1])
# test_format1.drop(['origin'], axis=1, inplace=True)
test_format1.to_csv('prediction.csv', index=False)