In [1]:
import gc
import pandas as pd

### 1 加载数据

In [2]:
user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./data_format1/train_format1.csv')
submission = pd.read_csv('./data_format1/test_format1.csv')

In [3]:
train_data = pd.read_csv('./data_format2/train_format2.csv')

### 2 数据处理

#### 2.1 给数据添加origin字段，方便数据处理完成之后将训练数据和预测数据划分开

In [4]:
train_data1.head()

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0


In [5]:
submission.head()

Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,
1,360576,1581,
2,98688,1964,
3,98688,3645,
4,295296,3361,


In [6]:
train_data1['origin'] = 'train'
submission['origin'] = 'test'

In [7]:
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)
matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,prob
0,34176,3906,0.0,train,
1,34176,121,0.0,train,
2,34176,4356,1.0,train,
3,34176,2217,0.0,train,
4,230784,4818,0.0,train,


In [8]:
matrix.shape

(522341, 5)

In [9]:
matrix.drop(['prob'], axis=1, inplace=True)

#### 2.2 连接user_info表，通过user_id关联

In [10]:
matrix = matrix.merge(user_info, on='user_id', how='left')

In [11]:
matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender
0,34176,3906,0.0,train,6.0,0.0
1,34176,121,0.0,train,6.0,0.0
2,34176,4356,1.0,train,6.0,0.0
3,34176,2217,0.0,train,6.0,0.0
4,230784,4818,0.0,train,0.0,0.0


#### 2.3 将user_log的商家id(seller_id)字段名改为train_data中使用的商家id名（merchant_id）,方便下面对齐使用

In [12]:
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [13]:
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


#### 2.4 对user_log的数值类型进行格式化

In [14]:
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')

#### 2.5 对matrix的数值类型进行格式化

In [15]:
matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender
0,34176,3906,0.0,train,6.0,0.0
1,34176,121,0.0,train,6.0,0.0
2,34176,4356,1.0,train,6.0,0.0
3,34176,2217,0.0,train,6.0,0.0
4,230784,4818,0.0,train,0.0,0.0


In [16]:
# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')

#### 2.6 防止内存不够，删除并回收用不到的资源

In [17]:
del user_info, train_data1
gc.collect()

0

#### 2.7 对User进行特征处理

In [18]:
groups = user_log.groupby(['user_id'])

In [19]:
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
temp

Unnamed: 0,user_id,u1
0,1,33
1,2,63
2,3,68
3,4,50
4,5,173
...,...,...
424165,424166,90
424166,424167,35
424167,424168,223
424168,424169,297


In [20]:
matrix = matrix.merge(temp, on='user_id', how='left')
matrix

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1
0,34176,3906,0.0,train,6,0,451
1,34176,121,0.0,train,6,0,451
2,34176,4356,1.0,train,6,0,451
3,34176,2217,0.0,train,6,0,451
4,230784,4818,0.0,train,0,0,54
...,...,...,...,...,...,...,...
522336,228479,3111,,test,6,0,2004
522337,97919,2341,,test,8,1,55
522338,97919,3971,,test,8,1,55
522339,32639,3536,,test,0,0,72


In [21]:
# 对于每个user_id 不重复的item_id的数量 => u2
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
temp

Unnamed: 0,user_id,u2
0,1,12
1,2,43
2,3,45
3,4,28
4,5,87
...,...,...
424165,424166,48
424166,424167,15
424167,424168,160
424168,424169,176


In [22]:
matrix = matrix.merge(temp, on='user_id', how='left')
matrix

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2
0,34176,3906,0.0,train,6,0,451,256
1,34176,121,0.0,train,6,0,451,256
2,34176,4356,1.0,train,6,0,451,256
3,34176,2217,0.0,train,6,0,451,256
4,230784,4818,0.0,train,0,0,54,31
...,...,...,...,...,...,...,...,...
522336,228479,3111,,test,6,0,2004,1173
522337,97919,2341,,test,8,1,55,29
522338,97919,3971,,test,8,1,55,29
522339,32639,3536,,test,0,0,72,46


In [23]:
# 对于每个user_id 不重复的cat_id的数量 => u3
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

In [24]:
# 时间间隔特征 u6 按照小时
# 对于每个user_id 计算time_stamp的最小时间 => F_time, 最大时间max => L_time
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')

In [25]:
# 统计操作类型为0，1，2，3的个数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

#### 2.8 商家特征处理

In [26]:
groups = user_log.groupby(['merchant_id'])

In [27]:
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [28]:
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

  


In [29]:
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [30]:
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [31]:
# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔

  """


In [32]:
#用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] 

In [33]:
#商家购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] 

In [34]:
#不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']
matrix.fillna(0, inplace=True)

#### 2.9 修改age_range字段名称为 age_0, age_1, age_2... age_8

In [35]:
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

#### 2.10 分割训练数据和测试数据

In [36]:
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
del temp, matrix
gc.collect()

12

### 3 使用机器学习建模 

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report
import xgboost as xgb

#### 3.1 将训练数据切分成训练集和验证集 

In [38]:
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2)

In [39]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((208691, 46), (52173, 46), (208691,), (52173,))

In [40]:
X_train.head()

Unnamed: 0,user_id,merchant_id,u1,u2,u3,u4,u5,u6,u7,u8,...,age_2,age_3,age_4,age_5,age_6,age_7,age_8,g_0,g_1,g_2
172820,396925,4282,112,37,9,13,13,4.783333,101.0,0.0,...,0,1,0,0,0,0,0,0,1,0
191356,157364,742,117,77,32,47,43,5.966667,96.0,0.0,...,0,0,0,0,1,0,0,0,1,0
212035,86000,4830,59,37,19,23,23,5.7,52.0,0.0,...,0,0,0,0,0,0,0,0,0,1
28724,374229,2031,66,52,15,16,15,4.866667,64.0,0.0,...,0,0,0,0,0,1,0,0,1,0
44773,384516,1677,109,47,15,12,14,5.916667,93.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [41]:
y_train.value_counts()

0.0    195853
1.0     12838
Name: label, dtype: int64

#### 3.2 建立xgb模型

In [42]:
model = xgb.XGBClassifier(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42    
)

In [43]:
%%time
model.fit(
    X_train, y_train,
    eval_metric='auc', eval_set=[(X_train, y_train), (X_valid, y_valid)],
    verbose=True,
    #早停法，如果auc在10epoch没有进步就stop
    early_stopping_rounds=10 
)

[0]	validation_0-auc:0.63873	validation_1-auc:0.63108
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.64753	validation_1-auc:0.64130
[2]	validation_0-auc:0.65711	validation_1-auc:0.64738
[3]	validation_0-auc:0.66408	validation_1-auc:0.65306
[4]	validation_0-auc:0.66637	validation_1-auc:0.65262
[5]	validation_0-auc:0.66674	validation_1-auc:0.65161
[6]	validation_0-auc:0.66702	validation_1-auc:0.65376
[7]	validation_0-auc:0.66776	validation_1-auc:0.65552
[8]	validation_0-auc:0.66916	validation_1-auc:0.65436
[9]	validation_0-auc:0.67005	validation_1-auc:0.65525
[10]	validation_0-auc:0.67064	validation_1-auc:0.65497
[11]	validation_0-auc:0.67160	validation_1-auc:0.65760
[12]	validation_0-auc:0.67354	validation_1-auc:0.65886
[13]	validation_0-auc:0.67633	validation_1-auc:0.66172
[14]	validation_0-auc:0.67768	validation_1-auc:0.66189
[15]	validation_0-auc:0.6796

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=8,
              min_child_weight=300, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [44]:
%%time
model.fit(X_train, y_train)

Wall time: 3min 30s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=8,
              min_child_weight=300, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
prob = model.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:,1])
submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('prediction_all_xgb.csv', index=False)

### 4 分数

+ score:0.6762800

![](score.png)