In [2]:
import pandas as pd
import logging

# Log line layout: timestamp, level, originating module, then the message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)

In [3]:
# Directory the raw input files are read from.
base_path = "./data"
# Directory the generated feature files are written to.
feature_path = "./feature"

In [4]:
# Both invite files are headerless, tab-separated tables; the evaluation file
# is given the same column names minus the trailing 'label'.
invite_cols = ['qid', 'uid', 'dt', 'label']

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = invite_cols
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = invite_cols[:-1]
logging.info("test %s", test.shape)

[2019-12-09 07:52:01,114] INFO in <ipython-input-4-af34e33feb4f>: invite (9489162, 4)
[2019-12-09 07:52:03,716] INFO in <ipython-input-4-af34e33feb4f>: test (1141683, 3)


In [5]:
def extract_day(s):
    """Parse the integer day out of every timestamp string in ``s``.

    For each element, the text before the first '-' is taken, its first
    character is dropped, and the remainder is converted with ``int``
    (e.g. 'D12-H7' -> 12).

    Args:
        s: pandas Series of strings.

    Returns:
        A Series holding one parsed integer per element of ``s``.
    """
    def _day_of(stamp):
        day_part = stamp.split('-')[0]
        return int(day_part[1:])

    return s.apply(_day_of)


def extract_hour(s):
    """Parse the integer hour out of every timestamp string in ``s``.

    For each element, the second '-'-separated piece is taken, its first
    character is dropped, and the remainder is converted with ``int``
    (e.g. 'D12-H7' -> 7).

    Args:
        s: pandas Series of strings.

    Returns:
        A Series holding one parsed integer per element of ``s``.
    """
    def _hour_of(stamp):
        hour_part = stamp.split('-')[1]
        return int(hour_part[1:])

    return s.apply(_hour_of)

# Replace the raw 'dt' string in both frames with numeric 'day' and 'hour'
# columns, then add 'week' as the day modulo 7.
for frame in (train, test):
    stamps = frame['dt']
    frame['day'] = extract_day(stamps)
    frame['hour'] = extract_hour(stamps)
    del frame['dt']
    frame['week'] = frame['day'] % 7

In [46]:
# Stack train on top of test (columns sorted by name) so the features below
# are computed over both; the save cells split them again with len(train).
data = pd.concat(objs=[train, test], axis=0, sort=True)

# feature_v1

In [50]:
# Number of invitations per uid within the same hour, day and week value,
# followed by the same three counts per qid.
# Each spec is (entity key, time key, column whose non-null entries are counted).
count_specs = [
    ('uid', 'hour', 'qid'),
    ('uid', 'day', 'qid'),
    ('uid', 'week', 'qid'),
    ('qid', 'hour', 'uid'),
    ('qid', 'day', 'uid'),
    ('qid', 'week', 'uid'),
]
for entity, period, counted in count_specs:
    keys = [entity, period]
    group_sizes = data.groupby(keys)[counted].count().rename(f'{entity}_{period}_count')
    # Left merge on the group keys broadcasts each group's count back to its rows.
    data = pd.merge(data, group_sizes, on=keys, how='left')

In [52]:
# Preferred-time statistics per uid and per qid: mean, median and standard
# deviation of the invitation hour and of the week value.
stat_names = ['mean', 'median', 'std']
for entity in ('uid', 'qid'):
    # Group once per entity, before this entity's stat columns are merged in.
    by_entity = data.groupby(entity)
    for period in ('hour', 'week'):
        period_stats = by_entity[period].agg(stat_names)
        period_stats.columns = [f'{entity}_{period}_{stat}' for stat in stat_names]
        data = pd.merge(data, period_stats, on=entity, how='left')

In [53]:
# Columns written to the v1 feature files: the six count features first,
# then mean/median/std of hour and week for uid and for qid.
save_feat = [
    f'{entity}_{period}_count'
    for entity in ('uid', 'qid')
    for period in ('hour', 'day', 'week')
]
save_feat += [
    f'{entity}_{period}_{stat}'
    for entity in ('uid', 'qid')
    for period in ('hour', 'week')
    for stat in ('mean', 'median', 'std')
]

In [54]:
# Compress the data: narrow 64-bit feature columns to their 32-bit counterparts.
feat_dtypes = data[save_feat].dtypes
for wide, narrow in (('int64', 'int32'), ('float64', 'float32')):
    for col in feat_dtypes[feat_dtypes == wide].index:
        data[col] = data[col].astype(narrow)
    

In [57]:
# The first len(train) rows of `data` are the train rows, the rest are test
# (see the concat above); write each part as a tab-separated file.
n_train = len(train)
v1_features = data[save_feat]
v1_features[:n_train].to_csv(f'{feature_path}/train_invite_feature.txt', index=False, sep='\t')
v1_features[n_train:].to_csv(f'{feature_path}/test_invite_feature.txt', index=False, sep='\t')

# feature_v2

In [60]:
# Latest and earliest invitation hour seen for each uid.
uid_hour_range = data.groupby('uid')['hour'].agg(['max', 'min'])
uid_hour_range.columns = ['uid_hour_max', 'uid_hour_min']
data = data.merge(uid_hour_range, on='uid', how='left')

Unnamed: 0_level_0,uid_hour_max,uid_hour_min
uid,Unnamed: 1_level_1,Unnamed: 2_level_1
M1000000382,21,15
M1000000983,21,11
M1000003304,17,17
M1000008978,17,8
M1000009571,18,18
...,...,...
M99999341,9,9
M999995457,22,8
M99999571,0,0
M999998695,18,2


In [79]:
# Mean, median and standard deviation of the invitation day, first per uid
# and then per qid.
for entity in ('uid', 'qid'):
    day_stats = data.groupby(entity)['day'].agg(['mean', 'median', 'std'])
    day_stats.columns = [f'{entity}_day_mean', f'{entity}_day_median', f'{entity}_day_std']
    data = pd.merge(data, day_stats, on=entity, how='left')

In [91]:
# Gap between the invitation time and the uid's / qid's own preferred time
# (mean and median). Hour and week gaps are folded as half - | |a - b| - half |
# with half = 12 and 3.5 respectively; the day gap is a plain absolute difference.
for entity in ('uid', 'qid'):
    for period, half_cycle in (('hour', 12), ('day', None), ('week', 3.5)):
        for stat in ('mean', 'median'):
            gap = (data[period] - data[f'{entity}_{period}_{stat}']).abs()
            if half_cycle is not None:
                gap = half_cycle - (gap - half_cycle).abs()
            data[f'{entity}_diff_{period}_{period}{stat}'] = gap

In [97]:
# Load the question table (headerless, tab-separated).
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
# The title/description columns are dropped right after loading.
for unused_col in ('title_t1', 'title_t2', 'desc_t1', 'desc_t2'):
    del ques[unused_col]
logging.info("ques %s", ques.shape)

# Replace the raw 'q_dt' string with numeric day / hour / week columns.
q_stamp = ques.pop('q_dt')
ques['q_day'] = extract_day(q_stamp)
ques['q_hour'] = extract_hour(q_stamp)
ques['q_week'] = ques['q_day'] % 7

[2019-12-03 02:48:13,470] INFO in <ipython-input-97-9063dae39e17>: ques (1829900, 3)


In [99]:
# Attach the question columns to every invitation row; rows whose qid has no
# match in `ques` get NaN in those columns (left join).
data = data.merge(ques, on='qid', how='left')

In [110]:
# Gap between the invitation and the question's own timestamp, in whole days
# and in hours (24 * day gap + hour-of-day gap).
day_gap = data['day'] - data['q_day']
hour_of_day_gap = data['hour'] - data['q_hour']
data['diff_iq_day'] = day_gap
data['diff_iq_hour'] = day_gap * 24 + hour_of_day_gap

In [113]:
# Load the user table (headerless, tab-separated).
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = (
    ['uid', 'gender', 'freq']
    + [f'uf_b{i}' for i in range(1, 6)]
    + [f'uf_c{i}' for i in range(1, 6)]
    + ['score', 'follow_topic', 'inter_topic']
)

logging.info("user %s", user.shape)


[2019-12-03 03:11:52,824] INFO in <ipython-input-113-4187da278e6d>: user (1931654, 16)


In [114]:
# Attach the user columns to every invitation row; rows whose uid has no
# match in `user` get NaN in those columns (left join).
data = data.merge(user, on='uid', how='left')

In [119]:
# Number of invitations each question sent to users of each gender.
gender_keys = ['qid', 'gender']
gender_counts = data.groupby(gender_keys)['uid'].count().rename('qid_gender_count')
data = pd.merge(data, gender_counts, on=gender_keys, how='left')

In [131]:
# Number of invitations each question sent per value of the listed user
# categorical columns; the min/max of every new column is logged.
for feat in ['uf_b1', 'uf_b2','uf_b3', 'uf_b4', 'uf_b5', 'uf_c5']:
    count_col = 'qid_' + feat + '_count'
    keys = ['qid', feat]
    per_value = data.groupby(keys)['uid'].count().rename(count_col)
    data = pd.merge(data, per_value, on=keys, how='left')
    new_col = data[count_col]
    logging.info("feat %s, min %s, max %s", feat, new_col.min(), new_col.max())

[2019-12-03 03:43:03,575] INFO in <ipython-input-131-d56a60bbaa99>: feat uf_b1, min 1, max 4347
[2019-12-03 03:45:33,611] INFO in <ipython-input-131-d56a60bbaa99>: feat uf_b2, min 1, max 5221
[2019-12-03 03:47:28,937] INFO in <ipython-input-131-d56a60bbaa99>: feat uf_b3, min 1, max 6346
[2019-12-03 03:48:48,085] INFO in <ipython-input-131-d56a60bbaa99>: feat uf_b4, min 1, max 6293
[2019-12-03 03:50:02,486] INFO in <ipython-input-131-d56a60bbaa99>: feat uf_b5, min 1, max 6227
[2019-12-03 03:51:18,195] INFO in <ipython-input-131-d56a60bbaa99>: feat uf_c5, min 1, max 6177


In [156]:
# Number of topics shared by follow_topic and topic
def topic_intersection_count_1(follow_topic, ques_topic):
    """Count the distinct topics present in both comma-separated lists.

    Args:
        follow_topic: comma-separated topic ids; a first entry of '-1' is
            treated as "no topics".
        ques_topic: comma-separated topic ids; a first entry of '-1' is
            treated as "no topics".

    Returns:
        int: size of the intersection, or 0 when either argument has '-1'
        as its first entry or is not a string.
    """
    # The left merges that attach user/question columns leave NaN (a float)
    # for unmatched keys; count that as "no overlap" instead of failing on
    # .split().
    if not isinstance(follow_topic, str) or not isinstance(ques_topic, str):
        return 0
    followed = follow_topic.split(',')
    asked = ques_topic.split(',')
    if followed[0] == '-1' or asked[0] == '-1':
        return 0
    return len(set(followed) & set(asked))

# Number of inter_topic entries whose topic also appears in topic
def topic_intersection_count_2(inter_topic, ques_topic):
    """Count the 'topic:weight' entries whose topic is one of the question's topics.

    Args:
        inter_topic: comma-separated 'topic:weight' entries; a first entry of
            '-1' is treated as "no topics".
        ques_topic: comma-separated topic ids; a first entry of '-1' is
            treated as "no topics".

    Returns:
        int: number of matching entries, or 0 when either argument has '-1'
        as its first entry or is not a string.
    """
    # The left merges that attach user/question columns leave NaN (a float)
    # for unmatched keys; count that as "no overlap" instead of failing on
    # .split().
    if not isinstance(inter_topic, str) or not isinstance(ques_topic, str):
        return 0
    entries = inter_topic.split(',')
    asked = ques_topic.split(',')
    if entries[0] == '-1' or asked[0] == '-1':
        return 0
    # Set membership avoids rescanning the question's topic list per entry.
    asked_set = set(asked)
    return sum(1 for entry in entries if entry.split(':')[0] in asked_set)

# Summed weight of the inter_topic entries whose topic also appears in topic
def topic_intersection_score(inter_topic, ques_topic):
    """Sum the weights of 'topic:weight' entries whose topic is one of the question's topics.

    Args:
        inter_topic: comma-separated 'topic:weight' entries; a first entry of
            '-1' is treated as "no topics".
        ques_topic: comma-separated topic ids; a first entry of '-1' is
            treated as "no topics".

    Returns:
        The sum of the matching weights as a float, or the int 0 when nothing
        matches, when either argument has '-1' as its first entry, or when
        either argument is not a string. Weights are parsed with ``float``,
        which accepts spellings such as 'Infinity', so the result can be inf.
    """
    # The left merges that attach user/question columns leave NaN (a float)
    # for unmatched keys; score that as 0 instead of failing on .split().
    if not isinstance(inter_topic, str) or not isinstance(ques_topic, str):
        return 0
    entries = inter_topic.split(',')
    asked = ques_topic.split(',')
    if entries[0] == '-1' or asked[0] == '-1':
        return 0
    # Set membership avoids rescanning the question's topic list per entry.
    asked_set = set(asked)
    score = 0
    for entry in entries:
        parts = entry.split(':')
        if parts[0] in asked_set:
            score += float(parts[1])
    return score

In [169]:
# Topic-overlap features between the invited user and the question.
# Each spec is (output column, user-side column, row-level function):
#   intersection_ft_count - topics shared by follow_topic and topic
#   intersection_it_count - inter_topic entries whose topic is in topic
#   intersection_it_score - summed weight of those inter_topic entries
overlap_specs = [
    ('intersection_ft_count', 'follow_topic', topic_intersection_count_1),
    ('intersection_it_count', 'inter_topic', topic_intersection_count_2),
    ('intersection_it_score', 'inter_topic', topic_intersection_score),
]
for feat_name, user_col, overlap_fn in overlap_specs:
    # Walk the two needed columns in lockstep rather than calling
    # DataFrame.apply(axis=1), which builds a Series for every row and took
    # 11-22 minutes per feature in the recorded run.
    values = [
        overlap_fn(user_topics, ques_topics)
        for user_topics, ques_topics in zip(data[user_col], data['topic'])
    ]
    # Plain column assignment (instead of pd.concat) overwrites the column if
    # the cell is re-run rather than appending a duplicate-named column.
    data[feat_name] = values
    logging.info('%s, max: %s', feat_name, data[feat_name].max())

[2019-12-03 05:49:48,638] INFO in <ipython-input-169-fa4a04937d75>: intersection_ft_count, max: 6
[2019-12-03 06:11:57,893] INFO in <ipython-input-169-fa4a04937d75>: intersection_it_count, max: 5
[2019-12-03 06:22:53,170] INFO in <ipython-input-169-fa4a04937d75>: intersection_it_score, max: inf


In [180]:
# Two rows carry an infinite score; replace +inf with 10.
import numpy as np

capped_score = data['intersection_it_score'].replace(np.inf, 10)
data['intersection_it_score'] = capped_score

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and uses the same
# expressions as the earlier diff-feature cell (In [91]); it looks like a
# leftover copy - confirm whether it can be removed.
#
# Gap between the invitation time and the uid's / qid's own preferred time
# (mean and median).
def _folded_gap(value, center, half_cycle):
    """Return half_cycle - | |value - center| - half_cycle |."""
    return half_cycle - abs(abs(value - center) - half_cycle)


def _plain_gap(value, center):
    """Return |value - center|."""
    return abs(value - center)


# uid side
data['uid_diff_hour_hourmean'] = _folded_gap(data['hour'], data['uid_hour_mean'], 12)
data['uid_diff_hour_hourmedian'] = _folded_gap(data['hour'], data['uid_hour_median'], 12)
data['uid_diff_day_daymean'] = _plain_gap(data['day'], data['uid_day_mean'])
data['uid_diff_day_daymedian'] = _plain_gap(data['day'], data['uid_day_median'])
data['uid_diff_week_weekmean'] = _folded_gap(data['week'], data['uid_week_mean'], 3.5)
data['uid_diff_week_weekmedian'] = _folded_gap(data['week'], data['uid_week_median'], 3.5)

# qid side
data['qid_diff_hour_hourmean'] = _folded_gap(data['hour'], data['qid_hour_mean'], 12)
data['qid_diff_hour_hourmedian'] = _folded_gap(data['hour'], data['qid_hour_median'], 12)
data['qid_diff_day_daymean'] = _plain_gap(data['day'], data['qid_day_mean'])
data['qid_diff_day_daymedian'] = _plain_gap(data['day'], data['qid_day_median'])
data['qid_diff_week_weekmean'] = _folded_gap(data['week'], data['qid_week_mean'], 3.5)
data['qid_diff_week_weekmedian'] = _folded_gap(data['week'], data['qid_week_median'], 3.5)

In [185]:
# Columns written to the v2 feature files, in output order.
save_feat = ['uid_hour_max', 'uid_hour_min']
# Time-gap features: entity x period x statistic.
save_feat += [
    f'{entity}_diff_{period}_{period}{stat}'
    for entity in ('uid', 'qid')
    for period in ('hour', 'day', 'week')
    for stat in ('mean', 'median')
]
save_feat += ['q_hour', 'q_week']
save_feat += ['diff_iq_day', 'diff_iq_hour']
# Per-question invitation counts split by user attribute.
save_feat += [
    f'qid_{attr}_count'
    for attr in ('gender', 'uf_b1', 'uf_b2', 'uf_b3', 'uf_b4', 'uf_b5', 'uf_c5')
]
save_feat += ['intersection_ft_count', 'intersection_it_count', 'intersection_it_score']

In [186]:
# Compress the data: narrow 64-bit feature columns to their 32-bit counterparts.
feat_dtypes = data[save_feat].dtypes
for wide, narrow in (('int64', 'int32'), ('float64', 'float32')):
    for col in feat_dtypes[feat_dtypes == wide].index:
        data[col] = data[col].astype(narrow)

In [191]:
# The first len(train) rows of `data` are the train rows, the rest are test;
# write each part as a tab-separated file.
n_train = len(train)
v2_features = data[save_feat]
v2_features[:n_train].to_csv(f'{feature_path}/train_invite_feature_2.txt', index=False, sep='\t')
v2_features[n_train:].to_csv(f'{feature_path}/test_invite_feature_2.txt', index=False, sep='\t')

# feature_v3