In [None]:
"""当天的 invite 情况"""

In [1]:
import pandas as pd
import logging

# Log line layout: timestamp, level, originating module, then the message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
# Emit INFO and above through the root logger using that layout.
logging.basicConfig(level=logging.INFO, format=log_fmt)

In [2]:
# Directory the raw invite files are read from.
base_path = './data'
# Directory the generated feature files are written to.
feature_path = './feature'

In [3]:
def _read_invites(path, column_names):
    """Read a headerless tab-separated file and label its columns with ``column_names``."""
    frame = pd.read_csv(path, header=None, sep='\t')
    frame.columns = column_names
    return frame


# Labelled invitations: question id, user id, timestamp string and label.
train = _read_invites(f'{base_path}/invite_info_0926.txt', ['qid', 'uid', 'dt', 'label'])
logging.info("invite %s", train.shape)

# Evaluation invitations: same layout without the label column.
test = _read_invites(f'{base_path}/invite_info_evaluate_0926.txt', ['qid', 'uid', 'dt'])
logging.info("test %s", test.shape)

[2019-12-01 04:07:09,012] INFO in <ipython-input-3-af34e33feb4f>: invite (9489162, 4)
[2019-12-01 04:07:10,420] INFO in <ipython-input-3-af34e33feb4f>: test (1141683, 3)


In [4]:
def extract_day(s):
    """Return the integer day encoded in each timestamp string of ``s``.

    Every element is split on '-', the first piece is taken, its leading
    character is dropped and the rest is converted with ``int``.  For a
    value shaped like ``'D3865-H22'`` (assumed input format -- confirm
    against the data) this yields ``3865``.

    The result keeps the index and name of ``s``.
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]
        # Drop the one-character prefix before converting.
        return int(day_token[1:])

    return s.apply(_day_of)


def extract_hour(s):
    """Return the integer hour encoded in each timestamp string of ``s``.

    Every element is split on '-', the second piece is taken, its leading
    character is dropped and the rest is converted with ``int``.  For a
    value shaped like ``'D3865-H22'`` (assumed input format -- confirm
    against the data) this yields ``22``.

    The result keeps the index and name of ``s``.
    """
    def _hour_of(stamp):
        hour_token = stamp.split('-')[1]
        # Drop the one-character prefix before converting.
        return int(hour_token[1:])

    return s.apply(_hour_of)

# Replace the raw 'dt' string of both frames with numeric time columns:
#   day  -- parsed from the first part of 'dt'
#   hour -- parsed from the second part of 'dt'
#   week -- day modulo 7, i.e. the position inside a 7-day cycle (which
#           remainder corresponds to which weekday is not determined here)
for frame in (train, test):
    frame['day'] = extract_day(frame['dt'])
    frame['hour'] = extract_hour(frame['dt'])
    # The raw string is no longer needed once day and hour are extracted.
    del frame['dt']
    frame['week'] = frame['day'] % 7

In [46]:
# Stack train and test rows (train first) so the features are computed over
# every invitation.  sort=True orders the union of columns alphabetically;
# test rows get NaN in 'label', which only train has.
# ignore_index=True gives the result a fresh 0..n-1 index: without it the
# test rows repeat index labels already used by train rows, which makes any
# label-based lookup on the combined frame ambiguous.
data = pd.concat([train, test], axis=0, sort=True, ignore_index=True)

In [50]:
# Number of invitations each uid received in the same hour, on the same day
# and at the same 'week' value as the current row.
uid_count_plan = (
    ('hour', 'uid_hour_count'),
    ('day', 'uid_day_count'),
    ('week', 'uid_week_count'),
)
for time_col, feat_name in uid_count_plan:
    group_keys = ['uid', time_col]
    # Count non-null qid values per (uid, time bucket) ...
    invite_count = data.groupby(group_keys)['qid'].count()
    invite_count.name = feat_name
    # ... and attach that count to every row of the bucket.
    data = pd.merge(data, invite_count, on=group_keys, how='left')

In [51]:
# Number of invitations sent for each qid in the same hour, on the same day
# and at the same 'week' value as the current row.
qid_count_plan = (
    ('hour', 'qid_hour_count'),
    ('day', 'qid_day_count'),
    ('week', 'qid_week_count'),
)
for time_col, feat_name in qid_count_plan:
    group_keys = ['qid', time_col]
    # Count non-null uid values per (qid, time bucket) ...
    invite_count = data.groupby(group_keys)['uid'].count()
    invite_count.name = feat_name
    # ... and attach that count to every row of the bucket.
    data = pd.merge(data, invite_count, on=group_keys, how='left')

In [52]:
# Preferred-time statistics per uid and per qid: mean, median and standard
# deviation of the 'hour' and 'week' columns over all of that key's rows.
stat_funcs = ['mean', 'median', 'std']
stat_plan = (
    ('uid', (
        ('hour', ['uid_hour_mean', 'uid_hour_median', 'uid_hour_std']),
        ('week', ['uid_week_mean', 'uid_week_median', 'uid_week_std']),
    )),
    ('qid', (
        ('hour', ['qid_hour_mean', 'qid_hour_median', 'qid_hour_std']),
        ('week', ['qid_week_mean', 'qid_week_median', 'qid_week_std']),
    )),
)
for key, field_plan in stat_plan:
    # Group once per key; the merges below only add columns, so the grouping
    # stays valid for both fields.
    grouped = data.groupby(key)
    for field, out_columns in field_plan:
        stats = grouped[field].agg(stat_funcs)
        stats.columns = out_columns
        data = pd.merge(data, stats, on=key, how='left')

In [53]:
# Feature columns to export, in output-file column order.
save_feat = [
    # invitation counts per uid / qid and time bucket
    'uid_hour_count', 'uid_day_count', 'uid_week_count',
    'qid_hour_count', 'qid_day_count', 'qid_week_count',
    # per-uid hour and week statistics
    'uid_hour_mean', 'uid_hour_median', 'uid_hour_std',
    'uid_week_mean', 'uid_week_median', 'uid_week_std',
    # per-qid hour and week statistics
    'qid_hour_mean', 'qid_hour_median', 'qid_hour_std',
    'qid_week_mean', 'qid_week_median', 'qid_week_std',
]

In [54]:
# Shrink the data: downcast 64-bit feature columns to their 32-bit counterpart.
feat_dtypes = data[save_feat].dtypes
for wide, narrow in (('int64', 'int32'), ('float64', 'float32')):
    for col in feat_dtypes[feat_dtypes == wide].index:
        data[col] = data[col].astype(narrow)

In [55]:
# Feature rows belonging to train (the first len(train) rows of data).
data[save_feat].iloc[:len(train)]

Unnamed: 0,uid_hour_count,uid_day_count,uid_week_count,qid_hour_count,qid_day_count,qid_week_count,uid_hour_mean,uid_hour_median,uid_hour_std,uid_week_mean,uid_week_median,uid_week_std,qid_hour_mean,qid_hour_median,qid_hour_std,qid_week_mean,qid_week_median,qid_week_std
0,2,1,2,29,17,17,20.400000,22.0,4.722288,2.200000,2.0,1.643168,18.093023,22.0,6.903463,1.604651,1.0,1.953502
1,2,2,2,1,2,2,9.875000,9.5,3.044316,3.625000,4.0,1.922610,14.533334,13.0,6.334336,3.333333,3.0,1.988060
2,4,1,3,1,3,3,13.071428,13.0,3.561855,3.357143,3.5,1.736803,14.500000,13.5,4.847680,3.166667,4.0,2.228602
3,2,1,1,2,2,2,8.714286,8.0,6.421690,3.428571,3.0,1.718249,18.000000,20.5,4.956958,4.125000,4.0,1.885092
4,2,2,2,2,14,49,14.900000,16.0,6.740425,3.000000,3.0,2.108185,12.405941,13.0,6.369700,2.775578,2.0,1.618129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9489157,3,2,8,3,3,3,13.648149,12.0,5.837929,2.925926,3.0,2.126667,11.000000,11.0,0.000000,6.000000,6.0,0.000000
9489158,7,3,9,7,8,8,13.648149,12.0,5.837929,2.925926,3.0,2.126667,12.750000,12.0,2.121320,5.000000,5.0,0.000000
9489159,6,2,11,49,49,49,13.648149,12.0,5.837929,2.925926,3.0,2.126667,8.137255,8.0,1.131717,2.058824,2.0,0.310597
9489160,10,3,11,34,34,34,13.648149,12.0,5.837929,2.925926,3.0,2.126667,7.243243,7.0,1.498247,0.432432,0.0,1.500751


In [56]:
# Feature rows belonging to test (everything after the first len(train) rows).
data[save_feat].iloc[len(train):]

Unnamed: 0,uid_hour_count,uid_day_count,uid_week_count,qid_hour_count,qid_day_count,qid_week_count,uid_hour_mean,uid_hour_median,uid_hour_std,uid_week_mean,uid_week_median,uid_week_std,qid_hour_mean,qid_hour_median,qid_hour_std,qid_week_mean,qid_week_median,qid_week_std
9489162,2,1,1,4,3,3,17.799999,20.0,5.493431,3.600000,4.0,1.837873,11.000000,9.0,2.581989,4.857143,5.0,1.345185
9489163,2,2,2,3,3,3,19.000000,20.5,4.516636,2.000000,1.0,2.449490,10.344828,12.0,6.820536,3.413793,2.0,2.625660
9489164,1,1,2,1,3,3,10.333333,8.5,5.851703,2.583333,3.0,1.729862,15.666667,16.0,0.577350,3.000000,3.0,0.000000
9489165,1,1,1,1,1,1,12.750000,13.0,7.675719,2.250000,2.5,1.707825,10.428572,10.0,4.503966,3.571429,5.0,1.988060
9489166,1,1,1,1,1,1,11.800000,10.0,4.549725,3.600000,4.0,1.673320,19.000000,19.0,,1.000000,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10630840,1,1,1,11,11,11,19.777779,21.0,3.419714,3.333333,3.0,2.000000,19.500000,20.0,1.732051,5.083333,5.0,0.288675
10630841,1,1,1,2,3,3,21.000000,21.5,2.160247,4.000000,4.5,2.160247,20.666666,21.0,0.577350,1.000000,1.0,0.000000
10630842,1,1,2,1,2,2,16.916666,18.0,3.315483,3.083333,3.0,2.020726,13.285714,12.0,3.302236,3.285714,3.0,2.751623
10630843,10,1,1,28,28,28,11.421053,8.0,6.176261,3.000000,3.0,1.795055,8.333333,8.0,2.022858,0.100000,0.0,0.402578


In [57]:
# Write the feature columns as tab-separated files without the index,
# splitting data back into train and test by row position: the first
# len(train) rows are train, the remainder is test.
n_train = len(train)

# The positional split is only correct while data holds exactly one row per
# original train/test row.  Fail loudly instead of writing misaligned files
# if a merge ever added or dropped rows.
if len(data) != n_train + len(test):
    raise ValueError(
        f'feature frame has {len(data)} rows, expected {n_train + len(test)} '
        '(len(train) + len(test)); refusing to split by position'
    )

features = data[save_feat]
features.iloc[:n_train].to_csv(f'{feature_path}/train_invite_feature.txt', index=False, sep='\t')
features.iloc[n_train:].to_csv(f'{feature_path}/test_invite_feature.txt', index=False, sep='\t')