In [1]:
# import ray
# ray.init(num_cpus=36)
import pandas as pd
from collections import Counter
import numpy as np
import logging

log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [2]:
base_path = './data'
feature_path = './feature'

In [3]:
# 话题向量
# topic_table = pd.read_csv(f'{base_path}/topic_vectors_64d.txt', sep='\t', header=None)
# topic_table.columns = ['topic', 'vec']

# def str2vec(s):
#     tmp = s.split(' ')
#     res = []
#     for num in tmp:
#         res.append(float(num))
#     return res

# topic_table['vec'] = topic_table['vec'].apply(str2vec)

# import pickle
# pickle.dump(topic_table, open(f'{base_path}/topic_vec.pkl', 'wb'))

In [6]:
# 问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
# del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
del ques['q_dt'], ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']

logging.info("ques %s", ques.shape)

[2019-12-05 03:19:12,678] INFO in <ipython-input-6-f4a7178965f4>: ques (1829900, 2)


In [7]:
# 回答
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['is_good'], ans['is_rec'], ans['is_dest'], ans['has_img'], ans['has_video'] 
del ans['word_count']
del ans['reci_cheer'], ans['reci_uncheer'], ans['reci_comment'], ans['reci_mark'], ans['reci_tks']
del ans['reci_xxx'], ans['reci_no_help'], ans['reci_dis']
del ans['ans_dt'], ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

[2019-12-05 03:21:30,110] INFO in <ipython-input-7-c99731a9feaa>: ans (4513735, 3)


In [8]:
# ans 对应的问题的话题
ans = pd.merge(ans, ques, on='qid', how='left')

In [10]:
user_topic = ans[ans['topic']!='-1'].groupby('uid')['topic'].agg(','.join)  # 用户回答过的话题
user_topic

uid
M1000000382                                             T13,T567
M1000000983    T6192,T1667,T2255,T1103,T70,T100,T20647,T1878,...
M1000008978                                T1891,T56,T5,T39,T131
M1000020034                                                 T597
M1000022555                             T1440,T608,T103,T13,T103
                                     ...                        
M999984680     T26,T277,T76,T245,T542,T2274,T9526,T51857,T200...
M999988985     T3259,T11393,T2231,T3928,T2229,T1814,T1257,T72...
M999995457     T2,T488,T11088,T6248,T355,T7752,T674,T2,T22,T3...
M999998695                      T810,T5568,T5568,T810,T5568,T810
M999998888                       T35,T296,T4979,T3,T162,T67,T296
Name: topic, Length: 767269, dtype: object

In [11]:
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
del train['dt']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
del test['dt']
logging.info("test %s", test.shape)

[2019-12-05 03:24:25,075] INFO in <ipython-input-11-340cfdc733a3>: invite (9489162, 3)
[2019-12-05 03:24:27,759] INFO in <ipython-input-11-340cfdc733a3>: test (1141683, 2)


In [12]:
train = pd.merge(train, ques, on='qid', how='left')
test = pd.merge(test, ques, on='qid', how='left')

In [13]:
train['user_topic'] = train['uid'].map(user_topic).fillna('-1')
test['user_topic'] = test['uid'].map(user_topic).fillna('-1')

In [17]:
import pickle
topic_table = pickle.load(open(f'{base_path}/topic_vec.pkl', 'rb'))

In [19]:
def cal_sim(vec1, vec2):
    return np.dot(vec1, vec2)/((np.linalg.norm(vec1)*(np.linalg.norm(vec2))))

In [113]:
def get_topic_sim(q_topic, u_topic, mode):
    assert mode in ['train', 'test']
    if q_topic == '-1' or u_topic == '-1':
        return [0]*4 + [np.nan]*10
    
    q_topic_2 = q_topic.split(',')
    u_topic_2 = u_topic.split(',')
    counter = Counter(u_topic_2)
    if mode == 'train':
        counter = counter - Counter(q_topic_2)    # 去当前记录             

    u_topic_2 = counter.keys()      # 其他时期回答的话题 list
    if len(u_topic_2)== 0:         # 没有其他时期的回答
        return [0]*4 + [np.nan]*10
    count_list = counter.values()    # 上面话题对应的计数
    count_norm_list = np.array(list(count_list)) / np.sum(list(count_list))   # normalize
   
    sim_list = []                   # 两两交叉计算相似度
    sim_norm_list = []
    count_ut_in_qt_weighted = 0     # 问题话题在过往回答过的话题里的出现次数
    count_ut_in_qt = 0             # 问题话题与过往回答过的话题的交集个数
    count_norm_ut_in_qt = 0
    for qt in q_topic_2:
        for ut, count, count_norm in zip(u_topic_2, count_list, count_norm_list):  # 话题、过往回答数、占比
            q_topic_index = int(qt[1:]) - 1   # q_topic 索引
            u_topic_index = int(ut[1:]) - 1   # u_topic 索引
        
            if q_topic_index == u_topic_index:
                count_ut_in_qt_weighted += count
                count_ut_in_qt += 1
                count_norm_ut_in_qt += count_norm
                
            sim = cal_sim(topic_table.iloc[q_topic_index]['vec'], topic_table.iloc[u_topic_index]['vec'])
            
            sim_list.append(sim)
            sim_norm_list.append(sim*count_norm)
            
    rate_ut_in_qt = count_ut_in_qt / len(q_topic_2)
    
    min_sim = np.min(sim_list)    
    max_sim = np.max(sim_list)
    sum_sim = np.sum(sim_list)
    mean_sim = np.mean(sim_list)
    std_sim = np.std(sim_list)
    
    min_sim_norm = np.min(sim_norm_list)
    max_sim_norm = np.max(sim_norm_list)
    sum_sim_norm = np.sum(sim_norm_list)
    mean_sim_norm = np.mean(sim_norm_list)
    std_sim_norm = np.std(sim_norm_list)
    
    res = [count_ut_in_qt_weighted, count_ut_in_qt, rate_ut_in_qt, count_norm_ut_in_qt]
    res += [min_sim, max_sim, sum_sim, mean_sim, std_sim]
    res += [min_sim_norm, max_sim_norm, sum_sim_norm, mean_sim_norm, std_sim_norm]
    return res

In [110]:
from joblib import Parallel, delayed

In [111]:
def parallel_apply(df, mode):
    df_group = df.groupby(df.index)
    results = Parallel(n_jobs=10)(delayed(func)(group) for name,group in df_grouped)
    res = Parallel(n_jobs=36)(delayed(get_topic_sim)(row['topic'], row['user_topic'], mode) for name, row in df_group)
    return pd.concat(res)

In [112]:
t1 = parallel_apply(train[-5000:], 'train')
t1.columns = ['qu_topic_count_weight', 'qu_topic_count', 'qu_topic_rate', 'qu_topic_count_norm', 
              'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim', 'min_sim_norm', 
              'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm']
t1

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [79]:
t1 = train[-60:].apply(lambda x: get_topic_sim(x['topic'], x['user_topic'], 'train'), 
                      axis=1, result_type='expand')
t1.columns = ['qu_topic_count_weight', 'qu_topic_count', 'qu_topic_rate', 'qu_topic_count_norm', 
              'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim', 'min_sim_norm', 
              'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm']
logging.info('train topic feature shape: %s', t1.shape)

In [80]:
# 压缩数据
for col in ['qu_topic_count_weight', 'qu_topic_count']:
    t1[col] = t1[col].astype('int32')
for col in ['qu_topic_rate', 'qu_topic_count_norm', 'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 
            'std_sim', 'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm']:
    t1[col] = t1[col].astype('float32')

In [None]:
t1.to_csv(f'{feature_path}/train_topic_feature.txt', index=False, sep='\t')

In [81]:
t2 = pd.concat([train[-60:], t1], axis=1)
t2

Unnamed: 0,qid,uid,label,topic,user_topic,qu_topic_count_weight,qu_topic_count,qu_topic_rate,qu_topic_count_norm,min_sim,max_sim,sum_sim,mean_sim,std_sim,min_sim_norm,max_sim_norm,sum_sim_norm,mean_sim_norm,std_sim_norm
9489102,Q1855899859,M1798227107,1,"T4219,T472,T16656,T1156,T211","T4219,T472,T16656,T1156,T211",0,0,0.0,0.0,,,,,,,,,,
9489103,Q2347214701,M2675363843,0,"T2527,T4083","T10,T47,T1316,T42,T1363,T42,T10,T10,T2766,T283...",0,0,0.0,0.0,-0.072659,0.532069,3.294546,0.18303,0.169048,-0.005589,0.068892,0.367703,0.020428,0.018838
9489104,Q156421418,M1473425485,0,"T73,T61",-1,0,0,0.0,0.0,,,,,,,,,,
9489105,Q1407870651,M2232383625,0,"T55,T578","T233,T55,T12996,T176",0,0,0.0,0.0,0.193532,0.849284,2.480381,0.413397,0.222797,0.064511,0.283095,0.826794,0.137799,0.074266
9489106,Q823826498,M1928435133,1,"T10036,T1029",-1,0,0,0.0,0.0,,,,,,,,,,
9489107,Q3123528207,M4105993935,0,"T5820,T1016,T456","T48693,T2782,T27309,T57872,T11613,T548,T6217",0,0,0.0,0.0,-0.087964,0.399151,1.682642,0.080126,0.109365,-0.012566,0.057022,0.240377,0.011447,0.015624
9489108,Q3563164529,M3448708166,0,"T47,T3,T296,T2,T293",-1,0,0,0.0,0.0,,,,,,,,,,
9489109,Q188442117,M2863864025,1,"T23,T5317","T23,T491,T272,T34472,T23,T491,T2,T23,T336,T166...",14,1,0.5,0.285714,-0.173353,1.0,12.484225,0.260088,0.27699,-0.003538,0.285714,0.763376,0.015904,0.043543
9489110,Q2573809841,M2863864025,0,"T23,T581","T23,T491,T272,T34472,T23,T491,T2,T23,T336,T166...",14,1,0.5,0.285714,-0.207789,1.0,15.902061,0.331293,0.300493,-0.004241,0.285714,1.071671,0.022326,0.050311
9489111,Q2770658646,M2863864025,0,"T23,T81371,T272,T6306","T23,T491,T272,T34472,T23,T491,T2,T23,T336,T166...",19,2,0.5,0.387755,-0.208026,1.0,25.401678,0.254017,0.274632,-0.007186,0.285714,1.548951,0.01549,0.039898


In [85]:
t1 = test.apply(lambda x: get_topic_sim(x['topic'], x['user_topic'], 'test'), 
                      axis=1, result_type='expand')
t1.columns = ['qu_topic_count_weight', 'qu_topic_count', 'qu_topic_rate', 'qu_topic_count_norm', 
              'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim', 'min_sim_norm', 
              'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm']


KeyboardInterrupt: 

In [None]:
logging.info('test topic feature shape: %s', t1.shape)

In [None]:
# 压缩数据
for col in ['qu_topic_count_weight', 'qu_topic_count']:
    t1[col] = t1[col].astype('int32')
for col in ['qu_topic_rate', 'qu_topic_count_norm', 'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 
            'std_sim', 'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm']:
    t1[col] = t1[col].astype('float32')

In [None]:
t1.to_csv(f'{feature_path}/test_topic_feature.txt', index=False, sep='\t')