In [120]:
# import ray
# ray.init(num_cpus=36)
import pandas as pd
from collections import Counter
import numpy as np
import logging
import pickle
from joblib import Parallel, delayed
import gc
# Log line layout: timestamp, level, originating module, then the message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
# Configure the root logger to emit INFO-and-above records with that layout.
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [2]:
# Directory the raw input files are read from; intermediate pickles are written here too.
base_path = './data'
# Directory the final merged feature tables are written to.
feature_path = './feature'

In [3]:
# Topic vectors: headerless tab-separated file with a topic id and its vector per line.
topic_table = pd.read_csv(f'{base_path}/topic_vectors_64d.txt', sep='\t', header=None)
topic_table.columns = ['topic', 'vec']

def str2vec(s):
    """Parse a whitespace-separated string of numbers into a list of floats.

    Parameters
    ----------
    s : str
        Numbers separated by whitespace, e.g. ``'0.1 -0.2 0.3'``.

    Returns
    -------
    list of float
        The parsed values, in their original order.

    Raises
    ------
    ValueError
        If any token is not a valid float literal.
    """
    # split() with no argument collapses repeated separators and ignores
    # leading/trailing whitespace, so stray spaces do not yield '' tokens
    # (which float() would reject).
    return [float(token) for token in s.split()]

# Convert every topic's vector from its string form into a list of floats.
topic_table['vec'] = topic_table['vec'].apply(str2vec)

# NOTE(review): a later cell loads f'{base_path}/topic_vec.pkl'. The dump that
# would create that file is commented out below, so the pickle must already
# exist on disk (and may be stale) when the notebook is run from a fresh kernel.
# import pickle
# pickle.dump(topic_table, open(f'{base_path}/topic_vec.pkl', 'wb'))

In [4]:
# Questions: only the question id and its topic list are used downstream, so
# the remaining columns are skipped at parse time (usecols) instead of being
# loaded into memory and deleted afterwards.
ques = pd.read_csv(
    f'{base_path}/question_info_0926.txt',
    header=None,
    sep='\t',
    names=['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic'],
    usecols=['qid', 'topic'],
)

logging.info("ques %s", ques.shape)

[2019-12-05 11:45:33,759] INFO in <ipython-input-4-f4a7178965f4>: ques (1829900, 2)


In [5]:
# Answers: the file has 20 columns but only the answer id, question id and
# user id are needed, so the other 17 are never parsed (usecols) rather than
# being read for ~4.5M rows and deleted afterwards.
ans = pd.read_csv(
    f'{base_path}/answer_info_0926.txt',
    header=None,
    sep='\t',
    names=['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
           'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
           'reci_xxx', 'reci_no_help', 'reci_dis'],
    usecols=['aid', 'qid', 'uid'],
)
logging.info("ans %s", ans.shape)

[2019-12-05 11:46:41,541] INFO in <ipython-input-5-c99731a9feaa>: ans (4513735, 3)


In [6]:
# Attach the topic list of the answered question to every answer row
# (left join: answers whose qid is missing from `ques` keep a NaN topic).
ans = ans.merge(ques, how='left', on='qid')

In [7]:
# Topics each user has answered: for every uid, the topic strings of all
# answered questions (skipping questions whose topic is '-1') joined with ','.
user_topic = (
    ans.loc[ans['topic'] != '-1']
       .groupby('uid')['topic']
       .agg(','.join)
)
user_topic

uid
M1000000382                                             T13,T567
M1000000983    T6192,T1667,T2255,T1103,T70,T100,T20647,T1878,...
M1000008978                                T1891,T56,T5,T39,T131
M1000020034                                                 T597
M1000022555                             T1440,T608,T103,T13,T103
                                     ...                        
M999984680     T26,T277,T76,T245,T542,T2274,T9526,T51857,T200...
M999988985     T3259,T11393,T2231,T3928,T2229,T1814,T1257,T72...
M999995457     T2,T488,T11088,T6248,T355,T7752,T674,T2,T22,T3...
M999998695                      T810,T5568,T5568,T810,T5568,T810
M999998888                       T35,T296,T4979,T3,T162,T67,T296
Name: topic, Length: 767269, dtype: object

In [8]:
# Persist the per-user topic history. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way.
with open(f'{base_path}/user_topic.pkl', 'wb') as fout:
    pickle.dump(user_topic, fout)

In [9]:
# Training invitations: question id, invited user id and label.
# The 'dt' column is never used, so it is skipped at parse time (usecols)
# instead of being loaded and deleted.
train = pd.read_csv(
    f'{base_path}/invite_info_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'uid', 'dt', 'label'],
    usecols=['qid', 'uid', 'label'],
)
logging.info("invite %s", train.shape)

# Evaluation invitations: same layout without the label column.
test = pd.read_csv(
    f'{base_path}/invite_info_evaluate_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'uid', 'dt'],
    usecols=['qid', 'uid'],
)
logging.info("test %s", test.shape)

[2019-12-05 11:47:57,898] INFO in <ipython-input-9-340cfdc733a3>: invite (9489162, 3)
[2019-12-05 11:48:00,586] INFO in <ipython-input-9-340cfdc733a3>: test (1141683, 2)


In [10]:
# Attach the question's topic list to every invitation row (left join on qid).
train = train.merge(ques, how='left', on='qid')
test = test.merge(ques, how='left', on='qid')

In [11]:
# Look up each invited user's answered-topic string; users with no entry in
# `user_topic` get the sentinel '-1'.
train['user_topic'] = train['uid'].map(user_topic).fillna('-1')
test['user_topic'] = test['uid'].map(user_topic).fillna('-1')

In [12]:
# Reload the topic table from its pickle; the context manager closes the file handle.
# NOTE(review): the only code that writes topic_vec.pkl is commented out in the
# topic-vector cell above, so this file must already exist on disk - confirm it
# matches the table parsed there. Only unpickle files you produced yourself.
with open(f'{base_path}/topic_vec.pkl', 'rb') as fin:
    topic_table = pickle.load(fin)

In [111]:
def cos_sim(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    The dot product is divided by the product of the two Euclidean norms;
    no guard is applied for zero-length vectors.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

def eucl_sim(vec1, vec2):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Despite the name, larger values mean the vectors are further apart.
    """
    difference = np.asarray(vec1) - np.asarray(vec2)
    return np.linalg.norm(difference)

In [112]:
def _five_stats(values):
    """Return [min, max, sum, mean, std] of a non-empty sequence of numbers."""
    return [np.min(values), np.max(values), np.sum(values), np.mean(values), np.std(values)]


def get_topic_sim(q_topic, u_topic, mode):
    """Build the 24 question-topic vs. user-topic features for one (question, user) pair.

    Relies on the module-level ``topic_table``, ``cos_sim`` and ``eucl_sim``.
    A topic id of the form ``'T<k>'`` is looked up at positional row ``k - 1``
    of ``topic_table``.

    Parameters
    ----------
    q_topic : str
        Comma-separated topic ids of the question (e.g. ``'T13,T567'``), or
        ``'-1'`` when the question has no topic.
    u_topic : str
        Comma-separated topic ids of the questions the user answered
        (repeats allowed), or ``'-1'`` when there are none.
    mode : {'train', 'test'}
        In ``'train'`` mode one occurrence of each question topic is
        subtracted from the user's topic counts before the features are
        computed (removes the current record from the history).

    Returns
    -------
    list
        24 values, in this order:

        * weighted count, count, rate (count / number of question topics) and
          normalised-count sum of question topics found among the user's
          most common topics;
        * min, max, sum, mean, std of the pairwise cosine similarities;
        * the same five statistics of the cosine similarities weighted by the
          normalised user-topic counts;
        * the same five statistics of the pairwise Euclidean distances;
        * the same five statistics of the weighted Euclidean distances.

        When either side has no usable topic the result is four zeros
        followed by twenty NaNs.

    Raises
    ------
    AssertionError
        If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    n_most_common = 20      # only the user's most frequent topics are compared
    assert mode in ['train', 'test']

    # A missing topic (e.g. NaN produced by a left merge) is treated like the
    # '-1' sentinel instead of crashing on .split().
    if not isinstance(q_topic, str) or not isinstance(u_topic, str):
        return [0]*4 + [np.nan]*20
    if (q_topic == '-1') or (u_topic == '-1'):
        return [0]*4 + [np.nan]*20

    q_topic_2 = q_topic.split(',')
    counter = Counter(u_topic.split(','))
    if mode == 'train':
        counter = counter - Counter(q_topic_2)    # drop the current record
    most_common_topic = counter.most_common(n_most_common)
    if len(most_common_topic) == 0:               # no answers left from other records
        return [0]*4 + [np.nan]*20

    count_list = [topic_count for _, topic_count in most_common_topic]
    count_norm_list = np.array(count_list) / np.sum(count_list)   # normalize

    # The user-side rows do not depend on the question topic, so resolve their
    # table positions and vectors once instead of once per question topic.
    user_entries = []
    for (topic_name, topic_count), count_norm in zip(most_common_topic, count_norm_list):
        u_topic_index = int(topic_name[1:]) - 1
        u_topic_vec = topic_table.iloc[u_topic_index]['vec']
        user_entries.append((u_topic_index, topic_count, count_norm, u_topic_vec))

    sim_list = []                   # pairwise cosine similarities
    sim_norm_list = []              # ... weighted by the normalised user-topic count
    sim_list_eucl = []              # pairwise Euclidean distances
    sim_norm_list_eucl = []         # ... weighted by the normalised user-topic count
    count_ut_in_qt_weighted = 0     # occurrences of the question topics in the user's history
    count_ut_in_qt = 0              # number of question topics shared with the user's history
    count_norm_ut_in_qt = 0         # sum of normalised counts of the shared topics

    for qt in q_topic_2:
        q_topic_index = int(qt[1:]) - 1
        q_topic_vec = topic_table.iloc[q_topic_index]['vec']
        for u_topic_index, topic_count, count_norm, u_topic_vec in user_entries:
            if q_topic_index == u_topic_index:
                count_ut_in_qt_weighted += topic_count
                count_ut_in_qt += 1
                count_norm_ut_in_qt += count_norm

            sim = cos_sim(q_topic_vec, u_topic_vec)
            sim_eucl = eucl_sim(q_topic_vec, u_topic_vec)

            sim_list.append(sim)
            sim_norm_list.append(sim*count_norm)
            sim_list_eucl.append(sim_eucl)
            sim_norm_list_eucl.append(sim_eucl*count_norm)

    rate_ut_in_qt = count_ut_in_qt / len(q_topic_2)

    res = [count_ut_in_qt_weighted, count_ut_in_qt, rate_ut_in_qt, count_norm_ut_in_qt]
    res += _five_stats(sim_list)
    res += _five_stats(sim_norm_list)
    res += _five_stats(sim_list_eucl)
    res += _five_stats(sim_norm_list_eucl)
    return res


In [170]:
# 压缩数据
def compress_data(df):
    """Downcast the topic-feature columns of ``df`` and return the same frame.

    The two count columns become ``int32`` and the 22 rate / similarity
    columns become ``float32``. Columns are reassigned on ``df`` itself, so
    the caller's frame is modified.
    """
    dtype_plan = (
        ('int32', ['qu_topic_count_weight', 'qu_topic_count']),
        ('float32', ['qu_topic_rate', 'qu_topic_count_norm',
                     'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim',
                     'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm',
                     'min_sim_eucl', 'max_sim_eucl', 'sum_sim_eucl', 'mean_sim_eucl', 'std_sim_eucl',
                     'min_sim_norm_eucl', 'max_sim_norm_eucl', 'sum_sim_norm_eucl',
                     'mean_sim_norm_eucl', 'std_sim_norm_eucl']),
    )
    for target_dtype, column_names in dtype_plan:
        for name in column_names:
            df[name] = df[name].astype(target_dtype)
    return df

In [136]:
import traceback
def get_topic_feat(df, num, mode):
    """Compute the topic features for one batch and write them to ``./temp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Batch of invitation rows; must contain the ``'topic'`` and
        ``'user_topic'`` columns.
    num : int
        Batch number, used in the output file name.
    mode : {'train', 'test'}
        Forwarded to ``get_topic_sim`` and used in the output file name.

    Returns
    -------
    bool
        ``True`` when ``./temp/{mode}_topic_feature_{num}.txt`` was written,
        ``False`` when the batch failed (the error is logged with its
        traceback, not raised, so one bad batch does not abort the others).

    Raises
    ------
    AssertionError
        If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    import os

    assert mode in ['train', 'test']
    try:
        logging.info('start to extract feature.')
        t1 = df.apply(lambda x: get_topic_sim(x['topic'], x['user_topic'], mode), axis=1, result_type='expand')
        t1.columns = ['qu_topic_count_weight', 'qu_topic_count', 'qu_topic_rate', 'qu_topic_count_norm',
                      'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim', 'min_sim_norm',
                      'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm',
                      'min_sim_eucl', 'max_sim_eucl', 'sum_sim_eucl', 'mean_sim_eucl', 'std_sim_eucl',
                      'min_sim_norm_eucl', 'max_sim_norm_eucl', 'sum_sim_norm_eucl', 'mean_sim_norm_eucl', 'std_sim_norm_eucl']
        logging.info('extracting finish.')

        # Downcast before writing to keep the intermediate file small.
        t1 = compress_data(t1)
        # Make sure the output directory exists; exist_ok keeps this safe when
        # several worker processes reach this line at the same time.
        os.makedirs('./temp', exist_ok=True)
        t1.to_csv(f'./temp/{mode}_topic_feature_{num}.txt', index=False, sep='\t')
        logging.info('file %s saving finish.', num)
        del t1
        gc.collect()
        return True
    except Exception:
        # `except Exception` leaves KeyboardInterrupt/SystemExit alone, and
        # logging.exception records the traceback together with the batch that
        # failed (the old print(traceback.print_exc()) also printed "None").
        logging.exception('%s topic feature batch %s failed.', mode, num)
        return False
        
def multi_proc(df, mode, processes=30):
    """Split ``df`` into contiguous batches and extract topic features in parallel.

    Each batch is handed to ``get_topic_feat`` in a worker process, which
    writes one file per batch numbered ``0 .. processes - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Invitation rows to process.
    mode : {'train', 'test'}
        Forwarded to ``get_topic_feat``.
    processes : int, default 30
        Number of worker processes and of batches. The last batch also takes
        the remainder rows.

    Returns
    -------
    list of int
        Numbers of the batches that failed (empty when everything succeeded).
    """
    import multiprocessing

    len_data = len(df)
    len_batch = len_data // processes
    pending = []
    # The context manager terminates the pool on exit, so no worker processes
    # are left behind if submitting a batch raises.
    with multiprocessing.Pool(processes=processes) as pool:
        for i in range(processes):
            start = i * len_batch
            end = len_data if i == (processes - 1) else (i + 1) * len_batch
            batch = df.iloc[start:end]
            pending.append((i, pool.apply_async(get_topic_feat, (batch, i, mode))))

        pool.close()
        pool.join()

    # apply_async hides worker errors unless the result is fetched, so inspect
    # every batch. All workers have finished, so get() returns immediately.
    failed = []
    for i, async_result in pending:
        try:
            outcome = async_result.get()
        except Exception:
            logging.exception('%s topic feature batch %s raised in its worker.', mode, i)
            failed.append(i)
            continue
        if outcome is False:    # a worker may report failure by returning False
            failed.append(i)
    if failed:
        logging.error('%s topic feature batches %s failed; their output files are missing or stale.',
                      mode, failed)
    return failed

In [137]:
# Extract the topic features for both the train and the test invitations;
# each call writes its per-batch files under ./temp.
multi_proc(train, 'train')
multi_proc(test, 'test')

[2019-12-05 15:47:53,964] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:47:54,944] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:47:56,042] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:47:58,136] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:47:58,509] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:47:59,651] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:48:00,633] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:48:01,828] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:48:02,657] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:48:03,617] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 15:48:04,707] INFO in <ipython-input-136-4de50dd

[2019-12-05 18:07:33,752] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:33,828] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:33,951] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:34,128] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:34,424] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:34,658] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:34,795] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:35,169] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:35,409] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:35,610] INFO in <ipython-input-136-4de50dd160b9>: start to extract feature.
[2019-12-05 18:07:35,885] INFO in <ipython-input-136-4de50dd

In [166]:
# Merge the per-batch feature files into one table

# train
original_path = './temp'
mode = 'train'
# Read all 30 batch files (one per worker batch) and concatenate them once;
# concatenating inside the loop re-copies the accumulated frame every iteration.
topic_feat = pd.concat(
    [pd.read_csv(f'{original_path}/{mode}_topic_feature_{i}.txt', sep='\t') for i in range(30)],
    axis=0,
    ignore_index=True,
)
logging.info('%s topic feature, shape: %s', mode, topic_feat.shape)

# The CSV round trip loses the downcast dtypes, so apply them again before saving.
topic_feat = compress_data(topic_feat)
topic_feat.to_csv(f'{feature_path}/{mode}_topic_feature.txt', index=False, sep='\t')
logging.info('%s topic feature saved.', mode)

In [174]:
# test
original_path = './temp'
mode = 'test'
# Read all 30 batch files (one per worker batch) and concatenate them once;
# concatenating inside the loop re-copies the accumulated frame every iteration.
topic_feat = pd.concat(
    [pd.read_csv(f'{original_path}/{mode}_topic_feature_{i}.txt', sep='\t') for i in range(30)],
    axis=0,
    ignore_index=True,
)

logging.info('%s topic feature, shape: %s', mode, topic_feat.shape)

# The CSV round trip loses the downcast dtypes, so apply them again before saving.
topic_feat = compress_data(topic_feat)
topic_feat.to_csv(f'{feature_path}/{mode}_topic_feature.txt', index=False, sep='\t')
logging.info('%s topic feature saved.', mode)

[2019-12-06 02:50:54,874] INFO in <ipython-input-174-3290fe81e121>: test topic feature, shape: (1141683, 24)
[2019-12-06 02:52:34,053] INFO in <ipython-input-174-3290fe81e121>: test topic feature saved.


# A small problem came up: the commented-out cells below re-extract train batch 16

In [138]:
# def get_topic_feat_16(df, num, mode):
#     assert mode in ['train', 'test']
#     try:
#         logging.info('start to extract feature.')
#         t1 = df.apply(lambda x: get_topic_sim(x['topic'], x['user_topic'], mode), axis=1, result_type='expand')
#         t1.columns = ['qu_topic_count_weight', 'qu_topic_count', 'qu_topic_rate', 'qu_topic_count_norm',
#                   'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim', 'min_sim_norm', 
#                   'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm', 
#                   'min_sim_eucl', 'max_sim_eucl', 'sum_sim_eucl', 'mean_sim_eucl', 'std_sim_eucl',
#                   'min_sim_norm_eucl', 'max_sim_norm_eucl', 'sum_sim_norm_eucl', 'mean_sim_norm_eucl', 'std_sim_norm_eucl']
#         logging.info('extracting finish.')
        
#         # 压缩数据
#         for col in ['qu_topic_count_weight', 'qu_topic_count']:
#             t1[col] = t1[col].astype('int32')
#         for col in ['qu_topic_rate', 'qu_topic_count_norm', 'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 
#                     'std_sim', 'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm',
#                     'min_sim_eucl', 'max_sim_eucl', 'sum_sim_eucl', 'mean_sim_eucl', 'std_sim_eucl',
#                     'min_sim_norm_eucl', 'max_sim_norm_eucl', 'sum_sim_norm_eucl', 'mean_sim_norm_eucl', 'std_sim_norm_eucl']:
#             t1[col] = t1[col].astype('float32')
#         t1.to_csv(f'./temp2/{mode}_topic_feature_16_{num}.txt', index=False, sep='\t')
#         logging.info('file %s saving finish.', num)
#         del t1
#         gc.collect()
#     except:
#         print(traceback.print_exc())
        
# def multi_proc_16(df, mode):
#     import multiprocessing
#     processes = 30
#     pool = multiprocessing.Pool(processes=processes)
#     len_data = len(df)
#     len_batch = len_data // processes
#     for i in range(processes):
#         start = i * len_batch
#         end = (i+1) * len_batch
#         if i == (processes-1):
#             end = len_data
#         tmp = df[start:end]
#         pool.apply_async(get_topic_feat_16, (tmp, i, mode))
        
#     pool.close()
#     pool.join()

# len_batch_origin = len(train)//30
# multi_proc_16(train[(16*len_batch_origin):(17*len_batch_origin)], 'train')

In [155]:
# topic_feat = None
# original_path = './temp2'
# for i in range(30):
# #     d = pd.read_csv(f'{original_path}/train_topic_feature_{i}.txt', sep='\t')
#     d = pd.read_csv(f'{original_path}/train_topic_feature_16_{i}.txt', sep='\t')
#     if topic_feat is None:
#         topic_feat = d
#     else:
#         topic_feat = pd.concat([topic_feat, d], axis=0, ignore_index=True)

In [165]:
# 压缩数据
# for col in ['qu_topic_count_weight', 'qu_topic_count']:
#     topic_feat[col] = topic_feat[col].astype('int32')
# for col in ['qu_topic_rate', 'qu_topic_count_norm', 'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 
#             'std_sim', 'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm',
#             'min_sim_eucl', 'max_sim_eucl', 'sum_sim_eucl', 'mean_sim_eucl', 'std_sim_eucl',
#             'min_sim_norm_eucl', 'max_sim_norm_eucl', 'sum_sim_norm_eucl', 'mean_sim_norm_eucl', 'std_sim_norm_eucl']:
#     topic_feat[col] = topic_feat[col].astype('float32')
# topic_feat.to_csv(f'./temp/train_topic_feature_16.txt', index=False, sep='\t')