In [120]:
# import ray
# ray.init(num_cpus=36)
import pandas as pd
from collections import Counter
import numpy as np
import logging
import pickle
from joblib import Parallel, delayed
import gc
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [2]:
# Directory holding the raw input files and the cached pickles read/written below.
base_path = './data'
# Directory the final feature files are written to.
feature_path = './feature'

In [3]:
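# Topic vectors: one-off preprocessing that builds topic_vec.pkl (kept commented out; the pickle is loaded further down)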
# topic_table = pd.read_csv(f'{base_path}/topic_vectors_64d.txt', sep='\t', header=None)
# topic_table.columns = ['topic', 'vec']

# def str2vec(s):
#     tmp = s.split(' ')
#     res = []
#     for num in tmp:
#         res.append(float(num))
#     return res

# topic_table['vec'] = topic_table['vec'].apply(str2vec)

# import pickle
# pickle.dump(topic_table, open(f'{base_path}/topic_vec.pkl', 'wb'))

In [4]:
# Questions: only the question id and its comma-separated topic list are used downstream,
# so the other columns are skipped at parse time instead of being loaded and then deleted.
ques_columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
ques = pd.read_csv(
    f'{base_path}/question_info_0926.txt',
    header=None,
    sep='\t',
    names=ques_columns,
    usecols=['qid', 'topic'],
)

logging.info("ques %s", ques.shape)

[2019-12-05 11:45:33,759] INFO in <ipython-input-4-f4a7178965f4>: ques (1829900, 2)


In [5]:
# Answers: only (answer id, question id, user id) are needed here. The full column list is
# kept to document the file layout; `usecols` avoids materialising the 17 unused columns.
ans_columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
ans = pd.read_csv(
    f'{base_path}/answer_info_0926.txt',
    header=None,
    sep='\t',
    names=ans_columns,
    usecols=['aid', 'qid', 'uid'],
)
logging.info("ans %s", ans.shape)

[2019-12-05 11:46:41,541] INFO in <ipython-input-5-c99731a9feaa>: ans (4513735, 3)


In [6]:
# Attach to each answer the topic list of the question it answers (left join keeps every answer row).
ans = pd.merge(ans, ques, on='qid', how='left')

In [7]:
# Per user: the topic strings of all answered questions joined with ','.
# Rows whose topic is the '-1' sentinel are excluded; repeated topics are kept.
user_topic = ans[ans['topic']!='-1'].groupby('uid')['topic'].agg(','.join)  # topics the user has answered
user_topic

uid
M1000000382                                             T13,T567
M1000000983    T6192,T1667,T2255,T1103,T70,T100,T20647,T1878,...
M1000008978                                T1891,T56,T5,T39,T131
M1000020034                                                 T597
M1000022555                             T1440,T608,T103,T13,T103
                                     ...                        
M999984680     T26,T277,T76,T245,T542,T2274,T9526,T51857,T200...
M999988985     T3259,T11393,T2231,T3928,T2229,T1814,T1257,T72...
M999995457     T2,T488,T11088,T6248,T355,T7752,T674,T2,T22,T3...
M999998695                      T810,T5568,T5568,T810,T5568,T810
M999998888                       T35,T296,T4979,T3,T162,T67,T296
Name: topic, Length: 767269, dtype: object

In [8]:
# Cache the per-user topic history; the context manager guarantees the file is flushed and closed.
with open(f'{base_path}/user_topic.pkl', 'wb') as user_topic_file:
    pickle.dump(user_topic, user_topic_file)

In [9]:
# Invitation records for training; the 'dt' column is not used, so it is skipped at parse time.
train = pd.read_csv(
    f'{base_path}/invite_info_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'uid', 'dt', 'label'],
    usecols=['qid', 'uid', 'label'],
)
logging.info("invite %s", train.shape)

# Invitation records to score; same layout without the label column.
test = pd.read_csv(
    f'{base_path}/invite_info_evaluate_0926.txt',
    sep='\t',
    header=None,
    names=['qid', 'uid', 'dt'],
    usecols=['qid', 'uid'],
)
logging.info("test %s", test.shape)

[2019-12-05 11:47:57,898] INFO in <ipython-input-9-340cfdc733a3>: invite (9489162, 3)
[2019-12-05 11:48:00,586] INFO in <ipython-input-9-340cfdc733a3>: test (1141683, 2)


In [10]:
# Add the invited question's topic list to every train/test row (left join keeps all invitations).
train = pd.merge(train, ques, on='qid', how='left')
test = pd.merge(test, ques, on='qid', how='left')

In [11]:
# Look up each invited user's answered-topic history; users absent from `user_topic` get the '-1' sentinel.
train['user_topic'] = train['uid'].map(user_topic).fillna('-1')
test['user_topic'] = test['uid'].map(user_topic).fillna('-1')

In [12]:
# Load the cached topic-vector table (columns 'topic' and 'vec').
# NOTE: pickle.load executes arbitrary code from the file -- only load a topic_vec.pkl you produced yourself.
with open(f'{base_path}/topic_vec.pkl', 'rb') as topic_vec_file:
    topic_table = pickle.load(topic_vec_file)

In [111]:
def cos_sim(vec1, vec2):
    """Cosine similarity of two equal-length vectors: dot product over the product of their L2 norms."""
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    inner = np.dot(vec1, vec2)
    return inner / norm_product

def eucl_sim(vec1, vec2):
    """Euclidean (L2) distance between two equal-length vectors.

    Despite the name, this is a distance: 0 means identical vectors and larger
    values mean the vectors are further apart.
    """
    delta = np.subtract(np.array(vec1), np.array(vec2))
    return np.linalg.norm(delta)

In [112]:
def _summary_stats(values):
    """Return [min, max, sum, mean, std] of a non-empty sequence of numbers."""
    return [np.min(values), np.max(values), np.sum(values), np.mean(values), np.std(values)]


def get_topic_sim(q_topic, u_topic, mode):
    """Compute overlap and similarity features between a question's topics and a user's topic history.

    Parameters
    ----------
    q_topic : str
        Comma-separated topic ids of the question (e.g. 'T13,T567'), or '-1' when it has none.
    u_topic : str
        Comma-separated topic ids of the questions the user answered (repeats allowed),
        or '-1' when the user has no history.
    mode : {'train', 'test'}
        In 'train' mode one occurrence of each of the question's own topics is removed
        from the user's history before anything is computed.

    Returns
    -------
    list of 24 values, in this order:
        [weighted overlap count, overlap count, overlap rate, normalised overlap count]
        followed by [min, max, sum, mean, std] of, respectively:
        cosine similarity, count-weighted cosine similarity,
        Euclidean distance, count-weighted Euclidean distance,
        taken over every (question topic, user topic) pair.
        When either side is '-1' or no user topic remains, the four counts are 0
        and the 20 statistics are NaN.

    Notes
    -----
    Only the user's `n_most_common` most frequent topics are considered. A topic id 'T<k>'
    is resolved to row k-1 of the module-level `topic_table`.
    """
    n_most_common = 20
    # 4 similarity lists x [min, max, sum, mean, std]; unrelated to n_most_common.
    n_stat_features = 4 * 5
    assert mode in ['train', 'test']

    if (q_topic == '-1') or (u_topic == '-1'):
        return [0] * 4 + [np.nan] * n_stat_features

    q_topic_2 = q_topic.split(',')
    counter = Counter(u_topic.split(','))
    if mode == 'train':
        # Remove the current record so it does not count towards the user's own history.
        counter = counter - Counter(q_topic_2)
    most_common_topic = counter.most_common(n_most_common)
    if not most_common_topic:
        # Nothing left besides the current record.
        return [0] * 4 + [np.nan] * n_stat_features

    count_list = [count for _, count in most_common_topic]
    count_norm_list = np.array(count_list) / np.sum(count_list)   # share of each topic in the history

    # Resolve every user-topic vector once instead of once per question topic.
    vec_column = topic_table['vec']
    u_topic_indices = [int(topic[1:]) - 1 for topic, _ in most_common_topic]
    u_topic_vecs = [vec_column.iloc[idx] for idx in u_topic_indices]

    count_ut_in_qt_weighted = 0     # times the question's topics occur in the user's history
    count_ut_in_qt = 0              # number of question topics present in the user's history
    count_norm_ut_in_qt = 0         # same overlap, using the normalised counts

    sim_list = []                   # cosine similarity of every (question topic, user topic) pair
    sim_norm_list = []              # ... weighted by the user topic's normalised count
    sim_list_eucl = []              # Euclidean distance of every pair
    sim_norm_list_eucl = []         # ... weighted by the user topic's normalised count

    for qt in q_topic_2:
        q_topic_index = int(qt[1:]) - 1
        q_topic_vec = vec_column.iloc[q_topic_index]
        for u_topic_index, u_topic_vec, count, count_norm in zip(
                u_topic_indices, u_topic_vecs, count_list, count_norm_list):
            if q_topic_index == u_topic_index:
                count_ut_in_qt_weighted += count
                count_ut_in_qt += 1
                count_norm_ut_in_qt += count_norm

            sim = cos_sim(q_topic_vec, u_topic_vec)
            sim_eucl = eucl_sim(q_topic_vec, u_topic_vec)

            sim_list.append(sim)
            sim_norm_list.append(sim * count_norm)
            sim_list_eucl.append(sim_eucl)
            sim_norm_list_eucl.append(sim_eucl * count_norm)

    rate_ut_in_qt = count_ut_in_qt / len(q_topic_2)

    res = [count_ut_in_qt_weighted, count_ut_in_qt, rate_ut_in_qt, count_norm_ut_in_qt]
    res += _summary_stats(sim_list)
    res += _summary_stats(sim_norm_list)
    res += _summary_stats(sim_list_eucl)
    res += _summary_stats(sim_norm_list_eucl)
    return res


In [134]:
import traceback
def get_topic_feat(df, num, mode):
    """Compute the topic features for one chunk of rows and write them to a TSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk with a 'topic' column (question topics) and a 'user_topic' column (user history).
    num : int
        Chunk number; used in the output file name and in log messages.
    mode : {'train', 'test'}
        Forwarded to get_topic_sim and used as the output file prefix.

    The result is written to ./temp/{mode}_topic_feature_{num}.txt and nothing is returned.
    Failures are logged with the full traceback rather than raised, so one failing
    chunk does not abort the others.
    """
    import os

    assert mode in ['train', 'test']
    # Column order matches the list returned by get_topic_sim.
    int_cols = ['qu_topic_count_weight', 'qu_topic_count']
    float_cols = ['qu_topic_rate', 'qu_topic_count_norm', 'min_sim', 'max_sim', 'sum_sim', 'mean_sim',
                  'std_sim', 'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm',
                  'min_sim_eucl', 'max_sim_eucl', 'sum_sim_eucl', 'mean_sim_eucl', 'std_sim_eucl',
                  'min_sim_norm_eucl', 'max_sim_norm_eucl', 'sum_sim_norm_eucl', 'mean_sim_norm_eucl', 'std_sim_norm_eucl']
    try:
        logging.info('start to extract feature.')
        t1 = df.apply(lambda x: get_topic_sim(x['topic'], x['user_topic'], mode), axis=1, result_type='expand')
        t1.columns = int_cols + float_cols
        logging.info('extracting finish.')

        # Downcast to compact dtypes before writing.
        dtypes = {col: 'int32' for col in int_cols}
        dtypes.update({col: 'float32' for col in float_cols})
        t1 = t1.astype(dtypes)

        # A missing output directory would otherwise make every chunk fail.
        os.makedirs('./temp', exist_ok=True)
        t1.to_csv(f'./temp/{mode}_topic_feature_{num}.txt', index=False, sep='\t')
        logging.info('file %s saving finish.', num)
        del t1
        gc.collect()
    except Exception:
        # Best effort per chunk: report which chunk failed and why, then carry on.
        # (The previous `print(traceback.print_exc())` printed a stray "None", and the
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit.)
        logging.error('feature extraction failed for %s chunk %s:\n%s', mode, num, traceback.format_exc())
        
def multi_proc(df, mode, processes=20):
    """Split `df` into `processes` contiguous chunks and run get_topic_feat on each in a worker pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to process; sliced positionally.
    mode : {'train', 'test'}
        Forwarded to get_topic_feat.
    processes : int, default 20
        Number of worker processes and of chunks. Chunk i covers rows
        [i * (len(df) // processes), (i + 1) * (len(df) // processes)); the last chunk also
        takes the remainder. Chunk i is handed to get_topic_feat with num=i.

    Blocks until all chunks are done. Errors that escape a worker (e.g. a task that could not
    be sent to or returned from the pool) are logged instead of being silently dropped.
    """
    import multiprocessing

    if processes < 1:
        raise ValueError('processes must be at least 1')

    len_data = len(df)
    len_batch = len_data // processes
    with multiprocessing.Pool(processes=processes) as pool:
        pending = []
        for i in range(processes):
            start = i * len_batch
            end = len_data if i == (processes - 1) else (i + 1) * len_batch
            if start >= end:
                # Happens when len(df) < processes; an empty chunk has nothing to extract.
                continue
            tmp = df.iloc[start:end]
            pending.append((i, pool.apply_async(get_topic_feat, (tmp, i, mode))))

        pool.close()
        for i, task in pending:
            try:
                task.get()   # re-raises anything that escaped the worker
            except Exception:
                logging.exception('worker for %s chunk %s failed', mode, i)
        pool.join()

In [133]:
# Extract the topic features for both splits in parallel; each call writes its chunk files under ./temp.
multi_proc(train, 'train')
multi_proc(test, 'test')

[2019-12-05 15:42:04,328] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,351] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,332] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,331] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,333] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,332] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,334] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,337] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,339] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,341] INFO in <ipython-input-132-6cde6ddefe1f>: start to extract feature.
[2019-12-05 15:42:04,344] INFO in <ipython-input-132-6cde6dd

[2019-12-05 15:42:19,094] INFO in <ipython-input-132-6cde6ddefe1f>: file 2 saving finish.
[2019-12-05 15:42:19,099] INFO in <ipython-input-132-6cde6ddefe1f>: file 19 saving finish.
[2019-12-05 15:42:19,109] INFO in <ipython-input-132-6cde6ddefe1f>: file 14 saving finish.
[2019-12-05 15:42:19,118] INFO in <ipython-input-132-6cde6ddefe1f>: extracting finish.
[2019-12-05 15:42:19,143] INFO in <ipython-input-132-6cde6ddefe1f>: extracting finish.
[2019-12-05 15:42:19,146] INFO in <ipython-input-132-6cde6ddefe1f>: file 11 saving finish.
[2019-12-05 15:42:19,197] INFO in <ipython-input-132-6cde6ddefe1f>: extracting finish.
[2019-12-05 15:42:19,209] INFO in <ipython-input-132-6cde6ddefe1f>: file 17 saving finish.
[2019-12-05 15:42:19,247] INFO in <ipython-input-132-6cde6ddefe1f>: file 12 saving finish.
[2019-12-05 15:42:19,319] INFO in <ipython-input-132-6cde6ddefe1f>: extracting finish.
[2019-12-05 15:42:19,346] INFO in <ipython-input-132-6cde6ddefe1f>: extracting finish.
[2019-12-05 15:42:19

In [116]:
# Single-process extraction of the test features (same steps as get_topic_feat, one output file).
logging.info('start to extract test feature.')
feature_names = [
    'qu_topic_count_weight', 'qu_topic_count',
    'qu_topic_rate', 'qu_topic_count_norm',
    'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim',
    'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm',
    'min_sim_eucl', 'max_sim_eucl', 'sum_sim_eucl', 'mean_sim_eucl', 'std_sim_eucl',
    'min_sim_norm_eucl', 'max_sim_norm_eucl', 'sum_sim_norm_eucl', 'mean_sim_norm_eucl', 'std_sim_norm_eucl',
]
t1 = test.apply(
    lambda row: get_topic_sim(row['topic'], row['user_topic'], 'test'),
    axis=1,
    result_type='expand',
)
t1.columns = feature_names
# Downcast: the first two columns are integer counts, everything after them is floating point.
for col in feature_names[:2]:
    t1[col] = t1[col].astype('int32')
for col in feature_names[2:]:
    t1[col] = t1[col].astype('float32')
logging.info('extracting finish.')
t1.to_csv(f'./temp/test_topic_feature.txt', index=False, sep='\t')

[2019-12-05 15:24:07,397] INFO in <ipython-input-116-fb7b0de69793>: start to extract test feature.
[2019-12-05 15:24:11,660] INFO in <ipython-input-116-fb7b0de69793>: extracting finish.


In [117]:
t1

Unnamed: 0,qu_topic_count_weight,qu_topic_count,qu_topic_rate,qu_topic_count_norm,min_sim,max_sim,sum_sim,mean_sim,std_sim,min_sim_norm,...,min_sim_eucl,max_sim_eucl,sum_sim_eucl,mean_sim_eucl,std_sim_eucl,min_sim_norm_eucl,max_sim_norm_eucl,sum_sim_norm_eucl,mean_sim_norm_eucl,std_sim_norm_eucl
0,1,1,0.333333,0.333333,0.182818,1.000000,5.182607,0.575845,0.266533,0.060939,...,0.000000,26.387470,156.674423,17.408268,7.488229,0.000000,8.795823,52.224804,5.802756,2.496076
1,0,0,0.000000,0.000000,,,,,,,...,,,,,,,,,,
2,0,0,0.000000,0.000000,-0.261990,0.437958,6.933745,0.092450,0.127885,-0.016374,...,21.936098,53.715172,2554.952637,34.066032,6.316459,1.371006,4.382416,169.320709,2.257610,0.584930
3,0,0,0.000000,0.000000,0.029782,0.262181,1.488354,0.148835,0.070377,0.005956,...,19.959232,31.828415,251.595764,25.159576,3.665920,3.991847,6.365683,50.319153,5.031915,0.733184
4,0,0,0.000000,0.000000,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0.000000,0.000000,,,,,,,...,,,,,,,,,,
196,0,0,0.000000,0.000000,-0.242100,0.102523,-0.660771,-0.066077,0.103988,-0.048420,...,23.737389,30.974230,273.232422,27.323242,2.682940,4.747478,6.194846,54.646484,5.464648,0.536588
197,5,2,1.000000,0.119048,-0.155177,1.000000,5.982937,0.149573,0.280408,-0.011560,...,0.000000,27.481993,719.755554,17.993889,6.167226,0.000000,2.192888,35.847294,0.896182,0.526181
198,0,0,0.000000,0.000000,-0.250825,0.715847,6.481430,0.144032,0.205985,-0.016722,...,14.746570,48.806080,1489.623047,33.102734,8.111034,0.983105,3.253739,99.308205,2.206849,0.540736


In [55]:
def get_topic_len(df):
    """Return the number of distinct entries in a comma-separated topic string.

    Despite its name, `df` is a single string such as 'T1,T2,T1' (-> 2).
    """
    distinct_topics = set(df.split(','))
    return len(distinct_topics)

# Number of distinct topics in each test row's user history.
# NOTE(review): this rebinds `t1`, which other cells use for the feature frame -- consider a dedicated name.
t1 = test['user_topic'].apply(get_topic_len)

In [22]:
# Pre-create feature columns on `test`: the count/rate features default to 0 ...
for col in ['qu_topic_count_weight', 'qu_topic_count', 'qu_topic_rate', 'qu_topic_count_norm']:
    test[col] = 0
# ... and the cosine-similarity statistics default to NaN.
# NOTE(review): the *_eucl statistics returned by get_topic_sim are not initialised here.
for col in ['min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim', 'min_sim_norm', 
            'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm']:
    test[col] = np.nan

In [70]:
# test (first 10000 rows only): run one joblib task per row.
# NOTE(review): get_topic_sim is defined in this notebook as (q_topic, u_topic, mode) and returns a
# list, so the two-argument call and the pd.concat below look stale -- presumably written against
# the row-based get_topic_sim_bak(df, mode); confirm before re-running.
logging.info('start to extract test feature.')
group = test[:10000].groupby(test[:10000].index)
mode = 'test'
res = Parallel(n_jobs=-1)(delayed(get_topic_sim)(row, mode) for name, row in group)
res = pd.concat(res)
res
logging.info('extracting finish.')

[2019-12-05 13:18:31,751] INFO in <ipython-input-70-20359dce79de>: start to extract test feature.
[2019-12-05 13:31:33,179] ERROR in _base: exception calling callback for <Future at 0x7fa85c4129e8 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback: 
'''
Traceback (most recent call last):
  File "/root/anaconda3/envs/zhcup/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/root/anaconda3/envs/zhcup/lib/python3.6/multiprocessing/queues.py", line 99, in get
    if not self._rlock.acquire(block, timeout):
KeyboardInterrupt
'''

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/root/anaconda3/envs/zhcup/lib/python3.6/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/root/anaconda3/envs/zhcup/lib/python3.6/site-

KeyboardInterrupt: 

In [25]:
# test (all rows): run one joblib task per row and replace `test` with the concatenated result.
# NOTE(review): get_topic_sim is defined in this notebook as (q_topic, u_topic, mode) and returns a
# list, so the two-argument call and the pd.concat below look stale -- presumably written against
# the row-based get_topic_sim_bak(df, mode); confirm before re-running.
logging.info('start to extract test feature.')
group = test.groupby(test.index)
mode = 'test'
res = Parallel(n_jobs=-1)(delayed(get_topic_sim)(row, mode) for name, row in group)
test = pd.concat(res)
del res
logging.info('extracting finish.')

[2019-12-05 11:58:32,170] INFO in <ipython-input-25-0836ff36dc8b>: start to extract test feature.
[2019-12-05 12:00:51,266] ERROR in _base: exception calling callback for <Future at 0x7fa859bf56d8 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback: 
'''
Traceback (most recent call last):
  File "/root/anaconda3/envs/zhcup/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/root/anaconda3/envs/zhcup/lib/python3.6/multiprocessing/queues.py", line 99, in get
    if not self._rlock.acquire(block, timeout):
KeyboardInterrupt
'''

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/root/anaconda3/envs/zhcup/lib/python3.6/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/root/anaconda3/envs/zhcup/lib/python3.6/site-

KeyboardInterrupt: 

In [None]:
# train (all rows): run one joblib task per row and replace `train` with the concatenated result.
# NOTE(review): get_topic_sim is defined in this notebook as (q_topic, u_topic, mode) and returns a
# list, so the two-argument call and the pd.concat below look stale -- presumably written against
# the row-based get_topic_sim_bak(df, mode); confirm before re-running.
logging.info('start to extract train feature.')
group = train.groupby(train.index)
mode = 'train'
res = Parallel(n_jobs=36)(delayed(get_topic_sim)(row, mode) for name, row in group)
train = pd.concat(res)
del res
logging.info('extracting finish.')

In [None]:
res

In [None]:
# NOTE(review): the cells that build `res` from the full train/test data end with `del res`, so on a
# top-to-bottom run `res` is presumably undefined here -- confirm whether this cell is still needed.
test = res

In [None]:
# Downcast to compact dtypes: int32 for the two count columns, float32 for the listed float columns.
# NOTE(review): the *_eucl columns that get_topic_feat also downcasts are not listed here.
for col in ['qu_topic_count_weight', 'qu_topic_count']:
    t1[col] = t1[col].astype('int32')
for col in ['qu_topic_rate', 'qu_topic_count_norm', 'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 
            'std_sim', 'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm']:
    t1[col] = t1[col].astype('float32')

In [None]:
# Write `t1` as the train feature file.
# NOTE(review): confirm `t1` holds the train features at this point; nothing visible in this notebook assigns them to it.
t1.to_csv(f'{feature_path}/train_topic_feature.txt', index=False, sep='\t')

In [None]:
# Downcast to compact dtypes: int32 for the two count columns, float32 for the listed float columns.
# NOTE(review): the *_eucl columns that get_topic_feat also downcasts are not listed here.
for col in ['qu_topic_count_weight', 'qu_topic_count']:
    t1[col] = t1[col].astype('int32')
for col in ['qu_topic_rate', 'qu_topic_count_norm', 'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 
            'std_sim', 'min_sim_norm', 'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm']:
    t1[col] = t1[col].astype('float32')

In [None]:
# Write `t1` as the test feature file.
# NOTE(review): the train file above is written from the same variable `t1`, with no visible reassignment
# in between -- confirm that `t1` holds the test features when this cell runs.
t1.to_csv(f'{feature_path}/test_topic_feature.txt', index=False, sep='\t')

In [43]:
def get_topic_sim_bak(df, mode):
    """Backup, row-based variant of the topic similarity feature extraction.

    Parameters
    ----------
    df
        Object whose 'topic' and 'user_topic' entries each hold exactly one value
        (`.item()` is called on both) -- presumably a one-row DataFrame; confirm against the caller.
    mode : {'train', 'test'}
        In 'train' mode one occurrence of each question topic is subtracted from the user's topic counts.

    Returns
    -------
    `df` unchanged when either topic string is '-1' or when no user topic remains;
    otherwise `df` with the 14 feature columns added.

    NOTE(review): as written, every feature column is set to the constant 1. The assignments
    of the real values are commented out, so the overlap counts and similarities computed in
    the loop are discarded. This looks like a debugging stub; confirm before using its output.
    """
#     q_topic = df['topic']
#     u_topic = df['user_topic']
    q_topic = df['topic'].item()
    u_topic = df['user_topic'].item()
    assert mode in ['train', 'test']

    # '-1' marks a question without topics or a user without history: nothing to compute.
    if (q_topic == '-1') or (u_topic == '-1'):
        return df

    q_topic_2 = q_topic.split(',')
    u_topic_2 = u_topic.split(',')
    counter = Counter(u_topic_2)
    if mode == 'train':
        counter = counter - Counter(q_topic_2)    # remove the current record

    u_topic_2 = counter.keys()      # topics answered in other records
    if len(u_topic_2)== 0:         # no answers apart from the current record
        return df
    count_list = counter.values()    # counts matching the topics above
    count_norm_list = np.array(list(count_list)) / np.sum(list(count_list))   # normalize

    sim_list = []                   # similarity of every (question topic, user topic) pair
    sim_norm_list = []
    count_ut_in_qt_weighted = 0     # times the question's topics occur among the user's answered topics
    count_ut_in_qt = 0             # size of the intersection between question topics and answered topics
    count_norm_ut_in_qt = 0
    for qt in q_topic_2:
        for ut, count, count_norm in zip(u_topic_2, count_list, count_norm_list):  # topic, answer count, share
            q_topic_index = int(qt[1:]) - 1   # q_topic row index
            u_topic_index = int(ut[1:]) - 1   # u_topic row index

            if q_topic_index == u_topic_index:
                count_ut_in_qt_weighted += count
                count_ut_in_qt += 1
                count_norm_ut_in_qt += count_norm

            sim = cos_sim(topic_table.iloc[q_topic_index]['vec'], topic_table.iloc[u_topic_index]['vec'])

            sim_list.append(sim)
            sim_norm_list.append(sim*count_norm)

    rate_ut_in_qt = count_ut_in_qt / len(q_topic_2)

    # Real feature assignments, currently disabled:
#     df['qu_topic_count_weight'] = count_ut_in_qt_weighted
#     df['qu_topic_count'] = count_ut_in_qt
#     df['qu_topic_rate'] = rate_ut_in_qt
#     df['qu_topic_count_norm'] = count_norm_ut_in_qt

#     df['min_sim'] = np.min(sim_list)
#     df['max_sim'] = np.max(sim_list)
#     df['sum_sim'] = np.sum(sim_list)
#     df['mean_sim'] = np.mean(sim_list)
#     df['std_sim'] = np.std(sim_list)

#     df['min_sim_norm'] = np.min(sim_norm_list)
#     df['max_sim_norm'] = np.max(sim_norm_list)
#     df['sum_sim_norm'] = np.sum(sim_norm_list)
#     df['mean_sim_norm'] = np.mean(sim_norm_list)
#     df['std_sim_norm'] = np.std(sim_norm_list)
    # Placeholder values written instead of the computed ones.
    df['qu_topic_count_weight'] = 1
    df['qu_topic_count'] = 1
    df['qu_topic_rate'] = 1
    df['qu_topic_count_norm'] = 1

    df['min_sim'] = 1
    df['max_sim'] = 1
    df['sum_sim'] = 1
    df['mean_sim'] = 1
    df['std_sim'] = 1

    df['min_sim_norm'] = 1
    df['max_sim_norm'] = 1
    df['sum_sim_norm'] = 1
    df['mean_sim_norm'] = 1
    df['std_sim_norm'] = 1
    return df
