In [1]:
import pandas as pd
from collections import Counter
import numpy as np
import logging
import pickle
from joblib import Parallel, delayed
import gc
# Log line layout: timestamp, level, originating module, then the message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
# Emit INFO and above with the layout defined above.
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [2]:
# Directory the raw input tables and the intermediate pickles are read from / written to.
base_path = './data'
# Directory the merged feature files are written to.
feature_path = './feature'

In [3]:
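# Topic vectors: one-off conversion of topic_vectors_64d.txt into topic_vec.pkl (left commented out; the pickle is loaded in the next cell).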
# topic_table = pd.read_csv(f'{base_path}/topic_vectors_64d.txt', sep='\t', header=None)
# topic_table.columns = ['topic', 'vec']

# def str2vec(s):
#     tmp = s.split(' ')
#     res = []
#     for num in tmp:
#         res.append(float(num))
#     return res

# topic_table['vec'] = topic_table['vec'].apply(str2vec)

# import pickle
# pickle.dump(topic_table, open(f'{base_path}/topic_vec.pkl', 'wb'))

In [7]:
# Load the cached topic-embedding table (the 'vec' column is read by get_topic_sim).
# NOTE: pickle.load can execute arbitrary code; only load pickles you produced yourself.
# `with` closes the file handle, which the bare open(...) call never did.
with open(f'{base_path}/topic_vec.pkl', 'rb') as topic_vec_file:
    topic_table = pickle.load(topic_vec_file)

In [10]:
# Questions: keep only file columns 0 and 6, named 'qid' and 'topic' below.
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t', usecols=[0, 6])
ques.columns = ['qid', 'topic']

logging.info("ques %s", ques.shape)

[2019-12-06 06:06:40,505] INFO in <ipython-input-10-f32b8856bc65>: ques (1829900, 2)


In [18]:
# Answers: keep only the first four file columns, named below.
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t', usecols=[0, 1, 2, 3])
ans.columns = ['aid', 'qid', 'uid', 'ans_dt']
logging.info("ans %s", ans.shape)

[2019-12-06 06:13:35,310] INFO in <ipython-input-18-48fd985f6174>: ans (4513735, 4)


In [19]:
def extract_day(s):
    """Return a Series holding the integer day parsed from each string in ``s``.

    For every value, the text before the first ``'-'`` is taken, its first
    character is dropped and the remainder is converted with ``int``
    (presumably values look like ``'D3865-H22'`` -- confirm against the data).
    """
    def _day_of(stamp):
        day_token = stamp.split('-')[0]
        return int(day_token[1:])

    return s.apply(_day_of)

# Replace the raw answer timestamp with its integer day.
ans['day'] = extract_day(ans['ans_dt'])
del ans['ans_dt']
logging.info("ans %s", ans.shape)

[2019-12-06 06:13:52,191] INFO in <ipython-input-19-18df387346ac>: ans (4513735, 4)


In [21]:
# Attach the topics of the answered question to every answer.
# The left join keeps answers whose qid is absent from `ques`.
ans = pd.merge(ans, ques, on='qid', how='left')

In [23]:
# 4-fold statistics
def fold_fn(x):
    """Map a day number to its fold id (0-3); return -1 when no window contains it."""
    # Inclusive (first_day, last_day) window of each fold, in fold-id order.
    fold_windows = (
        (3838, 3846),
        (3847, 3853),
        (3854, 3860),
        (3861, 3867),
    )
    for fold_id, (first_day, last_day) in enumerate(fold_windows):
        if first_day <= x <= last_day:
            return fold_id
    # Outside every window (original note: the earlier month).
    return -1

In [24]:
# Tag every answer with the fold id derived from its day (-1 when outside all fold windows).
ans['fold'] = ans['day'].apply(fold_fn)

In [28]:
# Keep only answers whose question topic is not the '-1' placeholder.
ans_with_topic = ans[ans['topic']!='-1']

In [75]:
def _user_topics_without_fold(fold_id):
    """Join, per user, the topic strings of every answer that is NOT in ``fold_id``."""
    outside_fold = ans_with_topic[ans_with_topic['fold'] != fold_id]
    joined_topics = outside_fold.groupby('uid')['topic'].agg(','.join)
    logging.info(f'fold {fold_id}, user num: %s', len(joined_topics))
    return joined_topics


# One history Series per fold, computed and logged in fold order 0..3.
user_topic_0, user_topic_1, user_topic_2, user_topic_3 = [
    _user_topics_without_fold(fold_id) for fold_id in range(4)
]

[2019-12-06 12:01:17,614] INFO in <ipython-input-75-bfcd1724525a>: fold 0, user num: 701146
[2019-12-06 12:01:37,540] INFO in <ipython-input-75-bfcd1724525a>: fold 1, user num: 693487
[2019-12-06 12:01:57,398] INFO in <ipython-input-75-bfcd1724525a>: fold 2, user num: 693658
[2019-12-06 12:02:16,517] INFO in <ipython-input-75-bfcd1724525a>: fold 3, user num: 666274


In [76]:
# Persist each fold's per-user topic history.
for fold_id, fold_topics in enumerate([user_topic_0, user_topic_1, user_topic_2, user_topic_3]):
    # `with` flushes and closes the file even if pickling fails; the bare
    # open(...) calls used before left that to garbage collection.
    with open(f'{base_path}/user_topic_fold_{fold_id}.pkl', 'wb') as fold_file:
        pickle.dump(fold_topics, fold_file)

In [77]:
# Labelled invitations (train): four columns, named below.
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
# Keep only the integer day of the invite timestamp.
train['day'] = extract_day(train['dt'])
del train['dt']
logging.info("train %s", train.shape)

# Evaluation invitations (test): three columns, no label.
test = pd.read_csv(f'{base_path}/invite_info_evaluate_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
# The timestamp is dropped without extracting a day, so test rows carry no fold.
del test['dt']
logging.info("test %s", test.shape)

[2019-12-06 12:03:54,565] INFO in <ipython-input-77-229aa93b02cb>: train (9489162, 4)
[2019-12-06 12:03:55,686] INFO in <ipython-input-77-229aa93b02cb>: test (1141683, 2)


In [78]:
# Attach the question topics to every invitation; the left join keeps rows whose qid is absent from `ques`.
train = pd.merge(train, ques, on='qid', how='left')
test = pd.merge(test, ques, on='qid', how='left')

In [79]:
# train: put every invitation in a fold, then look up the invited user's topic
# history that was built WITHOUT that fold's answers.
train['fold'] = train['day'].apply(fold_fn)

# Start from an object-dtype column of missing values. Seeding the column with
# the integer -10000 and then writing strings into it relied on silent
# int -> object upcasting, which pandas 2.1 deprecates.
train['user_topic_kfold'] = pd.Series(None, index=train.index, dtype=object)

ut_list = [user_topic_0, user_topic_1, user_topic_2, user_topic_3]
for i, fold_topics in enumerate(ut_list):
    in_fold = train['fold'] == i   # computed once, used for both the read and the write
    # Users with no history outside this fold get the '-1' placeholder.
    train.loc[in_fold, 'user_topic_kfold'] = train.loc[in_fold, 'uid'].map(fold_topics).fillna('-1')

# Every row must have been filled; a leftover missing value means the row's
# fold is -1, i.e. its day lies outside all four fold windows.
assert train['user_topic_kfold'].notna().all()

In [80]:
# Display the frame to eyeball the new 'fold' and 'user_topic_kfold' columns.
train

Unnamed: 0,qid,uid,label,day,topic,fold,user_topic_kfold
0,Q2166419046,M401693808,0,3865,"T456,T112,T9566,T5310",3,-1
1,Q1550017551,M3392373099,0,3844,"T2,T3095",0,-1
2,Q604029601,M2317670257,0,3862,"T6090,T2156,T97,T456",3,"T10,T10186,T1203,T4454,T5649,T5122"
3,Q2350061229,M1618461867,0,3849,T856,1,-1
4,Q2443223942,M3544409350,0,3867,"T26,T76,T17",3,-1
...,...,...,...,...,...,...,...
9489157,Q2372512458,M4285896253,1,3849,"T40428,T2643,T2577,T1630,T22708",1,"T2961,T725,T2184,T11191,T3267,T443,T423,T99,T5..."
9489158,Q3516644442,M4285896253,1,3862,T79,3,"T2961,T725,T2184,T11191,T3267,T443,T423,T99,T5..."
9489159,Q3847094730,M4285896253,0,3852,"T79,T53833,T668,T4568",1,"T2961,T725,T2184,T11191,T3267,T443,T423,T99,T5..."
9489160,Q2358485548,M4285896253,0,3864,"T2156,T741,T16804",3,"T2961,T725,T2184,T11191,T3267,T443,T423,T99,T5..."


In [81]:
# test: evaluation rows use each user's topic history without any fold split.
# NOTE(review): user_topic.pkl is not written by any cell visible here -- presumably
# it comes from another notebook; confirm it exists before running.
# NOTE: pickle.load can execute arbitrary code; only load pickles you produced yourself.
with open(f'{base_path}/user_topic.pkl', 'rb') as user_topic_file:
    user_topic = pickle.load(user_topic_file)
# The column keeps the 'kfold' name so train and test share one schema; no k-fold is applied here.
test['user_topic_kfold'] = test['uid'].map(user_topic).fillna('-1')

In [82]:
def cos_sim(vec1, vec2):
    """Cosine similarity: dot product of the vectors divided by the product of their norms."""
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

def eucl_sim(vec1, vec2):
    """Euclidean distance between the two vectors (a distance, despite the ``sim`` name)."""
    first = np.array(vec1)
    second = np.array(vec2)
    return np.linalg.norm(first - second)

In [83]:
def _empty_topic_features():
    """Feature row used when no similarity can be computed: 4 zero counts + 20 NaN statistics."""
    return [0]*4 + [np.nan]*20


def _five_stats(values):
    """Return [min, max, sum, mean, std] of ``values``."""
    return [np.min(values), np.max(values), np.sum(values), np.mean(values), np.std(values)]


def get_topic_sim(q_topic, u_topic, mode):
    """Compute 24 features relating a question's topics to a user's answered topics.

    Parameters
    ----------
    q_topic : str
        Comma-separated topic ids of the question (e.g. ``'T456,T112'``), or
        ``'-1'`` when the question has no topic.
    u_topic : str
        Comma-separated topic ids from the user's answer history (repeats
        allowed), or ``'-1'`` when the user has no history.
    mode : {'train', 'test'}
        In ``'train'`` mode one occurrence of each question topic is removed
        from the user's history before counting.

    Returns
    -------
    list of 24 values, in this order:
        [0] sum of history counts of the history topics that equal a question topic
        [1] number of (question topic, history topic) matches
        [2] [1] divided by the number of question topics
        [3] sum of normalised history counts of the matching topics
        [4:9]   min, max, sum, mean, std of the cosine similarity over all pairs
        [9:14]  the same statistics of cosine similarity * normalised count
        [14:19] the same statistics of the Euclidean distance
        [19:24] the same statistics of Euclidean distance * normalised count
    When either input is ``'-1'``, is not a string, or no history topic is
    left, the result is four zeros followed by twenty NaN.

    Topic id ``'T<n>'`` is looked up as row ``n - 1`` of the module-level
    ``topic_table``; only the 20 most frequent history topics are used.
    """
    n_most_common = 20
    assert mode in ['train', 'test']

    # A left merge leaves NaN (a float) where the question / user is unknown.
    # Treat that like the '-1' placeholder instead of crashing on .split().
    if not isinstance(q_topic, str) or not isinstance(u_topic, str):
        return _empty_topic_features()
    if (q_topic == '-1') or (u_topic == '-1'):
        return _empty_topic_features()

    q_topic_2 = q_topic.split(',')
    counter = Counter(u_topic.split(','))
    if mode == 'train':
        counter = counter - Counter(q_topic_2)    # original note: remove the current record
    most_common_topic = counter.most_common(n_most_common)
    if not most_common_topic:         # original note: no answers from other periods
        return _empty_topic_features()

    count_list = [count for _, count in most_common_topic]
    count_norm_list = np.array(count_list) / np.sum(count_list)   # normalize

    # Resolve every history topic's row index and vector once; they do not
    # depend on the question topic, so they are hoisted out of the pair loop.
    topic_vecs = topic_table['vec']
    u_topic_index_list = [int(topic_id[1:]) - 1 for topic_id, _ in most_common_topic]
    u_topic_vec_list = [topic_vecs.iloc[index] for index in u_topic_index_list]

    sim_list = []                   # cosine similarity of every pair
    sim_norm_list = []              # ... weighted by the normalised history count
    sim_list_eucl = []              # Euclidean distance of every pair
    sim_norm_list_eucl = []         # ... weighted by the normalised history count
    count_ut_in_qt_weighted = 0     # history counts of the topics that match a question topic
    count_ut_in_qt = 0              # number of question/history topic matches
    count_norm_ut_in_qt = 0         # normalised history counts of the matches

    for qt in q_topic_2:
        q_topic_index = int(qt[1:]) - 1   # row index of the question topic
        q_topic_vec = topic_vecs.iloc[q_topic_index]
        for u_topic_index, u_topic_vec, count, count_norm in zip(
                u_topic_index_list, u_topic_vec_list, count_list, count_norm_list):
            if q_topic_index == u_topic_index:
                count_ut_in_qt_weighted += count
                count_ut_in_qt += 1
                count_norm_ut_in_qt += count_norm

            sim = cos_sim(q_topic_vec, u_topic_vec)
            sim_eucl = eucl_sim(q_topic_vec, u_topic_vec)

            sim_list.append(sim)
            sim_norm_list.append(sim*count_norm)
            sim_list_eucl.append(sim_eucl)
            sim_norm_list_eucl.append(sim_eucl*count_norm)

    rate_ut_in_qt = count_ut_in_qt / len(q_topic_2)

    res = [count_ut_in_qt_weighted, count_ut_in_qt, rate_ut_in_qt, count_norm_ut_in_qt]
    res += _five_stats(sim_list)
    res += _five_stats(sim_norm_list)
    res += _five_stats(sim_list_eucl)
    res += _five_stats(sim_norm_list_eucl)
    return res

In [84]:
# Compress data
def compress_data(df):
    """Downcast the topic-feature columns of ``df`` in place and return ``df``.

    The two count columns become ``int32``; the two ratio columns and the
    twenty similarity statistics become ``float32``.
    """
    count_columns = ('qu_topic_count_weight', 'qu_topic_count')
    # min/max/sum/mean/std for each of the four similarity families.
    stat_columns = [
        f'{stat}_sim{family}'
        for family in ('', '_norm', '_eucl', '_norm_eucl')
        for stat in ('min', 'max', 'sum', 'mean', 'std')
    ]
    real_columns = ['qu_topic_rate', 'qu_topic_count_norm'] + stat_columns

    for name in count_columns:
        df[name] = df[name].astype('int32')
    for name in real_columns:
        df[name] = df[name].astype('float32')
    return df

In [85]:
import traceback

processes = 36   # number of worker processes; also the number of chunk files written and later merged

def get_topic_feat(df, num, mode):
    """Compute the topic features for one chunk and write them to ``./temp``.

    Parameters
    ----------
    df : DataFrame
        Chunk with at least the columns ``'topic'`` and ``'user_topic_kfold'``.
    num : int
        Chunk number; used in the output file name.
    mode : {'train', 'test'}
        Passed through to ``get_topic_sim`` and used in the output file name.

    The result is written as a tab-separated file
    ``./temp/{mode}_kfold_topic_feature_{num}.txt``; nothing is returned.

    Raises
    ------
    Exception
        Any failure is logged with its traceback and re-raised, so the caller
        can detect it through the ``AsyncResult``. Previously a bare ``except``
        printed the traceback (followed by a stray ``None``) and swallowed the
        error, leaving the chunk file missing without any signal.
    """
    import os   # local import so the block does not depend on the notebook's import cell

    assert mode in ['train', 'test']
    try:
        # Fail fast on an unwritable output location instead of after the long compute.
        os.makedirs('./temp', exist_ok=True)

        logging.info('start to extract feature.')
        t1 = df.apply(lambda x: get_topic_sim(x['topic'], x['user_topic_kfold'], mode), axis=1, result_type='expand')
        t1.columns = ['qu_topic_count_weight', 'qu_topic_count', 'qu_topic_rate', 'qu_topic_count_norm',
                  'min_sim', 'max_sim', 'sum_sim', 'mean_sim', 'std_sim', 'min_sim_norm', 
                  'max_sim_norm', 'sum_sim_norm', 'mean_sim_norm', 'std_sim_norm', 
                  'min_sim_eucl', 'max_sim_eucl', 'sum_sim_eucl', 'mean_sim_eucl', 'std_sim_eucl',
                  'min_sim_norm_eucl', 'max_sim_norm_eucl', 'sum_sim_norm_eucl', 'mean_sim_norm_eucl', 'std_sim_norm_eucl']
        logging.info('extracting finish.')

        # Downcast before writing.
        t1 = compress_data(t1)
        t1.to_csv(f'./temp/{mode}_kfold_topic_feature_{num}.txt', index=False, sep='\t')
        logging.info('file %s saving finish.', num)
        del t1
        gc.collect()
    except Exception:
        # `except Exception` lets KeyboardInterrupt / SystemExit propagate untouched.
        logging.exception('topic feature extraction failed for %s chunk %s', mode, num)
        raise
        
def multi_proc(df, mode):
    """Split ``df`` into ``processes`` contiguous row chunks and run
    ``get_topic_feat`` on each chunk in a worker process.

    Parameters
    ----------
    df : DataFrame
        Rows to process; chunk ``i`` covers positions ``[i*len_batch, (i+1)*len_batch)``
        and the last chunk also takes the remainder.
    mode : {'train', 'test'}
        Forwarded to ``get_topic_feat``.

    Raises
    ------
    Exception
        Re-raises the first exception raised by a worker, if any. Previously
        the ``AsyncResult`` objects were discarded, so a failed chunk went
        unnoticed until its file was found missing.
    """
    import multiprocessing

    len_data = len(df)
    len_batch = len_data // processes
    # NOTE(review): with fewer rows than `processes`, all chunks but the last are empty.

    # The context manager terminates the workers if submitting a chunk fails.
    with multiprocessing.Pool(processes=processes) as pool:
        pending = []
        for i in range(processes):
            start = i * len_batch
            end = len_data if i == (processes - 1) else (i + 1) * len_batch
            # .iloc makes the positional slicing explicit.
            chunk = df.iloc[start:end]
            pending.append(pool.apply_async(get_topic_feat, (chunk, i, mode)))

        pool.close()
        pool.join()

    # Surface worker failures; .get() re-raises what the worker raised.
    for result in pending:
        result.get()

In [86]:
# Extract the topic features for the evaluation rows; one temp file is written per chunk.
multi_proc(test, 'test')


[2019-12-06 12:04:52,401] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:52,477] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:52,551] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:52,636] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:52,774] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:52,874] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:52,981] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:53,083] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:53,182] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:53,247] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:04:53,430] INFO in <ipython-input-85-bcc05c6e223e>: sta

[2019-12-06 12:18:45,792] INFO in <ipython-input-85-bcc05c6e223e>: file 32 saving finish.
[2019-12-06 12:18:46,145] INFO in <ipython-input-85-bcc05c6e223e>: file 25 saving finish.
[2019-12-06 12:18:49,314] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 12:18:50,698] INFO in <ipython-input-85-bcc05c6e223e>: file 17 saving finish.
[2019-12-06 12:18:58,529] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 12:18:59,969] INFO in <ipython-input-85-bcc05c6e223e>: file 28 saving finish.
[2019-12-06 12:19:06,717] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 12:19:08,146] INFO in <ipython-input-85-bcc05c6e223e>: file 27 saving finish.
[2019-12-06 12:19:13,570] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 12:19:14,985] INFO in <ipython-input-85-bcc05c6e223e>: file 23 saving finish.
[2019-12-06 12:19:22,969] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 12:19:24,362] INFO

In [87]:
# Extract the topic features for the training rows; one temp file is written per chunk.
multi_proc(train, 'train')

[2019-12-06 12:19:53,040] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:19:54,043] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:19:55,120] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:19:56,085] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:19:57,012] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:19:58,229] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:19:59,090] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:20:00,209] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:20:00,970] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:20:01,629] INFO in <ipython-input-85-bcc05c6e223e>: start to extract feature.
[2019-12-06 12:20:02,345] INFO in <ipython-input-85-bcc05c6e223e>: sta

[2019-12-06 14:06:44,922] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 14:06:56,236] INFO in <ipython-input-85-bcc05c6e223e>: file 9 saving finish.
[2019-12-06 14:07:53,083] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 14:08:04,455] INFO in <ipython-input-85-bcc05c6e223e>: file 7 saving finish.
[2019-12-06 14:08:28,622] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 14:08:39,941] INFO in <ipython-input-85-bcc05c6e223e>: file 24 saving finish.
[2019-12-06 14:09:47,384] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 14:09:58,920] INFO in <ipython-input-85-bcc05c6e223e>: file 20 saving finish.
[2019-12-06 14:12:24,384] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 14:12:35,856] INFO in <ipython-input-85-bcc05c6e223e>: file 3 saving finish.
[2019-12-06 14:12:46,936] INFO in <ipython-input-85-bcc05c6e223e>: extracting finish.
[2019-12-06 14:12:58,309] INFO in <ip

In [88]:
# Merge the per-chunk files

# test
original_path = './temp'
mode = 'test'
# Read every chunk, then concatenate once. Calling pd.concat inside the loop
# re-copied all previously accumulated rows on every iteration.
chunk_frames = [
    pd.read_csv(f'{original_path}/{mode}_kfold_topic_feature_{i}.txt', sep='\t')
    for i in range(processes)
]
topic_feat = pd.concat(chunk_frames, axis=0, ignore_index=True)
del chunk_frames   # the merged frame is the only copy needed from here on

logging.info('%s topic feature, shape: %s', mode, topic_feat.shape)

# The CSV round trip loses the compact dtypes, so downcast again before saving.
topic_feat = compress_data(topic_feat)
topic_feat.to_csv(f'{feature_path}/{mode}_kfold_topic_feature.txt', index=False, sep='\t')
logging.info('%s topic feature saved.', mode)

[2019-12-06 14:13:50,189] INFO in <ipython-input-88-ec6b0f27b465>: test topic feature, shape: (1141683, 24)
[2019-12-06 14:15:30,141] INFO in <ipython-input-88-ec6b0f27b465>: test topic feature saved.


In [89]:
# train
original_path = './temp'
mode = 'train'
# Read every chunk, then concatenate once. Calling pd.concat inside the loop
# re-copied all previously accumulated rows on every iteration.
chunk_frames = [
    pd.read_csv(f'{original_path}/{mode}_kfold_topic_feature_{i}.txt', sep='\t')
    for i in range(processes)
]
topic_feat = pd.concat(chunk_frames, axis=0, ignore_index=True)
del chunk_frames   # the merged frame is the only copy needed from here on
logging.info('%s topic feature, shape: %s', mode, topic_feat.shape)

# The CSV round trip loses the compact dtypes, so downcast again before saving.
topic_feat = compress_data(topic_feat)
topic_feat.to_csv(f'{feature_path}/{mode}_kfold_topic_feature.txt', index=False, sep='\t')
logging.info('%s topic feature saved.', mode)

[2019-12-06 14:16:45,395] INFO in <ipython-input-89-1fe54e2cf7eb>: train topic feature, shape: (9489162, 24)
[2019-12-06 14:24:59,356] INFO in <ipython-input-89-1fe54e2cf7eb>: train topic feature saved.


In [92]:
# Number of train rows with a non-null min_sim, i.e. rows for which similarity statistics were computed.
topic_feat['min_sim'].count()

6732831

In [93]:
# Display the merged train feature frame.
topic_feat

Unnamed: 0,qu_topic_count_weight,qu_topic_count,qu_topic_rate,qu_topic_count_norm,min_sim,max_sim,sum_sim,mean_sim,std_sim,min_sim_norm,...,min_sim_eucl,max_sim_eucl,sum_sim_eucl,mean_sim_eucl,std_sim_eucl,min_sim_norm_eucl,max_sim_norm_eucl,sum_sim_norm_eucl,mean_sim_norm_eucl,std_sim_norm_eucl
0,0,0,0.00,0.000000,,,,,,,...,,,,,,,,,,
1,0,0,0.00,0.000000,,,,,,,...,,,,,,,,,,
2,0,0,0.00,0.000000,-0.067982,0.641366,4.854249,0.202260,0.182464,-0.011330,...,18.058924,36.388226,734.665039,30.611042,3.960869,3.009821,6.064704,122.444168,5.101840,0.660145
3,0,0,0.00,0.000000,,,,,,,...,,,,,,,,,,
4,0,0,0.00,0.000000,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9489157,0,0,0.00,0.000000,-0.373491,0.957168,23.243734,0.232437,0.294816,-0.013581,...,6.399706,41.512833,3097.050537,30.970505,6.270434,0.349075,2.885756,152.851822,1.528518,0.455995
9489158,0,0,0.00,0.000000,-0.253062,0.852932,6.087426,0.304371,0.270822,-0.009373,...,12.895073,36.809788,537.390686,26.869534,5.295059,0.477595,2.298447,26.485655,1.324283,0.402318
9489159,2,1,0.25,0.037037,-0.304637,1.000000,23.679790,0.295997,0.328946,-0.011283,...,0.000000,39.365086,2161.830322,27.022879,6.468279,0.000000,2.713555,108.029366,1.350367,0.469586
9489160,0,0,0.00,0.000000,-0.360813,0.282249,-1.867945,-0.031132,0.140648,-0.015660,...,18.471516,51.345558,2108.511475,35.141857,5.983054,0.684130,3.793854,106.610008,1.776833,0.696057


In [94]:
# Read back the merged test feature file written earlier.
t1 = pd.read_csv(f'{feature_path}/test_kfold_topic_feature.txt', sep='\t')

In [95]:
# Display the reloaded test feature frame.
t1

Unnamed: 0,qu_topic_count_weight,qu_topic_count,qu_topic_rate,qu_topic_count_norm,min_sim,max_sim,sum_sim,mean_sim,std_sim,min_sim_norm,...,min_sim_eucl,max_sim_eucl,sum_sim_eucl,mean_sim_eucl,std_sim_eucl,min_sim_norm_eucl,max_sim_norm_eucl,sum_sim_norm_eucl,mean_sim_norm_eucl,std_sim_norm_eucl
0,1,1,0.333333,0.333333,0.182818,1.000000,5.182607,0.575845,0.266533,0.060939,...,0.000000,26.387470,156.674420,17.408268,7.488229,0.000000,8.795823,52.224804,5.802756,2.496076
1,0,0,0.000000,0.000000,,,,,,,...,,,,,,,,,,
2,0,0,0.000000,0.000000,-0.261990,0.437958,6.933745,0.092450,0.127885,-0.016374,...,21.936098,53.715170,2554.952600,34.066032,6.316459,1.371006,4.382416,169.320710,2.257610,0.584930
3,0,0,0.000000,0.000000,0.029782,0.262181,1.488354,0.148835,0.070377,0.005956,...,19.959232,31.828415,251.595760,25.159576,3.665920,3.991847,6.365683,50.319153,5.031915,0.733184
4,0,0,0.000000,0.000000,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141678,0,0,0.000000,0.000000,0.133210,0.681883,1.359764,0.271953,0.209336,0.019030,...,11.946775,30.106020,125.616936,25.123386,6.671156,1.706682,12.274233,26.128098,5.225620,3.642238
1141679,2,1,0.500000,1.000000,0.508004,1.000000,1.508004,0.754002,0.245998,0.508004,...,0.000000,24.140650,24.140650,12.070325,12.070325,0.000000,24.140650,24.140650,12.070325,12.070325
1141680,0,0,0.000000,0.000000,-0.054333,0.040416,-0.077724,-0.015545,0.031238,-0.010867,...,42.324570,47.914980,225.705810,45.141163,2.196178,8.464914,9.582996,45.141163,9.028233,0.439236
1141681,0,0,0.000000,0.000000,-0.312825,0.236951,-0.400969,-0.010024,0.123962,-0.009480,...,24.175558,39.920662,1218.932400,30.473309,3.610362,0.780544,4.735730,59.416980,1.485425,1.054115


In [96]:
# Number of test rows with a non-null min_sim.
t1['min_sim'].count()

865493