# 基于简单 TF-IDF的 标签测试

基本思路就是将问题与问题描述用到的词放在一起，当做一篇文章，然后计算全部问题的IDF,再将标签当做关键词计算标签与问题的相似度。

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time 
import operator
from collections import Counter
from ast import literal_eval
import os
import gc

%matplotlib inline

## Parallelization of pandas.apply() 

根据 https://stackoverflow.com/questions/37078880/status-of-parallelization-of-pandas-apply

找到下面两个并行计算的网址

https://github.com/pandas-dev/pandas/issues/13111

http://www.racketracer.com/2016/07/06/pandas-in-parallel/

我试验了dask 超快，下面有几个框，先用传统的pandas做，然后再用dask做，你可以比较一下，最好能比较一下看看结果是不是一样的。如果 结果一样的话，就可以把传统的实现删除了。




In [2]:
import dask
import dask.dataframe as dd
from dask import threaded, multiprocessing

## Read in data sets

将原来的数据路径放在环境变量里面，这样就不用每次改程序了

nrows: 考虑到曾经在328877那里曾经报过错，所以编程时至少加载35万条数据, 在read_csv时会用到

In [3]:
#data_path=os.environ.get('zhihu_data_path')+'/' 
data_path = 'ieee_zhihu_cup/'
nrows=350 * 1000 

In [4]:
start_time = time.time()
print('Start time:', start_time)
df_questions = pd.read_csv(data_path+'question_train_set.txt',header=None, names=['question_id', 'ct', 'wt','cd','wd'], sep='\t', nrows=nrows)
print('time cost:', time.time() - start_time)

Start time: 1497736935.981434
time cost: 3.8162386417388916


### 将DataFrame 转成dask

In [5]:
start_time = time.time()
ddf_questions = dd.from_pandas(df_questions, npartitions =4)
print('time cost:', time.time() - start_time)

time cost: 1.6201422214508057


## Prepare df_questions

In [6]:
def split(row):
    return [] if type(row) == float else row.split(',')

### 两个数据异常
3 是没有description

328877 是有title 里面的字不成词

In [7]:
print (df_questions.loc[3])
print (df_questions.loc[328877])

question_id                                    -5698296155734268
ct             c473,c1528,c528,c428,c295,c15,c101,c188,c146,c...
wt             w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14...
cd                                                           NaN
wd                                                           NaN
Name: 3, dtype: object
question_id                                  -930822265097814910
ct             c149,c148,c148,c42,c185,c95,c95,c190,c42,c106,...
wt                                                           NaN
cd             c346,c818,c740,c630,c101,c1128,c386,c740,c630,...
wd                              w61033,w54,w26250,w973,w54,w9892
Name: 328877, dtype: object


In [8]:
df_questions.head(5)

Unnamed: 0,question_id,ct,wt,cd,wd
0,6555699376639805223,"c324,c39,c40,c155,c180,c180,c181,c17,c4,c1153,...","w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w...","c335,c101,c611,c189,c97,c144,c147,c101,c15,c76...","w231,w54,w1681,w54,w11506,w5714,w7,w54,w744,w1..."
1,2887834264226772863,"c44,c110,c101,c286,c106,c150,c101,c892,c632,c1...","w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w...","c1265,c518,c74,c131,c274,c57,c768,c769,c368,c3...","w12508,w1380,w72,w27045,w276,w111"
2,-2687466858632038806,"c15,c768,c769,c1363,c650,c1218,c2361,c11,c90,c...","w875,w15450,w42394,w15863,w6,w95421,w25,w803,w...","c693,c100,c279,c99,c189,c532,c101,c189,c145,c1...","w140340,w54,w48398,w54,w140341,w54,w12856,w54,..."
3,-5698296155734268,"c473,c1528,c528,c428,c295,c15,c101,c188,c146,c...","w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14...",,
4,-6719100304248915192,"c190,c147,c105,c219,c220,c101,c647,c219,c220,c...","w380,w54,w674,w133,w54,w134,w614,w54,w929,w307...","c644,c1212,c253,c199,c431,c452,c424,c207,c2,c1...","w4821,w1301,w16003,w928,w1961,w2565,w50803,w11..."


### 两种转list的方法，速度差的不是一点儿半点儿

In [9]:
start_time = time.time()

df_questions['wt_list'] = df_questions.wt.apply(split)

print('time cost:', time.time() - start_time)

start_time = time.time()

ddf_questions['wt_list'] = ddf_questions.wt.apply(split)

print('time cost:', time.time() - start_time)

time cost: 0.8648364543914795
time cost: 0.002998828887939453


  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [10]:
start_time = time.time()

df_questions['wd_list'] = df_questions.wd.apply(split)

print('time cost:', time.time() - start_time)

start_time = time.time()

ddf_questions['wd_list'] = ddf_questions.wd.apply(split)

print('time cost:', time.time() - start_time)

time cost: 2.20782732963562
time cost: 0.001997709274291992


  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [11]:
start_time = time.time()
df_questions['bag_of_words'] = df_questions.apply(lambda x : x['wt_list'] + x['wd_list'], axis = 1)
print('time cost:', time.time() - start_time)

time cost: 9.607444763183594


In [12]:
start_time = time.time()
ddf_questions['bag_of_words'] = ddf_questions.apply(lambda x : x['wt_list'] + x['wd_list'], axis = 1)
print('time cost:', time.time() - start_time)

time cost: 0.005997180938720703


  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [13]:
ddf_questions.head(5)

Unnamed: 0,question_id,ct,wt,cd,wd,wt_list,wd_list,bag_of_words
0,6555699376639805223,"c324,c39,c40,c155,c180,c180,c181,c17,c4,c1153,...","w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w...","c335,c101,c611,c189,c97,c144,c147,c101,c15,c76...","w231,w54,w1681,w54,w11506,w5714,w7,w54,w744,w1...","[w305, w13549, w22752, w11, w7225, w2565, w110...","[w231, w54, w1681, w54, w11506, w5714, w7, w54...","[w305, w13549, w22752, w11, w7225, w2565, w110..."
1,2887834264226772863,"c44,c110,c101,c286,c106,c150,c101,c892,c632,c1...","w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w...","c1265,c518,c74,c131,c274,c57,c768,c769,c368,c3...","w12508,w1380,w72,w27045,w276,w111","[w377, w54, w285, w57, w349, w54, w108215, w6,...","[w12508, w1380, w72, w27045, w276, w111]","[w377, w54, w285, w57, w349, w54, w108215, w6,..."
2,-2687466858632038806,"c15,c768,c769,c1363,c650,c1218,c2361,c11,c90,c...","w875,w15450,w42394,w15863,w6,w95421,w25,w803,w...","c693,c100,c279,c99,c189,c532,c101,c189,c145,c1...","w140340,w54,w48398,w54,w140341,w54,w12856,w54,...","[w875, w15450, w42394, w15863, w6, w95421, w25...","[w140340, w54, w48398, w54, w140341, w54, w128...","[w875, w15450, w42394, w15863, w6, w95421, w25..."
3,-5698296155734268,"c473,c1528,c528,c428,c295,c15,c101,c188,c146,c...","w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14...",,,"[w8646, w2744, w1462, w9, w54, w138, w54, w50,...",[],"[w8646, w2744, w1462, w9, w54, w138, w54, w50,..."
4,-6719100304248915192,"c190,c147,c105,c219,c220,c101,c647,c219,c220,c...","w380,w54,w674,w133,w54,w134,w614,w54,w929,w307...","c644,c1212,c253,c199,c431,c452,c424,c207,c2,c1...","w4821,w1301,w16003,w928,w1961,w2565,w50803,w11...","[w380, w54, w674, w133, w54, w134, w614, w54, ...","[w4821, w1301, w16003, w928, w1961, w2565, w50...","[w380, w54, w674, w133, w54, w134, w614, w54, ..."


In [14]:
start_time = time.time()
df=df_questions[['question_id', 'bag_of_words']]
df.to_pickle(data_path+'question_words_bag.pickle')
print('time cost:', time.time() - start_time)

time cost: 10.140463590621948


### Remove all dataframe 

In [15]:
start_time = time.time()
del df
del df_questions
gc.collect()
print('time cost:', time.time() - start_time)

time cost: 1.772684097290039


### Reload data

Can we load the data to dask directly?

In [16]:
start_time = time.time()
df_bag=pd.read_pickle(data_path+'question_words_bag.pickle')
print('time cost:', time.time() - start_time)
print('End time:', time.time())

time cost: 3.7484612464904785
End time: 1497736993.767359


In [17]:
df_bag.head(5)

Unnamed: 0,question_id,bag_of_words
0,6555699376639805223,"[w305, w13549, w22752, w11, w7225, w2565, w110..."
1,2887834264226772863,"[w377, w54, w285, w57, w349, w54, w108215, w6,..."
2,-2687466858632038806,"[w875, w15450, w42394, w15863, w6, w95421, w25..."
3,-5698296155734268,"[w8646, w2744, w1462, w9, w54, w138, w54, w50,..."
4,-6719100304248915192,"[w380, w54, w674, w133, w54, w134, w614, w54, ..."


下面这一步很不可思议，只用了5秒多就统计好了全部数据

In [18]:
start_time = time.time()
df_bag['wt_counter'] = df_bag.bag_of_words.apply(Counter)
print('time cost:', time.time() - start_time)

df_bag.head(5)

time cost: 5.386658668518066


Unnamed: 0,question_id,bag_of_words,wt_counter
0,6555699376639805223,"[w305, w13549, w22752, w11, w7225, w2565, w110...","{'w5714': 1, 'w11506': 1, 'w11': 2, 'w111': 3,..."
1,2887834264226772863,"[w377, w54, w285, w57, w349, w54, w108215, w6,...","{'w1380': 1, 'w21790': 1, 'w500': 1, 'w111': 2..."
2,-2687466858632038806,"[w875, w15450, w42394, w15863, w6, w95421, w25...","{'w12856': 1, 'w362': 1, 'w140340': 1, 'w14034..."
3,-5698296155734268,"[w8646, w2744, w1462, w9, w54, w138, w54, w50,...","{'w9': 1, 'w359': 1, 'w6': 1, 'w111': 2, 'w212..."
4,-6719100304248915192,"[w380, w54, w674, w133, w54, w134, w614, w54, ...","{'w11': 1, 'w111': 2, 'w614': 1, 'w674': 1, 'w..."


In [19]:
df_bag.wt_counter.loc[0]

Counter({'w1019': 1,
         'w1042': 1,
         'w109': 1,
         'w11': 2,
         'w1106': 1,
         'w111': 3,
         'w1110': 1,
         'w11506': 1,
         'w1166': 1,
         'w13549': 1,
         'w16': 1,
         'w1681': 1,
         'w22752': 1,
         'w23': 1,
         'w231': 1,
         'w25': 1,
         'w2565': 1,
         'w26377': 1,
         'w305': 1,
         'w31389': 1,
         'w3332': 1,
         'w37031': 1,
         'w54': 6,
         'w5714': 1,
         'w6': 2,
         'w69288': 1,
         'w7': 1,
         'w7225': 1,
         'w744': 2,
         'w7734': 1,
         'w9': 1})

## Count Occurancy of a word that occurs in a question, including description and title

In [20]:
def CountWords(row):
    for w in row:
        if w not in word_dict.keys():
            word_dict[w] = 1
        else:
            word_dict[w] += 1
    return
word_dict = {}
#word_dict = dict.fromset
start_time = time.time()
_ = df_bag.bag_of_words.apply(CountWords)
print('time cost:', time.time() - start_time)

print(len(word_dict))

time cost: 6.47197413444519
248913


## 计算逆文本频率指数 IDF

$$ IDF = log(\frac{D}{D_w}) $$

D： 所有的Question的总数

Dw：词 w 出现在Dw 篇文章中

比如 的 几乎出现在所有的问题中，其IDF 就几乎为零。

秒执行

In [1]:
D

NameError: name 'D' is not defined

In [21]:
idf_dict={}
D = len(df_bag)
for k,v in word_dict.items():
    idf_dict[k] = np.log2(float(D)/v)

In [22]:
idf_dict

{'w13705': 14.83203289577326,
 'w458539': 18.416995396494414,
 'w145530': 11.510104800885896,
 'w212506': 16.416995396494414,
 'w198170': 16.83203289577326,
 'w156584': 18.416995396494414,
 'w263176': 18.416995396494414,
 'w199125': 16.83203289577326,
 'w148987': 16.83203289577326,
 'w22973': 16.83203289577326,
 'w164644': 18.416995396494414,
 'w191587': 18.416995396494414,
 'w126082': 11.267248276989733,
 'w12035': 13.462799086107541,
 'w420307': 18.416995396494414,
 'w55663': 13.510104800885896,
 'w433135': 18.416995396494414,
 'w16959': 16.83203289577326,
 'w203859': 17.416995396494414,
 'w95712': 13.059443391876332,
 'w12924': 15.83203289577326,
 'w159814': 13.510104800885896,
 'w33493': 9.3137075880823943,
 'w153290': 12.394627583465962,
 'w50646': 15.416995396494416,
 'w99107': 14.024677973715656,
 'w50444': 18.416995396494414,
 'w156782': 15.609640474436812,
 'w191635': 17.416995396494414,
 'w81033': 16.83203289577326,
 'w207208': 15.095067301607054,
 'w189615': 18.4169953964944

# 根据 TF-IDF 计算 Question与Topic的相关性

$$ TF-IDF = TF_1\cdot IDF_1 + TF_2 \cdot IDF_2 + ... + TF_N \cdot IDF_N $$

TF1: 词1在此Question 出现的频率 $$ TFx = \frac{词_x在此question中出现的次数}{此Question中的总词数}$$ 

此处的词指来自Topic中的词。如Topic为w32,w1234 则计算每一篇文章与W32, w1234的相关性。



用的时候再加载，减少不必要的内存占用

In [23]:
df_topics = pd.read_csv(data_path+'topic_info.txt', header=None, names=['topic_id', 'pid', 'cn', 'wn', 'cd', 'wd'],sep='\t')
df_question_topic = pd.read_csv(data_path+'question_topic_train_set.txt', header=None, names=['question_id', 'topic_id'],sep='\t')

In [24]:
df_topics[df_topics['topic_id'] == 738845194850773558].wd[0]

'w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,w14,w15,w16,w17,w18,w15,w6,w19,w20,w21,w22,w23'

### 数据预处理

1. 在df_bag 中增加一列，包含每一个question的词的总数， 就是下面的 total_word
2. 按照上面处理 question的方法，处理topics, 也要有 wt_counter 与 total_word


#### Adding bag_of_words, wt_counter and total_words to df_topics

In [25]:
def topic_word_bag(row):
    if type(row.wn) == float and type(row.wd) == float:
        return []
    elif type(row.wn) == float:
        return row.wd.split(',')
    elif type(row.wd) == float:
        return row.wn.split(',')
    return (row.wn + ',' + row.wd).split(',') 

df_topics['bag_of_words'] = df_topics.apply(topic_word_bag, axis = 1)

In [26]:
def topic_wt_counter(row):
    d = dict()
    for word in row:
        if word not in d.keys():
            d[word] = 1
        else:
            d[word] += 1
    return d
df_topics['wt_counter'] = df_topics.bag_of_words.apply(topic_wt_counter)

In [27]:
def total_word(row):
    return sum(row.values())
df_topics['total_word'] = df_topics.wt_counter.apply(total_word)

#### Adding total_words to df_bag

In [28]:
df_bag['total_word'] = df_bag.wt_counter.apply(total_word)

In [29]:
def tf_idf(topic_id, question_id):
    index1 = df_topics[df_topics['topic_id'] == topic_id].index[0]
    topic_word = df_topics[df_topics['topic_id'] == topic_id].bag_of_words[index1]

    index2 = df_bag[df_bag['question_id'] == question_id].index[0]
    word_dict = df_bag[df_bag['question_id'] == question_id].wt_counter[index2]

    total_word = df_bag[df_bag['question_id'] == question_id].total_word[index2]

    
    tf_idf_value = 0
    for word in topic_word:
        if word in word_dict:
            tf_idf_value += idf_dict[word]*word_dict[word]/total_word
    return tf_idf_value

In [31]:
tf_idf(-3149765934180654494, 2887834264226772863)

-0.076145875146753028

In [32]:
def split_to_list(row):
    return row.split(',')
df_question_topic['topic_id_list'] = df_question_topic.topic_id.apply(split_to_list)

In [33]:
df_question_topic['topic_count'] = df_question_topic.topic_id_list.apply(len)

In [34]:
sub_df = df_question_topic[df_question_topic['topic_count'] == 1]

In [35]:
sub_df.loc[1][1]

'-3149765934180654494'

In [36]:
def tf_idf2(ser):
    return tf_idf(int(ser.topic_id_list[0]), ser.question_id)

In [37]:
sub_df.head(5)

Unnamed: 0,question_id,topic_id,topic_id_list,topic_count
1,2887834264226772863,-3149765934180654494,[-3149765934180654494],1
2,-2687466858632038806,-760432988437306018,[-760432988437306018],1
9,3174606710238304130,-4115748438709160582,[-4115748438709160582],1
18,-3679570071031716995,-9176307901497282391,[-9176307901497282391],1
21,6004514913022607006,-8966465280115387956,[-8966465280115387956],1


In [38]:
df_topics[df_topics['topic_id'] == -3149765934180654494]

Unnamed: 0,topic_id,pid,cn,wn,cd,wd,bag_of_words,wt_counter,total_word
769,-3149765934180654494,-5388067101870430963,"c1487,c378",w17245,"c70,c110,c1216,c1303,c11,c2308,c1201,c212,c378","w10730,w2741,w6,w17246,w17247","[w17245, w10730, w2741, w6, w17246, w17247]","{'w17245': 1, 'w17247': 1, 'w2741': 1, 'w10730...",6


In [39]:
df_topics[ df_topics['topic_id'] == 738845194850773558]

Unnamed: 0,topic_id,pid,cn,wn,cd,wd,bag_of_words,wt_counter,total_word
0,738845194850773558,-5833678375673307423,"c0,c1",w0,"c0,c1,c2,c3,c4,c5,c6,c7,c0,c1,c8,c9,c10,c11,c1...","w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,...","[w0, w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w...","{'w9': 1, 'w11': 1, 'w21': 1, 'w17': 1, 'w2': ...",27


In [40]:
print('df_topics[df_topics[\'topic_id\'] == 738845194850773558].wn=', df_topics[df_topics['topic_id'] == 738845194850773558].wn)
print('df_topics[df_topics[\'topic_id\'] == 738845194850773558].wn[0]=', df_topics[df_topics['topic_id'] == 738845194850773558].wn[0])
print()

print('df_topics[df_topics[\'topic_id\'] == -3149765934180654494].wn=', df_topics[df_topics['topic_id'] == -3149765934180654494].wn)
print('df_topics[df_topics[\'topic_id\'] == -3149765934180654494].wn[0]=', df_topics[df_topics['topic_id'] == -3149765934180654494].wn[769])

# why? !!!

df_topics[df_topics['topic_id'] == 738845194850773558].wn= 0    w0
Name: wn, dtype: object
df_topics[df_topics['topic_id'] == 738845194850773558].wn[0]= w0

df_topics[df_topics['topic_id'] == -3149765934180654494].wn= 769    w17245
Name: wn, dtype: object
df_topics[df_topics['topic_id'] == -3149765934180654494].wn[0]= w17245


In [41]:
df_topics[df_topics['topic_id'] == -3149765934180654494].index[0]

769

In [42]:
#tf_idf2(sub_df.loc[1])
print(int(sub_df.loc[1].topic_id_list[0]),sub_df.loc[1].question_id)
tf_idf(-3149765934180654494 ,2887834264226772863)

-3149765934180654494 2887834264226772863


-0.076145875146753028

In [68]:
start_time = time.time()
a = sub_df.head(10000).apply(tf_idf2, axis = 1)
print('time cost:', time.time() - start_time)

time cost: 29.167520999908447


In [65]:
sub_df.head(114900)

Unnamed: 0,question_id,topic_id,topic_id_list,topic_count
1,2887834264226772863,-3149765934180654494,[-3149765934180654494],1
2,-2687466858632038806,-760432988437306018,[-760432988437306018],1
9,3174606710238304130,-4115748438709160582,[-4115748438709160582],1
18,-3679570071031716995,-9176307901497282391,[-9176307901497282391],1
21,6004514913022607006,-8966465280115387956,[-8966465280115387956],1
23,-6317692530143404667,-4175044003751472418,[-4175044003751472418],1
26,-2335474937104234672,6493581724141198741,[6493581724141198741],1
28,-6473476649740451635,3247902953025099742,[3247902953025099742],1
31,554647747767379078,2460528390051567153,[2460528390051567153],1
32,7964675238481673415,-2288652390863265229,[-2288652390863265229],1


In [71]:
a.sort()

  if __name__ == '__main__':


In [72]:
a

29903   -12.791465
16334    -9.152065
12222    -6.644008
26163    -6.525465
20484    -5.648120
23688    -5.631324
10790    -5.327675
4860     -5.209778
5840     -5.021450
9203     -4.875695
20385    -4.859214
15565    -4.641171
26520    -4.630682
33079    -4.493579
10564    -4.257684
18511    -4.193270
29564    -4.104370
10592    -4.063277
24156    -4.033454
36566    -3.965241
11123    -3.954650
2359     -3.902604
13609    -3.835908
28063    -3.724715
22010    -3.722333
11841    -3.672386
15375    -3.637692
18776    -3.597893
30440    -3.579471
12089    -3.515495
           ...    
1673      7.715427
26904     7.996431
11653     8.053217
28309     8.107379
18026     8.161997
10802     8.420521
38257     8.446879
2948      8.466720
18384     8.534985
35527     8.640950
15100     8.650466
17618     9.068033
29607     9.855932
12989     9.862407
39040     9.994931
1449     10.166691
35650    10.172235
36711    10.392527
29837    11.113215
14089    11.711950
19944    11.817373
27867    12.

In [None]:
sub_df['tf_idf'] = sub_df.apply(tf_idf2, axis = 1)

## 考查TF_IDF的分布情况

### 计算单topic的Question的TF-IDF分布情况

### 计算多topic的Question的TF-IDF分布情况

In [None]:
# your code here ...

## 分析多Topic时，Topic的位置与TF-IDF的关系

In [None]:
# your code here ...

## 研究 Topic 继承关系对Topic赋值的影响

## 研究同义词对 Topic赋值的影响