# TF-IDF简单示例

### 1. 定义数据和预处理

In [10]:
# Three toy "documents": league names separated by "|".
docA = "西甲|法甲|意甲|德甲|中超|中超"
docB = "西甲|意甲|中超"
docC = "阿甲|德甲|巴甲|中甲|中乙|中超"

# Tokenise each document into its bag of words (duplicates are kept).
bowA, bowB, bowC = (doc.split("|") for doc in (docA, docB, docC))

# Vocabulary: every distinct word that appears in any document.
wordSet = set(bowA) | set(bowB) | set(bowC)
wordSet

{'中乙', '中甲', '中超', '巴甲', '德甲', '意甲', '法甲', '西甲', '阿甲'}

### 2. 统计词的频数

In [11]:
import pandas as pd

# One zero-initialised counter per document, keyed by the shared vocabulary.
wordCountA, wordCountB, wordCountC = (dict.fromkeys(wordSet, 0) for _ in range(3))

# Tally each document's tokens into its own counter.
for counts, bow in ((wordCountA, bowA), (wordCountB, bowB), (wordCountC, bowC)):
    for word in bow:
        counts[word] += 1

# Show the raw counts as a documents-by-words table.
pd.DataFrame([wordCountA, wordCountB, wordCountC])

Unnamed: 0,中乙,中甲,中超,巴甲,德甲,意甲,法甲,西甲,阿甲
0,0,0,2,0,1,1,1,1,0
1,0,0,1,0,0,1,0,1,0
2,1,1,1,1,1,0,0,0,1


### 3. 计算词频

In [12]:
# Term frequency of each word within a single document.
def computeTF(wordCount, bow):
    """Compute the term frequency of every counted word in one document.

    Args:
        wordCount: Mapping from word to its number of occurrences in the
            document.
        bow: The document's list of tokens; only its length is used, as the
            denominator of each frequency.

    Returns:
        A dict mapping each key of ``wordCount`` to ``count / len(bow)``.
        If ``bow`` is empty, every frequency is ``0.0``.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document contains no terms; avoid ZeroDivisionError.
        return {word: 0.0 for word in wordCount}
    return {word: count / float(bowCount) for word, count in wordCount.items()}

# Term frequencies for the three documents, paired count-dict with token list.
tfA, tfB, tfC = map(
    computeTF,
    (wordCountA, wordCountB, wordCountC),
    (bowA, bowB, bowC),
)
tfA, tfB, tfC

({'德甲': 0.16666666666666666,
  '中超': 0.3333333333333333,
  '西甲': 0.16666666666666666,
  '巴甲': 0.0,
  '中乙': 0.0,
  '中甲': 0.0,
  '阿甲': 0.0,
  '法甲': 0.16666666666666666,
  '意甲': 0.16666666666666666},
 {'德甲': 0.0,
  '中超': 0.3333333333333333,
  '西甲': 0.3333333333333333,
  '巴甲': 0.0,
  '中乙': 0.0,
  '中甲': 0.0,
  '阿甲': 0.0,
  '法甲': 0.0,
  '意甲': 0.3333333333333333},
 {'德甲': 0.16666666666666666,
  '中超': 0.16666666666666666,
  '西甲': 0.0,
  '巴甲': 0.16666666666666666,
  '中乙': 0.16666666666666666,
  '中甲': 0.16666666666666666,
  '阿甲': 0.16666666666666666,
  '法甲': 0.0,
  '意甲': 0.0})

### 4. 计算逆文档频率

In [13]:
# Takes the word-count dicts of all documents at once.
def computeIDF(docList):
    """Compute a smoothed inverse document frequency for every word.

    Args:
        docList: Sequence of per-document word-count dicts (word -> count).

    Returns:
        A dict mapping each word found in any document's dict to
        ``log10((N + 1) / (df + 1))``, where ``N`` is ``len(docList)`` and
        ``df`` is the number of documents in which the word's count is
        greater than zero. An empty ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)

    # Build the vocabulary from every document (first-seen order), so a word
    # missing from the first dict no longer raises KeyError and an empty
    # docList no longer raises IndexError.
    idfDict = {}
    for doc in docList:
        for word in doc:
            idfDict.setdefault(word, 0)
    # Diagnostic output retained from the original implementation.
    print('idfDict: ', idfDict)

    # Document frequency: in how many documents does each word occur?
    for doc in docList:
        for word, count in doc.items():
            if count > 0:
                idfDict[word] += 1

    # Add-one smoothing on numerator and denominator; a word present in
    # every document therefore gets an IDF of exactly 0.
    return {
        word: math.log10((N + 1) / float(df + 1))
        for word, df in idfDict.items()
    }

# Compute the IDF table once, over the word-count dicts of all three documents.
idfs = computeIDF([wordCountA, wordCountB, wordCountC])
idfs  # bare expression: shows the resulting dict as the notebook cell's output

idfDict:  {'德甲': 0, '中超': 0, '西甲': 0, '巴甲': 0, '中乙': 0, '中甲': 0, '阿甲': 0, '法甲': 0, '意甲': 0}


{'德甲': 0.12493873660829993,
 '中超': 0.0,
 '西甲': 0.12493873660829993,
 '巴甲': 0.3010299956639812,
 '中乙': 0.3010299956639812,
 '中甲': 0.3010299956639812,
 '阿甲': 0.3010299956639812,
 '法甲': 0.3010299956639812,
 '意甲': 0.12493873660829993}

### 5. 计算TF-IDF

In [14]:
def computeTFIDF(tf, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf: Mapping from word to its term frequency in one document.
        idfs: Mapping from word to its inverse document frequency; it must
            contain every key of ``tf``.

    Returns:
        A dict mapping each word in ``tf`` to ``tf[word] * idfs[word]``.

    Raises:
        KeyError: If a word in ``tf`` has no entry in ``idfs``.
    """
    # The loop variable is named ``freq`` so it does not shadow the ``tf``
    # parameter while that mapping is being iterated.
    return {word: freq * idfs[word] for word, freq in tf.items()}

# Weight each document's term frequencies by the shared IDF table.
tfidfA, tfidfB, tfidfC = (computeTFIDF(freqs, idfs) for freqs in (tfA, tfB, tfC))

# Show the scores as a documents-by-words table.
pd.DataFrame([tfidfA, tfidfB, tfidfC])

Unnamed: 0,中乙,中甲,中超,巴甲,德甲,意甲,法甲,西甲,阿甲
0,0.0,0.0,0.0,0.0,0.020823,0.020823,0.050172,0.020823,0.0
1,0.0,0.0,0.0,0.0,0.0,0.041646,0.0,0.041646,0.0
2,0.050172,0.050172,0.0,0.050172,0.020823,0.0,0.0,0.0,0.050172
