In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

\begin{align}
\text{TF-IDF}(t, d) &= TF(t, d) \times IDF(t) \tag{1} \\
IDF(t) &= \log \frac{1+N}{1 + DF(t)} + 1  \tag{2} \\
IDF(t) &= \log \frac{N}{DF(t)} + 1 \tag{3} \\
TF'(t, d) &= 1 + \log TF(t, d) \tag{4}
\end{align}

其中$TF(t, d)$代表的是单词$t$在文档$d$中出现的频率,$IDF(t)$则是单词在所有文档中分布$DF(t)$的一个函数,其中$N$是所有文档的数量,$DF(t)$是包含单词$t$的文档的数目.

其含义为:某个单词在某个文档中出现频率很高,但是在其他文档中出现的频率很低,则说明该单词在该文档中比较重要

In [2]:
# Build the TF-IDF vectorizer with its main options spelled out.
# norm, use_idf and smooth_idf are set to their scikit-learn defaults;
# sublinear_tf is NOT a default (the library default is False) and is
# deliberately switched on here.
vectorizer = TfidfVectorizer(
    norm='l2',          # row normalisation: each document vector can be L1- or L2-normalised
    use_idf=True,       # weight term frequencies by IDF
    smooth_idf=True,    # True -> IDF from eq. (2); False -> IDF from eq. (3)
    sublinear_tf=True,  # True -> replace TF with eq. (4) before computing TF-IDF
)

# Each line of data.txt (trailing newline included) is treated as one document.
with open('data.txt', encoding='utf-8') as corpus_file:
    a = list(corpus_file)

# Learn the vocabulary and IDF weights, then return the document-term matrix.
X = vectorizer.fit_transform(a)

In [3]:
# List the learned vocabulary terms, ordered by feature index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so .tolist() keeps the displayed result a plain
# Python list of str, as in the recorded output.
vectorizer.get_feature_names_out().tolist()

['advance',
 'after',
 'again',
 'age',
 'air',
 'already',
 'also',
 'alumni',
 'always',
 'an',
 'and',
 'appearance',
 'are',
 'arm',
 'as',
 'at',
 'back',
 'backed',
 'basketball',
 'be',
 'became',
 'been',
 'beginning',
 'belief',
 'blue',
 'broken',
 'but',
 'can',
 'childhood',
 'children',
 'clear',
 'clothes',
 'constantly',
 'corner',
 'could',
 'cowardice',
 'crush',
 'dark',
 'day',
 'days',
 'declared',
 'deep',
 'deeply',
 'determined',
 'different',
 'disillusionment',
 'dislike',
 'do',
 'don',
 'doomed',
 'during',
 'encounter',
 'end',
 'eventually',
 'factor',
 'feel',
 'feelings',
 'finally',
 'for',
 'found',
 'fresh',
 'friendship',
 'from',
 'give',
 'go',
 'graduation',
 'green',
 'grow',
 'growth',
 'guilty',
 'has',
 'have',
 'haven',
 'he',
 'high',
 'himself',
 'his',
 'hoops',
 'ideas',
 'if',
 'important',
 'in',
 'infarction',
 'injection',
 'inner',
 'into',
 'is',
 'isn',
 'it',
 'junior',
 'just',
 'last',
 'later',
 'life',
 'like',
 'line',
 'looke

In [4]:
# Show the fitted term -> feature-index mapping (e.g. 'advance' -> 0, 'age' -> 3).
# The indices match positions in the feature-name list; the dict itself is
# displayed unsorted.
vectorizer.vocabulary_

{'age': 3,
 'has': 70,
 'reached': 135,
 'the': 162,
 'end': 52,
 'of': 113,
 'beginning': 22,
 'word': 189,
 'may': 100,
 'be': 19,
 'guilty': 69,
 'in': 81,
 'his': 76,
 'seems': 143,
 'to': 174,
 'passing': 125,
 'lot': 97,
 'different': 44,
 'life': 93,
 'became': 20,
 'appearance': 11,
 'same': 140,
 'day': 38,
 'back': 16,
 'past': 126,
 'oneself': 117,
 'paranoid': 123,
 'weird': 184,
 'belief': 23,
 'disillusionment': 45,
 'these': 165,
 'days': 39,
 'my': 108,
 'mind': 103,
 'been': 21,
 'very': 180,
 'messy': 102,
 'constantly': 32,
 'always': 8,
 'feel': 55,
 'should': 150,
 'go': 64,
 'do': 47,
 'something': 153,
 'or': 119,
 'write': 190,
 'twenty': 176,
 'years': 193,
 'trajectory': 175,
 'deeply': 42,
 'shallow': 147,
 'suddenly': 159,
 'it': 88,
 'during': 50,
 'childhood': 28,
 'think': 167,
 'lucky': 99,
 'money': 106,
 'and': 10,
 'new': 110,
 'clothes': 31,
 'are': 12,
 'necessary': 109,
 'for': 58,
 'year': 192,
 'but': 26,
 'as': 14,
 'advance': 0,
 'will': 188,
 