In [37]:
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# Toy corpus: 'apple' occurs in every document, while every other word
# occurs in exactly one document ('orange' twice within the first one).
documents = [
    'apple orange dog cat swim orange',
    'apple banana',
    'apple grape',
    'apple lemmon',
]

# Tokenize each document on whitespace.
texts = [document.split() for document in documents]
print('words by document')
print(texts)

# Assign an integer id to every distinct token.
dictionary = Dictionary(texts)
print(dictionary)

# Bag-of-words form: one list of (token_id, count) pairs per document.
corpus = list(map(dictionary.doc2bow, texts))
print(dictionary.token2id)
print(corpus)

# Fit TF-IDF statistics on the bag-of-words corpus.
tfidf_model = TfidfModel(corpus)
print(tfidf_model)

# Re-weight each document with the fitted model and show the result.
for tf_doc in corpus:
    weighted_doc = tfidf_model[tf_doc]
    print(weighted_doc)


words by document
[['apple', 'orange', 'dog', 'cat', 'swim', 'orange'], ['apple', 'banana'], ['apple', 'grape'], ['apple', 'lemmon']]
Dictionary(8 unique tokens: ['apple', 'cat', 'dog', 'orange', 'swim']...)
{'apple': 0, 'cat': 1, 'dog': 2, 'orange': 3, 'swim': 4, 'banana': 5, 'grape': 6, 'lemmon': 7}
[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1)], [(0, 1), (5, 1)], [(0, 1), (6, 1)], [(0, 1), (7, 1)]]
TfidfModel(num_docs=4, num_nnz=11)
[(1, 0.3779644730092272), (2, 0.3779644730092272), (3, 0.7559289460184544), (4, 0.3779644730092272)]
[(5, 1.0)]
[(6, 1.0)]
[(7, 1.0)]


### Memo
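- Words that appear in many documents carry little information, so they receive a low weight. A word that appears in every document (here `apple`) gets an IDF of 0, so it is dropped from the TF-IDF output vectors (it is still present in the bag-of-words corpus).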
- Words that appear in only a few documents are more important: the rarer a word is across the corpus, the more informative it is and the higher its weight.

### Ref
- [models.tfidfmodel – TF-IDF model](https://radimrehurek.com/gensim/models/tfidfmodel.html)
- [tf–idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)