In [1]:
import numpy as np
from corpus import Corpus

### Comparison with gensim

In [2]:
# Toy corpus taken from the gensim tutorial:
# https://radimrehurek.com/gensim/tut2.html
# Each inner list is one already-tokenised document.
text1 = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["eps", "user", "interface", "system"],
    ["system", "human", "system", "eps"],
    ["user", "response", "time"],
    ["trees"],
    ["graph", "trees"],
    ["graph", "minors", "trees"],
    ["graph", "minors", "survey"],
]

# Wrap the tokenised documents in the Corpus class imported from the `corpus` module.
corpus = Corpus(text1)

I. tfidf computed using the **tfidf_fit** method of a **Corpus** object

In [3]:
# Print the tf-idf representation of every document, one per line.
# Judging by the output, each document is a list of (term_id, weight) pairs.
for doc in corpus.tfidf_fit():
    # print(...) with a single argument behaves identically on Python 2 and
    # Python 3, whereas the bare `print doc` statement is Python 2-only.
    print(doc)

[(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)]
[(2, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.32448702061385554), (5, 0.32448702061385554), (6, 0.44424552527467476), (7, 0.44424552527467476)]
[(1, 0.5710059809418182), (4, 0.41707573620227772), (5, 0.41707573620227772), (8, 0.5710059809418182)]
[(0, 0.49182558987264147), (5, 0.71848116070837698), (8, 0.49182558987264147)]
[(4, 0.45889394536615252), (6, 0.62825804686700459), (7, 0.62825804686700459)]
[(9, 1.0)]
[(9, 0.70710678118654757), (10, 0.70710678118654757)]
[(9, 0.50804290089167503), (10, 0.50804290089167503), (11, 0.69554641952003715)]
[(3, 0.62825804686700459), (10, 0.45889394536615252), (11, 0.62825804686700459)]


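II. tfidf computed using **models.TfidfModel** from **gensim**. The two implementations number the terms differently, so compare the weights within each document rather than the term ids.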

In [4]:
from gensim import corpora, models, similarities

# Build gensim's token dictionary from the documents, then encode each
# document with doc2bow (displayed further down as (token_id, count) pairs).
dictionary = corpora.Dictionary(text1)
gensim_corpus = list(map(dictionary.doc2bow, text1))

In [5]:
# Show the doc2bow-encoded corpus as the cell output.
gensim_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(0, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (6, 2), (8, 1)],
 [(3, 1), (4, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(5, 1), (10, 1), (11, 1)]]

In [6]:
# Fit gensim's TfidfModel on the bag-of-words corpus, then apply the fitted
# model to that same corpus and print each transformed document.
tfidf = models.TfidfModel(gensim_corpus)
for doc in tfidf[gensim_corpus]:
    # print(...) with a single argument behaves identically on Python 2 and
    # Python 3, whereas the bare `print doc` statement is Python 2-only.
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


### Comparison with scikit-learn

In [7]:
# Toy corpus for the scikit-learn comparison, spelled out token by token.
# "apple" occurs in every document; "cat" and "banana" only in some.
text2 = [
    ["apple", "apple", "apple", "cat"],
    ["apple", "apple"],
    ["apple", "apple", "apple"],
    ["apple", "apple", "apple", "apple"],
    ["apple", "apple", "apple", "banana", "banana"],
    ["apple", "apple", "apple", "cat", "cat"],
]

# Echo the corpus as the cell output.
text2

[['apple', 'apple', 'apple', 'cat'],
 ['apple', 'apple'],
 ['apple', 'apple', 'apple'],
 ['apple', 'apple', 'apple', 'apple'],
 ['apple', 'apple', 'apple', 'banana', 'banana'],
 ['apple', 'apple', 'apple', 'cat', 'cat']]

I. tfidf computed using the **tfidf_fit** method of a **Corpus** object

In [8]:
# Rebind `corpus` to a Corpus built from the second toy dataset (it previously wrapped text1).
corpus = Corpus(text2)

In [9]:
def idf_nosmooth(x, n_doc):
    """Unsmoothed inverse document frequency: ``ln(n_doc / x) + 1``.

    Parameters
    ----------
    x : number
        Presumably the document frequency of a term, i.e. how many documents
        contain it -- confirm against how ``Corpus.tfidf_fit`` calls its
        ``idf_func``. Must be non-zero.
    n_doc : number
        Presumably the total number of documents in the corpus.

    Returns
    -------
    The idf weight as computed by ``np.log``.
    """
    # float() forces true division even with Python 2 integer operands.
    return np.log(n_doc / float(x)) + 1.


def idf_func(x, n_doc):
    """Smoothed inverse document frequency: ``ln((n_doc + 1) / (x + 1)) + 1``.

    This has the same form as the formula scikit-learn documents for
    ``smooth_idf=True``, which is what the comparison in this notebook
    relies on.

    Parameters
    ----------
    x : number
        Presumably the document frequency of a term -- confirm against how
        ``Corpus.tfidf_fit`` calls its ``idf_func``.
    n_doc : number
        Presumably the total number of documents in the corpus.

    Returns
    -------
    The idf weight as computed by ``np.log``.
    """
    # Adding one to numerator and denominator keeps the ratio finite for x == 0.
    return np.log((n_doc + 1) / float(x + 1)) + 1

In [10]:
# Pass the smoothed idf so the weights can be compared with the scikit-learn result below.
# Note: this rebinds `tfidf`, which held the gensim TfidfModel earlier in the notebook.
tfidf = corpus.tfidf_fit(idf_func=idf_func)

In [11]:
# Display the result returned by Corpus.tfidf_fit with the custom idf function.
tfidf

[[(0, 0.85151334721046001), (1, 0.52433292813100962)],
 [(0, 1.0)],
 [(0, 1.0)],
 [(0, 1.0)],
 [(0, 0.55422893279980634), (2, 0.83236427725340778)],
 [(0, 0.63035730725644001), (1, 0.77630513664950718)]]

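II. tfidf computed using **TfidfVectorizer** in **scikit-learn**. As in the gensim comparison, the term ids are assigned in a different order (here `cat` and `banana` swap ids), so compare the weights rather than the ids.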

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Constructed with no arguments, so the comparison relies on the library defaults
# (smoothed idf and L2 normalisation per the scikit-learn docs -- confirm for the installed version).
# The variable keeps its original spelling because a later cell refers to it by this name.
vectorizor = TfidfVectorizer()

In [14]:
# TfidfVectorizer expects raw strings, so re-join each token list with spaces,
# then densify the result into a 2-D array with one row per document.
dense_weights = vectorizor.fit_transform([" ".join(doc) for doc in text2]).toarray()

# Keep only the non-zero entries of each row as (column_index, weight) pairs,
# the same layout as the Corpus.tfidf_fit output shown earlier.
# A comprehension is used instead of filter(lambda ...) because filter returns
# a lazy iterator on Python 3, which would display as opaque filter objects.
tfidf = [
    [(index, weight) for index, weight in enumerate(row) if weight > 0]
    for row in dense_weights
]

In [15]:
# Display the scikit-learn weights in the same list-of-pairs layout used for the Corpus result.
tfidf

[[(0, 0.85151334721046001), (2, 0.52433292813100962)],
 [(0, 1.0)],
 [(0, 1.0)],
 [(0, 1.0)],
 [(0, 0.55422893279980634), (1, 0.83236427725340778)],
 [(0, 0.63035730725644001), (2, 0.77630513664950718)]]