In [1]:
import numpy as np

this document is used to explain how to calculate document similarity using tf-idf method



In [2]:
documents = [
    "we are doing just fine fine fine",
    "we are doing just fine ",
]

In [3]:
# from tfidf import tf_idf

In [4]:
def tokenize(documents):
    result = []
    for document in documents:
        result.append(document.split(" "))
    print(result)
    return result

In [5]:
tokens = tokenize(documents)

[['we', 'are', 'doing', 'just', 'fine', 'fine', 'fine'], ['we', 'are', 'doing', 'just', 'fine', '']]


In [6]:
terms = []
for document in tokens:
    for word in document:
        if word not in terms:
            terms.append(word)

In [7]:
terms

['we', 'are', 'doing', 'just', 'fine', '']

In [8]:
def calc_tf(terms,documents,method):
    tf_vector = np.zeros((len(documents),len(terms)))
    for d, document in enumerate(documents):
        for t, term in enumerate(terms):
            tf = method(term,document)
            tf_vector.itemset((d,t),tf)
    return tf_vector

in `tf_raw` operation, we use `raw count` to calculate `tf`
so that ${tf}_{(t,d)} = f_{(t,d)}$

In [9]:
def tf_raw(term, document):
    tf = 0
    for word in document:
        if word == term:
            tf += 1
    return tf

`tf_binary` tf is 1 if the term exist and 0 if term nonexistent on the document

In [10]:
def tf_binary(term,document):
    return 1 if term in document else 0 

`tf_termfrequency` 

$\mathrm{tf} = \frac{f_{(t,d)}}{\displaystyle\sum_{{w\in d}}{f_{(w,d)}}}$

In [11]:
def tf_termfrequency (term,document):
    tf = 0
    for word in document:
        if term == word:
            tf += 1
    return tf/len(document)

after choosing which tf method to use, you can calculate tf this way:

In [12]:
def calc_idf (terms,documents,method):
    idf_vector = np.zeros((1,len(terms)))
    for t, term in enumerate(terms):
        idf = method(term, documents)
        idf_vector.itemset((0,t),idf)
    return idf_vector

$\mathrm{idf}_{(t,D)} = \log_{10}\frac{N}{\mathit{df}} $

In [13]:
def idf_norm(term,documents):
    N = len(documents)
    df = 0
    for document in documents:
        if term in document:
            df += 1
    print(f"np.log10({N}/{np.abs(df)})")
    return np.log10(N/np.abs(df))

there's some chance the term (`term in terms`) is not in the corpus (`documents`) so you adjust the equation to avoid division by zero as such

$\mathrm{idf}_{(t,D)} = \log_{10}\frac{N}{\mathit{df}+1} $

In [14]:
def idf_smooth(term,documents):
    df = 0
    for document in documents:
        if term in document:
            df += 1
    return np.log((len(documents)+1)/(np.abs(df)+1))

In [15]:
tf_vector = calc_tf(terms,tokens,tf_termfrequency)
print(tf_vector)

[[0.14285714 0.14285714 0.14285714 0.14285714 0.42857143 0.        ]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667]]


In [16]:
idf_vector = calc_idf(terms,documents,idf_norm)
print(idf_vector)

np.log10(2/2)
np.log10(2/2)
np.log10(2/2)
np.log10(2/2)
np.log10(2/2)
np.log10(2/2)
[[0. 0. 0. 0. 0. 0.]]


In [17]:
def calculate_tfidf(tf_vector, idf_vector):
    tfidf_vector = np.ones(tf_vector.shape)
    for i in range(tf_vector.shape[0]):
        for j in range(tf_vector.shape[1]):
            tfidf = tf_vector.item((i,j)) * idf_vector.item((0,j))
            tfidf_vector.itemset((i,j),tfidf)
    return tfidf_vector

In [18]:
tfidf_v = calculate_tfidf(tf_vector,idf_vector)

In [19]:
np.dot(tfidf_v,tfidf_v.T)

array([[0., 0.],
       [0., 0.]])

In [25]:
np.log10(100)

2.0