In [1]:
import numpy as np
import pandas as pd

# Given documents
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great"
]

# Word set of the corpus
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus:\n', words_set)

# Initialize the term frequency (TF) matrix
n_docs = len(corpus)
n_words_set = len(words_set)
tf_matrix = np.zeros((n_docs, n_words_set))

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ')
    total_words = len(words)
    for j, w in enumerate(words_set):
        tf_matrix[i, j] = words.count(w) / total_words

print("\nTerm Frequency (TF):\n", pd.DataFrame(tf_matrix, columns=list(words_set)))

# Compute Inverse Document Frequency (IDF)
idf = {}
for w in words_set:
    k = 0
    for i in range(n_docs):
        if w in corpus[i]:
            print(f"word {w} is present in {corpus[i]}")
            k = k+1     
    #print(f"the given word is {w} and k is {k}")
    idf[w] = np.log10(n_docs / k) if k > 0 else 0

print("\nInverse Document Frequency (IDF):")
for w, value in idf.items():
    print(f"{w:>15}: {value:.4f}")

# Compute TF-IDF
tfidf_matrix = tf_matrix * np.array(list(idf.values()))

print("\nTF-IDF:\n", pd.DataFrame(tfidf_matrix, columns=list(words_set)))


Number of words in the corpus: 12
The words in the corpus:
 {'sat', 'and', 'cats', 'dog', 'dogs', 'great', 'mat', 'cat', 'log', 'are', 'the', 'on'}

Term Frequency (TF):
         sat  and  cats       dog  dogs  great       mat       cat       log  \
0  0.166667  0.0   0.0  0.000000   0.0    0.0  0.166667  0.166667  0.000000   
1  0.166667  0.0   0.0  0.166667   0.0    0.0  0.000000  0.000000  0.166667   
2  0.000000  0.2   0.2  0.000000   0.2    0.2  0.000000  0.000000  0.000000   

   are       the        on  
0  0.0  0.333333  0.166667  
1  0.0  0.333333  0.166667  
2  0.2  0.000000  0.000000  
word sat is present in the cat sat on the mat
word sat is present in the dog sat on the log
word and is present in cats and dogs are great
word cats is present in cats and dogs are great
word dog is present in the dog sat on the log
word dog is present in cats and dogs are great
word dogs is present in cats and dogs are great
word great is present in cats and dogs are great
word mat is present