In [6]:
import math
# ---------------------------
# 1️⃣  Sample Corpus
# ---------------------------
docs = [
    "the sky is blue",
    "the sun is bright",
    "the sun in the sky is bright",
    "we can see the shining sun, the bright sun"
]

# Preprocess: simple tokenization
corpus = [doc.lower().replace(",", "").split() for doc in docs]
N = len(corpus)
print("Sample Corpus:\n", corpus)
print("Number of Documents:", N)

Sample Corpus:
 [['the', 'sky', 'is', 'blue'], ['the', 'sun', 'is', 'bright'], ['the', 'sun', 'in', 'the', 'sky', 'is', 'bright'], ['we', 'can', 'see', 'the', 'shining', 'sun', 'the', 'bright', 'sun']]
Number of Documents: 4


In [7]:
# ---------------------------
# 5️⃣ Calculate word counts
# ---------------------------
def count_words(doc):
    return len(doc)

word_counts = [count_words(doc) for doc in corpus]

print("\nWord counts per document:\n", word_counts)


Word counts per document:
 [4, 4, 7, 9]


In [8]:
# ---------------------------
# 6️⃣ Calculate unique word counts
# ---------------------------
def count_unique_words(doc):
    return len(set(doc))

unique_word_counts = [count_unique_words(doc) for doc in corpus]

print("\nUnique word counts per document:\n", unique_word_counts)

all_words = [word for doc in corpus for word in doc]
unique_words_in_corpus = len(set(all_words))

print("\nTotal number of unique words in the corpus:", unique_words_in_corpus)


Unique word counts per document:
 [4, 4, 6, 7]

Total number of unique words in the corpus: 11


In [2]:
# ---------------------------
# 2️⃣  Compute Term Frequency (TF)
# ---------------------------
def compute_tf(doc):
    tf = {}
    total_terms = len(doc)
    for term in doc:
        tf[term] = tf.get(term, 0) + 1
    for term in tf:
        tf[term] = tf[term] / total_terms   # normalize
    return tf

tf_docs = [compute_tf(doc) for doc in corpus]

print("TF Example for Doc 1:\n", tf_docs[0])

TF Example for Doc 1:
 {'the': 0.25, 'sky': 0.25, 'is': 0.25, 'blue': 0.25}


In [9]:
# ---------------------------
# 7️⃣ Print all TF values
# ---------------------------
print("\nTerm Frequency (TF) values for all documents:")
for i, tf_doc in enumerate(tf_docs):
    print(f"\nDocument {i+1}:")
    for term, value in tf_doc.items():
        print(f"  {term}: {round(value, 3)}")


Term Frequency (TF) values for all documents:

Document 1:
  the: 0.25
  sky: 0.25
  is: 0.25
  blue: 0.25

Document 2:
  the: 0.25
  sun: 0.25
  is: 0.25
  bright: 0.25

Document 3:
  the: 0.286
  sun: 0.143
  in: 0.143
  sky: 0.143
  is: 0.143
  bright: 0.143

Document 4:
  we: 0.111
  can: 0.111
  see: 0.111
  the: 0.222
  shining: 0.111
  sun: 0.222
  bright: 0.111


In [3]:
# ---------------------------
# 3️⃣  Compute Inverse Document Frequency (IDF)
# ---------------------------
def compute_idf(corpus):
    idf = {}
    all_terms = set(term for doc in corpus for term in doc)
    for term in all_terms:
        containing_docs = sum(1 for doc in corpus if term in doc)
        idf[term] = math.log10((N + 1) / (containing_docs + 1)) + 1  # This is add 1 smoothing, +1 smoothing
    return idf

idf = compute_idf(corpus)

print("\nSample IDF values:\n", {k: round(v, 3) for k, v in list(idf.items())[:5]})



Sample IDF values:
 {'is': 1.097, 'sun': 1.097, 'can': 1.398, 'bright': 1.097, 'sky': 1.222}


In [10]:
# ---------------------------
# 8️⃣ Print all IDF values
# ---------------------------
print("\nInverse Document Frequency (IDF) values for all terms:")
for term, value in idf.items():
    print(f"  {term}: {round(value, 3)}")


Inverse Document Frequency (IDF) values for all terms:
  is: 1.097
  sun: 1.097
  can: 1.398
  bright: 1.097
  sky: 1.222
  the: 1.0
  in: 1.398
  shining: 1.398
  see: 1.398
  blue: 1.398
  we: 1.398


In [4]:
# ---------------------------
# 4️⃣  Compute TF-IDF
# ---------------------------
def compute_tfidf(tf_doc, idf):
    tfidf = {}
    for term, tf_value in tf_doc.items():
        tfidf[term] = tf_value * idf.get(term, 0.0)
    return tfidf

tfidf_docs = [compute_tfidf(tf_doc, idf) for tf_doc in tf_docs]

print("\nTF-IDF Example for Doc 1:\n", {k: round(v, 3) for k, v in tfidf_docs[0].items()})



TF-IDF Example for Doc 1:
 {'the': 0.25, 'sky': 0.305, 'is': 0.274, 'blue': 0.349}


In [11]:
# ---------------------------
# 9️⃣ Print all TF-IDF values
# ---------------------------
print("\nTF-IDF values for all documents:")
for i, tfidf_doc in enumerate(tfidf_docs):
    print(f"\nDocument {i+1}:")
    for term, value in tfidf_doc.items():
        print(f"  {term}: {round(value, 3)}")


TF-IDF values for all documents:

Document 1:
  the: 0.25
  sky: 0.305
  is: 0.274
  blue: 0.349

Document 2:
  the: 0.25
  sun: 0.274
  is: 0.274
  bright: 0.274

Document 3:
  the: 0.286
  sun: 0.157
  in: 0.2
  sky: 0.175
  is: 0.157
  bright: 0.157

Document 4:
  we: 0.155
  can: 0.155
  see: 0.155
  the: 0.222
  shining: 0.155
  sun: 0.244
  bright: 0.122


In [12]:
# ---------------------------
# 10️⃣ Create Vocabulary
# ---------------------------
all_words = sorted(list(set(word for doc in corpus for word in doc)))
vocab_size = len(all_words)

print("Vocabulary size:", vocab_size)
# print("Vocabulary:", all_words)

Vocabulary size: 11


In [13]:
# ---------------------------
# 11️⃣ Create TF-IDF Document Vectors
# ---------------------------
import numpy as np

tfidf_vectors = []
for tfidf_doc in tfidf_docs:
    vector = np.zeros(vocab_size)
    for i, term in enumerate(all_words):
        vector[i] = tfidf_doc.get(term, 0.0) # Get TF-IDF value, or 0 if term not in doc
    tfidf_vectors.append(vector)

# Convert to a NumPy array for easier manipulation
tfidf_vectors = np.array(tfidf_vectors)

print("\nTF-IDF Document Vectors (shape):", tfidf_vectors.shape)
# print("\nTF-IDF Document Vectors:\n", tfidf_vectors)


TF-IDF Document Vectors (shape): (4, 11)


In [14]:
# ---------------------------
# 12️⃣ Print TF-IDF Document Vectors
# ---------------------------
print("\nTF-IDF Document Vectors:\n", tfidf_vectors)


TF-IDF Document Vectors:
 [[0.349485   0.         0.         0.         0.2742275  0.
  0.         0.30546219 0.         0.25       0.        ]
 [0.         0.2742275  0.         0.         0.2742275  0.
  0.         0.         0.2742275  0.25       0.        ]
 [0.         0.15670143 0.         0.19970572 0.15670143 0.
  0.         0.17454982 0.15670143 0.28571429 0.        ]
 [0.         0.12187889 0.15532667 0.         0.         0.15532667
  0.15532667 0.         0.24375778 0.22222222 0.15532667]]


The `tfidf_vectors` array now contains the TF-IDF vector representation for each document. Each row corresponds to a document, and each column corresponds to a unique word in the vocabulary.

 The zeros in the TF-IDF document vectors indicate that a particular word from the overall vocabulary does not appear in that specific document.

The TF-IDF value for a term in a document is calculated as:

TF-IDF = Term Frequency (TF) * Inverse Document Frequency (IDF)

Let's look at why the product can become zero:

Term Frequency (TF): This is the number of times a term appears in a document, divided by the total number of terms in that document. If a term does not appear in a document at all, its TF is 0.
Inverse Document Frequency (IDF): This measures how common or rare a term is across the entire corpus. Even if a term appears in only one document, its IDF value will be greater than 0 (due to the smoothing added in the calculation math.log10((N + 1) / (containing_docs + 1)) + 1).
Therefore, the product of TF and IDF becomes 0 only when the Term Frequency (TF) is 0. This happens when a word is part of the overall vocabulary but does not exist in a particular document. The zero in the vector at the position corresponding to that word simply reflects its absence in that document.

In summary, the zeros in the TF-IDF matrix represent the absence of a specific word in a specific document.