# Implementation

In [121]:
from datasets import load_dataset
from collections import OrderedDict
from rich.pretty import pprint
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [122]:
# Toy corpus: five short documents used throughout the walkthrough.
corpus = [
    "The sun is the largest celestial body in the solar system",
    "The solar system consists of the sun and eight revolving planets",
    "Ra was the Egyptian Sun God",
    "The Pyramids were the pinnacle of Egyptian architecture",
    "The quick brown fox jumps over the lazy dog",
]

# One label per document, in corpus order: doc_1, doc_2, ...
corpus_names = [f"doc_{position}" for position in range(1, len(corpus) + 1)]

num_documents = len(corpus)

# Vocabulary = every whitespace-delimited token, lower-cased,
# de-duplicated and sorted alphabetically.
vocabulary = sorted({token.lower() for document in corpus for token in document.split()})
num_vocabs = len(vocabulary)

print(f"Number of documents: {num_documents}")
print(f"Number of unique words: {num_vocabs}")

Number of documents: 5
Number of unique words: 30


In [123]:
# Reference implementation from scikit-learn: raw term counts over our fixed
# vocabulary, followed by TF-IDF with no normalisation, no idf smoothing and
# no sublinear (log) term-frequency scaling.
pipe = Pipeline(
    steps=[
        ("count", CountVectorizer(vocabulary=vocabulary)),
        ("tfid", TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=False)),
    ]
)
pipe.fit(corpus)

# Dense documents x words matrix of raw counts.
count = pipe.named_steps["count"].transform(corpus).toarray()
print(count)


# Per-word idf weights learned by the fitted transformer.
idf = pipe.named_steps["tfid"].idf_
print(idf)


# Full TF-IDF matrix, transposed for display so rows are words and columns are documents.
tfidf_matrix = pipe.transform(corpus)
print(tfidf_matrix.toarray().T)

[[0 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 1 3 0 0]
 [1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 1 1 2 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 2 0 1]
 [0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 2 0 0]]
[2.60943791 2.60943791 2.60943791 2.60943791 2.60943791 2.60943791
 2.60943791 1.91629073 2.60943791 2.60943791 2.60943791 2.60943791
 2.60943791 2.60943791 2.60943791 2.60943791 1.91629073 2.60943791
 2.60943791 2.60943791 2.60943791 2.60943791 2.60943791 2.60943791
 1.91629073 1.51082562 1.91629073 1.         2.60943791 2.60943791]
[[0.         2.60943791 0.         0.         0.        ]
 [0.         0.         0.         2.60943791 0.        ]
 [2.60943791 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         2.60943791]
 [2.60943791 0.         0.         0.         0.        ]
 [0.         2.60943791 0.         0.         0.        ]


The code below creates an ordered dictionary `word_freq` to store the frequency of each word in each document in the corpus. The structure of the dictionary is `{word: {doc_1: freq, doc_2: freq, ...}, ...}`. 

1. The code loops through each word in the vocabulary, and for each word, it checks if it is already in the `word_freq` dictionary. If it is not, it adds the word to the dictionary. 

2. It then loops through each document and for each document, it checks if the document name is already in the `word_freq[word]` dictionary. If it is not, it adds the document name to the dictionary. 

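3. Finally, it counts the number of times the word appears in the document by lower-casing the document, splitting it on whitespace, and calling the list `count()` method on the resulting tokens (so only whole-word matches are counted), and adds this count to the corresponding entry in the `word_freq` dictionary. 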

4. The `pprint()` function from the `rich.pretty` module is then used to print the dictionary `word_freq` in a readable format.


In [124]:
# Nested ordered mapping of per-document word counts.
word_freq = OrderedDict()  # {word: {doc_1: freq, doc_2: freq, ...}, ...}

for word in vocabulary:

    # Fetch this word's per-document counter, creating it the first time the word is seen
    per_doc_counts = word_freq.setdefault(word, OrderedDict())

    for doc_name, doc in zip(corpus_names, corpus):

        # Make sure the document has an entry (starting at zero) before accumulating
        per_doc_counts.setdefault(doc_name, 0)

        # Lower-case and split on whitespace so that only whole-token matches are counted
        per_doc_counts[doc_name] += doc.lower().split().count(word)

# Pretty-print the result as a plain dict keyed by word
pprint(dict(word_freq))

5. Convert the dictionary `word_freq` to a Pandas DataFrame. The `transpose()` method is used to transpose the DataFrame, so that the document names are the column names and the words are the row names.

In [125]:
# pd.DataFrame(word_freq) puts words in the columns and document names in the
# index; transposing gives one row per word and one column per document.
df = (
    pd.DataFrame(word_freq)
    .fillna(0)
    .astype(int)
    .transpose()
)
df

Unnamed: 0,doc_1,doc_2,doc_3,doc_4,doc_5
and,0,1,0,0,0
architecture,0,0,0,1,0
body,1,0,0,0,0
brown,0,0,0,0,1
celestial,1,0,0,0,0
consists,0,1,0,0,0
dog,0,0,0,0,1
egyptian,0,0,1,1,0
eight,0,1,0,0,0
fox,0,0,0,0,1


6. Convert the DataFrame to a NumPy array, essentially turning 
the corpus/df into a matrix of shape $n_{words} \times n_{docs}$.

In [126]:
# Words x documents count matrix as a plain NumPy array.
X = df.to_numpy()
print(f"Word frequency array shape: {X.shape}")

Word frequency array shape: (30, 5)


6. Sanity Check: We check our implementation by comparing the results to the results from the `CountVectorizer` class in the `sklearn.feature_extraction.text` module. Note
that the `CountVectorizer` will return a sparse matrix, so we need to convert it to a dense matrix using the `toarray()` method. Furthermore, the matrix is in the shape
$n_{docs} \times n_{words}$, so we need to transpose it using the `T` attribute.

In [127]:
# Re-fit the scikit-learn reference pipeline (counts -> un-normalised, un-smoothed TF-IDF).
pipe = Pipeline(
    steps=[
        ("count", CountVectorizer(vocabulary=vocabulary)),
        ("tfid", TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=False)),
    ]
)
pipe.fit(corpus)

# Dense documents x words counts produced by CountVectorizer.
count = pipe.named_steps["count"].transform(corpus).toarray()


# Our matrix X is words x documents, so compare it against the transposed sklearn counts.
assert np.array_equal(
    X, count.T
), "The word frequency array does not match the count array"

7. Create `X_tf` which holds the term frequency values for each word in each document.
   
   $$
   \operatorname{tf}_{t, d} = \log_{10}(\text{count}_{t, d} + 1)
   $$
   
   - Create a new numpy array `X_tf` with the same shape as `X` but with a float data type; its entries are uninitialized until they are filled in below.
   - Loop through each row and its index in `X` using the `enumerate()` function.
   - Calculate the term frequency of each word using the formula `tf = log10(each_row + 1)`. Here `each_row` holds the counts of one word $t$ in every document $d$, so applying `np.log10()` to the whole row computes $\operatorname{tf}_{t, d}$ for all documents at once.
   - Assign the calculated term frequency values to the corresponding row in `X_tf` using row_index.



In [128]:
# Float array shaped like X; every row is overwritten in the loop below.
X_tf = np.empty_like(X, dtype=float)
for row_index, each_row in enumerate(X):
    # each_row holds one word's counts across all documents.
    # The +1 keeps absent words finite: log10(0 + 1) == 0.
    X_tf[row_index] = np.log10(each_row + 1)

8. Create `X_idf` which holds the inverse document frequency values for each word in the corpus.
   
   $$
   \text{idf}_{t} = \ln\left(\frac{n_{docs}}{\text{df}_{t}}\right) + 1
   $$
   
   - Create a new 1-D numpy array `X_idf` of length `num_vocabs` (one entry per word), initialized with zeros.
   - Loop through each row (one word) and its index in `X` using the `enumerate()` function.
   - Calculate the inverse document frequency of the word using the formula `idf = ln(n_docs / df) + 1`, where `df` is the document frequency of the word, i.e. the number of documents in which it appears. This is the formula `TfidfTransformer` uses when `smooth_idf=False`, which is why the result can be compared against sklearn below.
   - Assign the calculated inverse document frequency value to the corresponding entry in `X_idf` using `row_index`.

In [129]:

# One idf weight per vocabulary word (i.e. per row of X).
X_idf = np.zeros(num_vocabs, dtype=float)

for row_index, each_row in enumerate(X):

    # Document frequency: in how many documents does this word occur at least once?
    # Named `doc_freq` rather than `df` so the word-count DataFrame `df` is not
    # overwritten and earlier cells can still be re-run.
    doc_freq = np.count_nonzero(each_row)
    # Natural-log idf with a +1 offset, the unsmoothed form the sklearn check compares against.
    # e.g. with 4 documents and a word that appears in 2 of them: ln(4 / 2) + 1 ≈ 1.693
    idf = np.log(num_documents / doc_freq) + 1
    X_idf[row_index] = idf

In [130]:

# Display the hand-computed idf weights (one entry per vocabulary word).
X_idf

array([2.60943791, 2.60943791, 2.60943791, 2.60943791, 2.60943791,
       2.60943791, 2.60943791, 1.91629073, 2.60943791, 2.60943791,
       2.60943791, 2.60943791, 2.60943791, 2.60943791, 2.60943791,
       2.60943791, 1.91629073, 2.60943791, 2.60943791, 2.60943791,
       2.60943791, 2.60943791, 2.60943791, 2.60943791, 1.91629073,
       1.51082562, 1.91629073, 1.        , 2.60943791, 2.60943791])

In [131]:
# idf weights learned by the fitted sklearn TfidfTransformer step.
idf = pipe["tfid"].idf_
print(idf)

# The hand-computed weights must agree with sklearn's (up to floating-point tolerance).
assert np.allclose(
    X_idf, idf
), "The manually computed idf array does not match sklearn's idf_ array"

[2.60943791 2.60943791 2.60943791 2.60943791 2.60943791 2.60943791
 2.60943791 1.91629073 2.60943791 2.60943791 2.60943791 2.60943791
 2.60943791 2.60943791 2.60943791 2.60943791 1.91629073 2.60943791
 2.60943791 2.60943791 2.60943791 2.60943791 2.60943791 2.60943791
 1.91629073 1.51082562 1.91629073 1.         2.60943791 2.60943791]


In [132]:
# View the idf weights as a (num_vocabs, 1) column vector, the shape that
# broadcasts across the document columns of a words x documents matrix.
X_idf[:, np.newaxis]

array([[2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [1.91629073],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [1.91629073],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [2.60943791],
       [1.91629073],
       [1.51082562],
       [1.91629073],
       [1.        ],
       [2.60943791],
       [2.60943791]])

In [133]:
# Log-scaled variant: weight each word's log10 term frequencies by its idf.
# The idf column vector broadcasts across the document columns.
X_tfidf = X_tf * X_idf[:, np.newaxis]
(X_tfidf.shape, X_tfidf)

((30, 5),
 array([[0.        , 0.78551908, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.78551908, 0.        ],
        [0.78551908, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.78551908],
        [0.78551908, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.78551908, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.78551908],
        [0.        , 0.        , 0.57686099, 0.57686099, 0.        ],
        [0.        , 0.78551908, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.78551908],
        [0.        , 0.        , 0.78551908, 0.        , 0.        ],
        [0.78551908, 0.        , 0.        , 0.        , 0.        ],
        [0.78551908, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.78551908],
        [0

In [136]:
# Raw-count variant: weight each word's raw counts by its idf. This mirrors the
# sklearn pipeline configuration (sublinear_tf=False, norm=None).
X_tfidf = X * X_idf[:, np.newaxis]
(X_tfidf.shape, X_tfidf)

((30, 5),
 array([[0.        , 2.60943791, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 2.60943791, 0.        ],
        [2.60943791, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 2.60943791],
        [2.60943791, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 2.60943791, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 2.60943791],
        [0.        , 0.        , 1.91629073, 1.91629073, 0.        ],
        [0.        , 2.60943791, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 2.60943791],
        [0.        , 0.        , 2.60943791, 0.        , 0.        ],
        [2.60943791, 0.        , 0.        , 0.        , 0.        ],
        [2.60943791, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 2.60943791],
        [2

In [137]:
# TF-IDF from the fitted sklearn pipeline (sparse, documents x words).
tfidf_matrix = pipe.transform(corpus)

# Densify and transpose once so it lines up with our words x documents X_tfidf.
sklearn_tfidf = tfidf_matrix.toarray().T
print(sklearn_tfidf)
assert np.allclose(
    X_tfidf, sklearn_tfidf
), "The manually computed tfidf array does not match sklearn's tfidf matrix"

[[0.         2.60943791 0.         0.         0.        ]
 [0.         0.         0.         2.60943791 0.        ]
 [2.60943791 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         2.60943791]
 [2.60943791 0.         0.         0.         0.        ]
 [0.         2.60943791 0.         0.         0.        ]
 [0.         0.         0.         0.         2.60943791]
 [0.         0.         1.91629073 1.91629073 0.        ]
 [0.         2.60943791 0.         0.         0.        ]
 [0.         0.         0.         0.         2.60943791]
 [0.         0.         2.60943791 0.         0.        ]
 [2.60943791 0.         0.         0.         0.        ]
 [2.60943791 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         2.60943791]
 [2.60943791 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         2.60943791]
 [0.         1.91629073 0.         1.91629073 0.        ]
 [0.         0