In [130]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [131]:
# Toy corpus: five short sentences with heavily overlapping vocabulary,
# stored one document per row in a single "text" column.
documents = [
    "dog bites man",
    "man bites dog dog dog",
    "dog eats meat",
    "man eats food",
    "man eats dude",
]
data = pd.DataFrame({"text": documents})
data

Unnamed: 0,text
0,dog bites man
1,man bites dog dog dog
2,dog eats meat
3,man eats food
4,man eats dude


In [132]:
# Reference result: fit scikit-learn's TfidfVectorizer (constructed with no
# arguments, i.e. all default parameters) on the "text" column.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data["text"])
# Convert to a plain ndarray so the weights can be shown as a DataFrame.
data_vectorized = tfidf_matrix.toarray()
# One column per vocabulary term, in the order the fitted vectorizer reports.
cols = vectorizer.get_feature_names_out()
data_auto = pd.DataFrame(data=data_vectorized, columns=cols)
data_auto

Unnamed: 0,bites,dog,dude,eats,food,man,meat
0,0.677803,0.562638,0.0,0.0,0.0,0.473309,0.0
1,0.360631,0.898069,0.0,0.0,0.0,0.251828,0.0
2,0.0,0.48624,0.0,0.48624,0.0,0.0,0.726044
3,0.0,0.0,0.0,0.503968,0.752515,0.423954,0.0
4,0.0,0.0,0.752515,0.503968,0.0,0.423954,0.0


In [133]:
# Tokenize every document into a list of words for the manual TF-IDF
# computation.
# str.split() with no argument splits on any run of whitespace and ignores
# leading/trailing blanks, so no empty-string tokens can slip into the
# vocabulary (split(" ") would turn "a  b" into ["a", "", "b"], and the ""
# token would then distort the per-document L2 norm).
corpus = [doc.split() for doc in data["text"]]
corpus

[['dog', 'bites', 'man'],
 ['man', 'bites', 'dog', 'dog', 'dog'],
 ['dog', 'eats', 'meat'],
 ['man', 'eats', 'food'],
 ['man', 'eats', 'dude']]

In [134]:
# Manual TF-IDF over the tokenized `corpus`: smoothed IDF, then per-document
# L2 normalization. The result is laid out like `data_auto` for comparison.

# Document frequency: in how many documents each word occurs. Computed once,
# in a single pass, instead of re-scanning the whole corpus for every
# (document, word) pair.
doc_freq = {}
for doc in corpus:
    for word in set(doc):  # set(): count each word at most once per document
        doc_freq[word] = doc_freq.get(word, 0) + 1

# Smoothed IDF: ln((N + 1) / (df + 1)) + 1, with N the number of documents.
n_docs = len(corpus)
idf_by_word = {
    word: np.log((n_docs + 1) / (df + 1)) + 1 for word, df in doc_freq.items()
}

tfidf_results = []

for doc in corpus:
    # TF is the relative frequency count / len(doc). The 1 / len(doc) factor
    # is the same for every word of a document, so it cancels out in the L2
    # normalization below and does not affect the final weights.
    result = {
        word: (doc.count(word) / len(doc)) * idf_by_word[word] for word in set(doc)
    }
    # Normalize the TF-IDF values using L2 norm
    norm = np.sqrt(sum(value**2 for value in result.values()))
    print(f"norm: {norm}")
    result = {word: value / norm for word, value in result.items()}
    tfidf_results.append(result)

# A word missing from a document has no key in that document's dict, which
# shows up as NaN in the frame; such words have weight 0.
data_manual = pd.DataFrame(tfidf_results).fillna(0)
# Put the columns in the same order as the scikit-learn result.
data_manual = data_manual[data_auto.columns]
data_manual

norm: 0.8326639851042533
norm: 0.9389913986759031
norm: 0.9634912141886107
norm: 0.9295990755827049
norm: 0.9295990755827049


Unnamed: 0,bites,dog,dude,eats,food,man,meat
0,0.677803,0.562638,0.0,0.0,0.0,0.473309,0.0
1,0.360631,0.898069,0.0,0.0,0.0,0.251828,0.0
2,0.0,0.48624,0.0,0.48624,0.0,0.0,0.726044
3,0.0,0.0,0.0,0.503968,0.752515,0.423954,0.0
4,0.0,0.0,0.752515,0.503968,0.0,0.423954,0.0


In [135]:
# Sanity check: the hand-computed TF-IDF frame must match the scikit-learn
# one; an AssertionError is raised if they differ.
pd.testing.assert_frame_equal(left=data_auto, right=data_manual)