Finding the Term Frequency (TF) and Inverse Document Frequency (IDF) word embeddings

In [None]:
from collections import defaultdict

# Sample corpus: three short sentences, one document per list entry.
# The last sentence spells "isn’t" with a typographic apostrophe (’).
corpus = [
    "He is Walter",
    "He is William",
    "He isn’t Peter or September"
]

# Function to tokenize sentences
def tokenize(sentence):
    """Split a sentence into tokens on runs of whitespace.

    No lowercasing or punctuation stripping is applied, so tokens keep
    their original case and any attached punctuation.

    Args:
        sentence: The text to split.

    Returns:
        list[str]: The whitespace-separated tokens, in order of appearance.
    """
    tokens = sentence.split()
    return tokens

# Function to calculate term frequency
def compute_tf(corpus):
    """Compute the relative term frequency of every token in each document.

    The term frequency of a token is the number of times it occurs in a
    document divided by the total number of tokens in that document.

    Args:
        corpus: An iterable of documents (strings). Each one is split into
            tokens with ``tokenize``.

    Returns:
        list[defaultdict]: One ``defaultdict(float)`` per document, in corpus
        order, mapping each token to its term frequency. Keys follow the
        order in which tokens first appear. A document without tokens yields
        an empty mapping.
    """
    tf = []
    for document in corpus:
        tokens = tokenize(document)
        total_terms = len(tokens)
        # Count occurrences first and divide once per term. Adding
        # 1.0 / total_terms once per occurrence accumulates rounding error
        # (e.g. 0.1 + 0.1 + 0.1 != 0.3).
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        tf_doc = defaultdict(float)
        for token, count in counts.items():
            tf_doc[token] = count / total_terms
        tf.append(tf_doc)
    return tf

# Term frequencies for the whole corpus, shown first as the raw structure
tf = compute_tf(corpus)
print("Term Frequency:")
print(tf)

# Then one readable listing per document, rounded to three decimals
for doc_index, doc_tf in enumerate(tf):
    print(f"Term Frequency for document {doc_index + 1}:")
    for term, freq in doc_tf.items():
        print(f"  {term}: {freq:.3f}")

Term Frequency:
[defaultdict(<class 'float'>, {'He': 0.3333333333333333, 'is': 0.3333333333333333, 'Walter': 0.3333333333333333}), defaultdict(<class 'float'>, {'He': 0.3333333333333333, 'is': 0.3333333333333333, 'William': 0.3333333333333333}), defaultdict(<class 'float'>, {'He': 0.2, 'isn’t': 0.2, 'Peter': 0.2, 'or': 0.2, 'September': 0.2})]
Term Frequency for document 1:
  He: 0.333
  is: 0.333
  Walter: 0.333
Term Frequency for document 2:
  He: 0.333
  is: 0.333
  William: 0.333
Term Frequency for document 3:
  He: 0.200
  isn’t: 0.200
  Peter: 0.200
  or: 0.200
  September: 0.200


In [4]:
import math
from collections import defaultdict

# Sample corpus: three short sentences, one document per list entry.
# The last sentence spells "isn’t" with a typographic apostrophe (’).
corpus = [
    "He is Walter",
    "He is William",
    "He isn’t Peter or September"
]

# Function to tokenize sentences
def tokenize(sentence):
    """Split a sentence into tokens on runs of whitespace.

    No lowercasing or punctuation stripping is applied, so tokens keep
    their original case and any attached punctuation.

    Args:
        sentence: The text to split.

    Returns:
        list[str]: The whitespace-separated tokens, in order of appearance.
    """
    tokens = sentence.split()
    return tokens

# Function to calculate document frequency
def compute_df(corpus):
    """Count, for every term, the number of documents that contain it.

    A term is counted at most once per document, no matter how often it
    occurs there.

    Args:
        corpus: An iterable of documents (strings). Each one is split into
            tokens with ``tokenize``.

    Returns:
        defaultdict: A ``defaultdict(int)`` mapping each term to its document
        frequency. Keys follow the order in which terms first appear in the
        corpus.
    """
    df = defaultdict(int)
    for document in corpus:
        # dict.fromkeys de-duplicates like set() but keeps first-occurrence
        # order, so the key order of the result (and anything printed from
        # it) does not vary with string-hash randomization between runs.
        for token in dict.fromkeys(tokenize(document)):
            df[token] += 1
    return df

# Function to calculate inverse document frequency
def compute_idf(corpus, df):
    """Compute the inverse document frequency (IDF) of every term.

    Uses the unsmoothed base-10 formula ``log10(N / df(term))``, where ``N``
    is the number of documents in ``corpus``.

    Args:
        corpus: The collection of documents; only its length is used.
        df: Mapping from term to the number of documents containing it.

    Returns:
        dict: Mapping from term to its IDF value, in the iteration order of
        ``df``. Terms whose document frequency is not positive are omitted.
    """
    N = len(corpus)
    idf = {}
    for term, count in df.items():
        if count <= 0:
            # A defaultdict(int) gains a zero-valued entry whenever an unseen
            # term is looked up. IDF is undefined for such terms, so skip
            # them instead of failing with a division by zero.
            continue
        idf[term] = math.log10(N / count)
    return idf

# Document frequency of every term, then the IDF derived from it
df = compute_df(corpus)
idf = compute_idf(corpus, df)

# Report each term's IDF to four decimal places
for term in idf:
    print(f"IDF('{term}') = {idf[term]:.4f}")


IDF('He') = 0.0000
IDF('Walter') = 0.4771
IDF('is') = 0.1761
IDF('William') = 0.4771
IDF('or') = 0.4771
IDF('Peter') = 0.4771
IDF('September') = 0.4771
IDF('isn’t') = 0.4771


Finding the Term Frequency–Inverse Document Frequency (TF-IDF) word embeddings

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus: three short sentences, one document per list entry.
# The last sentence spells "isn’t" with a typographic apostrophe (’).
corpus = [
    "He is Walter",
    "He is William",
    "He isn’t Peter or September"
]

# Build the document-term count matrix. With its default settings
# CountVectorizer lowercases the text and drops one-character tokens, which
# is why "isn’t" shows up in the vocabulary as "isn".
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Document frequency: number of documents with a non-zero count for each term
df = (X.toarray() > 0).sum(axis=0)

# Smoothed IDF, ln((N + 1) / (df + 1)) + 1, with N the number of documents.
# This is the formula scikit-learn's TfidfTransformer uses when smooth_idf=True.
N = X.shape[0]
idf = 1 + np.log((1 + N) / (1 + df))

# Pair every vocabulary term with its IDF value
feature_names = vectorizer.get_feature_names_out()
idf_dict = dict(zip(feature_names, idf))

print("Vocabulary and IDF values:")
for term, idf_value in idf_dict.items():
    print(f"{term}: {idf_value:.4f}")


Vocabulary and IDF values:
he: 1.0000
is: 1.2877
isn: 1.6931
or: 1.6931
peter: 1.6931
september: 1.6931
walter: 1.6931
william: 1.6931
