Finding the Term Frequency - TF- word embeddings

In [20]:
import math
from collections import defaultdict, Counter

# Sample corpus
corpus = [
    "He is Walter",
    "He is William",
    "He isn’t Peter or September"
]

# Tokenize the corpus
def tokenize(sentence):
    return sentence.split()

# Calculate term frequency (TF)
def compute_tf(corpus):
    tf = []
    for document in corpus:
        tokens = tokenize(document)
        counter = Counter(tokens)
        total_terms = len(tokens)
        tf_doc = {term: count / total_terms for term, count in counter.items()}
        tf.append(tf_doc)
    print(tf)
    return tf

# Calculate document frequency (DF)
def compute_df(corpus):
    df = defaultdict(int)
    for document in corpus:
        tokens = set(tokenize(document))
        for token in tokens:
            df[token] += 1
    print(df)
    return df

# Calculate inverse document frequency (IDF)
def compute_idf(corpus, df):
    idf = {}
    N = len(corpus)
    for term, count in df.items():
        idf[term] = math.log10(N / count)
    print(idf)
    return idf

# Calculate TF-IDF
def compute_tf_idf(tf, idf):
    tf_idf = []
    for tf_doc in tf:
        tf_idf_doc = {term: tf_val * idf[term] for term, tf_val in tf_doc.items()}
        tf_idf.append(tf_idf_doc)
    #print(tf_idf)
    return tf_idf

# Get vocabulary
def get_vocabulary(corpus):
    vocab = set()
    for document in corpus:
        tokens = tokenize(document)
        vocab.update(tokens)
    return sorted(vocab)

# Main function to calculate TF-IDF
def main(corpus):
    tf = compute_tf(corpus)
    df = compute_df(corpus)
    idf = compute_idf(corpus, df)
    tf_idf = compute_tf_idf(tf, idf)

    vocab = get_vocabulary(corpus)
    tf_idf_matrix = []

    for tf_idf_doc in tf_idf:
        row = []
        for term in vocab:
            row.append(tf_idf_doc.get(term, 0))
        tf_idf_matrix.append(row)

    return vocab, tf_idf_matrix

vocab, tf_idf_matrix = main(corpus)

#print("Vocabulary:", vocab)
#print("TF-IDF Matrix:")
#for row in tf_idf_matrix:
#    print(row)


[{'He': 0.3333333333333333, 'is': 0.3333333333333333, 'Walter': 0.3333333333333333}, {'He': 0.3333333333333333, 'is': 0.3333333333333333, 'William': 0.3333333333333333}, {'He': 0.2, 'isn’t': 0.2, 'Peter': 0.2, 'or': 0.2, 'September': 0.2}]
defaultdict(<class 'int'>, {'is': 2, 'Walter': 1, 'He': 3, 'William': 1, 'September': 1, 'Peter': 1, 'or': 1, 'isn’t': 1})
{'is': 0.17609125905568124, 'Walter': 0.47712125471966244, 'He': 0.0, 'William': 0.47712125471966244, 'September': 0.47712125471966244, 'Peter': 0.47712125471966244, 'or': 0.47712125471966244, 'isn’t': 0.47712125471966244}
TF-IDF Matrix:


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus
corpus = [
    "this is a small example",
    "word embeddings are very useful",
    "we can use tensorflow to create embeddings"
]

# Initialize the CountVectorizer with term frequency
vectorizer = CountVectorizer()

# Fit and transform the corpus to term frequency vectors
X = vectorizer.fit_transform(corpus)

# Convert the term frequency matrix to an array
tf_array = X.toarray()

# Get the feature names (vocabulary)
feature_names = vectorizer.get_feature_names_out()

print("Vocabulary:", feature_names)
print("Term Frequency Matrix:\n", tf_array)


Vocabulary: ['are' 'can' 'create' 'embeddings' 'example' 'is' 'small' 'tensorflow'
 'this' 'to' 'use' 'useful' 'very' 'we' 'word']
Term Frequency Matrix:
 [[0 0 0 0 1 1 1 0 1 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 1 1 0 1]
 [0 1 1 1 0 0 0 1 0 1 1 0 0 1 0]]


Finding the IDF word embeddings

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus
corpus = [
    "this is a small example",
    "word embeddings are very useful",
    "we can use tensorflow to create embeddings"
]

# Initialize the CountVectorizer to get the term-document matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Compute the document frequency for each term
df = np.sum(X.toarray() > 0, axis=0)

# Compute the inverse document frequency (IDF)
N = X.shape[0]
idf = np.log((N + 1) / (df + 1)) + 1

# Get the feature names (vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Create a dictionary to map terms to their IDF values
idf_dict = dict(zip(feature_names, idf))

print("Vocabulary and IDF values:")
for term, idf_value in idf_dict.items():
    print(f"{term}: {idf_value:.4f}")


Vocabulary and IDF values:
are: 1.6931
can: 1.6931
create: 1.6931
embeddings: 1.2877
example: 1.6931
is: 1.6931
small: 1.6931
tensorflow: 1.6931
this: 1.6931
to: 1.6931
use: 1.6931
useful: 1.6931
very: 1.6931
we: 1.6931
word: 1.6931


Finding the Term Frequency Inverse Document Frequency - TF-IDF - word embeddings

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus
corpus = [
    "this is a small example",
    "word embeddings are very useful",
    "we can use tensorflow to create embeddings"
]

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the corpus to TF-IDF vectors
X = vectorizer.fit_transform(corpus)

# Convert the TF-IDF matrix to an array
tfidf_array = X.toarray()

# Get the feature names (vocabulary)
feature_names = vectorizer.get_feature_names_out()

print("Vocabulary:", feature_names)
print("TF-IDF Matrix:\n", tfidf_array)


Vocabulary: ['are' 'can' 'create' 'embeddings' 'example' 'is' 'small' 'tensorflow'
 'this' 'to' 'use' 'useful' 'very' 'we' 'word']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.5        0.5
  0.5        0.         0.5        0.         0.         0.
  0.         0.         0.        ]
 [0.46735098 0.         0.         0.35543247 0.         0.
  0.         0.         0.         0.         0.         0.46735098
  0.46735098 0.         0.46735098]
 [0.         0.38988801 0.38988801 0.29651988 0.         0.
  0.         0.38988801 0.         0.38988801 0.38988801 0.
  0.         0.38988801 0.        ]]
