Finding the Term Frequency (TF) word embeddings

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: each string is treated as one document.
corpus = [
    "this is a small example",
    "word embeddings are very useful",
    "we can use tensorflow to create embeddings",
]

# CountVectorizer tokenizes every document and counts raw term occurrences.
# Note: the printed vocabulary has no entry for the "a" in the first document,
# because the default token pattern only keeps tokens of two or more characters.
vectorizer = CountVectorizer()

# Learn the vocabulary and build the sparse document-term count matrix
# (one row per document, one column per vocabulary term).
X = vectorizer.fit_transform(corpus)

# Column labels, in the same order as the columns of X.
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the counts, made only so the whole matrix can be printed.
tf_array = X.toarray()

print("Vocabulary:", feature_names)
print("Term Frequency Matrix:\n", tf_array)


Vocabulary: ['are' 'can' 'create' 'embeddings' 'example' 'is' 'small' 'tensorflow'
 'this' 'to' 'use' 'useful' 'very' 'we' 'word']
Term Frequency Matrix:
 [[0 0 0 0 1 1 1 0 1 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 1 1 0 1]
 [0 1 1 1 0 0 0 1 0 1 1 0 0 1 0]]


Finding the Inverse Document Frequency (IDF) word embeddings

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: each string is treated as one document.
corpus = [
    "this is a small example",
    "word embeddings are very useful",
    "we can use tensorflow to create embeddings"
]

# Build the sparse document-term count matrix (rows = documents, columns = terms).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Document frequency: the number of documents in which each term occurs at
# least once. The comparison and the column sum are done on the sparse matrix
# itself, so the full dense array is never materialized just to count
# non-zeros. np.asarray(...).ravel() flattens the 1 x n_terms result into a
# plain 1-D array.
df = np.asarray((X > 0).sum(axis=0)).ravel()

# Smoothed inverse document frequency:
#     idf(t) = ln((1 + N) / (1 + df(t))) + 1
# The "+1" inside the ratio avoids division by zero; the trailing "+1" keeps a
# term that occurs in every document at weight 1 rather than 0. This is the
# same smoothed form scikit-learn documents for its TF-IDF transformer.
N = X.shape[0]  # number of documents
idf = np.log((N + 1) / (df + 1)) + 1

# Vocabulary terms, in the same order as the columns of X (and entries of idf).
feature_names = vectorizer.get_feature_names_out()

# Map each term to its IDF value.
idf_dict = dict(zip(feature_names, idf))

print("Vocabulary and IDF values:")
for term, idf_value in idf_dict.items():
    print(f"{term}: {idf_value:.4f}")


Vocabulary and IDF values:
are: 1.6931
can: 1.6931
create: 1.6931
embeddings: 1.2877
example: 1.6931
is: 1.6931
small: 1.6931
tensorflow: 1.6931
this: 1.6931
to: 1.6931
use: 1.6931
useful: 1.6931
very: 1.6931
we: 1.6931
word: 1.6931


Finding the Term Frequency-Inverse Document Frequency (TF-IDF) word embeddings

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: each string is treated as one document.
corpus = [
    "this is a small example",
    "word embeddings are very useful",
    "we can use tensorflow to create embeddings",
]

# TfidfVectorizer with default settings: counts terms, weights them by IDF,
# and (by default) L2-normalizes each document row. That normalization is why
# the four terms of the first document all print as 0.5.
vectorizer = TfidfVectorizer()

# Learn vocabulary and IDF weights, then produce the sparse TF-IDF matrix
# (one row per document, one column per vocabulary term).
X = vectorizer.fit_transform(corpus)

# Column labels, in the same order as the columns of X.
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the weights, made only so the whole matrix can be printed.
tfidf_array = X.toarray()

print("Vocabulary:", feature_names)
print("TF-IDF Matrix:\n", tfidf_array)


Vocabulary: ['are' 'can' 'create' 'embeddings' 'example' 'is' 'small' 'tensorflow'
 'this' 'to' 'use' 'useful' 'very' 'we' 'word']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.5        0.5
  0.5        0.         0.5        0.         0.         0.
  0.         0.         0.        ]
 [0.46735098 0.         0.         0.35543247 0.         0.
  0.         0.         0.         0.         0.         0.46735098
  0.46735098 0.         0.46735098]
 [0.         0.38988801 0.38988801 0.29651988 0.         0.
  0.         0.38988801 0.         0.38988801 0.38988801 0.
  0.         0.38988801 0.        ]]
