In [None]:
from collections import defaultdict
import string

# Toy corpus: three unrelated English sentences used by every example below.
corpus = [
    (
        "The sun dipped below the horizon, "
        "casting a warm golden glow across the ocean waves."
    ),
    (
        "Advancements in artificial intelligence have revolutionized "
        "industries ranging from healthcare to finance."
    ),
    (
        "She glanced at the clock, "
        "realizing she only had ten minutes to prepare for the meeting."
    ),
]

In [None]:
def preprocess(text):
    """Lower-case ``text``, delete ASCII punctuation, and split on whitespace.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        A list of token strings. Punctuation characters are removed rather
        than replaced by a space, so the pieces on either side of a
        punctuation mark are joined (``"well-known"`` -> ``"wellknown"``).
    """
    # Translation table that maps every character in string.punctuation to None.
    drop_punctuation = str.maketrans("", "", string.punctuation)
    return text.lower().translate(drop_punctuation).split()

# Tokenize every sentence of the corpus, keeping the sentence order.
proc_corpus = list(map(preprocess, corpus))
print(proc_corpus)

[['the', 'sun', 'dipped', 'below', 'the', 'horizon', 'casting', 'a', 'warm', 'golden', 'glow', 'across', 'the', 'ocean', 'waves'], ['advancements', 'in', 'artificial', 'intelligence', 'have', 'revolutionized', 'industries', 'ranging', 'from', 'healthcare', 'to', 'finance'], ['she', 'glanced', 'at', 'the', 'clock', 'realizing', 'she', 'only', 'had', 'ten', 'minutes', 'to', 'prepare', 'for', 'the', 'meeting']]


In [None]:
# Vocabulary = the distinct tokens across all sentences, sorted so that every
# word has a stable position in the vectors built from it.
vocab = sorted({token for tokens in proc_corpus for token in tokens})

print("Vocab:", vocab)
print("Vocab length is", len(vocab))

Vocab: ['a', 'across', 'advancements', 'artificial', 'at', 'below', 'casting', 'clock', 'dipped', 'finance', 'for', 'from', 'glanced', 'glow', 'golden', 'had', 'have', 'healthcare', 'horizon', 'in', 'industries', 'intelligence', 'meeting', 'minutes', 'ocean', 'only', 'prepare', 'ranging', 'realizing', 'revolutionized', 'she', 'sun', 'ten', 'the', 'to', 'warm', 'waves']
Vocab length is 37


In [None]:
def create_bow(sentence, vocab):
    """Return the bag-of-words count vector of ``sentence`` over ``vocab``.

    Args:
        sentence: Iterable of tokens (hashable values such as strings).
        vocab: Sequence of vocabulary words. Position ``i`` of the result
            counts the occurrences of ``vocab[i]`` in ``sentence``.

    Returns:
        A list of ints with ``len(vocab)`` entries. Tokens that do not appear
        in ``vocab`` are ignored.
    """
    # Build the word -> position map once, so each token costs a single dict
    # lookup instead of a linear ``in`` scan followed by a linear ``index``.
    # ``setdefault`` keeps the first position of a repeated vocabulary word,
    # which is the position ``vocab.index`` would have returned.
    position = {}
    for idx, word in enumerate(vocab):
        position.setdefault(word, idx)

    vector = [0] * len(vocab)
    for word in sentence:
        idx = position.get(word)
        if idx is not None:
            vector[idx] += 1
    return vector


# Encode each tokenized sentence against the shared vocabulary.
bow_vector = [create_bow(tokens, vocab) for tokens in proc_corpus]

# Print one count vector per line, in the order of proc_corpus.
print("BOW Vectors:")
for vector in bow_vector:
    print(vector)

BOW Vectors:
[1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 1, 1]
[0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]
[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 2, 0, 1, 2, 1, 0, 0]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# The same three sentences used for the hand-written bag-of-words above.
corpus = [
    (
        "The sun dipped below the horizon, "
        "casting a warm golden glow across the ocean waves."
    ),
    (
        "Advancements in artificial intelligence have revolutionized "
        "industries ranging from healthcare to finance."
    ),
    (
        "She glanced at the clock, "
        "realizing she only had ten minutes to prepare for the meeting."
    ),
]

# Fit the vectorizer on the raw sentences and get the count matrix in one call.
# NOTE: with default settings the learned vocabulary has no single-character
# tokens, so 'a' from the hand-built vocabulary is absent from the output.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print("Vocubulary:", vectorizer.get_feature_names_out())
print("BoW Representation")
print(X.toarray())

Vocubulary: ['across' 'advancements' 'artificial' 'at' 'below' 'casting' 'clock'
 'dipped' 'finance' 'for' 'from' 'glanced' 'glow' 'golden' 'had' 'have'
 'healthcare' 'horizon' 'in' 'industries' 'intelligence' 'meeting'
 'minutes' 'ocean' 'only' 'prepare' 'ranging' 'realizing' 'revolutionized'
 'she' 'sun' 'ten' 'the' 'to' 'warm' 'waves']
BoW Representation
[[1 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 3 0 1 1]
 [0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0]
 [0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 0 2 0 1 2 1 0 0]]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The same three sentences used in the bag-of-words examples above.
corpus = [
    (
        "The sun dipped below the horizon, "
        "casting a warm golden glow across the ocean waves."
    ),
    (
        "Advancements in artificial intelligence have revolutionized "
        "industries ranging from healthcare to finance."
    ),
    (
        "She glanced at the clock, "
        "realizing she only had ten minutes to prepare for the meeting."
    ),
]

# Fit the TF-IDF vectorizer on the raw sentences and get the weighted matrix
# in one call (default settings).
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

print("Vocubulary:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Represenation")
print(X_tfidf.toarray())

Vocubulary: ['across' 'advancements' 'artificial' 'at' 'below' 'casting' 'clock'
 'dipped' 'finance' 'for' 'from' 'glanced' 'glow' 'golden' 'had' 'have'
 'healthcare' 'horizon' 'in' 'industries' 'intelligence' 'meeting'
 'minutes' 'ocean' 'only' 'prepare' 'ranging' 'realizing' 'revolutionized'
 'she' 'sun' 'ten' 'the' 'to' 'warm' 'waves']
TF-IDF Represenation
[[0.2484091  0.         0.         0.         0.2484091  0.2484091
  0.         0.2484091  0.         0.         0.         0.
  0.2484091  0.2484091  0.         0.         0.         0.2484091
  0.         0.         0.         0.         0.         0.2484091
  0.         0.         0.         0.         0.         0.
  0.2484091  0.         0.56676458 0.         0.2484091  0.2484091 ]
 [0.         0.29388386 0.29388386 0.         0.         0.
  0.         0.         0.29388386 0.         0.29388386 0.
  0.         0.         0.         0.29388386 0.29388386 0.
  0.29388386 0.29388386 0.29388386 0.         0.         0.
  0.    