In [9]:
# Rebuilding Bag of Words manually

In [10]:
# Toy corpus: four short messages with mixed case and punctuation,
# used as the input to every preprocessing step below.
texts = [
    "Hello, how are you!",
    "Win money, win from home.",
    "Call me now.",
    "Hello, Call hello you tomorrow?",
]

In [11]:
# Normalise case so that e.g. "Hello" and "hello" are treated as the same word
texts_lower = list(map(str.lower, texts))
print("Lowercased:", texts_lower)

Lowercased: ['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [12]:
# Strip punctuation
import string

# Translation table that deletes every character listed in string.punctuation
punctuation_remover = str.maketrans('', '', string.punctuation)
clean_texts = [text.translate(punctuation_remover) for text in texts_lower]
print("No punctuation:", clean_texts)

No punctuation: ['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [13]:
# Split each cleaned sentence on runs of whitespace to get one token list per document
token_lists = list(map(str.split, clean_texts))
print("Tokenized:", token_lists)

Tokenized: [['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [14]:
# Build the bag of words: per-document token counts
from collections import Counter
import pprint

# One Counter per document, mapping each token to its number of occurrences
word_freq_list = list(map(Counter, token_lists))
pprint.pprint(word_freq_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [15]:
# Vectorizing with sklearn

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Same four-message corpus as the manual walkthrough, declared again here
# so the scikit-learn section has its input defined alongside it.
texts = [
    "Hello, how are you!",
    "Win money, win from home.",
    "Call me now.",
    "Hello, Call hello you tomorrow?",
]

In [17]:
# Learn the vocabulary from the corpus; fit() returns the fitted vectorizer itself
vectorizer = CountVectorizer().fit(texts)
print("Vocabulary:", vectorizer.get_feature_names_out())

Vocabulary: ['are' 'call' 'from' 'hello' 'home' 'how' 'me' 'money' 'now' 'tomorrow'
 'win' 'you']


In [18]:
# Encode every document as a row of per-term counts using the fitted vocabulary
bow_sparse = vectorizer.transform(texts)
# Expand to a dense array so the full matrix can be printed
bow_array = bow_sparse.toarray()
print("BOW Array:\n", bow_array)

BOW Array:
 [[1 0 0 1 0 1 0 0 0 0 0 1]
 [0 0 1 0 1 0 0 1 0 0 2 0]
 [0 1 0 0 0 0 1 0 1 0 0 0]
 [0 1 0 2 0 0 0 0 0 1 0 1]]


In [19]:
# Wrap the count matrix in a labelled table:
# one row per source sentence, one column per vocabulary term
bow_df = pd.DataFrame(
    data=bow_array,
    columns=vectorizer.get_feature_names_out(),
    index=texts,
)
bow_df

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
"Hello, how are you!",1,0,0,1,0,1,0,0,0,0,0,1
"Win money, win from home.",0,0,1,0,1,0,0,1,0,0,2,0
Call me now.,0,1,0,0,0,0,1,0,1,0,0,0
"Hello, Call hello you tomorrow?",0,1,0,2,0,0,0,0,0,1,0,1
