# 1: Implement Bag of Words (BoW) with Python


- Note use custom stopword instead of nltk(NLTK stopwords list contains 179 words, including:'a', 'an', 'the', 'is', 'to', 'now', 'this', 'it', 'for', ...
  ) because we want the output to match the expected output exactly in the task.


In [3]:
import re

sentence1 = "Welcome to NLP Learning , Now start learning"
sentence2 = "Learning is a good practice"

# 1. Tokenize sentences (keep comma)
def tokenize(text):
    # keep words + comma
    return re.findall(r"[A-Za-z]+|,", text.lower())

tokens1 = tokenize(sentence1)
tokens2 = tokenize(sentence2)

print(tokens1)
print(tokens2)

# 2. Combined vocabulary (preserve ORDER)
vocab = []
for word in tokens1 + tokens2:
    if word not in vocab:
        vocab.append(word)

print(vocab)

# 3. Final vocabulary (remove punctuation + small stopwords 'to','is','a')
final_vocab = [w for w in vocab if w.isalpha() and w not in ['to','is','a']]
print(final_vocab)

# 4. Create Bag of Words vector
def bow_vector(tokens, vocab):
    return [tokens.count(word) for word in vocab]

v1 = bow_vector(tokens1, final_vocab)
v2 = bow_vector(tokens2, final_vocab)

print(v1)
print(v2)


['welcome', 'to', 'nlp', 'learning', ',', 'now', 'start', 'learning']
['learning', 'is', 'a', 'good', 'practice']
['welcome', 'to', 'nlp', 'learning', ',', 'now', 'start', 'is', 'a', 'good', 'practice']
['welcome', 'nlp', 'learning', 'now', 'start', 'good', 'practice']
[1, 1, 2, 1, 1, 0, 0]
[0, 0, 1, 0, 0, 1, 1]


# 2. Implement Bag of Words using SKLEARN


In [4]:
# Task 2: Bag of Words Implementation using SKLEARN
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Sample sentences
sentence1 = "This is a good job. I will not miss it for anything"
sentence2 = "This is not good at all"

# Create a CountVectorizer instance
vectorizer = CountVectorizer(vocabulary= ["good" , "job", "miss"])

# Fit and transform the sentences
corpus = [sentence1, sentence2]
X = vectorizer.fit_transform(corpus)

# Extract vocabulary and BoW vectors
vocabulary = vectorizer.get_feature_names_out()

result_df = pd.DataFrame(X.toarray(), 
                         columns= vectorizer.get_feature_names_out(), 
                         index=[0, 1])

print(result_df)



   good  job  miss
0     1    1     1
1     1    0     0


# 3. Implement Bag of words using NLTK


- this part use with nltk to automatically remove all stopwords


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

# 1. Define a list of sample documents
documents = [
    "I love natural language processing.",
    "Text classification is an important NLP task.",
    "NLTK provides useful tools for NLP."
]

# 2. Initialize a list to hold all processed tokens from all documents
all_tokens = []

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# 3. Process each document
for doc in documents:
    # Tokenize the document into words
    tokens = word_tokenize(doc)

    # Convert words to lowercase
    tokens = [word.lower() for word in tokens]

    # Remove punctuation and stopwords from the tokens
    # Also, remove any non-alphabetic characters
    filtered_tokens = [
        word for word in tokens
        if word.isalpha() and word not in stop_words
    ]

    # Add the filtered tokens to our master list
    all_tokens.extend(filtered_tokens)

# 4. Create a vocabulary by collecting all unique words, sorted alphabetically
vocabulary = sorted(list(set(all_tokens)))

# 5. Initialize a BoW dictionary with word counts set to 0
bow_representation = {word: 0 for word in vocabulary}

# 6. Iterate through the collected tokens and increment the count for each word
for token in all_tokens:
    bow_representation[token] += 1

# 7. Print the final Bag of Words representation
print("Bag of Words (BoW) representation:")
print(bow_representation)


Bag of Words (BoW) representation:
{'classification': 1, 'important': 1, 'language': 1, 'love': 1, 'natural': 1, 'nlp': 2, 'nltk': 1, 'processing': 1, 'provides': 1, 'task': 1, 'text': 1, 'tools': 1, 'useful': 1}


get this result becuase use funtion sort to sort alphabet


In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

# 1. Define a list of sample documents
documents = [
    "I love natural language processing.",
    "Text classification is an important NLP task.",
    "NLTK provides useful tools for NLP."
]

# 2. Initialize a list to hold all processed tokens from all documents
all_tokens = []

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# 3. Process each document
for doc in documents:
    # Tokenize the document into words
    tokens = word_tokenize(doc)

    # Convert words to lowercase
    tokens = [word.lower() for word in tokens]

    # Remove punctuation and stopwords from the tokens
    # Also, remove any non-alphabetic characters
    filtered_tokens = [
        word for word in tokens
        if word.isalpha() and word not in stop_words
    ]

    # Add the filtered tokens to our master list
    all_tokens.extend(filtered_tokens)

# 4. Create a vocabulary by collecting all unique words, do not sorted alphabetically
vocabulary = list(set(all_tokens))

# 5. Initialize a BoW dictionary with word counts set to 0
bow_representation = {word: 0 for word in vocabulary}

# 6. Iterate through the collected tokens and increment the count for each word
for token in all_tokens:
    bow_representation[token] += 1

# 7. Print the final Bag of Words representation
print("Bag of Words (BoW) representation:")
print(bow_representation)


Bag of Words (BoW) representation:
{'processing': 1, 'provides': 1, 'text': 1, 'natural': 1, 'nlp': 2, 'language': 1, 'useful': 1, 'nltk': 1, 'tools': 1, 'task': 1, 'important': 1, 'love': 1, 'classification': 1}


get the below output because do not use funtion sort to sort alphabet


# 4 Classify movie review is posi ve or nega ve using Bag of words for pre-processing the

text (from Sklearn) and apply with any models (RF, DT)


In [7]:
# Task 4: Classify Movie Reviews using Bag of Words and Sklearn

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
# Note: Replace the path with the actual path to the downloaded dataset
file_path = "IMDB Dataset.csv"
data = pd.read_csv(file_path)

# Step 2: Preprocess the text
def preprocess_text(text):
    """Tokenize, remove stopwords and punctuation, and convert to lowercase."""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation
    return text

data['review'] = data['review'].apply(preprocess_text)

# Step 3: Convert text to Bag of Words
vectorizer = CountVectorizer(stop_words='english', max_features=5000)  # Limit to top 5000 words
X = vectorizer.fit_transform(data['review'])
y = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  # Convert sentiment to binary

# Step 4: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8467
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85      4961
           1       0.85      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [12]:
sample_review_1 = ["This movie is terrible. I hated everything about it"]
sample_vec_1 = vectorizer.transform(sample_review_1)

print("Prediction:", model.predict(sample_vec_1))

Prediction: [0]


In [13]:
sample_review_2 = ["I absolutely loved this movie! The acting was amazing"]
sample_vec_2 = vectorizer.transform(sample_review_2)

print("Prediction:", model.predict(sample_vec_2))

Prediction: [1]
