In [5]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
# Fetch the NLTK resources used further down: 'punkt' (presumably the
# tokenizer data behind word_tokenize -- confirm for your NLTK version) and
# 'stopwords' (the corpus read through stopwords.words).
# The second call is left as the cell's last expression, so its return value
# is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/angel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/angel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# Download the 'punkt_tab' tokenizer tables (unzipped under tokenizers/).
#
# No manual nltk.data.path entry is added here: the previous entry
# "/home/angel/nltk_data..." was a user-specific absolute path with a literal
# "..." copied from the downloader's log line, so it never named a real
# directory, and every re-run of the cell appended it again. NLTK already
# searches the per-user ~/nltk_data directory, which is also where the
# downloader stores the package.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/angel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [15]:
# Two short product reviews used as the toy corpus for this walkthrough.
# Each literal is split per clause for readability; adjacent string literals
# are concatenated by Python, so the resulting text is unchanged.
review1 = (
    "The TechTrend X1 camera captures stunning photos, "
    "but the battery life could be better. "
    "I'm very impressed with the camera quality."
)
review2 = (
    "I'm disappointed with the TechTrend X1 battery life, "
    "although the camera quality is exceptional. "
    "However, the camera features are lacking."
)


In [17]:
# Tokenization: run NLTK's word tokenizer over each review, in order.
# Punctuation and contraction pieces (e.g. "'m") come back as their own tokens.
tokens1, tokens2 = map(nltk.word_tokenize, (review1, review2))

In [18]:
# Display the raw token list of the first review (before stop-word removal).
tokens1

['The',
 'TechTrend',
 'X1',
 'camera',
 'captures',
 'stunning',
 'photos',
 ',',
 'but',
 'the',
 'battery',
 'life',
 'could',
 'be',
 'better',
 '.',
 'I',
 "'m",
 'very',
 'impressed',
 'with',
 'the',
 'camera',
 'quality',
 '.']

In [19]:
# Stop word removal
# English stop words as a set, so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens whose lowercase form is not in ``stop_words``.

    Only the membership test is case-insensitive; surviving tokens keep their
    original casing. Punctuation tokens are not stop words, so they are kept.
    """
    return [token for token in tokens if token.lower() not in stop_words]


filtered_tokens1 = _drop_stop_words(tokens1)
filtered_tokens2 = _drop_stop_words(tokens2)

In [31]:
# Show both filtered token lists, one per line.
print(filtered_tokens1, filtered_tokens2, sep="\n")

['TechTrend', 'X1', 'camera', 'captures', 'stunning', 'photos', ',', 'battery', 'life', 'could', 'better', '.', "'m", 'impressed', 'camera', 'quality', '.']
["'m", 'disappointed', 'TechTrend', 'X1', 'battery', 'life', ',', 'although', 'camera', 'quality', 'exceptional', '.', 'However', ',', 'camera', 'features', 'lacking', '.']


In [36]:
# Number of distinct tokens across both filtered reviews (the vocabulary size).
len(set(filtered_tokens1) | set(filtered_tokens2))

21

In [22]:
# Create dictionary
# Each filtered review is one document; the gensim Dictionary assigns an
# integer id to every distinct token found across them.
documents = [filtered_tokens1, filtered_tokens2]
dictionary = Dictionary(documents=documents)

In [30]:
# Display the token -> integer id mapping held by the dictionary.
dictionary.token2id

{"'m": 0,
 ',': 1,
 '.': 2,
 'TechTrend': 3,
 'X1': 4,
 'battery': 5,
 'better': 6,
 'camera': 7,
 'captures': 8,
 'could': 9,
 'impressed': 10,
 'life': 11,
 'photos': 12,
 'quality': 13,
 'stunning': 14,
 'However': 15,
 'although': 16,
 'disappointed': 17,
 'exceptional': 18,
 'features': 19,
 'lacking': 20}

In [37]:
# Generate bag-of-words vectors
# doc2bow turns a token list into (token_id, count) pairs using the ids from
# `dictionary`; the reviews are converted in order.
bow_vector1, bow_vector2 = (
    dictionary.doc2bow(doc) for doc in (filtered_tokens1, filtered_tokens2)
)

In [38]:
# Display the (token_id, count) pairs computed for the first review.
bow_vector1

[(0, 1),
 (1, 1),
 (2, 2),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 2),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1)]

In [39]:
# Print results
# Label/value pairs, emitted in order as "<label> <value>" on separate lines.
results = (
    ("Filtered Tokens 1:", filtered_tokens1),
    ("Filtered Tokens 2:", filtered_tokens2),
    ("Dictionary:", dictionary.token2id),
    ("BoW Vector 1:", bow_vector1),
    ("BoW Vector 2:", bow_vector2),
)
for label, value in results:
    print(label, value)

Filtered Tokens 1: ['TechTrend', 'X1', 'camera', 'captures', 'stunning', 'photos', ',', 'battery', 'life', 'could', 'better', '.', "'m", 'impressed', 'camera', 'quality', '.']
Filtered Tokens 2: ["'m", 'disappointed', 'TechTrend', 'X1', 'battery', 'life', ',', 'although', 'camera', 'quality', 'exceptional', '.', 'However', ',', 'camera', 'features', 'lacking', '.']
Dictionary: {"'m": 0, ',': 1, '.': 2, 'TechTrend': 3, 'X1': 4, 'battery': 5, 'better': 6, 'camera': 7, 'captures': 8, 'could': 9, 'impressed': 10, 'life': 11, 'photos': 12, 'quality': 13, 'stunning': 14, 'However': 15, 'although': 16, 'disappointed': 17, 'exceptional': 18, 'features': 19, 'lacking': 20}
BoW Vector 1: [(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]
BoW Vector 2: [(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (7, 2), (11, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)]
