# Word2Vec Implementation

This notebook demonstrates the step-by-step implementation of a Word2Vec model using the Brown corpus from NLTK.


In [37]:
# Step 1: Install Required Libraries
#!pip install gensim nltk scikit-learn plotly

In [1]:
# Step 2: Download NLTK Corpora
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to C:\Users\ASUS/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
# Step 3: Load Brown Sentences
from nltk.corpus import brown

# Load sentences from the Brown corpus
brown_sentences = brown.sents()
print(f"Number of sentences: {len(brown_sentences)}")
print(f"First sentence: {brown_sentences[0]}")

Number of sentences: 57340
First sentence: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [3]:
from nltk.corpus import brown

# Get all sentences from the 'brown' corpus
brown_sentences = brown.sents()

# Tokenize and lowercase each word in every sentence
processed_sentences = []
for sentence in brown_sentences:
    processed_sentences.append([word.lower() for word in sentence])

print(f"Number of raw sentences: {len(brown_sentences)}")
print(f"Number of processed sentences: {len(processed_sentences)}")
print(f"First 3 raw sentences: {brown_sentences[:3]}")
print(f"First 3 processed sentences: {processed_sentences[:3]}")

Number of raw sentences: 57340
Number of processed sentences: 57340
First 3 raw sentences: [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 

In [8]:
# Train Word2Vec model with refined parameters
from gensim.models import Word2Vec

# Refine parameters to exclude rare words and improve embeddings
model = Word2Vec(
    sentences=processed_sentences, # Our preprocessed corpus
    vector_size=100,
    window=5,
    min_count=5,
    workers=4
)

print("Refined Word2Vec model training complete.")
model.train(processed_sentences, total_examples=len(processed_sentences), epochs=10)
print("Word2Vec model training complete.")
print(f"Number of words in vocabulary: {len(model.wv)}")

Refined Word2Vec model training complete.
Word2Vec model training complete.
Number of words in vocabulary: 14221


In [9]:
# Step 6: Print the Vector for "king"
word_to_find = 'king'

if word_to_find in model.wv:
    king_vector = model.wv[word_to_find]
    print(f"Vector for '{word_to_find}' (first 10 values):\n{king_vector[:10]}")
else:
    print(f"The word '{word_to_find}' is not in the model's vocabulary.")

Vector for 'king' (first 10 values):
[-0.16215658  0.788257   -0.28859648 -0.33889973 -0.5043302   0.22193283
  0.67197096  0.50237906 -0.48238367 -0.12506525]


In [14]:
# Step 7: Show 5 Most Similar Words to "woman"
target_word = 'woman'
if target_word in model.wv:
    similar_words = model.wv.most_similar(target_word, topn=5)
    print(f"Words most similar to '{target_word}':")
    for word, similarity in similar_words:
        print(f" {word}: {similarity:.4f}")
else:
    print(f"The word '{target_word}' is not in the model's vocabulary.")

Words most similar to 'woman':
 girl: 0.8591
 boy: 0.7656
 person: 0.7071
 man: 0.6999
 lady: 0.6834


In [18]:
# Step 8: Perform Analogy: king - man + woman (Top 1 Result)
if all(word in model.wv for word in ["king", "man", "woman"]):
    analogy_result = model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1)
    print("Result of the analogy top 1 'king - man + woman':", analogy_result[0])
else:
    print("One or more words are not in the vocabulary.")

Result of the analogy top 1 'king - man + woman': ('queen', 0.7217435240745544)


In [22]:
# Step 8: Perform Analogy: king - man + woman (Top 5 Results)
if all(word in model.wv for word in ["king", "man", "woman", "queen"]):
    analogy_result = model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=5)
    print("Top 5 results for the analogy top 5 'king - man + woman':")
    for word, similarity in analogy_result:
        print(f"{word}: {similarity:.4f}")
    # Check if "queen" is in the top results
    if "queen" in dict(analogy_result):
        print("✓ 'queen' is in the top results.")
    else:
        print("✗ 'queen' is not in the top results. Consider improving preprocessing or training.")
else:
    print("One or more words are not in the vocabulary.")

Top 5 results for the analogy top 5 'king - man + woman':
queen: 0.7217
mary: 0.7018
richard: 0.6820
anne: 0.6771
jane: 0.6634
✓ 'queen' is in the top results.


In [24]:
# Step 9: Check if "government" is in the Vocabulary
is_in_vocab = "government" in model.wv
print("Is 'government' in the vocabulary?", is_in_vocab)

Is 'government' in the vocabulary? True


In [25]:
# Step 10: Print the Vocabulary Size
vocab_size = len(model.wv)
print("Vocabulary size:", vocab_size)

Vocabulary size: 14221


In [26]:
# Step 11: Select Sample Word List for Visualization
word_list = [
    'king', 'queen', 'prince', 'princess',
    'man', 'woman', 'boy', 'girl',
    'government', 'country', 'state', 'law',
    'book', 'story', 'novel', 'author',
    'work', 'job', 'business', 'company',
    'good', 'bad', 'better', 'best',
    'time', 'day', 'year', 'world'
]
print("Sample word list:", word_list)

Sample word list: ['king', 'queen', 'prince', 'princess', 'man', 'woman', 'boy', 'girl', 'government', 'country', 'state', 'law', 'book', 'story', 'novel', 'author', 'work', 'job', 'business', 'company', 'good', 'bad', 'better', 'best', 'time', 'day', 'year', 'world']


In [27]:
# Step 12: Filter Words in the Word2Vec Vocabulary
filtered_words = [word for word in word_list if word in model.wv]
print("Filtered words in vocabulary:", filtered_words)

Filtered words in vocabulary: ['king', 'queen', 'prince', 'princess', 'man', 'woman', 'boy', 'girl', 'government', 'country', 'state', 'law', 'book', 'story', 'novel', 'author', 'work', 'job', 'business', 'company', 'good', 'bad', 'better', 'best', 'time', 'day', 'year', 'world']


In [29]:
# Step 13: Apply PCA to Reduce Vectors to 2D
from sklearn.decomposition import PCA
import numpy as np

# Extract vectors for filtered words
word_vectors = np.array([model.wv[word] for word in filtered_words])

# Apply PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(word_vectors)
print("PCA reduction complete.")
print("Reduced vectors shape:", reduced_vectors.shape)
print(reduced_vectors)

PCA reduction complete.
Reduced vectors shape: (28, 2)
[[-5.2472895e-01  1.9772613e+00]
 [-8.4274662e-01  1.5463746e+00]
 [ 2.4420901e-01  2.3696966e+00]
 [-6.0227472e-01  1.2678550e+00]
 [-6.3599529e+00  2.7383468e+00]
 [-5.2193522e+00  3.3784923e-01]
 [-4.7052002e+00  6.9756241e-04]
 [-5.2879395e+00  3.4403825e-01]
 [ 6.9966369e+00  4.0601516e+00]
 [ 2.7205911e+00 -1.1227371e+00]
 [ 7.0047665e+00  1.4163625e+00]
 [ 4.3690376e+00  4.6906352e+00]
 [-6.6030079e-01  1.0875636e+00]
 [-1.9558227e+00  7.1917856e-01]
 [-3.4955078e-01  1.1766033e+00]
 [-3.8906553e-01  2.0933511e+00]
 [ 1.0007626e+00 -2.6936791e+00]
 [ 6.5150714e-01 -1.7922530e+00]
 [ 3.7949603e+00  6.2237505e-02]
 [ 2.7909267e+00  2.0585415e+00]
 [-3.4408300e+00 -1.5288545e+00]
 [-4.0121889e+00  2.8555122e-01]
 [-1.1087791e+00 -8.1171250e-01]
 [-2.7906147e-01  4.7078568e-01]
 [ 3.5632029e-01 -8.7249460e+00]
 [ 3.1078222e-01 -6.5747299e+00]
 [ 4.1390686e+00 -8.3899956e+00]
 [ 1.3582280e+00  2.9358242e+00]]


In [30]:
from sklearn.decomposition import PCA

# Get vectors for filtered words
word_vectors = [model.wv[word] for word in filtered_words]
word_vectors = np.array(word_vectors)

# Apply PCA to reduce to 2D
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(word_vectors)

print("✓ PCA applied successfully")
print(f"  Original dimensions: {word_vectors.shape[1]}")
print(f"  Reduced dimensions: {vectors_2d.shape[1]}")
print(f"  Explained variance: {pca.explained_variance_ratio_}")
print(f"  Total variance explained: {sum(pca.explained_variance_ratio_):.2%}")

✓ PCA applied successfully
  Original dimensions: 100
  Reduced dimensions: 2
  Explained variance: [0.18996318 0.17011972]
  Total variance explained: 36.01%


In [31]:
# Step 14: Create a Plotly Scatter Plot with Labels
import plotly.graph_objects as go

# Create scatter plot
fig = go.Figure()
for i, word in enumerate(filtered_words):
    fig.add_trace(go.Scatter(x=[reduced_vectors[i, 0]], y=[reduced_vectors[i, 1]],
                             mode='markers+text',
                             text=[word],
                             textposition='top center'))

fig.update_layout(title="Word2Vec Visualization", xaxis_title="PCA1", yaxis_title="PCA2")
fig.show()

In [32]:
# Step 15: Print the First 500 Characters of the Plotly JSON
plotly_json = fig.to_json()
print("First 500 characters of Plotly JSON:", plotly_json[:500])

First 500 characters of Plotly JSON: {"data":[{"mode":"markers+text","text":["king"],"textposition":"top center","x":[-0.5247289538383484],"y":[1.9772613048553467],"type":"scatter"},{"mode":"markers+text","text":["queen"],"textposition":"top center","x":[-0.8427466154098511],"y":[1.5463745594024658],"type":"scatter"},{"mode":"markers+text","text":["prince"],"textposition":"top center","x":[0.24420900642871857],"y":[2.369696617126465],"type":"scatter"},{"mode":"markers+text","text":["princess"],"textposition":"top center","x":[-0.60
