# Word2Vec Implementation

This notebook demonstrates the step-by-step implementation of a Word2Vec model using the Brown corpus from NLTK.


In [22]:
# Step 1: Install Required Libraries
#!pip install gensim nltk scikit-learn plotly

In [23]:
# Step 2: Download NLTK Corpora
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to C:\Users\ASUS/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [24]:
# Step 3: Load Brown Sentences
from nltk.corpus import brown

# Load sentences from the Brown corpus
brown_sentences = brown.sents()
print(f"Number of sentences: {len(brown_sentences)}")
print(f"First sentence: {brown_sentences[0]}")

Number of sentences: 57340
First sentence: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [25]:
# Step 4: Enhanced Preprocessing with Advanced Tokenization
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#import wordninja

#lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_sentences(sentences):
    processed_sentences = []
    for i, sentence in enumerate(sentences):
        # Tokenize and preprocess each sentence
        sentence = word_tokenize(' '.join(sentence).lower())  # Tokenize and lowercase
        sentence = [word for word in sentence if word.lower() not in string.punctuation]  # Remove punctuation
        #sentence = [re.sub(r'[^a-z]', '', word) for word in sentence]  # Remove punctuation
        #sentence = [lemmatizer.lemmatize(word) for word in sentence]  # Lemmatize words
        sentence = [word for word in sentence if word not in stop_words]  # Remove stopwords
        #sentence = [word for word in sentence if len(word) > 1]  # Remove single-character words
        #sentence = [subword for word in sentence for subword in wordninja.split(word)]  # Split compound words
        sentence = [word for word in sentence if word]  # Remove empty words
        if sentence:  # Keep non-empty sentences
            processed_sentences.append(sentence)
        # Debugging: Print the first 5 sentences after processing
        if i < 5:
            print(f"Original: {sentences[i]}")
            print(f"Processed: {sentence}")
    return processed_sentences

processed_sentences = preprocess_sentences(brown_sentences)
print(f"Number of processed sentences: {len(processed_sentences)}")
print(f"First processed sentence: {processed_sentences[0] if processed_sentences else 'None'}")

Original: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
Processed: ['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', 'atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'evidence', '``', 'irregularities', 'took', 'place']
Original: ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
Processed: ['jury', 'said', 'term-end', 'presentments', 'city', 'executive', 'committee', 'over-all', 'charge', 'election', '``', 'deserves', 'praise', 'thanks', 'city', 

In [26]:
# Train Word2Vec model with refined parameters
from gensim.models import Word2Vec

# Refine parameters to exclude rare words and improve embeddings
model = Word2Vec(
    sentences=processed_sentences, 
    vector_size=100,       # Larger dimension of word vectors for better representation
    window=5,             # Larger context window size
    min_count=5,           # Exclude rare words with low frequency
    sg=1,                  # Skip-gram model
    epochs=10,             # More training epochs for better convergence
    workers=4,             # Number of threads
    #negative=15            # Adjust negative sampling for better embeddings
)

print("Refined Word2Vec model training complete.")

Refined Word2Vec model training complete.


In [27]:
# Step 6: Print the Vector for "king"
if "king" in model.wv:
    king_vector = model.wv["king"]
    print("Vector for 'king' (first 10 values):", king_vector[:10])
else:
    print("'king' is not in the vocabulary.")

Vector for 'king' (first 10 values): [-0.22447096  0.19917454 -0.04512258  0.35215324  0.24505028  0.02232507
  0.23743603  0.0534474  -0.5437133   0.12373687]


In [28]:
# Step 7: Show 5 Most Similar Words to "woman"
if "woman" in model.wv:
    similar_words = model.wv.most_similar("woman", topn=5)
    print("5 most similar words to 'woman':")
    for word, similarity in similar_words:
        print(f"{word}: {similarity:.4f}")
else:
    print("'woman' is not in the vocabulary.")

5 most similar words to 'woman':
lonely: 0.8022
boy: 0.7974
girl: 0.7963
lean: 0.7955
charming: 0.7922


In [29]:
# Step 8: Perform Analogy: king - man + woman
if all(word in model.wv for word in ["king", "man", "woman", "queen"]):
    analogy_result = model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=5)
    print("Top 5 results for the analogy 'king - man + woman':")
    for word, similarity in analogy_result:
        print(f"{word}: {similarity:.4f}")
    # Check if "queen" is in the top results
    if "queen" in dict(analogy_result):
        print("✓ 'queen' is in the top results.")
    else:
        print("✗ 'queen' is not in the top results. Consider improving preprocessing or training.")
else:
    print("One or more words are not in the vocabulary.")

Top 5 results for the analogy 'king - man + woman':
sister: 0.7333
aunt: 0.7301
meynell: 0.7172
rabbi: 0.7163
katie: 0.7126
✗ 'queen' is not in the top results. Consider improving preprocessing or training.


In [30]:
# Step 8: Perform Analogy: king - man + woman
if all(word in model.wv for word in ["king", "man", "woman"]):
    analogy_result = model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1)
    print("Result of the analogy 'king - man + woman':", analogy_result[0])
else:
    print("One or more words are not in the vocabulary.")

Result of the analogy 'king - man + woman': ('sister', 0.7332649230957031)


In [31]:
# Step 9: Check if "government" is in the Vocabulary
is_in_vocab = "government" in model.wv
print("Is 'government' in the vocabulary?", is_in_vocab)

Is 'government' in the vocabulary? True


In [32]:
# Step 10: Print the Vocabulary Size
vocab_size = len(model.wv)
print("Vocabulary size:", vocab_size)

Vocabulary size: 13924


In [33]:
# Step 11: Select Sample Word List for Visualization
word_list = [
    'king', 'queen', 'prince', 'princess',
    'man', 'woman', 'boy', 'girl',
    'government', 'country', 'state', 'law',
    'book', 'story', 'novel', 'author',
    'work', 'job', 'business', 'company',
    'good', 'bad', 'better', 'best',
    'time', 'day', 'year', 'world'
]
print("Sample word list:", word_list)

Sample word list: ['king', 'queen', 'prince', 'princess', 'man', 'woman', 'boy', 'girl', 'government', 'country', 'state', 'law', 'book', 'story', 'novel', 'author', 'work', 'job', 'business', 'company', 'good', 'bad', 'better', 'best', 'time', 'day', 'year', 'world']


In [34]:
# Step 12: Filter Words in the Word2Vec Vocabulary
filtered_words = [word for word in word_list if word in model.wv]
print("Filtered words in vocabulary:", filtered_words)

Filtered words in vocabulary: ['king', 'queen', 'prince', 'princess', 'man', 'woman', 'boy', 'girl', 'government', 'country', 'state', 'law', 'book', 'story', 'novel', 'author', 'work', 'job', 'business', 'company', 'good', 'bad', 'better', 'best', 'time', 'day', 'year', 'world']


In [35]:
# Step 13: Apply PCA to Reduce Vectors to 2D
from sklearn.decomposition import PCA
import numpy as np

# Extract vectors for filtered words
word_vectors = np.array([model.wv[word] for word in filtered_words])

# Apply PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(word_vectors)
print("PCA reduction complete.")

PCA reduction complete.


In [36]:
from sklearn.decomposition import PCA

# Get vectors for filtered words
word_vectors = [model.wv[word] for word in filtered_words]
word_vectors = np.array(word_vectors)

# Apply PCA to reduce to 2D
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(word_vectors)

print("✓ PCA applied successfully")
print(f"  Original dimensions: {word_vectors.shape[1]}")
print(f"  Reduced dimensions: {vectors_2d.shape[1]}")
print(f"  Explained variance: {pca.explained_variance_ratio_}")
print(f"  Total variance explained: {sum(pca.explained_variance_ratio_):.2%}")

✓ PCA applied successfully
  Original dimensions: 100
  Reduced dimensions: 2
  Explained variance: [0.14648512 0.11460395]
  Total variance explained: 26.11%


In [37]:
# Step 14: Create a Plotly Scatter Plot with Labels
import plotly.graph_objects as go

# Create scatter plot
fig = go.Figure()
for i, word in enumerate(filtered_words):
    fig.add_trace(go.Scatter(x=[reduced_vectors[i, 0]], y=[reduced_vectors[i, 1]],
                             mode='markers+text',
                             text=[word],
                             textposition='top center'))

fig.update_layout(title="Word2Vec Visualization", xaxis_title="PCA1", yaxis_title="PCA2")
fig.show()

In [38]:
# Step 15: Print the First 500 Characters of the Plotly JSON
plotly_json = fig.to_json()
print("First 500 characters of Plotly JSON:", plotly_json[:500])

First 500 characters of Plotly JSON: {"data":[{"mode":"markers+text","text":["king"],"textposition":"top center","x":[0.17573286592960358],"y":[-0.08762556314468384],"type":"scatter"},{"mode":"markers+text","text":["queen"],"textposition":"top center","x":[0.3057299852371216],"y":[0.18989183008670807],"type":"scatter"},{"mode":"markers+text","text":["prince"],"textposition":"top center","x":[-0.12383899837732315],"y":[0.13390648365020752],"type":"scatter"},{"mode":"markers+text","text":["princess"],"textposition":"top center","x":[
 {"data":[{"mode":"markers+text","text":["king"],"textposition":"top center","x":[0.17573286592960358],"y":[-0.08762556314468384],"type":"scatter"},{"mode":"markers+text","text":["queen"],"textposition":"top center","x":[0.3057299852371216],"y":[0.18989183008670807],"type":"scatter"},{"mode":"markers+text","text":["prince"],"textposition":"top center","x":[-0.12383899837732315],"y":[0.13390648365020752],"type":"scatter"},{"mode":"markers+text","text":["princ

# Evaluation
