# Word2Vec Analysis with NLTK Brown Corpus

This notebook demonstrates the training and analysis of Word2Vec embeddings using the NLTK Brown corpus. The steps include preprocessing text data, training the model, and visualizing the embeddings.


In [11]:
# Install Required Libraries
!#pip install gensim nltk scikit-learn plotly

'#pip' is not recognized as an internal or external command,
operable program or batch file.


In [12]:
# Download NLTK Brown Corpus
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to C:\Users\ASUS/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [13]:
# Load Brown Sentences
from nltk.corpus import brown
brown_sentences = brown.sents()
print(f"Number of sentences: {len(brown_sentences)}")
print(f"First sentence: {brown_sentences[0]}")

Number of sentences: 57340
First sentence: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [14]:
# Preprocess Text Data
import re
from nltk.tokenize import word_tokenize
def preprocess_sentences(sentences):
    processed = []
    for sentence in sentences:
        sentence = word_tokenize(' '.join(sentence).lower())
        sentence = [re.sub(r'[^a-z]', '', word) for word in sentence]
        sentence = [word for word in sentence if word]
        if sentence:
            processed.append(sentence)
    return processed
processed_sentences = preprocess_sentences(brown_sentences)
print(f"Number of processed sentences: {len(processed_sentences)}")
print(f"First processed sentence: {processed_sentences[0]}")

Number of processed sentences: 56833
First processed sentence: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'atlanta', 's', 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']


In [15]:
# Train Word2Vec Model
from gensim.models import Word2Vec
model = Word2Vec(
    sentences=processed_sentences,
    vector_size=100,
    window=5,
    min_count=5,
    sg=1,
    epochs=10
    )
print("Word2Vec model training complete.")

Word2Vec model training complete.


In [16]:
# Print Vector for 'king'
if 'king' in model.wv:
    king_vector = model.wv['king']
    print("Vector for 'king' (first 10 values):", king_vector[:10])
else:
    print("'king' is not in the vocabulary.")

Vector for 'king' (first 10 values): [ 0.11880429 -0.1317661  -0.02480275  0.4487808  -0.16028818 -0.28191662
  0.2379029   0.9176345  -0.17331335 -0.07205094]


In [17]:
# Step 7: Show 5 Most Similar Words to "woman"
if "woman" in model.wv:
    similar_words = model.wv.most_similar("woman", topn=5)
    print("5 most similar words to 'woman':")
    for word, similarity in similar_words:
        print(f"{word}: {similarity:.4f}")
else:
    print("'woman' is not in the vocabulary.")

5 most similar words to 'woman':
girl: 0.8073
lady: 0.7150
lonely: 0.7076
boy: 0.6985
lean: 0.6884


In [19]:
# Step 8: Perform Analogy: king - man + woman
if all(word in model.wv for word in ["king", "man", "woman"]):
    analogy_result = model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1)
    print("Result of the analogy 'king - man + woman':", analogy_result[0])
else:
    print("One or more words are not in the vocabulary.")

Result of the analogy 'king - man + woman': ('szold', 0.6922566890716553)


In [20]:
# Step 7: Show 5 Most Similar Words to "woman"
if "woman" in model.wv:
    similar_words = model.wv.most_similar("woman", topn=5)
    print("5 most similar words to 'woman':")
    for word, similarity in similar_words:
        print(f"{word}: {similarity:.4f}")
else:
    print("'woman' is not in the vocabulary.")

5 most similar words to 'woman':
girl: 0.8073
lady: 0.7150
lonely: 0.7076
boy: 0.6985
lean: 0.6884
