In [2]:
import nltk
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
from bs4 import BeautifulSoup
from nltk.corpus import stopwords


In [3]:
with open('The Complete Works of William Shakespeare.html', 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'lxml')
    text_content = soup.get_text()

In [4]:
# Preprocess text, including stopword removal
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # Convert text to lowercase

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    return(filtered_tokens)
    
text_tokens = preprocess_text(text_content)

# Model tokenized without sentence structure
model1 = Word2Vec([text_tokens], vector_size=128, window=15, min_count=3, workers=4, epochs=10)
text_tokens

['project',
 'gutenberg',
 'ebook',
 'complete',
 'works',
 'william',
 'shakespeare',
 'william',
 'shakespeare',
 'project',
 'gutenberg',
 'ebook',
 'complete',
 'works',
 'william',
 'shakespeare',
 'ebook',
 'use',
 'anyone',
 'anywhere',
 'united',
 'states',
 'parts',
 'world',
 'cost',
 'almost',
 'restrictions',
 'whatsoever',
 'may',
 'copy',
 'give',
 'away',
 'terms',
 'project',
 'gutenberg',
 'license',
 'included',
 'ebook',
 'online',
 'located',
 'united',
 'states',
 'check',
 'laws',
 'country',
 'located',
 'using',
 'ebook',
 'title',
 'complete',
 'works',
 'william',
 'shakespeare',
 'author',
 'william',
 'shakespeare',
 'release',
 'date',
 'january',
 '1',
 '1994',
 'ebook',
 '100',
 'recently',
 'updated',
 'january',
 '18',
 '2024',
 'language',
 'english',
 'start',
 'project',
 'gutenberg',
 'ebook',
 'complete',
 'works',
 'william',
 'shakespeare',
 'complete',
 'works',
 'william',
 'shakespeare',
 'william',
 'shakespeare',
 'contents',
 'sonnets',
 'w

In [5]:
print("Vocabulary:", model1.wv.index_to_key)



In [6]:
words = ["king", "queen", "love", "death"]
# Display vector representations for the specified words
for word in words:
    if word in model1.wv:
        print(f"Vector representation for '{word}':\n", model1.wv[word])
    else:
        print(f"'{word}' not found in the model vocabulary.")

Vector representation for 'king':
 [-0.12235345 -0.30896902  0.03448595  0.08692976  0.16043544 -0.36178905
  0.05212218  0.09701214  0.01820629  0.24206945  0.2942523  -0.06111829
 -0.22960678 -0.22409844  0.12893635  0.09998746 -0.15950522  0.09986573
 -0.24654631 -0.06170949  0.27612206  0.1951789  -0.1094756  -0.2887258
 -0.0570084   0.08145791 -0.09405898  0.342319    0.05252145  0.08425724
 -0.04891726 -0.0218114   0.20972729  0.02068686 -0.03605304  0.07724807
  0.01949513 -0.24078599  0.16675448  0.02453166  0.00840398 -0.00307637
 -0.00727713 -0.28617805 -0.01036144  0.11781634 -0.22729996  0.07173115
 -0.00564019  0.10448244  0.1328765  -0.0206785   0.03308845  0.16612796
 -0.10807659 -0.09140565  0.13888554 -0.23774855  0.01074178  0.16070187
  0.1030493   0.20023875  0.10882532 -0.21239659  0.16664004 -0.27651837
  0.09341659  0.09468187  0.04394071  0.01264725 -0.19613041 -0.02455742
 -0.4836539  -0.07735424 -0.05357736 -0.30316705  0.02260751  0.00169996
 -0.30917534  0.1

In [7]:
# Most similar words
similar_words = model1.wv.most_similar('king', topn=5)
print("\nSimilar Words :", similar_words)


Similar Words : [('countess', 0.999409556388855), ('bertram', 0.9993680119514465), ('time', 0.999358594417572), ('old', 0.9993355870246887), ('like', 0.9993341565132141)]


In [8]:
# Analogy task
result = model1.wv.most_similar(positive=['boy', 'queen'], negative=['king'], topn=1)
print("boy + queen - king =", result)

boy + queen - king = [('spend', 0.9876039624214172)]


In [13]:
# NLTK function to split the text into sentences.
sentences = nltk.sent_tokenize(text_content)

# Createad a list of tokenized sentences 
tokenized_sentences = [preprocess_text(sentence) for sentence in sentences]

# Train the Word2Vec model using tokenized sentences this time
model2 = Word2Vec(sentences=tokenized_sentences, vector_size=128, window=15, min_count=5, workers=4, epochs=50)

In [14]:
words = ["king", "queen", "love", "death"]
# Display vector representations for the specified words
for word in words:
    if word in model2.wv:
        print(f"Vector representation for '{word}':\n", model1.wv[word])
    else:
        print(f"'{word}' not found in the model vocabulary.")

Vector representation for 'king':
 [-0.12235345 -0.30896902  0.03448595  0.08692976  0.16043544 -0.36178905
  0.05212218  0.09701214  0.01820629  0.24206945  0.2942523  -0.06111829
 -0.22960678 -0.22409844  0.12893635  0.09998746 -0.15950522  0.09986573
 -0.24654631 -0.06170949  0.27612206  0.1951789  -0.1094756  -0.2887258
 -0.0570084   0.08145791 -0.09405898  0.342319    0.05252145  0.08425724
 -0.04891726 -0.0218114   0.20972729  0.02068686 -0.03605304  0.07724807
  0.01949513 -0.24078599  0.16675448  0.02453166  0.00840398 -0.00307637
 -0.00727713 -0.28617805 -0.01036144  0.11781634 -0.22729996  0.07173115
 -0.00564019  0.10448244  0.1328765  -0.0206785   0.03308845  0.16612796
 -0.10807659 -0.09140565  0.13888554 -0.23774855  0.01074178  0.16070187
  0.1030493   0.20023875  0.10882532 -0.21239659  0.16664004 -0.27651837
  0.09341659  0.09468187  0.04394071  0.01264725 -0.19613041 -0.02455742
 -0.4836539  -0.07735424 -0.05357736 -0.30316705  0.02260751  0.00169996
 -0.30917534  0.1

In [15]:
# Most similar words
similar_words = model2.wv.most_similar('king', topn=5)
print("\nSimilar Words :", similar_words)


Similar Words : [('france', 0.5606173276901245), ('realm', 0.4992162585258484), ('england', 0.4852922260761261), ('deposed', 0.4852396547794342), ('henry', 0.4851248562335968)]


In [16]:
# Analogy
result = model2.wv.most_similar(positive=['boy', 'queen'], negative=['king'], topn=1)
print("boy + queen - king =", result)

boy + queen - king = [('girl', 0.3398003578186035)]
