In [1]:
import nltk
from nltk.corpus import gutenberg
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# Make sure the NLTK "gutenberg" corpus package is available locally; the
# `gutenberg` corpus reader imported above reads its files from that package.
# The call's boolean result is the last expression, so it is shown as the cell output.
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
# Load spaCy's "en_core_web_sm" pipeline by name. The resulting Language object
# (`nlp`) is called on raw text in the cells below to produce processed documents.
nlp = spacy.load("en_core_web_sm")

<h4> Loading the content of a document in the Gutenberg corpus </h4>

In [3]:
# Load the raw text of one document from the Gutenberg corpus
# (e.g., "shakespeare-hamlet.txt").
chosen_file = "shakespeare-hamlet.txt"

# All file IDs available in the Gutenberg corpus
file_ids = gutenberg.fileids()

if chosen_file not in file_ids:
    # Stop here with a clear error. Merely printing would leave `corpus_text`
    # undefined (or, in a long-lived kernel, still holding the text of a
    # previously chosen file), so later cells would either fail with a
    # confusing NameError or silently analyse the wrong document.
    raise ValueError(f"File '{chosen_file}' not found in the Gutenberg corpus.")

# Content of chosen_file as a single string
corpus_text = gutenberg.raw(chosen_file)

In [4]:
# Run the loaded spaCy pipeline (`nlp`) over the whole corpus text.
# `complete_doc` is reused below to extract sentences via `.sents`.
complete_doc = nlp(corpus_text)
# Bare last expression: the notebook displays the processed document as the cell output.
complete_doc

[The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo and Francisco two Centinels.

  Barnardo. Who's there?
  Fran. Nay answer me: Stand & vnfold
your selfe

   Bar. Long liue the King

   Fran. Barnardo?
  Bar. He

   Fran. You come most carefully vpon your houre

   Bar. 'Tis now strook twelue, get thee to bed Francisco

   Fran. For this releefe much thankes: 'Tis bitter cold,
And I am sicke at heart

   Barn. Haue you had quiet Guard?
  Fran. Not a Mouse stirring

   Barn. Well, goodnight. If you do meet Horatio and
Marcellus, the Riuals of my Watch, bid them make hast.
Enter Horatio and Marcellus.

  Fran. I thinke I heare them. Stand: who's there?
  Hor. Friends to this ground

   Mar. And Leige-men to the Dane

   Fran. Giue you good night

   Mar. O farwel honest Soldier, who hath relieu'd you?
  Fra. Barnardo ha's my place: giue you goodnight.

Exit Fran.

  Mar. Holla Barnardo

   Bar. Say, what is Horatio there?
  Hor. A peece of

In [5]:
# Collect the raw string of every sentence span detected in the processed
# document; line breaks inside a sentence are kept as-is.
sentences = [
    sentence_span.text
    for sentence_span in complete_doc.sents
]
sentences

['[The Tragedie of Hamlet by William Shakespeare 1599]\n\n\nActus Primus.',
 'Scoena Prima.\n\n',
 'Enter Barnardo and Francisco two Centinels.\n\n  Barnardo.',
 "Who's there?\n  Fran.",
 'Nay answer me: Stand & vnfold\nyour selfe\n\n   Bar.',
 'Long liue the King\n\n   Fran.',
 'Barnardo?\n  Bar.',
 'He\n\n   Fran.',
 'You come most carefully vpon your houre\n\n   Bar.',
 "'Tis now strook twelue, get thee to bed Francisco\n\n   Fran.",
 "For this releefe much thankes: 'Tis bitter cold,\nAnd I am sicke at heart\n\n   Barn.",
 'Haue you had quiet Guard?\n  Fran.',
 'Not a Mouse stirring\n\n   Barn.',
 'Well, goodnight.',
 'If you do meet Horatio and\nMarcellus, the Riuals of my Watch, bid them make hast.\n',
 'Enter Horatio and Marcellus.\n\n  ',
 'Fran.',
 'I thinke I heare them.',
 "Stand: who's there?\n  Hor.",
 'Friends to this ground\n\n   Mar.',
 'And Leige-men to the Dane\n\n   Fran.',
 'Giue you good night\n\n   Mar.',
 "O farwel honest Soldier, who hath relieu'd you?\n  ",
 "Fr

<h4> Text Preprocessing </h4>

In [7]:
# Build one list of cleaned lemmas per sentence.
#
# nlp.pipe() streams the sentences through the pipeline in batches, which
# avoids the overhead of a separate nlp() call for every sentence.
texts = [
    [
        # Keep the lemmatized form of each surviving token
        token.lemma_
        for token in doc
        # Remove stop words, punctuation, number-like tokens and whitespace.
        # is_space covers line breaks as well as space-only or tab-only
        # tokens, which a plain "'\n' in token.text" test would let through.
        if not (token.is_stop or token.is_punct or token.like_num or token.is_space)
    ]
    for doc in nlp.pipe(sentences)
]
print(texts)



In [8]:
# Build the vocabulary: every distinct lemma in `texts` is assigned an integer id.
from gensim import corpora

dictionary = corpora.Dictionary(documents=texts)

# Show the full token -> id mapping
print(dictionary.token2id)



<h4> Text Vectorization </h4>

In [9]:
# Bag-of-words (BoW) corpus: each sentence's lemma list is converted by the
# dictionary into a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, texts))
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)], [(6, 1), (7, 1)], [(8, 2), (9, 1), (10, 1), (11, 1)], [(12, 1)], [(13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(12, 1), (19, 1), (20, 1), (21, 1)], [(8, 1), (13, 1)], [(12, 1)], [(22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(10, 1), (12, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)], [(12, 1), (40, 1), (41, 1), (42, 1)], [(32, 1), (43, 1), (44, 1)], [(45, 1)], [(46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)], [(11, 1), (46, 1), (47, 1)], [(12, 1)], [(53, 1), (54, 1)], [(17, 1), (55, 1)], [(56, 1), (57, 1), (58, 1)], [(12, 1), (59, 1), (60, 1), (61, 1)], [(56, 1), (62, 1), (63, 1), (64, 1)], [(65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1)], [(8, 1), (45, 1), (62, 1), (71, 1), (72, 1), (73, 1)], [(12, 1), (74, 1)], [(8, 1), (13, 1), (56, 1), (75, 1)], [(46, 1), (55, 1)], [(13, 1), (76, 1)], [(13, 1), (46, 1), (47, 1), (56, 1), (

In [11]:
# Fit a TF-IDF model on the bag-of-words corpus, then print the re-weighted
# (token_id, score) pairs of each document, one document per line.
from gensim import models

tfidf = models.TfidfModel(corpus)

weighted_corpus = tfidf[corpus]
for weighted_document in weighted_corpus:
    print(weighted_document)

[(0, 0.41836334530858044), (1, 0.18900397092771962), (2, 0.45965521775951584), (3, 0.45965521775951584), (4, 0.39420914834022225), (5, 0.45965521775951584)]
[(6, 0.7071067811865475), (7, 0.7071067811865475)]
[(8, 0.7244274610135993), (9, 0.48436668265528493), (10, 0.4408549231735322), (11, 0.21503652521505134)]
[(12, 1.0)]
[(13, 0.45301057552358476), (14, 0.4139492287945293), (15, 0.4139492287945293), (16, 0.2708179713976847), (17, 0.3502590955453296), (18, 0.5060135203533891)]
[(12, 0.621974276077724), (19, 0.29491006500365), (20, 0.5103479982319665), (21, 0.5154813034027117)]
[(8, 0.6977253507004182), (13, 0.7163653641752777)]
[(12, 1.0)]
[(22, 0.5760664113600457), (23, 0.5760664113600457), (24, 0.21619124169214973), (25, 0.4422962707123448), (26, 0.3064805627361644)]
[(10, 0.44332804163056183), (12, 0.3558163360559258), (27, 0.33022080298826945), (28, 0.48708389441787986), (29, 0.23527388406563876), (30, 0.3558163360559258), (31, 0.3854859505152506)]
[(32, 0.3317592535952526), (33, 

<h4> Printing bigrams and trigrams </h4>

In [13]:
import gensim

# First pass: train a phrase model on the lemma lists and rewrite each
# sentence with the detected word pairs merged into single tokens.
bigram = gensim.models.Phrases(texts)
texts2 = [bigram[sentence_tokens] for sentence_tokens in texts]
print(texts2)

# Second pass: train on the already-merged sentences (the new vocabulary), so
# a merged pair can in turn be joined with a neighbouring word.
trigram = gensim.models.Phrases(texts2)
texts3 = [trigram[sentence_tokens] for sentence_tokens in texts2]
print(texts3)



In [14]:
# Print, sentence by sentence, the tokens that the bigram pass merged.
print("Bigrams:")
# The loop variable is deliberately NOT called `bigram`: that name holds the
# trained Phrases model, and rebinding it here would break any later use of
# the model (e.g. `bigram[new_sentence]`).
for sentence_tokens in texts2:
    # Merged tokens are joined with "_", so a bigram contains exactly one.
    # count('_') == 1 already implies that "_" is present in the token.
    merged_pairs = [token for token in sentence_tokens if token.count('_') == 1]
    # Skip sentences in which nothing was merged
    if merged_pairs:
        print(merged_pairs)

Bigrams:
['haue_seene']
['haue_seene']
['thou_hast']
['thou_hast']
['haue_hear']
['haue_hear']
['haue_seene']
['good_friend']
['good_Lord']
['good_Lord']
['Lord_Ham']
['Lord_Ham']
['Lord_Ham']
['haue_seene']
['enter_Polonius']
['thou_hast']
['enter_Hamlet']
['Lord_Ham']
['Lord_Ham']
['thou_hast']
['set_downe']
['Lord_Ham']
['good_Lord']
['Lord_Ham']
['Lord_Ham']
['Lord_Ham']
['Lord_Ham']
['good_friend']
['Lord_Ham']
['Lord_Ham']
['Lord_Ham']
['haue_seene']
['haue_hear']
['good_friend']
['enter_Polonius']
['good_Lord']
['good_Lord']
['good_Lord']
['good_Lord']
['enter_King']
['haue_hear']
['enter_Polonius']
['good_Lord']
['enter_Polonius']
['set_downe']
['enter_Hamlet']
['good_Lord']
['Lord_Ham']
['Lord_Ham']
['Lord_Ham']
['Lord_Ham']
['Lord_Ham']
['set_downe']
['Lord_Ham']
['Lord_Ham']
['good_friend']
['Lord_Ham']
['good_friend']
['Lord_Ham']
['Lord_Ham']
['King_Queene']
['Lord_Ham']
['King_Queene']
['Lord_Ham']
['enter_Polonius']
['Lord_Ham']
['good_friend']
['Lord_Ham']
['set_downe']

In [15]:
# Print, sentence by sentence, the tokens that ended up as three joined words.
print("Trigrams:")
# The loop variable is deliberately NOT called `trigram`: that name holds the
# trained Phrases model, and rebinding it here would break any later use of
# the model (e.g. `trigram[new_sentence]`).
for sentence_tokens in texts3:
    # Merged tokens are joined with "_", so a trigram contains exactly two.
    # count('_') == 2 already implies that "_" is present in the token.
    merged_triples = [token for token in sentence_tokens if token.count('_') == 2]
    # Skip sentences in which no trigram was formed
    if merged_triples:
        print(merged_triples)

Trigrams:
['enter_King_Queene']
['enter_King_Queene']
['enter_King_Queene']
['enter_King_Queene']
['enter_King_Queene']
['enter_King_Queene']
['enter_King_Queene']
