In [1]:
import nltk
from collections import Counter
import re

In [2]:
# Function to get the top 25 most frequent non-stopword words in a corpus
def top_25(cp, sw):
    """Return the 25 most frequent tokens of ``cp`` that are not stopwords.

    Parameters
    ----------
    cp : iterable of hashable
        The corpus tokens, in reading order.
    sw : container
        Stopwords; any token found in it is ignored.

    Returns
    -------
    list of (token, count) tuples
        Ordered from most to least frequent. Holds fewer than 25 entries
        when the filtered corpus has fewer than 25 distinct tokens.
    """
    tally = Counter()
    for token in cp:
        # Skip stopwords; everything else is counted as-is (case-sensitive)
        if token in sw:
            continue
        tally[token] += 1

    return tally.most_common(25)

def preprocess(cp, sw):
    """Normalise the corpus tokens and extend the stopword collection.

    Parameters
    ----------
    cp : iterable
        Corpus tokens; each item is converted with ``str`` before processing.
    sw : iterable of str
        Base stopwords. It is NOT modified; a new set is returned instead.

    Returns
    -------
    (list of str, set of str)
        The lowercased, letters-only tokens, and a new stopword set made of
        ``sw`` plus the poem-specific words added below.
    """
    # Add specific words of the poem as stopwords. A new set is built rather
    # than calling sw.update(), so the caller's collection keeps its original
    # content and any iterable (not only a set) is accepted.
    extended_sw = set(sw) | {'thee', 'thou', 'thy', 'er', 'thel'}

    # 1. Convert every token to lowercase and join them into one string
    lowered_text = ' '.join(str(item).lower() for item in cp)
    # 2. Keep only runs of the letters a-z delimited by word boundaries;
    #    punctuation tokens disappear at this step
    updated_cp = re.findall(r'\b[a-z]+\b', lowered_text)

    return updated_cp, extended_sw

In [3]:
# Download the NLTK data packages this notebook needs: the Gutenberg corpus
# and the stopword lists
nltk.download('gutenberg')
nltk.download('stopwords')
# Load the tokens of 'blake-poems.txt'; at this point the sequence still
# contains punctuation tokens and mixed-case words
cp = nltk.corpus.gutenberg.words('blake-poems.txt')
# Load the English stopwords into a set for the `not in` checks done in top_25
sw = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Baseline run on the raw tokens, before any preprocessing
top_25(cp,sw)

[(',', 680),
 ('.', 201),
 ('And', 176),
 ('I', 130),
 ("'", 104),
 (';', 98),
 (':', 75),
 ('?', 65),
 ('The', 61),
 ('!', 59),
 ('"', 51),
 ('thee', 42),
 ('like', 29),
 ('thy', 28),
 ('thou', 28),
 ('THE', 27),
 ('little', 26),
 ('In', 25),
 ('night', 25),
 ('away', 24),
 ('But', 24),
 ('Then', 23),
 ('joy', 22),
 ('He', 21),
 ('weep', 21)]

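Since we have not applied any preprocessing yet, the result contains punctuation marks, other non-word characters, and case-variant duplicates (e.g. "THE" and "The"). Capitalized stopwords such as "And" and "I" also slip through, because the stopword check is case-sensitive.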

In [5]:
# Preprocess on a copy of the stopwords and keep the results under new names,
# so `cp` and `sw` stay exactly as loaded and earlier cells give the same
# output when re-run after this one.
updated_cp, updated_sw = preprocess(cp, set(sw))
top_25(updated_cp, updated_sw)

[('little', 45),
 ('like', 35),
 ('love', 29),
 ('sweet', 28),
 ('night', 28),
 ('joy', 25),
 ('away', 24),
 ('weep', 24),
 ('father', 22),
 ('sleep', 21),
 ('happy', 19),
 ('shall', 19),
 ('day', 19),
 ('mother', 19),
 ('child', 18),
 ('every', 17),
 ('never', 17),
 ('hear', 16),
 ('green', 16),
 ('voice', 16),
 ('infant', 16),
 ('see', 16),
 ('human', 16),
 ('cloud', 15),
 ('lamb', 15)]

Thanks to the preprocessing, we now have an extended list of stopwords (including the archaic equivalents of existing stopwords), and we have eliminated the non-word characters as well as the duplicates that differed only in letter case (by lowercasing every word).

Conclusions
1. This poem contains several archaic English words (such as "thee", "thou" and "thy") whose meaning is the same as that of words already in the stopword list, so we thought it better to remove them as well.
2. We have used list comprehensions since they are more efficient than writing the equivalent explicit for loops. They are also more
memory efficient, since they avoid the overhead of multiple intermediate variables.
3. Before finding the most common words we did some text pre-processing to retain only meaningful words.
4. In order to make the counting fast and efficient, we have used the built-in Counter class. Since it also provides the .most_common method
for finding the most frequent words, we avoided looping through the words, sorting them in descending order and then printing 25 of them (a less efficient version is also included below, in case the built-in functionality of Counter is not allowed).

In [None]:
# Less efficient solution that counts with a plain dict instead of collections.Counter
#def top_25(our_text,sw):
#  result = {}
#  for i in our_text:
#      if i not in sw:
#          if i not in result.keys():
#              result[i] = 1
#          else:
#              result[i] = result[i]+1
#  result = dict(sorted(result.items(), key=lambda item: item[1], reverse=True)[:25])
#  return result

