In [54]:
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

The text of [*Alice in Wonderland*](https://www.gutenberg.org/files/11/11-0.txt) is taken from Project Gutenberg.

The list of [English stop words](https://www.kaggle.com/datasets/heeraldedhia/stop-words-in-28-languages) is taken from Kaggle.

In [57]:
#Saving the text as a string in the 'corpus' variable, and the list of stop words also as string in the 'stopstring' variable.
fhandle = open('text_files/english_stopwords.txt', 'r', encoding = 'UTF-8')
stopstring = fhandle.read()

thandle = open('text_files/Alice_in_Wonderland.txt', 'r', encoding = 'UTF-8')
corpus = thandle.read()

In [55]:
#Closing both source files
fhandle.close()
thandle.close()

Tokenization of the text and stop words

In [56]:
#Tokenising the text and the stop words
stoplist = word_tokenize(stopstring)
text = word_tokenize(corpus)

In [8]:
stopwords_set = set(stoplist)

In [9]:
def remove_stopwords(str_list, word_set):
    '''
    Removes stop words from the list of words.
    '''
    new_list = list(filter(lambda x: x not in word_set, str_list))
    return new_list

In [10]:
def strip_punc(str_list):
    '''
    Removes punctuation from the the list of words.
    '''
    punc = string.punctuation + ",.;’_—“”‘"
    new_list = []
    for elem in str_list:
        new_list.append(elem.strip(punc).strip())
    return new_list

In [11]:
def normalize(str_list, stopwords_set):
    '''
    Removes stop words, punctuation and lemmatizes words.
    Returns a list of words.
    '''
    final = []
    lemmatizer = WordNetLemmatizer()
    new_list = list(map(str.lower, strip_punc(str_list)))
    new_list = remove_stopwords(new_list, stopwords_set)
    for elem in new_list:
        final.append(lemmatizer.lemmatize(elem))
    return final

Calculating frecuency & probability

In [12]:
def count_words(str_list):
    '''
    Returns a dictionary with words as keys and their frecuency as values.
    '''
    words_dict = {}
    for elem in str_list:
        if elem != '':
            if elem not in words_dict:
                words_dict[elem] = 1
            else:
                words_dict[elem] += 1
    return words_dict

In [58]:
def word_probability(dictionary, length):
    '''
    Returns a dictionary with words as keys and their probabilityies as values.
    '''
    return dict(map(lambda x: (x[0], x[1]/length), dictionary.items()))

#### Sorting the dictionary in order to get an ordered histogram later.

In [14]:
def sort_dict_by_value(dictionary, reversed_order):
    '''
    Returns a list of tuples with key-value pairs sorted by frecuency.
    '''
    list_d = list(dictionary.items())                                          # converting dictionary into a list
    list_swop = []                                       
    for key, value in list_d:                           
        list_swop.append((value, key))                                         # adding key and value tuples in swopped order
    list_sorted = sorted(list_swop, reverse=reversed_order)
    return list_sorted            

In [53]:
def display(sorted_list,top_n):
    '''
    Prints the first top_n pairs of word-probability.
    '''
    short = sorted_list[:top_n]
    print('{:<20} {:>22}\n'.format('Word', 'Percent'))
    for k, v in short:
        print('{:<20} {:>20}%'.format(v,round(k*100,2)))
        

In [16]:
clean_text = normalize(text, stopwords_set)                                 #getting a clean list of semantically important words
words_frecuency = count_words(clean_text)                                   #getting a dictionary with words' frequencies
length = len(clean_text)
prob = word_probability(words_frecuency, length)                            #getting a dictionary with words' probabilities
histogram = sort_dict_by_value(prob, True)                                  #sorting by words' probabilities in descending order

Let's display top 10 most frequent words with their corresponding probability.

In [52]:
display(histogram,10)

Word                                Percent

alice                                2.72%
queen                                0.53%
time                                  0.5%
king                                 0.44%
turtle                               0.42%
head                                  0.4%
mock                                 0.39%
hatter                               0.39%
gryphon                              0.38%
voice                                0.34%
