In [3]:
import numpy as np
import pandas as pd
#required for removing accented, non-english characters
import unicodedata
#tokenization and stemming
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
#remove html tags
from bs4 import BeautifulSoup
#remove special characters
import re
# for lemmatisation
#import spacy

import os

# removes html text/tags

def extract_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup using the built-in "html.parser"
    backend, and the result of ``get_text()`` on the parsed tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

#replace accented characters

def replace_accented_chars(text):
    """Fold accented characters in ``text`` down to plain ASCII.

    The string is NFKD-normalised so that accented letters are split into a
    base letter plus combining marks; encoding to ASCII with errors ignored
    then drops the marks (and any other non-ASCII character), and the bytes
    are decoded back into a ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8', 'ignore')

#split sentence into individual words

def tokenise_text(text):
    """Split ``text`` into a list of tokens.

    A fresh ``ToktokTokenizer`` is created on every call and its
    ``tokenize`` result is returned unchanged.
    """
    return ToktokTokenizer().tokenize(text)


# remove special characters and unnecessary punctuation

def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        The string to clean.
    remove_digits : bool, default False
        When true, digits are removed as well, leaving only ASCII letters
        and whitespace.

    Returns
    -------
    str
        ``text`` with all disallowed characters deleted (nothing is inserted
        in their place, so e.g. "world's" becomes "worlds").
    """
    # The upper-case range must be 'A-Z'. Writing 'A-z' also spans the ASCII
    # characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), which would then
    # be treated as allowed and survive the filter.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# stemming for NLP processing if required, but not needed for this use case
def simple_stemmer(text):
    """Return ``text`` with every whitespace-separated word stemmed.

    The text is split on whitespace, each word is passed through NLTK's
    Porter stemmer, and the stems are re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = [stemmer.stem(token) for token in text.split()]
    return ' '.join(stems)

# load the text file containing the text to be converted to a dictionary
def load_text_from_file(file_path, encoding=None):
    """Read a whole text file and return its contents as one string.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform's default encoding; pass e.g. ``"utf-8"`` to make the read
        independent of the machine's locale.

    Returns
    -------
    str
        The full contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # The context manager guarantees the handle is closed even if read() raises.
    with open(file_path, "r", encoding=encoding) as text_file:
        return text_file.read()

def sort_dictionary_df(df,ascend):
    """Return ``df`` sorted by its 'unique_words' column.

    Ascending order is used only when ``ascend`` compares equal to ``True``;
    any other value produces a descending sort. The sorted frame is returned
    as a new object; ``df`` itself is not modified.
    """
    ascending_order = True if ascend == True else False
    return df.sort_values(by='unique_words', ascending=ascending_order)
    


# lemmatisation for more advanced NLP. Not required for this use case
#nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
#def lemmatize_text(text):
#    text = nlp(text)
#    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
#    return text

In [4]:
# Read the raw input text; "test.txt" is resolved relative to the kernel's
# current working directory.
test_string = load_text_from_file("test.txt")

In [5]:
# Show the raw text exactly as loaded, before any cleaning is applied.
print(test_string)

William Shakespeare was an English playwright, poet and actor. He is widely regarded as the greatest writer in the English language and the world's greatest dramatist. He is often called England's national poet and the "Bard of Avon".


In [6]:
# Clean the raw text in stages: strip HTML markup, fold accented characters to
# ASCII, then delete punctuation and other special characters. A single pass of
# the character filter is sufficient -- running it again changes nothing.
test_string = extract_html(test_string)
test_string = replace_accented_chars(test_string)
test_string = remove_special_characters(test_string)

# Split the cleaned text into individual word tokens.
word_list = tokenise_text(test_string)

print(word_list)




['William', 'Shakespeare', 'was', 'an', 'English', 'playwright', 'poet', 'and', 'actor', 'He', 'is', 'widely', 'regarded', 'as', 'the', 'greatest', 'writer', 'in', 'the', 'English', 'language', 'and', 'the', 'worlds', 'greatest', 'dramatist', 'He', 'is', 'often', 'called', 'Englands', 'national', 'poet', 'and', 'the', 'Bard', 'of', 'Avon']


In [7]:
# Build the word-frequency table.
# The column is named at construction time so the frame has an 'all_words'
# column even when word_list is empty.
word_df = pd.DataFrame({'all_words': word_list})
grouped_words = word_df['all_words'].value_counts()

# Name the index and the counts column explicitly instead of renaming whatever
# labels value_counts()/reset_index() generate: those defaults differ between
# pandas versions, and a rename that misses silently leaves 'unique_words'
# undefined for the sort below.
final_dict = grouped_words.rename_axis('unique_words').reset_index(name='word_count')

print('')
print("dictionary ascending:")
print('')
print(sort_dictionary_df(final_dict,True))
print('')
print("dictionary descending:")
print('')
print(sort_dictionary_df(final_dict,False))




dictionary ascending:

   unique_words  word_count
27         Avon           1
14         Bard           1
11     Englands           1
3       English           2
5            He           2
19  Shakespeare           1
17      William           1
23        actor           1
25           an           1
1           and           3
20           as           1
7        called           1
9     dramatist           1
2      greatest           2
12           in           1
6            is           2
16     language           1
13     national           1
15           of           1
10        often           1
24   playwright           1
4          poet           2
21     regarded           1
0           the           4
26          was           1
22       widely           1
8        worlds           1
18       writer           1

dictionary descending:

   unique_words  word_count
18       writer           1
8        worlds           1
22       widely           1
26          was           1