# Glossary Generator

In [None]:
import string
import textract
from collections import Counter
from string import punctuation
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import spacy

def remove_punctuation(from_text):
    """Return the items of ``from_text`` with ASCII punctuation removed.

    Every character listed in ``string.punctuation`` is deleted from each
    item. The result is a new list with the same length and order as the
    input; an item made up only of punctuation becomes an empty string.
    """
    # Translation table mapping each punctuation code point to None (delete).
    deletions = {ord(mark): None for mark in string.punctuation}
    return [item.translate(deletions) for item in from_text]

In [None]:
# Extract the book's text, split it into sentences, then split every
# sentence into whitespace-separated words.
filename = 'Stories of Your Life and Others by Ted Chiang.txt'
byte = textract.process(filename)  # extracted text as raw bytes
text = str(byte, "utf-8")
tokenized_text = sent_tokenize(text)  # one string per sentence
tokens = list(map(str.split, tokenized_text))  # one word list per sentence

In [None]:
# Load spaCy's small English pipeline ('en_core_web_sm'). `nlp` is the
# callable applied to each sentence further down to obtain token lemmas.
# NOTE(review): assumes the model package is already installed in this
# environment -- confirm before running on a fresh machine.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Extract the lemma of every token, one list of lemmas per sentence.
# Each sentence's words are re-joined with single spaces before parsing.
# nlp.pipe processes the sentences as a batched stream and yields the docs in
# input order, so the result matches calling nlp() sentence by sentence while
# avoiding the per-call overhead.
sentence_docs = nlp.pipe(' '.join(s) for s in tokens)
lemmas = [
    # Punctuation is stripped from each lemma; punctuation-only lemmas
    # become empty strings.
    remove_punctuation([token.lemma_ for token in doc])
    for doc in sentence_docs
]

In [None]:
# Drop stopwords such as 'the' and 'i', plus every token that is not purely
# alphanumeric (this also removes the empty strings left behind when a
# punctuation-only lemma is stripped).
sw = (stopwords.words('english'))
# Membership is tested once per token: a set makes each test O(1) instead of
# rescanning the whole stopword list every time.
stopword_set = set(sw)
words = [
    token
    for sentence in lemmas
    for token in sentence
    if token.lower() not in stopword_set and token.isalnum()
]

# Number of occurrences of each remaining lemma in the text.
word_count = Counter(words)

In [None]:
# Order the distinct words by ascending occurrence count, so the words that
# appear least often in the text come first. Ties keep their first-seen order.
sorted_words = sorted(word_count.keys(), key=lambda word: word_count[word])

In [None]:
# Cut-off into the reference word list ('common30k.txt'): entries before
# roughly this position are skipped and never appear in the glossary.
# NOTE(review): presumably the list is ordered most-common first, so the
# skipped entries are the words a reader already knows -- confirm.
# Adjust to the reader's own vocabulary: a larger value skips more of the
# list and leaves fewer candidate words.
discard = 10000

In [None]:
import os

# Load the reference word list as a flat list of whitespace-separated words.
# The handle is not called `dict`, which would shadow the builtin, and the
# file is closed as soon as it has been read.
with open('common30k.txt') as dict_file:
    dict_words = dict_file.read().split()

# Skip exactly the first `discard` entries (the previous `discard + 1` slice
# dropped one word more than documented).
less_common_dict_words = dict_words[discard:]

# Use a set for the membership test so the filter is linear rather than
# rescanning the remaining list for every candidate word.
less_common_lookup = set(less_common_dict_words)
new_words = [word for word in sorted_words if word in less_common_lookup]
print("There are", len(new_words), "potential new words")

# Strip only the final extension: splitting on the first '.' would truncate
# names that contain other dots (e.g. 'Dr. No.txt' or './books/x.txt').
glossary = os.path.splitext(filename)[0] + '_glossary.txt'
with open(glossary, 'w') as output:
    # One word per line, rarest words in the text first.
    output.write('\n'.join(new_words))
print("Wrote to", glossary)