# Rule 2: Never use a long word where a short word will do

We are going to take a loose interpretation of this. Instead of word length, we will use both the number of syllables and the order of where it appears in order in a frequency distribution of words. If we just did number of syllables, we would, for example, always replace the word `therefore` with `thus`, which is not in the spirit of the problem. 

So let's get cracking on this score!

In [1]:
import nltk
import pprint as pp
import spacy

from nltk.corpus import brown
from nltk.corpus import cmudict
from nltk.probability import FreqDist

In [2]:
nlp = spacy.load('en')
syllable_dict = cmudict.dict()

In [21]:
def brown_freqdist():
    #TODO: make this not just news
    brown_text = brown.words()
    dist = nltk.FreqDist([w.lower() for w in brown_text])
    return dist

def syllable_count(word, syllable_dict):
    if word in syllable_dict:
        return len(syllable_dict[word])
    # TODO
    # If it's not in the dictionary count the number of vowels and ignore an e at the end not
    # preceded by another vowel. It's rough, but there will be few cases if any cases in which
    # a word is not in the CMU dictionary but in Brown and WordNet
    return 1

def readability_for_word(word, brown_dist, syllable_dict):
    syllables = syllable_count(word, syllable_dict)
    brown_most_common = [w for (w, count) in brown_dist.most_common(100000)]
    brown_freq = len(brown_most_common)
    if word in brown_most_common:
        brown_freq = brown_most_common.index(word)
    return (word, syllables, brown_freq)
    
example_text = "What could the readability of this gargantutan document be?"
tokenized_text = nlp(example_text)
fd_brown = brown_freqdist()
scored_text = [readability_for_word(token.text.lower(), fd_brown, syllable_dict) for token in tokenized_text]
pp.pprint(scored_text)




[('what', 2, 62),
 ('could', 1, 75),
 ('the', 3, 0),
 ('readability', 1, 49815),
 ('of', 2, 3),
 ('this', 2, 25),
 ('gargantutan', 1, 49815),
 ('document', 2, 6943),
 ('be', 2, 20),
 ('?', 1, 27)]
