# Rule 2: Never use a long word where a short word will do

We are going to take a loose interpretation of this. Instead of word length, we will use both the number of syllables and the word's rank in a frequency distribution of words. If we used only the number of syllables, we would, for example, always replace the word `therefore` with `thus`, which is not in the spirit of the problem.

So let's get cracking on this score!

In [41]:
import math
import nltk
import pprint as pp
import re
import spacy
import sys
import time

from google_ngram_downloader import readline_google_store
from nltk.corpus import cmudict
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from nltk.wsd import lesk

In [4]:
# spaCy pipeline used below as nlp(text) to turn raw text into tokens that carry
# the .text, .is_alpha, .tag_ and .pos_ attributes the helpers in this notebook read.
# NOTE(review): the 'en' shortcut name presumably depends on the installed spaCy
# release -- confirm it still resolves to an English model.
nlp = spacy.load('en')
# CMU Pronouncing Dictionary as a plain mapping; syllable_count() looks words up
# in it and counts the phoneme entries that end in a digit.
syllable_dict = cmudict.dict()

In [5]:
# This code was directly copied from Rule #5
# We use it to minimize the words we need to check for because it's a computationally heavy task

def strip_non_words(tokenized_text):
    """Return the tokens whose is_alpha flag is set, in their original order.

    tokenized_text: an iterable of tokens exposing an `is_alpha` attribute.
    Tokens such as punctuation and numbers, whose flag is false, are dropped.
    """
    # Test the flag directly rather than comparing it with `== True`
    return [token for token in tokenized_text if token.is_alpha]

def strip_proper_nouns(tokenized_text):
    """Return the tokens that are not tagged as proper nouns, in order.

    tokenized_text: an iterable of tokens exposing a `tag_` attribute.
    A token is dropped when its tag is 'NNP' or 'NNPS'; every other token
    is kept unchanged.
    """
    proper_noun_tags = ('NNP', 'NNPS')
    kept = []
    for token in tokenized_text:
        if token.tag_ in proper_noun_tags:
            continue
        kept.append(token)
    return kept

def strip_most_common_words(tokenized_text, n_most_common=10000):
    """Return the tokens whose lower-cased text is not a very common word.

    tokenized_text: an iterable of tokens exposing a `text` attribute.
    n_most_common: how many lines to read from the start of the word list.
        The list is presumably ordered most-common-first, one word per
        line -- verify against the file itself.

    The word list is located relative to sys.path[1], so the result depends
    on how the interpreter was started, and the file is re-read on every call.
    """
    google_most_common_words_path = sys.path[1] + '/../Texts/google-10000-english-usa.txt'
    with open(google_most_common_words_path, 'r') as f:
        # zip() against range() stops after n_most_common lines, or earlier at
        # end of file. A set makes each membership test below constant time
        # instead of a scan over up to n_most_common entries.
        most_common_words = {line.strip() for _, line in zip(range(n_most_common), f)}
    # Remove anything in the n most common words
    return [token for token in tokenized_text if token.text.lower() not in most_common_words]

def strip_non_jargon_words(tokenized_text):
    """Reduce a token sequence to the words worth scoring.

    Three filters are chained, innermost first: proper nouns are removed,
    then tokens that are not alphabetic, then the most common words (using
    strip_most_common_words' default cut-off).
    """
    return strip_most_common_words(
        strip_non_words(
            strip_proper_nouns(tokenized_text)
        )
    )



In [6]:
# Total number of tokens in the corpus. find_frequency_score() divides a word's
# unigram count by this to get its share of all tokens.
# This number comes from Google's blog
# https://research.googleblog.com/2006/08/all-our-n-gram-are-belong-to-you.html
# TODO: If there's time, confirm this number
# NOTE(review): the unigram counts are read through google_ngram_downloader,
# which may serve a different corpus from the one the blog post describes --
# confirm this total matches the data actually being queried.
NGRAM_TOKEN_COUNT = 1024908267229

# Shout out to Quora for this snippet of code
# https://www.quora.com/Is-there-any-Google-Ngram-API-for-Python

def find_google_ngrams_word_count(word, time_function=False, verbose=False):
    """Total the 1-gram match counts recorded for `word` from 1950 onward.

    Records are taken from readline_google_store, asked for the 1-gram index
    named by the word's first character. They are read in order until the
    first record for `word`; the consecutive run of records for `word` that
    follows is then summed.

    word: the exact n-gram to look for. It must be non-empty (its first
        character selects the index) and is matched as given, so
        capitalization matters.
    time_function: when true, print how many whole seconds the lookup took.
    verbose: when true, print each record that is counted, plus a sparse
        sample of the records that are skipped on the way.

    Returns the summed match_count on top of a floor of 2, so a word with no
    records still yields a positive count; find_frequency_score() divides
    this value and takes its logarithm.
    """
    # Only start the clock when timing was requested; the report at the end is
    # guarded by the same flag so an untimed call never touches `start`.
    start = time.time() if time_function else None

    count = 2  # Floor, so callers never see a zero count
    # TODO: Consider how we want to deal with capitalization
    _, _, records = next(readline_google_store(ngram_len=1, indices=word[0]))
    # If we use the verbose settings, occasionally print out the record
    verbosity_count = 1000000000
    earliest_year = 1950
    skipped = 0
    try:
        record = next(records)
        # Skip forward to the first record for this word
        while record.ngram != word:
            record = next(records)
            if verbose and skipped % verbosity_count == 0:
                print(record)
            skipped += 1
        # Add up the run of records for this word, ignoring early years
        while record.ngram == word:
            if record.year >= earliest_year:
                count += record.match_count
                if verbose:
                    print(record)
            record = next(records)
    except StopIteration:
        # Ran out of records: either the word never appeared or its run was
        # the last thing in the index. Either way `count` is final.
        pass
    if time_function:
        elapsed = time.time() - start
        print('Total seconds for ' + word + ': ' + str(int(elapsed)))
    return count

def find_frequency_score(word):
    """Map a word's unigram count onto a log-scaled relative frequency.

    The word's share of NGRAM_TOKEN_COUNT is taken on a log scale and
    rescaled against the log of a single occurrence: a share of
    1/NGRAM_TOKEN_COUNT maps to 0 and a share of 1 maps to 1, so more
    frequent words score higher. The result is rounded to 5 decimal places.

    The count lookup is always run with timing output switched on.
    """
    occurrences = find_google_ngrams_word_count(word, time_function=True)
    # Work in log space to keep the numbers manageable
    log_share = math.log(occurrences/NGRAM_TOKEN_COUNT)
    # Log share of a word seen exactly once: the bottom of the scale
    log_floor = math.log(1/NGRAM_TOKEN_COUNT)
    return round((log_share - log_floor)/(-log_floor), 5)

In [132]:
# Score that readability_for_word() returns when it is handed None instead of a
# word. Being very large, it never compares as lower than a real word's score.
BIG_NUMBER = 18109831

def syllable_count(word, syllable_dict):
    """Count the syllables in a word or a whitespace-separated phrase.

    word: the text to measure. It is split on whitespace and the parts are
        counted separately and summed. Lookups are case-sensitive, so pass
        the text in the same case as the dictionary's keys.
    syllable_dict: mapping from a word to a list of pronunciations, each
        pronunciation being a sequence of phoneme strings.

    Returns the total, never less than 1.
    """
    total = 0
    for part in word.split():
        if part in syllable_dict:
            # Shout out to StackOverflow for this snippet of code
            # http://stackoverflow.com/a/4103234/1031615
            # Phonemes ending in a digit are the ones counted as syllables.
            # Only the first listed pronunciation is used.
            first_pronunciation = syllable_dict[part][0]
            total += sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
            continue
        # Not in the dictionary: drop one trailing 'e', then count the
        # remaining a/e/i/o/u characters. It's rough, but there will be few
        # cases, if any, in which a word is in WordNet but not in the CMU
        # dictionary.
        stem = part[:-1] if part.endswith('e') else part
        total += len(re.findall(r'[aeiou]', stem))
    return max(total, 1)


def readability_for_word(word, syllable_dict, use_ngrams = False):
    """Score how hard a word is to read; lower means easier.

    word: the word or phrase to score, or None when there is no candidate.
    syllable_dict: pronunciation mapping handed through to syllable_count().
    use_ngrams: when true, scale the syllable count by the word's
        frequency score (this triggers an n-gram lookup).

    Returns BIG_NUMBER for None, otherwise the syllable count of the
    lower-cased word, optionally multiplied by find_frequency_score().
    """
    if word is None:
        return BIG_NUMBER
    lowered = word.lower()
    syllables = syllable_count(lowered, syllable_dict)
    if not use_ngrams:
        return syllables
    # NOTE(review): find_frequency_score() grows with how common a word is, so
    # this product makes commoner words score as harder -- confirm that is the
    # intended direction.
    return syllables * find_frequency_score(lowered)
   
    
# Worked example: tokenize two short sentences, drop the tokens that are not
# worth scoring, and print a syllable-based score for each one that is left.
example_text = "Sometimes she wore glasses. Other times, she wore soliloquy bifocals. "
tokenized_text = nlp(example_text)
# Sanity check on the tagger: show the coarse part of speech of the third token
print(tokenized_text[2].pos_)
# Strip out common words and proper nouns for optimization's sake
stripped_text = strip_non_jargon_words(tokenized_text)
pp.pprint(stripped_text)
# use_ngrams is left at its default, so each score is just the syllable count
scored_text = [readability_for_word(token.text.lower(), syllable_dict) for token in stripped_text if token.is_alpha]
pp.pprint(scored_text)



VERB
[wore, wore, soliloquy, bifocals]
[1, 1, 4, 3]


Awesome. Now let's bust some synsets up in here!

In [128]:
def synsets_for_tokens_in_tokenized_sentence(tokenized_sentence):
    """Suggest one alternative word or phrase for every token in a sentence.

    tokenized_sentence: a sequence of tokens exposing `text` and `pos_`.

    For each token, lesk() is asked for a synset given the whole sentence
    and the token's WordNet part of speech. When it returns one, its first
    hypernym is preferred if it has any. The chosen synset is reduced to the
    part of its name before the first '.', with underscores turned into spaces.

    Returns a list aligned with the input: a string per token, or None where
    lesk() returned nothing.
    """
    sentence = [token.text for token in tokenized_sentence]
    alternatives = []
    for token in tokenized_sentence:
        synset = lesk(sentence, token.text, spacy_to_wordnet_pos(token.pos_))
        if synset is not None:
            # Get the hypernym of the word. Truthiness is used for the
            # emptiness check; comparing a length with `is not 0` tests
            # object identity, not value.
            hypernyms = synset.hypernyms()
            if hypernyms:
                synset = hypernyms[0]
            synset = synset.name().split('.')[0].replace('_', ' ')
        alternatives.append(synset)
    return alternatives

def spacy_to_wordnet_pos(pos):
    """Translate a coarse part-of-speech tag into WordNet's constant for it.

    'ADJ', 'ADV', 'NOUN' and 'VERB' map to the attribute of the same name on
    `wn`; any other tag has no counterpart here and yields None.
    """
    # To see all the parts of speech spaCy uses, see the link below
    # http://polyglot.readthedocs.io/en/latest/POS.html
    if pos in ('ADJ', 'ADV', 'NOUN', 'VERB'):
        # The WordNet constant carries the same name as the tag itself
        return getattr(wn, pos)
    return None

def replaceable_word_in_tokenized_sentence(tokenized_sentence):
    """Pair each token's text with a simpler alternative when there is one.

    Returns a list of (word, alternative) tuples, one per token and in token
    order. If the word can be replaced, the second value is the replacing
    word; if it cannot be replaced, it is None. A word counts as replaceable
    when its readability score is strictly greater than that of the
    candidate from synsets_for_tokens_in_tokenized_sentence().
    """
    words = [token.text for token in tokenized_sentence]
    candidates = synsets_for_tokens_in_tokenized_sentence(tokenized_sentence)
    return [
        (
            word,
            candidate
            if readability_for_word(word, syllable_dict) > readability_for_word(candidate, syllable_dict)
            else None,
        )
        for word, candidate in zip(words, candidates)
    ]

# Run the replacement check over the example sentence tokenized above; as the
# last expression in the cell, its list of (word, alternative) pairs is shown.
replaceable_word_in_tokenized_sentence(tokenized_text)

optical instrument not in syllable dict
. not in syllable dict
time period not in syllable dict
, not in syllable dict
actor's line not in syllable dict
4
actor's line not in syllable dict
3
. not in syllable dict


[('Sometimes', None),
 ('she', None),
 ('wore', None),
 ('glasses', None),
 ('.', None),
 ('Other', None),
 ('times', None),
 (',', None),
 ('she', None),
 ('wore', None),
 ('soliloquy', "actor's line"),
 ('bifocals', None),
 ('.', None)]