# Rule 2: Never use a long word where a short word will do

We are going to take a loose interpretation of this. Instead of word length, we will use both the number of syllables and the word's rank in a frequency distribution of words. If we used only the number of syllables, we would, for example, always replace the word `therefore` with `thus`, which is not in the spirit of the rule. 

So let's get cracking on this score!

In [1]:
import math
import nltk
import pprint as pp
import spacy
import sys
import time

from google_ngram_downloader import readline_google_store
from nltk.corpus import cmudict
from nltk.probability import FreqDist

In [2]:
# spaCy English pipeline; used further down to tokenize and tag the example text
# NOTE(review): the 'en' shortcut name is not accepted by newer spaCy releases -- confirm against the installed version
nlp = spacy.load('en')
# CMU pronouncing dictionary from nltk; passed to syllable_count as the lookup table
syllable_dict = cmudict.dict()

In [3]:
# This code was directly copied from Rule #5
# We use it to minimize the words we need to check for because it's a computationally heavy task

def strip_non_words(tokenized_text):
    """Keep only the tokens flagged as alphabetic.

    Args:
        tokenized_text: Iterable of token objects exposing an ``is_alpha``
            attribute (spaCy tokens in this notebook).

    Returns:
        A list of the tokens whose ``is_alpha`` is truthy, in input order.
    """
    # Rely on truthiness rather than comparing the flag against True
    return [token for token in tokenized_text if token.is_alpha]

# Filters a bag of words down to everything that is not tagged as a proper noun
def strip_proper_nouns(tokenized_text):
    """Return the tokens whose ``tag_`` is neither 'NNP' nor 'NNPS', in input order."""
    proper_noun_tags = ('NNP', 'NNPS')
    kept = []
    for token in tokenized_text:
        if token.tag_ in proper_noun_tags:
            continue
        kept.append(token)
    return kept

# Takes in a bag of words and removes any of them that are in the top n most common words
def strip_most_common_words(tokenized_text, n_most_common=10000):
    """Drop tokens whose lowercased text is among the n most common words.

    The word list is read from ``google-10000-english-usa.txt``, located
    relative to ``sys.path[1]``. Each stripped line of that file is treated as
    one word, and only the first ``n_most_common`` lines are used.

    Args:
        tokenized_text: Iterable of token objects exposing a ``text`` string.
        n_most_common: How many lines from the top of the word list to use.

    Returns:
        A list of the tokens whose ``text.lower()`` is not in the word list,
        in input order.
    """
    google_most_common_words_path = sys.path[1] + '/../Texts/google-10000-english-usa.txt'
    with open(google_most_common_words_path, 'r') as f:
        # zip() stops after n_most_common lines or at end of file, whichever
        # comes first, so a short file no longer pads the collection with ''.
        # A set gives constant-time membership tests for the filter below.
        most_common_words = {line.strip() for _, line in zip(range(n_most_common), f)}
    # Remove anything in the n most common words
    return [token for token in tokenized_text if token.text.lower() not in most_common_words]

def strip_non_jargon_words(tokenized_text):
    """Run the three filters in sequence: proper nouns, non-alphabetic tokens, then most common words."""
    candidates = strip_proper_nouns(tokenized_text)
    candidates = strip_non_words(candidates)
    return strip_most_common_words(candidates)



In [14]:
# Total token count of the corpus; find_frequency_score divides a word's match
# count by this to get its share of all tokens.
# This number comes from Google's blog
# https://research.googleblog.com/2006/08/all-our-n-gram-are-belong-to-you.html

# TODO: If there's time, confirm this number
# NOTE(review): the linked post appears to describe Google's web n-gram corpus, while the
# match counts in this notebook come from readline_google_store -- confirm both refer to the same corpus
NGRAM_TOKEN_COUNT = 1024908267229

# Shout out to Quora for this snippet of code
# https://www.quora.com/Is-there-any-Google-Ngram-API-for-Python

def find_google_ngrams_word_count(word, time_function=False, verbose=False):
    """Sum the match counts of the records for ``word`` in the Google 1-gram store.

    Pulls records from the first file yielded by
    ``readline_google_store(ngram_len=1, indices=word[0])``, skips ahead to the
    first record whose ``ngram`` equals ``word`` exactly, then adds up
    ``match_count`` over the contiguous run of matching records.

    Args:
        word: The unigram to look up. Its first character selects the index
            to read, so it must be non-empty.
        time_function: When truthy, print the elapsed whole seconds for the
            lookup.
        verbose: When truthy, print a record every 10,000,000 records while
            scanning, and print every matching record.

    Returns:
        2 plus the summed ``match_count`` of the matching records; exactly 2
        when the word is never found.
    """
    start_time = time.time() if time_function else None

    # Baseline of 2 so callers never receive a zero count (they take a log of it)
    count = 2
    # TODO: Consider how we want to deal with capitalization
    fname, url, records = next(readline_google_store(ngram_len=1, indices=word[0]))
    # If we use the verbose settings, occasionally print out the record
    verbosity_count = 10000000
    i = 0
    try:
        # Phase 1: scan forward until the first record for this word
        record = next(records)
        while record.ngram != word:
            record = next(records)
            if verbose and i % verbosity_count == 0:
                print(record)
            i += 1
        # Phase 2: accumulate over the contiguous run of records for this word
        while record.ngram == word:
            count += record.match_count
            if verbose:
                print(record)
            record = next(records)
    except StopIteration:
        # Ran out of records: either the word was absent or its run reached the end
        pass
    # Only report timing when it was requested; the start time does not exist otherwise
    if time_function:
        elapsed_seconds = int(time.time() - start_time)
        print('Total seconds for ' + word + ': ' + str(elapsed_seconds))
    return count

def find_frequency_score(word):
    """Score a word's corpus frequency on a log scale, rounded to 5 decimal places.

    The word's count is divided by NGRAM_TOKEN_COUNT and logged, then rescaled
    against the log share a single occurrence would have.
    """
    occurrences = find_google_ngrams_word_count(word, time_function=True)
    # Work in log space to keep the numbers manageable
    log_share = math.log(occurrences/NGRAM_TOKEN_COUNT)
    # Log share of a single occurrence: the bottom of the scale
    log_floor = math.log(1/NGRAM_TOKEN_COUNT)
    return round(((log_share - log_floor)/(-log_floor)), 5)

In [16]:
def syllable_count(word, syllable_dict):
    """Return the number of syllables in ``word`` from a CMU-style pronouncing dictionary.

    Args:
        word: The key to look up; the lookup is exact.
        syllable_dict: Mapping of word -> list of pronunciations, where each
            pronunciation is a list of phoneme strings and vowel phonemes end
            in a stress digit (the shape returned by nltk's ``cmudict.dict()``).

    Returns:
        The number of stress-marked (vowel) phonemes in the word's first
        listed pronunciation, floored at 1. Returns 1 when the word is not in
        the dictionary.
    """
    pronunciations = syllable_dict.get(word)
    if pronunciations:
        # The dictionary value is a list of alternative pronunciations, so its
        # length is NOT the syllable count. One syllable per vowel phoneme,
        # and vowel phonemes are the ones carrying a trailing stress digit.
        vowel_phonemes = [p for p in pronunciations[0] if p[-1:].isdigit()]
        return max(1, len(vowel_phonemes))
    # TODO
    # If it's not in the dictionary count the number of vowels and ignore an e at the end not
    # preceded by another vowel. It's rough, but there will be few cases if any cases in which
    # a word is not in the CMU dictionary but in WordNet
    return 1

def readability_for_word(word, syllable_dict):
    """Bundle a word with its syllable count and frequency score as a 3-tuple."""
    return (
        word,
        syllable_count(word, syllable_dict),
        find_frequency_score(word),
    )
    
# Demo: score the uncommon words of a two-sentence example
example_text = "Sometimes she wears glasses. Other times, she wears spectacles. "
# Run the spaCy pipeline to get tagged tokens
tokenized_text = nlp(example_text)
# Strip out common words and proper nouns for optimization's sake
stripped_text = strip_non_jargon_words(tokenized_text)
pp.pprint(stripped_text)
# Each entry is (word, syllables, frequency score); every word triggers its own n-gram lookup.
# The is_alpha filter repeats a check strip_non_jargon_words already applied.
scored_text = [readability_for_word(token.text.lower(), syllable_dict) for token in stripped_text if token.is_alpha]
pp.pprint(scored_text)

[wears, wears, spectacles]
Record(ngram="W'ald_NOUN", year=1899, match_count=1, volume_count=1)
Record(ngram='Wandsbek_NOUN', year=1983, match_count=28, volume_count=14)
Record(ngram='WELBY', year=1919, match_count=1, volume_count=1)
Record(ngram='wears', year=1520, match_count=1, volume_count=1)
Record(ngram='wears', year=1575, match_count=1, volume_count=1)
Record(ngram='wears', year=1581, match_count=2, volume_count=2)
Record(ngram='wears', year=1587, match_count=2, volume_count=2)
Record(ngram='wears', year=1590, match_count=8, volume_count=2)
Record(ngram='wears', year=1620, match_count=3, volume_count=1)
Record(ngram='wears', year=1637, match_count=3, volume_count=1)
Record(ngram='wears', year=1643, match_count=1, volume_count=1)
Record(ngram='wears', year=1645, match_count=1, volume_count=1)
Record(ngram='wears', year=1647, match_count=1, volume_count=1)
Record(ngram='wears', year=1650, match_count=1, volume_count=1)
Record(ngram='wears', year=1651, match_count=1, volume_count=1

In [None]:
# WARNING: THIS CODE TAKES FOREVER TO RUN
# import string

# a_through_z = string.ascii_lowercase
# count = 0
# fname, url, records = next(readline_google_store(ngram_len=1, indices=a_through_z))

# i = 0
# try:
#     while True:
#         record = next(records)
#         count += record.match_count
#         if i%10000000 == 0:
#             print(record)
#             print('Currently at: ' + str(count))
#         i += 1
# except StopIteration:
#     pass

# print('Count is ' + str(count))
