# Vocabulary Extractor

Takes a book in English, finds the words that are "rare" (by the measure IDF >= 15 or -ln(frequency) > 14) and lists the definition in a dictionary and the places it occurs in the original text. 

Who is this for?
 - A casual reader of a book
 - A teacher trying to teach a book
 - Students trying to learn vocabulary for a standardized test

Features:
 - Tokenizes the text and words with NLTK to deal with punctuation
 - Removes proper nouns and character names 
 - Has a lot of custom-built lemmatization (WordNetLemmatizer proved insufficient) to get the root word in order to correctly evaluate rarity AND look up the correct root word in the dictionary
 - Can easily sort the result by order it appears in the book or overall frequency of the word in the book.
 - Often dictionary definitions have words that are also hard, so we do a secondary extraction of word meanings in the dictionary definition to make it easy to understand.

Could improve:
 - Doesn't do much for words transliterated from other languages - marks them as vocab words with no dictionary definition.
 
Numbers:
 - Merriam Webster dictionary has 100k words (102,217)
 - Word frequency list (en_full) has 1.5m words (1560428) with a total count of ~710m (709588976)
 - Word frequency list (enwiki) has 1.8m words (1857808) (min count 3) with a totl count of 1.9B (1926212329)
 - Anecdotally, around 10-15% of unique words of a book will be a vocab word if 15 threshold, 5-10% if 16 threshold.
 - Anecdotally, around 70% of vocab words have definitions but this number will vary widely. 
 
 
TODO:
  - Lemmatize before doing secondary lookup of dictionary definition words e.g. "tergiversates"
  - Make sure non dict words are written to the vocab   
  - Aggregate words which lemmatize to the same word
  - Make plugin definition collapsible



In [378]:
# BOOK = 'namesake/namesake.txt'
# BOOK = 'midnightschildren/midnightschildren.txt'
BOOK = 'shantaram/shantaram.txt'

In [379]:
import csv
import json
import nltk
import string
import random
import math
import re
import os
from pprint import pprint
from copy import deepcopy
from collections import defaultdict, Counter, namedtuple, OrderedDict
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /Users/deedy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/deedy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [380]:
def read_to_lines(fname) -> str:
    with open(fname, 'r') as f:
        data = f.read()
    lines = [d for d in data.split('\n') if len(d)]
    return lines

In [381]:
lines = read_to_lines(BOOK)
len(lines)

7944

# Tokenize

Tokenize text and grab `word_count` to distil proper nouns. 

In [382]:
blacklist = set([p for p in string.punctuation])
blacklist = blacklist.union(['``', "'s", "''", "'d", "n't", '’', '‘', '…'])
stopwords = nltk.corpus.stopwords.words('english')
WORDS_PER_PAGE = 350
WORDS_PER_MINUTE_READING = 250
Stats = namedtuple('Stats', [
    'num_lines',
    'num_sentences',
    'num_nonstop_words',
    'num_words',
    'num_chars',
    'uniq_words',
    'uniq_nonstop_words',
    'pages',
    'time_to_read',
    'sentences_per_line',
    'words_per_sentence',
    'char_per_word'
])
# Returns:
# - `sentences`: Tokenize into list of sentences, each containing a list of words
# - `word_counts`: Get all word counts (preserving caps) in a Counter
# - `stats`: a Stats type containing basic stats about the book
def tokenize(lines, ignore_punctuation=True):
    num_chars, num_sentences, num_words, num_nonstop_words = 0, 0, 0, 0
    sentences = []
    word_counts = Counter()
    for l in lines:
        num_chars += len(l)
        for s in nltk.tokenize.sent_tokenize(l):
            sen = []
            for w in nltk.tokenize.word_tokenize(s):
                if ignore_punctuation and not w in blacklist:
                    # Someitmes strings like ‘Your aim is off.’ don't tokenize correct.
                    if w.endswith('.'):
                        w = w[:w.index('.')]
                    if w.endswith('…'):
                        w = w[:w.index('…')]
                sen.append(w)
                if w.lower() in stopwords:
                    num_nonstop_words += 1
            sentences.append(sen)
            word_counts.update(sen)
            num_words += len(sen)
    num_sentences = len(sentences)
    uniq_words_set = set([x.lower() for x in word_counts.keys()])
    uniq_nonstop_words = len([x for x in uniq_words_set if not x in stopwords])
    return sentences, word_counts, Stats(
        num_lines=len(lines),
        num_sentences=num_sentences,
        uniq_words=len(uniq_words_set),
        uniq_nonstop_words=uniq_nonstop_words,
        num_nonstop_words=num_nonstop_words,
        num_words=num_words,
        num_chars=num_chars,
        pages=f'{int(num_words / WORDS_PER_PAGE)} pages',
        time_to_read=f'{int(num_words / WORDS_PER_MINUTE_READING)/60:0.2f} hours',
        sentences_per_line=f'{num_sentences/len(lines):0.2f}',
        words_per_sentence=f'{num_words/num_sentences:0.2f}',
        char_per_word=f'{num_chars/num_words:0.2f}',
    )
sentences, word_counts, stats = tokenize(lines)
len(sentences), len(word_counts), stats

(28675,
 21098,
 Stats(num_lines=7944, num_sentences=28675, num_nonstop_words=208270, num_words=476454, num_chars=2091797, uniq_words=19466, uniq_nonstop_words=19315, pages='1361 pages', time_to_read='31.75 hours', sentences_per_line='3.61', words_per_sentence='16.62', char_per_word='4.39'))

# Proper Nouns

Words that are frequently capitalized are naively marked proper nouns. 
There may be false positives we don't care about like "God", "I", "Mr or words which frequntly start sentences like "Oh".

In [383]:
propernouns = {}
for item in word_counts.most_common():
    word = item[0]
    # Word is capitalized and word appears capitalized 20% of the time
    # Question - how to get bigrams and separate proper nouns (Shobhan Mahmoud, Mahmoud Melbaaf)
    if (word.lower() != word) and (word_counts[word.lower()]/word_counts[word]<0.2):
        propernouns[word.lower()] = item[1]
propernoun_count = Counter(propernouns)
pprint(propernoun_count.most_common())

[('i', 11328),
 ('karla', 422),
 ('bombay', 310),
 ('khaderbhai', 273),
 ('khaled', 223),
 ('vikram', 185),
 ('johnny', 174),
 ('oh', 166),
 ('india', 154),
 ('ulla', 147),
 ('ghani', 146),
 ('indian', 145),
 ('english', 138),
 ('lisa', 136),
 ('salman', 131),
 ('god', 127),
 ('maurizio', 121),
 ('hindi', 112),
 ('ali', 112),
 ('qasim', 105),
 ('modena', 103),
 ('sapna', 103),
 ('yeah', 102),
 ('abdul', 101),
 ('mahmoud', 99),
 ('lettie', 95),
 ('habib', 90),
 ('anand', 89),
 ('afghan', 87),
 ('sanjay', 87),
 ('mr', 83),
 ('marathi', 83),
 ('joseph', 78),
 ('cigar', 75),
 ('american', 73),
 ('afghanistan', 71),
 ('ahmed', 70),
 ('linbaba', 68),
 ('tariq', 66),
 ('prabu', 62),
 ('kano', 58),
 ('colaba', 56),
 ('chuha', 56),
 ('pakistan', 55),
 ('farid', 54),
 ('palace', 53),
 ('kavita', 52),
 ('madjid', 51),
 ('russians', 51),
 ('jeetendra', 47),
 ('kandahar', 46),
 ('german', 45),
 ('british', 44),
 ('chapter', 42),
 ('arthur', 41),
 ('russian', 41),
 ('kishan', 39),
 ('rajan', 37),
 (

 ('harshan', 1),
 ('bichchu', 1),
 ('boy—nazeer', 1),
 ('mahmoud—', 1),
 ('maijata', 1),
 ('house—abdullah', 1),
 ('sardar', 1),
 ('malum', 1),
 ('sovereign', 1),
 ('nazeer—the', 1),
 ('no—it', 1),
 ('nothing—sanjay', 1),
 ('lanka—you', 1),
 ('fighting—tamil', 1),
 ('sinhalese', 1),
 ('others—tamil', 1),
 ('muslims—with', 1),
 ('lake—killer', 1),
 ('ooooh', 1),
 ('delhi—well', 1),
 ('idriss—khader', 1),
 ('ramesh—', 1),
 ('pieta', 1),
 ('michelangelo', 1),
 ('good—lisa', 1),
 ('now—i', 1),
 ('know—i', 1),
 ('enemy—sapna—and', 1),
 ('idriss—because', 1),
 ('mukesh', 1),
 ('bilkulfit', 1),
 ('achcha', 1),
 ('shantaram-uncle', 1)]


# Read word frequencies

Read `word_freq_count` and `total_count` from the word frequency file. 

In [443]:
# All lower-case
# https://github.com/hermitdave/FrequencyWords
# Data comes from OpenSubtitles - works better than Wikipedia. 
WORD_FREQ = 'en_full.txt'
# https://github.com/IlyaSemenov/wikipedia-word-frequency/tree/master/results
# Data comes from Wikipedia.
# WORD_FREQ = 'enwiki-20190320-words-frequency.txt'
def isascii(s):
    """Check if the characters in string s are in ASCII, U+0-U+7F."""
    return len(s) == len(s.encode())

word_freq = {}
for l in read_to_lines(WORD_FREQ):
    lsplit = l.split(' ')
    word = lsplit[0]
    if isascii(word):
        # if not '-' in word:
            if "\'" in word:
                 word = word[0:word.index("\'")]
            # sometime apostrophe splitting overwrites previous more popular words (list is sorted)
            if not word in word_freq:
                word_freq[word] = int(lsplit[1])
word_freq_count = Counter(word_freq)
total_count = sum(word_freq_count.values())

total_count, len(word_freq), word_freq['the'], word_freq['mustn']

(709588976, 1560428, 22761659, 17276)

# Read dictionary

Read the dictionary of word definitions (english) into `en_dict`.
Other choices might be better - Merriam-Webster doesn't have "reminisce"?

In [385]:
# https://github.com/topics/merriam-webster
# Merriam Webster
DICT_FILE = 'dictionary_compact.json'
en_dict = None
with open(DICT_FILE, 'r') as f:
    en_dict = json.loads(f.read())
len(en_dict)

102217

# Find the rare words

Compute the IDF of all words after trying to lemmatize correctly to find the root word and return
`book_word_idfs` - a dictionary of word to a tuple of 
 - original word
 - the version of the word which yielded the lowest IDF
 - the version of the word present in the dicrionary
 - the count of the word in the text
 - the count of the word in the frequency corpus
 - and the computed IDF
and
`book_nondict_words` - a dictionary of words that are not proper nouns that don't appear in the dictionary to the count in the book.


We then filter out common words (IDF < 15).

In [444]:
# needs word_freq_count, total_count and propernoun_count from global dict 
# Given `word_count`, a Counter of most common words in the document, get_top_words filters out unimportant words
# and returns an OrderedDict of important words sorted by inverse IDF (based on the en-wiki corpus vars, word_freq_count, total_count)
# and a `non_dict_words`, words which don't exist in en-wiki at all. 
# An important word: 
# - does not contain proper nouns
# - does not contain numbers
# - is at least 3 characters
# - exists in the en-wiki dictionary

# returns -1 if non-dict and -2 if not applicable, and positve idf otherwise

ignore_caps = True
def idf(word_freq_count, total_count, word):
    # names of places and things
    if word in propernoun_count:
        return -2
    # years, etc
    if not word.isalpha():
        return -2
    # a, e, i, o, u
    if len(word) < 3:
        return -2
    if ignore_caps and word[0] != word[0].lower():
        return -2
    if word.lower() not in word_freq_count:
        if not word.lower() in en_dict:
             return -1
        word_freq = 1
    word_freq = word_freq_count[word.lower()]
    return math.log((1 + total_count)/(1 + word_freq)) + 1

suffixes = ['ing', 'ly', 'less', 'ally', 'ed', 'ling', 's', 'es', 'ness', 'er', 'est', 'able', 'ful', 'ant', 'en']
prefixes = ['semi', 'in', 'anti', 'de', 'un', 'dis', 're', 'mis']
# manual lemmatization
def gen_candidates(word):
    candidates = [word]
    if word.endswith('in'):
        candidates.append(word+'g')
    if word.endswith('ive'):
        candidates.append(word[:-3]+'e')
    if word.endswith('ier'):
        candidates.append(word[:-3]+'y')
    if word.endswith('ies'):
        candidates.append(word[:-3]+'y')
    if word.endswith('ied'):
        candidates.append(word[:-3]+'y')
    if word.endswith('ible'):
        candidates.append(word[:-4]+'e')
    if word.endswith('ent'):
        candidates.append(word[:-3]+'ence')
    if word.endswith('ence'):
        candidates.append(word[:-3]+'ent')
    if word.endswith('y'):
        candidates.append(word[:-1])
        candidates.append(word[:-1] + 'e')
    for s in suffixes:
        if word.endswith(s):
            candidates.append(word[:-len(s)])
            if s == 'er':
                candidates.append(word[:-len(s)] + 'e')
            if s == 'ly':
                candidates.append(word[:-len(s)+1] + 'e')
            if s == 'ing':
                candidates.append(word[:-len(s)] + 'e')
                if len(word[:-len(s)]) > 2 and word[:-len(s)][-1] == word[:-len(s)][-2]:
                    candidates.append(word[:-(len(s)+1)])
            if s == 'ed':
                candidates.append(word[:-len(s)] + 'e')
                if len(word[:-len(s)]) > 2 and word[:-len(s)][-1] == word[:-len(s)][-2]:
                    candidates.append(word[:-(len(s)+1)])
            if s == 'ness' and word[:-len(s)].endswith('i'):
                candidates.append(word[:-(len(s)+1)] + 'y')
            if s == 'est' and word[:-len(s)].endswith('i'):
                candidates.append(word[:-(len(s)+1)] + 'y')
    for p in prefixes:
        if word.startswith(p):
            candidates.append(word[len(p):])
    return candidates

def get_top_words(word_counts):
    # Create a Counter() just for non-stop words preserving capitalizing
    book_freq = {}
    nondict_words = {}
    for k, v in word_counts.most_common():
        word = k
        if word in propernoun_count:
            continue
        
        cands = set([cand for c in gen_candidates(word) for cand in gen_candidates(c)])
        candidfs = []
        for c in cands:
            widf = idf(word_freq_count, total_count, c)
            if widf < 0:
                continue
            candidfs.append((c, widf))
        best = (word, idf(word_freq_count, total_count, word))
        if len(candidfs) > 0:
            sortcands = sorted(candidfs, key=lambda x:x[1])
            best = sortcands[0]
            dict_cand = None
            for x in sortcands:
                if x[0] in en_dict:
                    dict_cand = x[0]
                    break
                
        best_cand = best[0]
        if word.lower() == 'reminiscing':
            print(best_cand, dict_cand, candidfs)
        
        word_idf = best[1]
        if word_idf == -2:
            continue
        if word_idf == -1:
            if not word.lower() in nondict_words:
                nondict_words[word.lower()] = v
        
        if not word.lower() in book_freq:
            book_freq[word.lower()] = (k, best_cand, dict_cand, v, word_freq_count[word.lower()], word_idf)
    book_freq = OrderedDict(sorted(book_freq.items(), key=lambda x: -x[1][-1])) #Sort by last ele of value, or IDF
    return book_freq, nondict_words

book_word_idfs, book_nondict_words = get_top_words(word_counts)
len(book_word_idfs), len(book_nondict_words)

reminisce None [('reminiscing', 15.421771761705486), ('reminisce', 15.189881048882121)]


(15138, 89)

In [445]:
list(book_word_idfs.values())[:10]

[('excruciation', 'excruciation', 'excruciation', 2, 0, 21.38019645473527),
 ('claustral', 'claustral', 'claustral', 1, 0, 21.38019645473527),
 ('synonyme', 'synonyme', 'synonyme', 1, 0, 21.38019645473527),
 ('enswathed', 'enswathe', 'enswathe', 1, 0, 21.38019645473527),
 ('inevasible', 'inevasible', 'inevasible', 1, 0, 21.38019645473527),
 ('concrescence', 'concrescence', 'concrescence', 1, 0, 21.38019645473527),
 ('chrismal', 'chrismal', 'chrismal', 1, 0, 21.38019645473527),
 ('plangency', 'plangency', 'plangency', 1, 0, 21.38019645473527),
 ('splendent', 'splendent', 'splendent', 1, 0, 21.38019645473527),
 ('revulsive', 'revulse', 'revulse', 1, 0, 21.38019645473527)]

In [446]:
VOCAB_THRESH = 16
book_word_vocab = [w for w in book_word_idfs.values() if w[-1] >= VOCAB_THRESH] 
len(book_word_vocab), len(book_word_idfs)

(1342, 15138)

In [447]:
# 70% of vocab words are in the english dictionary
len([w for w in book_word_vocab if w[0] in en_dict or w[1] in en_dict]) / len(book_word_vocab)

0.6445603576751118

In [448]:
print(f'Vocab Dictionary Words: {len(book_word_vocab)}\nNon Dictionary Words: {len(book_nondict_words)}\n'+
      f'Percentage Vocab Words: {(len(book_word_vocab) + len(book_nondict_words))*100/stats.uniq_words:0.2f}%')

Vocab Dictionary Words: 1342
Non Dictionary Words: 89
Percentage Vocab Words: 7.35%


# Get a pointer from all rare words to where they appear in book

Generate `vocab_postings` which is a dict from our vocab words to a list of numbers which are the indexes of the sentences in which these words occur.

In [450]:
def get_raw_sentences(lines):
    sens = []
    for l in lines:
        for s in nltk.tokenize.sent_tokenize(l):
            sens.append(s)
    return sens
raw_sens = get_raw_sentences(lines)
len(raw_sens)

28675

In [451]:
def gen_posting_list_for(raw_sentences, words):
    posting_list_sens = defaultdict(list)
    for i, s in enumerate(raw_sentences):
        for w in nltk.tokenize.word_tokenize(s):
            if not w in blacklist:
                # Someitmes strings like ‘Your aim is off.’ don't tokenize correct.
                if w.endswith('.'):
                    w = w[:w.index('.')]
                if w.endswith('…'):
                    w = w[:w.index('…')]
            if w.lower() in words:
                posting_list_sens[w.lower()].append(i)
    return posting_list_sens

In [452]:
book_vocab_raw = set([w[0] for w in book_word_vocab])
for x in book_nondict_words:
    book_vocab_raw.add(x)
vocab_postings = gen_posting_list_for(raw_sens, book_vocab_raw)
len(vocab_postings), len(book_word_vocab)

(1431, 1342)

In [437]:
vocab_postings['guileless']

[]

# Create the final vocabulary assistance tool

For all our words assemble (amongst others:
 - word
 - dictionary meaning
 - sentences in the book where they appear. 

In [453]:
SECONDARY_LOOKUP_THRESH = 16
all_vocab = []
for w in book_word_vocab:
    word = w[0].lower()
    if word in en_dict:
        meaning = en_dict[word]
        root_word = None
    elif w[2] in en_dict:
        meaning = en_dict[w[2]]
        root_word = w[2]
    else:
        meaning = ''
        root_word = None
    if root_word == word:
        root_word = ''
    extra_words = set()
    for mw in nltk.tokenize.word_tokenize(meaning):
        mword = mw.lower()
        if idf(word_freq_count, total_count, mword) >= SECONDARY_LOOKUP_THRESH and mword in en_dict and mword != w[2] and mword != word:
            extra_words.add(mword)
    for ew in extra_words:
        meaning = f'{meaning}\n\n"{ew}": {en_dict[ew]}'
    if not len(vocab_postings[word]):
        print(word)
        continue
    vocabrow = {
        'word': word,
        'root word': root_word,
        'count in book': len(vocab_postings[word]),
        'meaning': meaning,
        'first_ref': vocab_postings[word][0],
        'refs': [(x, raw_sens[x]) for x in vocab_postings[word]][:5],
        'rarity word': root_word,
        'rarity': w[5],
        'rarity count': w[4]
    }
    all_vocab.append(vocabrow)
len(all_vocab)

1342

In [454]:
# sorted_vocab = sorted(all_vocab, key=lambda x:-x['count in book'])
sorted_vocab = sorted(all_vocab, key=lambda x:x['rarity count'])

In [487]:
def get_output_path(ext, suffix = ''):
    path = os.path.dirname(BOOK)
    fname_noext = os.path.splitext(os.path.basename(BOOK))[0]
    new_fname = f'{fname_noext}_vocab'
    if suffix:
        new_fname = f'{new_fname}_{suffix}'
    return os.path.join(path, f'{new_fname}.{ext}')
OUTPUT_FILE = get_output_path('html')
OUTPUT_FILE

'shantaram/shantaram_vocab.html'

In [488]:
def write_csv_with_suffix(suffix):
    OUTPUT_FILE = get_output_path('csv', suffix)
    with open(OUTPUT_FILE, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(sorted_vocab[0].keys())
        for v in sorted_vocab:
            writer.writerow(v.values())

In [489]:
sorted_vocab = sorted(all_vocab, key=lambda x:x['rarity count'])
write_csv_with_suffix('sortby_most_rare_first')
sorted_vocab = sorted(all_vocab, key=lambda x:x['first_ref'])
write_csv_with_suffix('sortby_order_of_appearance')
sorted_vocab = sorted(all_vocab, key=lambda x:-x['count in book'])
write_csv_with_suffix('sortby_most_common_first')

In [477]:
from jinja2 import Environment, FileSystemLoader, select_autoescape
env = Environment(
    loader=FileSystemLoader("templates"),
    autoescape=select_autoescape()
)
TEMPLATE = "vocab_template.html"
template = env.get_template(TEMPLATE)

In [490]:
def write_html_with_jinja(suffix):
    output = template.render(words=sorted_vocab)
    OUTPUT_FILE = get_output_path('html', suffix)
    with open(OUTPUT_FILE, 'w') as f:
        f.write(output)

In [491]:
sorted_vocab = sorted(all_vocab, key=lambda x:x['rarity count'])
write_html_with_jinja('sortby_most_rare_first')
sorted_vocab = sorted(all_vocab, key=lambda x:x['first_ref'])
write_html_with_jinja('sortby_order_of_appearance')
sorted_vocab = sorted(all_vocab, key=lambda x:-x['count in book'])
write_html_with_jinja('sortby_most_common_first')

# Scratch

In [468]:
book_word_idfs['frown']

('frown', 'frown', 'frown', 54, 1361, 14.16348696802581)

In [None]:
# wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
# wordnet_lemmatizer.lemmatize('unscathed', 'v'), [cand for c in gen_candidates('unscathed') for cand in gen_candidates(c)]

In [467]:
# en_dict['envelop']
word_freq_count['frown'], word_freq_count['envelop'] #, en_dict['hotelier']

(1361, 136)

In [None]:
en_dict['ooze'], word_freq_count['scathe'], idf(word_freq_count, total_count, 'fascinate')