In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from glob import glob
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
import pickle
import json

import spacy
import pyinflect
from pyinflect import getAllInflections, getInflection
import mlconjug

# load & save data

In [2]:
sents_df = pd.read_pickle('data/lexical_repl/sents_df.zip')

In [3]:
with open('data/lexical_repl/words_dict.pkl', 'rb') as f:
    dc = pickle.load(f)

In [None]:
with open('data/lexical_repl/words_dict.pkl', 'wb') as f:
    pickle.dump(dc, f)

In [None]:
data_df = pd.read_pickle('data/lexical_repl/data_df.zip')

In [32]:
data_df.to_pickle('data/lexical_repl/data_df.zip')

# Create .json of words that can be suggested with Acrolinx's current setup

In [4]:
with open('data/lexical_repl/acrolinx.json', 'r') as f:
    acro = json.load(f)

In [6]:
def getInflectReplacements(orig, repls):
    dc = defaultdict(set)
    orig_i = getAllInflections(orig)
    repl_i = [getAllInflections(repl) for repl in repls]
    for pos in orig_i:
        for repl_dict in repl_i:
            if pos in repl_dict:
                dc[orig_i[pos][0]].add(repl_dict[pos][0])
    if not dc:
        return dict({orig: repls})
    return dict(dc)

In [497]:
d = getInflectReplacements('optimum', ['perfect'])
d

{'optima': {'perfects'}, 'optimum': {'perfect'}}

In [498]:
d = {'optimum': {'perfect'}}

In [499]:
acro.update(d)

In [500]:
for item in acro:
    acro[item] = list(acro[item])
print(len(acro))
acro

556


{'abaft': ['behind'],
 'abominate': ['hate'],
 'abominated': ['hated'],
 'abominates': ['hates'],
 'abominating': ['hating'],
 'accelerate': ['speed up'],
 'accelerated': ['sped up', 'quickened'],
 'accelerates': ['speeds up'],
 'accelerating': ['quickening', 'speeding up'],
 'accompanied': ['come with', 'came with'],
 'accompanies': ['comes with'],
 'accompany': ['come with'],
 'accompanying': ['coming with'],
 'accordingly': ['so', 'as such'],
 'accrue': ['follow', 'come', 'grow'],
 'accrued': ['followed', 'grew', 'came', 'grown', 'come'],
 'accrues': ['comes', 'grows', 'follows'],
 'accruing': ['following', 'coming', 'growing'],
 'accurate': ['correct', 'right'],
 'acknowledge': ['note', 'recognize'],
 'acknowledged': ['recognized', 'noted'],
 'acknowledges': ['notes', 'recognizes'],
 'acknowledging': ['noting', 'recognizing'],
 'acquiesce': ['agree'],
 'acquiesced': ['agreed'],
 'acquiesces': ['agrees'],
 'acquiescing': ['agreeing'],
 'activate': ['trigger', 'start'],
 'activated':

In [501]:
with open('data/lexical_repl/acrolinx.json', 'w') as f:
    json.dump(acro, f)

# Expand Dictionary, Restrict Dataset

In [None]:
# if the word is already in the sentence, then don't replace it? include it but with 0s

First, I make a list of each word on the formal side of the dictionary, and all of its inflections. To make data processing easier, I reduce the very large dataset to only those sentences which have any of these words in it. Then I use the given list to expand the dictionary with inflected forms of the translations as well.

# text

In [157]:
dc['thy'] = ['your']
dc['farewell'] = ['goodbye', 'bye']
dc['shall'] = ['will']
dc['becometh'] = ['becomes']
dc['believeth'] = ['believes']
dc['hath'] = ['has']
dc['upon'] = ['on']
dc['thee'] = ['you']
dc['thou'] = ['you']
dc['ye'] = ['the']
dc['wilt'] = ['will']
dc['thou art'] = ['you are']
dc['']

In [6]:
def getInflectList(term):
    terms = []
    inflections = getAllInflections(term)
    for pos in inflections:
        for item in inflections[pos]:
            terms.append(item)
    return list(set(terms))

In [10]:
getInflectList('acknowledge')

['acknowledged', 'acknowledging', 'acknowledge', 'acknowledges']

In [134]:
def print_examples(df, word, repl):
    for idx, item in tqdm(df.sent.iteritems(), total = len(df)):
        lowered = [word.lower() for word in item]
        if word in ' '.join(lowered):
            print(' '.join(item))
            print(' '.join(map(lambda x: x if x != word else repl, lowered)))
            print()

In [135]:
print_examples(sents_df, 'thou art', '[you are]')

HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))

Thereupon Hwang Pah said : `` If I knew thou art an Arhat , I would have doubled you up before thou got over there '' ! !
thereupon hwang pah said : `` if i knew thou art an arhat , i would have doubled you up before thou got over there '' ! !

`` Thou art God '' .
`` thou art god '' .

As they merged , grokking together , Mike said softly and triumphantly : `` Thou art God '' .
as they merged , grokking together , mike said softly and triumphantly : `` thou art god '' .

Thou art God '' ! !
thou art god '' ! !

Thou art a cobbler art thou ?
thou art a cobbler art thou ?

You thou art a cobbler art thou ?
you thou art a cobbler art thou ?

Thou art a cobbler art thou ?
thou art a cobbler art thou ?

Mm thou art a cobbler art thou ?
mm thou art a cobbler art thou ?

interesting you see ? Thou art a cobbler art thou ?
interesting you see ? thou art a cobbler art thou ?

Thou art a cobbler art thou ?
thou art a cobbler art thou ?

Thou art a cobbler art thou ?
thou art a cobbler art thou 

In [167]:
for idx, item in tqdm(sents_df.sent.iteritems(), total = len(sents_df)):
    if 'thy' in item:
        print(' '.join(item))
        print()
        print()
    #if ' he who ' in ' '.join(item):
    #    print(' '.join(item))
    #for term in getInflectList('becometh'):
    #    if term in item:
    #        print(idx)

HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))

['``', 'O', 'Blessed', 'Virgin', 'Mary', ',', 'Mother', 'of', 'God', 'and', 'our', 'most', 'gentle', 'queen', 'and', 'mother', ',', 'look', 'down', 'in', 'mercy', 'upon', 'England', ',', 'thy', '``', 'dowry', "''", ',', 'and', 'upon', 'us', 'all', 'who', 'greatly', 'hope', 'and', 'trust', 'in', 'thee', '.']
['Intercede', 'for', 'our', 'separated', 'brethren', ',', 'that', 'with', 'us', 'in', 'the', 'one', 'true', 'fold', 'they', 'may', 'be', 'united', 'to', 'the', 'chief', 'Shepherd', ',', 'the', 'vicar', 'of', 'thy', 'Son', '.']
['America', ',', 'America', ',', 'God', 'mend', 'thy', 'every', 'flaw', ',', 'confirm', 'thy', 'soul', 'in', 'self', 'control', ',', 'the', 'liberty', 'in', 'law', '.']
['America', ',', 'America', ',', 'God', 'shed', 'His', 'grace', 'on', 'thee', ',', 'and', 'crown', 'thy', 'good', 'with', 'brotherhood', 'from', 'sea', 'to', 'shining', 'sea', "''", '.']
['My', 'own', 'stern', 'hand', 'has', 'rent', 'the', 'ancient', 'bond', ',', 'And', 'thereof', 'shall', 'the

['It', 'said', ',', '‘', 'Love', 'thy', 'neighbour', 'as', 'thyself', '.']
['And', 'though', 'thy', 'walls', 'be', 'of', 'the', 'countrey', 'stone', ',']
['And', 'tasteless', 'was', 'the', 'wretch', 'who', 'thy', 'existence', 'plann', "'d", '.']
['Not', 'trees', 'alone', 'have', 'owned', 'their', 'force', ',', 'Whole', 'woods', 'beneath', 'them', 'bowed', ',', 'They', 'turned', 'the', 'winding', 'rivulet', "'s", 'course', ',', 'And', 'all', 'thy', 'pastures', 'plough', "'d", '.']
['The', 'Genius', 'of', 'thy', 'shades', ',', 'by', 'Plutus', 'brib', "'d", ',']
['Amid', 'thy', 'grassy', 'lanes', ',', 'thy', 'wildwood', 'glens', ',']
['Thy', 'knolls', 'and', 'bubbling', 'wells', ',', 'thy', 'rocks', ',', 'and', 'streams', ',']
['He', 'was', 'ostracized', 'after', 'his', 'return', 'by', 'an', '‘', 'electorate', '’', 'which', 'certainly', 'included', 'the', '4000', 'hoplites', 'he', 'had', 'brought', 'back', 'from', 'Ithome', ',', 'who', 'went', 'on', 'to', 'fight', 'the', 'Spartans', 'hard

['Such', 'a', 'one', 'is', 'bidden', 'to', '``', 'preye', '``', '``', 'seke', '``', '``', 'aske', '``', 'and', '``', 'knokke', 'at', 'the', 'dore', '``', 'in', 'the', 'certainty', 'that', '``', 'he', 'schal', 'come', '…', 'and', 'coumforte', 'thy', 'desolate', 'soule', '``', 'as', 'the', 'Holy', 'Ghost', 'came', 'to', 'the', 'apostles', ':', 'The', 'book', 'ends', 'with', 'a', 'meditation', 'on', 'the', 'sacrament', 'as', 'the', 'focus', 'for', 'the', 'experience', 'of', 'the', 'transforming', 'energy', 'of', 'God', 'released', 'in', 'time', 'at', 'the', 'Incarnation', '.']
['It', 'was', 'a', 'form', 'of', 'devotion', 'found', 'not', 'only', 'among', 'the', 'learned', 'but', 'in', 'prayers', 'designed', 'for', 'the', 'laity', ',', 'as', 'for', 'instance', ',', 'in', 'another', 'prayer', 'recommended', 'for', 'lay', 'folk', 'at', 'the', 'elevation', 'of', 'the', 'Host', ':', 'or', 'in', 'the', 'prayer', ',', 'a', 'fragment', 'of', 'which', 'was', 'carved', 'on', 'a', 'church', 'bench', 

['I', 'know', 'thee', 'not', ',', 'old', 'man', ':', 'fall', 'to', 'thy', 'prayers', ';', 'How', 'ill', 'white', 'hairs', 'become', 'a', 'fool', 'and', 'jester', '!']
['No', 'more', 'be', 'griev', "'d", 'at', 'that', 'which', 'thou', 'hast', 'done', ';', '...', '.', 'For', 'to', 'thy', 'sensual', 'fault', 'I', 'bring', 'in', 'sense', '--', 'Thy', 'adverse', 'party', 'is', 'thy', 'advocate', '.']
['If', 'thou', 'dost', ',', 'remember', 'that', 'thy', 'gun', 'had', 'nothing', 'to', 'do', 'with', 'it', '.', "''"]
['Farewell', 'to', 'you', ',', 'my', 'hopes', ',', 'my', 'wonted', 'waking', 'dreams', ';', 'Farewell', ',', 'sometimes', 'enjoyed', 'joy', ',', 'eclipsed', 'are', 'thy', 'beams', ';', 'Farewell', ',', 'self-pleasing', 'thoughts', 'which', 'quietness', 'brings', 'forth', ';', 'Farewell', ',', 'friendship', "'s", 'sacred', 'league', ',', 'uniting', 'minds', 'of', 'worth', '.']
['St.', 'Augustine', '(', '354-430', ')', 'was', 'among', 'the', 'first', 'to', 'study', 'the', 'ethics',

In [None]:
def all_contexts(word, sent):
    return True

def 

In [118]:
sents_df.head()

Unnamed: 0,sent,source,description
2,"[The, September-October, term, jury, had, been...",brown,
6,"[The, grand, jury, commented, on, a, number, o...",brown,
8,"[However, ,, the, jury, said, it, believes, ``...",brown,
9,"[The, City, Purchasing, Department, ,, the, ju...",brown,
12,"[It, urged, that, the, next, Legislature, ``, ...",brown,


In [127]:
data_df = pd.DataFrame(columns = ['idx', 'sent'])

In [128]:
data_df.head()

Unnamed: 0,idx,sent


In [120]:
sents_df.iloc[[0, 1]]

Unnamed: 0,sent,source,description
2,"[The, September-October, term, jury, had, been...",brown,
6,"[The, grand, jury, commented, on, a, number, o...",brown,


In [None]:
data_df.append(sents_df[indi])

In [89]:
nlp = spacy.load('en')

In [90]:
tokens = nlp('This is an example of xxtest.')
tokens[3]._.inflect('NNS')

'examples'

In [94]:
getInflection('watch', tag='VBD')

('watched',)

In [21]:
dc

{'abaft': ['behind'],
 'abeyance': ['suspension'],
 'abominate': ['hate'],
 'abundance': ['lot'],
 'accelerated': ['sped up'],
 'accessibility': ['ease of use'],
 'accompanied by': ['with'],
 'accompanies': ['comes with'],
 'accompany': ['come with'],
 'accompanying': ['coming with'],
 'accordingly': ['so'],
 'accrue': ['gather', 'gain'],
 'accrued': ['gathered', 'gained'],
 'accrues': ['gathers', 'gains'],
 'accruing': ['gathering', 'gaining'],
 'accurate': ['correct', 'right'],
 'acknowledge': ['noted'],
 'acknowledged': ['noted'],
 'acknowledges': ['notes'],
 'acknowledging': ['noting'],
 'acknowledgment': ['notice'],
 'acquiesce': ['accept', 'allow'],
 'acquisition': ['acquiring'],
 'activated': ['started'],
 'activation': ['start'],
 'additional': ['more'],
 'adjustment': ['change'],
 'adjustments': ['changes'],
 'administration': ['people in charge'],
 'administrative': ['managing'],
 'administrator': ['manager', 'person in charge'],
 'admissible': ['allowed'],
 'aforementioned':

In [3]:
print(len(glob('data/lexical_repl/word_contexts/*')))
print(len(glob('data/lexical_repl/word_contexts_done/*')))

531
70


In [None]:
def ie_preprocess(sent):
    sent = nltk.pos_tag(sent)

In [None]:
# https://www.nltk.org/book/ch07.html

In [None]:
all_context_words = ['abaft', 'as such']
# add 'apparently' to 'purportedly' (right now only has 'supposedly')

def clean_dict(dc):
    new_dc = {}
    for word in dc:
        if word not in all_context_words:
            new_dc[word] = dc[word]
    return new_dc

In [None]:
# acrolinx examples

In [None]:
'is/are functional'
'prï¿½cis'
'expirations'
'in situ'
#anything with a space

In [35]:
conj = mlconjug.Conjugator(language='en')

In [53]:
test_verb = conj.conjugate("make")
all_conjugated_forms = test_verb.iterate()
print(all_conjugated_forms)

[('imperative', 'imperative present', '2s', 'make'), ('imperative', 'imperative present', '1p', 'make'), ('imperative', 'imperative present', '2p', 'make'), ('infinitive', 'infinitive present', 'ke', 'make'), ('indicative', 'indicative present perfect', '1s', 'made'), ('indicative', 'indicative present perfect', '2s', 'made'), ('indicative', 'indicative present perfect', '3s', 'made'), ('indicative', 'indicative present perfect', '1p', 'made'), ('indicative', 'indicative present perfect', '2p', 'made'), ('indicative', 'indicative present perfect', '3p', 'made'), ('indicative', 'indicative present', '1s', 'make'), ('indicative', 'indicative present', '2s', 'make'), ('indicative', 'indicative present', '3s', 'makes'), ('indicative', 'indicative present', '1p', 'make'), ('indicative', 'indicative present', '2p', 'make'), ('indicative', 'indicative present', '3p', 'make'), ('indicative', 'indicative past tense', '1s', 'made'), ('indicative', 'indicative past tense', '2s', 'made'), ('indica

In [13]:
conj.conjugate("leave").conjug_info

{'imperative': {'imperative present': OrderedDict([('2s', 'leave'),
               ('1p', 'leave'),
               ('2p', 'leave')])},
 'indicative': {'indicative past tense': OrderedDict([('1s', 'left'),
               ('2s', 'left'),
               ('3s', 'left'),
               ('1p', 'left'),
               ('2p', 'left'),
               ('3p', 'left')]),
  'indicative present': OrderedDict([('1s', 'leave'),
               ('2s', 'leave'),
               ('3s', 'leaves'),
               ('1p', 'leave'),
               ('2p', 'leave'),
               ('3p', 'leave')]),
  'indicative present continuous': OrderedDict([('1s 1s', 'leaving'),
               ('2s 2s', 'leaving'),
               ('3s 3s', 'leaving'),
               ('1p 1p', 'leaving'),
               ('2p 2p', 'leaving'),
               ('3p 3p', 'leaving')]),
  'indicative present perfect': OrderedDict([('1s', 'left'),
               ('2s', 'left'),
               ('3s', 'left'),
               ('1p', 'left'),
          