In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from glob import glob
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
import pickle
import json
from random import sample

import spacy
import pyinflect
from pyinflect import getAllInflections, getInflection
from spacy.tokens import Doc
import time
import nltk
from sklearn.model_selection import train_test_split

In [2]:
nlp = spacy.load('en')

Once all the sentences have been collected, they must be sorted and organized. Phrases containing formal words need to be extracted and marked, as well as translated in a semi-automatic process.

# load & save data

In [331]:
sents_df = pd.read_pickle('data/lexical_repl/sents_df.zip')
sents_df = sents_df.reset_index()

In [5]:
with open('data/lexical_repl/words_dict.pkl', 'rb') as f:
    dc = pickle.load(f)

In [None]:
with open('data/lexical_repl/words_dict.pkl', 'wb') as f:
    pickle.dump(dc, f)

In [304]:
data_df = pd.read_pickle('data/lexical_repl/data_df.zip')

In [32]:
data_df.to_pickle('data/lexical_repl/data_df.zip')

# compile lists of sentences per term: helper functions

In [6]:
def getInflectList(term):
    terms = []
    inflections = getAllInflections(term)
    for pos in inflections:
        for item in inflections[pos]:
            terms.append(item)
    return list(set(terms))

In [134]:
def print_examples(df, word, repl):
    for idx, item in tqdm(df.sent.iteritems(), total = len(df)):
        lowered = [word.lower() for word in item]
        if word in ' '.join(lowered):
            print(' '.join(item))
            print(' '.join(map(lambda x: x if x != word else repl, lowered)))
            print()

# Create .json of words that can be suggested with Acrolinx's current setup

Here, the already-existing lexicons of marked formal words are expanded with all possible inflections of each word.

In [3]:
with open('data/lexical_repl/acrolinx.json', 'r') as f:
    acro = json.load(f)

In [3]:
def getInflectReplacements(orig, repls):
    dc = defaultdict(set)
    orig_i = getAllInflections(orig)
    repl_i = [getAllInflections(repl) for repl in repls]
    for pos in orig_i:
        for repl_dict in repl_i:
            if pos in repl_dict:
                dc[orig_i[pos][0]].add(repl_dict[pos][0])
    if not dc:
        return dict({orig: repls})
    return dict(dc)

In [299]:
d = getInflectReplacements('whence', ['from what', 'from which'])
d

{'whence': ['from what', 'from which']}

In [216]:
d = {'thither': {'there'}}

In [300]:
acro.update(d)

In [301]:
for item in acro:
    acro[item] = list(acro[item])
print(len(acro))
acro

739


{'abaft': ['behind'],
 'abominate': ['hate'],
 'abominated': ['hated'],
 'abominates': ['hates'],
 'abominating': ['hating'],
 'accelerate': ['speed up'],
 'accelerated': ['sped up', 'quickened'],
 'accelerates': ['speeds up'],
 'accelerating': ['quickening', 'speeding up'],
 'accompanied': ['come with', 'came with'],
 'accompanies': ['comes with'],
 'accompany': ['come with'],
 'accompanying': ['coming with'],
 'accordingly': ['so', 'as such'],
 'accrue': ['follow', 'come', 'grow'],
 'accrued': ['followed', 'grew', 'came', 'grown', 'come'],
 'accrues': ['comes', 'grows', 'follows'],
 'accruing': ['following', 'coming', 'growing'],
 'accurate': ['correct', 'right'],
 'acknowledge': ['note', 'recognize'],
 'acknowledged': ['recognized', 'noted'],
 'acknowledges': ['notes', 'recognizes'],
 'acknowledging': ['noting', 'recognizing'],
 'acquiesce': ['agree'],
 'acquiesced': ['agreed'],
 'acquiesces': ['agrees'],
 'acquiescing': ['agreeing'],
 'activate': ['trigger', 'start'],
 'activated':

In [302]:
with open('data/lexical_repl/acrolinx.json', 'w') as f:
    json.dump(acro, f)

# Find phrases containing words in dataset

First, the words are collected - and only the words, not the phrases (main words are taken from phrases) - to make a master list of known formal words. Then the sentences are processed and syntactic phrases with those words are marked.

In [6]:
all_formal_words = ['hark', 'harking', 'harks', 'conjures', 'aggregate', 'functional', 'conjuring',
           'abundance', 'conjure', 'attest', 'abeyance', 'harked', 'conjured', 'attests', 
           'attesting', 'attested', 'accompanied', 'accompany', 'accompanying', 'accompanies',
                   'access', 'eke', 'ekes', 'eked', 'eking', 'apropos', 'recipients', 'recipient',
                   'derive', 'derived', 'deriving', 'derives', 'allocate', 'allocating', 'allocates',
                   'allocated', 'individual', 'individuals']

for item in acro.keys():
    if ' ' not in item:
        all_formal_words.append(item)
        
for item in dc.keys():
    for inflect in [x[0] for x in getAllInflections(item).values()]:
        all_formal_words.append(item)
        
all_formal_words = list(set(all_formal_words))

In [7]:
len(all_formal_words)

827

In [1054]:
with open('data/lexical_repl/list-formal-words-only.pkl', 'wb') as f:
    pickle.dump(all_formal_words, f)

In [991]:
def find_all_phrases(sent):
    
    doc = Doc(nlp.vocab, words=sent)
    nlp.tagger(doc)
    nlp.parser(doc)
    
    indices = []
    for word in all_formal_words:
        phr = find_phrase(doc, word)
        if phr != None:
            for idx in phr:
                indices.append(idx)
    indices = list(set(indices))
                
    arr = [0] * len(sent)
    for idx in indices:
        arr[idx] = 1

    return arr

def check_phrase(tokens, word):
    if len(tokens) > 10:
        words = [tok.text for tok in tokens]
        idx = words.index(word)
        start = idx - 5 if idx > 4 else 0
        end = idx + 5 if len(words) > idx + 4 else len(words) - 1
        tokens = tokens[start:end]
            
    try:
        assert word in [tok.text for tok in tokens], 'word got lost?'
    except AssertionError:
        return None
    return [tok.i for tok in tokens]

def find_phrase(doc, word):
    # sent = list of str (words)
    # word = str
    
    if word not in doc.text:
        return None
    
    # if it's in a noun phrase, then return the noun phrase
    for np in doc.noun_chunks: # use np instead of np.text
        if word in [tok.text for tok in np]:
            return check_phrase([tok for tok in np], word)

    for w in doc:
        if str(w.text) == word:
            if w.dep_ == 'ROOT': # main verb clause: returns with object
                phrase_idx = [w.i]
                for right in w.rights:
                    if right.dep_ in ('punct', 'advcl'):
                        break
                    phrase_idx = phrase_idx + [tok.i for tok in right.subtree]
                if len(phrase_idx) > 1:
                    return check_phrase([tok for tok in doc[min(phrase_idx) : max(phrase_idx) + 1]], word)
                # no object was found? check if next left text is subj, return that
                if len(list(w.lefts)) > 0:
                    left = list(w.lefts)[-1]
                    if left.dep_ == 'nsubj':
                        phrase_idx = [tok.i for tok in left.subtree] + phrase_idx
                return check_phrase([tok for tok in doc[min(phrase_idx) : max(phrase_idx) + 1]], word)
            return check_phrase([tok for tok in w.subtree], word)
    return None

In [1052]:
test_int = 3818245
ex_arr = arrays[test_int]
for idx in range(len(sents_df.sent[test_int])):
    print(str(ex_arr[idx]) + '\t' + sents_df.sent[test_int][idx])

0	Submitted
0	by
0	]


In [1027]:
assert len(arrays) == len(present)

for idx, sent in tqdm(sents_df.sent.iteritems(), total = len(sents_df)):
    if idx < len(arrays):
        continue
    arr = find_all_phrases(sent)
    arrays.append(arr)
    
    if sum(arr) > 0:
        present.append(True)
    else:
        present.append(False)
        
    if idx % 100000 == 0 and idx > 0:
        with open('data/lexical_repl/arrays' + str(idx) + '.pkl', 'wb') as f:
            pickle.dump(arrays, f)
        with open('data/lexical_repl/present' + str(idx) + '.pkl', 'wb') as f:
            pickle.dump(present, f)

HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))

In [1051]:
assert len(sents_df) == len(arrays) == len(present)

In [1053]:
with open('data/lexical_repl/sents-df-masks.pkl', 'wb') as f:
    pickle.dump(arrays, f)
with open('data/lexical_repl/sents-df-masks-presence-bool.pkl', 'wb') as f:
    pickle.dump(present, f)

# create mini-set with restricted number of occurrences per word

In [1097]:
sents_df.head()

Unnamed: 0,sent,source,description,masks,words,phrases
2,"[The, September-October, term, jury, had, been...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",[investigate],"[[to, investigate, reports, of, possible, ``]]"
6,"[The, grand, jury, commented, on, a, number, o...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[operated, purchasing]","[[purchasing], [well, operated, and, follow]]"
8,"[However, ,, the, jury, said, it, believes, ``...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[administration],[[administration]]
12,"[It, urged, that, the, next, Legislature, ``, ...",brown,,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",[provide],"[[that, the, next, Legislature, ``, provide, e..."
18,"[The, jury, also, commented, on, the, Fulton, ...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[administrators],[[administrators]]


In [1110]:
any([True, True, False])

True

In [1115]:
counts = defaultdict(lambda: 0)
restricted = []

def check_word(word):
    if counts[word] >= 10:
        counts[word] += 1
        return False
    else:
        counts[word] += 1
        return True

# shuffle to get balance of data
for idx, row in tqdm(sents_df.sample(frac=1).iterrows(), total = len(sents_df)):
    checks = []
    for word in row.words:
        checks.append(check_word(word))
    if any(checks):
        restricted.append(True)
    else:
        restricted.append(False)

HBox(children=(IntProgress(value=0, max=525114), HTML(value='')))

In [1118]:
word_counts = defaultdict(lambda: 0)
for idx, l in sents_df[restricted].words.iteritems():
    for item in l:
        word_counts[item] += 1
word_counts

defaultdict(<function __main__.<lambda>()>,
            {'abeyance': 1,
             'abundance': 9,
             'accelerate': 2,
             'accelerated': 9,
             'accelerating': 6,
             'access': 65,
             'accessibility': 2,
             'accompanied': 30,
             'accompany': 5,
             'accompanying': 13,
             'accordingly': 9,
             'accrue': 1,
             'accrued': 3,
             'accrues': 1,
             'accruing': 3,
             'accurate': 32,
             'acknowledge': 15,
             'acknowledged': 20,
             'acknowledges': 5,
             'acknowledging': 2,
             'acknowledgment': 1,
             'acquiesce': 2,
             'acquiesced': 1,
             'acquisition': 13,
             'activated': 1,
             'activating': 1,
             'activation': 3,
             'adjustment': 20,
             'adjustments': 11,
             'administration': 85,
             'administrative': 37,
       

In [1116]:
print(len(sents_df[restricted]))
sents_df[restricted].head()

7146


Unnamed: 0,sent,source,description,masks,words,phrases
2,"[The, September-October, term, jury, had, been...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",[investigate],"[[to, investigate, reports, of, possible, ``]]"
6,"[The, grand, jury, commented, on, a, number, o...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[operated, purchasing]","[[purchasing], [well, operated, and, follow]]"
8,"[However, ,, the, jury, said, it, believes, ``...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[administration],[[administration]]
12,"[It, urged, that, the, next, Legislature, ``, ...",brown,,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",[provide],"[[that, the, next, Legislature, ``, provide, e..."
18,"[The, jury, also, commented, on, the, Fulton, ...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[administrators],[[administrators]]


In [1119]:
sents_df[restricted].to_pickle('data/lexical_repl/sents-df-restricted.pkl')

# Extract phrases

Now marked, the phrases are extracted from the sentences.

In [1056]:
sents_df = pd.read_pickle('data/lexical_repl/sents-df-with-frags.zip')

In [1059]:
sents_df.head()

Unnamed: 0,sent,source,description,masks
2,"[The, September-October, term, jury, had, been...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
6,"[The, grand, jury, commented, on, a, number, o...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,"[However, ,, the, jury, said, it, believes, ``...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12,"[It, urged, that, the, next, Legislature, ``, ...",brown,,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
18,"[The, jury, also, commented, on, the, Fulton, ...",brown,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [1073]:
relevant_words = []
phrases = []

replace = defaultdict(list)

for idx, row in tqdm(sents_df.iterrows(), total = len(sents_df)):
    
    current_phrases = []
    on_phrase = False
    phrase = []
    for i in range(len(row.masks)):
        if row.masks[i]:
            phrase.append(row.sent[i])
            if not on_phrase:
                on_phrase = True
        if not row.masks[i]:
            if on_phrase:
                on_phrase = False
                current_phrases.append(phrase)
                phrase = []
                
    current_words = []
    for p in current_phrases:
        for w in all_formal_words:
            if w in p:
                current_words.append(w)
                replace[w].append(p)
                continue
    current_words = list(set(current_words))
    
    relevant_words.append(current_words)
    phrases.append(current_phrases)
    
sents_df['words'] = relevant_words
sents_df['phrases'] = phrases

HBox(children=(IntProgress(value=0, max=525114), HTML(value='')))

In [1076]:
sents_df.to_pickle('data/lexical_repl/sents-df-with-frags-detail.zip')

In [1087]:
for key in replace:
    x = set()
    for item in replace[key]:
        x.add(tuple(item))
    replace[key] = x

In [1088]:
# turn dictionary of words and phrases into df

phrase = []
word = []

for key in replace:
    for val in replace[key]:
        phrase.append(val)
        word.append(key)
        
repl_df = pd.DataFrame()
repl_df['word'] = word
repl_df['orig'] = phrase
repl_df['repl'] = [None]*len(word)

In [1089]:
repl_df.head()

Unnamed: 0,word,orig,repl
0,internal,"(the, bank, 's, still-sloppy, internal, controls)",
1,internal,"(external, and, internal)",
2,internal,"((, internal, leaf, crowns)",
3,internal,"(one, internal, pocket)",
4,internal,"(the, internal, variations)",


In [1091]:
repl_df.to_pickle('data/lexical_repl/repl_df.zip')

# semi-automatically provide translations

Here, in a similar process as before, I replace the formal words in the extracted phrases with their defined informal replacement. I at least make sure to evaluate, at least briefly, each word to make sure it can be replaced.

In [3]:
repl_df = pd.read_pickle('data/lexical_repl/repl_df.zip')
repl_df = repl_df.drop_duplicates(subset = 'orig', keep = False)
with open('data/lexical_repl/words_dict.pkl', 'rb') as f:
    dc = pickle.load(f)
with open('data/lexical_repl/acrolinx.json', 'r') as f:
    acro = json.load(f)
with open('data/lexical_repl/list-formal-words-only.pkl', 'rb') as f:
    form = pickle.load(f)

In [4]:
def replace(word, replace, phrase):
    phrase = list(phrase)
    new_phrase = [replace if x == word else x for x in phrase]
    new_phrase = ' '.join(new_phrase)
    new_phrase = word_tokenize(new_phrase)
    return new_phrase

In [5]:
#remaining = set()

for idx, row in tqdm(repl_df.iterrows(), total = len(repl_df)):
    if repl_df.at[idx, 'repl'] != None:
        continue
    #remaining.add(row.word)
    if row.word in acro:
        ind = row.orig.index(row.word)
        pos = nltk.pos_tag(list(row.orig))[ind][1]
        repl_pos = []
        for item in acro[row.word]:
            temp_pos = nltk.pos_tag(word_tokenize(item))[0]
            if temp_pos[1] == pos:
                repl_pos.append(temp_pos[0])
        if len(repl_pos) > 1:
            w = sample(repl_pos, 1)[0]
        elif len(repl_pos) == 1:
            w = repl_pos[0]
        else:
            w = sample(acro[row.word], 1)[0]
        repl_df.at[idx, 'repl'] = replace(row.word, w, row.orig)

HBox(children=(IntProgress(value=0, max=369487), HTML(value='')))




In [12]:
repl_df = repl_df.dropna()

In [15]:
repl_df.head()

Unnamed: 0,word,orig,repl
0,internal,"(the, bank, 's, still-sloppy, internal, controls)","[the, bank, 's, still-sloppy, inside, controls]"
2,internal,"((, internal, leaf, crowns)","[(, inside, leaf, crowns]"
3,internal,"(one, internal, pocket)","[one, inside, pocket]"
4,internal,"(the, internal, variations)","[the, inside, variations]"
5,internal,"(Yugoslavia, 's, internal, common, market)","[Yugoslavia, 's, inside, common, market]"


In [18]:
repl_dict = {}
for idx, row in tqdm(repl_df.iterrows(), total = len(repl_df)):
    repl_dict[' '.join(row.orig)] = ' '.join(row.repl)

HBox(children=(IntProgress(value=0, max=306715), HTML(value='')))




In [19]:
with open('data/lexical_repl/repl_dict.pkl', 'wb') as f:
    pickle.dump(repl_dict, f)

In [14]:
repl_df.to_pickle('data/lexical_repl/repl_df.zip')

In [167]:
# prepare for NMT

orig_data = [list(x) for x in list(repl_df.dropna().orig)]
repl_data = list(repl_df.dropna().repl)
assert len(orig_data) == len(repl_data)

orig_train, orig_test, repl_train, repl_test = train_test_split(orig_data, 
                                                                repl_data, 
                                                                test_size = .1,
                                                               random_state = 47)
orig_train, orig_val, repl_train, repl_val = train_test_split(orig_train,
                                                             repl_train,
                                                             test_size = .2,
                                                             random_state = 47)

In [172]:
with open('data/lexical_repl_models/src-train.txt', 'w') as f:
    for x in orig_train:
        f.write(' '.join(x) + '\n')
    
with open('data/lexical_repl_models/src-val.txt', 'w') as f:
    for x in orig_val:
        f.write(' '.join(x) + '\n')
    
with open('data/lexical_repl_models/src-test.txt', 'w') as f:
    for x in orig_test:
        f.write(' '.join(x) + '\n')
    
with open('data/lexical_repl_models/tgt-train.txt', 'w') as f:
    for x in repl_train:
        f.write(' '.join(x) + '\n')
    
with open('data/lexical_repl_models/tgt-val.txt', 'w') as f:
    for x in repl_val:
        f.write(' '.join(x) + '\n')
    
with open('data/lexical_repl_models/tgt-test.txt', 'w') as f:
    for x in repl_test:
        f.write(' '.join(x) + '\n')

## unused: more detailed replacement

In [71]:
to_do.remove('abeyance')

In [72]:
to_do

['abominate',
 'abominated',
 'abundance',
 'accelerate',
 'accelerated',
 'accelerates',
 'accelerating',
 'access',
 'accessibility',
 'accompanied',
 'accompanies',
 'accompany',
 'accompanying',
 'accordingly',
 'accrue',
 'accrued',
 'accrues',
 'accruing',
 'accurate',
 'acknowledge',
 'acknowledged',
 'acknowledges',
 'acknowledging',
 'acknowledgment',
 'acquiesce',
 'acquiesced',
 'acquiesces',
 'acquiescing',
 'acquisition',
 'activate',
 'activated',
 'activates',
 'activating',
 'activation',
 'activations',
 'adjustment',
 'adjustments',
 'administration',
 'administrative',
 'administrator',
 'administrators',
 'admissible',
 'aforementioned',
 'aforesaid',
 'agent',
 'aggregate',
 'aggregated',
 'aggregates',
 'aggregating',
 'aggregation',
 'aggregations',
 'alleged',
 'alleviate',
 'alleviated',
 'alleviates',
 'alleviating',
 'allocate',
 'allocated',
 'allocates',
 'allocating',
 'allocation',
 'alternate',
 'alternative',
 'alternatively',
 'alternatives',
 'amelior

In [73]:
word = 'accelerate'

In [74]:
print(word)
if word in dc:
    print(dc[word])
if word in acro:
    print('acro ' + str(acro[word]))

accelerate
acro ['speed up']


In [84]:
def replace(phrase):
    phrase = list(phrase)
    return ['speed' if x == word else x for x in phrase] + 'up'

In [80]:
do_these = [52452,52453,52457,52459,52460,52461,52467,52472,52473,
            ]

In [82]:
for idx, item in tqdm(repl_df.orig[repl_df.word == word].iteritems(), 
                     total = len(repl_df[repl_df.word == word])):
    if repl_df.at[idx, 'repl'] != None:
        continue
    print(idx)
    print(item)
    #print(nltk.pos_tag(item))
    print()
    #if idx in do_these:
    #    repl_df.at[idx, 'repl'] = replace(item)

HBox(children=(IntProgress(value=0, max=227), HTML(value='')))

52452
('to', 'accelerate', 'the', 'introduction', 'of', 'an', 'appraisal', 'scheme', 'for', 'teachers')

52453
('to', 'accelerate', 'them')

52454
('to', 'accelerate', 'the', 'harmonious', 'development', 'of')

52456
('accelerate', 'the', 'tests', 'if', 'the')

52457
(')', 'accelerate')

52458
('to', 'accelerate', 'in', 'order', 'to', 'meet')

52459
('to', 'accelerate', 'boxing', "'s", 'decline')

52460
('to', 'accelerate', 'our', 'programmes', 'for', 'reducing', 'costs', 'and', 'raising', 'productivity')

52461
('to', 'accelerate', 'the', 'development', 'and', 'deployment', 'of', 'low-carbon')

52463
('effect', ',', 'global', 'warming', 'will', 'accelerate', 'sharply', ',', 'with', 'unpredictable')

52464
('accelerate', 'harder')

52465
('to', 'accelerate', 'water', 'or', 'air', 'past')

52466
('to', 'accelerate', 'and', 'involved', 'quite', 'different', 'factors')

52467
('turbulence', 'and', 'accelerate', 'movement')

52468
('to', 'accelerate', 'and', 'beat', 'only', 'one')

52469
(

In [85]:
repl_df.to_pickle('data/lexical_repl/repl_df-tbd.zip')