In [3]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm_notebook as tqdm
from glob import glob
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize

# read manually curated csv

In [4]:
ann_df = pd.read_csv('data/microsoft/to_be_checked.csv', encoding='ISO-8859-1')[['Word', 'Replacement']].dropna()

In [5]:
ann_df.sample(5)

Unnamed: 0,Word,Replacement
930,TRANSFERRED,"gave, given"
911,THEREBY,as such
429,FACILITATES,helps
83,APPLICATION,use
727,PROVIDING,giving


In [6]:
dc = {}
for idx, row in ann_df.iterrows():
    repl = [x.strip() for x in row['Replacement'].split(',')]
    dc[row['Word'].lower()] = repl
dc

{'abaft': ['behind'],
 'abeyance': ['suspension'],
 'abominate': ['hate'],
 'abundance': ['lot'],
 'accelerated': ['sped up'],
 'accessibility': ['ease of use'],
 'accompanied by': ['with'],
 'accompanies': ['comes with'],
 'accompany': ['come with'],
 'accompanying': ['coming with'],
 'accordingly': ['so'],
 'accrue': ['gather', 'gain'],
 'accrued': ['gathered', 'gained'],
 'accrues': ['gathers', 'gains'],
 'accruing': ['gathering', 'gaining'],
 'accurate': ['correct', 'right'],
 'acknowledge': ['noted'],
 'acknowledged': ['noted'],
 'acknowledges': ['notes'],
 'acknowledging': ['noting'],
 'acknowledgment': ['notice'],
 'acquiesce': ['accept', 'allow'],
 'acquisition': ['acquiring'],
 'activated': ['started'],
 'activation': ['start'],
 'additional': ['more'],
 'adjustment': ['change'],
 'adjustments': ['changes'],
 'administration': ['people in charge'],
 'administrative': ['managing'],
 'administrator': ['manager', 'person in charge'],
 'admissible': ['allowed'],
 'aforementioned':

In [7]:
len(dc)

613

# collect all sentences: Brown, OANC, BNC

In [6]:
df = pd.DataFrame(columns = ['sent', 'source', 'description', 'orig', 'repl'])

In [7]:
df.sent = brown.sents()
df.source = ['brown'] * len(brown.sents())

In [8]:
path = '/home/rebekah/Documents/BNC/Texts/'
files = [f for f in glob(path + "**/*.xml", recursive=True)]

In [9]:
skip = [['Voice', 'over'], ['Male', 'speaker'], ['Female', 'speaker']]

def parse_bnc_xml(path):
    # takes path to BNC xml file
    # returns description of texts
    # returns list sentences, each of which is a list of tokens
    
    tree = ET.parse(path)
    root = tree.getroot()
    info = root[0][0][0][0].text
    sents = []
    
    for div in root[1]:
        sent = []
        for tag in div.iter():
            if tag.text == '\n':
                sent = list(filter(None, sent))
                if sent.count('.') > 1:
                    temp = ' '.join(sent)
                    temp = sent_tokenize(temp)
                    for temp_sent in temp:
                        temp_sent = word_tokenize(temp_sent)
                        if len(sent) < 2:
                            sent = []
                        elif temp_sent in skip: #skip certain things
                            sent = []
                        elif any([piece.isupper() for piece in temp_sent]):
                            sent = []
                        else:
                            sents.append(temp_sent)
                elif len(sent) > 1:
                    if sent in skip:
                        sent = []
                    elif any([piece.isupper() for piece in sent]):
                        sent = []
                    else:
                        sents.append(sent)
                sent = []
            elif tag.text != None:
                sent.append(tag.text.strip())
    
    return info, sents

In [10]:
sent = []
source = []
description = []

for file in tqdm(files):
    info, sents = parse_bnc_xml(file)
    for s in sents:
        sent.append(s)
        source.append('bnc')
        description.append(info)

temp_df = pd.DataFrame(columns = ['sent', 'source', 'description', 'orig', 'repl'])
temp_df.sent = sent
temp_df.source = source
temp_df.description = description
df = df.append(temp_df)




In [11]:
oanc_df = pd.read_pickle('data/discourse_markers/oanc_df.zip')

In [12]:
sent = []
source = []
description = []

for idx, row in tqdm(oanc_df.iterrows(), total = len(oanc_df)):
    for s in row['clean_and_tokenized']:
        sent.append(s)
        source.append('oanc')
        description.append(row['label'])
        
temp_df = pd.DataFrame(columns = ['sent', 'source', 'description', 'orig', 'repl'])
temp_df.sent = sent
temp_df.source = source
temp_df.description = description
df = df.append(temp_df)




In [13]:
len(df)

3818246

In [14]:
df.sample(10)

Unnamed: 0,sent,source,description,orig,repl
2305642,"[He, could, see, no, improvement, in, prospect...",bnc,Loving and giving. Sample containing about 4...,,
2802073,"[But, it, was, not, a, source, of, income, ;, ...",bnc,Jane's journey. Sample containing about 3376...,,
2562886,"[She, looked, at, the, rigid, contours, of, hi...",bnc,A French encounter. Sample containing about ...,,
1117225,"[However, ,, its, coming, is, not, automatic, ...",bnc,I believe in church growth. Sample containin...,,
1774196,"[‘, Gone, .]",bnc,The Mamur Zapt and the girl in the Nile. Sam...,,
467875,"[Oh, yes, ,, we, could, be, very, chic, when, ...",bnc,[Sainsbury's magazines]. Sample containing a...,,
1500951,"[‘, No, .]",bnc,Towards the end of the morning. Sample conta...,,
687130,"[Without, commitment, faith, seems, to, cost, ...",bnc,Doubt. Sample containing about 36915 words f...,,
1313473,"[’, (, He, goes, on, to, describe, how, swans,...",bnc,The Greek world: 479-323BC. Sample containin...,,
265264,"[I, ca, n't, even, think, about, my, fish, sau...",oanc,journal/slate/53/ArticleIP_57073,,


In [15]:
df.to_pickle('data/lexical_repl/sents_df.zip')

# Compile dataset

In [8]:
df = pd.read_pickle('data/lexical_repl/sents_df.zip')

In [9]:
dc

{'abaft': ['behind'],
 'abeyance': ['suspension'],
 'abominate': ['hate'],
 'abundance': ['lot'],
 'accelerated': ['sped up'],
 'accessibility': ['ease of use'],
 'accompanied by': ['with'],
 'accompanies': ['comes with'],
 'accompany': ['come with'],
 'accompanying': ['coming with'],
 'accordingly': ['so'],
 'accrue': ['gather', 'gain'],
 'accrued': ['gathered', 'gained'],
 'accrues': ['gathers', 'gains'],
 'accruing': ['gathering', 'gaining'],
 'accurate': ['correct', 'right'],
 'acknowledge': ['noted'],
 'acknowledged': ['noted'],
 'acknowledges': ['notes'],
 'acknowledging': ['noting'],
 'acknowledgment': ['notice'],
 'acquiesce': ['accept', 'allow'],
 'acquisition': ['acquiring'],
 'activated': ['started'],
 'activation': ['start'],
 'additional': ['more'],
 'adjustment': ['change'],
 'adjustments': ['changes'],
 'administration': ['people in charge'],
 'administrative': ['managing'],
 'administrator': ['manager', 'person in charge'],
 'admissible': ['allowed'],
 'aforementioned':

In [10]:
df.head()

Unnamed: 0,sent,source,description,orig,repl
0,"[The, Fulton, County, Grand, Jury, said, Frida...",brown,,,
1,"[The, jury, further, said, in, term-end, prese...",brown,,,
2,"[The, September-October, term, jury, had, been...",brown,,,
3,"[``, Only, a, relative, handful, of, such, rep...",brown,,,
4,"[The, jury, said, it, did, find, that, many, o...",brown,,,


In [11]:
i = 0
word = 'abaft'

for idx, row in tqdm(df.iterrows(), total = len(df)):
    if np.isnan(row.orig) and word in row.sent:
        print(' '.join(row.sent))
        print(' '.join(map(lambda x: x if x != word else dc[word][0], row.sent)))
        print()
        i += 1
    if i > 100:
        break

HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))

Inside his , Anatoliy had his catch for the day , a couple of handfuls of little silver fish , still slithering , with red around the gills and streaks of gold abaft their fins .
Inside his , Anatoliy had his catch for the day , a couple of handfuls of little silver fish , still slithering , with red around the gills and streaks of gold behind their fins .

’ Talbot woke in his sea-cabin abaft the bridge to find Van Gelder bending over him .
’ Talbot woke in his sea-cabin behind the bridge to find Van Gelder bending over him .

In the yacht we sailed , a second head and shower ( a smaller compartment than the one forward ) was provided just abaft the galley .
In the yacht we sailed , a second head and shower ( a smaller compartment than the one forward ) was provided just behind the galley .

Went-worth/flexner and Chapman both cite the term the leather as meaning a kick , but they date the term from 1946 , citing a passage from Damon Runyon : “ he would give his fallen foe what we cal

In [None]:
# acrolinx examples