In [15]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm_notebook as tqdm
from glob import glob
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
import difflib
import pickle

This notebook takes the first steps towards a curated dataset to train the fragment translation system. In it, I collect all sentences from the three relevant datasets, split them up into sentences, and put them in a dataframe to use. I also take the validated pairs of formal/informal words and phrases from the Microsoft data and create text files with the sentences from the dataset for each of them to check if they really can be replaced in all contexts.

# read manually curated csv

This creats a dictionary of terms and their informal replacements from the generated and validated lists.

In [2]:
ann_df = pd.read_csv('data/microsoft/to_be_checked.csv', encoding='ISO-8859-1')[['Word', 'Replacement']].dropna()

dc = {}
for idx, row in ann_df.iterrows():
    repl = [x.strip() for x in row['Replacement'].split(',')]
    dc[row['Word'].lower()] = repl

print(len(dc))

613


# collect all sentences: Brown, OANC, BNC

The three different sources of data required different handling methods. The Brown sentences could be directly accessed, pre-tokenized, from nltk, and the BNC/OANC ones had either been accessed and tokenized before for the discourse marker approach, or were stripped and preprocessed from the XML files.

In [6]:
df = pd.DataFrame(columns = ['sent', 'source', 'description', 'orig', 'repl'])

In [7]:
df.sent = brown.sents()
df.source = ['brown'] * len(brown.sents())

In [8]:
path = '/home/rebekah/Documents/BNC/Texts/'
files = [f for f in glob(path + "**/*.xml", recursive=True)]

In [9]:
skip = [['Voice', 'over'], ['Male', 'speaker'], ['Female', 'speaker']]

def parse_bnc_xml(path):
    # takes path to BNC xml file
    # returns description of texts
    # returns list sentences, each of which is a list of tokens
    
    tree = ET.parse(path)
    root = tree.getroot()
    info = root[0][0][0][0].text
    sents = []
    
    for div in root[1]:
        sent = []
        for tag in div.iter():
            if tag.text == '\n':
                sent = list(filter(None, sent))
                if sent.count('.') > 1:
                    temp = ' '.join(sent)
                    temp = sent_tokenize(temp)
                    for temp_sent in temp:
                        temp_sent = word_tokenize(temp_sent)
                        if len(sent) < 2:
                            sent = []
                        elif temp_sent in skip: #skip certain things
                            sent = []
                        elif any([piece.isupper() for piece in temp_sent]):
                            sent = []
                        else:
                            sents.append(temp_sent)
                elif len(sent) > 1:
                    if sent in skip:
                        sent = []
                    elif any([piece.isupper() for piece in sent]):
                        sent = []
                    else:
                        sents.append(sent)
                sent = []
            elif tag.text != None:
                sent.append(tag.text.strip())
    
    return info, sents

In [10]:
sent = []
source = []
description = []

for file in tqdm(files):
    info, sents = parse_bnc_xml(file)
    for s in sents:
        sent.append(s)
        source.append('bnc')
        description.append(info)

temp_df = pd.DataFrame(columns = ['sent', 'source', 'description', 'orig', 'repl'])
temp_df.sent = sent
temp_df.source = source
temp_df.description = description
df = df.append(temp_df)




In [11]:
oanc_df = pd.read_pickle('data/discourse_markers/oanc_df.zip')

In [12]:
sent = []
source = []
description = []

for idx, row in tqdm(oanc_df.iterrows(), total = len(oanc_df)):
    for s in row['clean_and_tokenized']:
        sent.append(s)
        source.append('oanc')
        description.append(row['label'])
        
temp_df = pd.DataFrame(columns = ['sent', 'source', 'description', 'orig', 'repl'])
temp_df.sent = sent
temp_df.source = source
temp_df.description = description
df = df.append(temp_df)




In [13]:
len(df)

3818246

In [14]:
df.sample(10)

Unnamed: 0,sent,source,description,orig,repl
2305642,"[He, could, see, no, improvement, in, prospect...",bnc,Loving and giving. Sample containing about 4...,,
2802073,"[But, it, was, not, a, source, of, income, ;, ...",bnc,Jane's journey. Sample containing about 3376...,,
2562886,"[She, looked, at, the, rigid, contours, of, hi...",bnc,A French encounter. Sample containing about ...,,
1117225,"[However, ,, its, coming, is, not, automatic, ...",bnc,I believe in church growth. Sample containin...,,
1774196,"[‘, Gone, .]",bnc,The Mamur Zapt and the girl in the Nile. Sam...,,
467875,"[Oh, yes, ,, we, could, be, very, chic, when, ...",bnc,[Sainsbury's magazines]. Sample containing a...,,
1500951,"[‘, No, .]",bnc,Towards the end of the morning. Sample conta...,,
687130,"[Without, commitment, faith, seems, to, cost, ...",bnc,Doubt. Sample containing about 36915 words f...,,
1313473,"[’, (, He, goes, on, to, describe, how, swans,...",bnc,The Greek world: 479-323BC. Sample containin...,,
265264,"[I, ca, n't, even, think, about, my, fish, sau...",oanc,journal/slate/53/ArticleIP_57073,,


In [15]:
df.to_pickle('data/lexical_repl/sents_df.zip')

# Compile dataset

## OANC / BNC collect sentences

Here, 

In [18]:
df = pd.read_pickle('data/lexical_repl/sents_df.zip')

In [40]:
def print_examples(df, word):
    for idx, row in tqdm(df.iterrows(), total = len(df)):
        lowered = [word.lower() for word in row.sent]
        if np.isnan(row.orig) and word in lowered:
            print(' '.join(row.sent))
            print(' '.join(map(lambda x: x if x != word else dc[word][0], lowered)))
            print()
            
def print_all_examples(df, dc):
    current_status = [file[32:-4] for file in glob('data/lexical_repl/word_contexts/*')]
    current_status.extend([file[37:-4] for file in glob('data/lexical_repl/word_contexts_done/*')])
    print('already done:\t' + ' '.join(current_status))
        
    for term in dc:
        print('file started:\t' + term)
        replace = '[' + '/'.join(dc[term]) + ']'
        with open('data/lexical_repl/word_contexts/' + term + '.txt', 'w') as f:
            for idx, row in tqdm(df.iterrows(), total = len(df)):
                lowered = [word.lower() for word in row.sent]
                if np.isnan(row.orig) and term in lowered:
                    f.write(' '.join(row.sent) + '\n')
                    f.write(' '.join(map(lambda x: x if x != term else replace, lowered)) + '\n\n')
        print('file written:\t' + term)

In [41]:
print_all_examples(df, dc)

already done:	
file started:	considerations


HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))

file written:	considerations
file started:	accurate


HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))

file written:	accurate
file started:	triggered


HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))

file written:	triggered
file started:	obtains


HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))

KeyboardInterrupt: 

# text

In [23]:
with open('data/lexical_repl/w2idx.pkl', 'rb') as f:
    w2idx = pickle.load(f)
    
def collect_all_idx(df):
    indices = []
    for idx, row in tqdm(df.iterrows(), total = len(df)):
        current = []
        for item in row.sent:
            if item in w2idx:
                current.append(w2idx[item])
            else: # handle unknowns
                current.append(w2idx['[UNK]'])
        indices.append(current)
    df['idx'] = indices
    return df

In [24]:
df = collect_all_idx(df)

HBox(children=(IntProgress(value=0, max=3818246), HTML(value='')))




In [25]:
df.to_pickle('data/lexical_repl/sents_df.zip')