In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
from glob import glob
import xml.etree.ElementTree as ET
import string
import pickle

This notebook serves the purpose of extracting discourse markers/connectives from various corpora for later use.

# helper functions

These functions are used later in the notebook for various purposes.

In [80]:
def read_lines(loc):
    # given file path, returns all lines in list form
    with open(loc) as f:
        return [x.strip() for x in f.readlines()]

In [84]:
def read_file(loc):
    # given file path
    with open(loc) as f:
        return f.read().strip()

In [48]:
def fill_df(df, terms):
    # count number of example sentences (unverified) for each example
    terms_dict = defaultdict(list)
    for item in tqdm(terms):
        if item.strip() in df.columns:
            continue
        for sent in df['sent']:
            sent = ' '.join(sent)
            if item + ' ' in sent:
                terms_dict[item.strip()].append(1)
            else:
                terms_dict[item.strip()].append(0)
        df[item.strip()] = terms_dict[item.strip()]
    return df

In [49]:
def clean_sent_simple(sent, term):
    # given a sentence and a term in one of the initial positions,
    # removes the term and capitalizes the next appropriate word
    # makes sure to remove commas as well
    
    copy = sent.copy()
    
    terms = nltk.word_tokenize(term)
    if len(terms) == 1:
        term = terms[0]
    else:
        temp_idx = -1
        for term_idx in range(len(terms)):
            if temp_idx == -1:
                temp_idx = copy.index(terms[term_idx])
            else:
                try:
                    assert copy.index(terms[term_idx]) == temp_idx + 1
                except: # there's only 3
                    print('skip')
                    return np.nan
                del copy[temp_idx + 1]
        term = terms[0]
    
    idx = sent.index(term)
    
    if copy[idx + 1] == ',': # remove comma too
        del copy[idx + 1]
    elif copy[idx + 1] == '.': # make sure you're not removing entire sentence
        print('end of sent')
        return np.nan
    elif copy[idx + 1] == "''": # end of quote
        print('end of quote')
        return np.nan
        
    copy[idx + 1] = copy[idx + 1].capitalize()
    del copy[idx]
        
    return copy

def clean_df(df):
    # given a dataset with sentences
    # provide 'clean' version of sentence, without the relevant term, using function above
    # checks for capitalization and makes proper if necessary
    
    df['clean'] = [np.nan] * len(df)
    df['clean'] = df['clean'].astype(object)
    terms = df.columns[1:-1]
    
    for term in tqdm(terms):
        for idx, row in df[df[term] == 1].iterrows():
            df.at[idx, 'clean'] = clean_sent_simple(row['sent'], term)
            
    return df

In [6]:
def clean_sent(sent, term):
    # given a sentence and a term in one of the initial positions,
    # removes the term and capitalizes the next appropriate word
    # makes sure to remove commas as well
    # this version tailored to BNC corpus and gives feedback
    
    copy = sent.copy()
    idx = copy.index(term)
    copy[idx] = ''
    
    change = False
    for rest_idx in range(idx, len(copy)):
        if copy[rest_idx] == '':
            continue
        elif copy[rest_idx] == ',': # remove comma too
            copy[rest_idx] = ''
        elif copy[rest_idx] == '.':
            print('ERROR: end of sent')
            print(sent)
            return np.nan
        elif copy[rest_idx] == "'" or copy[rest_idx] == '"':
            print('ERROR: end of quote')
            print(sent)
            return np.nan
        elif copy[rest_idx][0].isalpha():
            copy[rest_idx] = copy[rest_idx].capitalize()
            change = True
            
        if change == True:
            break
    
    if change == False:
        print('ERROR: no change possible')
        print(sent)
        return np.nan
        
    return list(filter(None, copy))

In [223]:
def check_sent(sent):
    # ensure that sentence is proper, remove some dust, return
    ends = ['”', '"', "'", '.', ']', ')', '!', '?', '’', '…', ':']
    sent = sent.strip(string.digits)
    sent = sent.strip()
    if sent == '':
        return ''
    if sent[0].isupper() and sent[-1] in ends: # try to filter out sentences that aren't complete
        return sent
    else:
        return ''

In [251]:
def return_vector(sent, terms):
    # given a dictionary of terms-to-idx, returns a vector of length len(terms + 1) showing which term is in the sentence
    # the last class is the null class -- i.e. no term appears
    # also returns the sentence, changed properly if a term does appear
    sent = nltk.word_tokenize(sent)
    
    vec = [0] * (len(terms) + 1)
    for term in terms:
        if term in sent:
            vec[terms[term]] = 1
            sent = clean_sent(sent, term)
            break            
    
    return vec, sent

In [5]:
skip = [['Voice', 'over'], ['Male', 'speaker'], ['Female', 'speaker']]

def parse_bnc_xml(path):
    # takes path to BNC xml file
    # returns description of texts
    # returns list of documents, each of which is a list of sentences, each of which is a list of tokens
    
    tree = ET.parse(path)
    root = tree.getroot()
    info = root[0][0][0][0].text
    documents = []
    
    for div in root[1]:
        doc = []
        sent = []
        for tag in div.iter():
            if tag.text == '\n':
                sent = list(filter(None, sent))
                if sent.count('.') > 1:
                    temp = ' '.join(sent)
                    temp = sent_tokenize(temp)
                    for temp_sent in temp:
                        temp_sent = word_tokenize(temp_sent)
                        if len(sent) < 2:
                            sent = []
                        elif temp_sent in skip: #skip certain things
                            sent = []
                        elif any([piece.isupper() for piece in temp_sent]):
                            sent = []
                        else:
                            doc.append(temp_sent)
                elif len(sent) > 1:
                    if sent in skip:
                        sent = []
                    elif any([piece.isupper() for piece in sent]):
                        sent = []
                    else:
                        doc.append(sent)
                sent = []
            elif tag.text != None:
                sent.append(tag.text.strip())
        if len(doc) > 1:
            documents.append(doc)
    
    return info, documents

# Open American National Corpus (OANC)

The nature of the OANC documents meant that they had to be handled specially, without much use of the above functions. Some types of files were skipped (the spoken ones were difficult to parse into sentences, and the biomedical files not only had those segmentation issues but were also profoundly technical) and others had differing preprocessing needs that were discovered through some manual inspection.

In [204]:
disc = read_lines('/home/rebekah/Documents/make-it-sound-less-formal/data/discourse_markers/discourse_markers.txt')

In [42]:
path = '/home/rebekah/Documents/OANC-GrAF/'

In [43]:
files = [f for f in glob(path + "**/*.txt", recursive=True)]

In [220]:
all_paragraphs = []
labels = []

for file in tqdm(files):
    if 'spoken' in file or 'biomed' in file:
        continue
    else:
        text = read_file(file)
        if 'eggan' in file:
            text = text.replace("\n\t\t\t\t", ' ')
            text = text.replace('\t', '')
            text = text.split('\n')
        elif 'slate' in file or 'verbatim' in file:
            text = text.split('\n\n')
            text = [x.replace('\n', ' ').strip() for x in text]
        elif 'verbatim' in file:
            text = text.split('\n\n')
            text = [x.replace('\n', ' ').strip() for x in text]
        elif 'icic' in file:
            text = text.split('\n\n')
            text = [x.replace('\n', ' ').strip() for x in text]
            text = [x.replace('       ', ' ').strip() for x in text]
        elif 'OUP' in file:
            text = [x.strip() for x in text.split('\n')]
        elif '911report' in file:
            text = text.replace('            ', '')
            text = text.split('\n\n')
            text = [x.replace('\n', ' ').strip() for x in text]
            text = [x.replace('     ', ' ').strip() for x in text]
        elif 'government' in file:
            text = text.split('\n\n')
            text = [x.replace('\n', ' ').strip() for x in text]
        elif 'plos' in file:
            text = text.replace('\n        ', ' ')
            text = text.split('\n')
        elif 'berlitz1' in file:
            text = text.replace('\n        ', ' ')
            text = text.split('\n')
        elif 'berlitz2' in file:
            text = text.replace('\n        ', ' ')
            text = text.split('\n')
        for par in text:
            if par == '':
                continue
            elif len(par) < 100:
                continue
            else:
                all_paragraphs.append(par)
                labels.append(file[49:-4])

oanc_df = pd.DataFrame()
oanc_df['text'] = all_paragraphs
oanc_df['label'] = labels

In [226]:
sents = []
count = 0
for idx, row in tqdm(oanc_df.iterrows(), total = len(oanc_df)):
    temp_sents = []
    for sent in nltk.sent_tokenize(row['text']):
        sent = check_sent(sent)
        if sent != '':
            temp_sents.append(sent)
            count += 1
    if len(temp_sents) > 0:
        sents.append(temp_sents)
    else:
        sents.append(np.nan)
oanc_df['sents'] = sents
oanc_df = oanc_df.dropna()
print(count) # number of sentences

339631


In [228]:
oanc_df.sample(10)

Unnamed: 0,text,label,sents
896,Computer layout systems also improve the quali...,non-fiction/OUP/Abernathy/ch8,[Computer layout systems also improve the qual...
15792,"Tenet, accompanied by his deputy director for ...",technical/911report/chapter-6,"[Tenet, accompanied by his deputy director for..."
32416,The sociologist Nathan Glazer reviewed Slavery...,journal/slate/48/ArticleIP_9089,[The sociologist Nathan Glazer reviewed Slaver...
15706,"His one-day stopover on March 25, 2000, was th...",technical/911report/chapter-6,"[His one-day stopover on March 25, 2000, was t..."
61409,The coverage emphasizes that the White House c...,journal/slate/7/Article247_821,[The coverage emphasizes that the White House ...
10875,Internal Control 7.11 Auditors should obtain a...,technical/government/Gen_Account_Office/Govern...,[Internal Control 7.11 Auditors should obtain ...
20306,"And there Fish more or less stops. (Well, actu...",journal/slate/19/Article247_4257,"[And there Fish more or less stops., Maddening..."
64273,The remainder of Chapter 1 is mainly a catalog...,journal/verbatim/VOL15_2,[The remainder of Chapter 1 is mainly a catalo...
29733,I love this in part because I am proud that I ...,journal/slate/37/ArticleIP_2561,[I love this in part because I am proud that I...
28834,"""That was a miserable year, when I watched a g...",journal/slate/51/ArticleIP_34924,[Who watched whom go from 92 to 38 what?]


In [230]:
terms_dict = defaultdict(int)

for idx, row in tqdm(oanc_df.iterrows(), total = len(oanc_df)):
    for sent in row.sents:
        sent = nltk.word_tokenize(sent)
        for item in disc:
            if item in sent:
                terms_dict[item] += 1

print(terms_dict)

defaultdict(<class 'int'>, {'And': 6150, 'Recently': 108, 'First': 990, 'Yet': 759, 'So': 2115, 'Fortunately': 82, 'But': 11108, 'Unfortunately': 256, 'Finally': 457, 'Now': 1264, 'Also': 993, 'Yes': 356, 'Or': 888, 'Oh': 228, 'Well': 534, 'Again': 160, 'Alas': 78, 'Ah': 66, 'Plus': 60, 'Sure': 110, 'Admittedly': 23, 'Basically': 19, 'Hey': 83, 'Heck': 2, 'Seriously': 4, 'Okay': 17, 'Uh': 16, 'Anyway': 86, 'Anyhow': 8, 'Ok': 2})


In [249]:
sorted_items = sorted(terms_dict.items(), key=lambda x:x[1])[-9:]
# take the ones with more than 500 occurrences - even with these this is not enough data

chosen_terms = {}
idx = 0
for item in sorted_items:
    chosen_terms[item[0]] = idx
    idx += 1
chosen_terms

{'Also': 4,
 'And': 7,
 'But': 8,
 'First': 3,
 'Now': 5,
 'Or': 2,
 'So': 6,
 'Well': 0,
 'Yet': 1}

In [257]:
tokenized_par = []
vecs_par = []

for idx, row in tqdm(oanc_df.iterrows(), total = len(oanc_df)):
    tokenized_sents = []
    vecs = []
    for sent in row.sents:
        vec, tokenized_sent = return_vector(sent, chosen_terms)
        tokenized_sents.append(tokenized_sent)
        vecs.append(vec)
    tokenized_par.append(tokenized_sents)
    vecs_par.append(vecs)

oanc_df['clean_and_tokenized'] = tokenized_par
oanc_df['vectors'] = vecs_par

end of sent
end of sent
end of quote
end of sent
end of sent
end of sent
end of quote
end of sent
end of sent
end of sent
end of quote
end of sent
end of sent
end of quote
end of sent
end of quote
end of quote
end of quote
end of quote
end of sent
end of sent
end of quote
end of quote
end of sent
end of sent
end of sent


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [265]:
oanc_df = oanc_df.dropna()
oanc_df = oanc_df.reset_index(drop = True)
oanc_df

Unnamed: 0,text,label,sents,clean_and_tokenized,vectors
0,In my three decades of teaching university cou...,non-fiction/OUP/Berk/ch1,[In my three decades of teaching university co...,"[[In, my, three, decades, of, teaching, univer...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
1,"As a byproduct of those experiences, parents r...",non-fiction/OUP/Berk/ch1,"[As a byproduct of those experiences, parents ...","[[As, a, byproduct, of, those, experiences, ,,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
2,"•Bob and Sharon, parents of a 4-year-old: Our ...",non-fiction/OUP/Berk/ch1,"[When we looked for a preschool, many programs...","[[When, we, looked, for, a, preschool, ,, many...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
3,"•Angela, mother of a 4-year-old and 6-year-old...",non-fiction/OUP/Berk/ch1,[I’ve read that it’s the quality of time we sp...,"[[I, ’, ve, read, that, it, ’, s, the, quality...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
4,"•Talia, mother of a 7-year-old: My son Anselmo...",non-fiction/OUP/Berk/ch1,[His father ﬁrmly insists that he do it by him...,"[[His, father, ﬁrmly, insists, that, he, do, i...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
5,"•Noah and Suzanne, parents of a 2-year-old: Wh...",non-fiction/OUP/Berk/ch1,[Recently we read that how children turn out i...,"[[Recently, we, read, that, how, children, tur...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
6,"Despite being well educated, intent on doing w...",non-fiction/OUP/Berk/ch1,"[Despite being well educated, intent on doing ...","[[Despite, being, well, educated, ,, intent, o...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
7,"The reasons, I believe, are twofold. First, ra...",non-fiction/OUP/Berk/ch1,"[The reasons, I believe, are twofold., First, ...","[[The, reasons, ,, I, believe, ,, are, twofold...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, ..."
8,"Over the past three decades, external forces i...",non-fiction/OUP/Berk/ch1,"[Over the past three decades, external forces ...","[[Over, the, past, three, decades, ,, external...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
9,Although many societal conditions heighten par...,non-fiction/OUP/Berk/ch1,[Although many societal conditions heighten pa...,"[[Although, many, societal, conditions, height...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."


In [270]:
oanc_df.to_pickle('data/discourse_markers/oanc_df.zip')

with open('oanc_terms.pkl', 'wb') as f:
    pickle.dump(chosen_terms, f)

# British National Corpus (BNC)

The BNC, while easier to handle than the OANC, still had some specialized preprocessing and segmentation issues to be handled. Here I extract the data, clean it and put it into a DataFrame for later use.

In [3]:
with open('data/discourse_markers/oanc_terms.pkl', 'rb') as f:
    terms_dict = pickle.load(f)

In [4]:
path = '/home/rebekah/Documents/BNC/Texts/'
files = [f for f in glob(path + "**/*.xml", recursive=True)]

In [7]:
label = []
sent1 = []
sent2 = []
sent2_og = []
y = []

for file in tqdm(files):
    info, docs = parse_bnc_xml(file)
    for doc in docs:
        for idx in range(len(doc))[1:]:
            found = False
            for word in doc[idx][:3]:
                if word in terms_dict:
                    clean = clean_sent(doc[idx], word)
                    if type(clean) != list:
                        break
                    label.append(info)
                    sent1.append(doc[idx-1])
                    sent2.append(clean)
                    sent2_og.append(doc[idx])
                    this_y = np.zeros(len(terms_dict) + 1)
                    this_y[terms_dict[word]] = 1
                    y.append(this_y)
                    found = True
            if not found:
                label.append(info)
                sent1.append(doc[idx-1])
                sent2.append(doc[idx])
                sent2_og.append(None)
                this_y = np.zeros(len(terms_dict) + 1)
                this_y[9] = 1
                y.append(this_y)
        break
                
df = pd.DataFrame()
df['label'] = label
df['sent1'] = sent1
df['sent2'] = sent2
df['sent2_orig'] = sent2_og 
df['y'] = y

ERROR: no change possible
['Well', '!']
ERROR: end of sent
['Well', ',', '.']
ERROR: no change possible
['Well', '!']
ERROR: no change possible
['Yes', '.', 'So']
ERROR: no change possible
['Ah', '!', 'Well']
ERROR: no change possible
['Yeah', '.', 'Well']
ERROR: end of sent
['And', '.']
ERROR: no change possible
['So', '?']
ERROR: no change possible
['Ha', '!', 'Well']
ERROR: end of sent
['Well', '.']
ERROR: end of sent
['But', '.']
ERROR: end of sent
['So', '.']
ERROR: end of sent
['Now', '.']
ERROR: no change possible
['Right', '.', 'And']
ERROR: no change possible
['on', '.', 'And']
ERROR: no change possible
['No', '!', 'Well']
ERROR: end of sent
['Now', '.']
ERROR: no change possible
['So', '?']
ERROR: no change possible
['Or', '?']
ERROR: no change possible
['And', '?']
ERROR: end of sent
['So', '.']
ERROR: end of sent
['So', '.']
ERROR: end of sent
['And', '.']
ERROR: no change possible
['birthday', '.', 'Now']
ERROR: no change possible
['Liver', '.', 'And']
ERROR: no change pos

ERROR: no change possible
['‘', 'And', '…', '?', '’']
ERROR: end of sent
['Yet', '.']
ERROR: end of sent
['Now', '.']
ERROR: no change possible
['But', '…']
ERROR: no change possible
['‘', 'And', '?', '’']
ERROR: no change possible
['‘', 'But', '…', '’']
ERROR: no change possible
['‘', 'So', '?', '’']
ERROR: no change possible
['‘', 'So', '?', '’']
ERROR: no change possible
['‘', 'But', '…', '’']
ERROR: no change possible
['Well', '!']
ERROR: no change possible
['And', '—', '’']
ERROR: no change possible
['‘', 'So', '?']
ERROR: end of sent
['Well', '.']
ERROR: no change possible
['‘', 'And', '?', '’']
ERROR: no change possible
['‘', 'Well', '?', '’']
ERROR: no change possible
['‘', 'But', '?', '’']
ERROR: no change possible
['‘', 'But', '?', '’']
ERROR: end of sent
['Now', '.']
ERROR: end of sent
['‘', 'Now', '.']
ERROR: end of sent
['‘', 'But', '—', '.', '’']
ERROR: no change possible
['‘', 'Well', '?']
ERROR: no change possible
['‘', 'So', '?']
ERROR: no change possible
['Tart-with-a

In [19]:
df.sample(5)

Unnamed: 0,label,sent1,sent2,sent2_orig,y
186625,The birdwatcher's handbook. Sample containing...,"[There, we, all, stand, in, companionable, sil...","[More, folk, tramp, along, the, sandy, track, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
258269,Introduction to politics. Sample containing ...,"[The, people, recruited, into, official, posit...","[Although, the, United, States, Congress, is, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
482174,"Independent, electronic edition of 1989-10-0...","[Andy, Jones, ,, the, Charlton, striker, ,, ha...","[Ian, Dowie, ,, a, former, missile, engineer, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5274,19 conversations recorded by `Martin' (PS0KN...,"[yeah, but, this, is, ,, this, is, different, ...","[cos, has, n't, Grantham, got, the, reputation...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
182921,A compass error. Sample containing about 361...,"[‘, Her, mother, still, did, n't, tell, her, t...","[Constanza, did, ask, ,, Anna, simply, repeate...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [20]:
df.to_pickle('data/discourse_markers/bnc_df.zip')

# [unused] Brown corpus

This section takes the Brown corpus (used for initial tests, eventually discarded due to lack of paragraph/document boundaries) and the discovered discourse markers from the Acrolinx data and creates a DataFrame with the sentences, the discourse markers found in them if any, and the "cleaned" sentences if there were markers found.

In [99]:
brown_disc_df = pd.read_pickle('data/discourse_markers/brown_disc_df.pkl')

In [97]:
brown_disc_df.to_pickle('data/discourse_markers/brown_disc_df.pkl')

# [unused] qualifiers/intensifiers: Brown corpus

This was used initially to search for qualifiers and intensifiers, which were collected from the Acrolinx dataset, in the Brown corpus and see the contexts in which they appeared.

In [137]:
# count number of example sentences (unverified) for each example
for col in brown_qual_df:
    if brown_qual_df[col].dtype == 'int64':
        print(col + '\t' + str(sum(brown_qual_df[col])))

if ever	3
ultimately	19
far	401
all of	153
so-called	31
particularly	141
much	878
admittedly	3
please	45
exactly	99
perfectly	31
actually	127
little	767
big	308
apparently	102
a lot	85
alone	190
vast	60
all	2611
at all	183
in effect	24
clearly	118
extremely	50
generally	119
some kind of	21
quite	269
usually	185
right	577
too	760
sorely	3
also	983
of course	234
surely	38
importantly	8
ever	328
blatantly	0
such	1124
lots of	26
exact	27
honestly	12
at least	272
just	742
necessarily	49
really	267
probably	232
some	1345
even	954
occasionally	32
pretty	98
relatively	84
fully	80
ideally	5
absolutely	27
well	744
definitely	21
a bit	56
incredibly	7
sort of	117
maybe	66
certainly	115
literally	26
simply	166
possibly	57
so	1641
specifically	36
actively	11
then	984
very	749
completely	109
er	0
truly	56
