In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import brown
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict

# helper functions

In [103]:
def read_file(loc):
    """Return the lines of the text file at ``loc`` as a list of strings.

    Leading and trailing whitespace (including the line terminator) is
    stripped from every line; blank lines are kept as empty strings.
    """
    stripped_lines = []
    with open(loc) as handle:
        # Iterating the handle yields the same lines readlines() would.
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

In [108]:
def fill_df(df, terms):
    """Add one 0/1 indicator column per term to ``df`` and return ``df``.

    ``df`` must have a ``'sent'`` column whose cells are sequences of token
    strings.  For every ``item`` in ``terms`` a column named ``item.strip()``
    is added, holding 1 for each row whose space-joined sentence contains the
    substring ``item + ' '`` and 0 otherwise.  The item is matched exactly as
    passed (only the column name is stripped), so a caller that wants
    whole-word matches passes each term with a leading space.  Because the
    joined sentence has no leading space, such a term is not matched when it
    is the very first token.

    Terms whose stripped name is already a column of ``df`` are skipped.
    ``df`` is modified in place; the returned object is the same frame.
    """
    # The joined text of a sentence does not depend on the term, so build it
    # once (and only if at least one term actually needs it) instead of
    # re-joining every sentence for every term.
    joined_sents = None

    for item in tqdm(terms):
        column = item.strip()
        if column in df.columns:
            continue

        if joined_sents is None:
            joined_sents = [' '.join(sent) for sent in df['sent']]

        # Trailing space: the term must be followed by another token.
        needle = item + ' '
        df[column] = [1 if needle in text else 0 for text in joined_sents]

    return df

In [92]:
def clean_sent(sent, term):
    """Return a copy of ``sent`` with the first occurrence of ``term`` removed.

    ``sent`` is a sequence of token strings and is never modified.  ``term``
    is split with ``nltk.word_tokenize``.  The removal is anchored at the
    first token equal to the term's first word; for a multi-word term each
    later word must be found (by first occurrence) directly after that anchor.
    A comma immediately following the term is removed too, and the token that
    then follows is passed through ``str.capitalize``.

    Returns the cleaned token list, or ``np.nan`` after printing the reason
    when the term cannot be removed cleanly:

    * ``'skip'``         - the words of a multi-word term are not adjacent at
      the anchor (including when a later word is missing altogether);
    * ``'end of sent'``  - the term is followed by ``'.'`` or by nothing;
    * ``'end of quote'`` - the term is followed by ``"''"``.

    Raises ``ValueError`` if the first word of ``term`` is not a token of
    ``sent``.

    NOTE(review): the following token is capitalised unconditionally, even
    when the term sat mid-sentence, and ``str.capitalize`` lower-cases the
    rest of that token.  Kept as-is; confirm this is what downstream expects.
    """
    tokens = list(sent)  # work on a copy so the caller's sentence is untouched
    words = nltk.word_tokenize(term)

    # Anchor: first token equal to the first word of the term.
    idx = tokens.index(words[0])
    follower = idx + 1

    # Drop the remaining words of a multi-word term one by one.  An explicit
    # comparison is used (not assert inside a bare except) so the check still
    # runs under ``python -O`` and unrelated exceptions are not swallowed.
    for word in words[1:]:
        try:
            adjacent = tokens.index(word) == follower
        except ValueError:  # word does not occur in the sentence at all
            adjacent = False
        if not adjacent:
            print('skip')
            return np.nan
        del tokens[follower]

    if follower >= len(tokens):
        # The term is the final token: nothing is left to recapitalise.
        print('end of sent')
        return np.nan

    if tokens[follower] == ',':  # remove comma too
        del tokens[follower]
        if follower >= len(tokens):
            # Only the comma followed the term.
            print('end of sent')
            return np.nan
    elif tokens[follower] == '.':  # make sure you're not removing entire sentence
        print('end of sent')
        return np.nan
    elif tokens[follower] == "''":  # end of quote
        print('end of quote')
        return np.nan

    tokens[follower] = tokens[follower].capitalize()
    del tokens[idx]

    return tokens

def clean_df(df):
    """Add a ``'clean'`` column holding each sentence with its term removed.

    Every column strictly between the first and the last one (after
    ``'clean'`` has been added) is treated as a 0/1 term column, so the frame
    is expected to have ``'sent'`` first and ``'clean'`` last.  For each row
    flagged 1 in a term column, ``clean_sent(sentence, term)`` is stored in
    ``'clean'``; a row flagged for several terms keeps the result of the last
    one processed.  Unflagged rows stay NaN.

    ``df`` is modified in place and returned.
    """
    # Start from an all-NaN column of dtype object so cells can hold token lists.
    row_count = len(df)
    df['clean'] = [np.nan] * row_count
    df['clean'] = df['clean'].astype(object)

    term_columns = df.columns[1:-1]

    for term in tqdm(term_columns):
        flagged = df[df[term] == 1]
        for row_label, tokens in zip(flagged.index, flagged['sent']):
            df.at[row_label, 'clean'] = clean_sent(tokens, term)

    return df

# discourse markers: Brown corpus

In [99]:
# Load the pickled DataFrame stored under data/discourse_markers/.
# NOTE(review): the code that built this frame is not in the visible part of the
# notebook, so a fresh kernel can only get it from this file - confirm it is versioned.
# Unpickling can execute arbitrary code; only load pickle files you trust.
brown_disc_df = pd.read_pickle('data/discourse_markers/brown_disc_df.pkl')

In [97]:
# Write brown_disc_df to the same pickle path that the preceding cell reads from.
# NOTE(review): the execution counts (In [97] here, In [99] on the load cell) show this
# save was run before the load; run top to bottom it only rewrites the file just read.
brown_disc_df.to_pickle('data/discourse_markers/brown_disc_df.pkl')

# qualifiers/intensifiers: Brown corpus

In [101]:
# Sentences of the Brown corpus from nltk's corpus reader; later cells join each
# sentence with ' ', i.e. every sentence is a sequence of token strings.
# NOTE(review): presumably needs the Brown corpus data installed locally - confirm.
sents = brown.sents()

In [127]:
# Qualifier/intensifier terms: one entry per line of the list file, whitespace-stripped
# by read_file.  set() drops duplicate entries; a set has no fixed iteration order,
# so code that loops over `qual` sees the terms in arbitrary order.
qual = set(read_file('data/qualifiers_intensifiers/qual_intens_list.txt'))

In [132]:
# Start a fresh frame whose only column, 'sent', holds one Brown sentence per row.
# fill_df later adds one indicator column per term next to it.
brown_qual_df = pd.DataFrame()
brown_qual_df['sent'] = sents

In [133]:
# Add one 0/1 indicator column per qualifier (fill_df also modifies the frame in place).
# Each term gets a leading space; together with the trailing space fill_df appends,
# it then only matches whole space-delimited tokens (' far ' does not match 'farther').
# sorted(): `qual` is a set, and the iteration order of a set of strings can change
# between interpreter sessions; sorting keeps the column order reproducible.
brown_qual_df = fill_df(brown_qual_df, [' ' + x for x in sorted(qual)])

HBox(children=(IntProgress(value=0, max=71), HTML(value='')))




In [137]:
# Count the (unverified) matching sentences per term: the int64 columns are the
# 0/1 indicator columns added by fill_df, so each column sum is a sentence count.
for col in brown_qual_df:
    if brown_qual_df[col].dtype == 'int64':
        # Series.sum() is vectorised; the built-in sum() would iterate the
        # column element by element in Python.  The printed text is unchanged.
        print(col + '\t' + str(brown_qual_df[col].sum()))

if ever	3
ultimately	19
far	401
all of	153
so-called	31
particularly	141
much	878
admittedly	3
please	45
exactly	99
perfectly	31
actually	127
little	767
big	308
apparently	102
a lot	85
alone	190
vast	60
all	2611
at all	183
in effect	24
clearly	118
extremely	50
generally	119
some kind of	21
quite	269
usually	185
right	577
too	760
sorely	3
also	983
of course	234
surely	38
importantly	8
ever	328
blatantly	0
such	1124
lots of	26
exact	27
honestly	12
at least	272
just	742
necessarily	49
really	267
probably	232
some	1345
even	954
occasionally	32
pretty	98
relatively	84
fully	80
ideally	5
absolutely	27
well	744
definitely	21
a bit	56
incredibly	7
sort of	117
maybe	66
certainly	115
literally	26
simply	166
possibly	57
so	1641
specifically	36
actively	11
then	984
very	749
completely	109
er	0
truly	56


In [152]:
# Eyeball 20 randomly drawn sentences flagged as containing 'particularly'.
# random_state pins the draw so a fresh Restart & Run All shows the same
# sentences; change the seed to inspect a different sample.
for item in brown_qual_df[brown_qual_df['particularly'] == 1].sample(20, random_state=0)['sent']:
    print(' '.join(item))
    print()

It has been obvious to the assessors , particularly those in shore communities , that boats comprise the largest category of tangible personal property which they have been unable to reach .

The preceding methods allow efficient use of index words and electronic switches during a sectionalized or multi-phase program , particularly when used in conjunction with the LITORIGIN statement .

And while no one expects total democracy on the academic scene , the scholar will be particularly sensitive to a line between first and second class citizenship drawn on any basis other than that of academic rank or professional achievement .

But it is the wooden sculpture from Bali , the one representing two men with their heads bent backward and their bodies interlaced by a fish , that I particularly call to your attention .

Many companies have systems , particularly in R & D , which work more or less well , depending upon size and actual belief in the policy on the part of administration , as will