# Add bags of words from additional arousal and non-arousal datasets.

In [None]:
### Start postgres if necessary

In [1]:
#!pg_ctl -D /usr/local/var/postgres -l /usr/local/var/postgres/server.log start

## A good spacy intro for getting tokens
https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

## Load spacy.io
https://spacy.io/#install

In [1]:
import io
import codecs
from spacy.en import English
nlp = English(parser=True, tagger=True) # so we can sentence parse

## Gather positive and negative sentences for Arousal

These are all the annotations for Arousal topic as annotated by Mark.

    See 2_spacy_parse_annotations notebook.

In [2]:
wkdir = '/Users/ccarey/Documents/Projects/NAMI/rdoc/results'
%cd $wkdir

/Users/ccarey/Documents/Projects/NAMI/rdoc/results


In [5]:
!wc ./annotations_processed/AR_mk_pos
!wc ./annotations_processed/AR_mk_neg
!wc ./annotations_processed/AR_mk_not_pos
! file -I annotations_processed/AR_mk_pos # utf-8

     201    5280   38255 ./annotations_processed/AR_mk_pos
     166    4054   29188 ./annotations_processed/AR_mk_neg
     211    4215   29329 ./annotations_processed/AR_mk_not_pos
annotations_processed/AR_mk_pos: text/plain; charset=utf-8


In [8]:
pos_sentences = []
with codecs.open('./annotations_processed/AR_mk_pos', mode='r', encoding='utf-8') as f:
    sents = f.read().splitlines() # 201
    for s in sents:
        s = s.replace('{{', '').replace('}}', '').strip()
        pos_sentences.append(s)
len(pos_sentences)

201

In [9]:
neutral_sentences = []
with codecs.open('./annotations_processed/AR_mk_not_pos', mode='r', encoding='utf-8') as f:
    sents = f.read().splitlines() # 201
    for s in sents:
        s = s.replace('{{', '').replace('}}', '').strip()
        neutral_sentences.append(s)
len(neutral_sentences)

211

These are a temporary set of negative sentences for Arousal by Mark.

They are annotated in abstracts that were not in the arousal topic
    See 2_spacy_parse_annotations notebook.

In [10]:
neg_sentences = []
with codecs.open('./annotations_processed/AR_mk_neg', mode='r', encoding='utf-8') as f:
    neg_sents = f.read().splitlines() # 201
    for s in neg_sents:
        s = s.replace('{{', '').replace('}}', '').strip()
        neg_sentences.append(s)
len(neg_sentences)

166

## Use spacy to create bags of words from each sentence.
#TODO: not sure if we should unique the bag of words. If encoding to integer feature vector, we would be counting occurrences.
### spacy helper functions

In [17]:
def spacy_split_sentences(text):
    sentences = []
    #doc = nlp(text.decode('utf8')) #"This is a sentence. Here's another...".decode('utf8'))
    doc = nlp(text) #"This is a sentence. Here's another...".decode('utf8'))
    for span in doc.sents:
        #sentences.append(u''.join(doc[i].string for i in range(span.start, span.end)).encode('utf-8').strip())
        sentences.append(''.join(doc[i].string for i in range(span.start, span.end)))#.strip())
    return(sentences)

In [15]:
def spacy_lemma_gt_len(text, length=2):
    '''Create bag of unique lemmas, requiring lemma length > length
    
    Note, not counting possible multiple occurrences of words / lemmas.
    
    (That caution in order as Some approaches might prefer to score by word frequency as well as by the word.)

    Note: setting length to 1 may mess up our postgres arrays as we would
    get commas here, unless we were to quote everything.
    '''
    tokens = []
    #doc = nlp(text.decode('utf8')) #"This is a sentence. Here's another...".decode('utf8'))
    parsed_data = nlp(text) #"This is a sentence. Here's another...".decode('utf8'))
    for token in parsed_data:
        if len(token.lemma_) > length:
            tokens.append(token.lemma_.lower())
    return(list(set(tokens)))

In [19]:
def spacy_lemma_biwords_gt_len(text, length=3):
    '''Create bag of unique bi-lemmas, requiring lemma length > length
    
    Note, not counting possible multiple occurrences of words / lemmas.
    
    (That caution in order as Some approaches might prefer to score by word frequency as well as by the word.)
    
    We are crudely eliminating any bi-lemmas that have commas in them to save us in loading postgres arrays.
    '''
    biwords = []
    parsed_data = nlp(text)
    skip_chars = [',', '"', "'"]
    for i in range(1, len(parsed_data) - 1):
        skip = False
        biword = u'{} {}'.format(parsed_data[i].lemma_.lower(), parsed_data[i+1].lemma_.lower())
        if (parsed_data[i].lemma_ in skip_chars or parsed_data[i+1].lemma_ in skip_chars):
            skip = True
        if len(biword) > length and not skip:
            biwords.append(biword)
    return(list(set(biwords)))

test = 'A good, apple once told me there was a rotten worm there inside.'.decode('utf8')
res = spacy_lemma_gt_len(test, length=4)
print(res)
res = spacy_lemma_biwords_gt_len(test, length=4)
# ', '.join(res) # note, flattens formatting.
print(', '.join(res))

[u'rotten', u'there', u'inside', u'apple']
there inside, rotten worm, tell me, once tell, worm there, inside ., apple once, me there, there be, a rotten


In [18]:
def to_lemmas_and_lemma_biwords(sentence, lemma_len=2, bi_lemma_len=4):
    lemma = spacy_lemma_gt_len(sentence, length=lemma_len)
    lemma_biwords = spacy_lemma_biwords_gt_len(sentence, length=bi_lemma_len)
    return(lemma + lemma_biwords)

### Sidenote: what do we use as sentences?

We preserve sentences as parsed in stop 2_spacy_parse_annotations notebook.

Note, we did some manual sentence spliiting within the abstracts.

We wish to retain that splitting instead of using spacy to split herein as spacy is imperfect due to all the acronyms in our text.

In [27]:
# with codecs.open('./annotations_processed/AR_mk_pos', mode='r', encoding='utf-8') as f:
#     sents = spacy_split_sentences(f.read())
# len(sents) # 179

# sentences = []
# with codecs.open('./annotations_processed/AR_mk_pos', mode='r', encoding='utf-8') as f:
#     sents = f.read().splitlines() # 201
#     for s in sents:
#         sentences.append(spacy_split_sentences(s))
# len(sentences) # still have 201, spacy found no new splits.

### Create bag of words and biwords from each of our sentences.
Function assumes bag of words (lemmas) should have the actual words, and duplicate words are removed.

Some approaches to using bag of words or lemmas assumes word appears greater than once in corpus. (which makes sense as meaningless to use if only occurs once across corpus?)

### positives and negatives as bags of lemmas and lemma biwords
Note, '{' to wrap array for psql, and \N to leave empty field for deepdive reserved id.

In [76]:
print(len(sentences))
bags_of_lemmas = []
for sentence in sentences:
    lbw = to_lemmas_and_lemma_biwords(sentence)
    bags_of_lemmas.append(lbw)
print(len(bags_of_lemmas))

print(len(neg_sentences))
neg_bags_of_lemmas = []
for sentence in neg_sentences:
    lbw = to_lemmas_and_lemma_biwords(sentence)
    neg_bags_of_lemmas.append(lbw)
print(len(neg_bags_of_lemmas))

201
201
166
166


### save bags of lemmas

In [72]:
wkdir = '/Users/ccarey/Documents/Projects/NAMI/rdoc/results'
%cd $wkdir

/Users/ccarey/Documents/Projects/NAMI/rdoc/results


### Saving positives and negatives
- #TODO: the positives are not strictly positive yet, we still need to remove those annoatations from 'irrelevant' docs.
- remember, positves had '{{' tags somewhere in sentence.
- we are skipping the 'neutral' sentences, i.e. not saving anything from a positive document.
- tmp_neg are any sentence from documents that were not positive.

Note, '{' to wrap array for psql, and \N to leave empty field for deepdive reserved id.

In [77]:
with codecs.open('./annotations_processed/AR_mk_pos_bags-of-lemmas', 'w', encoding='utf-8') as f:
    for b in bags_of_lemmas:
        b_arr = ', '.join(b)
        f.write(u'{{{}}}\t{}\t{}\n'.format(b_arr, '+arousal', '\N'))

with codecs.open('./annotations_processed/AR_mk_tmp_neg_bags-of-lemmas', 'w', encoding='utf-8') as f:
    for b in neg_bags_of_lemmas:
        b_arr = ', '.join(b)
        f.write(u'{{{}}}\t{}\t{}\n'.format(b_arr, '-arousal', '\N'))
#./annotations_processed/AR_mk_tmp_neg_bags-of-lemmas

### Misc (some other features from spacy)

In [75]:
# # can iterate over (ents) entities, (noun_chunks), sentences (sents)
# print(sentences[17])
# print('.....')
# for i in range(17,18):
#     print(sentences[i])
#     print('------')
#     doc = nlp(sentences[i], tag=True)
#     print('Entities :')
#     for ent in doc.ents:
#         print(ent)
#     print('Noun chunks :')
#     for nchunk in doc.noun_chunks:
#         print(nchunk)
#     for sent in doc.sents:
#          print(sent)