# Week 8: Parsing

## Setup

In [None]:
# !pip install benepar
# !pip install svgling

In [2]:
#setup
%matplotlib notebook
import pandas as pd
import spacy
from spacy import displacy
import benepar
import nltk

from collections import Counter


# Read the death-penalty cases CSV into a DataFrame; its `snippet` column
# holds the text that the cells below parse.
DATA_PATH = 'death-penalty-cases.csv'
df = pd.read_csv(DATA_PATH)

In [2]:
# Display the loaded frame (pandas abbreviates the middle rows in the output).
df

Unnamed: 0,court_id,author_id,state,year,dateFiled,citeCount,snippet
0,nj,,NJ,1965,1965-09-14T00:00:00Z,8,N.J. ( )\n A. d \nIN RE WAIVER OF DEATH PE...
1,fla,4019.0,FL,1973,1973-07-26T00:00:00Z,552,"whether the death penalty is, per se, unconsti..."
2,texcrimapp,5765.0,TX,1975,1975-04-16T00:00:00Z,143,# ;s contention that the assessment of the dea...
3,nm,,NM,2009,2009-11-30T00:00:00Z,0,. d ( )\n -NMSC- \nIN THE MATTER OF DEATH PE...
4,texcrimapp,5758.0,TX,1944,1944-12-20T00:00:00Z,56,assume the district attorney orally waived the...
...,...,...,...,...,...,...,...
32562,ohioctapp,8055.0,OH,2017,2017-07-20T00:00:00Z,0,of two counts of aggravated murder with deat...
32563,cal,,CA,2017,2017-07-20T00:00:00Z,0,his general views about the death penalty as ...
32564,neb,,NE,2017,2017-07-21T00:00:00Z,0,"been subject to the death\npenalty, because Ne..."
32565,ohio,5374.0,OH,2017,2017-07-25T00:00:00Z,0,that Indiana law permits imposition of the de...


## Dependency Parsing with SpaCy

Let's first look at one example:

In [4]:
# Load the small English spaCy pipeline, then parse one example sentence.
nlp = spacy.load('en_core_web_sm')
text = 'Science cannot solve the ultimate mystery of nature.'
doc = nlp(text)

In [5]:
# Draw the dependency parse of the example sentence with displaCy.
displacy.render(doc, style="dep")

In [6]:
# For every sentence, show its text, the root token of its dependency tree,
# and the root's direct children paired with their dependency labels.
for sent in doc.sents:
    root = sent.root
    child_labels = []
    for child in root.children:
        child_labels.append((child, child.dep_))
    print(f"sentence: {sent}")
    print(f"root: {root}")
    print(child_labels)
    print()

sentence: Science cannot solve the ultimate mystery of nature.
root: solve
[(Science, 'nsubj'), (can, 'aux'), (not, 'neg'), (mystery, 'dobj'), (., 'punct')]



In [7]:
# Bind the sentence explicitly instead of relying on the loop variable left
# over from the previous cell (the example doc contains a single sentence).
sent = list(doc.sents)[0]
# current sentence
print(sent)
# root token of the sentence's dependency tree
print(sent.root)
# all direct children of the root
print(list(sent.root.children))
# Left children
print(list(sent.root.lefts))
# Right children
print(list(sent.root.rights))
# first token
print(sent[0])
# first token dependency label (here: nsubj = nominal subject)
print(sent[0].dep_)
# head (syntactic parent) of the first token
print(sent[0].head)

Science cannot solve the ultimate mystery of nature.
solve
[Science, can, not, mystery, .]
[Science, can, not]
[mystery, .]
Science
nsubj
solve


## Constituency Parsing with SpaCy

In [None]:
# Build a dedicated pipeline for constituency parsing. Keeping it under its
# own name means the plain `nlp` pipeline used by the other cells does not
# get the benepar component added to it.
# NOTE: the 'benepar_en3' model must be available locally
# (presumably via benepar.download('benepar_en3') -- confirm for your setup).
nlp_benepar = spacy.load('en_core_web_sm')
# benepar is registered differently under spaCy 2 and spaCy 3.
if spacy.__version__.startswith('2'):
    nlp_benepar.add_pipe(benepar.BeneparComponent('benepar_en3'))
else:
    nlp_benepar.add_pipe('benepar', config={'model': 'benepar_en3'})
doc = nlp_benepar('Science cannot solve the ultimate mystery of nature.')
sent = list(doc.sents)[0]
# bracketed constituency parse of the sentence
print(sent._.parse_string)
# constituent labels attached to the sentence span
print(sent._.labels)
# first child constituent of the sentence
print(list(sent._.children)[0])
# Last expression: the nltk Tree built from the bracketed parse is the cell output.
nltk.Tree.fromstring(sent._.parse_string)

# Application
## Unsupervised Discovery of Gendered Language through Latent-Variable Modeling

[Hoyle et al. (2019)](https://www.aclweb.org/anthology/P19-1167/) study the language used with gendered nouns and train a generative latent-variable model that jointly represents adjective (or verb) choice and its sentiment, given the (natural) gender of a noun. To this end, they extract noun–adjective pairs, NSUBJ–verb pairs, and DOBJ–verb pairs.

In the following, we show how to extract NSUBJ-verb pairs from text.

In [9]:
# Show the corpus once more before it is sampled and parsed.
df

Unnamed: 0,court_id,author_id,state,year,dateFiled,citeCount,snippet
0,nj,,NJ,1965,1965-09-14T00:00:00Z,8,N.J. ( )\n A. d \nIN RE WAIVER OF DEATH PE...
1,fla,4019.0,FL,1973,1973-07-26T00:00:00Z,552,"whether the death penalty is, per se, unconsti..."
2,texcrimapp,5765.0,TX,1975,1975-04-16T00:00:00Z,143,# ;s contention that the assessment of the dea...
3,nm,,NM,2009,2009-11-30T00:00:00Z,0,. d ( )\n -NMSC- \nIN THE MATTER OF DEATH PE...
4,texcrimapp,5758.0,TX,1944,1944-12-20T00:00:00Z,56,assume the district attorney orally waived the...
...,...,...,...,...,...,...,...
32562,ohioctapp,8055.0,OH,2017,2017-07-20T00:00:00Z,0,of two counts of aggravated murder with deat...
32563,cal,,CA,2017,2017-07-20T00:00:00Z,0,his general views about the death penalty as ...
32564,neb,,NE,2017,2017-07-21T00:00:00Z,0,"been subject to the death\npenalty, because Ne..."
32565,ohio,5374.0,OH,2017,2017-07-25T00:00:00Z,0,that Indiana law permits imposition of the de...


In [10]:
# Work on a random subset of 2000 snippets (presumably to keep parsing time
# manageable). random_state pins the draw so the pair counts computed below
# are reproducible across kernel restarts.
df = df.sample(n=2000, random_state=42)
# Parse every sampled snippet with the spaCy pipeline; each cell of the new
# column holds the object returned by nlp().
df["processed"] = df["snippet"].apply(lambda x: nlp(x))


In [11]:
def extract_subject_verb_pairs(sent):
    """Collect (subject lemma, head lemma) pairs from parsed text.

    Args:
        sent: An iterable of tokens exposing ``dep_``, ``lemma_`` and
            ``head`` (the notebook passes whole parsed snippets).

    Returns:
        A list of ``(subject, head)`` tuples, both lower-cased lemmas, one
        per token whose dependency label is exactly ``"nsubj"``. The head is
        whatever token the parser attached the subject to.
    """
    pairs = []
    for token in sent:
        if token.dep_ != "nsubj":
            continue
        subject_lemma = token.lemma_.lower()
        head_lemma = token.head.lemma_.lower()
        pairs.append((subject_lemma, head_lemma))
    return pairs

# Store the (subject, head) lemma pairs of every parsed snippet in a new column.
parsed_docs = df["processed"]
df["subj-verb-pairs"] = parsed_docs.apply(extract_subject_verb_pairs)

In [12]:
# most common pairs
# Tally every (subject, head) pair across all snippets in one pass.
counter = Counter(
    pair
    for pairs in df["subj-verb-pairs"]
    for pair in pairs
)

# Print the 25 most frequent pairs; '-pron-' is the lemma shown for pronouns.
for pair, counts in counter.most_common(25):
    print(pair, counts)

('penalty', 'be') 229
('-pron-', 'be') 192
('state', 'seek') 160
('-pron-', 'have') 70
('statute', 'be') 52
('jury', 'recommend') 48
('-pron-', 'consider') 44
('-pron-', 'vote') 43
('this', 'be') 41
('-pron-', 'find') 41
('court', 'find') 39
('that', 'be') 39
('defendant', 'be') 38
('-pron-', 'do') 33
('court', 'hold') 32
('jury', 'find') 31
('-pron-', 'believe') 30
('case', 'be') 29
('-pron-', 'impose') 29
('-pron-', 'say') 28
('imposition', 'be') 27
('who', 'be') 26
('-pron-', 'seek') 25
('-pron-', 'hold') 25
('court', 'impose') 23


In [13]:
# install coreference resolution for spacy
# !pip install neuralcoref
import neuralcoref

# Registering the component a second time on the same pipeline is expected to
# fail (duplicate component name), so only add it when it is not present yet.
# This keeps the cell safe to re-run on a live kernel.
if "neuralcoref" not in nlp.pipe_names:
    neuralcoref.add_to_pipe(nlp)
# Show the pipeline object as the cell output, as before.
nlp

<spacy.lang.en.English at 0x17e83ef70>

In [15]:
# Coreference Resolution
# Parse a two-sentence example and read the coreference results from the
# document's extension attributes (`doc._`).
doc = nlp(u'My sister has a dog. She loves him.')
coref_info = doc._
# whether any coreference was found -> True
print(coref_info.has_coref)
# clusters of coreferring mentions -> [My sister: [My sister, She], a dog: [a dog, him]]
print(coref_info.coref_clusters)
# text with mentions replaced by their main mention -> 'My sister has a dog. My sister loves a dog.'
print(coref_info.coref_resolved)


True
[My sister: [My sister, She], a dog: [a dog, him]]
My sister has a dog. My sister loves a dog.


In [16]:
# Parse every snippet again with the current `nlp` pipeline and keep the
# resulting objects in their own column (note: the column stores the parsed
# objects returned by nlp(), not resolved text).
snippets = df["snippet"]
df["corefs_resolved"] = snippets.apply(lambda snippet: nlp(snippet))


In [21]:
def extract_subject_verb_pairs_coref(sent):
    """Collect (subject lemma, head lemma) pairs, resolving coreferent subjects.

    Args:
        sent: An iterable of tokens exposing ``dep_``, ``lemma_``, ``head``
            and the ``_.in_coref`` / ``_.coref_clusters`` extensions.

    Returns:
        A list of ``(subject, head)`` tuples of lower-cased lemmas, one per
        token labelled ``"nsubj"``. When the subject belongs to a
        coreference chain, its lemma is replaced by the lemma of the root of
        the main mention of the first cluster it belongs to.
    """
    def subject_lemma(token):
        # A subject inside a coreference chain is represented by the chain's
        # main mention; any other subject keeps its own lemma.
        if token._.in_coref:
            first_cluster = token._.coref_clusters[0]
            return first_cluster.main.root.lemma_.lower()
        return token.lemma_.lower()

    return [
        (subject_lemma(token), token.head.lemma_.lower())
        for token in sent
        if token.dep_ == "nsubj"
    ]

In [22]:
# Extract the pairs again, this time from the coreference-annotated parses.
parsed_with_coref = df["corefs_resolved"]
df["subj-verb-pairs-coref"] = parsed_with_coref.apply(extract_subject_verb_pairs_coref)

# Tally every pair across all snippets in one pass.
counter = Counter(
    pair
    for pairs in df["subj-verb-pairs-coref"]
    for pair in pairs
)

# Print the 25 most frequent pairs.
for pair, counts in counter.most_common(25):
    print(pair, counts)

('penalty', 'be') 231
('state', 'seek') 156
('-pron-', 'be') 152
('-pron-', 'have') 58
('statute', 'be') 50
('jury', 'recommend') 48
('this', 'be') 41
('defendant', 'be') 41
('court', 'find') 40
('that', 'be') 38
('-pron-', 'find') 37
('-pron-', 'consider') 35
('-pron-', 'vote') 33
('jury', 'find') 33
('court', 'hold') 32
('case', 'be') 30
('court', 'impose') 27
('-pron-', 'do') 26
('imposition', 'be') 26
('who', 'be') 25
('-pron-', 'believe') 25
('-pron-', 'hold') 25
('-pron-', 'say') 22
('jury', 'impose') 22
('state', 'waive') 22


In [24]:
# verbs used with defendant
# Walk the pairs from most to least frequent and keep those whose subject is
# "defendant" and that occur more than once.
for pair, counts in counter.most_common():
    subject, verb = pair
    if counts <= 1 or subject != "defendant":
        continue
    print(subject, verb, counts)

defendant be 41
defendant eligible 19
defendant receive 18
defendant argue 16
defendant contend 15
defendant s 10
defendant face 10
defendant have 9
defendant guilty 7
defendant intend 5
defendant claim 5
defendant raise 5
defendant deserve 5
defendant waive 5
defendant ask 4
defendant move 4
defendant seek 4
defendant challenge 3
defendant file 3
defendant suffer 3
defendant do 3
defendant commit 3
defendant appeal 3
defendant qualify 2
defendant make 2
defendant assert 2
defendant allege 2
defendant get 2
defendant plead 2
defendant tell 2
defendant risk 2
defendant say 2
defendant subject 2
defendant find 2
defendant acknowledge 2
defendant hold 2


In [25]:
# verbs used with jury
# Walk the pairs from most to least frequent and keep those whose subject is
# "jury" and that occur more than once.
for pair, counts in counter.most_common():
    subject, verb = pair
    if counts <= 1 or subject != "jury":
        continue
    print(subject, verb, counts)

jury recommend 48
jury find 33
jury impose 22
jury be 15
jury return 8
jury assess 7
jury answer 6
jury s 6
jury consider 4
jury give 4
jury deliberate 4
jury conclude 3
jury reach 3
jury sentence 3
jury have 3
jury fix 3
jury may 3
jury vote 3
jury decide 3
jury begin 2
jury convict 2
jury acquit 2
jury reject 2
jury charge 2
jury participate 2
jury choose 2
jury direct 2
