## Introduction to Dependency Parsing with spaCy

In [None]:
#setup
import warnings; warnings.simplefilter('ignore')
%matplotlib notebook
import pandas as pd
df = pd.read_csv('death-penalty-cases.csv')

In [None]:
# Load the `en_core_web_sm` spaCy pipeline and parse a short example text with it.
import spacy
nlp = spacy.load('en_core_web_sm')
text = (
    'Science cannot solve the ultimate mystery of nature. '
    'And that is because, in the last analysis, we ourselves are '
    'a part of the mystery that we are trying to solve.'
)
doc = nlp(text)

In [None]:
# Display the parsed document created in the previous cell.
doc

In [None]:
# For every sentence, show the sentence, its syntactic root, and the root's
# direct children with their dependency labels.
# The loop variable keeps the name `sent` because later cells read it after the loop.
for sent in doc.sents:
    root_token = sent.root
    labelled_children = []
    for child in root_token.children:
        labelled_children.append((child, child.dep_))
    print("sentence:", sent)
    print("root:", root_token)
    print(labelled_children, end="\n\n")

In [None]:
# Parse the first example sentence on its own; the next cell visualises it.
example_sentence = 'Science cannot solve the ultimate mystery of nature.'
to_display = nlp(example_sentence)
to_display

In [None]:
# Render the dependency parse (style="dep") of the example sentence inside the notebook (jupyter=True).
from spacy import displacy
displacy.render(to_display, style="dep", jupyter=True)

In [None]:
# Noun Phrase Chunking: collect the noun chunks spaCy found in the parsed document
[chunk for chunk in doc.noun_chunks]

In [None]:
# NOTE: `sent` is the loop variable left over from the `for sent in doc.sents` cell
# above, i.e. the last sentence that loop visited. Show its syntactic root token.
sent.root

In [None]:
# All direct children of the root of `sent`
[child for child in sent.root.children]

In [None]:
# Left children of the root of `sent`
[left_child for left_child in sent.root.lefts]

In [None]:
# Right children of the root of `sent`
[right_child for right_child in sent.root.rights]

In [None]:
# First token of the sentence currently held in `sent` (positional indexing)
sent[0]

In [None]:
# Dependency label of the first token; `cc` is the label for a coordinating conjunction
# (presumably the leading "And" of the second example sentence -- confirm against the output)
sent[0].dep_

In [None]:
# Head of the first token, i.e. the token it attaches to in the dependency tree
sent[0].head

## Unsupervised Discovery of Gendered Language through Latent-Variable Modeling

[Hoyle et al. (2019)](https://www.aclweb.org/anthology/P19-1167/) study the language used with gendered nouns and train a generative latent-variable model that jointly represents adjective (or verb) choice, together with its sentiment, given the (natural) gender of a noun. To this end, they extract noun–adjective pairs, NSUBJ–verb pairs and DOBJ–verb pairs.

In the following, we show how to extract NSUBJ-verb pairs from text.

In [None]:
# Show the table loaded from 'death-penalty-cases.csv' in the setup cell
df

In [None]:
# Work on a random subset of at most 2000 rows (presumably to keep spaCy parsing time manageable).
# - random_state pins the draw, so a fresh "Restart & Run All" selects the same rows
#   and the pair counts printed further down are reproducible.
# - min(...) keeps the cell from raising when the CSV has fewer than 2000 rows
#   (sampling more rows than exist without replacement is an error in pandas).
df = df.sample(n=min(2000, len(df)), random_state=42)

# Parse every snippet once; each cell of "processed" holds the object returned by nlp(snippet).
df["processed"] = df["snippet"].apply(nlp)


In [None]:
def extract_subject_verb_pairs(sent):
    """Collect (subject lemma, head lemma) pairs for every nominal subject.

    Args:
        sent: An iterable of tokens exposing ``dep_``, ``lemma_`` and ``head``.
            In this notebook it is called with the parsed values stored in
            ``df["processed"]``.

    Returns:
        A list of ``(subject_lemma, head_lemma)`` tuples, both lower-cased, one
        per token whose dependency label is exactly ``"nsubj"``, in input order.
    """
    pairs = []
    for token in sent:
        # Only tokens labelled as nominal subject contribute a pair.
        if token.dep_ != "nsubj":
            continue
        subject_lemma = token.lemma_.lower()
        head_lemma = token.head.lemma_.lower()
        pairs.append((subject_lemma, head_lemma))
    return pairs

# Run the extractor on every parsed snippet; each row gets that snippet's list of pairs.
df["subj-verb-pairs"] = df["processed"].apply(extract_subject_verb_pairs)

In [None]:
# most common pairs
from collections import Counter

# Tally every (subject, verb) pair across all rows in a single pass.
counter = Counter(
    pair
    for row_pairs in df["subj-verb-pairs"]
    for pair in row_pairs
)

# Show the 25 most frequent pairs with their counts.
for pair, counts in counter.most_common(25):
    print(pair, counts)  # -pron- is a pronoun

In [None]:
# install coreference resolution for spacy
!git clone https://github.com/huggingface/neuralcoref.git
!cd neuralcoref
!pip install -r neuralcoref/requirements.txt
!pip install -e neuralcoref

In [None]:
# set up coreference resolution
import neuralcoref      ## ignore RuntimeWarning(s)

# Add the coreference component only once: re-running this cell on a pipeline that
# already contains it would otherwise fail with a duplicate-component error.
# (neuralcoref.add_to_pipe registers the component under the name "neuralcoref".)
if "neuralcoref" not in nlp.pipe_names:
    neuralcoref.add_to_pipe(nlp)

In [None]:
# Coreference Resolution
# Parse a two-sentence example and read the coreference results from `doc._`.
doc = nlp('My sister has a dog. She loves him.')
coref_info = doc._

# Expected output: True
print(coref_info.has_coref)
# Expected output: [My sister: [My sister, She], a dog: [a dog, him]]
print(coref_info.coref_clusters)
# Expected output: 'My sister has a dog. My sister loves a dog.'
print(coref_info.coref_resolved)


In [None]:
# Parse the snippets again now that the coreference component has been added to `nlp`,
# storing the resulting parsed objects in a separate column.
df["corefs_resolved"] = df["snippet"].apply(nlp)


In [None]:
def extract_subject_verb_pairs_coref(sent):
    """Collect (subject lemma, head lemma) pairs, resolving coreferent subjects.

    For a subject token that belongs to a coreference chain, the subject lemma
    is taken from the root of the main mention of the token's first cluster;
    otherwise the token's own lemma is used.

    Args:
        sent: An iterable of tokens exposing ``dep_``, ``lemma_``, ``head`` and
            the ``_.in_coref`` / ``_.coref_clusters`` extension attributes. In
            this notebook it is called with the values in ``df["corefs_resolved"]``.

    Returns:
        A list of ``(subject_lemma, head_lemma)`` tuples, both lower-cased, one
        per token whose dependency label is exactly ``"nsubj"``, in input order.
    """

    def _subject_lemma(token):
        # Tokens outside any coreference chain speak for themselves.
        if not token._.in_coref:
            return token.lemma_
        # Otherwise substitute the head word of the chain's main mention.
        first_cluster = token._.coref_clusters[0]
        return first_cluster.main.root.lemma_

    return [
        (_subject_lemma(token).lower(), token.head.lemma_.lower())
        for token in sent
        if token.dep_ == "nsubj"
    ]

In [None]:
# Extract coreference-aware pairs for every row, then tally them across the whole sample.
df["subj-verb-pairs-coref"] = df["corefs_resolved"].apply(extract_subject_verb_pairs_coref)
counter = Counter(
    pair
    for row_pairs in df["subj-verb-pairs-coref"]
    for pair in row_pairs
)

# Show the 25 most frequent pairs with their counts.
for pair, counts in counter.most_common(25):
    print(pair, counts)

In [None]:
# verbs used with defendant

# Walk the pairs from most to least frequent and keep those seen more than once
# whose subject is "defendant".
for pair, counts in counter.most_common():
    subject, verb = pair
    if counts > 1 and subject == "defendant":
        print(subject, verb, counts)

In [None]:
# verbs used with jury

# Walk the pairs from most to least frequent and keep those seen more than once
# whose subject is "jury".
for pair, counts in counter.most_common():
    subject, verb = pair
    if counts > 1 and subject == "jury":
        print(subject, verb, counts)