In [1]:
#setup
import warnings; warnings.simplefilter('ignore')
%matplotlib notebook
import pandas as pd
df = pd.read_csv('death-penalty-cases.csv')

In [2]:
import spacy

# Load the small English pipeline, then parse a two-sentence example with it.
nlp = spacy.load("en_core_web_sm")
text = 'Science cannot solve the ultimate mystery of nature. And that is because, in the last analysis, we ourselves are a part of the mystery that we are trying to solve.'
doc = nlp(text)

In [3]:
# Display the parsed document; it is rendered as the original text.
doc

Science cannot solve the ultimate mystery of nature. And that is because, in the last analysis, we ourselves are a part of the mystery that we are trying to solve.

In [4]:
# For every sentence: show its text, its root token, and the root's direct
# children paired with their dependency labels.
# NOTE: the loop variable `sent` is reused by the cells below (it keeps the last sentence).
for sent in doc.sents:
    print(f"sentence: {sent}")
    print(f"root: {sent.root}")
    print([(child, child.dep_) for child in sent.root.children])
    print()

sentence: Science cannot solve the ultimate mystery of nature.
root: solve
[(Science, 'nsubj'), (can, 'aux'), (not, 'neg'), (mystery, 'dobj'), (., 'punct')]

sentence: And that is because, in the last analysis, we ourselves are a part of the mystery that we are trying to solve.
root: is
[(And, 'cc'), (that, 'nsubj'), (are, 'advcl'), (., 'punct')]



In [5]:
# current sentence: `sent` is the loop variable left over from the
# `for sent in doc.sents` cell above, so it holds the LAST sentence of `doc`
sent

And that is because, in the last analysis, we ourselves are a part of the mystery that we are trying to solve.

In [6]:
# Noun Phrase Chunking: list the noun chunks of the whole document (`doc`, not just `sent`)
list(doc.noun_chunks)

[Science,
 the ultimate mystery,
 nature,
 the last analysis,
 we,
 ourselves,
 a part,
 the mystery,
 we]

In [7]:
# root token of the current (last) sentence
sent.root

is

In [8]:
# direct children of the root token
list(sent.root.children)

[And, that, are, .]

In [9]:
# Left children: children of the root that appear before it in the sentence
list(sent.root.lefts)

[And, that]

In [10]:
# Right children: children of the root that appear after it in the sentence
list(sent.root.rights)

[are, .]

In [11]:
# first token of the current (last) sentence
sent[0]

And

In [12]:
# dependency label of the first token; cc = coordinating conjunction
sent[0].dep_

'cc'

In [13]:
# head of the first token (here the sentence root, `is`)
sent[0].head

is

## Unsupervised Discovery of Gendered Language through Latent-Variable Modeling

[Hoyle et al. (2019)](https://www.aclweb.org/anthology/P19-1167/) study the language used with gendered nouns by training a generative latent-variable model that jointly represents adjective (or verb) choice and its sentiment, given the (natural) gender of the noun. To this end, they extract noun–adjective pairs, NSUBJ–verb pairs, and DOBJ–verb pairs.

In the following, we show how to extract NSUBJ-verb pairs from text.

In [14]:
# Preview the loaded cases (the display is truncated for long frames)
df

Unnamed: 0,court_id,author_id,state,year,dateFiled,citeCount,snippet
0,nj,,NJ,1965,1965-09-14T00:00:00Z,8,N.J. ( )\n A. d \nIN RE WAIVER OF DEATH PE...
1,fla,4019.0,FL,1973,1973-07-26T00:00:00Z,552,"whether the death penalty is, per se, unconsti..."
2,texcrimapp,5765.0,TX,1975,1975-04-16T00:00:00Z,143,# ;s contention that the assessment of the dea...
3,nm,,NM,2009,2009-11-30T00:00:00Z,0,. d ( )\n -NMSC- \nIN THE MATTER OF DEATH PE...
4,texcrimapp,5758.0,TX,1944,1944-12-20T00:00:00Z,56,assume the district attorney orally waived the...
...,...,...,...,...,...,...,...
32562,ohioctapp,8055.0,OH,2017,2017-07-20T00:00:00Z,0,of two counts of aggravated murder with deat...
32563,cal,,CA,2017,2017-07-20T00:00:00Z,0,his general views about the death penalty as ...
32564,neb,,NE,2017,2017-07-21T00:00:00Z,0,"been subject to the death\npenalty, because Ne..."
32565,ohio,5374.0,OH,2017,2017-07-25T00:00:00Z,0,that Indiana law permits imposition of the de...


In [15]:
# Work on a random subset of 2000 cases (presumably to keep spaCy parsing time
# manageable). The fixed random_state makes a fresh "Restart & Run All" select
# the same rows every time, so the pair counts printed below are reproducible.
df = df.sample(n=2000, random_state=42)
# Parse each snippet; every cell of "processed" holds the parsed document for that row.
df["processed"] = df["snippet"].apply(lambda x: nlp(x))


In [16]:
def extract_subject_verb_pairs(sent):
    """Collect (subject lemma, head lemma) pairs for every nominal subject.

    Args:
        sent: an iterable of tokens exposing ``dep_``, ``lemma_`` and ``head``
            (the notebook passes a whole parsed document).

    Returns:
        A list of ``(subject, head)`` tuples, both lower-cased lemmas, in
        token order. Only tokens whose dependency label is ``"nsubj"`` count.
    """
    return [
        (token.lemma_.lower(), token.head.lemma_.lower())
        for token in sent
        if token.dep_ == "nsubj"
    ]

# One list of (subject, verb) lemma pairs per parsed snippet.
df["subj-verb-pairs"] = df["processed"].apply(extract_subject_verb_pairs)

In [17]:
# most common pairs
from collections import Counter

# Tally every (subject, verb) pair across all rows in a single pass.
counter = Counter(pair for pairs in df["subj-verb-pairs"] for pair in pairs)

for pair, counts in counter.most_common(25):
    print(pair, counts)  # -pron- is a pronoun

('penalty', 'be') 229
('-pron-', 'be') 202
('state', 'seek') 144
('this', 'be') 67
('-pron-', 'have') 59
('-pron-', 'vote') 58
('jury', 'recommend') 49
('-pron-', 'impose') 45
('statute', 'be') 43
('court', 'find') 36
('-pron-', 'find') 36
('defendant', 'be') 34
('jury', 'find') 34
('-pron-', 'seek') 33
('that', 'be') 31
('jury', 'impose') 30
('court', 'impose') 29
('court', 'hold') 28
('-pron-', 'give') 28
('-pron-', 'do') 28
('-pron-', 'believe') 26
('defendant', 'eligible') 25
('-pron-', 'consider') 25
('imposition', 'be') 24
('who', 'be') 23


In [18]:
# install coreference resolution for spacy
!current_dir==$(pwd)
!cd
!git clone https://github.com/huggingface/neuralcoref.git
!cd neuralcoref
!pip install -r neuralcoref/requirements.txt
!pip install -e neuralcoref
!cd $current_dir

fatal: destination path 'neuralcoref' already exists and is not an empty directory.
Obtaining file:///home/dominsta/Dropbox/2021-02-Legal-DNA/legal_dna_2021/notebooks/neuralcoref


Installing collected packages: neuralcoref
  Attempting uninstall: neuralcoref
    Found existing installation: neuralcoref 4.0
    Uninstalling neuralcoref-4.0:
      Successfully uninstalled neuralcoref-4.0
  Running setup.py develop for neuralcoref
Successfully installed neuralcoref


In [19]:
# set up coreference resolution
import neuralcoref      ## ignore RuntimeWarning(s)
# Adding the component mutates `nlp` in place. Guard on the component name so that
# re-running this cell does not try to add "neuralcoref" to the pipeline twice.
if "neuralcoref" not in nlp.pipe_names:
    neuralcoref.add_to_pipe(nlp)
nlp

<spacy.lang.en.English at 0x7fcc5d70be10>

In [20]:
# Coreference Resolution: parse a two-sentence example and inspect the
# coreference attributes exposed on `doc._`.
doc = nlp("My sister has a dog. She loves him.")

# expected: True
print(doc._.has_coref)
# expected: [My sister: [My sister, She], a dog: [a dog, him]]
print(doc._.coref_clusters)
# expected: 'My sister has a dog. My sister loves a dog.'
print(doc._.coref_resolved)


True
[My sister: [My sister, She], a dog: [a dog, him]]
My sister has a dog. My sister loves a dog.


In [21]:
# Re-parse the snippets now that neuralcoref has been added to `nlp`, so the new
# documents carry the coreference attributes used by the next cells.
# NOTE: despite the column name, each cell stores the parsed document returned by
# `nlp(x)`, not a resolved text string.
df["corefs_resolved"] = df["snippet"].apply(lambda x: nlp(x))


In [22]:
def extract_subject_verb_pairs_coref(sent):
    """Collect (subject lemma, head lemma) pairs, resolving coreferent subjects.

    Args:
        sent: an iterable of tokens exposing ``dep_``, ``lemma_``, ``head`` and
            the ``_.in_coref`` / ``_.coref_clusters`` extension attributes.

    Returns:
        A list of lower-cased ``(subject, head)`` lemma tuples, one per token
        labelled ``"nsubj"``, in token order. When the subject belongs to a
        coreference chain, its lemma is replaced by the lemma of the root of
        the main mention of the token's first cluster.
    """
    def subject_lemma(token):
        # Subject inside a coreference chain: use the chain's main mention.
        if token._.in_coref:
            main_mention = token._.coref_clusters[0].main
            return main_mention.root.lemma_.lower()
        # Otherwise the token's own lemma is used, as in the plain extractor.
        return token.lemma_.lower()

    return [
        (subject_lemma(token), token.head.lemma_.lower())
        for token in sent
        if token.dep_ == "nsubj"
    ]

In [23]:
# Extract the pairs again from the coreference-annotated parses, then re-count.
df["subj-verb-pairs-coref"] = df["corefs_resolved"].apply(extract_subject_verb_pairs_coref)
counter = Counter(pair for pairs in df["subj-verb-pairs-coref"] for pair in pairs)

for pair, counts in counter.most_common(25):
    print(pair, counts)

('penalty', 'be') 237
('state', 'seek') 146
('-pron-', 'be') 145
('this', 'be') 67
('jury', 'recommend') 48
('-pron-', 'have') 45
('-pron-', 'vote') 45
('statute', 'be') 43
('defendant', 'be') 40
('court', 'find') 36
('jury', 'find') 34
('that', 'be') 31
('court', 'impose') 31
('jury', 'impose') 30
('-pron-', 'find') 30
('court', 'hold') 28
('-pron-', 'impose') 26
('defendant', 'eligible') 25
('imposition', 'be') 24
('-pron-', 'seek') 24
('-pron-', 'consider') 23
('court', 'sentence') 22
('case', 'be') 22
('who', 'be') 22
('-pron-', 'hold') 21


In [24]:
# verbs used with defendant
# (pairs seen only once are skipped; output is ordered by descending count)

for (subject, verb), counts in counter.most_common():
    if counts <= 1 or subject != "defendant":
        continue
    print(subject, verb, counts)

defendant be 40
defendant eligible 25
defendant contend 16
defendant receive 14
defendant argue 11
defendant have 9
defendant s 8
defendant raise 7
defendant file 5
defendant waive 4
defendant guilty 4
defendant make 4
defendant claim 4
defendant challenge 3
defendant allege 3
defendant rely 2
defendant face 2
defendant appeal 2
defendant could 2
defendant escape 2
defendant assert 2
defendant move 2
defendant avoid 2
defendant acknowledge 2
defendant hold 2
defendant assign 2
defendant indicate 2
defendant get 2
defendant deserve 2
defendant state 2
defendant learn 2


In [25]:
# verbs used with jury
# (pairs seen only once are skipped; output is ordered by descending count)

for (subject, verb), counts in counter.most_common():
    if counts <= 1 or subject != "jury":
        continue
    print(subject, verb, counts)

jury recommend 48
jury find 34
jury impose 30
jury be 17
jury return 6
jury consider 6
jury vote 5
jury convict 4
jury assess 4
jury decline 4
jury decide 4
jury answer 4
jury s 3
jury make 3
jury abuse 2
jury inflict 2
jury charge 2
jury determine 2
jury do 2
jury reject 2
jury fix 2
jury have 2
