In [1]:
import json

import numpy as np
import pandas as pd
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
with open("../data/death_penalty.json") as file:
    data = json.load(file)

In [4]:
data[0]

{'term': ['death penalty" AND "innocent',
  'death row" AND "mistake',
  'death penalty" AND "mistake',
  'death sentence" AND "mistake'],
 'paragraph_groups': [['Innocence and the Death Penalty: The Increasing Danger of Executing the Innocent | Death Penalty Information Center',
   'Innocence',
   'Innocence and the Death Penalty: The Increasing Danger of Executing the Innocent',
   'State-by-State',
   'by Richard C. Dieter, Esq. Executive Director, Death Penalty Information Center']],
 'title': 'Innocence and the Death Penalty: The Increasing Danger of ...',
 'url': 'https://deathpenaltyinfo.org/node/523',
 'content': {'titles': ['Innocence and the Death Penalty: The Increasing Danger of Executing the Innocent | Death Penalty Information Center',
   'Innocence',
   'Innocence and the Death Penalty: The Increasing Danger of Executing the Innocent',
   'State-by-State'],
  'language': 'en',
  'title': 'Innocence and the Death Penalty: The Increasing Danger of Executing the Innocent | 

In [4]:
subjects = ["death penalty", "capital punishment"]
urls = set()
texts = []
total_count = 0
document_count = 0
for entry in data:
    if entry["url"] in urls:
        continue
    text = ""
    for paragraph_group in entry.get("paragraph_groups", []):
        text += "\n".join(paragraph_group) + " "
    if len(text):
        total_count += 1
        urls.add(entry["url"])
    for subject in subjects:
        if subject not in text:
            continue
        texts.append(text)
        document_count += 1
        break

In [5]:
print("Total count: {}".format(total_count))
print("Document count: {}".format(document_count))

Total count: 592
Document count: 250


In [15]:
for key in entities.keys():
    entities[key] = list(entities[key])

In [16]:
with open("../data/entities.json", "w") as file:
    json.dump(entities, file)

In [6]:
entities = {}
for doc in nlp.pipe(texts, batch_size=32):
    for ent in doc.ents:
        if ent.label_ not in entities:
            entities[ent.label_] = set([ent.text])
        else:
            entities[ent.label_].add(ent.text)

In [17]:
entities["WORK_OF_ART"][:10]

['\nVisiting Criminal Law Scholar',
 'Musée d’histoire vivante',
 'Jurisprudence',
 'HKT',
 'Meet Me',
 'Amazon Best Book of the Year',
 'Women on Death Row',
 '48 Hours',
 'Investigation into Death Row',
 'See Arbitrariness']

In [10]:
for type_, ents in entities.items():
    print(type_, len(ents))

ORG 1679
CARDINAL 458
NORP 142
GPE 604
PRODUCT 30
DATE 1175
TIME 126
PERSON 1603
EVENT 104
PERCENT 138
ORDINAL 39
WORK_OF_ART 178
LOC 54
LAW 66
FAC 65
LANGUAGE 4
MONEY 48
QUANTITY 33


In [7]:
def entity_tokenizer(text):
    doc = nlp(text)
    return [ent.text.replace(" ", "_") for ent in doc.ents if ent.label_ == "PERSON" and " " in ent.text]        

In [8]:
def texts_for_feature(feature):
    pattern = feature.replace("_", " ").casefold()
    for text in texts:
        if pattern in text.casefold():
            yield text

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_entities = TfidfVectorizer(tokenizer=entity_tokenizer)
tfidf_entities.fit(texts)
tfidf_vectors = tfidf_entities.transform(texts)
feature_names = tfidf_entities.get_feature_names()
tfidf_frame = pd.DataFrame(tfidf_vectors.toarray(), columns=feature_names)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [None]:
feature_names

In [12]:
tfidf_words_sorted = tfidf_frame.sum(axis=0).sort_values(ascending=False)

In [None]:
tfidf_frame

In [13]:
tfidf_words_sorted[:20]

 _            7.0
david_earl    1.0
al_matin      1.0
dtype: float64

In [13]:
sorted_idf_entities = tfidf_entities.idf_.argsort()

In [14]:
entities_features = tfidf_entities.get_feature_names()

In [18]:
{entities_features[ix]:tfidf_entities.idf_[ix] for ix in sorted_idf_entities[:20]}

{'antonin_scalia': 4.838624840473278,
 '\n_\n': 4.838624840473278,
 'carlos_hernandez': 5.005678925136444,
 'rick_perry': 5.100989104940769,
 'carlos_deluna': 5.206349620598596,
 'bryan_stevenson': 5.206349620598596,
 'martin_luther_king': 5.324132656254979,
 'george_w.': 5.324132656254979,
 'wanda_lopez': 5.324132656254979,
 'charles_manson': 5.324132656254979,
 'clarence_thomas': 5.324132656254979,
 'cameron_todd_willingham': 5.324132656254979,
 'john_paul_stevens': 5.457664048879502,
 'bernie_sanders': 5.61181472870676,
 'clarence_smith': 5.61181472870676,
 'troy_davis': 5.61181472870676,
 'robert_dunham': 5.61181472870676,
 '__': 5.61181472870676,
 'walter_mcmillian': 5.61181472870676,
 'george_ryan': 5.61181472870676}

In [None]:
antonin_scalia = texts_for_feature("antonin_scalia")

In [None]:
antonin_scalia_texts = list(antonin_scalia)

In [None]:
antonin_scalia_texts

In [None]:
antonin_scalia_vectors = tfidf_entities.transform(antonin_scalia_texts)

In [None]:
antonin_scalia_frame = pd.DataFrame(antonin_scalia_vectors.toarray(), columns=word_feature_names)

In [None]:
antonin_scalia_frame.shape

In [None]:
feature_names = tfidf_entities.get_feature_names()

In [None]:
# Create tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_vectorizer.fit(texts)
# Easily lookup for feature numbers to words
index_to_word = {ix: word for word, ix in tfidf_vectorizer.vocabulary_.items()}
word_feature_names = tfidf_vectorizer.get_feature_names()
tfidf_vectors = tfidf_vectorizer.transform(texts)
corpus_frame = pd.DataFrame(tfidf_vectors.toarray(), columns=word_feature_names)

In [None]:
def get_sorted_feature_vocabulary(feature, normalize=True):
    feature_texts = texts_for_feature(feature)
    feature_vectors = tfidf_vectorizer.transform(feature_texts)
    frame = pd.DataFrame(feature_vectors.toarray(), columns=word_feature_names)
    print(frame.shape)
    if not normalize:
        return frame.sum(axis=0).sort_values(ascending=False)
    rows = frame.shape[0]
    corpus_mean = corpus_frame.mean(axis=0)
    corpus_std = corpus_frame.std(axis=0)
    frame_sum = frame.sum(axis=0)
    normalized = (frame_sum - corpus_mean) / corpus_std
    return normalized.sort_values(ascending=False)

In [None]:
antonin_scalia_words_normed = get_sorted_feature_vocabulary("antonin_scalia")

In [None]:
antonin_scalia_words_sorted

In [None]:
antonin_scalia_words_normed

In [19]:
len(feature_names)

2245

In [None]:
get_sorted_feature_vocabulary("bryan_stevenson", normalize=False)

In [None]:
get_sorted_feature_vocabulary("bryan_stevenson")

In [None]:
get_sorted_feature_vocabulary("carlos_hernandez")

In [None]:
get_sorted_feature_vocabulary("martin_luther_king")

In [None]:
get_sorted_feature_vocabulary("rick_perry")