In [1]:
import json

import numpy as np
import pandas as pd
import spacy

In [2]:
# Load spaCy's "en_core_web_sm" pipeline once; the resulting `nlp` object is
# shared by whichever later cells need it.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Parse death_penalty.json into `data`. The encoding is given explicitly
# because JSON text is UTF-8, whereas a bare open() would decode with the
# platform's locale encoding and could fail or mangle non-ASCII characters.
with open("../data/death_penalty.json", encoding="utf-8") as file:
    data = json.load(file)

Home page text comparison
=======================

Our first strategy is to compare HTML from a homepage to the HTML of the article. We strip out any HTML that is also on the homepage.

In [4]:
import hashlib
from difflib import HtmlDiff
from bs4 import BeautifulSoup
from bs4.element import NavigableString

In [5]:
# Read the saved home page and article page as text. The encoding is explicit
# so that decoding does not depend on the platform's locale default
# (assumes the saved pages are UTF-8 -- TODO confirm against the files).
with open("../data/pages/deathpenaltyinfo.home.html", encoding="utf-8") as home_page_file:
    home_page = home_page_file.read()
with open("../data/pages/deathpenaltyinfo.page.html", encoding="utf-8") as page_file:
    article_page = page_file.read()

The HtmlDiff
---------------

There is an HtmlDiff class, but it calculates much more than we need and is very inefficient. It's only here for documentation purposes and is not a feasible approach.

In [6]:
# This approach is very inefficient
# NOTE: HtmlDiff.make_file expects sequences of lines, so the page strings are
# split first; the second argument is the article text read in the cell above.
# differ = HtmlDiff()
# with open("diff.html", "w") as diff_file:
#     diff_file.write(differ.make_file(home_page.splitlines(), article_page.splitlines()))

BeautifulSoup
-----------------

Now we'll see what we get from hashing BeautifulSoup strings. It looks very promising, and it's far more efficient than HtmlDiff or Tika.


In [7]:
# Parse both pages with an explicitly named parser. Without one, BeautifulSoup
# picks whichever parser happens to be installed (and emits a warning), so the
# resulting tree -- and therefore the extracted strings -- could differ between
# environments. Both pages must use the same parser for the comparison below.
home = BeautifulSoup(home_page, "html.parser")
article = BeautifulSoup(article_page, "html.parser")

In [13]:
# Collect every text node of the home page into a set, so that membership
# checks against it are cheap. Tag nodes are skipped; only NavigableString
# descendants contribute.
home_contents = {
    node.string
    for node in home.descendants
    if isinstance(node, NavigableString)
}

In [14]:
# Keep, in document order, the article's text nodes whose text does not also
# occur on the home page.
article_contents = [
    node.string
    for node in article.descendants
    if isinstance(node, NavigableString) and node.string not in home_contents
]

In [15]:
# Peek at the first ten strings that survived the home-page filter.
article_contents[:10]

['Innocence and the Death Penalty: The Increasing Danger of Executing the Innocent | Death Penalty Information Center',
 'Innocence and the Death Penalty: The Increasing Danger of Executing the Innocent',
 'by Richard C. Dieter, Esq.',
 'Executive Director, Death Penalty Information Center',
 'July 1997',
 'Table of Contents:',
 'Executive Summary',
 'Introduction',
 'Part I: The Danger of Mistaken Executions',
 'Part II: The Cases of Innocence']

In [16]:
# Peek at the last twenty surviving strings to see what remains at the end of
# the page.
article_contents[-20:]

['50',
 'See id. at 14.',
 '51',
 'See D. Terry, Ex-Prosecutors and Deputies in Death Row Case are Charged with Framing Defendant, The N.Y. Times, Dec. 13, 1996, at A18.',
 '52',
 'See D. Terry, DNA Tests and a Confession Set Three on the Path to Freedom in 1978 Murders, The N. Y. Times, June 15, 1996.',
 '53',
 'See Capital Punishment 1995 (1996), Bureau of Justice Statistics, Appendix Table 1 (5,580 sentenced to death through the end of 1995, with approximately 300 new death sentences per year).',
 '54',
 ' M. Radelet, H. Bedau, C. Putnam, In Spite of Innocence 17 (1992).',
 '55',
 'E. Connors et al., Convicted by Juries, Exonerated by Science: Case Studies in the Use of DNA Evidence to Establish Innocence After Trial, U.S. Dept. of Justice Research Report, June 1996, at 33, 44.',
 '56',
 ' Id. at xxviii-ix.',
 '57',
 'See note 10 above (Resolution).',
 'Back To Top',
 'Tweet',
 '563737 reads',
 '\n<!--//--><![CDATA[//><!--\njQuery.extend(Drupal.settings, {"basePath":"\\/","pathPrefi