# Cypress PDF - Text Extraction Exploration

Code for Durham

In [1]:
import os
import re
import pdfrw
import pdf_redactor
import pprint

import pdfminer

In [2]:
def uncompress(pages):
    """Decompress the content streams of every page in place.

    Text content must be uncompressed before reading.

    Args:
        pages: iterable of page objects exposing a ``Contents`` attribute
            (e.g. ``pdfrw.PdfReader(...).pages``).

    A page's ``Contents`` may be an array of streams, a single stream, or
    missing (``None``); all three cases are handled.
    """
    for page in pages:
        contents = page.Contents
        if contents is None:
            # No content stream on this page: nothing to decompress.
            continue
        # A lone stream is not a list; wrap it so the stream itself is handed
        # to the decompressor instead of being iterated key by key.
        streams = contents if isinstance(contents, list) else [contents]
        for content in streams:
            pdfrw.uncompress.uncompress([content])

In [3]:
# Path to original example Cypress PDF
# (relative path, so it is resolved against the notebook's working directory)
CYPRESS_PATH = 'ignore/cypress-example.PDF'

In [4]:
# Path to externally uncompressed version of Cypress PDF, produced with:
#   qpdf --stream-data=uncompress --object-streams=disable notebooks/ignore/cypress-example.PDF notebooks/ignore/cypress-example.unc.PDF
# NOTE: the output file name must match the constant below exactly, including
# the upper-case extension, on case-sensitive filesystems.
CYPRESS_PATH_UNC = 'ignore/cypress-example.unc.PDF'

In [5]:
# Parse both variants of the document with pdfrw.
pdf = pdfrw.PdfReader(CYPRESS_PATH)
pdf_unc = pdfrw.PdfReader(CYPRESS_PATH_UNC)
# Only the original is decompressed here; CYPRESS_PATH_UNC was uncompressed externally.
uncompress(pdf.pages)

In [6]:
# Raw content-stream text for pages 0, 1 and 2 (assumes the document has at
# least three pages, each with a second content stream -- TODO confirm).
page0 = pdf.pages[0].Contents[1].stream
page1 = pdf.pages[1].Contents[1].stream
# Previously read pdf.pages[1], which made page2 a duplicate of page1.
page2 = pdf.pages[2].Contents[1].stream

In [7]:
# Preview the first 2000 characters of page 0's raw content stream.
page0[:2000]

"q\nq\n0 1 -1 0 16 800 cm\nBT\n1 0 0 1 -756.57 0 Tm\n/F1 12 Tf\n0.45 g\n(Not a certified copy. ~ Not a certified copy. ~ Not a certified copy. ~ Not a certified copy. ~ Not a certified copy. ~ Not a certified copy.)Tj\n0 g\nET\nQ\nQ\nq\nBT\n28 771 Td\n0 -66.5 Td\nET\nQ\nq\n1.5 w\n28 747 m\n600 747 l\nS\nQ\nq\n1.2 w\n28 714 m\n600 714 l\nS\nQ\nq\nQ\nq\n2 J\n0 G\nQ\nq 220 0 0 17.5 28 753.5 cm /img1 Do Q\nBT\n1 0 0 1 28 753.5 Tm\n220 0 Td\n-220 0 Td\nET\nq 132.5 0 0 17.5 469.5 753.5 cm /img3 Do Q\nBT\n1 0 0 1 469.5 753.5 Tm\n132.5 0 Td\n-132.5 0 Td\nET\nBT\n1 0 0 1 150.84 732.5 Tm\n/F1 14 Tf\n(Case Details for Court Case ORANGE 15IF702186)Tj\nET\nBT\n1 0 0 1 208.65 716.5 Tm\n/F1 12 Tf\n(Defendant: )Tj\n/F2 12 Tf\n(BOWES,DANIEL,PATRICK)Tj\nET\nq\n0.82353 0.82353 0.82353 rg\n28 696 574 16 re\nf\nQ\nq\n2 J\n0 G\nQ\nBT\n1 0 0 1 30 698 Tm\n/F1 12 Tf\n(Case Information)Tj\nET\nq\nQ\nq\n2 J\n0 G\nQ\nBT\n1 0 0 1 30 681 Tm\n/F1 9 Tf\n(Case Record Was Last Updated: )Tj\n/F2 9 Tf\n(09/22/2015)Tj\nET

## Using pdf_redactor

In [8]:
data = {}  # NOTE(review): not used anywhere in this section
# Materialise whatever pdf_redactor's text-layer builder yields for the parsed
# document; the second positional argument is passed as None.
tokens = list(pdf_redactor.build_text_layer(pdf, None))

In [9]:
# First 20 entries of the first element of the result.
tokens[0][:20]

[Token<'Not a certified copy. ~ Not a certified copy. ~ Not a certified copy. ~ Not a certified copy. ~ Not a certified copy. ~ Not a certified copy.'>,
 Token<'Case Details for Court Case ORANGE 15IF702186'>,
 Token<'Defendant: '>,
 Token<'BOWES,DANIEL,PATRICK'>,
 Token<'Case Information'>,
 Token<'Case Record Was Last Updated: '>,
 Token<'09/22/2015'>,
 Token<'Case Status: '>,
 Token<'DISPOSED'>,
 Token<'Microfilm Number: '>,
 Token<'-'>,
 Token<'Case Was Reinstated: '>,
 Token<'-'>,
 Token<'Jurisdiction: '>,
 Token<'DISTRICT COURT'>,
 Token<'Citation Number: '>,
 Token<'8878F91'>,
 Token<'Originating Case (OCA) or Incident'>,
 Token<'•   Agency: '>,
 Token<'-'>]

## Using manual text search with regular expressions

In [10]:
# Collects the fields extracted from page0 by the regex cells that follow.
data = {}

In [11]:
# DEFENDANT
# Text: (Defendant: )Tj\n/F2 12 Tf\n(BOWES,DANIEL,PATRICK)Tj\n
# The label and the value are two consecutive Tj string operands separated by
# a font switch. Capture everything between the value's parentheses so names
# containing spaces, hyphens, apostrophes or periods are matched as well
# (the previous [\w,]+ class rejected them).
defendant_re = r'\(Defendant: \)Tj\n/F2 12 Tf\n\(([^()]+)\)Tj\n'
match = re.search(defendant_re, page0)
data['defendant'] = match.group(1)

In [12]:
# CASE STATUS
# Text: (Case Status: )Tj\n/F2 9 Tf\n(DISPOSED)Tj\n
# The label is one Tj string; the value is the next Tj string after the switch
# to /F2 at size 9. Group 1 captures the value (word characters and commas only).
case_status_re = r'\(Case Status: \)Tj\n/F2 9 Tf\n\(([\w,]+)\)Tj\n'
match = re.search(case_status_re, page0)
data['case_status'] = match.group(1)

In [13]:
# DATE OF BIRTH
# Text: (Date of Birth/Estimated Age:)Tj\nET\nBT\n1 0 0 1 30 172 Tm\n/F2 9 Tf\n(09/10/1988)Tj\n
# The value sits in the text block that follows the label. The six Tm operands
# (text matrix, i.e. where the value is placed on the page) are matched
# generically rather than as the literal "1 0 0 1 30 172", so the pattern still
# works when the field is positioned elsewhere.
dob_re = r'\(Date of Birth/Estimated Age:\)Tj\nET\nBT\n(?:-?[\d.]+ ){6}Tm\n/F2 9 Tf\n\(([\w/]+)\)Tj\n'
match = re.search(dob_re, page0)
data['dob'] = match.group(1)

In [14]:
# Show the fields collected from the raw content stream.
data

{'defendant': 'BOWES,DANIEL,PATRICK',
 'case_status': 'DISPOSED',
 'dob': '09/10/1988'}

## Using pdfminer

In [15]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF using pdfminer.

    Args:
        path: filesystem path of the PDF to read.

    Returns:
        str: the text emitted by pdfminer's ``TextConverter`` (default
        ``LAParams``) for all pages, in page order.

    The input file, the converter device and the in-memory buffer are
    released even when a page fails to process.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # The context manager closes the file even if parsing raises.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(
                fp,
                set(),        # pagenos: empty set, i.e. no page filter
                maxpages=0,   # 0, i.e. no page limit
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the finally clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [16]:
# Run pdfminer over the original PDF and preview the first 2000 characters.
pdf_text = convert_pdf_to_txt(CYPRESS_PATH)
pdf_text[:2000]

".\n\ny\np\no\nc\nd\ne\n\n \n\ni\nf\ni\nt\nr\ne\nc\na\n\n \n\n \nt\no\nN\n~\n\n \n\n \n.\n\n \n\ny\np\no\nc\nd\ne\ni\nf\ni\nt\nr\ne\nc\na\n\n \n\n \nt\no\nN\n~\n\n \n\n \n.\n\n \n\ny\np\no\nc\nd\ne\ni\nf\ni\nt\nr\ne\nc\n \na\n \nt\no\nN\n~\n\n \n\n \n.\ny\np\no\nc\n \nd\ne\ni\nf\ni\nt\nr\ne\nc\n \na\n \nt\no\nN\n~\n\n \n\n \n.\ny\np\no\nc\n \nd\ne\n\ni\nf\ni\nt\nr\ne\nc\n\n \n\na\n\n \nt\no\nN\n~\n\n \n\n \n.\n\ny\np\no\nc\nd\ne\n\n \n\ni\nf\ni\nt\nr\ne\nc\n\n \n\na\n \nt\no\nN\n\nCase Details for Court Case ORANGE 15IF702186\n\nDefendant: BOWES,DANIEL,PATRICK\n\nCase Information\nCase Record Was Last Updated: 09/22/2015\nCase Status: DISPOSED\nMicrofilm Number: -\nCase Was Reinstated: -\nJurisdiction: DISTRICT COURT\nCitation Number: 8878F91\nOriginating Case (OCA) or Incident\n•   Agency: -\n•   Number: -\n\nProcess Type in District Court: CITATION\nProcess Type in Superior Court: -\n\nDomestic Violence General Statute (G.S. 50B) Applies to\n•   Charged Offenses: NO\n•   Offenses Con

In [17]:
# Collects the fields extracted from the pdfminer text output.
miner_data = {}

In [25]:
# DEFENDANT
# Text: Defendant: BOWES,DANIEL,PATRICK
# Capture the rest of the line after the label. The previous [\w,]+ class
# stopped at the first space, hyphen or apostrophe and silently returned a
# truncated name.
defendant_re = r'Defendant: ([^\n]+)'
match = re.search(defendant_re, pdf_text)
miner_data['defendant'] = match.group(1).strip()

In [26]:
# Show the fields collected from the pdfminer text.
miner_data

{'defendant': 'BOWES,DANIEL,PATRICK'}