In [1]:
import os
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

## Read from Cypress

In [2]:
# Location of the sample Cypress PDF that the rest of the notebook parses
# (relative to the directory the notebook is run from).
CYPRESS_PATH = "ignore/cypress-example.pdf"

In [3]:
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` and return it as one string.

    Every page yielded by ``PDFPage.get_pages`` is run through a pdfminer
    ``TextConverter`` that writes into an in-memory buffer; the buffer's
    contents are returned once all pages have been processed.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        str: The text collected by the converter for all processed pages.

    Raises:
        OSError: If ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing is propagated; the
        file, the converter and the buffer are still released in that case.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # The context managers guarantee the buffer and the file handle are
    # closed even when parsing fails part-way through the document.
    with StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set + maxpages=0: no page restriction is requested
            # (pdfminer treats both values as "no limit").
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)

            # Read the buffer before anything is closed.
            return retstr.getvalue()
        finally:
            device.close()

In [4]:
# Run the extraction once and keep the raw text for the parsing cells.
pdf_text = convert_pdf_to_txt(path=CYPRESS_PATH)

## Data map

In [None]:
# Sample values keyed by form-field name.
# NOTE(review): presumably these keys are the field names of the petition PDF
# this notebook writes to -- confirm against the form.
data_dict = {
    # --- Case header ---
    "County": "Durham",
    "ConsJdgmntFileNum": "CfD-100",  # File No.
    # "In The General Court of Justice" checkboxes ("Yes" == checked)
    "District": "Yes",
    "Superior": "",

    # --- Petitioner: name and address ---
    "NamePetitioner": "John Doe",
    "StreetAddr": "100 Main St.",
    "MailAddr": "Apt A",
    "City": "Durham",
    "State": "NC",
    "ZipCode": "27701",

    # --- Petitioner: identifying details ---
    "DLNo": "123456789",  # driver's license number
    "DLState": "NC",
    "Race": "U",
    "Sex": "M",
    "DOB": "1/1/2000",  # date of birth
    # Full Social Security Number.
    # NOTE(review): the key is spelled "SNN", not "SSN" -- verify it matches
    # the form's field name before changing it.
    "SNN": "111-11-1111",
    "Age": "18",  # age at time of offense

    # --- Petitioner's attorney: name and address ---
    "NameAtty": "Arnetta Herring",
    "StAddrAtty": "510 Dillard Street, 6th Fl",
    "MailAddrAtty": "Suite 6400",
    "CityAtty": "Durham",
    "StateAtty": "NC",
    "ZipCodeAtty": "27701",
}

In [None]:
# Placeholder mapping; intentionally empty for now.
data_map = {}

## Extract from Cypress

In [78]:
# Accumulates the fields pulled out of the PDF text by the cells below.
miner_data = dict()

In [79]:
# COUNTY AND FILE NO.
# Text: Case Details for Court Case ORANGE 15IF702186
# The greedy `.+` keeps everything up to the last space-separated word on the
# line as the county; that last word is the file number.
case_summary_re = r'Case Details for Court Case (.+) (\w+)'
match = re.search(case_summary_re, pdf_text)
# re.search returns None on a miss; fail with a readable message instead of an
# AttributeError on `.group`.
if match is None:
    raise ValueError("County / file number ('Case Details for Court Case ...') not found in PDF text")
miner_data['county'] = match.group(1)
miner_data['file_no'] = match.group(2)

In [80]:
# DEFENDANT
# Text: Defendant: BOWES,DANIEL,PATRICK
# NOTE(review): `[\w,]+` stops at the first space, hyphen or apostrophe, so a
# name containing one would be truncated -- confirm against more samples.
defendant_re = r'Defendant: ([\w,]+)'
match = re.search(defendant_re, pdf_text)
# re.search returns None on a miss; fail with a readable message instead of an
# AttributeError on `.group`.
if match is None:
    raise ValueError("Defendant ('Defendant: ...') not found in PDF text")
miner_data['defendant'] = match.group(1)

In [81]:
# OFFENSE DATE
# Text: Offense Date/Time: 05/17/2015 09:59 PM
# The trailing ` \d` anchors on the first digit of the time, so only the date
# part ends up in the capture group.
datetime_re = r'Offense Date/Time: (.+) \d'
match = re.search(datetime_re, pdf_text)
# re.search returns None on a miss (e.g. no time after the date); fail with a
# readable message instead of an AttributeError on `.group`.
if match is None:
    raise ValueError("Offense date ('Offense Date/Time: ...') not found in PDF text")
miner_data['offense_date'] = match.group(1)

In [82]:
# DATE OF BIRTH
# Text: Date of Birth/Estimated Age:\n 09/10/1988
# The value sits on the line after the label; the whole line is captured.
dob_re = r'Date of Birth/Estimated Age:\n(.+)'
match = re.search(dob_re, pdf_text)
# re.search returns None on a miss; fail with a readable message instead of an
# AttributeError on `.group`.
if match is None:
    raise ValueError("Date of birth ('Date of Birth/Estimated Age:') not found in PDF text")
miner_data['date_of_birth'] = match.group(1)

In [83]:
# RACE
# Text: Race: WHITE
# Captures the rest of the line after the label.
race_re = r'Race: (.+)'
match = re.search(race_re, pdf_text)
# re.search returns None on a miss; fail with a readable message instead of an
# AttributeError on `.group`.
if match is None:
    raise ValueError("Race ('Race: ...') not found in PDF text")
miner_data['race'] = match.group(1)

In [85]:
# SEX
# Text: Sex: MALE
# Captures the rest of the line after the label.
sex_re = r'Sex: (.+)'
match = re.search(sex_re, pdf_text)
# re.search returns None on a miss; fail with a readable message instead of an
# AttributeError on `.group`.
if match is None:
    raise ValueError("Sex ('Sex: ...') not found in PDF text")
miner_data['sex'] = match.group(1)

In [86]:
# OFFENSE DESCRIPTION
# Text: CHARGED\nSPEEDING(80 mph in a 65 mph zone)
# The description is the full line immediately following "CHARGED".
description_re = r'CHARGED\n(.+)\n'
match = re.search(description_re, pdf_text)
# re.search returns None on a miss; fail with a readable message instead of an
# AttributeError on `.group`.
if match is None:
    raise ValueError("Offense description (line after 'CHARGED') not found in PDF text")
miner_data['offense_description'] = match.group(1)

In [87]:
# VERDICT
# Text: Verdict: RESPONSIBLE
# NOTE(review): `\w+` captures a single word only; a multi-word verdict would
# be cut at the first space -- confirm against more samples.
verdict_re = r'Verdict: (\w+)'
match = re.search(verdict_re, pdf_text)
# re.search returns None on a miss; fail with a readable message instead of an
# AttributeError on `.group`.
if match is None:
    raise ValueError("Verdict ('Verdict: ...') not found in PDF text")
miner_data['verdict'] = match.group(1)

In [88]:
# DISPOSED
# Text: Disposed on: 07/15/2015
# Captures the rest of the line after the label.
disposed_re = r'Disposed on: (.+)'
match = re.search(disposed_re, pdf_text)
# re.search returns None on a miss; fail with a readable message instead of an
# AttributeError on `.group`.
if match is None:
    raise ValueError("Disposition date ('Disposed on: ...') not found in PDF text")
miner_data['disposed_on'] = match.group(1)

In [89]:
# Show every field extracted so far; as the cell's last expression the dict is
# rendered as the cell output.
miner_data

{'county': 'ORANGE',
 'file_no': '15IF702186',
 'defendant': 'BOWES,DANIEL,PATRICK',
 'offense_date': '05/17/2015',
 'date_of_birth': '09/10/1988',
 'race': 'WHITE',
 'sex': 'MALE',
 'offense_description': 'SPEEDING(80 mph in a 65 mph zone)',
 'verdict': 'RESPONSIBLE',
 'disposed_on': '07/15/2015'}

## Write to petition form

In [22]:
# Display the raw extracted text that the regexes are matched against.
# NOTE(review): this dumps the entire document into the output; consider
# showing a slice before sharing the notebook.
pdf_text

".\n\ny\np\no\nc\nd\ne\n\n \n\ni\nf\ni\nt\nr\ne\nc\na\n\n \n\n \nt\no\nN\n~\n\n \n\n \n.\n\n \n\ny\np\no\nc\nd\ne\ni\nf\ni\nt\nr\ne\nc\na\n\n \n\n \nt\no\nN\n~\n\n \n\n \n.\n\n \n\ny\np\no\nc\nd\ne\ni\nf\ni\nt\nr\ne\nc\n \na\n \nt\no\nN\n~\n\n \n\n \n.\ny\np\no\nc\n \nd\ne\ni\nf\ni\nt\nr\ne\nc\n \na\n \nt\no\nN\n~\n\n \n\n \n.\ny\np\no\nc\n \nd\ne\n\ni\nf\ni\nt\nr\ne\nc\n\n \n\na\n\n \nt\no\nN\n~\n\n \n\n \n.\n\ny\np\no\nc\nd\ne\n\n \n\ni\nf\ni\nt\nr\ne\nc\n\n \n\na\n \nt\no\nN\n\nCase Details for Court Case ORANGE 15IF702186\n\nDefendant: BOWES,DANIEL,PATRICK\n\nCase Information\nCase Record Was Last Updated: 09/22/2015\nCase Status: DISPOSED\nMicrofilm Number: -\nCase Was Reinstated: -\nJurisdiction: DISTRICT COURT\nCitation Number: 8878F91\nOriginating Case (OCA) or Incident\n•   Agency: -\n•   Number: -\n\nProcess Type in District Court: CITATION\nProcess Type in Superior Court: -\n\nDomestic Violence General Statute (G.S. 50B) Applies to\n•   Charged Offenses: NO\n•   Offenses Con