A proposed extension to [natural_pdf](https://github.com/jsoma/natural-pdf) that identifies tables using page elements or regular expressions as row and column delimiters. In certain cases this is more effective than methods that rely on OCR or page structure.

In [12]:
import natural_pdf
import pandas as pd
from natural_pdf import PDF

import tabledelim as td

## Election scraping

In [25]:
# Open the election-results demo PDF and keep its first page for the cells below.
pdf = PDF('demo/bergin_precincts.pdf')
page = pdf.pages[0]

This example uses a helper function, `find_by_regex`, to return the set of text elements matching a regular expression. Ideally, `natural_pdf` will eventually support a `text:matches` selector so that regular expressions can be used directly in the `find` method, making this step unnecessary.

In [26]:
# Limit the row search to a strip along the left edge of the page
# (presumably x from 0 to 100, full page height — confirm create_region's argument order).
left_margin = page.create_region(0, 0, 100, page.height)
# Raw string: \w and \s must reach the regex engine untouched. In a normal string
# literal they are invalid escape sequences and trigger a warning on modern Python.
rows = td.find_by_regex(left_margin, r'\w+[\s0-9]*$')

In [None]:
# rows: text on the left-hand side of the page consisting of at least one word
#       followed by 0 or more digits (the `rows` matches found above).
# cols: vertical lines of height at least 20.
# bbox: only apply this scrape to the part of the page below the first
#       appearance of "PRESIDENT".
td.extract_table_by_delim(
    page,
    rows=rows,
    cols='line:vertical[height>=20]',
    bbox={'top': page.find('text:contains("PRESIDENT")').bottom},
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Allendale 1,Early Voting,485,6,1.24%,,***,***,***,***,,***,***,***,***,
1,Allendale 1,Election Day,485,82,16.91%,,6,70,2,,,68,3,7,,
2,Allendale 1,Mail-In,485,46,9.48%,,,41,4,,,40,,3,,
3,Allendale 1,Total,485,134,27.63%,,6,116,7,,,114,3,10,,
4,Allendale 2,Early Voting,257,2,0.78%,,***,***,***,***,,***,***,***,***,
5,Allendale 2,Election Day,257,33,12.84%,,,29,2,,,29,1,2,,
6,Allendale 2,Mail-In,257,26,10.12%,,,23,3,,,23,2,,,
7,Allendale 2,Total,257,61,23.74%,,,54,5,,,54,3,2,,
8,Allendale 3,Early Voting,338,2,0.59%,,***,***,***,***,,***,***,***,***,
9,Allendale 3,Election Day,338,33,9.76%,,,32,1,,,26,3,3,,


In [None]:
# Baseline for comparison: natural_pdf's own extract_table() on the same page,
# wrapped in a DataFrame so it renders like the result above.
pd.DataFrame(page.extract_table())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,,,,,,,1,1,1,1,,2,2,2,2
1,,,Registered\nVoters,Voters\nCast,Turnout\n(%),,DEM\nBUKOVINAC\n-\nTERRISA,DEM\n-\nJOSEPH\nBIDEN\nJr. R.,UNCOMMITTED\nDELEGATES\nDEM\n-,Write-ins,,DEM\n-\nANDY\nKIM,LAWRENCE\nHAMM DEM\n-,DEM\n-\nCAMPOS- PATRICIA\nMEDINA,Write-ins
2,Allendale 1\nAllendale 1\nAllendale 1\nAllenda...,Early Voting\nElection Day\nMail-In\nTotal\nEa...,485\n485\n485\n485\n257\n257\n257\n257\n338\n3...,6\n82\n46\n134\n2\n33\n26\n61\n2\n33\n43\n78\n...,1.24%\n16.91%\n9.48%\n27.63%\n0.78%\n12.84%\n1...,,***\n6\n6\n***\n***\n***\n6\n6\n***\n12\n12\n*...,***\n70\n41\n116\n***\n29\n23\n54\n***\n32\n40...,***\n2\n4\n7\n***\n2\n3\n5\n***\n1\n1\n2\n***\...,***\n***\n***\n***\n***\n2\n2\n***\n2\n2\n***\...,,***\n68\n40\n114\n***\n29\n23\n54\n***\n26\n34...,***\n3\n3\n***\n1\n2\n3\n***\n3\n3\n6\n***\n1\...,***\n7\n3\n10\n***\n2\n2\n***\n3\n4\n7\n***\n1...,***\n***\n***\n***\n***\n***\n***\n***\n***\n***
3,Alpine 1\nAlpine 1\nAlpine 1\nAlpine 1\nAlpine...,,,,,,,,,,,,,,
4,Bergenfield 1\nBergenfield 1\nBergenfield 1\nB...,,,,,,,,,,,,,,


## Use of Force in Vancouver

From https://badpdfs.com/pdfs/use-of-force-raw/

In [2]:
# Open the use-of-force demo PDF; `page` now refers to its first page.
force_doc = PDF('demo/use-of-force-raw.pdf')
page = force_doc.pages[0]

In [20]:
# rows: every text element containing "970".
# cols: the text elements inside the slice that slice_fitting_elem returns for
#       the "PFEIFER, TIM" element (presumably the band of the page occupied by
#       that record — confirm against tabledelim).
td.extract_table_by_delim(
    page,
    rows='text:contains("970")',
    cols=td.slice_fitting_elem(page, 'text:contains("PFEIFER, TIM")').find_all('text'),
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,"PFEIFER, TIM",12/2/2016,Dec-02-2016,22:30,WEST,23-2016-18335,"SOLLERS, CECIL ARTHUR",M,U,H,...,FOOT PURSUIT,NONE,NONE,"TASER, CAROTID RESTRAINT",CAROTID RESTRAINT,"EMS AT SCENE, HOSPITAL/RELEASED",YES,YES,SGT. GEDDRY / 1315,970
1,"NICHOLSON, DUSTIN",12/3/2016,Dec-03-2016,6:45,WEST,23-2016-18348,"GUTIERREZ, SAVANNAH D (DOB: Aug-14-1986)",F,W,N,...,"FOOT PURSUIT, FIELD CONTACT",NONE,NONE,TAKEDOWNS,TAKEDOWNS TREATMENT REFUSED,TREATMENT REFUSED,YES,NO,SGT MARTIN /,970
2,"JENNINGS, ERIK",12/7/2016,Dec-07-2016,13:05,EAST,23-2016-18571,"MCDONALD, ALEMAYOHU J APR-03-1996",M,B,U,...,HANDCUFFING,NONE,NONE,TAKEDOWNS,TAKEDOWNS,,YES,NO,BURGARA / 1257,970
3,"BATES, JUSTIN",12/13/2016,Dec-02-2016,22:30,WEST,23-2016-18335,"SOLLERS, CECIL A (DOB: Sep-25-1989)",M,U,H,...,FOOT PURSUIT,NONE,NONE,POINT TASER,POINT TASER,"EMS AT SCENE, HOSPITAL/ADMITTED, HOSPITAL/RELE...",YES,YES,CPL PARDUE/1219,970
4,"GEDDRY, BLAISE",12/13/2016,Dec-13-2016,2:35,WEST,23-2016-18818,"RENFRO, JOHN W",M,W,U,...,OTHER,NONE,"NONE MENTAL ILLNESS, ADMISSION TO FACILITY, PR...","TAKEDOWNS, HANDS/FEET","TAKEDOWNS, HANDS/FEET","EMS AT SCENE, HOSPITAL/ADMITTED",YES,YES,GEDDRY 1315,970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,"PARDUE, BILL",1/19/2018,1/19/2018,1:24,WEST,23-2018-967,"RAULS, COURTNEY A",F,B,U,...,FIELD CONTACT,NONE,NONE,TAKEDOWNS,TAKEDOWNS,,YES,YES,DUMAS/ 1543,970
297,"MARBACH, NICHOLAS",1/20/2018,1/18/2018,23:20,EAST,23-2018-965,"ELLIOT, DENNIS RICHARD",M,U,U,...,"ESCORT, HANDCUFFING, FIELD CONTACT PRIOR TO IN...",,"BRUISES MENTAL ILLNESS, ADMISSION TO FACILITY",HANDS/FEET,HANDS/FEET,"EMS AT SCENE, HOSPITAL/ADMITTED",YES,YES,KREBS/DAVIS #1508/1230,970
298,"MARBACH, NICHOLAS",1/20/2018,1/18/2018,23:21,EAST,23-2018-965,"NAPOLEON, ALICE LANEY",F,I,U,...,"OTHER, FIELD CONTACT",NONE,NONE,HANDS/FEET,HANDS/FEET,"EMS AT SCENE, SELF TREATMENT",YES,YES,KREBS/DAVIS #1508/1230,970
299,"HEMSTOCK, STUART",1/23/2018,1/23/2018,8:45,EAST,23-2018-1191,"KESSEL, JORDAN P",M,W,N,...,"HANDCUFFING, OTHER",NONE,NONE,CONTROL HOLDS CAUSING INJURY,CONTROL HOLDS CAUSING INJURY,,YES,YES,HUBERTY/1214,970


## Booze Licenses

From https://badpdfs.com/pdfs/m27/

In [3]:
# Open the liquor-license demo PDF; `pdf` and `page` are rebound here, replacing
# the objects from the earlier sections.
pdf = PDF('demo/booze.pdf')
page = pdf.pages[0]

In [9]:
# Collect the text elements inside the slice fitted to the element containing "415051".
elems = (
    td.slice_fitting_elem(page, 'text:contains("415051")')
    .find_all('text')
)