# Parsing Debug Notebook

This somewhat messy notebook makes it easier to debug the parser, because we can just rerun the cells needed to set up the debugging process with various parameters.

In [1]:
import json
import math
import re
from pathlib import Path

import boto3
import pandas as pd

from parse_990_textract.bucket import open_df
from parse_990_textract.filing import create_roadmap, extract_from_roadmap
from parse_990_textract.models import BoundingBox, TableExtractor
from parse_990_textract.parse import create_extractors, find_item, find_pages
from parse_990_textract.postprocessing import clean_filing, clean_f_i, clean_f_ii, clean_f_iii, postprocess
from parse_990_textract.setup import load_extractor_df
from parse_990_textract.table import extract_table_data, find_table_pages, create_tablemap
from parse_990_textract.utils import get_coordinate, get_regex, cluster_words, columnize, cluster_x, sort_words

In [2]:
# S3 bucket handle; passed to open_df together with a job_id in the parsing loop.
bucket = boto3.resource("s3").Bucket("s3-ocr-990s-demo")

Setting `VALIDATE_TOP` to `True` will parse all 25 validation PDFs and compare non-Schedule F output to the validation data. If set to `False`, we run the test code for Schedule F instead.

In [3]:
# Switch for the validation pass: when True the parsing loop iterates over
# every job_id in validation_data; when False its job list is empty.
VALIDATE_TOP = True

In [4]:
# Load the validation data indexed by job_id; missing values are replaced
# with empty strings.
if VALIDATE_TOP:
    validation_data = pd.read_csv("validation_data.csv", index_col="job_id").fillna("")
# Jupyter only renders a cell's final top-level expression, so the preview
# must live outside the `if`. Short-circuiting on VALIDATE_TOP avoids touching
# validation_data when it was not loaded (the cell then displays False).
VALIDATE_TOP and validation_data.head()

In [5]:
# Parsing configuration tables, all read from CSV files under parse_data/.
# The first three are loaded through the project's load_extractor_df helper;
# the two Schedule F extractor tables are read with plain pd.read_csv.
# NOTE(review): confirm that using plain read_csv for the last two is intentional.
extractor_df = load_extractor_df("parse_data/990_extractors.csv")
roadmap_df = load_extractor_df("parse_data/990_roadmap.csv")
schedule_f_tablemap_df = load_extractor_df("parse_data/schedule_f_table_roadmap.csv")
schedule_f_table_extractor_df = pd.read_csv("parse_data/schedule_f_table_extractors.csv")
schedule_f_row_extractor_df = pd.read_csv("parse_data/schedule_f_row_extractors.csv")

In [6]:
# Patterns passed to extract_table_data for the three Schedule F tables.
# Each *_HEADER is an alternation of "(letter) heading" fragments that allows
# optional whitespace after the letter; each *_TABLE_NAME is a table title.
# The "(f)" alternative of PART_I_HEADER allows that whitespace as well, like
# every other alternative; without it "(f) Total expenditures" (with a space)
# could not match that alternative.
PART_I_HEADER = r"\(a\)\s*Region|\(d\)\s*Activities|\(e\)\s*If activity|\(f\)\s*Total expenditures"
PART_II_HEADER = r"\(b\)\s*IRS code|\(c\)\s*Region|\(d\)\s*Purpose|\(f\)\s*Manner|\(h\)\s*Description"
PART_III_HEADER = r"\(b\)\s*Region|\(e\)\s*Manner of cash|\(h\)\s*Method of va"
PART_I_TABLE_NAME = r"Activities per Region"
PART_II_TABLE_NAME = r"Grants to Organizations Outside the United States"
PART_III_TABLE_NAME = r"Grants to Individuals Outside the United States"

In [7]:
# Accumulators filled by the loop below: one postprocessed result per job in
# filing_rows, and one entry per job in each schedule_f_* list whenever the
# corresponding postprocessed table is not None.
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []
# Jobs to process: every job_id in the validation data, or nothing at all
# when validation is switched off (the loop body then never runs).
if VALIDATE_TOP:
    values = validation_data.index.values
else:
    values = []

for i, job_id in enumerate(values):
    # Progress trace: loop position, job id, and the PDF key recorded for it.
    print(i)
    print(job_id)
    pdf_key = validation_data.at[job_id, "pdf_key"]
    print(pdf_key)
    
    # Load the job's data from the bucket and split it by BlockType.
    data = open_df(bucket, job_id)
    lines = data.loc[data["BlockType"] == "LINE"]
    words = data.loc[data["BlockType"] == "WORD"]
    # NOTE(review): page_map / roadmap semantics live in the project helpers;
    # presumably they locate form sections per page — confirm in parse_990_textract.
    page_map = find_pages(lines)
    roadmap = create_roadmap(
        lines, roadmap_df, page_map
    )
    
    # Extract the main (non-table) fields, then postprocess with clean_filing.
    row = extract_from_roadmap(
        words, lines, roadmap, extractor_df, page_map
    )
    row = postprocess(row, job_id, pdf_key, clean_filing)
    filing_rows.append(row)
    
    # LINE rows grouped by page; shared by the three table extractions below.
    pages = lines.groupby("Page")
    
    # The three blocks below are identical except for the header pattern,
    # table name, cleaning function and target list.
    # postprocess runs before the None check, so it is called even when
    # extract_table_data returned None — presumably it passes None through; confirm.
    part_i_table = extract_table_data(
        pages, lines, words, PART_I_HEADER, PART_I_TABLE_NAME, 
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    part_i_table = postprocess(part_i_table, job_id, pdf_key, clean_f_i)
    if part_i_table is not None:
        schedule_f_part_i_rows.append(
            part_i_table
        )
    part_ii_table = extract_table_data(
        pages, lines, words, PART_II_HEADER, PART_II_TABLE_NAME, 
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    part_ii_table = postprocess(part_ii_table, job_id, pdf_key, clean_f_ii)
    if part_ii_table is not None:
        schedule_f_part_ii_rows.append(
            part_ii_table
        )
    part_iii_table = extract_table_data(
        pages, lines, words, PART_III_HEADER, PART_III_TABLE_NAME, 
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    part_iii_table = postprocess(part_iii_table, job_id, pdf_key, clean_f_iii)
    if part_iii_table is not None:
        schedule_f_part_iii_rows.append(
            part_iii_table
        )

0
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4
EIN_760733035_YEAR_2009_FORMTYPE_990.pdf


No match for year_formation in L Year of Formation M State of legal domicile
No match for state_of_domicile in M State of legal domicile
No match for largest_program_service_grants in 4a (Code ) (Expenses $ 1,241,334 including grants of $ ) (Revenue $ )
No match for largest_program_service_revenue in 4a (Code ) (Expenses $ 1,241,334 including grants of $ ) (Revenue $ )
No match for second_largest_program_service_expenses in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4

1
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc
EIN_363235550_YEAR_2009_FORMTYPE_990.pdf


No match for total_unrelated_biz_revenue in 7a NONE
No match for net_unrelated_biz_taxable_revenue in 7b NONE
No match for benefits_paid_members_expenses in NONE
No match for professional_fundraising_fees_expenses in NONE
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ 1,271,659 including grants of $ NONE ) (Revenue $ NONE )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ 1,271,659 including grants of $ NONE ) (Revenue $ NONE )
No match for investment_tax_exempt_bonds_total in NONE
No match for royalties_total in NONE
No match for net_rental_income_total in NONE
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for fundraising_net_income_total in NONE
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b
No match for gaming_net_income_total in NONE
No match for gross_income_sales in a
No match for direct_expenses_sales in b
No match for sales_net_income_tota

2
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b
EIN_223134995_YEAR_2010_FORMTYPE_990.pdf


No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ 506,702 including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ 506,702 including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ 691,250 including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ 691,250 including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule O ) See also Additional Data for Description (Expenses $ 46,528 including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other program services (Describe in Schedule O ) See also Additional Data for Description (Expenses $ 46,528 including grants of $ ) (Revenue $ )
No match for contributions_federated_campaigns in 1a
No match for direct_expenses_fundraising in a b
No match for direct_ex

3
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82
EIN_264320885_YEAR_2009_FORMTYPE_990.pdf


No match for website in J Website: freedom424 org
No match for largest_program_service_grants in 4a (Code ) (Expenses $ 26,032 including grants of $ ) (Revenue $ 46,117 )
No match for second_largest_program_service_expenses in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenu

4
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be
EIN_581943161_YEAR_2009_FORMTYPE_990.pdf


No match for largest_program_service_grants in 4a (Code ) (Expenses $ 439,685 including grants of $ ) (Revenue $ 503,136 )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ 55,819 including grants of $ ) (Revenue $ 80,000 )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other program services (Describe in Schedule O ) (Expenses $ inc

5
5596f51a999ebbd4cb992f490ceaffcddbac9bce532b911997683ae6897c2797
EIN_231352689_YEAR_2009_FORMTYPE_990.pdf


No match for largest_program_service_grants in 4a (Code ) (Expenses $ 3,016,297 including grants of $ ) (Revenue $ )
No match for largest_program_service_revenue in 4a (Code ) (Expenses $ 3,016,297 including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ 735,525 including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ 735,525 including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ 1,114,927 including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ 1,114,927 including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule O ) (Expenses $ 701,105 including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other program services (Describe in Schedule O ) (Expenses $ 701,105 i

6
481a62c75776cc7f62075c1d60c230ea01f2dbc5b6224c65cb73db1728e86b00
EIN_954806856_YEAR_2009_FORMTYPE_990.pdf


No match for largest_program_service_grants in 4a (Code ) (Expenses $ 479,051 including grants of $ ) (Revenue $ )
No match for largest_program_service_revenue in 4a (Code ) (Expenses $ 479,051 including grants of $ ) (Revenue $ )
No match for second_largest_program_service_expenses in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe

7
39d270117e4b6354850551c8237421403da7d2b3f5d06657a28f5a9d07febf17
EIN_521238301_YEAR_2010_FORMTYPE_990.pdf


No match for net_unrelated_biz_taxable_revenue in 7b
No match for other_program_service_expenses in 4d Other program services. (Describe in Schedule O.) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services. (Describe in Schedule O.) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other program services. (Describe in Schedule O.) (Expenses $ including grants of $ ) (Revenue $ )
No match for contributions_noncash in $
No match for gross_sales_securities in (i)
No match for gross_sales_other in (i) (ii)
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b
No match for gross_income_sales in a
No match for direct_expenses_sales in b
No match for total_number_recipient_foreign_orgs_listed_as_charities in or for which


8
9a69ea9d5c8d5397603dbb77d027f4ff4ef9dcdc6609b7a1657f9484529353a0
EIN_521703065_YEAR_2010_FORMTYPE_990.pdf


No match for largest_program_service_grants in expenses, revenue, any, program service reported. 4a (Code: ) (Expenses 1,230,988. including grants of $ ) (Revenue $ 2,909. )
No match for second_largest_program_service_grants in 4b (Code: ) (Expenses $ 20,765. including grants of $ ) (Revenue $ 17,375. )
No match for third_largest_program_service_expenses in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services. (Describe in Schedule O.) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services. (Describe in Schedule O.) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other

9
3b9cc21f73f11bef5f2e4c15859661f7f7c9adee8dd6a1f2301099834fdb2926
EIN_611190087_YEAR_2008_FORMTYPE_990.pdf


No match for mission in 1 Briefly describe the organization's mission o most significant activities
No match for largest_program_service_grants in 4a (Code: ) (Expenses $ 178,327 including grants of $ ) (Revenue $ 58,210 )
No match for second_largest_program_service_expenses in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other (Expenses program $ services (D

10
82588f1ce9ca8cec2fc6b55ec07361b821be6650d1c01db4b9c948b8bf12689f
EIN_113489123_YEAR_2008_FORMTYPE_990.pdf


No match for website in J Website: H(c)
No match for gross_receipts in G Gross receipts $
No match for year_formation in L Year of formation. M State of legal domicile:
No match for state_of_domicile in M State of legal domicile:
No match for total_unrelated_biz_revenue in 7a
No match for net_unrelated_biz_taxable_revenue in 7b
No match for total_fundraising_expenses in b Total fundraising expenses (Part IX, column (D), line 25)
No match for largest_program_service_expenses in 4a (Code: ) (Expenses $ including grants of $ (Revenue
No match for largest_program_service_grants in 4a (Code: ) (Expenses $ including grants of $ (Revenue
No match for largest_program_service_revenue in 4a (Code: ) (Expenses $ including grants of $ (Revenue
No match for second_largest_program_service_expenses in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_larges

11
f303b69e79844240beccf4fc5b3cecaa3a4f3024a955b2f4a49dd0a81b456649
EIN_582248383_YEAR_2009_FORMTYPE_990.pdf


No match for second_largest_program_service_expenses in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule 0) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule 0) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_pr

12
a2061356d7999388cbd49b79872883c92ce6c81a7e7820788f92db496cedd620
EIN_630329409_YEAR_2009_FORMTYPE_990.pdf


No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ 7,808,227 including grants of $ ) (Revenue $ 8,637,281 )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ 3,815,233 including grants of $ ) (Revenue $ 3,003,569 )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in .b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in .b
No match for gross_income_sales in a
No match for direct_expenses_sales in b


13
6e417b42fc15148e0489456f5086bbac28a8361d3452a0ddc23314afee5b6313
EIN_620988294_YEAR_2010_FORMTYPE_990.pdf


No match for largest_program_service_grants in 4a (Code ) (Expenses $ 10,424,885 including grants of $ ) (Revenue $ 1,644 )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ 6,864,383 including grants of $ ) (Revenue $ 218,386 )
No match for other_program_service_revenue in 4d Other program services (Describe in Schedule O ) (Expenses $ 11,764,040 including grants of $ 8,515,374 ) (Revenue $ )
No match for contributions_federated_campaigns in 1a
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b


14
e56d65e73cec9532561c42db4f4dc64c5b968441b4d492444292a9daf3921044
EIN_581954432_YEAR_2010_FORMTYPE_990.pdf


No match for second_largest_program_service_expenses in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_

15
68a8d4678de1d3107eff3ae0bfa1acdd6a9787a173b49c1b9e3dfbad7de5b452
EIN_474865647_YEAR_2020_FORMTYPE_990.pdf


No match for largest_program_service_revenue in 4a (Code: ) (Expenses $ 2,673,661 including grants of $ 2,656,355 ) (Revenue $ )
No match for second_largest_program_service_expenses in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule o.) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_ser

16
cd689dd466e417d074b1bde48b0928cc4ae08d6cca44be9d15f288fe13adb578
EIN_472208314_YEAR_2020_FORMTYPE_990.pdf


No match for largest_program_service_revenue in 4a (Code: ) (Expenses $ 628,015 including grants of $ 28,072 ) (Revenue $ )
No match for second_largest_program_service_expenses in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule o.) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_

17
2ef32905e24a7a69d5bb4e4ac22448b279cbe84df831d57b17daa69df0219dfd
EIN_262414132_YEAR_2009_FORMTYPE_990.pdf


No match for largest_program_service_grants in 4a (Code ) (Expenses $ 21,570,807 including grants of $ ) (Revenue $ 1,723,239 )
No match for second_largest_program_service_expenses in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_gr

18
b71782c8204cadf98ef57d1e9a6968d35368fc940dede7bc85dff661df77a27e
EIN_411601449_YEAR_2010_FORMTYPE_990.pdf


No match for largest_program_service_revenue in 4a (Code ) (Expenses $ 10,706,345 including grants of $ 6,877,759 ) (Revenue $ )
No match for second_largest_program_service_expenses in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_g

19
aefc7b65c34db330d8d9f56a1226e116b63ee9be7dbdd4ae4c7bea5d87359f97
EIN_362428692_YEAR_2009_FORMTYPE_990.pdf


No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule 0) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule 0) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other program services (Describe in Schedule 0) (Expenses $ including grants of $ ) (Revenue $ )
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in .b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in .b


20
a5a3cbfcf844be8862bbb61ad46d4c795891ab1143e420db5ed99fc79eeb66c9
EIN_271377148_YEAR_2016_FORMTYPE_990.pdf


No match for largest_program_service_grants in 4a (Code ) (Expenses $ 199,347 including grants of $ ) (Revenue $ 70 )
No match for second_largest_program_service_grants in 4b (Code: ) (Expenses $ 4,921 including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code: ) (Expenses $ 4,921 including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code: ) (Expenses $ including grants of $ ) (Revenue $ )
No match for total_program_service_expenses in https://eup.eps.irs.gov/mef/rrdprd/sdi/proxy/printSul 6/1/2018
No match for rental_expenses_real in of tax-exempt bond (i) Real
No match for rental_expenses_personal in tax-exempt bond proceeds (i) Real (ii) Personal
No match for gross_fundraising

21
67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebdb32d3186b19d6a6fd5
EIN_42103580_YEAR_2010_FORMTYPE_990.pdf


No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for contributions_federated_campaigns in 1a
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b


22
a6529e504df346097da99104a353e977426e018cd5ac33b62cd2dd89c90763c5
EIN_311002913_YEAR_2008_FORMTYPE_990.pdf


No match for largest_program_service_grants in 4a (Code: ) (Expenses $ 490,957 including grants of $ ) (Revenue $ )
No match for largest_program_service_revenue in 4a (Code: ) (Expenses $ 490,957 including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ 430,463 including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ 430,463 including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services. (Descnbe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_servi

23
d1925c2d74adaa3f150ded3ce67dfe7ae3a306f0db0289ad5755a28d801b2b0b
EIN_202408857_YEAR_2010_FORMTYPE_990.pdf


No match for largest_program_service_revenue in 4a (Code ) (Expenses $ 1,703,322 including grants of $ 1,688,327 ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ 246,600 including grants of $ 246,600 ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_revenue in 4d Other program services (Describe in Schedule O ) (Expens

24
01d89ee5d14575c1321b2e4d67431d172ba76212b4a266bdaf474275029fd78b
EIN_521830327_YEAR_2009_FORMTYPE_990.pdf


No match for total_fundraising_expenses in b Total fundraising expenses (Part IX, column (D), line 25)
No match for largest_program_service_grants in 4a (Code ) (Expenses $ 169,614. including grants of $ ) (Revenue $ 155,310. )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ 132,003 including grants of $ ) (Revenue $ 199,486. )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O.) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_grants in 4d Other program services (Describe in Schedule O.) (Expenses $ including grants of $ ) (Revenue $ )
No ma

In [8]:
# Stack the per-job results into one frame, discard the concatenated index,
# and key the rows by job_id instead.
if VALIDATE_TOP:
    output_data = (
        pd.concat(filing_rows)
        .reset_index(drop=True)
        .set_index("job_id")
    )

In [9]:
# Preview of the parsed output. Short-circuits to False when validation is off,
# because output_data is only assigned when VALIDATE_TOP is True.
VALIDATE_TOP and output_data.head()

field_name,name,address,city,state,zip,website,gross_receipts,year_formation,state_of_domicile,mission,...,total_functional_expense_fundraising,activities_per_region_totals_number_of_offices,activities_per_region_totals_number_of_employees,activities_per_region_totals_total_expenditure,total_number_recipient_foreign_orgs_listed_as_charities,total_number_other_recipient_foreign_orgs_entities,pdf_key,ein,year,filing_id
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4,MAKE WAY PARTNERS INC,PO BOX 26367,BIRMINGHAM,OX,26367,www MAKEWAYPARTNERS ORG,1426217,,,EVANGELICAL MISSIOI WORK TO PREVENT AND COMBAT...,...,4503,,,,,,EIN_760733035_YEAR_2009_FORMTYPE_990.pdf,760733035,2009,760733035_2009
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc,THE FEDERALIST SOCIETY FOR LAW AND,"1015 18TH ST., N.W. 425",WASHINGTON,DC,20036,FED-SOC.ORG,11033302,1982.0,IL,THE ORGANIZATION'S MISSION IS TO PROMOTE INTEL...,...,597789,,,,,,EIN_363235550_YEAR_2009_FORMTYPE_990.pdf,363235550,2009,363235550_2009
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b,Ron Hutchcraft Ministries Inc,PO Box 400,Harnson,AR,20400,www hutchcraft com,3390,1991.0,NJ,To communicate Christ to the lost in their lan...,...,263765,,,,,,EIN_223134995_YEAR_2010_FORMTYPE_990.pdf,223134995,2010,223134995_2010
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82,FREEDOM 4 24,21430 TIMBERLAKE ROAD STE 101,LYNCHBURG,VA,24502,,61857,2009.0,VA,Freedom 424s mission is to provide a pathway t...,...,669,,,,,,EIN_264320885_YEAR_2009_FORMTYPE_990.pdf,264320885,2009,264320885_2009
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be,GEORGIA PUBLIC POLICY FOUNDTION,,LAKE FORREST,GA,30328,www GPPF org,583209,1991.0,GA,To further goals of economic growth & individu...,...,14409,,,,,,EIN_581943161_YEAR_2009_FORMTYPE_990.pdf,581943161,2009,581943161_2009


In [10]:
def clean(x):
    """Reduce a value to its digits so extracted and expected cells can be compared.

    The value is converted with ``str``, any ``.0`` that ends a number (the
    artifact of a float conversion, e.g. ``1982.0``) is removed, and then every
    remaining non-digit character is stripped. Text-only values therefore
    collapse to ``""``, and signs, separators and decimal points are discarded.

    Parameters
    ----------
    x : any
        Cell value; it is stringified before cleaning.

    Returns
    -------
    str
        The digits of ``x`` in their original order, or ``""`` if there are none.
    """
    text = str(x)
    # Drop the ".0" float artifact first; otherwise "1982.0" would turn into
    # "19820" once the dot is stripped below.
    text = re.sub(r"\.0\b", "", text)
    # Raw string on purpose: "\D" in a plain literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    return re.sub(r"\D", "", text)

In [11]:
def compare_output(to_validate, to_compare, col):
    """Return the entries that differ between two frames for one label.

    ``col`` is looked up with ``.loc`` on both ``to_validate`` and
    ``to_compare``; the two selections are compared element-wise and only the
    entries that are not equal are kept.

    Returns a DataFrame with an ``extracted`` column (values taken from
    ``to_validate``) and an ``expected`` column (values taken from
    ``to_compare``). It is empty when the selections agree everywhere.
    """
    extracted = to_validate.loc[col]
    expected = to_compare.loc[col]
    # Inequality is symmetric, so a single mask selects the same entries on
    # both sides.
    differs = extracted != expected
    mismatches = {
        "extracted": extracted.loc[differs],
        "expected": expected.loc[differs],
    }
    return pd.DataFrame(mismatches)

In [12]:
if VALIDATE_TOP:
    # Key both tables by PDF file name and reduce every cell to its digits
    # (see `clean`) so the comparison ignores formatting differences.
    to_compare = validation_data.set_index("pdf_key").applymap(clean)
    to_validate = (
        output_data[validation_data.columns]
        .set_index("pdf_key")
        .applymap(clean)
    )

    separator = "-" * 79
    # Each index label (`col`) is one PDF key; report only the PDFs that have
    # at least one mismatching field.
    for col in to_validate.index:
        validated = compare_output(to_validate, to_compare, col)
        if not validated.any().any():
            continue
        print(col)
        print(f"{validated.shape[0]} mismatched items.")
        print(validated)
        print(separator)

EIN_611190087_YEAR_2008_FORMTYPE_990.pdf
1 mismatched items.
                        extracted expected
total_revenue_unrelated                  0
-------------------------------------------------------------------------------
EIN_113489123_YEAR_2008_FORMTYPE_990.pdf
3 mismatched items.
                    extracted expected
total_revenue            1017  1017506
travel_total              137    13710
total_revenue_total   1011506  1017506
-------------------------------------------------------------------------------
EIN_582248383_YEAR_2009_FORMTYPE_990.pdf
1 mismatched items.
                                                   extracted expected
total_number_other_recipient_foreign_orgs_entities        10         
-------------------------------------------------------------------------------
EIN_271377148_YEAR_2016_FORMTYPE_990.pdf
4 mismatched items.
                                 extracted expected
program_service_revenue_2a_total    900099       70
total_revenue_total           

In [18]:
if VALIDATE_TOP:
    # Deliberate stop: when validation mode is on, raise here so that a
    # "Run All" does not continue into the cells below. An explicit raise is
    # used instead of `assert False` so the stop still happens when assertions
    # are disabled and the traceback says why it stopped.
    raise AssertionError("Stopped because VALIDATE_TOP is True.")

AssertionError: 

In [28]:
# Load the data for a single job ID from the bucket for manual debugging.
# NOTE(review): this ID appears as the "FREEDOM 4 24" row in the output table
# displayed earlier in the notebook — confirm it is still the intended filing.
table_test_df = open_df(bucket, "4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82")

In [30]:
# Split the loaded rows by block type: LINE rows and WORD rows are passed to
# the parser separately in the cells below.
block_type = table_test_df["BlockType"]
test_lines = table_test_df.loc[block_type == "LINE"]
test_words = table_test_df.loc[block_type == "WORD"]
# LINE rows grouped by their "Page" value.
test_pages = test_lines.groupby("Page")

In [31]:
# Build the page map from the LINE rows, then the roadmap from the lines, the
# roadmap definitions and that page map.
page_map = find_pages(test_lines)
roadmap = create_roadmap(
    test_lines, roadmap_df, page_map
)

# Extract values for this filing using the roadmap and the extractor
# definitions.
row = extract_from_roadmap(
    test_words, test_lines, roadmap, extractor_df, page_map
)
# NOTE(review): the two string arguments look like placeholder values for this
# debug run — confirm what postprocess expects in those positions.
row = postprocess(row, "foo_2_3_4_5", "bar_2_3_4_5", clean_filing)
# Spot check: display one extracted field from the first row.
row.iloc[0]["total_program_service_expenses"]

No match for website in J Website: freedom424 org
No match for largest_program_service_grants in 4a (Code ) (Expenses $ 26,032 including grants of $ ) (Revenue $ 46,117 )
No match for second_largest_program_service_expenses in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_grants in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for second_largest_program_service_revenue in 4b (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_expenses in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_grants in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for third_largest_program_service_revenue in 4c (Code ) (Expenses $ including grants of $ ) (Revenue $ )
No match for other_program_service_expenses in 4d Other program services (Describe in Schedule O ) (Expenses $ including grants of $ ) (Revenu

'26032'

In [45]:
# Recompute the page map on its own (same call as in the roadmap cell above)
# so it can be inspected in the next cell.
page_map = find_pages(test_lines)

In [46]:
page_map

{'Page 1': 2,
 'Page 3': 3,
 'Page 9': 10,
 'Page 10': 11,
 'Schedule F, Page 1': 0,
 'Schedule F, Page 2': 0}

In [20]:
# Build the extractors so an individual one can be inspected below.
# Relies on `roadmap` and `page_map` from the cells above having been run.
extractors = create_extractors(extractor_df, roadmap, page_map)

In [26]:
# 35 is a positional index; in the saved output it selects the
# 'largest_program_service_revenue' extractor.
# NOTE(review): the position may shift if the extractor definitions change.
extractors.iloc[35]

Extractor(name='largest_program_service_revenue', strategy='words', page=3, bounding_box=BoundingBox(left=0.06653264164924622, left_delta=0.0, top=0.29883649945259094, top_delta=0.01, right=1.0, right_delta=0.0, bottom=0.29883649945259094, bottom_delta=0.01), regex=re.compile('Revenue\\s+\\$?\\s*(?P<match>[iIl]{0,3},?\\d{1,3}\\S+)'))

In [18]:
# Keep only the LINE rows whose "Page" value is 3.
page_3 = test_lines.loc[test_lines["Page"] == 3]

In [19]:
# Find the page-3 lines whose text contains the amount being debugged.
page_3.loc[page_3["Text"].str.contains("129,753")]

Unnamed: 0_level_0,BlockType,Confidence,Geometry,Page,PageClassification,Relationships,Text,TextType,Polygon,Height,Left,Top,Right,Bottom,Midpoint_X,Midpoint_Y,Width,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4ad7ac40-3b8a-488c-9a69-e03cda103a9e,LINE,96.448097,{'BoundingBox': {'Height': 0.01150413043797016...,3,,[{'Ids': ['b941f319-4f48-4cd9-a902-999d8197e9f...,"129,753. )",,"[{'X': 0.8282544016838074, 'Y': 0.299269586801...",0.011504,0.828254,0.29927,0.897263,0.310774,0.862758,0.305022,0.069008,"[b941f319-4f48-4cd9-a902-999d8197e9f8, 5357c82...",33,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...


In [51]:
# WORD rows whose "Page" value is 10.
# NOTE(review): the variable says "page 9" while the filter uses 10; the page
# map displayed above has 'Page 9': 10, so this is presumably form page 9 —
# confirm for the filing currently loaded.
page_9_words = test_words.loc[test_words["Page"] == 10]

In [52]:
# Find the words on that page whose text contains "741".
page_9_words.loc[page_9_words["Text"].str.contains("741")]

Unnamed: 0_level_0,BlockType,Confidence,Geometry,Page,PageClassification,Relationships,Text,TextType,Polygon,Height,Left,Top,Right,Bottom,Midpoint_X,Midpoint_Y,Width,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
eb0b2210-a80d-4814-b125-503f93340140,WORD,99.685066,{'BoundingBox': {'Height': 0.00659855129197239...,10,,,5741,PRINTED,"[{'X': 0.7446264028549194, 'Y': 0.903555333614...",0.006599,0.744626,0.903555,0.772291,0.910154,0.758459,0.906855,0.027665,,91,a2061356d7999388cbd49b79872883c92ce6c81a7e7820...
35fddd8a-c61e-42fb-a004-92e5ec77ad0a,WORD,99.644402,{'BoundingBox': {'Height': 0.00675594061613082...,10,,,5741,PRINTED,"[{'X': 0.5144724249839783, 'Y': 0.903552949428...",0.006756,0.514472,0.903553,0.543597,0.910309,0.529035,0.906931,0.029124,,91,a2061356d7999388cbd49b79872883c92ce6c81a7e7820...


In [20]:
# Display the result of sort_words on the WORD rows, ordered by "Page".
sort_words(test_words).sort_values("Page")

Unnamed: 0_level_0,BlockType,Confidence,Geometry,Page,PageClassification,Relationships,Text,TextType,Polygon,Height,Left,Top,Right,Bottom,Midpoint_X,Midpoint_Y,Width,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
46ee2dba-1fae-4674-829c-d1da7d794abe,WORD,99.896782,"{'BoundingBox': {'Height': 0.1651940941810608,...",1,,,Identity,PRINTED,"[{'X': 0.38765281438827515, 'Y': 0.62289738655...",0.165194,0.387653,0.622897,0.453978,0.788091,0.420815,0.705494,0.066325,,66,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...
a87fe50c-ab58-45f5-a5a8-ff4916fb6ddc,WORD,99.994255,{'BoundingBox': {'Height': 0.12260233610868454...,1,,,to,PRINTED,"[{'X': 0.44995108246803284, 'Y': 0.43935668468...",0.122602,0.449951,0.439357,0.467966,0.561959,0.458959,0.500658,0.018015,,48,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...
d671fed6-0dee-488e-9515-622a0bdb2a96,WORD,99.633995,{'BoundingBox': {'Height': 0.13367076218128204...,1,,,1-800-908-4490,PRINTED,"[{'X': 0.6079996228218079, 'Y': 0.625898420810...",0.133671,0.608000,0.625898,0.754668,0.759569,0.681334,0.692734,0.146669,,66,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...
6a302a63-b051-46aa-86f3-360798fad12c,WORD,99.969673,{'BoundingBox': {'Height': 0.16924890875816345...,1,,,Report,PRINTED,"[{'X': 0.2156895399093628, 'Y': 0.428333848714...",0.169249,0.215690,0.428334,0.277392,0.597583,0.246541,0.512958,0.061702,,47,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...
d87939ea-860e-42a1-ba61-5c4055ba6ddf,WORD,99.641563,{'BoundingBox': {'Height': 0.13529200851917267...,1,,,Problems,PRINTED,"[{'X': 0.35812556743621826, 'Y': 0.42916509509...",0.135292,0.358126,0.429165,0.444617,0.564457,0.401371,0.496811,0.086491,,47,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76d67158-c5b5-47db-9fd1-f7e554b7be04,WORD,97.489304,{'BoundingBox': {'Height': 0.03425183147192001...,38,,,26888.,PRINTED,"[{'X': 0.20950397849082947, 'Y': 0.44076749682...",0.011811,0.559233,0.209504,0.593484,0.221315,0.576358,0.215409,0.034252,,45,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...
28555142-5b0a-4509-820f-8fe759e5714a,WORD,98.065315,{'BoundingBox': {'Height': 0.01903511583805084...,38,,,247.,PRINTED,"[{'X': 0.22905126214027405, 'Y': 0.48682674765...",0.011040,0.513173,0.229051,0.532208,0.240092,0.522691,0.234571,0.019035,,51,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...
83011c2c-7317-45c5-801f-33f75c82cd48,WORD,96.199333,{'BoundingBox': {'Height': 0.03451774269342422...,38,,,75073.,PRINTED,"[{'X': 0.20958559215068817, 'Y': 0.50240594148...",0.012060,0.497594,0.209586,0.532112,0.221646,0.514853,0.215616,0.034518,,51,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...
b7ab68c3-e218-4cf7-85ff-d3e9a44e3c0c,WORD,97.550308,{'BoundingBox': {'Height': 0.03448380529880524...,38,,,77174.,PRINTED,"[{'X': 0.2685639262199402, 'Y': 0.373663336038...",0.012357,0.626337,0.268564,0.660820,0.280921,0.643579,0.274742,0.034484,,37,bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a3...


In [30]:
# Display the WORD rows for page 1: `groups[1]` holds the index labels of the
# rows grouped under Page == 1, which are then looked up with .loc.
test_words.loc[test_words.groupby("Page").groups[1]]

Unnamed: 0_level_0,BlockType,Confidence,Geometry,Page,PageClassification,Relationships,Text,TextType,Polygon,Height,Left,Top,Right,Bottom,Midpoint_X,Midpoint_Y,Width,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
68b570df-accc-4297-abd6-e31678993f57,WORD,99.633995,{'BoundingBox': {'Height': 0.13367076218128204...,1,,,1-800-908-4490,PRINTED,"[{'X': 0.6079996228218079, 'Y': 0.625898420810...",0.133671,0.608,0.625898,0.754668,0.759569,0.681334,0.692734,0.146669,,67,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
cf1ffd57-b9f2-4a25-9c19-662741810e54,WORD,99.990837,"{'BoundingBox': {'Height': 0.1280379742383957,...",1,,,at,PRINTED,"[{'X': 0.5828642249107361, 'Y': 0.629679083824...",0.128038,0.582864,0.629679,0.600989,0.757717,0.591926,0.693698,0.018124,,67,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
a9ac16e3-5840-4ecf-a2f9-4eb9890f6834,WORD,99.920433,{'BoundingBox': {'Height': 0.13315647840499878...,1,,,Hotline,PRINTED,"[{'X': 0.5137181878089905, 'Y': 0.622949957847...",0.133156,0.513718,0.62295,0.577148,0.756106,0.545433,0.689528,0.06343,,67,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
abb4a9f3-ce9e-4fe0-bf06-a60eabb3489b,WORD,99.948036,{'BoundingBox': {'Height': 0.13557077944278717...,1,,,Theft,PRINTED,"[{'X': 0.4585777819156647, 'Y': 0.622158050537...",0.135571,0.458578,0.622158,0.507138,0.757729,0.482858,0.689943,0.04856,,67,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
dea6f18f-4965-4132-b0d9-39b7ab6effcb,WORD,99.896782,"{'BoundingBox': {'Height': 0.1651940941810608,...",1,,,Identity,PRINTED,"[{'X': 0.38765281438827515, 'Y': 0.62289738655...",0.165194,0.387653,0.622897,0.453978,0.788091,0.420815,0.705494,0.066325,,67,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
c4463032-27d9-468a-b3ab-8368b4563a33,WORD,99.893997,{'BoundingBox': {'Height': 0.13539275527000427...,1,,,IRS,PRINTED,"[{'X': 0.3474552035331726, 'Y': 0.622128844261...",0.135393,0.347455,0.622129,0.380315,0.757522,0.363885,0.689825,0.03286,,67,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
553f8070-2680-4a98-a341-3dc481ded6b2,WORD,99.949875,"{'BoundingBox': {'Height': 0.1349395215511322,...",1,,,call,PRINTED,"[{'X': 0.2740101218223572, 'Y': 0.623206555843...",0.13494,0.27401,0.623207,0.305,0.758146,0.289505,0.690676,0.03099,,67,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
cf849225-61d8-446e-ba35-e442a4db92b3,WORD,99.885559,"{'BoundingBox': {'Height': 0.1330478936433792,...",1,,,Or,PRINTED,"[{'X': 0.24524500966072083, 'Y': 0.62238115072...",0.133048,0.245245,0.622381,0.269018,0.755429,0.257131,0.688905,0.023773,,67,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
c0c472c5-eeed-4b6b-b7a3-8ed1a462c9c4,WORD,98.678192,{'BoundingBox': {'Height': 0.16821227967739105...,1,,,https://public.resource.org/privacy,PRINTED,"[{'X': 0.4742083251476288, 'Y': 0.429521828889...",0.168212,0.474208,0.429522,0.785567,0.597734,0.629888,0.513628,0.311359,,49,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
b6831991-7e8a-4cce-93ab-19499467be9e,WORD,99.994255,{'BoundingBox': {'Height': 0.12260233610868454...,1,,,to,PRINTED,"[{'X': 0.44995108246803284, 'Y': 0.43935668468...",0.122602,0.449951,0.439357,0.467966,0.561959,0.458959,0.500658,0.018015,,50,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...
