# Parsing Debug Notebook

This somewhat messy notebook makes it easier to debug the parser, because we can just rerun the cells needed to set up the debugging process with various parameters.

In [1]:
import json
import math
import re
from pathlib import Path

import boto3
import pandas as pd

from parse_990_textract.bucket import open_df
from parse_990_textract.filing import create_roadmap, extract_from_roadmap
from parse_990_textract.models import BoundingBox, TableExtractor
from parse_990_textract.parse import create_extractors, find_item, find_pages
from parse_990_textract.postprocessing import clean_filing, clean_f_i, clean_f_ii, clean_f_iii, postprocess
from parse_990_textract.setup import load_extractor_df
from parse_990_textract.table import extract_table_data, find_table_pages, create_tablemap
from parse_990_textract.utils import get_coordinate, get_regex, cluster_words, columnize, cluster_x

In [2]:
# S3 bucket handle; passed to open_df() in later cells to load each job's data.
bucket = boto3.resource("s3").Bucket("s3-ocr-990s-demo")

Setting `VALIDATE_TOP` to `True` will parse all 25 validation PDFs and compare non-Schedule F output to the validation data. If set to `False`, we run the test code for Schedule F instead.

In [3]:
# Master switch for the validation cells: it gates loading the validation data,
# the parse loop, the comparison, and the assert that halts the notebook afterwards.
VALIDATE_TOP = True

In [4]:
if VALIDATE_TOP:
    # Expected values, indexed by job_id; missing cells are replaced with "".
    validation_data = pd.read_csv("validation_data.csv", index_col="job_id").fillna("")
    # Jupyter only echoes a bare expression when it is the last top-level
    # statement of a cell; nested inside this `if` it would be silently
    # discarded, so the preview is displayed explicitly.
    display(validation_data.head())

In [5]:
# Main-form configuration, loaded through the project's load_extractor_df helper.
extractor_df = load_extractor_df("parse_data/990_extractors.csv")
roadmap_df = load_extractor_df("parse_data/990_roadmap.csv")
# Schedule F table configuration. The tablemap goes through load_extractor_df,
# while the table/row extractor CSVs are read as plain DataFrames.
schedule_f_tablemap_df = load_extractor_df("parse_data/schedule_f_table_roadmap.csv")
schedule_f_table_extractor_df = pd.read_csv("parse_data/schedule_f_table_extractors.csv")
schedule_f_row_extractor_df = pd.read_csv("parse_data/schedule_f_row_extractors.csv")

In [6]:
# Regex patterns handed to extract_table_data() for each Schedule F part.
# Each *_HEADER pattern is an alternation of column-label prefixes such as
# "(a) Region"; `\s*` lets a label match with or without whitespace after the
# column letter.
# The "(f)" alternative of PART_I_HEADER previously lacked `\s*`, so
# "(f) Total expenditures" (with a space) could not match it; it now follows
# the same convention as every other alternative.
# NOTE(review): re-run the validation cells after this change to confirm the
# extra header match does not shift Part I table detection.
PART_I_HEADER = r"\(a\)\s*Region|\(d\)\s*Activities|\(e\)\s*If activity|\(f\)\s*Total expenditures"
PART_II_HEADER = r"\(b\)\s*IRS code|\(c\)\s*Region|\(d\)\s*Purpose|\(f\)\s*Manner|\(h\)\s*Description"
PART_III_HEADER = r"\(b\)\s*Region|\(e\)\s*Manner of cash|\(h\)\s*Method of va"
# Table titles, passed to extract_table_data() alongside the header patterns.
PART_I_TABLE_NAME = r"Activities per Region"
PART_II_TABLE_NAME = r"Grants to Organizations Outside the United States"
PART_III_TABLE_NAME = r"Grants to Individuals Outside the United States"

In [7]:
# Result accumulators: one entry per parsed filing, plus one list per
# Schedule F part for the tables that survive post-processing.
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []

# With validation switched off there is nothing to iterate over.
values = validation_data.index.values if VALIDATE_TOP else []

for i, job_id in enumerate(values):
    # Progress trace: position and job id, then the PDF the job belongs to.
    print(i, job_id, sep="\n")
    pdf_key = validation_data.at[job_id, "pdf_key"]
    print(pdf_key)

    # Load the job's blocks and split them into LINE and WORD rows.
    data = open_df(bucket, job_id)
    lines = data.loc[data["BlockType"] == "LINE"]
    words = data.loc[data["BlockType"] == "WORD"]

    # Main form: find the pages, build the roadmap, extract one row, clean it.
    page_map = find_pages(lines)
    roadmap = create_roadmap(lines, roadmap_df, page_map)
    row = extract_from_roadmap(words, lines, roadmap, extractor_df, page_map)
    row = postprocess(row, job_id, pdf_key, clean_filing)
    filing_rows.append(row)

    # Schedule F: every part is extracted, post-processed, and only kept when
    # post-processing returns something other than None.
    pages = lines.groupby("Page")

    part_i_table = extract_table_data(
        pages, lines, words, PART_I_HEADER, PART_I_TABLE_NAME,
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    part_i_table = postprocess(part_i_table, job_id, pdf_key, clean_f_i)
    if part_i_table is not None:
        schedule_f_part_i_rows.append(part_i_table)

    part_ii_table = extract_table_data(
        pages, lines, words, PART_II_HEADER, PART_II_TABLE_NAME,
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    part_ii_table = postprocess(part_ii_table, job_id, pdf_key, clean_f_ii)
    if part_ii_table is not None:
        schedule_f_part_ii_rows.append(part_ii_table)

    part_iii_table = extract_table_data(
        pages, lines, words, PART_III_HEADER, PART_III_TABLE_NAME,
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    part_iii_table = postprocess(part_iii_table, job_id, pdf_key, clean_f_iii)
    if part_iii_table is not None:
        schedule_f_part_iii_rows.append(part_iii_table)

0
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4
EIN_760733035_YEAR_2009_FORMTYPE_990.pdf


No match for year_formation in L Year of Formation M State of legal domicile
No match for state_of_domicile in M State of legal domicile
No match for contributions_federated_campaigns in 1a
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b
No match for misc_revenue_total_total in $
No match for total_revenue_unrelated in Form


1
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc
EIN_363235550_YEAR_2009_FORMTYPE_990.pdf


No match for total_unrelated_biz_revenue in 7a NONE
No match for net_unrelated_biz_taxable_revenue in 7b NONE
No match for benefits_paid_members_expenses in NONE
No match for professional_fundraising_fees_expenses in NONE
No match for investment_tax_exempt_bonds_total in NONE
No match for royalties_total in NONE
No match for net_rental_income_total in NONE
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for fundraising_net_income_total in NONE
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b
No match for gaming_net_income_total in NONE
No match for gross_income_sales in a
No match for direct_expenses_sales in b
No match for sales_net_income_total in NONE
No match for total_revenue_unrelated in Form
No match for grants_foreign_individuals_govt_orgs_total in NONE
No match for benefits_to_members_total in NONE
No match for compensation_disq_persons_total in NONE
No match for fees_for_mgmt_services_total in NON

2
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b
EIN_223134995_YEAR_2010_FORMTYPE_990.pdf


No match for contributions_federated_campaigns in 1a
No match for direct_expenses_fundraising in a b


3
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82
EIN_264320885_YEAR_2009_FORMTYPE_990.pdf


No match for website in J Website: freedom424 org
No match for contributions_federated_campaigns in 1a


4
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be
EIN_581943161_YEAR_2009_FORMTYPE_990.pdf


No match for contributions_federated_campaigns in 1a
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for direct_expenses_gaming in b events a b
No match for total_revenue_unrelated in Form


5
5596f51a999ebbd4cb992f490ceaffcddbac9bce532b911997683ae6897c2797
EIN_231352689_YEAR_2009_FORMTYPE_990.pdf


No match for website in J Web site: wts edu www
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in .b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in .b
No match for total_revenue_unrelated in Form


6
481a62c75776cc7f62075c1d60c230ea01f2dbc5b6224c65cb73db1728e86b00
EIN_954806856_YEAR_2009_FORMTYPE_990.pdf


No match for contributions_federated_campaigns in 1a
No match for total_revenue_unrelated in Form


7
39d270117e4b6354850551c8237421403da7d2b3f5d06657a28f5a9d07febf17
EIN_521238301_YEAR_2010_FORMTYPE_990.pdf


No match for net_unrelated_biz_taxable_revenue in 7b
No match for contributions_noncash in $
No match for gross_sales_securities in (i)
No match for gross_sales_other in (i) (ii)
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b
No match for gross_income_sales in a
No match for direct_expenses_sales in b
No match for total_number_recipient_foreign_orgs_listed_as_charities in which for or


8
9a69ea9d5c8d5397603dbb77d027f4ff4ef9dcdc6609b7a1657f9484529353a0
EIN_521703065_YEAR_2010_FORMTYPE_990.pdf


No match for gross_sales_securities in of (I) Securities
No match for gross_sales_other in (I) Securities (II) Other
No match for gross_income_gaming in See a
No match for direct_expenses_gaming in b
No match for gross_income_sales in a
No match for direct_expenses_sales in b


9
3b9cc21f73f11bef5f2e4c15859661f7f7c9adee8dd6a1f2301099834fdb2926
EIN_611190087_YEAR_2008_FORMTYPE_990.pdf


No match for mission in 1 Briefly describe the organization's mission o most significant activities
No match for contributions_noncash in $
No match for gross_sales_securities in sales of (i) Sec inties inventory
No match for gross_sales_other in (i) Sec inties (II) Other
No match for gross_fundraising_income in a b ents a
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b
No match for gross_income_sales in a b Business Codo


10
82588f1ce9ca8cec2fc6b55ec07361b821be6650d1c01db4b9c948b8bf12689f
EIN_113489123_YEAR_2008_FORMTYPE_990.pdf


No match for website in J Website: H(c)
No match for gross_receipts in G Gross receipts $
No match for year_formation in L Year of formation. M State of legal domicile:
No match for state_of_domicile in M State of legal domicile:
No match for total_unrelated_biz_revenue in 7a
No match for net_unrelated_biz_taxable_revenue in 7b
No match for total_fundraising_expenses in b Total fundraising expenses (Part IX, column (D), line 25)
No match for contributions_noncash in $
No match for gross_sales_securities in of (i) Securities
No match for gross_sales_other in Securities (ii) Other
No match for cost_securities in basis
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for direct_expenses_gaming in a b
No match for gross_income_sales in a
No match for direct_expenses_sales in b
No match for total_revenue_unrelated in Form


11
f303b69e79844240beccf4fc5b3cecaa3a4f3024a955b2f4a49dd0a81b456649
EIN_582248383_YEAR_2009_FORMTYPE_990.pdf


No match for gross_sales_securities in of (I) Securities
No match for gross_sales_other in (I) Securities (II) Other
No match for gross_fundraising_income in (not a
No match for direct_expenses_fundraising in b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b
No match for gross_income_sales in a
No match for direct_expenses_sales in b


12
a2061356d7999388cbd49b79872883c92ce6c81a7e7820788f92db496cedd620
EIN_630329409_YEAR_2009_FORMTYPE_990.pdf


No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in .b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in .b
No match for gross_income_sales in a
No match for direct_expenses_sales in b


13
6e417b42fc15148e0489456f5086bbac28a8361d3452a0ddc23314afee5b6313
EIN_620988294_YEAR_2010_FORMTYPE_990.pdf


No match for contributions_federated_campaigns in 1a


14
e56d65e73cec9532561c42db4f4dc64c5b968441b4d492444292a9daf3921044
EIN_581954432_YEAR_2010_FORMTYPE_990.pdf


No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for direct_expenses_gaming in b events a b


15
68a8d4678de1d3107eff3ae0bfa1acdd6a9787a173b49c1b9e3dfbad7de5b452
EIN_474865647_YEAR_2020_FORMTYPE_990.pdf


No match for gross_rents_real in 6a
No match for rental_expenses_real in 6b
No match for rental_income_real in 6c
No match for gross_sales_securities in 7a
No match for cost_securities in 7b
No match for gain_securities in 7c
No match for gross_fundraising_income in 8a
No match for direct_expenses_fundraising in 8b
No match for gross_income_gaming in 9a
No match for direct_expenses_gaming in 9b
No match for gross_income_sales in 10a
No match for direct_expenses_sales in 10b


16
cd689dd466e417d074b1bde48b0928cc4ae08d6cca44be9d15f288fe13adb578
EIN_472208314_YEAR_2020_FORMTYPE_990.pdf


No match for cost_securities in 7b
No match for gross_fundraising_income in 8a
No match for direct_expenses_fundraising in 8b
No match for gross_income_gaming in 9a
No match for direct_expenses_gaming in 9b
No match for gross_income_sales in 10a
No match for direct_expenses_sales in 10b


17
2ef32905e24a7a69d5bb4e4ac22448b279cbe84df831d57b17daa69df0219dfd
EIN_262414132_YEAR_2009_FORMTYPE_990.pdf


No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for direct_expenses_gaming in b events a b
No match for total_revenue_unrelated in Form


18
b71782c8204cadf98ef57d1e9a6968d35368fc940dede7bc85dff661df77a27e
EIN_411601449_YEAR_2010_FORMTYPE_990.pdf


No match for contributions_federated_campaigns in 1a
No match for total_revenue_unrelated in Form
No match for activities_per_region_totals_number_of_offices in O


19
aefc7b65c34db330d8d9f56a1226e116b63ee9be7dbdd4ae4c7bea5d87359f97
EIN_362428692_YEAR_2009_FORMTYPE_990.pdf


No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in .b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in .b


20
a5a3cbfcf844be8862bbb61ad46d4c795891ab1143e420db5ed99fc79eeb66c9
EIN_271377148_YEAR_2016_FORMTYPE_990.pdf


No match for rental_expenses_real in of tax-exempt bond (i) Real
No match for rental_expenses_personal in tax-exempt bond proceeds (i) Real (ii) Personal
No match for gross_fundraising_income in (ii) Other


21
67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebdb32d3186b19d6a6fd5
EIN_42103580_YEAR_2010_FORMTYPE_990.pdf


No match for contributions_federated_campaigns in 1a


22
a6529e504df346097da99104a353e977426e018cd5ac33b62cd2dd89c90763c5
EIN_311002913_YEAR_2008_FORMTYPE_990.pdf


No match for contributions_noncash in $
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in b
No match for gross_income_sales in a
No match for direct_expenses_sales in b


23
d1925c2d74adaa3f150ded3ce67dfe7ae3a306f0db0289ad5755a28d801b2b0b
EIN_202408857_YEAR_2010_FORMTYPE_990.pdf


No match for contributions_noncash in $
No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in b
No match for direct_expenses_gaming in b events See b
No match for activities_per_region_totals_number_of_offices in Act the


24
01d89ee5d14575c1321b2e4d67431d172ba76212b4a266bdaf474275029fd78b
EIN_521830327_YEAR_2009_FORMTYPE_990.pdf


No match for total_fundraising_expenses in b Total fundraising expenses (Part IX, column (D), line 25)
No match for gross_sales_securities in (I) Securities of
No match for gross_sales_other in (I) Securities (ii) Other
No match for gross_fundraising_income in (not a
No match for direct_expenses_fundraising in b
No match for gross_income_gaming in See a
No match for direct_expenses_gaming in b
No match for gross_income_sales in a
No match for direct_expenses_sales in b


In [8]:
if VALIDATE_TOP:
    # Stack the per-filing rows into one frame, discard the per-row index,
    # and key the result by job_id.
    output_data = (
        pd.concat(filing_rows)
        .reset_index(drop=True)
        .set_index("job_id")
    )

In [9]:
# `and` short-circuits: the cell shows False when validation is off,
# otherwise the first rows of the parsed output.
VALIDATE_TOP and output_data.head()

field_name,name,address,city,state,zip,website,gross_receipts,year_formation,state_of_domicile,mission,...,total_functional_expense_fundraising,activities_per_region_totals_number_of_offices,activities_per_region_totals_number_of_employees,activities_per_region_totals_total_expenditure,total_number_recipient_foreign_orgs_listed_as_charities,total_number_other_recipient_foreign_orgs_entities,pdf_key,ein,year,filing_id
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4,MAKE WAY PARTNERS INC,PO BOX 26367,BIRMINGHAM,OX,26367,www MAKEWAYPARTNERS ORG,1426217,,,EVANGELICAL MISSIOI WORK TO PREVENT AND COMBAT...,...,4503,,,,,,EIN_760733035_YEAR_2009_FORMTYPE_990.pdf,760733035,2009,760733035_2009
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc,THE FEDERALIST SOCIETY FOR LAW AND,"1015 18TH ST., N.W. 425",WASHINGTON,DC,20036,FED-SOC.ORG,11033302,1982.0,IL,THE ORGANIZATION'S MISSION IS TO PROMOTE INTEL...,...,597789,,,,,,EIN_363235550_YEAR_2009_FORMTYPE_990.pdf,363235550,2009,363235550_2009
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b,Ron Hutchcraft Ministries Inc,PO Box 400,Harnson,AR,20400,www hutchcraft com,3390,1991.0,NJ,To communicate Christ to the lost in their lan...,...,263765,,,,,,EIN_223134995_YEAR_2010_FORMTYPE_990.pdf,223134995,2010,223134995_2010
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82,FREEDOM 4 24,21430 TIMBERLAKE ROAD STE 101,LYNCHBURG,VA,24502,,61857,2009.0,VA,Freedom 424s mission is to provide a pathway t...,...,669,,,,,,EIN_264320885_YEAR_2009_FORMTYPE_990.pdf,264320885,2009,264320885_2009
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be,GEORGIA PUBLIC POLICY FOUNDTION,6100 LAKE FORREST DR,LAKE FORREST,GA,30328,www GPPF org,583209,1991.0,GA,To further goals of economic growth & individu...,...,14409,,,,,,EIN_581943161_YEAR_2009_FORMTYPE_990.pdf,581943161,2009,581943161_2009


In [10]:
def clean(x):
    """Reduce a value to its digits so extracted and expected values can be compared.

    The value is converted to ``str``, any ``.0`` that is followed by a word
    boundary is removed (the float artefact in values such as ``1982.0``),
    and every remaining non-digit character is dropped.

    Signs, decimal points, separators and all letters are discarded, so a
    purely textual value reduces to the empty string.

    Args:
        x: Any value; it is stringified before cleaning.

    Returns:
        A string containing only the digits that remain.
    """
    text = str(x)
    text = re.sub(r"\.0\b", "", text)
    # Raw string: "\D" in a plain literal is an invalid escape sequence, which
    # newer Python versions warn about and will eventually reject.
    return re.sub(r"\D", "", text)

In [11]:
def compare_output(to_validate, to_compare, col):
    """Return the fields on which one record differs between two frames.

    Despite its name, ``col`` is used as a row label: the row ``col`` is
    taken from each frame with ``.loc`` and the two rows are compared
    element by element.

    Args:
        to_validate: Frame holding the extracted values.
        to_compare: Frame holding the expected values.
        col: Row label present in both frames.

    Returns:
        A DataFrame with one row per differing field and two columns,
        ``extracted`` and ``expected``.
    """
    extracted_row = to_validate.loc[col]
    expected_row = to_compare.loc[col]

    # Each side is filtered with its own mask, the same way around as the
    # inequality it is built from.
    extracted_diff = extracted_row.loc[extracted_row != expected_row]
    expected_diff = expected_row.loc[expected_row != extracted_row]

    return pd.DataFrame({"extracted": extracted_diff, "expected": expected_diff})

In [12]:
if VALIDATE_TOP:
    # Key both frames by pdf_key and reduce every cell to its digits with
    # clean(); the extracted frame is limited to the validation columns.
    to_compare = validation_data.set_index("pdf_key").applymap(clean)
    to_validate = (
        output_data[validation_data.columns]
        .set_index("pdf_key")
        .applymap(clean)
    )

    # `col` holds a row label (a pdf_key), not a column name.
    for col in to_validate.index:
        validated = compare_output(to_validate, to_compare, col)
        if not validated.any().any():
            continue
        # Report block: key, mismatch count, the mismatches, then a rule.
        print(
            col,
            f"{validated.shape[0]} mismatched items.",
            validated,
            "-" * 79,
            sep="\n",
        )

EIN_113489123_YEAR_2008_FORMTYPE_990.pdf
3 mismatched items.
                    extracted expected
total_revenue            1017  1017506
travel_total              137    13710
total_revenue_total   1011506  1017506
-------------------------------------------------------------------------------
EIN_582248383_YEAR_2009_FORMTYPE_990.pdf
2 mismatched items.
                                                   extracted expected
total_number_other_recipient_foreign_orgs_entities        10         
program_service_revenue_2a_label                           2         
-------------------------------------------------------------------------------
EIN_271377148_YEAR_2016_FORMTYPE_990.pdf
3 mismatched items.
                                 extracted expected
program_service_revenue_2a_total    900099       70
total_revenue_total                          246927
total_revenue_unrelated                           0
-------------------------------------------------------------------------------
EIN

In [13]:
# First 50 rows of the combined Schedule F Part I tables (shows False when validation is off).
VALIDATE_TOP and pd.concat(schedule_f_part_i_rows).head(50)

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures,job_id,pdf_key,ein,year,filing_id
0,Europe,1.0,0.0,theological training,offers a Th M degree,6661,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...,EIN_231352689_YEAR_2009_FORMTYPE_990.pdf,231352689,2009,231352689_2009
1,Totals,1.0,0.0,,,6661,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...,EIN_231352689_YEAR_2009_FORMTYPE_990.pdf,231352689,2009,231352689_2009
0,(1),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
1,(2),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
2,(3),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
3,(4),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
4,(5),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
5,(6),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
6,(7),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
7,(8),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010


In [14]:
# Last 50 rows of the combined Schedule F Part I tables (shows False when validation is off).
VALIDATE_TOP and pd.concat(schedule_f_part_i_rows).tail(50)

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures,job_id,pdf_key,ein,year,filing_id
25,Europe :luding Icelan & Greenland,,,vestments,,,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
26,Middle East and North Africa,1.0,2.0,Grantmaking,,698609.0,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
27,Middle East and North Africa,,,Progr services,Service centers,26658.0,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
28,Middle East and North Africa,,,Progr services,Academic support,31354.0,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
29,Middle East and North Africa,,,Programservices,Institutional support,7637.0,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
30,Middle East and North Africa,,,Progr Services,Instruction,674753.0,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
31,Middle East and North Africa,,,Progr Services,Research other academicactivity,325683.0,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
32,Middle East and North Africa,,,Progr Services,Student Services,80648.0,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
33,Middle East and North Africa,,,Investment,,,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010
34,North merica,,,Fundraising,,33431.0,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,EIN_42103580_YEAR_2010_FORMTYPE_990.pdf,42103580,2010,42103580_2010


In [15]:
# First 50 rows of the combined Schedule F Part II tables (shows False when validation is off).
VALIDATE_TOP and pd.concat(schedule_f_part_ii_rows).head(50)

field,org_name,irs_code,region,grant_purpose,amount_cash,manner_cash,amount_noncash,desc_noncash,method_valuation,job_id,pdf_key,ein,year,filing_id
0,(1),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
1,(2),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
2,(3),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
3,(4),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
4,(5),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
5,(6),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
6,(7),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
7,(8),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
8,(9),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
9,(10),,,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010


In [16]:
# First 50 rows of the combined Schedule F Part III tables (shows False when validation is off).
VALIDATE_TOP and pd.concat(schedule_f_part_iii_rows).head(50)

field,type_of_grant_assistance,region,number_recipients,amount_cash_grant,manner_cash_disbursement,amount_noncash_assistance,desc_noncash_assistance,method_valuation,job_id,pdf_key,ein,year,filing_id
0,,,,,,,Schedule F,,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...,EIN_231352689_YEAR_2009_FORMTYPE_990.pdf,231352689,2009,231352689_2009
0,(1),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
1,(2),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
2,(3),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
3,(4),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
4,(5),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
5,(6),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
6,(7),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
7,(8),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010
8,(9),,,,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,521238301,2010,521238301_2010


In [17]:
if VALIDATE_TOP:
    # Always raises AssertionError when validation is on, so "Run All" stops
    # at this cell and the cells below do not execute.
    # NOTE(review): presumably a deliberate barrier before the ad-hoc
    # debugging cells — confirm.
    assert False

AssertionError: 

In [29]:
# Load the data for one hard-coded job id; the cells below inspect this single filing.
table_test_df = open_df(bucket, "a2061356d7999388cbd49b79872883c92ce6c81a7e7820788f92db496cedd620")

In [37]:
# Split the test filing's blocks by type: LINE rows and WORD rows.
test_lines = table_test_df.loc[table_test_df["BlockType"] == "LINE"]
test_words = table_test_df.loc[table_test_df["BlockType"] == "WORD"]

# LINE rows grouped by page number.
test_pages = test_lines.groupby("Page")

In [44]:
# Re-run the main-form pipeline on the single test filing.
page_map = find_pages(test_lines)
roadmap = create_roadmap(
    test_lines, roadmap_df, page_map
)

row = extract_from_roadmap(
    test_words, test_lines, roadmap, extractor_df, page_map
)
# Placeholder strings stand in for the real job id and pdf key.
# NOTE(review): the underscore-separated shape presumably mimics real pdf keys
# so that postprocess can handle them — confirm.
row = postprocess(row, "foo_2_3_4_5", "bar_2_3_4_5", clean_filing)
# Display the one field under investigation, taken from the first row.
row.iloc[0]["misc_revenue_11c_unrelated"]

No match for gross_fundraising_income in a
No match for direct_expenses_fundraising in .b
No match for gross_income_gaming in a
No match for direct_expenses_gaming in .b
No match for gross_income_sales in a
No match for direct_expenses_sales in b


'5741'

In [39]:
# Inspect the page map returned by find_pages() for the test filing.
page_map

{'Page 1': 2,
 'Page 9': 10,
 'Page 10': 11,
 'Schedule F, Page 1': 0,
 'Schedule F, Page 2': 0}

In [40]:
# LINE rows from page 10 only.
# NOTE(review): 10 is hard-coded; in the page map displayed in this notebook
# 'Page 9' maps to 10 — re-check when switching to another filing.
revenue_page = test_lines.loc[
    lambda df: df["Page"] == 10
]
revenue_page.head()

Unnamed: 0_level_0,BlockType,Confidence,Geometry,Page,PageClassification,Relationships,Text,TextType,Polygon,Height,Left,Top,Right,Bottom,Midpoint_X,Midpoint_Y,Width,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
e1c0a823-96f5-4e74-9c3b-75adebaa6d28,LINE,99.514374,{'BoundingBox': {'Height': 0.00767599558457732...,10,,[{'Ids': ['ca76a14e-cd74-4102-a067-89e182fb4ca...,Form 990 (2008),,"[{'X': 0.007383246906101704, 'Y': 0.0062185968...",0.007676,0.007383,0.006219,0.117397,0.013895,0.06239,0.010057,0.110014,"[ca76a14e-cd74-4102-a067-89e182fb4ca4, e9dd2b9...",0,a2061356d7999388cbd49b79872883c92ce6c81a7e7820...
0763460e-cdea-4fee-aaa3-4181212c3954,LINE,99.019516,{'BoundingBox': {'Height': 0.00879328697919845...,10,,[{'Ids': ['dd20eada-60d8-4103-93f3-f2dd951a21a...,Page 9,,"[{'X': 0.8458542823791504, 'Y': 0.005997187457...",0.008793,0.845854,0.005997,0.89075,0.01479,0.868302,0.010394,0.044895,"[dd20eada-60d8-4103-93f3-f2dd951a21a3, 15f2212...",0,a2061356d7999388cbd49b79872883c92ce6c81a7e7820...
99e51e96-5318-4520-ab68-be3ea91190b6,LINE,97.87722,{'BoundingBox': {'Height': 0.00731519563123583...,10,,[{'Ids': ['571daea0-320b-4447-a99a-ba3f070a0f2...,VIII,,"[{'X': 0.023435186594724655, 'Y': 0.0281912274...",0.007315,0.023435,0.028191,0.057231,0.035506,0.040333,0.031849,0.033796,[571daea0-320b-4447-a99a-ba3f070a0f22],2,a2061356d7999388cbd49b79872883c92ce6c81a7e7820...
f58741b0-e504-443f-b26f-77a83921f911,LINE,99.056938,{'BoundingBox': {'Height': 0.00767854414880275...,10,,[{'Ids': ['bc0b6fbf-5844-4661-9f6c-c217fa2b699...,Part,,"[{'X': 0.024107644334435463, 'Y': 0.0182459149...",0.007679,0.024108,0.018246,0.057966,0.025924,0.041037,0.022085,0.033859,[bc0b6fbf-5844-4661-9f6c-c217fa2b6992],2,a2061356d7999388cbd49b79872883c92ce6c81a7e7820...
6f2dc4da-4106-4ae1-a1eb-c5b18ced58c9,LINE,99.84494,{'BoundingBox': {'Height': 0.00794700067490339...,10,,[{'Ids': ['0aab01a9-0291-434c-abb8-d19e8ff107d...,Statement of Revenue,,"[{'X': 0.08663994818925858, 'Y': 0.01794482953...",0.007947,0.08664,0.017945,0.266322,0.025892,0.176481,0.021918,0.179682,"[0aab01a9-0291-434c-abb8-d19e8ff107dc, 44c315e...",2,a2061356d7999388cbd49b79872883c92ce6c81a7e7820...


In [41]:
# Show roadmap rows at positions 40 through 89.
roadmap.iloc[40:90]

Unnamed: 0_level_0,Top,Left,Top_Default,Left_Default
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Part VIII, Item 1a",0.097475,0.10152,0.14,0.1
"Part VIII, Item 1b",0.111956,0.101202,0.15,0.1
"Part VIII, Item 1c",0.131673,0.100308,0.17,0.1
"Part VIII, Item 1d",0.1512,0.100966,0.18,0.1
"Part VIII, Item 1e",0.164107,0.10101,0.19,0.1
"Part VIII, Item 1f",0.179621,0.100705,0.2,0.1
"Part VIII, Item 1g",0.20654,0.101056,0.25,0.1
"Part VIII, Item 1h",0.228477,0.100766,0.27,0.09
"Part VIII, Item 2a",0.258937,0.062224,0.29,0.09
"Part VIII, Item 2b",0.273206,0.069586,0.31,0.09


In [42]:
# Lines on the revenue page whose text contains "5", with their bounding-box edges.
revenue_page.loc[
    lambda df: df["Text"].str.contains("5"),
    ["Text", "Left", "Top", "Right", "Bottom"]
]

Unnamed: 0_level_0,Text,Left,Top,Right,Bottom
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ff9afdc3-c655-4314-a409-ada2bcf925fc,"512,513,or 514",0.777267,0.085836,0.885656,0.093309
e6b5abb1-81e5-4aed-9bb1-5aa7e7f16cfc,1252593,0.365298,0.164487,0.416484,0.17121
5f3fac79-a2e8-4129-8fc7-df08761eee68,3125682,0.491779,0.227241,0.544242,0.234546
d6a23ad8-112e-4f5f-9c69-c77c324f107a,35604017,0.485726,0.259887,0.543555,0.266733
5cdb0eec-74fd-4efb-939f-3df09f4286d2,35604017,0.600647,0.259803,0.658427,0.266716
87727197-56e3-4f6e-bd8b-867dd7c0b89b,3003569,0.491499,0.274293,0.544115,0.281518
d15e5d44-674f-435c-90bb-44c74ec79899,3003569,0.836563,0.274342,0.888493,0.281467
d32aa049-c5c9-4da7-86c2-9604d7185d35,"$ 38,607,586",0.119291,0.362238,0.193486,0.369172
94cb722c-6e95-4781-8d0d-9c385b9a4fb8,5,0.061627,0.423343,0.070275,0.429717
9b811770-3a6a-4d45-9a03-366c64379191,12649105,0.250346,0.523686,0.306848,0.530472


In [42]:
# Lines on the revenue page whose text contains "Federated", with their top-left position.
revenue_page.loc[
    lambda df: df["Text"].str.contains("Federated"),
    ["Text", "Left", "Top"]
]

Unnamed: 0_level_0,Text,Left,Top
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1986d565-7360-4af1-b926-d342e2486e34,1a Federated campaigns,0.05731,0.155154


In [43]:
# WORD rows from page 10, the same hard-coded page used for revenue_page.
revenue_page_words = test_words.loc[
    lambda df: df["Page"] == 10
]


In [80]:
# Words on the revenue page whose text contains "325", with their bounding-box edges.
revenue_page_words.loc[
    lambda df: df["Text"].str.contains("325"),
    ["Text", "Left", "Top", "Right", "Bottom"]
]

Unnamed: 0_level_0,Text,Left,Top,Right,Bottom
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
b830bfc4-8a48-4cab-b4ff-45ab0aea2e56,3325919,0.557346,0.242215,0.60892,0.249584
36b84abc-9140-4778-b33d-ac1941d86119,3325919,0.685861,0.279767,0.738522,0.287028


In [44]:
# List every word whose Top coordinate lies in the horizontal band
# 0.281 <= Top <= 0.331 (both ends inclusive).
revenue_page_words.loc[revenue_page_words["Top"].between(0.281, 0.331)]

Unnamed: 0_level_0,BlockType,Confidence,Geometry,Page,PageClassification,Relationships,Text,TextType,Polygon,Height,Left,Top,Right,Bottom,Midpoint_X,Midpoint_Y,Width,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4aacf2dc-fc45-4350-bf54-2fce052db30c,WORD,99.814667,{'BoundingBox': {'Height': 0.00682590762153267...,10,,,h,PRINTED,"[{'X': 0.06532908231019974, 'Y': 0.28116348385...",0.006826,0.065329,0.281163,0.072918,0.287989,0.069124,0.284576,0.007589,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
48cb3045-0e0b-4460-b77d-3b622fc830c4,WORD,92.738487,{'BoundingBox': {'Height': 0.00704292813315987...,10,,,Total.,PRINTED,"[{'X': 0.08165137469768524, 'Y': 0.28108042478...",0.007043,0.081651,0.28108,0.118466,0.288123,0.100059,0.284602,0.036815,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
6734d432-38a6-4568-8696-4f5bd6213173,WORD,98.038437,{'BoundingBox': {'Height': 0.00660097226500511...,10,,,Add,PRINTED,"[{'X': 0.12240481376647949, 'Y': 0.28114494681...",0.006601,0.122405,0.281145,0.146967,0.287746,0.134686,0.284445,0.024562,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
0c2902fa-a181-4815-b2d4-c73f73e73f9f,WORD,99.234352,{'BoundingBox': {'Height': 0.00673250341787934...,10,,,lines,PRINTED,"[{'X': 0.15030936896800995, 'Y': 0.28115898370...",0.006733,0.150309,0.281159,0.179896,0.287891,0.165103,0.284525,0.029587,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
0d044ed2-9bfe-4cf5-b392-35154388aab8,WORD,89.544731,{'BoundingBox': {'Height': 0.00693145766854286...,10,,,1a-1f,PRINTED,"[{'X': 0.18508261442184448, 'Y': 0.28113293647...",0.006931,0.185083,0.281133,0.220462,0.288064,0.202772,0.284599,0.035379,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
46d239e5-b703-46dc-afe4-55948e2c550a,WORD,99.074989,{'BoundingBox': {'Height': 0.00700754811987280...,10,,,Business,PRINTED,"[{'X': 0.516339898109436, 'Y': 0.2976036369800...",0.007008,0.51634,0.297604,0.572853,0.304611,0.544596,0.301107,0.056513,,35,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
dff56efd-154d-4977-acc1-7795c1a1f271,WORD,98.238556,{'BoundingBox': {'Height': 0.00673390692099928...,10,,,Code,PRINTED,"[{'X': 0.5772353410720825, 'Y': 0.297744065523...",0.006734,0.577235,0.297744,0.608413,0.304478,0.592824,0.301111,0.031178,,35,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
4ee6385e-610f-43d9-a8f4-4b86490659b8,WORD,99.742828,{'BoundingBox': {'Height': 0.00684572337195277...,10,,,2a,PRINTED,"[{'X': 0.05748225376009941, 'Y': 0.31187033653...",0.006846,0.057482,0.31187,0.072025,0.318716,0.064754,0.315293,0.014543,,37,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
6d1e2e22-9edf-4c08-9a24-67ddc1e8a47b,WORD,99.538528,{'BoundingBox': {'Height': 0.00641339132562279...,10,,,Honoraria,PRINTED,"[{'X': 0.08235233277082443, 'Y': 0.31231465935...",0.006413,0.082352,0.312315,0.13144,0.318728,0.106896,0.315521,0.049088,,37,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
10be1c9a-104d-434c-b9ef-80228ebcefef,WORD,99.132187,{'BoundingBox': {'Height': 0.00658489996567368...,10,,,900099,PRINTED,"[{'X': 0.5786972641944885, 'Y': 0.312885344028...",0.006585,0.578697,0.312885,0.615502,0.31947,0.5971,0.316178,0.036805,,37,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...


In [46]:
# List the columns available on the word-level frame.
revenue_page_words.columns

Index(['BlockType', 'Confidence', 'Geometry', 'Page', 'PageClassification',
       'Relationships', 'Text', 'TextType', 'Polygon', 'Height', 'Left', 'Top',
       'Right', 'Bottom', 'Midpoint_X', 'Midpoint_Y', 'Width', 'Children',
       'Line_No', 'File'],
      dtype='object')

In [47]:
# Position of the word "Federated" (row 1a) on the page; its Top value is
# used as a reference point for measuring row spacing.
revenue_page_words.loc[
    revenue_page_words["Text"].str.contains("Federated"),
    ["Text", "Top", "Left"],
]

Unnamed: 0_level_0,Text,Top,Left
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0c0f0166-5df9-491a-a502-7b724a87c329,Federated,0.155154,0.081694


In [48]:
# Position of the word "Membership" (row 1b) on the page.
revenue_page_words.loc[
    revenue_page_words["Text"].str.contains("Membership"),
    ["Text", "Top", "Left"],
]


Unnamed: 0_level_0,Text,Top,Left
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d92ead8d-6aba-404d-9278-79d3697ce4dd,Membership,0.172782,0.08167


In [50]:
# Position of the word "Fundraising" (row 1c) on the page.
revenue_page_words.loc[
    revenue_page_words["Text"].str.contains("Fundraising"),
    ["Text", "Top", "Left"],
]


Unnamed: 0_level_0,Text,Top,Left
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0bba877d-42a9-41a3-ac6f-b9a950446c1b,Fundraising,0.190196,0.081843


In [51]:
# Vertical gap between adjacent rows: Top of "Fundraising" (0.190196) minus
# Top of "Membership" (0.172782), both copied from the lookup outputs above.
0.190196-0.172782


0.017414000000000013

In [52]:
# Vertical gap between adjacent rows: Top of "Membership" (0.172782) minus
# Top of "Federated" (0.155154), both copied from the lookup outputs above.
0.172782-0.155154

0.017628000000000005

In [59]:
# Cluster the page's words on their "Top" coordinate with a 0.01 tolerance,
# then order each cluster's words by "Left" so they read left to right.
# NOTE(review): 0.01 is below the ~0.0175 row spacing measured above, which
# presumably keeps neighbouring rows apart -- confirm against cluster_words.
word_clusters = [
    row_words.sort_values("Left")
    for row_words in cluster_words(revenue_page_words, 0.01, "Top")
]

In [60]:
# Number of clusters produced for this page.
len(word_clusters)

51

In [61]:
# Inspect a single cluster in full; in this run index 12 is the
# "h Total. Add lines 1a-1f" row together with its amount.
word_clusters[12]

Unnamed: 0,BlockType,Confidence,Geometry,Page,PageClassification,Relationships,Text,TextType,Polygon,Height,Left,Top,Right,Bottom,Midpoint_X,Midpoint_Y,Width,Children,Line_No,File
4aacf2dc-fc45-4350-bf54-2fce052db30c,WORD,99.814667,{'BoundingBox': {'Height': 0.00682590762153267...,10,,,h,PRINTED,"[{'X': 0.06532908231019974, 'Y': 0.28116348385...",0.006826,0.065329,0.281163,0.072918,0.287989,0.069124,0.284576,0.007589,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
48cb3045-0e0b-4460-b77d-3b622fc830c4,WORD,92.738487,{'BoundingBox': {'Height': 0.00704292813315987...,10,,,Total.,PRINTED,"[{'X': 0.08165137469768524, 'Y': 0.28108042478...",0.007043,0.081651,0.28108,0.118466,0.288123,0.100059,0.284602,0.036815,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
6734d432-38a6-4568-8696-4f5bd6213173,WORD,98.038437,{'BoundingBox': {'Height': 0.00660097226500511...,10,,,Add,PRINTED,"[{'X': 0.12240481376647949, 'Y': 0.28114494681...",0.006601,0.122405,0.281145,0.146967,0.287746,0.134686,0.284445,0.024562,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
0c2902fa-a181-4815-b2d4-c73f73e73f9f,WORD,99.234352,{'BoundingBox': {'Height': 0.00673250341787934...,10,,,lines,PRINTED,"[{'X': 0.15030936896800995, 'Y': 0.28115898370...",0.006733,0.150309,0.281159,0.179896,0.287891,0.165103,0.284525,0.029587,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
0d044ed2-9bfe-4cf5-b392-35154388aab8,WORD,89.544731,{'BoundingBox': {'Height': 0.00693145766854286...,10,,,1a-1f,PRINTED,"[{'X': 0.18508261442184448, 'Y': 0.28113293647...",0.006931,0.185083,0.281133,0.220462,0.288064,0.202772,0.284599,0.035379,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...
36b84abc-9140-4778-b33d-ac1941d86119,WORD,98.784912,{'BoundingBox': {'Height': 0.00726110860705375...,10,,,3325919,PRINTED,"[{'X': 0.6858614087104797, 'Y': 0.279766768217...",0.007261,0.685861,0.279767,0.738522,0.287028,0.712191,0.283397,0.05266,,34,f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a337...


In [66]:
# Print each cluster as one line of space-separated text, in the order the
# words are stored in the cluster (sorted by "Left" when word_clusters was built).
for cluster in word_clusters:
    print(" ".join(cluster["Text"].tolist()))

Form 990 (2010) Page 9
Part VIII Statement of Revenue
Total (A) revenue function exempt Related (B) or Unrelated business revenue (C) excluded Revenue from (D)
revenue sections under tax
513, 512, 514 or
1a Federated campaigns 1a
b Membership dues 1b
c Fundraising events 1c
d Related organizations 1d
e Government grants (contributions) 1e
f All similar other amounts contributions, not included gifts, grants, above and 1f 3,325,919
g Noncash contributions included in lines 1a-1f $ 21,000
h Total. Add lines 1a-1f 3,325,919
Business Code
2a Honoraria 900099 8,600 8,600
b
c
d
e
f All other program service revenue
g Total. Add lines 2a-2f 8,600
3 Investment income (including dividends, interest
and other similar amounts) 7 7
4 Income from investment of tax-exempt bond proceeds
5 Royalties
(1) Real (ii) Personal
6a Gross Rents
d c b Rental expenses Net or Less (loss) rental rental income income or (loss)
(1) Securities (ii) o ther
7a c b than from assets other Less sales Gross Gain inventory

In [70]:
# For every row cluster, split its words into groups with cluster_x (second
# argument 0.05) and print each group's text, using "----" between groups
# and "***********" between rows.
# NOTE(review): in the recorded run this cell stops with
# "TypeError: list indices must be integers or slices, not str" right after
# the "2a Honoraria" row. The traceback is not shown, so it is unclear
# whether the error comes from col["Text"] (a group that is a list rather
# than a DataFrame) or from inside cluster_x -- confirm before relying on it.
for cluster in word_clusters:
    x_cluster = cluster_x(cluster, 0.05)
    for col in x_cluster:
        print(" ".join(col["Text"].values))
        print("----")
    print("***********")

Form 990 (2010)
----
Page 9
----
***********
Part VIII Statement of Revenue
----
***********
Total (A) revenue function exempt Related (B) or Unrelated business revenue (C) excluded Revenue from (D)
----
***********
revenue
----
sections under tax
----
***********
513, 512, 514 or
----
***********
1a Federated campaigns
----
1a
----
***********
b Membership dues
----
1b
----
***********
c Fundraising events
----
1c
----
***********
d Related organizations
----
1d
----
***********
e Government grants (contributions) 1e
----
***********
f All similar other amounts contributions, not included gifts, grants, above and 1f
----
3,325,919
----
***********
g Noncash contributions included in lines 1a-1f $
----
21,000
----
***********
h Total. Add lines 1a-1f
----
3,325,919
----
***********
Business Code
----
***********
2a Honoraria
----
900099
----
8,600 8,600
----
***********


TypeError: list indices must be integers or slices, not str

In [84]:
# Hand-built bounding box for probing a region of the revenue page.
# NOTE(review): the eight values are passed positionally, so their meaning
# depends on BoundingBox's field order, which is not visible here. Confirm
# which of .271 / .22 / .29 / .331 is which edge (and what the zeros stand
# for) before reusing these numbers.
bbox = BoundingBox(.271,0,.22,0,.29,0,.331,0)

In [85]:
# Ask the box for the text it captures from the word-level frame. The second
# argument is presumably the page number (10 matches the page used to build
# revenue_page_words) -- confirm against BoundingBox.get_text_in_box.
bbox.get_text_in_box(revenue_page_words, 10)

'and'