In [1]:
import json
import math
import re
from pathlib import Path

import boto3
import pandas as pd

from parse_990_textract.bucket import open_df
from parse_990_textract.filing import create_roadmap, extract_from_roadmap
from parse_990_textract.models import BoundingBox, TableExtractor
from parse_990_textract.parse import create_extractors, find_item, find_pages
from parse_990_textract.postprocessing import clean_filing, clean_f_i, clean_f_ii, clean_f_iii, postprocess
from parse_990_textract.setup import load_extractor_df
from parse_990_textract.table import extract_table_data, find_table_pages, create_tablemap
from parse_990_textract.utils import get_coordinate, get_regex

In [2]:
# Handle to the S3 bucket that `open_df(bucket, job_id)` reads each job's data from.
bucket = boto3.resource("s3").Bucket("s3-ocr-990s-demo")

In [3]:
# Expected values used for validation further down, indexed by job_id.
# Missing cells are replaced with "" so they can later be cleaned as strings.
validation_data = (
    pd.read_csv("validation_data.csv", index_col="job_id")
    .fillna("")
)
validation_data.head()

Unnamed: 0_level_0,pdf_key,other_expenses_c_mgmt_general,payments_affiliates_total,other_expenses_d_prog_service,total_revenue,fees_for_lobbying_services_total,other_employee_benefits_fundraising,total_fundraising_expenses,compensation_officers_prog_service,fees_for_mgmt_services_total,...,travel_total,pension_plan_acc_contrib_prog_service,advertising_promotion_mgmt_general,pension_plan_acc_contrib_mgmt_general,compensation_disq_persons_total,other_salaries_wages_mgmt_general,office_expenses_mgmt_general,fees_for_other_services_fundraising,total_number_other_recipient_foreign_orgs_entities,activities_per_region_totals_total_expenditure
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4,EIN_760733035_YEAR_2009_FORMTYPE_990.pdf,,,33000.0,1415024,,,4503.0,,,...,18307.0,,,,,,,,,
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc,EIN_363235550_YEAR_2009_FORMTYPE_990.pdf,,,15832.0,9899921,,27161.0,597789.0,295722.0,,...,677722.0,34261.0,30.0,3462.0,,159156.0,48381.0,22350.0,,
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b,EIN_223134995_YEAR_2010_FORMTYPE_990.pdf,0.0,,10823.0,3370595,,12776.0,263765.0,148689.0,2250.0,...,197092.0,,127381.0,,396855.0,103312.0,11723.0,23021.0,,
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82,EIN_264320885_YEAR_2009_FORMTYPE_990.pdf,,,,46117,,,669.0,,,...,,,,,,,458.0,,,
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be,EIN_581943161_YEAR_2009_FORMTYPE_990.pdf,581.0,,0.0,583209,,2521.0,14409.0,298484.0,,...,7233.0,,,,,,4010.0,0.0,,


In [14]:
# Extraction configuration, all read from CSV files next to the notebook.
# The first three go through the project's `load_extractor_df`; the Schedule F
# table/row extractor files are read with plain `pd.read_csv`.
# NOTE(review): presumably `load_extractor_df` applies extra parsing that the
# last two files do not need — confirm against parse_990_textract.setup.
extractor_df = load_extractor_df("990_extractors.csv")
roadmap_df = load_extractor_df("990_roadmap.csv")
schedule_f_tablemap_df = load_extractor_df("schedule_f_table_roadmap.csv")
schedule_f_table_extractor_df = pd.read_csv("schedule_f_table_extractors.csv")
schedule_f_row_extractor_df = pd.read_csv("schedule_f_row_extractors.csv")

In [5]:
# Header regexes passed to `find_table_pages` / `extract_table_data` for each
# Schedule F table. Every alternative is one "(x) heading" column label, and
# `\s*` tolerates a present or missing space between the "(x)" marker and the text.
# The "(f)" alternative of Part I previously lacked `\s*`, so it could not match
# "(f) Total expenditures" written with a space.
PART_I_HEADER = r"\(a\)\s*Region|\(d\)\s*Activities|\(e\)\s*If activity|\(f\)\s*Total expenditures"
PART_II_HEADER = r"\(b\)\s*IRS code|\(c\)\s*Region|\(d\)\s*Purpose|\(f\)\s*Manner|\(h\)\s*Description"
PART_III_HEADER = r"\(b\)\s*Region|\(e\)\s*Manner of cash|\(h\)\s*Method of va"
# Table names are plain strings (compared with `==` against the "table" column
# of the extractor frames), so none of them needs a raw-string prefix.
PART_I_TABLE_NAME = "Activities per Region"
PART_II_TABLE_NAME = "Grants to Organizations Outside the United States"
PART_III_TABLE_NAME = "Grants to Individuals Outside the United States"

In [21]:
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []

# One (header regex, table name, cleaner, accumulator) entry per Schedule F
# table, in the order the tables are extracted for every filing.
schedule_f_parts = (
    (PART_I_HEADER, PART_I_TABLE_NAME, clean_f_i, schedule_f_part_i_rows),
    (PART_II_HEADER, PART_II_TABLE_NAME, clean_f_ii, schedule_f_part_ii_rows),
    (PART_III_HEADER, PART_III_TABLE_NAME, clean_f_iii, schedule_f_part_iii_rows),
)

for i, job_id in enumerate(validation_data.index.values):
    # Progress trace: position, job id, then the PDF key of the filing.
    print(i)
    print(job_id)
    pdf_key = validation_data.at[job_id, "pdf_key"]
    print(pdf_key)

    # Load this job's blocks and split them into LINE and WORD blocks.
    data = open_df(bucket, job_id)
    lines = data.loc[data["BlockType"] == "LINE"]
    words = data.loc[data["BlockType"] == "WORD"]

    # Main filing: locate pages, build the roadmap, extract and post-process.
    page_map = find_pages(lines)
    roadmap = create_roadmap(lines, roadmap_df, page_map)
    row = extract_from_roadmap(words, lines, roadmap, extractor_df, page_map)
    row = postprocess(row, job_id, pdf_key, clean_filing)
    filing_rows.append(row)

    # Schedule F tables: same extract -> postprocess -> collect steps per part.
    pages = lines.groupby("Page")
    for header, table_name, cleaner, collected in schedule_f_parts:
        table = extract_table_data(
            pages, lines, words, header, table_name,
            schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
        )
        table = postprocess(table, job_id, pdf_key, cleaner)
        # Only tables that survive post-processing are kept.
        if table is not None:
            collected.append(table)

0
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4
EIN_760733035_YEAR_2009_FORMTYPE_990.pdf
1
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc
EIN_363235550_YEAR_2009_FORMTYPE_990.pdf
2
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b
EIN_223134995_YEAR_2010_FORMTYPE_990.pdf
3
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82
EIN_264320885_YEAR_2009_FORMTYPE_990.pdf
4
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be
EIN_581943161_YEAR_2009_FORMTYPE_990.pdf
5
5596f51a999ebbd4cb992f490ceaffcddbac9bce532b911997683ae6897c2797
EIN_231352689_YEAR_2009_FORMTYPE_990.pdf


<class 'KeyError'>: '(i) Method'


6
481a62c75776cc7f62075c1d60c230ea01f2dbc5b6224c65cb73db1728e86b00
EIN_954806856_YEAR_2009_FORMTYPE_990.pdf
7
39d270117e4b6354850551c8237421403da7d2b3f5d06657a28f5a9d07febf17
EIN_521238301_YEAR_2010_FORMTYPE_990.pdf
8
9a69ea9d5c8d5397603dbb77d027f4ff4ef9dcdc6609b7a1657f9484529353a0
EIN_521703065_YEAR_2010_FORMTYPE_990.pdf
9
3b9cc21f73f11bef5f2e4c15859661f7f7c9adee8dd6a1f2301099834fdb2926
EIN_611190087_YEAR_2008_FORMTYPE_990.pdf
10
82588f1ce9ca8cec2fc6b55ec07361b821be6650d1c01db4b9c948b8bf12689f
EIN_113489123_YEAR_2008_FORMTYPE_990.pdf
11
f303b69e79844240beccf4fc5b3cecaa3a4f3024a955b2f4a49dd0a81b456649
EIN_582248383_YEAR_2009_FORMTYPE_990.pdf
12
a2061356d7999388cbd49b79872883c92ce6c81a7e7820788f92db496cedd620
EIN_630329409_YEAR_2009_FORMTYPE_990.pdf
13
6e417b42fc15148e0489456f5086bbac28a8361d3452a0ddc23314afee5b6313
EIN_620988294_YEAR_2010_FORMTYPE_990.pdf
14
e56d65e73cec9532561c42db4f4dc64c5b968441b4d492444292a9daf3921044
EIN_581954432_YEAR_2010_FORMTYPE_990.pdf
15
68a8d4678de1d3107eff

<class 'KeyError'>: '(i) Method'
<class 'KeyError'>: '(c) Number of recipients'


22
a6529e504df346097da99104a353e977426e018cd5ac33b62cd2dd89c90763c5
EIN_311002913_YEAR_2008_FORMTYPE_990.pdf
23
d1925c2d74adaa3f150ded3ce67dfe7ae3a306f0db0289ad5755a28d801b2b0b
EIN_202408857_YEAR_2010_FORMTYPE_990.pdf
24
01d89ee5d14575c1321b2e4d67431d172ba76212b4a266bdaf474275029fd78b
EIN_521830327_YEAR_2009_FORMTYPE_990.pdf


In [None]:
# Stack the per-filing results, discard the concatenated index, and key the
# combined frame by job_id.
output_data = (
    pd.concat(filing_rows)
    .reset_index(drop=True)
    .set_index("job_id")
)

In [None]:
output_data.head()

In [None]:
def clean(x):
    """Reduce a value to its digits so extracted and expected values compare as text.

    The value is converted with ``str``, any ".0" that ends at a word boundary is
    dropped (so the float ``33000.0`` becomes ``"33000"``), and then every remaining
    non-digit character is removed.

    Note that signs, thousands separators and decimal points are all stripped:
    ``"-12.0"`` becomes ``"12"`` and ``"10.05"`` becomes ``"1005"``.

    Args:
        x: Any value; it is stringified first.

    Returns:
        str: The digits of ``x``, possibly empty.
    """
    x = str(x)
    x = re.sub(r"\.0\b", "", x)
    # Raw string: "\D" in a normal literal is an invalid escape sequence.
    x = re.sub(r"\D", "", x)
    return x

In [None]:
def compare_output(to_validate, to_compare, col):
    """Return the fields of one row that differ between two frames.

    Despite its name, ``col`` is a row label: it is looked up with ``.loc[col]``
    in both frames. Both rows must carry identical labels for the element-wise
    comparison to work.

    Args:
        to_validate: Frame holding the extracted values.
        to_compare: Frame holding the expected values.
        col: Row label present in both frames.

    Returns:
        pd.DataFrame: One row per mismatching field, with the columns
        "extracted" (from ``to_validate``) and "expected" (from ``to_compare``).
    """
    extracted_row = to_validate.loc[col]
    expected_row = to_compare.loc[col]
    # Inequality is symmetric, so one mask selects the mismatches on both sides.
    differs = extracted_row != expected_row
    return pd.DataFrame(
        {
            "extracted": extracted_row.loc[differs],
            "expected": expected_row.loc[differs],
        }
    )

In [None]:
# Key both frames by pdf_key and reduce every cell to a digit-only string with
# `clean`, so extracted and expected values can be compared cell by cell.
# `apply` + `Series.map` is the element-wise equivalent of `DataFrame.applymap`,
# which is deprecated in recent pandas releases.
to_compare = (
    validation_data
    .set_index("pdf_key")
    .apply(lambda column: column.map(clean))
)
to_validate = (
    output_data[validation_data.columns]
    .set_index("pdf_key")
    .apply(lambda column: column.map(clean))
)

# Each index label is a pdf_key, i.e. one filing (a row, not a column).
for filing_key in to_validate.index:
    mismatches = compare_output(to_validate, to_compare, filing_key)
    # Cleaned cells are strings and a mismatching pair cannot be two empty
    # strings, so "has any rows" is the same test as "has any truthy cell".
    if not mismatches.empty:
        print(filing_key)
        print(f"{mismatches.shape[0]} mismatched items.")
        print(mismatches)
        print("-"*79)

In [None]:
# Always fails. NOTE(review): presumably a deliberate stop so that "Run All"
# halts here, before the scratch cells that follow — confirm before removing.
assert False

## BREAK

In [25]:
pd.concat(schedule_f_part_i_rows).head(50)

field,index,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures,job_id,pdf_key,split_pdf_key,ein,year,filing_id
0,0,,,,the region),region,,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...,EIN_231352689_YEAR_2009_FORMTYPE_990.pdf,"[EIN, 231352689, YEAR, 2009, FORMTYPE, 990.pdf]",231352689,2009,231352689_2009
1,1,Europe,1.0,,0 theological training,offers a Th M degree,6661,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...,EIN_231352689_YEAR_2009_FORMTYPE_990.pdf,"[EIN, 231352689, YEAR, 2009, FORMTYPE, 990.pdf]",231352689,2009,231352689_2009
2,2,Totals,1.0,,o,,6661,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...,EIN_231352689_YEAR_2009_FORMTYPE_990.pdf,"[EIN, 231352689, YEAR, 2009, FORMTYPE, 990.pdf]",231352689,2009,231352689_2009
0,0,,,10.0,located in region),,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
1,1,(1),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
2,2,(2),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
3,3,(3),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
4,4,(4),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
5,5,(5),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
6,6,(6),,,,,,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010


In [28]:
pd.concat(schedule_f_part_iii_rows).tail(50)

field,index,type_of_grant_assistance,region,number_recipients,amount_cash_grant,manner_cash_disbursement,amount_noncash_assistance,desc_noncash_assistance,method_valuation,job_id,pdf_key,split_pdf_key,ein,year,filing_id
0,0,,,,,,,,110,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...,EIN_231352689_YEAR_2009_FORMTYPE_990.pdf,"[EIN, 231352689, YEAR, 2009, FORMTYPE, 990.pdf]",231352689,2009,231352689_2009
1,1,,,,,,,Schedule F,09902008,5596f51a999ebbd4cb992f490ceaffcddbac9bce532b91...,EIN_231352689_YEAR_2009_FORMTYPE_990.pdf,"[EIN, 231352689, YEAR, 2009, FORMTYPE, 990.pdf]",231352689,2009,231352689_2009
0,0,3 Page Method (h),,,,,,,020100110990110000,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
1,1,"990, of Description Form","990, Form",,,,,,15101100,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
2,2,"990, of Description Form 52-1238301 to (g) ""Ye...","990, Form to ""Yes' answered organization the i...",,,is,,,15101100100010010271003503,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
3,3,Amount States. needed. (d) United is,States. needed. United is,,,is,,,03503,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
4,4,Amount States. needed. (d) United is space Num...,States. needed. United is space the additional...,,,"is space duplicated can Part line IV, Part",,,0350311010100123456789101112131415161718,39d270117e4b6354850551c8237421403da7d2b3f5d066...,EIN_521238301_YEAR_2010_FORMTYPE_990.pdf,"[EIN, 521238301, YEAR, 2010, FORMTYPE, 990.pdf]",521238301,2010,521238301_2010
0,0,Page of Method,,,,,,,200800990011011000,f303b69e79844240beccf4fc5b3cecaa3a4f3024a955b2...,EIN_582248383_YEAR_2009_FORMTYPE_990.pdf,"[EIN, 582248383, YEAR, 2009, FORMTYPE, 990.pdf]",582248383,2009,582248383_2009
1,1,Method (h),,,,,,,011011000,f303b69e79844240beccf4fc5b3cecaa3a4f3024a955b2...,EIN_582248383_YEAR_2009_FORMTYPE_990.pdf,"[EIN, 582248383, YEAR, 2009, FORMTYPE, 990.pdf]",582248383,2009,582248383_2009
2,2,of,,,,,,,510,f303b69e79844240beccf4fc5b3cecaa3a4f3024a955b2...,EIN_582248383_YEAR_2009_FORMTYPE_990.pdf,"[EIN, 582248383, YEAR, 2009, FORMTYPE, 990.pdf]",582248383,2009,582248383_2009


In [29]:
table_test_df = open_df(bucket, "67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebdb32d3186b19d6a6fd5")

In [30]:
# Split the test job's blocks by BlockType, then group the LINE blocks by page.
test_lines = table_test_df[table_test_df["BlockType"].eq("LINE")]
test_words = table_test_df[table_test_df["BlockType"].eq("WORD")]
test_pages = test_lines.groupby("Page")

In [31]:
# Join the text of each page's LINE blocks into one string per page, then ask
# `find_table_pages` which pages match the Part I header regex.
part_i_page_text = test_pages["Text"].agg(lambda page_lines: " ".join(page_lines))
part_i_table_pages = find_table_pages(part_i_page_text, PART_I_HEADER)

In [32]:
def _part_i_tablemap(page):
    """Build the tablemap for one page of the test filing, without missing entries."""
    return create_tablemap(test_lines, schedule_f_tablemap_df, page).dropna()


# One row per detected Part I page: the page itself and its tablemap.
part_i_tablemaps = pd.DataFrame(
    {
        "page": part_i_table_pages,
        "tablemap": part_i_table_pages.map(_part_i_tablemap),
    }
)

In [33]:
# Row extractors that belong to the Part I table.
part_i_row_extractors = schedule_f_row_extractor_df[
    schedule_f_row_extractor_df["table"].eq(PART_I_TABLE_NAME)
]
# Table-level settings: the first extractor row registered for Part I.
part_i_table_data = schedule_f_table_extractor_df[
    schedule_f_table_extractor_df["table"].eq(PART_I_TABLE_NAME)
].iloc[0]

In [34]:
# Attach one TableExtractor per Part I page, built from that page's tablemap and
# the Part I table-level settings.
# NOTE(review): unlike the Part II construction later in this notebook, no
# row_margin or index_col_* arguments are passed here — presumably the
# TableExtractor defaults apply; confirm against parse_990_textract.models.
rows = part_i_tablemaps.assign(
    extractor=part_i_tablemaps["tablemap"].map(
        lambda tablemap: TableExtractor(
            top_label=part_i_table_data["table_top"],
            top_delta=part_i_table_data["table_top_delta"],
            bottom_label=part_i_table_data["table_bottom"],
            bottom_delta=part_i_table_data["table_bottom_delta"],
            tablemap=tablemap,
            row_extractors=part_i_row_extractors,
            fields=part_i_row_extractors["field"],
        )
    )
)

In [35]:
test_extractor = rows["extractor"].iloc[1]

In [36]:
test_extractor.extract_rows(test_words, 30).head(50)

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures
0,Central merica and the Carbbear,,,Fundraising,,481.0
1,Central merica the Caribbean,,,Grantmaking,,303788.0
2,Central merica the anbbea,,,Program services,Academic Support,20973.0
3,:entral :aribbean America the,,,Program Services,Instruction,58431.0
4,entral merica Caribbear the,,,Program services,Research and other academic activity,228975.0
5,Central Taribbean merica the,,,Investments,,
6,East Asia and Pacific,,,Fundraising,,3593.0
7,East Asia and Pacific,,,Grantmaking,,1269299.0
8,East Asia and Pacific,,,rogr services,Service Centers,118026.0
9,East Asia and Pacific,,,rogram Services,Academic Support,461230.0


## Extracting a batch of local OCR output files

In [None]:
# Directory holding the locally saved OCR JSON files. This is a machine-specific
# absolute path; change it here when running elsewhere.
OCR_OUTPUT_DIR = Path("/mnt/c/Users/ethan/pdf_parsing")

# `sorted`: directory listings come back in filesystem-dependent order, while later
# cells slice `ocr_outputs[:100]` and look rows up by position, so a stable
# order is needed for the results to be reproducible.
ocr_outputs = sorted(OCR_OUTPUT_DIR.glob("EIN_*.json"))

In [None]:
len(ocr_outputs)

In [None]:
def open_local_df(path):
    """Load one locally saved OCR JSON file into a flat DataFrame of blocks.

    The file must contain a top-level "Blocks" list. Each block becomes one row,
    indexed by its "Id". The nested "Geometry" and "Relationships" fields are
    flattened into plain columns and then dropped:

    * Height, Left, Top, Width: from ``Geometry["BoundingBox"]``.
    * Polygon: ``Geometry["Polygon"]``; Right and Bottom are the largest X and Y
      among its corners.
    * Children: the "Ids" of the block's first relationship, or None when the
      block has no relationships.
    * Line_No: the percentile bucket (0-99) of ``Top``, computed over every block
      in the file (not per page) with ``pd.qcut``.
    * File: the file name of ``path``.

    Rows are sorted by File, Page, Line_No and Left.

    Args:
        path: Location of the JSON file, as ``pathlib.Path`` or ``str``.

    Returns:
        pd.DataFrame: One row per block, as described above.

    Raises:
        ValueError: From ``pd.qcut`` when the 100 quantile edges of ``Top`` are
            not unique (for example with very few distinct ``Top`` values).
    """
    path = Path(path)  # `.name` is needed below, so accept plain strings too
    with open(path, encoding="utf-8") as json_data:
        output = json.load(json_data)

    def first_relationship_ids(relationships):
        # A block without relationships carries None, or NaN when the key is
        # absent from the JSON; an empty list has no first entry either.
        if isinstance(relationships, list) and relationships:
            return relationships[0]["Ids"]
        return None

    return pd.DataFrame.from_records(
        output["Blocks"],
        index="Id",
        exclude=[
            "ColumnIndex",
            "ColumnSpan",
            "DocumentType",
            "EntityTypes",
            "Hint",
            "Query",
            "SelectionStatus",
            "RowIndex",
            "RowSpan",
        ]
    ).assign(
        Height=lambda df: df["Geometry"].map(lambda x: x["BoundingBox"]["Height"]),
        Left=lambda df: df["Geometry"].map(lambda x: x["BoundingBox"]["Left"]),
        Top=lambda df: df["Geometry"].map(lambda x: x["BoundingBox"]["Top"]),
        Width=lambda df: df["Geometry"].map(lambda x: x["BoundingBox"]["Width"]),
        Polygon=lambda df: df["Geometry"].map(lambda x: x["Polygon"]),
        Children=lambda df: df["Relationships"].map(first_relationship_ids),
        # Relies on `Top` having been assigned just above.
        Line_No=lambda df: pd.qcut(df["Top"], 100, labels=list(range(100))).astype(int),
        File=path.name,
        Right=lambda df: df["Polygon"].map(
            lambda polygon: max(corner["X"] for corner in polygon)
        ),
        Bottom=lambda df: df["Polygon"].map(
            lambda polygon: max(corner["Y"] for corner in polygon)
        )
    ).drop(
        columns=[
            "Geometry",
            "Relationships",
        ]
    ).sort_values(
        by=["File", "Page", "Line_No", "Left"]
    )

In [None]:
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []

# (header regex, table name, accumulator) for each Schedule F table, in the
# order the tables are extracted from every file.
schedule_f_targets = (
    (PART_I_HEADER, PART_I_TABLE_NAME, schedule_f_part_i_rows),
    (PART_II_HEADER, PART_II_TABLE_NAME, schedule_f_part_ii_rows),
    (PART_III_HEADER, PART_III_TABLE_NAME, schedule_f_part_iii_rows),
)

for (count, path) in enumerate(ocr_outputs[:100]):
    print(count, path.name)
    try:
        data = open_local_df(path)
    except Exception as e:
        # A file that cannot be loaded is reported and skipped; the run goes on.
        print(path.name)
        print(e)
        continue

    lines = data.loc[data["BlockType"] == "LINE"]
    words = data.loc[data["BlockType"] == "WORD"]

    # Main filing: locate pages, build the roadmap, extract one row per file.
    page_map = find_pages(lines)
    roadmap = create_roadmap(lines, roadmap_df, page_map)
    row = extract_from_roadmap(words, lines, roadmap, extractor_df, page_map)
    row["file"] = path.name
    filing_rows.append(row)

    # Schedule F tables: every table that was found is tagged with its file name.
    pages = lines.groupby("Page")
    for header, table_name, collected in schedule_f_targets:
        table = extract_table_data(
            pages, lines, words, header, table_name,
            schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
        )
        if table is not None:
            collected.append(table.assign(file=path.name))

filing_output_df = pd.DataFrame(filing_rows).set_index("file")


In [None]:
filing_output_df["total_revenue"].value_counts()

In [None]:
filing_output_df.iloc[24].iloc[:50]

In [None]:
schedule_f_part_i_df = pd.concat(schedule_f_part_i_rows)

In [None]:
schedule_f_part_i_df.iloc[:50]

In [None]:
schedule_f_part_ii_df = pd.concat(schedule_f_part_ii_rows)

In [None]:
schedule_f_part_ii_df.iloc[50:100]

In [None]:
schedule_f_part_iii_df = pd.concat(schedule_f_part_iii_rows)

In [None]:
schedule_f_part_iii_df.tail(50)

In [None]:
clean_filing(filing_output_df)

In [None]:
# Scratch check of a conditional regex group: `(?(open_par)\)|\b)` requires a
# closing ")" when the named group `open_par` matched, and a word boundary otherwise.
# Here it returns "2425" (group 2, the digits).
# NOTE(review): `open_par` is not optional in this pattern, so the `\b` branch can
# never be taken — presumably `(?P<open_par>\()?` was intended; confirm.
re.search(r"(?P<open_par>\()(\d+)(?(open_par)\)|\b)", "(2425)").group(2)

In [None]:
test_new_filing = open_local_df(
    Path("/mnt/c/Users/ethan/pdf_parsing") / "EIN_200478411_YEAR_2017_FORMTYPE_990.json"
)

In [None]:
test_new_filing

In [None]:
# Split the filing's blocks by BlockType, then group the LINE blocks by page.
test_new_filing_words = test_new_filing[test_new_filing["BlockType"].eq("WORD")]
test_new_filing_lines = test_new_filing[test_new_filing["BlockType"].eq("LINE")]
test_new_filing_pages = test_new_filing_lines.groupby("Page")


In [None]:
page_map = find_pages(test_new_filing_lines)

In [None]:
page_map

In [None]:
roadmap = create_roadmap(test_new_filing_lines, roadmap_df, page_map)

In [None]:
roadmap.iloc[:50]

In [None]:
extractors = create_extractors(extractor_df, roadmap, page_map)

In [None]:
test_extractor = extractors.iloc[2]
test_page = 1

In [None]:
# Use the `test_page` chosen in the previous cell instead of repeating the literal,
# so changing the page there is enough.
test_extractor.bounding_box.get_text_in_box(test_new_filing_lines, test_page)

In [None]:
# Lowers the `top` value of the extractor's bounding box by 0.001, in place.
# This cell is not idempotent: every re-run moves the edge by another 0.001.
test_extractor.bounding_box.top -= 0.001

In [None]:
extract_table_data(
    test_new_filing_pages, test_new_filing_lines, test_new_filing_words,
    PART_II_HEADER, PART_II_TABLE_NAME, schedule_f_tablemap_df,
    schedule_f_table_extractor_df, schedule_f_row_extractor_df,
)

In [None]:
table_pages = find_table_pages(
    test_new_filing_pages["Text"].agg(lambda words: " ".join(words)), PART_II_HEADER,
)

In [None]:
table_pages

In [None]:
def _part_ii_tablemap(page):
    """Build the tablemap for one page of the new filing, without missing entries."""
    return create_tablemap(test_new_filing_lines, schedule_f_tablemap_df, page).dropna()


# One row per detected table page: the page itself and its tablemap.
tablemaps = pd.DataFrame(
    {
        "page": table_pages,
        "tablemap": table_pages.map(_part_ii_tablemap),
    }
)
tablemaps

In [None]:
tablemaps.iloc[1]["tablemap"]

In [None]:
row_extractors = schedule_f_row_extractor_df.loc[
    schedule_f_row_extractor_df["table"] == PART_II_TABLE_NAME
]

In [None]:
row_extractors

In [None]:
table = schedule_f_table_extractor_df.loc[
    schedule_f_table_extractor_df["table"] == PART_II_TABLE_NAME
].iloc[0]
table

In [None]:
# Attach one TableExtractor per table page, built from that page's tablemap and
# the table-level settings in `table` (vertical limits, row margin and the
# left/right limits of the index column).
# Note: this rebinds `rows`, which an earlier cell used for the Part I extractors.
rows = tablemaps.assign(
    extractor=tablemaps["tablemap"].map(
        lambda tablemap: TableExtractor(
            top_label=table["table_top"],
            top_delta=table["table_top_delta"],
            bottom_label=table["table_bottom"],
            bottom_delta=table["table_bottom_delta"],
            row_margin=table["row_margin"],
            index_col_left_label=table["index_col_left"],
            index_col_left_delta=table["index_col_left_delta"],
            index_col_right_label=table["index_col_right"],
            index_col_right_delta=table["index_col_right_delta"],
            tablemap=tablemap,
            row_extractors=row_extractors,
            fields=row_extractors["field"],
        )
    )
)

In [None]:
rows

In [None]:
extractor = rows["extractor"].iloc[1]

In [None]:
# LINE blocks of page 74 of the filing under test.
page_74_lines = test_new_filing_lines.loc[
    test_new_filing_lines["Page"] == 74
]
# WORD blocks of the same page. A later cell calls
# `extractor.extract_rows(page_74_words, 74)`, but this name was never defined,
# so that cell raised NameError on a fresh kernel.
page_74_words = test_new_filing_words.loc[
    test_new_filing_words["Page"] == 74
]

In [None]:
extractor.row_extractors

In [None]:
extractor.extract_rows(page_74_words, 74)

In [None]:
extractor.get_col_span("(b) Region", "(c) Number of recipients")

In [None]:
extractor.get_row_spans(test_new_filing_words, 74)

In [None]:
BoundingBox(
    left=0.55,
    left_delta=0,
    top=0.365,
    top_delta=-.01,
    right=0.66,
    right_delta=0,
    bottom=1,
    bottom_delta=0.01
).get_text_in_box(page_74_lines, 74)

In [None]:
extractor.index_col_left_label

In [None]:
extractor.table_top

In [None]:
extractor.extract_rows(test_new_filing_words, 74).iloc[0].any()