# Parsing Debug Notebook

This somewhat messy notebook makes it easier to debug the parser, because we can just rerun the cells needed to set up the debugging process with various parameters.

In [1]:
import json
import math
import re
from pathlib import Path

import boto3
import pandas as pd

from parse_990_textract.bucket import open_df
from parse_990_textract.filing import create_roadmap, extract_from_roadmap
from parse_990_textract.models import BoundingBox, TableExtractor
from parse_990_textract.parse import create_extractors, find_item, find_pages
from parse_990_textract.postprocessing import clean_filing, clean_f_i, clean_f_ii, clean_f_iii, postprocess
from parse_990_textract.setup import load_extractor_df
from parse_990_textract.table import extract_table_data, find_table_pages, create_tablemap
from parse_990_textract.utils import get_coordinate, get_regex, cluster_words, columnize, cluster_x

In [2]:
# S3 bucket handle; passed to open_df() below to load the data for a job id.
# NOTE(review): the bucket name is hard-coded, and credentials are presumably
# resolved by boto3 from the environment -- confirm before sharing.
bucket = boto3.resource("s3").Bucket("s3-ocr-990s-demo")

Setting `VALIDATE_TOP` to `True` will parse all 25 validation PDFs and compare non-Schedule F output to the validation data. If set to `False`, we run the test code for Schedule F instead.

In [3]:
# Mode switch for the whole notebook. True: the validation loop iterates over
# validation_data and a later guard cell raises before the single-filing cells.
# False: the validation loop has nothing to iterate and execution continues on
# to the Schedule F table debugging cells.
VALIDATE_TOP = False

In [4]:
if VALIDATE_TOP:
    # Expected values, indexed by job_id; NaNs are replaced with empty strings.
    validation_data = pd.read_csv("validation_data.csv", index_col="job_id").fillna("")
    # NOTE(review): this expression is nested inside the `if`, so Jupyter does
    # not render it -- only a top-level final expression is displayed.
    validation_data.head()

In [5]:
# Parser configuration tables, all read from parse_data/.
# The first three are loaded through load_extractor_df; the two Schedule F
# extractor tables are read with plain pd.read_csv.
# NOTE(review): confirm the difference in loaders is intentional.
extractor_df = load_extractor_df("parse_data/990_extractors.csv")
roadmap_df = load_extractor_df("parse_data/990_roadmap.csv")
schedule_f_tablemap_df = load_extractor_df("parse_data/schedule_f_table_roadmap.csv")
schedule_f_table_extractor_df = pd.read_csv("parse_data/schedule_f_table_extractors.csv")
schedule_f_row_extractor_df = pd.read_csv("parse_data/schedule_f_row_extractors.csv")

In [6]:
# Regex alternations of column-header fragments, one pattern per Schedule F
# table; they are passed as the header argument to extract_table_data and
# find_table_pages.
# NOTE(review): "\(f\)Total expenditures" is the only alternative without
# "\s*" after the column letter, so it matches only when no whitespace follows
# "(f)" -- confirm this is intentional.
PART_I_HEADER = r"\(a\)\s*Region|\(d\)\s*Activities|\(e\)\s*If activity|\(f\)Total expenditures"
PART_II_HEADER = r"\(b\)\s*IRS code|\(c\)\s*Region|\(d\)\s*Purpose|\(f\)\s*Manner|\(h\)\s*Description"
PART_III_HEADER = r"\(b\)\s*Region|\(e\)\s*Manner of cash|\(h\)\s*Method of va"
# Table names; also compared against the "table" column of the Schedule F
# extractor tables when selecting the rows for one table.
PART_I_TABLE_NAME = r"Activities per Region"
PART_II_TABLE_NAME = r"Grants to Organizations Outside the United States"
PART_III_TABLE_NAME = r"Grants to Individuals Outside the United States"

In [7]:
# Accumulators for the validation run: one postprocessed row per filing, plus
# one entry per Schedule F table that produced a non-None result.
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []
if VALIDATE_TOP:
    values = validation_data.index.values
else:
    # Nothing to iterate: the loop below is a no-op outside validation mode.
    values = []

# One entry per Schedule F table: (header regex, table name, cleaner, output
# list). Driving the extraction from this table replaces three copy-pasted
# stanzas that differed only in these four values.
schedule_f_parts = (
    (PART_I_HEADER, PART_I_TABLE_NAME, clean_f_i, schedule_f_part_i_rows),
    (PART_II_HEADER, PART_II_TABLE_NAME, clean_f_ii, schedule_f_part_ii_rows),
    (PART_III_HEADER, PART_III_TABLE_NAME, clean_f_iii, schedule_f_part_iii_rows),
)

for i, job_id in enumerate(values):
    print(i)
    print(job_id)
    pdf_key = validation_data.at[job_id, "pdf_key"]
    print(pdf_key)

    # Load the job's blocks and split them by BlockType.
    data = open_df(bucket, job_id)
    lines = data.loc[data["BlockType"] == "LINE"]
    words = data.loc[data["BlockType"] == "WORD"]
    page_map = find_pages(lines)
    roadmap = create_roadmap(
        lines, roadmap_df, page_map
    )

    # Top-level (non-table) fields for this filing.
    row = extract_from_roadmap(
        words, lines, roadmap, extractor_df, page_map
    )
    row = postprocess(row, job_id, pdf_key, clean_filing)
    filing_rows.append(row)

    pages = lines.groupby("Page")

    # Schedule F tables, processed in Part I, II, III order.
    for part_header, part_name, part_cleaner, part_rows in schedule_f_parts:
        part_table = extract_table_data(
            pages, lines, words, part_header, part_name,
            schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
        )
        part_table = postprocess(part_table, job_id, pdf_key, part_cleaner)
        # Only keep tables for which postprocess returned something.
        if part_table is not None:
            part_rows.append(part_table)

In [8]:
if VALIDATE_TOP:
    output_data = pd.concat(filing_rows).reset_index(drop=True).set_index("job_id")

In [9]:
# Short-circuit display: previews output_data in validation mode; otherwise the
# expression evaluates to False and the cell simply shows `False`.
VALIDATE_TOP and output_data.head()

False

In [10]:
def clean(x):
    """Reduce a value to its digits for comparison.

    The value is converted with ``str``; every ``.0`` that ends at a word
    boundary is removed (so a float-formatted ``12.0`` becomes ``12``); then
    all remaining non-digit characters are stripped.

    Args:
        x: Any value; it is stringified before cleaning.

    Returns:
        A string containing only digit characters (possibly empty).
    """
    text = str(x)
    text = re.sub(r"\.0\b", "", text)
    # Raw string on purpose: "\D" in a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r"\D", "", text)

In [11]:
def compare_output(to_validate, to_compare, col):
    """Return the mismatched items for one row label of two frames.

    Args:
        to_validate: Frame holding the extracted values.
        to_compare: Frame holding the expected values.
        col: Label looked up with ``.loc`` in both frames.

    Returns:
        A DataFrame with ``extracted`` and ``expected`` columns containing
        only the entries where the two looked-up rows differ.
    """
    extracted_row = to_validate.loc[col]
    expected_row = to_compare.loc[col]
    # Inequality is symmetric, so one mask selects from both rows.
    mismatched = extracted_row != expected_row
    return pd.DataFrame(
        {
            "extracted": extracted_row.loc[mismatched],
            "expected": expected_row.loc[mismatched],
        }
    )

In [12]:
if VALIDATE_TOP:
    # Apply clean() to every cell of both frames and key them by pdf_key so
    # the same label selects the matching row in each.
    to_compare = validation_data.set_index("pdf_key").applymap(clean)
    to_validate = output_data[validation_data.columns].set_index("pdf_key").applymap(clean)

    # Despite its name, `col` iterates the pdf_key index labels (one row each).
    for col in to_validate.index:
        validated = compare_output(to_validate, to_compare, col)
        # Print a report only when the mismatch frame holds a truthy value.
        if validated.any().any():
            print(col)
            print(f"{validated.shape[0]} mismatched items.")
            print(validated)
            print("-"*79)

In [13]:
# First 50 rows of the collected Part I tables (evaluates to False outside
# validation mode). NOTE: pd.concat raises ValueError if the list is empty.
VALIDATE_TOP and pd.concat(schedule_f_part_i_rows).head(50)

False

In [14]:
# Last 50 rows of the collected Part I tables (evaluates to False outside
# validation mode).
VALIDATE_TOP and pd.concat(schedule_f_part_i_rows).tail(50)

False

In [15]:
# First 50 rows of the collected Part II tables (evaluates to False outside
# validation mode).
VALIDATE_TOP and pd.concat(schedule_f_part_ii_rows).head(50)

False

In [16]:
# First 50 rows of the collected Part III tables (evaluates to False outside
# validation mode).
VALIDATE_TOP and pd.concat(schedule_f_part_iii_rows).head(50)

False

In [17]:
if VALIDATE_TOP:
    # Always raises AssertionError in validation mode. Presumably a deliberate
    # stop so that "Run All" does not continue into the single-filing Schedule F
    # cells below -- confirm.
    assert False

In [18]:
# Load the data for one hard-coded job id; every cell below works on this
# single filing.
table_test_df = open_df(bucket, "4b6a0febec2c15d1432c6b4f450316397596e0f9ef8700bcd6846361eeac297a")

In [19]:
# Split the test filing's rows by BlockType: LINE rows and WORD rows.
test_lines = table_test_df[table_test_df["BlockType"].eq("LINE")]
test_words = table_test_df[table_test_df["BlockType"].eq("WORD")]
# LINE rows grouped by their Page value.
test_pages = test_lines.groupby("Page")

In [20]:
# Run the top-level (non-table) extraction on the single test filing, using the
# same sequence of calls as the validation loop.
page_map = find_pages(test_lines)
roadmap = create_roadmap(
    test_lines, roadmap_df, page_map
)

row = extract_from_roadmap(
    test_words, test_lines, roadmap, extractor_df, page_map
)
# Placeholder strings stand in for the job_id and pdf_key arguments here.
row = postprocess(row, "foo_2_3_4_5", "bar_2_3_4_5", clean_filing)
row

No match for address in 7/01 6/30 For the 2008 calendar year, or tax year beginning 2008, and ending Please use LIBERTY UNIVERSITY FOUNDATION IRS label or type or print 1971 UNIVERSITY BLVD See LYNCHBURG, VA 24502 specific Instruc- tions G
No match for city in G
No match for grants_us_govt_orgs_prog_service in expenses general
No match for grants_us_govt_orgs_mgmt_and_general in expenses
No match for grants_us_govt_orgs_fundraising in expenses


field_name,name,address,city,state,zip,website,gross_receipts,year_formation,state_of_domicile,mission,...,activities_per_region_totals_number_of_offices,activities_per_region_totals_number_of_employees,activities_per_region_totals_total_expenditure,total_number_recipient_foreign_orgs_listed_as_charities,total_number_other_recipient_foreign_orgs_entities,job_id,pdf_key,ein,year,filing_id
0,LIBERTY UNIVERSITY FOUNDATION,,,VA,24502,,222685,2001,DC,"Receive, administer and expend funds for Chani...",...,,,,,,foo_2_3_4_5,bar_2_3_4_5,2,4,2_4


In [27]:
# Select the Page value of every LINE on the page mapped as "Page 1" whose text
# contains one of the listed phrases; an empty Series means none matched.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, i.e. it was run after the cells that follow it.
test_lines.loc[
    (test_lines["Page"] == page_map["Page 1"])
    & test_lines["Text"].str.contains(
        "Net rental income|Direct public|2007 calendar"
    ),
    "Page",
]

Series([], Name: Page, dtype: int64)

In [21]:
# Choose which Schedule F table the cells below debug; switch both values
# together to the PART_II_* or PART_III_* constants to debug another table.
HEADER = PART_I_HEADER
NAME = PART_I_TABLE_NAME

In [22]:
# Join each page's LINE text into one space-separated string and pass those
# per-page strings, with the header regex, to find_table_pages.
# (The lambda's `words` argument is the group's "Text" values of LINE rows.)
table_pages = find_table_pages(
    test_pages["Text"].agg(lambda words: " ".join(words)),
    HEADER,
)

In [23]:
# Inspect the result of find_table_pages; an empty Series means no page was
# returned for HEADER.
table_pages

Series([], Name: Page, dtype: int64)

In [24]:
# TEST_INDEX: positional (.iloc) index into tablemaps / rows in later cells.
# TEST_PAGE: page argument passed to the TableExtractor methods in later cells.
# NOTE(review): both are hard-coded for a particular filing; TEST_INDEX must be
# smaller than the number of entries in table_pages or .iloc raises IndexError.
TEST_INDEX = 1
TEST_PAGE = 28

In [25]:
# One row per entry of table_pages: the page value alongside the tablemap that
# create_tablemap builds for that page and table NAME, with NaN entries dropped.
tablemaps = pd.DataFrame(
    {
        "page": table_pages,
        "tablemap": table_pages.map(
            lambda page: create_tablemap(
                test_lines, schedule_f_tablemap_df, page, NAME
            ).dropna()
        ),
    }
)

In [26]:
# Inspect the tablemap at position TEST_INDEX. This raises IndexError when
# tablemaps has TEST_INDEX or fewer rows -- e.g. when table_pages is empty.
tablemaps["tablemap"].iloc[TEST_INDEX]

IndexError: single positional indexer is out-of-bounds

In [None]:
# Row-extractor entries whose "table" value equals NAME.
row_extractors = schedule_f_row_extractor_df.loc[
    schedule_f_row_extractor_df["table"] == NAME
]
# First table-extractor entry for NAME (.iloc[0] raises IndexError if none match).
table_data = schedule_f_table_extractor_df.loc[
    schedule_f_table_extractor_df["table"] == NAME
].iloc[0]

In [None]:
# Add an "extractor" column holding one TableExtractor per tablemap, configured
# from table_data and row_extractors.
# NOTE(review): the name `rows` is rebound to a plain list by the row-break
# walkthrough cell further down; re-run this cell before using rows["extractor"]
# again.
rows = tablemaps.assign(
    extractor=tablemaps["tablemap"].map(
        lambda tablemap: TableExtractor(
            header_top_label=table_data["header_top"],
            top_label=table_data["table_top"],
            bottom_label=table_data["table_bottom"],
            tablemap=tablemap,
            fields=row_extractors["field"],
            field_labels=row_extractors["col_left"]
        )
    )
)

In [None]:
# The extractor under test: the one at position TEST_INDEX.
test_extractor = rows["extractor"].iloc[TEST_INDEX]

In [None]:
# Inspect the column spans the extractor computes for TEST_PAGE.
test_extractor.get_col_spans(test_words, TEST_PAGE)

In [None]:
# Inspect the field labels (set from the row extractors' "col_left" values).
test_extractor.field_labels

In [None]:
# Inspect the header-top label (set from table_data["header_top"]).
test_extractor.header_top_label

In [None]:
# Run the extractor's own row extraction on TEST_PAGE for inspection.
extracted = test_extractor.extract_rows(test_words, TEST_PAGE)

In [None]:
# Preview the first 25 extracted rows.
extracted.head(25)

In [None]:
# Words the extractor returns for the table on TEST_PAGE; input to the
# clustering and row-break cells below.
table_words = test_extractor.get_table_words(test_words, TEST_PAGE)

In [None]:
# Cluster the table words on "Midpoint_Y", passing the smallest word height as
# the second argument (presumably the clustering tolerance -- confirm against
# cluster_words).
word_clusters = cluster_words(table_words, table_words["Height"].min(), "Midpoint_Y")
# Show each cluster as text, words ordered by "Left" (here `word` is a whole
# cluster, not a single word).
[" ".join(word.sort_values(by="Left")["Text"].values) for word in word_clusters]

In [None]:
# Inspect the raw clusters.
word_clusters

In [None]:
def columnize(word_cluster, col_spans):
    """Split one cluster of words into per-column groups.

    Each element of ``col_spans`` is unpacked as the two bounds of a column; a
    word is assigned to that column when its ``Right`` value lies in the
    interval that excludes the lower bound and includes the upper bound.

    Note: once this cell runs, this definition replaces the ``columnize``
    imported from ``parse_990_textract.utils`` for the rest of the notebook.

    Args:
        word_cluster: DataFrame of words with a ``Right`` column.
        col_spans: Series of column bounds.

    Returns:
        A Series, indexed like ``col_spans``, whose values are the matching
        subsets of ``word_cluster``.
    """
    def words_in(span):
        in_span = word_cluster["Right"].between(*span, inclusive="right")
        return word_cluster.loc[in_span]

    return col_spans.map(words_in)


def get_cluster_coords(cluster):
    """Summarise the extent of a group of words as a dict of coordinates.

    Args:
        cluster: DataFrame of words with ``Left``, ``Right``, ``Top``,
            ``Bottom``, ``Height``, ``Midpoint_X`` and ``Midpoint_Y`` columns.

    Returns:
        A dict with the outer edges (``Left``/``Top`` minima, ``Right``/
        ``Bottom`` maxima), the largest ``Height``, the median midpoints and
        ``Width`` (``Right - Left``). An empty cluster yields NaN values,
        which callers use to detect empty columns.
    """
    left = cluster["Left"].min()
    right = cluster["Right"].max()
    return {
        "Left": left,
        "Right": right,
        "Height": cluster["Height"].max(),
        "Midpoint_X": cluster["Midpoint_X"].median(),
        "Midpoint_Y": cluster["Midpoint_Y"].median(),
        "Top": cluster["Top"].min(),
        # Lower edge of the group is the largest Bottom; min() would report the
        # bottom of the highest-sitting word and overstate the gap to the next row.
        "Bottom": cluster["Bottom"].max(),
        "Width": right - left,
    }


def combine_row(row):
    """Merge the pieces collected for one table row into per-column text.

    ``row`` is iterated as a sequence of "lines"; in this notebook each one is
    a columnize() result re-indexed by the extractor's fields. For every line,
    each column's words are ordered by "Left" and joined with spaces; the
    per-line strings are then concatenated column by column and stripped.

    NOTE(review): relies on how pandas handles Series whose elements are
    themselves DataFrames/Series (map, agg, sum) -- verify after upgrades.
    """
    return pd.Series([
        # Per column: the words' "Text" values in left-to-right order.
        line.map(
            lambda x: x.sort_values(
                by="Left"
            ).reset_index(drop=True)["Text"].fillna("")
        ).agg(
            # Join one column's words into a single string.
            lambda x: " ".join(x.values)
        ) + " "  # trailing space separates this line from the next one
        for line in row
    ]).sum().str.strip()  # concatenate the lines, then trim the padding

# Column spans for TEST_PAGE; consumed by columnize() in the walkthrough below.
col_spans = test_extractor.get_col_spans(test_words, TEST_PAGE)

col_spans

Row break scenarios:
1. Previous cluster and current cluster both have entries in the same numeric column
2. Alignment is TOP and current cluster has non-empty columns that are empty in previous row
3. Alignment is BOTTOM and current cluster has empty columns that are non-empty in previous row
4. The vertical gap between the previous cluster's bottom and the current cluster's top exceeds the Y tolerance (the median word height)

In [None]:
# Row-break walkthrough: replay the word clusters one at a time, printing the
# signals used to decide whether a cluster starts a new table row, and collect
# the combined rows in `rows`.
y_tol = table_words["Height"].median()
columnized = columnize(word_clusters[0], col_spans)
columnized.index = test_extractor.fields
last_col_coords = pd.DataFrame.from_records(
    columnized.map(
        get_cluster_coords
    )
)
# NOTE: this rebinds `rows` (previously the DataFrame holding the extractors)
# to a plain list of combined rows.
rows = []
# Columns treated as numeric for each table; the values are compared against
# the index labels of the coordinate frames built below.
if NAME == PART_I_TABLE_NAME:
    numeric_cols = (1,2,5)
elif NAME == PART_II_TABLE_NAME:
    numeric_cols = (4,6)
elif NAME == PART_III_TABLE_NAME:
    numeric_cols = (2,3,5)
else:
    # Fail loudly rather than silently reusing a numeric_cols value left over
    # from an earlier run of this cell.
    raise ValueError(f"No numeric columns configured for table {NAME!r}")
current_row = [columnized]
# Space between the top of the table and the first cluster.
top_ws = (
    last_col_coords["Top"].min()
    - test_extractor.get_table_top(test_words, TEST_PAGE)
)
print(f"Y tolerance: {y_tol}")
print(f"Top whitespace: {top_ws}")
# A large gap above the first cluster is taken as a sign of bottom alignment;
# otherwise the alignment is inferred later from the first uneven cluster.
if top_ws > y_tol * 4:
    alignment = "BOTTOM"
else:
    alignment = "UNKNOWN"
print(f"Alignment: {alignment}")
print("First cluster:")
print(" ".join(word_clusters[0].sort_values(by="Left")["Text"].values))
for cluster in word_clusters[1:]:
    print("-"*50)
    print(f"Alignment: {alignment}")
    print("Cluster:", " ".join(cluster.sort_values(by="Left")["Text"].values))
    columnized = columnize(cluster, col_spans)
    columnized.index = test_extractor.fields
    col_coords = pd.DataFrame.from_records(columnized.map(get_cluster_coords))
    # Index labels of the columns that contain words (all-NaN rows dropped).
    nonempty = col_coords.dropna().index.to_series()
    last_nonempty = last_col_coords.dropna().index.to_series()
    print("Nonempty\n", nonempty)
    print("Last nonempty\n", last_nonempty)
    # more_cols true if current row has non-empty cells that
    # are empty in the preceding row
    more_cols = (~nonempty.isin(last_nonempty)).any()
    print("More cols:", more_cols)
    # less_cols true if last row has non-empty cells that
    # are empty in current row
    less_cols = (~last_nonempty.isin(nonempty)).any()
    print("Less cols:", less_cols)
    # both_numeric true if both rows have entries in numeric cols
    # (the two masks are combined label by label)
    both_numeric = (
        nonempty.isin(numeric_cols)
        & last_nonempty.isin(numeric_cols)
    ).any()
    # Vertical gap between the previous cluster's bottom and this one's top.
    y_delta = (
        col_coords["Top"].min()
        - last_col_coords["Bottom"].max()
    )
    print("Y delta", y_delta)
    print("Both numeric:", both_numeric)
    if (
        both_numeric
        or (more_cols and (alignment == "TOP"))
        or (less_cols and (alignment == "BOTTOM"))
        or (y_delta > y_tol)
    ):
        # Row break: emit what has been accumulated and start a new row.
        combined_row = combine_row(current_row)
        print(combined_row)
        rows.append(combined_row)
        current_row = [columnized]
    elif less_cols and (alignment == "UNKNOWN"):
        # First uneven continuation line has fewer columns: rows are top-aligned.
        alignment = "TOP"
        current_row.append(columnized)
    elif more_cols and (alignment == "UNKNOWN"):
        # First uneven continuation line has more columns: rows are bottom-aligned.
        alignment = "BOTTOM"
        current_row.append(columnized)
    else:
        current_row.append(columnized)
    last_col_coords = col_coords

# Flush the row still being accumulated when the clusters run out; without
# this the final table row would never be added to `rows`.
combined_row = combine_row(current_row)
print(combined_row)
rows.append(combined_row)

In [None]:
# Words the extractor returns for the table header on TEST_PAGE.
header_words = test_extractor.get_header_words(test_words, TEST_PAGE)

In [None]:
# Last 50 header words when ordered by "Left" (i.e. the right-most ones), with
# their horizontal bounds and vertical midpoint.
header_words.sort_values(by="Left")[["Text", "Left", "Right", "Midpoint_Y"]].tail(50)