# Parsing Debug Notebook

This somewhat messy notebook makes it easier to debug the parser, because we can just rerun the cells needed to set up the debugging process with various parameters.

In [1]:
import json
import math
import re
from pathlib import Path

import boto3
import pandas as pd

from parse_990_textract.bucket import open_df
from parse_990_textract.filing import create_roadmap, extract_from_roadmap
from parse_990_textract.models import BoundingBox, TableExtractor
from parse_990_textract.parse import create_extractors, find_item, find_pages
from parse_990_textract.postprocessing import clean_filing, clean_f_i, clean_f_ii, clean_f_iii, postprocess
from parse_990_textract.setup import load_extractor_df
from parse_990_textract.table import extract_table_data, find_table_pages, create_tablemap
from parse_990_textract.utils import get_coordinate, get_regex, cluster_words, columnize, cluster_x

In [2]:
bucket = boto3.resource("s3").Bucket("s3-ocr-990s-demo")

In [3]:
validation_data = pd.read_csv("validation_data.csv", index_col="job_id").fillna("")
validation_data.head()

Unnamed: 0_level_0,pdf_key,other_expenses_c_mgmt_general,payments_affiliates_total,other_expenses_d_prog_service,total_revenue,fees_for_lobbying_services_total,other_employee_benefits_fundraising,total_fundraising_expenses,compensation_officers_prog_service,fees_for_mgmt_services_total,...,travel_total,pension_plan_acc_contrib_prog_service,advertising_promotion_mgmt_general,pension_plan_acc_contrib_mgmt_general,compensation_disq_persons_total,other_salaries_wages_mgmt_general,office_expenses_mgmt_general,fees_for_other_services_fundraising,total_number_other_recipient_foreign_orgs_entities,activities_per_region_totals_total_expenditure
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4,EIN_760733035_YEAR_2009_FORMTYPE_990.pdf,,,33000.0,1415024,,,4503.0,,,...,18307.0,,,,,,,,,
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc,EIN_363235550_YEAR_2009_FORMTYPE_990.pdf,,,15832.0,9899921,,27161.0,597789.0,295722.0,,...,677722.0,34261.0,30.0,3462.0,,159156.0,48381.0,22350.0,,
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b,EIN_223134995_YEAR_2010_FORMTYPE_990.pdf,0.0,,10823.0,3370595,,12776.0,263765.0,148689.0,2250.0,...,197092.0,,127381.0,,396855.0,103312.0,11723.0,23021.0,,
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82,EIN_264320885_YEAR_2009_FORMTYPE_990.pdf,,,,46117,,,669.0,,,...,,,,,,,458.0,,,
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be,EIN_581943161_YEAR_2009_FORMTYPE_990.pdf,581.0,,0.0,583209,,2521.0,14409.0,298484.0,,...,7233.0,,,,,,4010.0,0.0,,


In [4]:
extractor_df = load_extractor_df("990_extractors.csv")
roadmap_df = load_extractor_df("990_roadmap.csv")
schedule_f_tablemap_df = load_extractor_df("schedule_f_table_roadmap.csv")
schedule_f_table_extractor_df = pd.read_csv("schedule_f_table_extractors.csv")
schedule_f_row_extractor_df = pd.read_csv("schedule_f_row_extractors.csv")

In [5]:
# Regexes that recognise the column headers of each Schedule F table. Every
# alternative matches one header label; ``\s*`` lets the label match whether
# or not there is whitespace between the column letter and the label text.
# The "(f)" alternative of PART_I_HEADER previously lacked ``\s*`` and so could
# not match "(f) Total expenditures" written with a space.
PART_I_HEADER = r"\(a\)\s*Region|\(d\)\s*Activities|\(e\)\s*If activity|\(f\)\s*Total expenditures"
PART_II_HEADER = r"\(b\)\s*IRS code|\(c\)\s*Region|\(d\)\s*Purpose|\(f\)\s*Manner|\(h\)\s*Description"
PART_III_HEADER = r"\(b\)\s*Region|\(e\)\s*Manner of cash|\(h\)\s*Method of va"
# Table names: passed to extract_table_data / create_tablemap and compared
# against the "table" column of the Schedule F extractor CSVs.
PART_I_TABLE_NAME = r"Activities per Region"
PART_II_TABLE_NAME = r"Grants to Organizations Outside the United States"
PART_III_TABLE_NAME = r"Grants to Individuals Outside the United States"

In [6]:
# Result accumulators: one list for the main filing rows and one per
# Schedule F table. They are rebuilt from scratch each time the cell runs.
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []

# One entry per Schedule F table: header regex, table name, the cleaning
# function handed to postprocess, and the list that collects the results.
# Replaces three copy-pasted extract/postprocess/append stanzas.
schedule_f_parts = [
    (PART_I_HEADER, PART_I_TABLE_NAME, clean_f_i, schedule_f_part_i_rows),
    (PART_II_HEADER, PART_II_TABLE_NAME, clean_f_ii, schedule_f_part_ii_rows),
    (PART_III_HEADER, PART_III_TABLE_NAME, clean_f_iii, schedule_f_part_iii_rows),
]

for i, job_id in enumerate(validation_data.index.values):
    # Progress trace, so any diagnostics printed during extraction can be
    # attributed to the filing being processed.
    print(i)
    print(job_id)
    pdf_key = validation_data.at[job_id, "pdf_key"]
    print(pdf_key)

    # Load the OCR blocks for this job and split them by block type.
    data = open_df(bucket, job_id)
    lines = data.loc[data["BlockType"] == "LINE"]
    words = data.loc[data["BlockType"] == "WORD"]
    page_map = find_pages(lines)
    roadmap = create_roadmap(lines, roadmap_df, page_map)

    # Main filing fields.
    row = extract_from_roadmap(words, lines, roadmap, extractor_df, page_map)
    row = postprocess(row, job_id, pdf_key, clean_filing)
    filing_rows.append(row)

    pages = lines.groupby("Page")

    # Schedule F tables, processed in Part I, II, III order as before.
    for header, table_name, cleaner, collected in schedule_f_parts:
        table = extract_table_data(
            pages, lines, words, header, table_name,
            schedule_f_tablemap_df,
            schedule_f_table_extractor_df,
            schedule_f_row_extractor_df,
        )
        table = postprocess(table, job_id, pdf_key, cleaner)
        # Only results that are not None are kept.
        if table is not None:
            collected.append(table)

0
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4
EIN_760733035_YEAR_2009_FORMTYPE_990.pdf


No match for year_formation in L Year of Formation M State of legal domicile
No match for state_of_domicile in M State of legal domicile


1
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc
EIN_363235550_YEAR_2009_FORMTYPE_990.pdf


No match for total_unrelated_biz_revenue in 7a NONE
No match for net_unrelated_biz_taxable_revenue in 7b NONE
No match for benefits_paid_members_expenses in NONE
No match for professional_fundraising_fees_expenses in NONE
No match for grants_foreign_individuals_govt_orgs_total in NONE
No match for benefits_to_members_total in NONE
No match for compensation_disq_persons_total in NONE
No match for fees_for_mgmt_services_total in NONE
No match for fees_for_lobbying_services_total in NONE
No match for fees_for_fundraising_services_total in NONE
No match for fees_for_investment_mgmt_services_total in NONE
No match for royalties_total in NONE
No match for entertainment_travel_govt_officials_total in NONE
No match for interest_total in NONE
No match for payments_affiliates_total in NONE


2
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b
EIN_223134995_YEAR_2010_FORMTYPE_990.pdf
3
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82
EIN_264320885_YEAR_2009_FORMTYPE_990.pdf
4
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be
EIN_581943161_YEAR_2009_FORMTYPE_990.pdf
5
5596f51a999ebbd4cb992f490ceaffcddbac9bce532b911997683ae6897c2797
EIN_231352689_YEAR_2009_FORMTYPE_990.pdf
6
481a62c75776cc7f62075c1d60c230ea01f2dbc5b6224c65cb73db1728e86b00
EIN_954806856_YEAR_2009_FORMTYPE_990.pdf
7
39d270117e4b6354850551c8237421403da7d2b3f5d06657a28f5a9d07febf17
EIN_521238301_YEAR_2010_FORMTYPE_990.pdf


No match for address in Name of organization AMERICAN LIFE LEAGUE, INC. Doing Business As Number and street (or PO box if mail is not delivered to street addr) Room/suite PO BOX 1350 City, town or country State ZIP code + 4 STAFFORD VA 22555
No match for net_unrelated_biz_taxable_revenue in 7b
No match for grants_us_govt_orgs_mgmt_and_general in expenses
No match for grants_us_govt_orgs_fundraising in expenses
No match for total_number_recipient_foreign_orgs_listed_as_charities in which for or
<class 'KeyError'>: '(i) Method'
<class 'KeyError'>: '(c) Number of recipients'


8
9a69ea9d5c8d5397603dbb77d027f4ff4ef9dcdc6609b7a1657f9484529353a0
EIN_521703065_YEAR_2010_FORMTYPE_990.pdf
9
3b9cc21f73f11bef5f2e4c15859661f7f7c9adee8dd6a1f2301099834fdb2926
EIN_611190087_YEAR_2008_FORMTYPE_990.pdf


No match for mission in 1 Briefly describe the organization's mission o most significant activities


10
82588f1ce9ca8cec2fc6b55ec07361b821be6650d1c01db4b9c948b8bf12689f
EIN_113489123_YEAR_2008_FORMTYPE_990.pdf


No match for website in J Website: H(c)
No match for gross_receipts in G Gross receipts $
No match for year_formation in L Year of formation. M State of legal domicile:
No match for state_of_domicile in M State of legal domicile:
No match for total_unrelated_biz_revenue in 7a
No match for net_unrelated_biz_taxable_revenue in 7b
No match for total_fundraising_expenses in b Total fundraising expenses (Part IX, column (D), line 25)


11
f303b69e79844240beccf4fc5b3cecaa3a4f3024a955b2f4a49dd0a81b456649
EIN_582248383_YEAR_2009_FORMTYPE_990.pdf
12
a2061356d7999388cbd49b79872883c92ce6c81a7e7820788f92db496cedd620
EIN_630329409_YEAR_2009_FORMTYPE_990.pdf


No match for website in J Web site: www faulkneredu


13
6e417b42fc15148e0489456f5086bbac28a8361d3452a0ddc23314afee5b6313
EIN_620988294_YEAR_2010_FORMTYPE_990.pdf
14
e56d65e73cec9532561c42db4f4dc64c5b968441b4d492444292a9daf3921044
EIN_581954432_YEAR_2010_FORMTYPE_990.pdf
15
68a8d4678de1d3107eff3ae0bfa1acdd6a9787a173b49c1b9e3dfbad7de5b452
EIN_474865647_YEAR_2020_FORMTYPE_990.pdf


<class 'KeyError'>: '(c) Number of recipients'


16
cd689dd466e417d074b1bde48b0928cc4ae08d6cca44be9d15f288fe13adb578
EIN_472208314_YEAR_2020_FORMTYPE_990.pdf
17
2ef32905e24a7a69d5bb4e4ac22448b279cbe84df831d57b17daa69df0219dfd
EIN_262414132_YEAR_2009_FORMTYPE_990.pdf
18
b71782c8204cadf98ef57d1e9a6968d35368fc940dede7bc85dff661df77a27e
EIN_411601449_YEAR_2010_FORMTYPE_990.pdf


No match for activities_per_region_totals_number_of_offices in O


19
aefc7b65c34db330d8d9f56a1226e116b63ee9be7dbdd4ae4c7bea5d87359f97
EIN_362428692_YEAR_2009_FORMTYPE_990.pdf
20
a5a3cbfcf844be8862bbb61ad46d4c795891ab1143e420db5ed99fc79eeb66c9
EIN_271377148_YEAR_2016_FORMTYPE_990.pdf
21
67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebdb32d3186b19d6a6fd5
EIN_42103580_YEAR_2010_FORMTYPE_990.pdf
22
a6529e504df346097da99104a353e977426e018cd5ac33b62cd2dd89c90763c5
EIN_311002913_YEAR_2008_FORMTYPE_990.pdf
23
d1925c2d74adaa3f150ded3ce67dfe7ae3a306f0db0289ad5755a28d801b2b0b
EIN_202408857_YEAR_2010_FORMTYPE_990.pdf


No match for activities_per_region_totals_number_of_offices in Act the


24
01d89ee5d14575c1321b2e4d67431d172ba76212b4a266bdaf474275029fd78b
EIN_521830327_YEAR_2009_FORMTYPE_990.pdf


No match for total_fundraising_expenses in b Total fundraising expenses (Part IX, column (D), line 25)
No match for revenue_less_expenses in <43,775.> End of Year


In [7]:
output_data = pd.concat(filing_rows).reset_index(drop=True).set_index("job_id")

In [8]:
output_data.head()

field_name,name,address,city,state,zip,website,gross_receipts,year_formation,state_of_domicile,mission,...,total_functional_expense_fundraising,activities_per_region_totals_number_of_offices,activities_per_region_totals_number_of_employees,activities_per_region_totals_total_expenditure,total_number_recipient_foreign_orgs_listed_as_charities,total_number_other_recipient_foreign_orgs_entities,pdf_key,ein,year,filing_id
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0f908c03383d094f6c1386749189f281d188d5ea0cd64c4e424a5b1aae1650c4,MAKE WAY PARTNERS INC,PO BOX 26367,BIRMINGHAM,OX,26367,www MAKEWAYPARTNERS ORG,1426217,,,EVANGELICAL MISSIOI WORK TO PREVENT AND COMBAT...,...,4503,,,,,,EIN_760733035_YEAR_2009_FORMTYPE_990.pdf,760733035,2009,760733035_2009
bcc635fa2b088a72666c3d534d0221a1c1294f2974a6a304679654069e5cbcdc,THE FEDERALIST SOCIETY FOR LAW AND,"1015 18TH ST., N.W. 425",WASHINGTON,DC,20036,www. FED-SOC.ORG,11033302,1982.0,IL,THE ORGANIZATION'S MISSION IS TO PROMOTE INTEL...,...,597789,,,,,,EIN_363235550_YEAR_2009_FORMTYPE_990.pdf,363235550,2009,363235550_2009
f0545c488bd7ab7e775c25cfa7d050a3ff1f84dea9a3371b8fef236bd3d0624b,Ron Hutchcraft Ministries Inc,PO Box 400,Harnson,AR,20400,www hutchcraft com,3390,1991.0,NJ,To communicate Christ to the lost in their lan...,...,263765,,,,,,EIN_223134995_YEAR_2010_FORMTYPE_990.pdf,223134995,2010,223134995_2010
4f2b05354be9fb6483976694a1ed0494c7c387631c15130bdaaa1a85a1cf4f82,FREEDOM 4 24,21430 TIMBERLAKE ROAD STE 101,LYNCHBURG,VA,24502,freedom424 org,61857,2009.0,VA,Freedom 424s mission is to provide a pathway t...,...,669,,,,,,EIN_264320885_YEAR_2009_FORMTYPE_990.pdf,264320885,2009,264320885_2009
1dd5dc37f4e99e27668c769d04d0f72273a6c582d504bf0b88d35d5a4c15c1be,GEORGIA PUBLIC POLICY FOUNDTION,6100 LAKE FORREST DR,LAKE FORREST,GA,30328,www GPPF org,583209,1991.0,GA,To further goals of economic growth & individu...,...,14409,,,,,,EIN_581943161_YEAR_2009_FORMTYPE_990.pdf,581943161,2009,581943161_2009


In [9]:
def clean(x):
    """Reduce a value to its digits so extracted and expected cells compare as text.

    The value is converted with ``str``, any ".0" that is followed by a word
    boundary is removed (e.g. the tail of ``33000.0``), and every remaining
    non-digit character is deleted.

    Note that signs, decimal points, separators and letters are all dropped,
    so values differing only in those characters compare equal afterwards.

    Args:
        x: Any value; it is stringified first.

    Returns:
        str: The digits left after cleaning (possibly the empty string).
    """
    text = str(x)
    text = re.sub(r"\.0\b", "", text)
    # Raw string: "\D" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.sub(r"\D", "", text)

In [10]:
def compare_output(to_validate, to_compare, col):
    """Return the fields in which one row differs between two frames.

    Args:
        to_validate: Frame of extracted values; row ``col`` is looked up
            with ``.loc``.
        to_compare: Frame of expected values with the same column labels.
        col: Row label to compare (the caller passes values from
            ``to_validate.index``).

    Returns:
        pandas.DataFrame with columns "extracted" and "expected", holding
        only the fields whose values are not equal.
    """
    extracted_row = to_validate.loc[col]
    expected_row = to_compare.loc[col]
    differs = extracted_row != expected_row
    mismatches = {
        "extracted": extracted_row.loc[differs],
        "expected": expected_row.loc[differs],
    }
    return pd.DataFrame(mismatches)

In [11]:
# Normalise both frames with `clean` and key them by PDF file name so they
# can be compared cell by cell.
to_compare = validation_data.set_index("pdf_key").applymap(clean)
to_validate = (
    output_data[validation_data.columns]
    .set_index("pdf_key")
    .applymap(clean)
)

# Despite its name, `col` holds a pdf_key here: one filing per iteration.
for col in to_validate.index:
    validated = compare_output(to_validate, to_compare, col)
    if not validated.any().any():
        continue
    print(col)
    print(f"{validated.shape[0]} mismatched items.")
    print(validated)
    print("-" * 79)

EIN_113489123_YEAR_2008_FORMTYPE_990.pdf
2 mismatched items.
              extracted expected
total_revenue      1017  1017506
travel_total        137    13710
-------------------------------------------------------------------------------
EIN_582248383_YEAR_2009_FORMTYPE_990.pdf
1 mismatched items.
                                                   extracted expected
total_number_other_recipient_foreign_orgs_entities        10         
-------------------------------------------------------------------------------


In [12]:
# Always raises AssertionError. Presumably a deliberate stop so that
# "Run All" halts before the scratch cells below -- confirm before removing.
assert False

AssertionError: 

## BREAK: ad-hoc debugging cells (run manually as needed)

In [None]:
pd.concat(schedule_f_part_i_rows).head(50)[
    [
        "region", "number_offices", "number_employees", 
        "activities_conducted", "specific_type_activity", 
        "total_expenditures", "pdf_key"
    ]
]

In [None]:
pd.concat(schedule_f_part_i_rows).tail(50)

In [None]:
pd.concat(schedule_f_part_ii_rows).shape

In [None]:
pd.concat(schedule_f_part_ii_rows).iloc[16:66]

In [None]:
pd.concat(schedule_f_part_iii_rows).iloc[19:]

In [None]:
table_test_df = open_df(bucket, "67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebdb32d3186b19d6a6fd5")

In [None]:
test_lines = table_test_df.loc[
    table_test_df["BlockType"] == "LINE"
]
test_words = table_test_df.loc[
    table_test_df["BlockType"] == "WORD"
]
test_pages = test_lines.groupby("Page")

In [None]:
HEADER = PART_II_HEADER
NAME = PART_II_TABLE_NAME

In [None]:
table_pages = find_table_pages(
    test_pages["Text"].agg(lambda words: " ".join(words)),
    HEADER,
)

In [None]:
table_pages

In [None]:
TEST_INDEX = 1
TEST_PAGE = 30

In [None]:
tablemaps = pd.DataFrame(
    {
        "page": table_pages,
        "tablemap": table_pages.map(
            lambda page: create_tablemap(
                test_lines, schedule_f_tablemap_df, page, NAME
            ).dropna()
        ),
    }
)

In [None]:
tablemaps["tablemap"].iloc[TEST_INDEX]

In [None]:
row_extractors = schedule_f_row_extractor_df.loc[
    schedule_f_row_extractor_df["table"] == NAME
]
table_data = schedule_f_table_extractor_df.loc[
    schedule_f_table_extractor_df["table"] == NAME
].iloc[0]

In [None]:
rows = tablemaps.assign(
    extractor=tablemaps["tablemap"].map(
        lambda tablemap: TableExtractor(
            header_top_label=table_data["header_top"],
            top_label=table_data["table_top"],
            bottom_label=table_data["table_bottom"],
            tablemap=tablemap,
            fields=row_extractors["field"],
            field_labels=row_extractors["col_left"]
        )
    )
)

In [None]:
rows

In [None]:
result = rows.apply(
    lambda row: row["extractor"].extract_rows(test_words, row["page"]),
    axis=1
).dropna()

In [None]:
test_extractor = rows["extractor"].iloc[TEST_INDEX]

In [None]:
test_extractor.get_col_spans(test_words, TEST_PAGE)

In [None]:
test_extractor.field_labels

In [None]:
extracted = test_extractor.extract_rows(test_words, TEST_PAGE)

In [None]:
extract_table_data(
        test_pages, test_lines, test_words, HEADER, NAME, 
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )

In [None]:
table_words = test_extractor.get_table_words(test_words, TEST_PAGE)

In [None]:
table_words.tail(50)

In [None]:
word_clusters = cluster_words(table_words, table_words["Height"].min(), "Midpoint_Y")
[" ".join(word.sort_values(by="Left")["Text"].values) for word in word_clusters]

In [None]:
def columnize(word_cluster, col_spans):
    """Split a cluster of words into one frame per column span.

    Local debug version; it shadows the ``columnize`` imported from
    ``parse_990_textract.utils`` at the top of the notebook.

    Args:
        word_cluster: DataFrame of words with a "Right" column.
        col_spans: Series of ``(left, right)`` bounds.

    Returns:
        Series aligned with ``col_spans`` whose values are the rows of
        ``word_cluster`` with ``left < Right <= right``.
    """
    def words_in_span(span):
        # Right-inclusive: a word belongs to the span containing its right edge.
        inside = word_cluster["Right"].between(*span, inclusive="right")
        return word_cluster.loc[inside]

    return col_spans.map(words_in_span)


def get_cluster_coords(cluster):
    """Summarise the geometry of a group of words as a dict.

    Args:
        cluster: DataFrame with "Left", "Right", "Height", "Midpoint_X",
            "Midpoint_Y", "Top" and "Bottom" columns.

    Returns:
        dict with keys "Left", "Right", "Height", "Midpoint_X",
        "Midpoint_Y", "Top", "Bottom" and "Width" (Right minus Left).
    """
    left_edge = cluster["Left"].min()
    right_edge = cluster["Right"].max()
    return {
        "Left": left_edge,
        "Right": right_edge,
        "Height": cluster["Height"].max(),
        "Midpoint_X": cluster["Midpoint_X"].median(),
        "Midpoint_Y": cluster["Midpoint_Y"].median(),
        "Top": cluster["Top"].min(),
        # NOTE(review): "Bottom" takes the minimum, like "Top"; if this is
        # meant to be the lowest edge of the cluster, max() may have been
        # intended -- confirm before changing.
        "Bottom": cluster["Bottom"].min(),
        "Width": right_edge - left_edge,
    }


def combine_row(row):
    """Merge several columnized text lines into one row of cell strings.

    ``row`` is iterated as a sequence of ``line`` objects; in this notebook
    the caller passes a list of ``columnize`` results (Series whose values
    are word DataFrames with "Left" and "Text" columns).

    For each line, every cell's words are ordered by "Left" and their "Text"
    values are joined with spaces, followed by a trailing space. The per-line
    results are then added together with ``.sum()`` and the combined strings
    are stripped of surrounding whitespace.

    NOTE(review): relies on ``Series.agg`` applying the join lambda to each
    cell rather than to the Series as a whole -- confirm on the pandas
    version in use.
    """
    return pd.Series([
        # Per cell: words in left-to-right order, missing text as "".
        line.map(
            lambda x: x.sort_values(
                by="Left"
            ).reset_index(drop=True)["Text"].fillna("")
        ).agg(
            lambda x: " ".join(x.values)
        ) + " "  # trailing space separates this line from the next one
        for line in row
    ]).sum().str.strip()

col_spans = test_extractor.get_col_spans(test_words, TEST_PAGE)

col_spans

In [None]:
# Step-by-step trace of grouping word clusters (text lines) into table rows.
# Each cluster is split into columns, and the vertical gap and column
# occupancy relative to the previous cluster decide whether it continues the
# current row or starts a new one.

# Vertical gap threshold: half the median word height in the table.
y_tol = table_words["Height"].median() * .5
columnized = columnize(word_clusters[0], col_spans)
columnized.index = test_extractor.fields
# One record of coordinates per column for the previous cluster.
last_col_coords = pd.DataFrame.from_records(
    columnized.map(
        get_cluster_coords
    )
)
# NOTE(review): this rebinds `rows`, which an earlier cell uses for the
# DataFrame of extractors; re-running those cells after this one will fail.
rows = []
current_row = [columnized]
# Distance between the table top and the topmost word of the first cluster.
top_ws = (
    last_col_coords["Top"].min()
    - test_extractor.get_table_top(test_words, TEST_PAGE)
)
print(f"Y tolerance: {y_tol}")
print(f"Top whitespace: {top_ws}")
# A large blank band above the first cluster is treated as evidence that
# cell text is bottom-aligned; otherwise the alignment is decided later.
if top_ws > y_tol * 3:
    alignment = "BOTTOM"
else:
    alignment = "UNKNOWN"
print(f"Alignment: {alignment}")
print("First cluster:")
print(" ".join(word_clusters[0].sort_values(by="Left")["Text"].values))
# `count` is not used inside the loop.
for count, cluster in enumerate(word_clusters[1:]):
    print("-"*50)
    print(f"Alignment: {alignment}")
    print("Cluster:", " ".join(cluster.sort_values(by="Left")["Text"].values))
    columnized = columnize(cluster, col_spans)
    columnized.index = test_extractor.fields
    col_coords = pd.DataFrame.from_records(columnized.map(get_cluster_coords))
    print(f"Cluster top: {col_coords['Top'].min()}")
    print(f"Last cluster bottom: {last_col_coords['Bottom'].max()}")
    # Vertical gap between the previous cluster and this one.
    y_delta = (
        col_coords["Top"].min()
        - last_col_coords["Bottom"].max()
    )
    print("Y Delta:", y_delta)
    if y_delta > y_tol:
        # Gap too large to be the same row: flush and start a new row.
        print("Y tolerance exceeded")
        combined_row = combine_row(current_row)
        print(combined_row)
        rows.append(combined_row)
        current_row = [columnized]
    else:
        # Positions of the columns that have coordinates (i.e. were not
        # dropped by dropna) in this cluster and in the previous one.
        nonempty = col_coords.dropna().index.to_series()
        last_nonempty = last_col_coords.dropna().index.to_series()
        # delta_cols true if current row has non-empty cells that
        # are empty in the preceding row
        delta_cols = (~nonempty.isin(last_nonempty)).any()
        if not delta_cols and (alignment == "UNKNOWN"):
            # No new columns appeared: treat the table as top-aligned.
            alignment = "TOP"
            current_row.append(columnized)
        elif delta_cols and (alignment == "UNKNOWN"):
            # New columns appeared on a continuation line: bottom-aligned.
            alignment = "BOTTOM"
            current_row.append(columnized)
        elif delta_cols and (alignment == "TOP"):
            # In a top-aligned table, newly filled columns start a new row.
            combined_row = combine_row(current_row)
            print(combined_row)
            rows.append(combined_row)
            current_row = [columnized]
        # (TOP&~delta_cols, BOTTOM&delta_cols, BOTTOM&~delta_cols)
        else:
            current_row.append(columnized)
    # NOTE(review): the lines still in `current_row` when the loop ends are
    # never combined and appended to `rows` in this cell.
    last_col_coords = col_coords            

In [None]:
# Lines on page 26 whose text contains "other)".
# Raw string so that "\)" reaches the regex engine as an escaped parenthesis
# instead of being an invalid string escape sequence.
test_lines.loc[
    test_lines["Text"].str.contains(r"other\)")
    & test_lines["Page"].between(26, 26)
]

In [None]:
test_extractor.get_table_bottom(test_words, TEST_PAGE)

In [None]:
test_words.loc[
    test_words["Text"].str.match("579")
    & test_words["Page"].between(TEST_PAGE, TEST_PAGE)
]

In [None]:
create_tablemap(
    test_lines,
    schedule_f_tablemap_df,
    26,
    NAME
)

In [None]:
test_extractor.tablemap

In [None]:
header_words = test_extractor.get_header_words(test_words, TEST_PAGE)

In [None]:
header_words.sort_values(by="Left")[["Text", "Left", "Right", "Midpoint_Y"]].tail(50)

In [None]:
# Header words ordered left to right, with the horizontal gap between each
# word's left edge and the right edge of the word before it.
# The previous `rolling(2).apply(lambda x: x)` could not work: rolling apply
# functions must return a single value per window.
# NOTE(review): "Gap" is assumed to mean Left minus the previous word's Right.
header_by_left = header_words.sort_values(by="Left")
pd.DataFrame(
    {
        "Gap": header_by_left["Left"] - header_by_left["Right"].shift(),
        "Right": header_by_left["Right"],
        "Text": header_by_left["Text"],
    }
)

In [None]:
def cluster_x(words, tolerance):
    """Group words into horizontal clusters, printing a trace as it goes.

    Debug version with tracing prints; it shadows the ``cluster_x`` imported
    from ``parse_990_textract.utils`` at the top of the notebook.

    Words are visited in order of "Left". A word joins the current group when
    its "Left" is no more than ``tolerance`` beyond the largest "Right" seen
    so far; otherwise it starts a new group.

    Args:
        words: DataFrame with at least "Left" and "Right" columns.
        tolerance: Maximum horizontal gap allowed inside a group. With a
            tolerance of 0, or fewer than two words, every word becomes its
            own group.

    Returns:
        list of DataFrames, one per group, in left-to-right order. The
        single-word early-return path now also yields DataFrames (it used to
        yield lists holding one Series), so callers can index every result
        the same way.
    """
    print("Tolerance", tolerance)
    sorted_words = words.sort_values(by="Left")
    if (tolerance == 0) or (words.shape[0] < 2):
        return [pd.DataFrame([word]) for _, word in sorted_words.iterrows()]
    groups = []
    current_group = [sorted_words.iloc[0]]
    # Right-most edge seen so far.
    last = sorted_words.iloc[0]["Right"]
    for _, word in sorted_words.iloc[1:].iterrows():
        if word["Left"] <= (last + tolerance):
            current_group.append(word)
        else:
            print("New Group")
            print("*"*100)
            groups.append(current_group)
            current_group = [word]
        print("Left:", word["Left"])
        print("Right:", word["Right"])
        # Keep the maximum so a short word nested under a longer one does
        # not pull the running right edge back to the left.
        last = max((last, word["Right"]))
    groups.append(current_group)
    return [pd.DataFrame(group) for group in groups]

In [None]:
left_right = pd.DataFrame(
    {
        "Left": header_words["Left"].sort_values().reset_index(drop=True),
        "Right": header_words["Right"].sort_values().reset_index(drop=True),
    }
)

header_lines = cluster_x(
    left_right, header_words["Width"].min()*.8
)

In [None]:
len(header_lines)

In [None]:
[
    (x["Left"].min(), x["Right"].max())
    for x in header_lines
]

In [None]:
x_clusters = cluster_x(header_words, header_words["Width"].min()*.8)

In [None]:
len(x_clusters)

In [None]:
0.6793086230754852-0.6723970174789429

In [None]:
[
    words.sort_values(by="Left")["Text"].agg(lambda x: " ".join(x.values))
    for words in x_clusters
]

In [None]:
left_bounds = pd.Series(
        cluster["Left"].min() for cluster in x_clusters[1:]
)
right_bounds = pd.Series(
    [cluster["Right"].max() for cluster in x_clusters[:-1]]
)
offsets = right_bounds - left_bounds
full_left = pd.concat([pd.Series([0]), left_bounds + offsets]).reset_index(drop=True)
full_right = pd.concat([right_bounds - offsets, pd.Series([1])]).reset_index(drop=True)
col_spans = full_left.combine(full_right, lambda x, y: (x, y))
col_spans

In [None]:
last_cluster_right = pd.concat([header_words, table_words])["Left"].min()
left_bounds = []
right_bounds = []
for cluster in x_clusters:
    print("Last Cluster Right", last_cluster_right)
    print("Cluster Left", cluster["Left"].min())
    print("Cluster Right", cluster["Right"].max())
    print("*"*50)
    left_bounds.append(last_cluster_right)
    last_cluster_right = max(
        cluster["Right"].max()
        + cluster["Left"].min()
        - last_cluster_right,
        cluster["Right"].max()
    )
    right_bounds.append(last_cluster_right)
col_spans = pd.Series(zip(left_bounds, right_bounds))
col_spans

In [None]:
header_words["Midpoint_X"].round(2).value_counts()

In [None]:
def count_crossing_lines(df, left, right):
    """Count rows that straddle exactly one edge of the span [left, right].

    A row is counted when it starts inside the span and ends beyond
    ``right``, or when it starts before ``left`` and ends inside the span.
    Rows whose "Text" is missing are not counted.

    Args:
        df: DataFrame with "Left", "Right" and "Text" columns.
        left: Lower bound of the span (inclusive).
        right: Upper bound of the span (inclusive).

    Returns:
        The number of such rows.
    """
    starts_inside = df["Left"].between(left, right)
    ends_inside = df["Right"].between(left, right)
    crosses_right_edge = starts_inside & (df["Right"] > right)
    crosses_left_edge = (df["Left"] < left) & ends_inside
    over_right = df.loc[crosses_right_edge, "Text"].count()
    over_left = df.loc[crosses_left_edge, "Text"].count()
    return over_right + over_left


def recalculate_intervals(left_interval, right_interval):
    # Unfinished stub: it only computes the distance from the first element
    # of `left_interval` to the second element of `right_interval`, and
    # returns None.
    # TODO: implement the recalculation or delete this function.
    left_left = left_interval[0]
    right_right = right_interval[1]
    total_span = right_right - left_left
    
    

In [None]:
col_spans.map(
    lambda x: (x[1] + x[0]) / 2
)

In [None]:
table_words.loc[
    table_words["Left"].between(0.12358373403549194, 0.3558424413204193)
    & (table_words["Midpoint_X"] > 0.3558424413204193),
    "Text"
]

In [None]:
left_right = pd.DataFrame(
    {
        "left": header_words["Left"].sort_values().reset_index(drop=True),
        "right": header_words["Right"].sort_values().reset_index(drop=True),
    }
)
left_right.rolling(2).apply(lambda x: print(x, "\n" + "*"*20) or 1)

In [None]:
left_right.head()

In [None]:
1 and 2

In [None]:
(test_words.groupby("Page")["Width"].max() / test_words.groupby("Page")["Height"].mean()).index.values

In [None]:
pd.cut(header_words["Right"], 1000).unique().categories

In [None]:
sorted_header_words = header_words.sort_values(by="Left")

In [None]:
len(x_clusters)

In [None]:
def cluster_x(words, tolerance):
    """Group words by horizontal midpoint, printing a trace as it goes.

    Experimental redefinition: it replaces both the imported ``cluster_x``
    and the Left/Right-based debug version defined earlier in the notebook.

    Words are visited in order of "Midpoint_X". For each word the running
    reference ``last`` is first averaged with the word's midpoint; the word
    joins the current group when its midpoint is within ``tolerance`` of that
    updated reference, otherwise it starts a new group.

    Args:
        words: DataFrame with "Midpoint_X", "Left", "Right" and "Text"
            columns.
        tolerance: Allowed distance between a word's midpoint and the running
            reference. With a tolerance of 0, or fewer than two words, every
            word becomes its own group.

    Returns:
        list of DataFrames, one per group. The single-word early-return path
        now also yields DataFrames (it used to yield lists holding one
        Series), matching the main path.
    """
    sorted_words = words.sort_values(by="Midpoint_X")
    if (tolerance == 0) or (words.shape[0] < 2):
        return [pd.DataFrame([word]) for _, word in sorted_words.iterrows()]
    groups = []
    current_group = [sorted_words.iloc[0]]
    last = sorted_words.iloc[0]["Midpoint_X"]
    for _, word in sorted_words.iloc[1:].iterrows():
        # Move the reference halfway towards the current word before testing.
        last = (last + word["Midpoint_X"]) / 2
        if word["Midpoint_X"] <= (last + tolerance):
            current_group.append(word)
        else:
            print("New Group")
            print("*"*100)
            groups.append(current_group)
            current_group = [word]
            # NOTE(review): the reference is reset to the word's "Right"
            # although everywhere else it tracks "Midpoint_X"; this looks
            # like a leftover from the Left/Right version -- confirm intent.
            last = word["Right"]
        print("Word:", word["Text"])
        print("Left:", word["Left"])
        print("Right:", word["Right"])
    groups.append(current_group)
    return [pd.DataFrame(group) for group in groups]

In [None]:
clusters = cluster_x(header_words, header_words["Width"].mean()*.7)

In [None]:
len(clusters)

In [None]:
header_words.sort_values(by="Midpoint_X").rolling(4)["Midpoint_X"].mean()

In [None]:
left_clusters = cluster_words(header_words, header_words["Width"].mean()*.835, "Left")

In [None]:
len(left_clusters)

In [None]:
[
    cluster["Left"].mean() for cluster in left_clusters
]

In [None]:
def get_col_spans(self, words, page):
    # Empty placeholder: does nothing and returns None. The name and the
    # (words, page) arguments mirror the test_extractor.get_col_spans(...)
    # calls made elsewhere in this notebook.
    pass

In [None]:
test_extractor.tablemap

In [None]:
test_col_spans = pd.Series(
    [
        (0, 0.205227),
        (0.205227, 0.311407),
        (0.311407, 0.420936),
        (0.420936, 0.572975),
        (0.572975, 0.735998),
        (0.735998, 1),
    ]
)

In [None]:
def find_new_right(df, right):
    """Find the left-most start of any row that runs across ``right``.

    A row runs across the boundary when its "Left" is below ``right`` and
    its "Right" exceeds ``right`` by more than 1 %.

    Args:
        df: DataFrame with "Left" and "Right" columns.
        right: Candidate right boundary.

    Returns:
        The smallest "Left" among the crossing rows, or NaN when there are
        none.
    """
    overshoots = df["Right"] > right*1.01
    starts_before = df["Left"] < right
    crossing_lefts = df.loc[overshoots & starts_before, "Left"]
    return crossing_lefts.min()

In [None]:
# For each hand-written span, look on page 18 (header plus table words) for
# words that run across the span's right edge.
# Calls find_new_right: the previously referenced `get_new_right` is not
# defined anywhere in this notebook and raised NameError.
crossing_right = test_col_spans.map(
    lambda x: find_new_right(
        pd.concat([
            test_extractor.get_header_words(test_words, 18),
            test_extractor.get_table_words(test_words, 18),
        ]),
        x[1]
    )
)
crossing_right

In [None]:
test_col_spans.where(
    crossing_right.isna(),
    test_col_spans.combine(crossing_right, lambda x, y: (x[0], y))
)

In [None]:
test_col_spans

In [None]:
init_left = test_extractor.field_labels.map(
    lambda x: get_coordinate(test_extractor.tablemap, x, "Left", "Left_Default")
)
init_right = pd.concat(
    [
        init_left.iloc[1:],
        pd.Series([1]),
    ],
    ignore_index=True,
)

In [None]:
crossing_right = init_right.map(
    lambda x: find_new_right(
        pd.concat([
            test_extractor.get_header_words(test_words, 25),
            test_extractor.get_table_words(test_words, 25),
        ]),
        x
    )
)
crossing_right

In [None]:
new_right = init_right.where(
    crossing_right.isna(),
    crossing_right,
)
new_right

In [None]:
init_left

In [None]:
init_left.iloc[1:] = new_right.iloc[:-1]

In [None]:
init_left

In [None]:
col_spans = init_left.combine(new_right, lambda x, y: (x, y))

In [None]:
# Lines whose text matches "(a)" followed by optional whitespace and "Na".
# Raw string so the regex escapes are not treated as (invalid) string
# escape sequences.
test_lines.loc[
    test_lines["Text"].str.contains(r"\(a\)\s*Na")
]

In [None]:
test_lines.loc[
    test_lines["Text"].str.contains("A")
    & test_lines["Page"].between(34, 34),
    ["Text", "Height", "Top", "Bottom"]
]

In [None]:
page_words = pd.concat([
    test_extractor.get_header_words(test_words, TEST_PAGE),
    table_words
])

In [None]:
page_word_clusters = cluster_words(page_words, page_words["Height"].min(), "Midpoint_Y")

In [None]:
len(page_word_clusters)

In [None]:
test_cluster = page_word_clusters[0].sort_values(by="Left")

In [None]:
page_word_clusters[0]

In [None]:
def make_interval(left, right):
    """Build a closed ``pd.Interval`` from ``left`` to ``right``.

    Returns:
        The interval, or None when ``pd.Interval`` raises ValueError for the
        given bounds (for example when ``left`` is greater than ``right``).
    """
    try:
        interval = pd.Interval(left, right, closed="both")
    except ValueError:
        # Deliberately best-effort: None marks "no usable interval" here.
        return None
    return interval

In [None]:
def get_whitespace(cluster):
    """Return the horizontal gaps around and between the words of a cluster.

    With the words ordered by "Left", each gap runs from the previous word's
    "Right" (0 before the first word) to the next word's "Left" (1 after the
    last word). Gaps for which ``make_interval`` yields None are dropped.

    Args:
        cluster: DataFrame with "Left" and "Right" columns.

    Returns:
        Series of closed ``pd.Interval`` objects.
    """
    ordered = cluster.sort_values(by="Left")
    gap_starts = pd.concat([pd.Series([0]), ordered["Right"]], ignore_index=True)
    gap_ends = pd.concat([ordered["Left"], pd.Series([1])], ignore_index=True)
    gaps = gap_starts.combine(gap_ends, make_interval)
    return gaps.dropna()

In [None]:
whitespaces = get_whitespace(page_word_clusters[0])
whitespaces

In [None]:
all_whitespace = []
for cluster in page_word_clusters:
    all_whitespace.extend(get_whitespace(cluster))
all_whitespace = pd.Series(all_whitespace).sort_values()

In [None]:
# Sweep over the sorted whitespace intervals, narrowing the current interval
# to its intersection with every interval that overlaps it and starting a
# new one when the next interval does not overlap.
curr_interval = all_whitespace.iloc[0]
intervals = []
# Right edges of every interval visited so far.
rights = [all_whitespace.iloc[0].right]
for idx, val in all_whitespace.iloc[1:].items():
    if val.overlaps(curr_interval):
        # Keep only the part shared by both intervals.
        curr_interval = pd.Interval(
            max(curr_interval.left, val.left),
            min(val.right, curr_interval.right),
            closed="both",
        )
    else:
        intervals.append(curr_interval)
        # Earlier right edges that fall inside the new interval cap how far
        # the new current interval may extend.
        overlapping_rights = [i for i in rights if i in val]
        if overlapping_rights:
            # closed="both" matches every other interval built in this
            # notebook; it was omitted here, which silently produced a
            # right-closed interval.
            curr_interval = pd.Interval(
                val.left,
                min(overlapping_rights),
                closed="both",
            )
        else:
            curr_interval = val
    rights.append(val.right)
intervals.append(curr_interval)
intervals

In [None]:
intervals

In [None]:
page_words.loc[
    page_words["Left"].between(intervals[1].left, intervals[1].right, inclusive="neither")
    | page_words["Right"].between(intervals[1].left, intervals[1].right, inclusive="neither")
]

In [None]:
all_whitespace.head(50)

In [None]:
page_words["Right"].min()

In [None]:
2 in pd.Interval(1,4)

In [None]:
min([])

In [None]:
interval = pd.Interval(0, 1, closed="both")
intervals = []
for idx, word in page_words.sort_values(by="Left").iterrows():
    if word["Left"] in interval:
        intervals.append(
            pd.Interval(interval.left, word["Left"], closed="both")
        )
        interval = pd.Interval(word["Right"], interval.right, closed="both")
intervals

In [None]:
test_extractor.get_table_top(test_words, TEST_PAGE)

In [None]:
word_clusters[0]