In [172]:
import json
import math
import re
from pathlib import Path

import boto3
import pandas as pd

from parse_990_textract.bucket import open_df
from parse_990_textract.filing import create_roadmap, extract_from_roadmap
from parse_990_textract.models import BoundingBox, TableExtractor
from parse_990_textract.parse import create_extractors, find_item, find_pages
from parse_990_textract.postprocessing import clean_filing, clean_f_i, clean_f_ii, clean_f_iii
from parse_990_textract.setup import load_extractor_df
from parse_990_textract.table import extract_table_data, find_table_pages, create_tablemap
from parse_990_textract.utils import get_coordinate, get_regex

In [2]:
TEST_JOB_IDS = [
    "67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebdb32d3186b19d6a6fd5",
    #"a6529e504df346097da99104a353e977426e018cd5ac33b62cd2dd89c90763c5",
    #"9947884c88a577277428845f5afe6def65f0105bc543a2d0eaf159ab89a70725",
    #"01d89ee5d14575c1321b2e4d67431d172ba76212b4a266bdaf474275029fd78b",
    #"d1925c2d74adaa3f150ded3ce67dfe7ae3a306f0db0289ad5755a28d801b2b0b",
    #"5cab71fd7354b318e12bd076657e96dd6fd1f890c2a83d23e29b35766cfe11c9",
    #"d601dfbf6590b675c2e3ab8ae0ddab690ad71b4d899973d31c461ddc8fa86943",
    #"74eb93fa3b2a306954a811f75de80ac622f31952d2ba2088268c58348cb5c25b",
    #"c70cae52d0216aeabf997babbbece32608de694cbc74fcf72d1d61a4d13028ed",
    #"71a8cd09a44a93c76e65e3f211d8e97657a816c2c9d3cca61ab12dc830017002",
]

In [3]:
bucket = boto3.resource("s3").Bucket("s3-ocr-990s-demo")

In [184]:
extractor_df = load_extractor_df("990_extractors.csv")
roadmap_df = load_extractor_df("990_roadmap.csv")
schedule_f_tablemap_df = load_extractor_df("schedule_f_table_roadmap.csv")
schedule_f_table_extractor_df = pd.read_csv("schedule_f_table_extractors.csv")
schedule_f_row_extractor_df = pd.read_csv("schedule_f_row_extractors.csv")

In [11]:
PART_I_HEADER = r"\(a\)\s*Region|\(d\)\s*Activities|\(e\)\s*If activity|\(f\)Total expenditures"
PART_II_HEADER = r"\(b\)\s*IRS code|\(c\)\s*Region|\(d\)\s*Purpose|\(f\)\s*Manner|\(h\)\s*Description"
PART_III_HEADER = r"\(b\)\s*Region|\(e\)\s*Manner of cash|\(h\)\s*Method of va"
PART_I_TABLE_NAME = "Activities per Region"
PART_II_TABLE_NAME = r"Grants to Organizations Outside the United States"
PART_III_TABLE_NAME = "Grants to Individuals Outside the United States"

In [6]:
TEST_DFS = [
    open_df(bucket, job_id)
    for job_id in TEST_JOB_IDS
]

In [12]:
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []


for i, data in enumerate(TEST_DFS):
    print(i)
    lines = data.loc[data["BlockType"] == "LINE"]
    words = data.loc[data["BlockType"] == "WORD"]
    page_map = find_pages(lines)
    roadmap = create_roadmap(
        lines, roadmap_df, page_map
    )
    row = extract_from_roadmap(
        words, lines, roadmap, extractor_df, page_map
    )
    filing_rows.append(row)
    pages = lines.groupby("Page")
    part_i_table = extract_table_data(
        pages, lines, words, PART_I_HEADER, PART_I_TABLE_NAME, 
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    if part_i_table is not None:
        schedule_f_part_i_rows.append(
            part_i_table
        )
    part_ii_table = extract_table_data(
        pages, lines, words, PART_II_HEADER, PART_II_TABLE_NAME, 
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    if part_ii_table is not None:
        schedule_f_part_ii_rows.append(
            part_ii_table
        )
    part_iii_table = extract_table_data(
        pages, lines, words, PART_III_HEADER, PART_III_TABLE_NAME, 
        schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
    )
    if part_iii_table is not None:
        schedule_f_part_iii_rows.append(
            part_iii_table
        )

0


<class 'KeyError'>: '(c) Number of recipients'


In [13]:
filing_rows

[field_name
 name                                                       President and Fellows of Harvard College
 address                                                       1033 Massachusetts Avenue Third Floor
 city                                                                                      Cambridge
 state                                                                                            MA
 zip                                                                                           02138
                                                                              ...                   
 activities_per_region_totals_number_of_offices                                                   25
 activities_per_region_totals_number_of_employees                                                305
 activities_per_region_totals_total_expenditure                                           96,031,393
 total_number_recipient_foreign_orgs_listed_as_charities                       

In [17]:
filing_rows[0].iloc[-50:].to_frame().reset_index()

Unnamed: 0,field_name,0
0,payments_affiliates_prog_service,
1,payments_affiliates_mgmt_general,
2,payments_affiliates_fundraising,
3,depreciation_depletion_amortization_total,255661843
4,depreciation_depletion_amortization_prog_service,233291432
5,depreciation_depletion_amortization_mgmt_general,15531457
6,depreciation_depletion_amortization_fundraising,6838954
7,insurance_total,9072572
8,insurance_prog_service,8278722
9,insurance_mgmt_general,551159


In [None]:
assert False

In [185]:
test_lines = TEST_DFS[0].loc[
    lambda df: df["BlockType"] == "LINE"
]
test_words = TEST_DFS[0].loc[
    lambda df: df["BlockType"] == "WORD"
]
test_pages = test_lines.groupby("Page")
part_i_table_pages = find_table_pages(
    test_pages["Text"].agg(lambda words: " ".join(words)), "Schedule F Part I"
)
test_pagemap = find_pages(test_lines)
test_roadmap = create_roadmap(test_lines, roadmap_df, test_pagemap)
test_extractors = create_extractors(extractor_df, test_roadmap, test_pagemap)

In [186]:
part_i_table = extract_table_data(
    test_pages, test_lines, test_words, 
    PART_I_HEADER, PART_I_TABLE_NAME, 
    schedule_f_tablemap_df, schedule_f_table_extractor_df,
    schedule_f_row_extractor_df
)

In [188]:
part_i_table.head(50)

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures
0,Totals,25.0,305,,,96031393
1,Central merica and the Carbbear,,,,,481
2,Central merica the Caribbean,,,,,303788
3,Central merica the anbbea,,,,Academic Support,20973
4,:entral :aribbean America the,,,,Instruction,58431
5,entral merica Caribbear the Central Taribbean ...,,,,Research and other academic activity,228975
6,East Asia and Pacific,,,,,3593
7,East Asia and Pacific,,,,,1269299
8,East Asia and Pacific,,rogr,services,Service Centers,118026
9,East Asia and Pacific,,,Services,Academic Support,461230


In [189]:
create_tablemap(test_lines, schedule_f_tablemap_df, 30)

Unnamed: 0_level_0,Top,Left,Top_Default,Left_Default
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
(a) Region,0.056952,0.063677,0.0,0.0
(b) Number of offices,0.060424,0.167132,0.0,0.0
(c) Number of employees,0.060752,0.254655,0.0,0.0
(d) Activities conducted,0.056882,0.370626,0.0,0.0
(e) Specific type,0.056969,0.480311,0.0,0.0
(f) Total Expenditures,0.056891,0.612973,0.0,0.0
"Schedule F, Part I, Item 3a",,,0.0,0.0
(a) Name of organization,0.725613,0.032893,0.0,0.0
(b) IRS code,0.721736,0.128213,0.0,0.0
(c) Region,0.727929,0.233067,0.0,0.0


In [190]:
part_i_row_extractors = schedule_f_row_extractor_df.loc[
    lambda df: df["table"] == PART_I_TABLE_NAME
]
part_i_table = schedule_f_table_extractor_df.loc[
    lambda df: df["table"] == PART_I_TABLE_NAME
].iloc[0]

In [191]:
test_part_i_extractor = TableExtractor(
    top_label=part_i_table["table_top"],
    top_delta=part_i_table["table_top_delta"],
    bottom_label=part_i_table["table_bottom"],
    bottom_delta=part_i_table["table_bottom_delta"],
    row_margin=part_i_table["row_margin"],
    index_col_left_label=part_i_table["index_col_left"],
    index_col_left_delta=part_i_table["index_col_left_delta"],
    index_col_right_label=part_i_table["index_col_right"],
    index_col_right_delta=part_i_table["index_col_right_delta"],
    tablemap=create_tablemap(test_lines, schedule_f_tablemap_df, 30).dropna(),
    row_extractors=part_i_row_extractors,
    fields=part_i_row_extractors["field"],
)

In [196]:
table_words.loc[
    table_words["Text"].str.contains("Prog")
]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Polygon,Height,Left,Top,Right,Bottom,Width,Children,Line_No,File,Line_No2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
fdfdd2bc-ca30-49db-bd60-2b5da7ef566e,WORD,30,Program services,PRINTED,"[{'X': 0.33881068229675293, 'Y': 0.10080314427...",0.003579,0.338811,0.100803,0.424993,0.104382,0.086182,,14,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.0967, 0.103]"
02e068a8-4642-42a9-b1c6-75120f44de08,WORD,30,Program Services,PRINTED,"[{'X': 0.33822759985923767, 'Y': 0.10974000394...",0.00337,0.338228,0.10974,0.427858,0.11311,0.089631,,16,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.103, 0.11]"
67dd66d8-def2-47ff-bd2d-3168c745897a,WORD,30,Program services,PRINTED,"[{'X': 0.33902767300605774, 'Y': 0.11882317066...",0.003429,0.339028,0.118823,0.425879,0.122253,0.086851,,17,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.116, 0.123]"
06168896-0687-43f2-b52f-14117cde61af,WORD,30,Progr,PRINTED,"[{'X': 0.33977001905441284, 'Y': 0.17078070342...",0.00367,0.33977,0.170781,0.364665,0.174451,0.024895,,23,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.169, 0.175]"
5a6570b5-deba-4f1d-ad10-4a829bff7449,WORD,30,Program Services,PRINTED,"[{'X': 0.33891963958740234, 'Y': 0.17955258488...",0.003526,0.33892,0.179553,0.428889,0.183078,0.08997,,24,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.175, 0.182]"
97b2aefa-53ee-4dd7-98c1-f4d7ce1c83df,WORD,30,Progr,PRINTED,"[{'X': 0.33948445320129395, 'Y': 0.18799027800...",0.003371,0.339484,0.18799,0.364738,0.191361,0.025253,,25,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.182, 0.188]"
e452fb87-5b95-4fad-9842-5312ead676f4,WORD,30,Programservices,PRINTED,"[{'X': 0.33911728858947754, 'Y': 0.19722262024...",0.003651,0.339117,0.197223,0.428008,0.200874,0.08889,,26,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.195, 0.202]"
bb5f3d08-32fa-4074-b367-3769f16432c3,WORD,30,Program,PRINTED,"[{'X': 0.33959928154945374, 'Y': 0.23233115673...",0.003489,0.339599,0.232331,0.383929,0.23582,0.04433,,30,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.228, 0.234]"
4ebb4ab5-7e91-4bfb-b4e4-bb38001d9f10,WORD,30,Progr,PRINTED,"[{'X': 0.3418879508972168, 'Y': 0.240975663065...",0.003312,0.341888,0.240976,0.364684,0.244287,0.022796,,31,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.241, 0.247]"
8e0b38d6-18f7-40b0-84aa-4c0d7357fa74,WORD,30,Progr,PRINTED,"[{'X': 0.33994948863983154, 'Y': 0.25012984871...",0.00307,0.339949,0.25013,0.363776,0.2532,0.023827,,32,67217e04d83f69ffbbb461b0a23648b037b4a36d0c5ebd...,"(0.247, 0.254]"


In [195]:
table_words = test_words.loc[
    (test_words["Top"] > test_part_i_extractor.table_top)
    & (test_words["Bottom"] < .73)
    & (test_words["Page"] == 30)
].copy()
table_words["Line_No2"] = pd.cut(
    table_words["Top"], 
    int(
        0.5*(.73 - test_part_i_extractor.table_top) 
        / table_words["Height"].mean()
    )
)

In [114]:
table_words["Top"].sort_values().rolling(2).apply(lambda x: x.max() - x.min()).describe()

count    5.210000e+02
mean     1.245528e-03
std      2.490257e-03
min      1.192093e-07
25%      4.117191e-05
50%      9.840727e-05
75%      2.825856e-04
max      1.140487e-02
Name: Top, dtype: float64

In [117]:
pd.Series(
    range(
        int(
            (.73 - test_part_i_extractor.table_top) / row_gap
        )
    )
) * row_gap

0       0.000000
1       0.000098
2       0.000197
3       0.000295
4       0.000394
          ...   
6651    0.654458
6652    0.654557
6653    0.654655
6654    0.654754
6655    0.654852
Length: 6656, dtype: float64

In [164]:
row_gap = 3.825856e-03
rows = []
row = []
last_top = table_words["Top"].iloc[0]
for word in table_words.sort_values(.itertuples():
    if word.Top - last_top > row_gap:
        rows.append(row)
        row = []
    row.append(word.Text)
    last_top = word.Top

In [181]:
table_words["NewTop"] = table_words["Top"].round(int(abs(math.log(table_words["Height"].mean(), 10))))

In [183]:
table_words.groupby("NewTop")["Text"].agg(lambda x: " ".join(x)).head(50)

NewTop
0.08    located the region) Central merica and the Fun...
0.09      Central Carbbear merica the Grantmaking 303,788
0.10    Caribbean Central merica the Program services ...
0.11    :entral :aribbean America the Program Services...
0.12    entral merica Research and other 228,975 Carib...
0.13             Central Taribbean merica the Investments
0.14    East Asia and Pacific Fundraising 3,593 1,269,299
0.15    East Asia and Pacific Grantmaking East Asia an...
0.16    East Asia and Pacific rogram Services Academic...
0.17    East Asia and the Pacific Progr services Insti...
0.18    East Asia and Pacific Program Services instruc...
0.19    East Asia and Pacific 44 Progr Services Resear...
0.20    East Asia and Pacific Programservices Student ...
0.21    East Asia and Pacific Investments Europe Inclu...
0.22             Greenland) Iceland Grantmaking 5,351,568
0.23    Europe Including Iceland Program Services Serv...
0.24    freenland Europe Greenland Including Iceland 8...
0.25   

In [170]:
rows[3]

['merica', 'the', 'Grantmaking', '303,788']

In [118]:
row_gap = 9.84*10**-5
pd.cut(
    table_words["Top"],
    pd.Series(
        range(
            int(
                (.73 - test_part_i_extractor.table_top) / row_gap
            )
        )
    ) * row_gap
)

Id
ee25f092-fbd0-457d-8172-714915b18904    (0.077047, 0.077146]
f0b00ce5-6cfb-4c56-a5e2-1892e1d82f2d    (0.077047, 0.077146]
6f17b9fd-5937-4bf0-a004-63f84ada93e7    (0.082754, 0.082853]
33733a90-daf3-45cd-be33-db2b819aa02c    (0.082853, 0.082951]
b6c12310-3590-4462-8871-7b8cb2f8c636    (0.082853, 0.082951]
                                                ...         
681d3a2b-eb3c-49a0-bc67-0428f76fc234                     NaN
852701a0-ba50-4346-9296-cd5a50f55257                     NaN
fd7675aa-3acb-4b20-89e6-729e753d9168                     NaN
30da3627-2008-447f-9b95-4f4e6b67583a                     NaN
64f9c6a7-8c78-4f0d-a535-a9cd3f6f1695                     NaN
Name: Top, Length: 522, dtype: category
Categories (6655, interval[float64, right]): [(0.0, 9.84e-05] < (9.84e-05, 0.0001968] < (0.0001968, 0.0002952] < (0.0002952, 0.0003936] ... (0.65446, 0.65456] < (0.65456, 0.65466] < (0.65466, 0.65475] < (0.65475, 0.65485]]

In [113]:
table_words.groupby("Line_No2")["Text"].agg(lambda words: " ".join(words))

Line_No2
(0.0764, 0.0836]    located the region) Central merica and the Fun...
(0.0836, 0.0902]                                             Carbbear
(0.0902, 0.0967]     Central merica the Grantmaking 303,788 Caribbean
(0.0967, 0.103]     Central merica the Program services Academic S...
(0.103, 0.11]       anbbea :entral America the Program Services In...
                                          ...                        
(0.693, 0.7]        Sub- Saharan Africa Program services Research ...
(0.7, 0.706]             Saharan Africa Investments academic activity
(0.706, 0.713]                                                       
(0.713, 0.719]      Form 990 Schedule F Part II Grants or Entities...
(0.719, 0.726]      (b)IRS code (g) Amount of non- (h) Description...
Name: Text, Length: 99, dtype: object

In [30]:
test_part_i_extractor.table_top, test_part_i_extractor.table_bottom

(0.07500000000000001, 0.99)

In [102]:
test_part_i_extractor.get_row_spans(test_words, 30).tail(50)

Unnamed: 0,row_top,row_bottom
26,0.338497,0.347869
27,0.347869,0.365716
28,0.365716,0.374474
29,0.374474,0.382877
30,0.382877,0.391794
31,0.391794,0.400191
32,0.400191,0.408747
33,0.408747,0.4174
34,0.4174,0.426163
35,0.426163,0.443565


In [194]:
test_part_i_extractor.extract_rows(test_words, 30).head(50)

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures
0,Central merica and the Carbbear,,,,,481.0
1,Central merica the Caribbean,,,,,303788.0
2,Central merica the anbbea,,,,Academic Support,20973.0
3,:entral :aribbean America the,,,,Instruction,58431.0
4,entral merica Caribbear the Central Taribbean ...,,,,Research and other academic activity,228975.0
5,East Asia and Pacific,,,,,3593.0
6,East Asia and Pacific,,,,,1269299.0
7,East Asia and Pacific,,rogr,services,Service Centers,118026.0
8,East Asia and Pacific,,,Services,Academic Support,461230.0
9,East Asia and the Pacific,,Progr,services,Institutiona support,71217.0


In [70]:
test_part_i_extractor.row_extractors

Unnamed: 0,field,col_left,left_delta,col_right,right_delta,table
0,region,Top Left Corner,0.0,(b) Number of offices,0,Activities per Region
1,number_offices,(b) Number of offices,-0.01,(c) Number of employees,0,Activities per Region
2,number_employees,(c) Number of employees,-0.01,(d) Activities conducted,0,Activities per Region
3,activities_conducted,(d) Activities conducted,-0.01,(e) Specific type,0,Activities per Region
4,specific_type_activity,(e) Specific type,-0.01,(f) Total Expenditures,0,Activities per Region
5,total_expenditures,(f) Total Expenditures,-0.01,Bottom Right Corner,0,Activities per Region


In [192]:
test_part_i_extractor.get_col_span(
    "(c) Number of employees", "(d) Activities conducted"
)

(0.25, 0.37)

In [52]:
test_part_i_extractor.get_row_spans(test_words, 30).iloc[45]

row_top       0.531742
row_bottom    0.540419
Name: 45, dtype: float64

In [100]:
test_part_i_extractor.get_index_col_span()

(0.61, 1.0)

In [32]:
test_part_i_extractor.extract_row(
    test_words, 
    30,
    test_part_i_extractor.get_row_spans(test_words, 30).iloc[11]
)

0               East Asia and Pacific
1                                    
2                            44 Progr
3                            Services
4    Research Academic Activity other
5                           6,420,156
dtype: object

In [45]:
bb = BoundingBox(
    left=0.16,
    left_delta=-0.03,
    top=0.187750,
    top_delta=-0.01,
    right=0.25,
    right_delta=0,
    bottom=0.197107,
    bottom_delta=0.01,
)
bb.get_text_in_box(test_words, 30)

''

In [82]:
page_words["Height"].mean()

0.0033432768477079464

In [80]:
page_words = test_words.loc[
    test_words["Page"] == 30
]

page_words.loc[
    page_words["Text"].str.contains("urop|eenla"),
    "Top"
]

Id
c6478b7a-58cb-4bfb-b0ab-ccc3145cfe33    0.214395
97e5b0e8-da82-4a78-91ac-e901c1966d4a    0.218660
1d36838a-ce95-4a4f-85ba-ec16621f4a9c    0.231943
f54e512e-59bd-42f7-9ff1-ee9d79b1c2c0    0.236210
75042c6d-992f-4f82-ac4e-95a9e6a55e7b    0.241169
4adddbf0-56a9-4967-9c7e-59275f9a7871    0.244966
54c7ae31-c12a-4131-8c3d-3fc4ed85482d    0.253953
b627d912-424f-4397-a597-aca428554617    0.259386
d1a14086-6824-4d6d-a17b-0613399bc110    0.262851
514db85a-17a0-469a-b522-9f8caff4238b    0.271945
1b68b285-5394-44f1-9a13-eb2ec9d76a19    0.267809
a0291324-8670-4c78-87e7-6570304085a9    0.276868
ef8d4545-fd28-405d-b50c-8dd243103f7e    0.281122
c9702555-2c52-4bc1-ab0c-c4d3f44182e4    0.285709
8b6365f8-db4d-4403-a6db-015280fa1855    0.289733
Name: Top, dtype: float64

In [72]:
means = page_words.loc[
    page_words["Text"].str.contains("urop|eenla"),
    "Top"
].rolling(2).mean().dropna().reset_index(drop=True)

In [77]:
ranges = page_words.loc[
    page_words["Text"].str.contains("urop|eenla"),
    "Top"
].rolling(2).apply(lambda x: x.max() - x.min()).dropna().reset_index(drop=True)

In [81]:
ranges

0     0.004265
1     0.013283
2     0.004268
3     0.004958
4     0.003797
5     0.008987
6     0.005433
7     0.003465
8     0.009095
9     0.004137
10    0.009059
11    0.004254
12    0.004587
13    0.004023
Name: Top, dtype: float64

In [75]:
means+stds

0     0.219544
1     0.234694
2     0.237094
3     0.242196
4     0.245752
5     0.255814
6     0.260512
7     0.263569
8     0.273829
9     0.272802
10    0.278744
11    0.282003
12    0.286659
13    0.290566
Name: Top, dtype: float64

In [58]:
page_words.groupby("Line_No")["File"].count().value_counts()

6     14
4     13
9     13
7     12
8     10
3      6
2      5
5      5
11     4
10     3
12     2
16     2
14     2
1      1
17     1
18     1
20     1
22     1
Name: File, dtype: int64

In [102]:
test_part_i_extractor.get_row_spans(test_words, 30).head(50)

Unnamed: 0,row_top,row_bottom
0,0.082498,0.091744
1,0.091744,0.100374
2,0.100374,0.109502
3,0.109502,0.117976
4,0.117976,0.13604
5,0.13604,0.144817
6,0.144817,0.153323
7,0.153323,0.161888
8,0.161888,0.170387
9,0.170387,0.179089


In [31]:
test_extractors.iloc[-5]

Extractor(name='activities_per_region_totals_number_of_offices', strategy='words', page=26, bounding_box=BoundingBox(left=0.2, left_delta=0.0, top=0.96, top_delta=-0.001, right=0.31, right_delta=0.01, bottom=0.96, bottom_delta=0.02), regex=re.compile('(?P<match>\\d+[\\d,.]*\\b|\\b[ooIL]\\b)'))

In [32]:
test_extractors.iloc[-5].bounding_box.get_text_in_box(test_words, 26)

'25'

In [None]:
test_extractors.iloc[28].extract(test_words, test_lines)

In [21]:
BoundingBox(
    left=0, 
    left_delta=0, 
    top=0.96, 
    top_delta=-0.001, 
    right=1, 
    right_delta=0.01, 
    bottom=.96, 
    bottom_delta=0.02
).get_text_in_box(test_words, 26)

'25 305 96,031,393 Totals'

In [None]:
test_words.loc[
    test_words["Text"].str.contains("244")
    & (test_words["Page"] == 2)
]

In [None]:
BoundingBox(
    left=0,
    left_delta=0,
    top=.21,
    top_delta=0,
    right=1,
    right_delta=0,
    bottom=.25,
    bottom_delta=0,
).get_text_in_box(test_words, 2)

In [None]:
test_words.loc[test_words["Text"].str.contains("H\(a\)")]

In [30]:
test_roadmap.tail(50)

Unnamed: 0_level_0,Top,Left,Top_Default,Left_Default
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Part IX, Item 11b",0.429712,0.026196,0.43,0.1
"Part IX, Item 11c",0.447892,0.025837,0.45,0.1
"Part IX, Item 11d",0.465651,0.025995,0.46,0.1
"Part IX, Item 11e",0.483246,0.025963,0.49,0.1
"Part IX, Item 11f",0.501698,0.047581,0.49,0.1
"Part IX, Item 11g",0.519664,0.025652,0.51,0.1
"Part IX, Item 12",0.537709,0.047724,0.53,0.1
"Part IX, Item 13",0.55537,0.048127,0.55,0.1
"Part IX, Item 14",0.57368,0.047936,0.57,0.1
"Part IX, Item 15",0.591688,0.01286,0.59,0.1


## BREAK

In [88]:
page_words["Line_No2"] = pd.cut(page_words["Top"], int(1 / page_words["Height"].mean()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page_words["Line_No2"] = pd.cut(page_words["Top"], int(1 / page_words["Height"].mean()))


In [92]:
page_words.groupby("Line_No2")["Text"].agg(lambda words: " ".join(words)).head(50)

Line_No2
(0.00142, 0.00571]                                      Additional Data
(0.00571, 0.009]                                                       
(0.009, 0.0123]                                                        
(0.0123, 0.0156]                                                       
(0.0156, 0.0189]                                           Software ID:
(0.0189, 0.0222]                                                       
(0.0222, 0.0255]                                      Software Version:
(0.0255, 0.0288]                                                       
(0.0288, 0.0321]                                        EIN: 04-2103580
(0.0321, 0.0354]         Name: President and Fellows of Harvard College
(0.0354, 0.0387]                                                       
(0.0387, 0.042]                                                        
(0.042, 0.0453]                                                        
(0.0453, 0.0486]                                       

In [87]:
pd.cut(page_words["Top"], int(1 / page_words["Height"].mean()))

Id
d3a1c1bd-c3c4-4e73-890a-bac0bf76f386    (0.00142, 0.00571]
ee179b8a-9281-4f9f-a45d-f12360ec4060    (0.00142, 0.00571]
3d402eba-f5ab-452d-9627-9d1017f17b8f      (0.0156, 0.0189]
17bf6f46-936a-4fba-bbce-826528e88a97      (0.0156, 0.0189]
60eed702-c656-4f8a-9013-59aa4351f047      (0.0222, 0.0255]
                                               ...        
1cc5a3b3-919e-4aa2-b41b-43fecfa555a2        (0.985, 0.988]
c8693518-e160-4ea7-ab23-814955d32a01        (0.978, 0.982]
d62f2ffc-e184-460a-ba76-5be9a0d79e4c        (0.985, 0.988]
7eee2453-89a3-4a23-a8da-1cfaa0526efc        (0.978, 0.982]
50484db0-7736-4482-9980-06b47f26666f        (0.985, 0.988]
Name: Top, Length: 706, dtype: category
Categories (299, interval[float64, right]): [(0.00142, 0.00571] < (0.00571, 0.009] < (0.009, 0.0123] < (0.0123, 0.0156] ... (0.975, 0.978] < (0.978, 0.982] < (0.982, 0.985] < (0.985, 0.988]]

In [None]:
assert False

In [None]:
part_i_table = extract_table_data(
    test_pages, test_lines, test_words, PART_I_HEADER, PART_I_TABLE_NAME,
    schedule_f_tablemap_df, schedule_f_table_extractor_df,
    schedule_f_row_extractor_df
)

In [None]:
part_i_table

In [None]:
table_pages = find_table_pages(test_pages["Text"].agg(lambda words: " ".join(words)), PART_I_HEADER)

In [None]:
table_pages

In [None]:
tablemaps = pd.DataFrame(
    {
        "page": table_pages,
        "tablemap": table_pages.map(
            lambda page: create_tablemap(test_lines, schedule_f_tablemap_df, page).dropna()
        )
    }
)

In [None]:
tablemaps["tablemap"].iloc[0]

In [None]:
table_row_extractors = schedule_f_row_extractor_df.loc[
    schedule_f_row_extractor_df["table"] == PART_I_TABLE_NAME
]

table = schedule_f_table_extractor_df.loc[
        schedule_f_table_extractor_df["table"] == PART_I_TABLE_NAME
].iloc[0]


extractors = tablemaps.assign(
    extractor=tablemaps["tablemap"].map(
        lambda tablemap: TableExtractor(
            top_label=table["table_top"],
            top_delta=table["table_top_delta"],
            bottom_label=table["table_bottom"],
            bottom_delta=table["table_bottom_delta"],
            row_margin=table["row_margin"],
            index_col_left_label=table["index_col_left"],
            index_col_left_delta=table["index_col_left_delta"],
            index_col_right_label=table["index_col_right"],
            index_col_right_delta=table["index_col_right_delta"],
            tablemap=tablemap,
            row_extractors=table_row_extractors,
            fields=table_row_extractors["field"],   
        )
    )
)

In [None]:
extractors["extractor"].iloc[0]

In [None]:
extractors.apply(
    lambda row: row["extractor"].extract_rows(test_words, row["page"]),
    axis=1
)

In [None]:
extractors["extractor"].iloc[0].extract_rows(test_words, 20)

In [None]:
extractors["extractor"].iloc[0].get_row_spans(test_words, 20)

In [None]:
extractors["extractor"].iloc[0].get_index_col_span()

In [None]:
extractors["extractor"].iloc[0].table_bottom

In [None]:
page_20_words = test_words.loc[
    (test_words["Page"] == 20)
]

In [None]:
row_tops = page_20_words.loc[
    page_20_words["Left"].between(0.87, 1)
    & page_20_words["Top"].between(0.37, 1)
]

In [None]:
row_tops

In [None]:
extractors["extractor"].iloc[0].extract_row(test_words, 20, (.383, .8))

In [None]:
page_20_words.loc[page_20_words["Text"] == "ISRAEL"]

## Extracting a bunch

In [None]:
ocr_outputs = list(Path("/mnt/c/Users/ethan/pdf_parsing").glob("EIN_*.json"))

In [None]:
len(ocr_outputs)

In [None]:
def open_local_df(path):
    with open(path) as json_data:
        output = json.load(json_data)
        return pd.DataFrame.from_records(
            output["Blocks"],
            index="Id",
            exclude=[
                "ColumnIndex",
                "ColumnSpan",
                "DocumentType",
                "EntityTypes",
                "Hint",
                "Query",
                "SelectionStatus",
                "RowIndex",
                "RowSpan",
            ]
        ).assign(
            Height=lambda df: df["Geometry"].map(lambda x: x["BoundingBox"]["Height"]),
            Left=lambda df: df["Geometry"].map(lambda x: x["BoundingBox"]["Left"]),
            Top=lambda df: df["Geometry"].map(lambda x: x["BoundingBox"]["Top"]),
            Width=lambda df: df["Geometry"].map(lambda x: x["BoundingBox"]["Width"]),
            Polygon=lambda df: df["Geometry"].map(lambda x: x["Polygon"]),
            Children=lambda df: df["Relationships"].map(lambda x: x[0]["Ids"] if x is not None else x),
            Line_No=lambda df: pd.qcut(df["Top"], 100, labels=list(range(100))).astype(int),
            File=lambda df: df["Geometry"].map(lambda x: path.name),
            Right=lambda df: df["Polygon"].map(
                lambda polygon: max(corner["X"] for corner in polygon)
            ),
            Bottom=lambda df: df["Polygon"].map(
                lambda polygon: max(corner["Y"] for corner in polygon)
            )
        ).drop(
            columns=[
                "Geometry",
                "Relationships",
            ]
        ).sort_values(
            by=["File", "Page", "Line_No", "Left"]
        )

In [None]:
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []

for (count, path) in enumerate(ocr_outputs[:100]):
    print(count, path.name)
    try:
        data = open_local_df(path)
    except Exception as e:
        print(path.name)
        print(e)
    else:
        lines = data.loc[data["BlockType"] == "LINE"]
        words = data.loc[data["BlockType"] == "WORD"]
        page_map = find_pages(lines)
        roadmap = create_roadmap(
            lines, roadmap_df, page_map
        )
        row = extract_from_roadmap(
            words, lines, roadmap, extractor_df, page_map
        )
        row["file"] = path.name
        filing_rows.append(row)
        pages = lines.groupby("Page")
        part_i_table = extract_table_data(
            pages, lines, words, PART_I_HEADER, PART_I_TABLE_NAME, 
            schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
        )
        if part_i_table is not None:
            schedule_f_part_i_rows.append(
                part_i_table.assign(file=path.name)
            )
        part_ii_table = extract_table_data(
            pages, lines, words, PART_II_HEADER, PART_II_TABLE_NAME, 
            schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
        )
        if part_ii_table is not None:
            schedule_f_part_ii_rows.append(
                part_ii_table.assign(file=path.name)
            )
        part_iii_table = extract_table_data(
            pages, lines, words, PART_III_HEADER, PART_III_TABLE_NAME, 
            schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
        )
        if part_iii_table is not None:
            schedule_f_part_iii_rows.append(
                part_iii_table.assign(file=path.name)
            )
    
filing_output_df = pd.DataFrame(filing_rows).set_index("file")


In [None]:
filing_output_df["total_revenue"].value_counts()

In [None]:
filing_output_df.iloc[24].iloc[:50]

In [None]:
schedule_f_part_i_df = pd.concat(schedule_f_part_i_rows)

In [None]:
schedule_f_part_i_df.iloc[:50]

In [None]:
schedule_f_part_ii_df = pd.concat(schedule_f_part_ii_rows)

In [None]:
schedule_f_part_ii_df.iloc[50:100]

In [None]:
schedule_f_part_iii_df = pd.concat(schedule_f_part_iii_rows)

In [None]:
schedule_f_part_iii_df.tail(50)

In [None]:
clean_filing(filing_output_df)

In [None]:
re.search(r"(?P<open_par>\()(\d+)(?(open_par)\)|\b)", "(2425)").group(2)

In [None]:
test_new_filing = open_local_df(
    Path("/mnt/c/Users/ethan/pdf_parsing") / "EIN_200478411_YEAR_2017_FORMTYPE_990.json"
)

In [None]:
test_new_filing

In [None]:
test_new_filing_words = test_new_filing.loc[
    test_new_filing["BlockType"] == "WORD"
]

test_new_filing_lines = test_new_filing.loc[
    test_new_filing["BlockType"] == "LINE"
]
test_new_filing_pages = test_new_filing_lines.groupby("Page")


In [None]:
page_map = find_pages(test_new_filing_lines)

In [None]:
page_map

In [None]:
roadmap = create_roadmap(test_new_filing_lines, roadmap_df, page_map)

In [None]:
roadmap.iloc[:50]

In [None]:
extractors = create_extractors(extractor_df, roadmap, page_map)

In [None]:
test_extractor = extractors.iloc[2]
test_page = 1

In [None]:
test_extractor.bounding_box.get_text_in_box(test_new_filing_lines, 1)

In [None]:
test_extractor.bounding_box.top -= 0.001

In [None]:
extract_table_data(
    test_new_filing_pages, test_new_filing_lines, test_new_filing_words,
    PART_II_HEADER, PART_II_TABLE_NAME, schedule_f_tablemap_df,
    schedule_f_table_extractor_df, schedule_f_row_extractor_df,
)

In [None]:
table_pages = find_table_pages(
    test_new_filing_pages["Text"].agg(lambda words: " ".join(words)), PART_II_HEADER,
)

In [None]:
table_pages

In [None]:
tablemaps = pd.DataFrame(
    {
        "page": table_pages,
        "tablemap": table_pages.map(
            lambda page: create_tablemap(test_new_filing_lines, schedule_f_tablemap_df, page).dropna()
        )
    }
)
tablemaps

In [None]:
tablemaps.iloc[1]["tablemap"]

In [None]:
row_extractors = schedule_f_row_extractor_df.loc[
    schedule_f_row_extractor_df["table"] == PART_II_TABLE_NAME
]

In [None]:
row_extractors

In [None]:
table = schedule_f_table_extractor_df.loc[
    schedule_f_table_extractor_df["table"] == PART_II_TABLE_NAME
].iloc[0]
table

In [None]:
rows = tablemaps.assign(
    extractor=tablemaps["tablemap"].map(
        lambda tablemap: TableExtractor(
            top_label=table["table_top"],
            top_delta=table["table_top_delta"],
            bottom_label=table["table_bottom"],
            bottom_delta=table["table_bottom_delta"],
            row_margin=table["row_margin"],
            index_col_left_label=table["index_col_left"],
            index_col_left_delta=table["index_col_left_delta"],
            index_col_right_label=table["index_col_right"],
            index_col_right_delta=table["index_col_right_delta"],
            tablemap=tablemap,
            row_extractors=row_extractors,
            fields=row_extractors["field"],
        )
    )
)

In [None]:
rows

In [None]:
extractor = rows["extractor"].iloc[1]

In [None]:
page_74_lines = test_new_filing_lines.loc[
    test_new_filing_lines["Page"] == 74
]

In [None]:
extractor.row_extractors

In [None]:
extractor.extract_rows(page_74_words, 74)

In [None]:
extractor.get_col_span("(b) Region", "(c) Number of recipients")

In [None]:
extractor.get_row_spans(test_new_filing_words, 74)

In [None]:
BoundingBox(
    left=0.55,
    left_delta=0,
    top=0.365,
    top_delta=-.01,
    right=0.66,
    right_delta=0,
    bottom=1,
    bottom_delta=0.01
).get_text_in_box(page_74_lines, 74)

In [None]:
extractor.index_col_left_label

In [None]:
extractor.table_top

In [None]:
extractor.extract_rows(test_new_filing_words, 74).iloc[0].any()