In [1]:
import re

import pandas as pd

from parse_990_textract.bucket import open_df
from parse_990_textract.filing import create_roadmap, extract_from_roadmap
from parse_990_textract.models import BoundingBox, TableExtractor
from parse_990_textract.parse import create_extractors, find_item, find_pages
from parse_990_textract.setup import load_extractor_df
from parse_990_textract.table import extract_table_data, find_table_pages, create_tablemap
from parse_990_textract.utils import get_coordinate, get_regex

In [2]:
# Load the block-level test fixture, indexed by block Id. Every NaN cell is
# replaced with "" (applies to all columns, not just the text ones).
test_data = pd.read_csv("test_data.csv", index_col="Id").fillna("")

In [3]:
# Preview the first rows to check the loaded columns.
test_data.head()

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8c643d86-2362-451c-bcd6-d34254c5bcf4,PAGE,1,,,1.0,0.0,0.0,1.0,"[{'X': 1.5308084002953373e-17, 'Y': 0.0}, {'X'...","['8a9b6c2c-576e-4d10-9b4b-75551c65ce34', '0e51...",0,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
8a9b6c2c-576e-4d10-9b4b-75551c65ce34,LINE,1,See a Social Security Number? Say Something!,,0.170676,0.279266,0.22939,0.440054,"[{'X': 0.2792663276195526, 'Y': 0.229390263557...","['5727b887-ce51-4ad6-907d-a90be1b4217c', '9192...",24,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
5727b887-ce51-4ad6-907d-a90be1b4217c,WORD,1,See,PRINTED,0.13614,0.279266,0.22939,0.036246,"[{'X': 0.2792663276195526, 'Y': 0.229390263557...",,24,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
74322323-44de-40c1-8b08-c3af634619f4,WORD,1,Social,PRINTED,0.139702,0.339339,0.229709,0.055955,"[{'X': 0.3393394947052002, 'Y': 0.229709059000...",,24,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
a2d5573c-d304-498c-810e-574afffa7838,WORD,1,Say,PRINTED,0.168877,0.574458,0.230767,0.035323,"[{'X': 0.5744580030441284, 'Y': 0.230766937136...",,24,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [4]:
# Split the blocks by granularity: LINE rows vs. individual WORD rows.
block_type = test_data["BlockType"]
test_lines = test_data.loc[block_type.eq("LINE")]
test_words = test_data.loc[block_type.eq("WORD")]

# Group the LINE rows by page number for later page-level aggregation.
test_pages = test_lines.groupby("Page")

In [5]:
# Load the extraction configuration tables. The field-extractor table goes
# through the project helper load_extractor_df (its preview further down shows
# the regex column already holding compiled patterns); the landmark roadmap and
# the three Schedule F table definitions are read as plain CSVs.
extractor_df = load_extractor_df("990_extractors.csv")
roadmap_df = pd.read_csv("990_roadmap.csv")
schedule_f_tablemap_df = pd.read_csv("schedule_f_table_roadmap.csv")
schedule_f_table_extractor_df = pd.read_csv("schedule_f_table_extractors.csv")
schedule_f_row_extractor_df = pd.read_csv("schedule_f_row_extractors.csv")

In [6]:
# Regexes used to recognise the page holding each Schedule F table. Each one
# has two alternatives: the table's own column-heading text, or an explicit
# "Schedule F, Part N" reference (the trailing \b keeps "Part I" from matching
# inside "Part II"/"Part III").
PART_I_HEADER = (
    r"\(a\) Region\s*\(b\)\s*N"
    r"|Schedule F,? Part I\b"
)
PART_II_HEADER = (
    r"\([cC]\) Region\s*\(d\)\s*P"
    r"|Schedule F,? Part II\b"
)
PART_III_HEADER = (
    r"\(b\) Region\s*\(c\)\s*N"
    r"|Schedule F,? Part III\b"
)

# Table names; compared against the "table" column of the Schedule F
# extractor definition frames.
PART_I_TABLE_NAME = "Activities per Region"
PART_II_TABLE_NAME = "Grants to Organizations Outside the United States"
PART_III_TABLE_NAME = "Grants to Individuals Outside the United States"

In [7]:
# Preview the per-field extractor definitions.
extractor_df.head()

Unnamed: 0,field_name,strategy,left,left_delta,top,top_delta,right,right_delta,bottom,bottom_delta,page,regex
0,name,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,re.compile('zation\\s*(?:Name\\s*)?(.+?)\\s*Do...
1,address,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,re.compile('address\\)(?: Room/s\\w+e)?(?:.+um...
2,city,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,"re.compile('code\\s*(.+?),?\\s+[A-Z]{2}\\b|(\\..."
3,state,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,re.compile('([A-Z]{2})[^A-Za-z]*\\d{5}')
4,zip,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,re.compile('[A-Z]{2}[^A-Za-z]*(\\d{5})')


In [8]:
# Show the first 16 landmark definitions from the roadmap table.
roadmap_df.head(16)

Unnamed: 0,landmark,regex,left_default,top_default,page,x_tolerance,y_tolerance
0,Item C,Name.+zation,0.15,0.11,Page 1,0.1,0.1
1,Item D,Employer,0.71,0.11,Page 1,0.1,0.1
2,Item E,Te[tl][ae]phone,0.81,0.17,Page 1,0.2,0.1
3,Item F,Name.+fficer,0.14,0.22,Page 1,0.1,0.1
4,Item G,"Gross re\w{3,}",0.71,0.19,Page 1,0.1,0.1
5,Item H,H\(a\),0.63,0.22,Page 1,0.09,0.1
6,Item I,Tax\W*exempt [se]tatu[es],0.02,0.25,Page 1,0.2,0.1
7,Item J,Website|J W\w+,0.02,0.26,Page 1,0.09,0.2
8,Item K,Form of org|Type of org,0.02,0.28,Page 1,0.09,0.2
9,Item L,Y[eo]ar of formation,0.54,0.28,Page 1,0.2,0.2


In [9]:
# Build the page map from the LINE rows with the project helper find_pages.
# NOTE(review): the structure of the returned page map is not shown here.
page_map = find_pages(test_lines)

In [10]:
# Build this filing's landmark roadmap from the LINE rows, the landmark
# definitions and the page map.
roadmap = create_roadmap(test_lines, roadmap_df, page_map)

In [11]:
# Inspect the last 50 roadmap entries.
roadmap.tail(50)

Unnamed: 0_level_0,Top,Left,Top_Default,Left_Default
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Part IX, Item 11b",0.443088,0.076055,0.43,0.1
"Part IX, Item 11c",0.458445,0.076297,0.45,0.1
"Part IX, Item 11d",0.473847,0.076393,0.46,0.1
"Part IX, Item 11e",0.487683,0.076274,0.49,0.1
"Part IX, Item 11f",0.50358,0.075251,0.49,0.1
"Part IX, Item 11g",0.519089,0.076211,0.51,0.1
"Part IX, Item 12",0.533814,0.093264,0.53,0.1
"Part IX, Item 13",0.549186,0.093982,0.55,0.1
"Part IX, Item 14",0.564256,0.09412,0.57,0.1
"Part IX, Item 15",0.57959,0.062554,0.59,0.1


In [12]:
# Join the text of the page-2 LINE rows whose Top/Left values fall inside a
# hand-entered box (each bound is a typed-in coordinate minus 0.001).
line_top_in_box = test_lines["Top"].between(0.125752-.001, 0.246614-.001)
line_left_in_box = test_lines["Left"].between(.178131-.001, .663365-0.001)
line_on_page_2 = test_lines["Page"] == 2

box_line_text = test_lines.loc[
    line_top_in_box & line_left_in_box & line_on_page_2, "Text"
]
box_line_text.agg(lambda x: " ".join(x.values))

'C Name of organization Doing Business As Number and street (or P.O. box If mail IS not delivered to street address) Room/suite City or town, state or country, and ZIP + 4'

In [13]:
# Same hand-entered box as the LINE-level check, applied to the WORD rows of
# page 2, to compare what word-level selection returns.
word_top_in_box = test_words["Top"].between(0.125752-.001, 0.246614-.001)
word_left_in_box = test_words["Left"].between(.178131-.001, .663365-0.001)
word_on_page_2 = test_words["Page"] == 2

box_word_text = test_words.loc[
    word_top_in_box & word_left_in_box & word_on_page_2, "Text"
]
box_word_text.agg(lambda x: " ".join(x.values))

'Name of C organization CHESED INC. Doing Business As (or P.O. box If mail delivered street address) Room/suite Number and street IS not to 6TH STREET City and ZIP 4 or town, state or country, 08701 + NJ'

In [14]:
# Instantiate the field extractors from their definitions, the roadmap and the
# page map.
# NOTE(review): the name `extractors` is rebound further down to the
# table-extractor frame built with tablemaps.assign(...), so the inspection
# cells that index into this object only work if run before that cell.
extractors = create_extractors(extractor_df, roadmap, page_map)

In [15]:
# Inspect the second-to-last field extractor.
extractors.iloc[-2]

Extractor(name='total_number_recipient_foreign_orgs_listed_as_charities', strategy='words', page=21, bounding_box=BoundingBox(left=0.775, top=0.78, right=1.0, bottom=0.899), regex=re.compile('(?<!\\()(\\d+[\\d.,]*?\\b|\\b[oOIl]\\b)(?!\\([cC]\\))'))

In [16]:
# Show the WORD text that falls inside that extractor's bounding box.
# NOTE(review): the second argument (21) is presumably the page number — confirm.
extractors.iloc[-2].bounding_box.get_text_in_box(test_words, 21)

'equivalency other of of number number 501(c)(3) total total section'

In [17]:
# Run a single extractor (position 33) against the WORD and LINE rows to check
# its result in isolation.
extractors.iloc[33].extract(test_words, test_lines)

1585802.
Getting best match
(-?\(?\d+[\d.,]*\)?|\b[oOIl]\b)
('1585802.',)


'1585802.'

In [18]:
# Ad-hoc box (left 0.8 to 1, top 0 to 0.3) to see which words it captures.
# NOTE(review): the second argument (21) is presumably the page number — confirm.
BoundingBox(left=0.8, top=0, right=1, bottom=0.3).get_text_in_box(test_words, 21)

'2008 990) (Form Schedule F a provided'

In [19]:
# Inspect one roadmap entry by position.
roadmap.iloc[15]

Top             0.413119
Left            0.091636
Top_Default         0.41
Left_Default         0.1
Name: Part I, Item 5, dtype: object

In [20]:
# LINE rows on page 2 whose text mentions "employees".
mentions_employees = test_lines["Text"].str.contains("employees")
on_second_page = test_lines["Page"] == 2
test_lines.loc[mentions_employees & on_second_page]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3b246a65-73e4-459c-aac2-483e7d8330d7,LINE,2,"5 Total number of employees (Part V, line 2a)",,0.010808,0.091636,0.413119,0.275856,"[{'X': 0.09163600206375122, 'Y': 0.41311872005...","['701a001c-899b-404d-9801-57962688b23f', 'c626...",45,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [21]:
# Find WORD rows whose text contains "78083". str.contains treats the pattern
# as a regex by default, which makes no difference for a digits-only pattern.
test_words.loc[test_words["Text"].str.contains("78083")]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
c904e0c0-dc0e-4cb9-9334-e3462ec7b00f,WORD,2,78083.0,PRINTED,0.011532,0.870146,0.637633,0.066518,"[{'X': 0.8701464533805847, 'Y': 0.637633442878...",,69,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [22]:
# Inspect the field extractor at position 26.
extractors.iloc[26]

Extractor(name='other_expenses', strategy='words', page=2, bounding_box=BoundingBox(left=0.819, top=0.637, right=1.0, bottom=0.649), regex=re.compile('(-?\\(?\\d+[\\d.,]*\\)?|\\b[oOIl]\\b)'))

In [23]:
# Run the full extraction over the filing. Note that extract_from_roadmap is
# given the raw extractor definitions (extractor_df), not the `extractors`
# object built earlier.
test_results = extract_from_roadmap(test_words, test_lines, roadmap, extractor_df, page_map)

C Name of organization TORAS CHESED INC. Doing Business As Number and street (or P.O. box If mail IS not delivered to street address) Room/suite 421 6TH STREET City or town, state or country, and ZIP + 4 LAKEWOOD, NJ 08701
C Name of organization TORAS CHESED INC. Doing Business As Number and street (or P.O. box If mail IS not delivered to street address) Room/suite 421 6TH STREET City or town, state or country, and ZIP + 4 LAKEWOOD, NJ 08701
Getting best match
address\)(?: Room/s\w+e)?(?:.+umber)?\s*(.+?)\s*(?:City(?: or)? [ft]own|Room/suite)
('421 6TH STREET',)
C Name of organization TORAS CHESED INC. Doing Business As Number and street (or P.O. box If mail IS not delivered to street address) Room/suite 421 6TH STREET City or town, state or country, and ZIP + 4 LAKEWOOD, NJ 08701
Getting best match
code\s*(.+?),?\s+[A-Z]{2}\b|(\w+),\s*[A-Z]{2}\b
(None, 'LAKEWOOD')
C Name of organization TORAS CHESED INC. Doing Business As Number and street (or P.O. box If mail IS not delivered to stre

In [24]:
# Inspect the last 50 extracted fields.
test_results.tail(50)

field_name
payments_affiliates_prog_service                                                       
payments_affiliates_mgmt_general                                                       
payments_affiliates_fundraising                                                        
depreciation_depletion_amortization_total                                              
depreciation_depletion_amortization_prog_service                                       
depreciation_depletion_amortization_mgmt_general                                       
depreciation_depletion_amortization_fundraising                                        
insurance_total                                                                        
insurance_prog_service                                                                 
insurance_mgmt_general                                                                 
insurance_fundraising                                                                  
other_expenses_a_labe

In [25]:
# Find LINE rows containing the literal heading "(b) Number". str.contains
# treats the pattern as a regex by default, so the parentheses are escaped; the
# raw-string prefix keeps "\(" and "\)" from being parsed as (invalid) string
# escape sequences, which newer Python versions warn about.
test_lines.loc[test_lines["Text"].str.contains(r"\(b\) Number")]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
85bd85c8-1b0e-445c-a7f8-3b84935f4b07,LINE,20,(b) Number of,,0.010837,0.247785,0.275659,0.082386,"[{'X': 0.24778541922569275, 'Y': 0.27565929293...","['772a05a4-e1ec-4ec9-9f95-4fcb7bb02db4', 'b7d3...",29,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
36d7f3a0-d78c-41af-a96e-a4acc0e1bddd,LINE,29,(b) Number of,,0.063778,0.124155,0.574337,0.013717,"[{'X': 0.12415481358766556, 'Y': 0.63811469078...","['bdcc632d-a2d3-410b-a258-5f84714b0780', 'b6c8...",63,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [26]:
# End-to-end extraction of the Part I table ("Activities per Region") using the
# project helper. The cells after this one repeat the same work step by step
# (table pages -> tablemaps -> TableExtractor -> rows) for debugging.
part_i_table = extract_table_data(
    test_pages, test_lines, test_words, PART_I_HEADER, PART_I_TABLE_NAME,
    schedule_f_tablemap_df, schedule_f_table_extractor_df,
    schedule_f_row_extractor_df
)

Table pages: Page
20    20
Name: Page, dtype: int64
Tablemaps:                                Top      Left  Top_Default  Left_Default
Item                                                                   
(a) Region                0.275733  0.123071          0.0           0.0
(b) Number of offices     0.275659  0.247785          0.0           0.0
(c) Number of employees   0.275896   0.34236          0.0           0.0
(d) Activities conducted  0.276178  0.445321          0.0           0.0
(e) Specific type         0.276476  0.684013          0.0           0.0
(f) Total Expenditures    0.276897  0.876609          0.0           0.0
(c) Number of recipients  0.275896   0.34236          0.0           0.0
Schedule F                0.882503  0.770748          0.8           0.8
Top Left Corner                  0         0          0.0           0.0
Bottom Right Corner              1         1          1.0           1.0
Table row extractors:                     field                  col_left

In [27]:
# Display the extracted Part I table.
part_i_table

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures
0,ISRAEL,0,0,EDUCATIONAL AND CHARITABLE SERVICES,,249700.
1,,,,,,
2,,,,,,
3,Totals LHA For Privacy Act and Paperwork 83207...,Reduction,"Act Notice,",the Instructions for Form 990. see,Schedule F (Form,249700 990) 2008


In [28]:
# Concatenate each page's LINE text into a single string, then hand the
# per-page text and the Part I header pattern to find_table_pages.
page_text = test_pages["Text"].agg(lambda words: " ".join(words))
table_pages = find_table_pages(page_text, PART_I_HEADER)

In [29]:
# Display the pages identified for the Part I table.
table_pages

Page
20    20
Name: Page, dtype: int64

In [30]:
# Build one tablemap per table page (rows with missing values dropped), then
# pair each tablemap with its page number in a two-column frame.
page_tablemaps = table_pages.map(
    lambda page: create_tablemap(test_lines, schedule_f_tablemap_df, page).dropna()
)
tablemaps = pd.DataFrame({"page": table_pages, "tablemap": page_tablemaps})

In [31]:
# Inspect the tablemap built for the first table page.
tablemaps["tablemap"].iloc[0]

Unnamed: 0_level_0,Top,Left,Top_Default,Left_Default
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
(a) Region,0.275733,0.123071,0.0,0.0
(b) Number of offices,0.275659,0.247785,0.0,0.0
(c) Number of employees,0.275896,0.34236,0.0,0.0
(d) Activities conducted,0.276178,0.445321,0.0,0.0
(e) Specific type,0.276476,0.684013,0.0,0.0
(f) Total Expenditures,0.276897,0.876609,0.0,0.0
(c) Number of recipients,0.275896,0.34236,0.0,0.0
Schedule F,0.882503,0.770748,0.8,0.8
Top Left Corner,0.0,0.0,0.0,0.0
Bottom Right Corner,1.0,1.0,1.0,1.0


In [32]:
# Row-level extractor definitions that belong to the Part I table.
is_part_i_row = schedule_f_row_extractor_df["table"] == PART_I_TABLE_NAME
table_row_extractors = schedule_f_row_extractor_df.loc[is_part_i_row]

# Table-level settings: the first definition row for the Part I table.
is_part_i_table = schedule_f_table_extractor_df["table"] == PART_I_TABLE_NAME
table = schedule_f_table_extractor_df.loc[is_part_i_table].iloc[0]


def _build_table_extractor(tablemap):
    """Return a TableExtractor for one page's tablemap using the Part I settings."""
    return TableExtractor(
        top_label=table["table_top"],
        top_delta=table["table_top_delta"],
        bottom_label=table["table_bottom"],
        bottom_delta=table["table_bottom_delta"],
        row_margin=table["row_margin"],
        index_col_left_label=table["index_col_left"],
        index_col_left_delta=table["index_col_left_delta"],
        index_col_right_label=table["index_col_right"],
        index_col_right_delta=table["index_col_right_delta"],
        tablemap=tablemap,
        row_extractors=table_row_extractors,
        fields=table_row_extractors["field"],
    )


# One TableExtractor per table page, stored next to its page and tablemap.
# NOTE(review): this rebinds `extractors`, which an earlier cell bound to the
# result of create_extractors(...); the earlier inspection cells will not work
# once this cell has run.
extractors = tablemaps.assign(
    extractor=tablemaps["tablemap"].map(_build_table_extractor)
)

In [33]:
# Inspect the TableExtractor built for the first table page.
extractors["extractor"].iloc[0]

TableExtractor(top_label='(a) Region', top_delta=0.1, bottom_label='Schedule F, Part I, Item 3a', bottom_delta=-0.01, row_margin=0.05, index_col_left_label='(f) Total Expenditures', index_col_left_delta=-0.005, index_col_right_label='Bottom Right Corner', index_col_right_delta=0.0, tablemap=                               Top      Left  Top_Default  Left_Default
Item                                                                   
(a) Region                0.275733  0.123071          0.0           0.0
(b) Number of offices     0.275659  0.247785          0.0           0.0
(c) Number of employees   0.275896   0.34236          0.0           0.0
(d) Activities conducted  0.276178  0.445321          0.0           0.0
(e) Specific type         0.276476  0.684013          0.0           0.0
(f) Total Expenditures    0.276897  0.876609          0.0           0.0
(c) Number of recipients  0.275896   0.34236          0.0           0.0
Schedule F                0.882503  0.770748          0.8   

In [34]:
def _extract_page_rows(row):
    """Run the row's TableExtractor on the WORD rows for the row's page."""
    return row["extractor"].extract_rows(test_words, row["page"])


# Extract the table rows for every table page (one result per frame row).
extractors.apply(_extract_page_rows, axis=1)

Page
20    field                                         ...
dtype: object

In [35]:
# Extract rows with the first TableExtractor only, passing 20 as the page.
extractors["extractor"].iloc[0].extract_rows(test_words, 20)

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures
0,ISRAEL,0,0,EDUCATIONAL AND CHARITABLE SERVICES,,249700.
1,,,,,,
2,,,,,,
3,Totals LHA For Privacy Act and Paperwork 83207...,Reduction,"Act Notice,",the Instructions for Form 990. see,Schedule F (Form,249700 990) 2008


In [36]:
# Show the row spans (row_top / row_bottom) the extractor computes for page 20,
# to debug how the table is being split into rows.
extractors["extractor"].iloc[0].get_row_spans(test_words, 20)

Unnamed: 0,row_top,row_bottom
0,0.334268,0.818748
1,0.818748,0.8333
2,0.8333,0.83348
3,0.83348,1.0


In [37]:
# Show the span of the index column used by the extractor.
extractors["extractor"].iloc[0].get_index_col_span()

(0.87, 1.0)

In [38]:
# Check the bottom boundary the extractor resolved for the table.
extractors["extractor"].iloc[0].table_bottom

1

In [39]:
# All WORD rows on page 20.
on_page_20 = test_words["Page"].eq(20)
page_20_words = test_words.loc[on_page_20]

In [40]:
# Page-20 words with Left in [0.87, 1] and Top in [0.37, 1], i.e. candidates
# for the first word of each table row in the index column.
# NOTE(review): 0.87 and 1 match the index-column span displayed earlier; 0.37
# looks like a hand-picked top cutoff — confirm.
in_index_col = page_20_words["Left"].between(0.87, 1)
below_cutoff = page_20_words["Top"].between(0.37, 1)
row_tops = page_20_words.loc[in_index_col & below_cutoff]

In [41]:
# Display the candidate row-start words selected above.
row_tops

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4cb44b1d-965c-4973-9bc5-f98c7619e4f6,WORD,20,249700.,HANDWRITING,0.009623,0.891561,0.384268,0.052801,"[{'X': 0.8915610909461975, 'Y': 0.384267538785...",,42,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
2fee7b16-dfea-4ee3-9819-d99793ba1104,WORD,20,249700,PRINTED,0.008759,0.889181,0.868748,0.04898,"[{'X': 0.8891811370849609, 'Y': 0.868747532367...",,92,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
e5f68500-4d6b-4dcf-9e4d-2aa3584923bf,WORD,20,990),PRINTED,0.010549,0.882051,0.8833,0.027266,"[{'X': 0.882050633430481, 'Y': 0.8833003044128...",,93,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
64cb0c1e-03d5-4be6-927b-152de1c43f4e,WORD,20,2008,PRINTED,0.009108,0.911608,0.88348,0.030706,"[{'X': 0.911607563495636, 'Y': 0.8834795951843...",,93,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [42]:
# Extract a single row on page 20 with a hand-picked span.
# NOTE(review): (.383, .8) is presumably (row_top, row_bottom) — confirm.
extractors["extractor"].iloc[0].extract_row(test_words, 20, (.383, .8))

0      ISRAEL
1            
2           0
3    SERVICES
4            
5     249700.
dtype: object

In [43]:
# Look up the page-20 WORD row(s) whose text is exactly "ISRAEL" to check
# their coordinates.
is_israel = page_20_words["Text"].eq("ISRAEL")
page_20_words.loc[is_israel]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
af47b985-6d18-4fac-bd41-7a9293203a14,WORD,20,ISRAEL,PRINTED,0.007724,0.064682,0.383542,0.048381,"[{'X': 0.06468217819929123, 'Y': 0.38354238867...",,42,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
