In [1]:
import json
import re
from pathlib import Path

import pandas as pd

from parse_990_textract.bucket import open_df
from parse_990_textract.filing import create_roadmap, extract_from_roadmap
from parse_990_textract.models import BoundingBox, TableExtractor
from parse_990_textract.parse import create_extractors, find_item, find_pages
from parse_990_textract.setup import load_extractor_df
from parse_990_textract.table import extract_table_data, find_table_pages, create_tablemap
from parse_990_textract.utils import get_coordinate, get_regex

In [2]:
test_data = pd.read_csv("test_data.csv", index_col="Id").fillna("")

In [3]:
test_data.head()

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8c643d86-2362-451c-bcd6-d34254c5bcf4,PAGE,1,,,1.0,0.0,0.0,1.0,"[{'X': 1.5308084002953373e-17, 'Y': 0.0}, {'X'...","['8a9b6c2c-576e-4d10-9b4b-75551c65ce34', '0e51...",0,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
8a9b6c2c-576e-4d10-9b4b-75551c65ce34,LINE,1,See a Social Security Number? Say Something!,,0.170676,0.279266,0.22939,0.440054,"[{'X': 0.2792663276195526, 'Y': 0.229390263557...","['5727b887-ce51-4ad6-907d-a90be1b4217c', '9192...",24,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
5727b887-ce51-4ad6-907d-a90be1b4217c,WORD,1,See,PRINTED,0.13614,0.279266,0.22939,0.036246,"[{'X': 0.2792663276195526, 'Y': 0.229390263557...",,24,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
74322323-44de-40c1-8b08-c3af634619f4,WORD,1,Social,PRINTED,0.139702,0.339339,0.229709,0.055955,"[{'X': 0.3393394947052002, 'Y': 0.229709059000...",,24,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
a2d5573c-d304-498c-810e-574afffa7838,WORD,1,Say,PRINTED,0.168877,0.574458,0.230767,0.035323,"[{'X': 0.5744580030441284, 'Y': 0.230766937136...",,24,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [4]:
# Split the OCR blocks by granularity, then group the LINE blocks by page.
block_type = test_data["BlockType"]
test_lines = test_data[block_type.eq("LINE")]
test_words = test_data[block_type.eq("WORD")]
test_pages = test_lines.groupby("Page")

In [5]:
extractor_df = load_extractor_df("990_extractors.csv")
roadmap_df = pd.read_csv("990_roadmap.csv")
schedule_f_tablemap_df = pd.read_csv("schedule_f_table_roadmap.csv")
schedule_f_table_extractor_df = pd.read_csv("schedule_f_table_extractors.csv")
schedule_f_row_extractor_df = pd.read_csv("schedule_f_row_extractors.csv")

In [6]:
# Regexes for locating each Schedule F table. Every pattern has two
# alternatives: the table's leading column-header text, or the literal
# "Schedule F, Part N" label (comma optional, \b so "Part I" != "Part II").
PART_I_HEADER = (
    r"\(a\) Region\s*\(b\)\s*N"
    r"|Schedule F,? Part I\b"
)
PART_II_HEADER = (
    r"\([cC]\) Region\s*\(d\)\s*P"
    r"|Schedule F,? Part II\b"
)
PART_III_HEADER = (
    r"\(b\) Region\s*\(c\)\s*N"
    r"|Schedule F,? Part III\b"
)

# Values of the "table" column used to select rows from the Schedule F
# table/row extractor CSVs.
PART_I_TABLE_NAME = "Activities per Region"
PART_II_TABLE_NAME = "Grants to Organizations Outside the United States"
PART_III_TABLE_NAME = "Grants to Individuals Outside the United States"

In [7]:
extractor_df.head()

Unnamed: 0,field_name,strategy,left,left_delta,top,top_delta,right,right_delta,bottom,bottom_delta,page,regex
0,name,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,re.compile('zation\\s*(?:Name\\s*)?(.+?)\\s*Do...
1,address,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,re.compile('address\\)(?: Room/s\\w+e)?(?:.+um...
2,city,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,"re.compile('code\\s*(.+?),?\\s+[A-Z]{2}\\b|(\\..."
3,state,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,re.compile('([A-Z]{2})[^A-Za-z]*\\d{5}')
4,zip,lines,Item C,-0.001,Item C,-0.001,Item D,-0.001,Item F,-0.001,Page 1,re.compile('[A-Z]{2}[^A-Za-z]*(\\d{5})')


In [8]:
roadmap_df.head(16)

Unnamed: 0,landmark,regex,left_default,top_default,page,x_tolerance,y_tolerance
0,Item C,Name.+zation,0.15,0.11,Page 1,0.1,0.1
1,Item D,Employer,0.71,0.11,Page 1,0.1,0.1
2,Item E,Te[tl][ae]phone,0.81,0.17,Page 1,0.2,0.1
3,Item F,Name.+fficer,0.14,0.22,Page 1,0.1,0.1
4,Item G,"Gross re\w{3,}",0.71,0.19,Page 1,0.1,0.1
5,Item H,H\(a\),0.63,0.22,Page 1,0.09,0.1
6,Item I,Tax\W*exempt [se]tatu[es],0.02,0.25,Page 1,0.2,0.1
7,Item J,Website|J W\w+,0.02,0.26,Page 1,0.09,0.2
8,Item K,Form of org|Type of org,0.02,0.28,Page 1,0.09,0.2
9,Item L,Y[eo]ar of formation,0.54,0.28,Page 1,0.2,0.2


In [9]:
page_map = find_pages(test_lines)

In [10]:
roadmap = create_roadmap(test_lines, roadmap_df, page_map)

In [11]:
roadmap.tail(50)

Unnamed: 0_level_0,Top,Left,Top_Default,Left_Default
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Part IX, Item 11b",0.443088,0.076055,0.43,0.1
"Part IX, Item 11c",0.458445,0.076297,0.45,0.1
"Part IX, Item 11d",0.473847,0.076393,0.46,0.1
"Part IX, Item 11e",0.487683,0.076274,0.49,0.1
"Part IX, Item 11f",0.50358,0.075251,0.49,0.1
"Part IX, Item 11g",0.519089,0.076211,0.51,0.1
"Part IX, Item 12",0.533814,0.093264,0.53,0.1
"Part IX, Item 13",0.549186,0.093982,0.55,0.1
"Part IX, Item 14",0.564256,0.09412,0.57,0.1
"Part IX, Item 15",0.57959,0.062554,0.59,0.1


In [12]:
# Hand-typed bounding box on page 2, each edge nudged by -0.001.
# NOTE(review): presumably these are roadmap landmark coordinates combined
# with the extractors' -0.001 deltas — confirm against `roadmap`.
line_top_bounds = (0.125752-.001, 0.246614-.001)
line_left_bounds = (.178131-.001, .663365-0.001)
item_c_lines_mask = (
    test_lines["Top"].between(*line_top_bounds)
    & test_lines["Left"].between(*line_left_bounds)
    & (test_lines["Page"] == 2)
)
# Join the text of every LINE block inside the box into one string.
test_lines.loc[item_c_lines_mask, "Text"].agg(lambda texts: " ".join(texts.values))

'C Name of organization Doing Business As Number and street (or P.O. box If mail IS not delivered to street address) Room/suite City or town, state or country, and ZIP + 4'

In [13]:
# Same hand-typed page-2 bounding box as the LINE probe, applied to WORD
# blocks so the two extraction strategies can be compared.
# NOTE(review): presumably roadmap landmark coordinates with the extractors'
# -0.001 deltas — confirm against `roadmap`.
word_top_bounds = (0.125752-.001, 0.246614-.001)
word_left_bounds = (.178131-.001, .663365-0.001)
item_c_words_mask = (
    test_words["Top"].between(*word_top_bounds)
    & test_words["Left"].between(*word_left_bounds)
    & (test_words["Page"] == 2)
)
# Join the text of every WORD block inside the box into one string.
test_words.loc[item_c_words_mask, "Text"].agg(lambda texts: " ".join(texts.values))

'Name of C organization CHESED INC. Doing Business As (or P.O. box If mail delivered street address) Room/suite Number and street IS not to 6TH STREET City and ZIP 4 or town, state or country, 08701 + NJ'

In [14]:
extractors = create_extractors(extractor_df, roadmap, page_map)

In [15]:
extractors.iloc[-2]

Extractor(name='total_number_recipient_foreign_orgs_listed_as_charities', strategy='words', page=21, bounding_box=BoundingBox(left=0.775, top=0.78, right=1.0, bottom=0.899), regex=re.compile('(?<!\\()(\\d+[\\d.,]*?\\b|\\b[oOIl]\\b)(?!\\([cC]\\))'))

In [16]:
extractors.iloc[-2].bounding_box.get_text_in_box(test_words, 21)

'equivalency other of of number number 501(c)(3) total total section'

In [17]:
extractors.iloc[33].extract(test_words, test_lines)

'1585802.'

In [18]:
BoundingBox(left=0.8, top=0, right=1, bottom=0.3).get_text_in_box(test_words, 21)

'2008 990) (Form Schedule F a provided'

In [19]:
roadmap.iloc[15]

Top             0.413119
Left            0.091636
Top_Default         0.41
Left_Default         0.1
Name: Part I, Item 5, dtype: object

In [20]:
test_lines.loc[test_lines["Text"].str.contains("employees") & (test_lines["Page"] == 2)]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3b246a65-73e4-459c-aac2-483e7d8330d7,LINE,2,"5 Total number of employees (Part V, line 2a)",,0.010808,0.091636,0.413119,0.275856,"[{'X': 0.09163600206375122, 'Y': 0.41311872005...","['701a001c-899b-404d-9801-57962688b23f', 'c626...",45,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [21]:
test_words.loc[test_words["Text"].str.contains("78083")]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
c904e0c0-dc0e-4cb9-9334-e3462ec7b00f,WORD,2,78083.0,PRINTED,0.011532,0.870146,0.637633,0.066518,"[{'X': 0.8701464533805847, 'Y': 0.637633442878...",,69,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [22]:
extractors.iloc[26]

Extractor(name='other_expenses', strategy='words', page=2, bounding_box=BoundingBox(left=0.819, top=0.637, right=1.0, bottom=0.649), regex=re.compile('(-?\\(?\\d+[\\d.,]*\\)?|\\b[oOIl]\\b)'))

In [23]:
test_results = extract_from_roadmap(test_words, test_lines, roadmap, extractor_df, page_map)

In [24]:
test_results.iloc[140:].to_frame()

Unnamed: 0_level_0,0
field_name,Unnamed: 1_level_1
depreciation_depletion_amortization_total,
depreciation_depletion_amortization_prog_service,
depreciation_depletion_amortization_mgmt_general,
depreciation_depletion_amortization_fundraising,
insurance_total,
insurance_prog_service,
insurance_mgmt_general,
insurance_fundraising,
other_expenses_a_label,PRINTING & PUBLICATIONS
other_expenses_a_total,55433.


In [25]:
# Raw string: "\(" is not a valid escape in a plain literal and triggers an
# invalid-escape warning on current Python. The regex itself is unchanged.
test_lines.loc[test_lines["Text"].str.contains(r"\(b\) Number")]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
85bd85c8-1b0e-445c-a7f8-3b84935f4b07,LINE,20,(b) Number of,,0.010837,0.247785,0.275659,0.082386,"[{'X': 0.24778541922569275, 'Y': 0.27565929293...","['772a05a4-e1ec-4ec9-9f95-4fcb7bb02db4', 'b7d3...",29,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
36d7f3a0-d78c-41af-a96e-a4acc0e1bddd,LINE,29,(b) Number of,,0.063778,0.124155,0.574337,0.013717,"[{'X': 0.12415481358766556, 'Y': 0.63811469078...","['bdcc632d-a2d3-410b-a258-5f84714b0780', 'b6c8...",63,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [26]:
part_i_table = extract_table_data(
    test_pages, test_lines, test_words, PART_I_HEADER, PART_I_TABLE_NAME,
    schedule_f_tablemap_df, schedule_f_table_extractor_df,
    schedule_f_row_extractor_df
)

In [27]:
part_i_table

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures
0,ISRAEL,0,,EDUCATIONAL AND CHARITABLE 0 SERVICES,,249700.0


In [28]:
# Collapse each page's LINE texts into a single space-joined string, then let
# find_table_pages pick the pages whose text matches the Part I header regex.
page_texts = test_pages["Text"].agg(lambda page_lines: " ".join(page_lines))
table_pages = find_table_pages(page_texts, PART_I_HEADER)

In [29]:
table_pages

Page
20    20
Name: Page, dtype: int64

In [30]:
def _tablemap_for_page(page):
    """Build the Schedule F tablemap for one page, dropping rows with missing values."""
    return create_tablemap(test_lines, schedule_f_tablemap_df, page).dropna()


# One row per table page: the page number and its tablemap.
page_tablemaps = table_pages.map(_tablemap_for_page)
tablemaps = pd.DataFrame({"page": table_pages, "tablemap": page_tablemaps})

In [31]:
tablemaps["tablemap"].iloc[0]

Unnamed: 0_level_0,Top,Left,Top_Default,Left_Default
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
(a) Region,0.275733,0.123071,0.0,0.0
(b) Number of offices,0.275659,0.247785,0.0,0.0
(c) Number of employees,0.275896,0.34236,0.0,0.0
(d) Activities conducted,0.276178,0.445321,0.0,0.0
(e) Specific type,0.276476,0.684013,0.0,0.0
(f) Total Expenditures,0.276897,0.876609,0.0,0.0
(c) Number of recipients,0.275896,0.34236,0.0,0.0
Schedule F,0.882503,0.770748,0.8,0.8
Top Left Corner,0.0,0.0,0.0,0.0
Bottom Right Corner,1.0,1.0,1.0,1.0


In [32]:
# Row-level extractor definitions for the Part I table.
table_row_extractors = schedule_f_row_extractor_df[
    schedule_f_row_extractor_df["table"].eq(PART_I_TABLE_NAME)
]

# Table-level settings: the first row whose "table" matches Part I.
table = schedule_f_table_extractor_df[
    schedule_f_table_extractor_df["table"].eq(PART_I_TABLE_NAME)
].iloc[0]


def _build_table_extractor(tablemap):
    """Create the Part I TableExtractor bound to one page's tablemap."""
    return TableExtractor(
        top_label=table["table_top"],
        top_delta=table["table_top_delta"],
        bottom_label=table["table_bottom"],
        bottom_delta=table["table_bottom_delta"],
        row_margin=table["row_margin"],
        index_col_left_label=table["index_col_left"],
        index_col_left_delta=table["index_col_left_delta"],
        index_col_right_label=table["index_col_right"],
        index_col_right_delta=table["index_col_right_delta"],
        tablemap=tablemap,
        row_extractors=table_row_extractors,
        fields=table_row_extractors["field"],
    )


# NOTE: this rebinds `extractors`, which an earlier cell assigned from
# create_extractors(...); the cells that follow expect this table version.
extractors = tablemaps.assign(
    extractor=tablemaps["tablemap"].map(_build_table_extractor)
)

In [33]:
extractors["extractor"].iloc[0]

TableExtractor(top_label='(a) Region', top_delta=0.1, bottom_label='Schedule F, Part I, Item 3a', bottom_delta=-0.01, row_margin=0.05, index_col_left_label='(f) Total Expenditures', index_col_left_delta=-0.005, index_col_right_label='Bottom Right Corner', index_col_right_delta=0.0, tablemap=                               Top      Left  Top_Default  Left_Default
Item                                                                   
(a) Region                0.275733  0.123071          0.0           0.0
(b) Number of offices     0.275659  0.247785          0.0           0.0
(c) Number of employees   0.275896   0.34236          0.0           0.0
(d) Activities conducted  0.276178  0.445321          0.0           0.0
(e) Specific type         0.276476  0.684013          0.0           0.0
(f) Total Expenditures    0.276897  0.876609          0.0           0.0
(c) Number of recipients  0.275896   0.34236          0.0           0.0
Schedule F                0.882503  0.770748          0.8   

In [34]:
extractors.apply(
    lambda row: row["extractor"].extract_rows(test_words, row["page"]),
    axis=1
)

Page
20    field  region number_offices number_employees ...
dtype: object

In [35]:
extractors["extractor"].iloc[0].extract_rows(test_words, 20)

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures
0,ISRAEL,0,,EDUCATIONAL AND CHARITABLE 0 SERVICES,,249700.0


In [36]:
extractors["extractor"].iloc[0].get_row_spans(test_words, 20)

Unnamed: 0,row_top,row_bottom
0,0.334268,0.8


In [37]:
extractors["extractor"].iloc[0].get_index_col_span()

(0.87, 1.0)

In [38]:
extractors["extractor"].iloc[0].table_bottom

0.8

In [39]:
page_20_words = test_words.loc[
    (test_words["Page"] == 20)
]

In [40]:
row_tops = page_20_words.loc[
    page_20_words["Left"].between(0.87, 1)
    & page_20_words["Top"].between(0.37, 1)
]

In [41]:
row_tops

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4cb44b1d-965c-4973-9bc5-f98c7619e4f6,WORD,20,249700.,HANDWRITING,0.009623,0.891561,0.384268,0.052801,"[{'X': 0.8915610909461975, 'Y': 0.384267538785...",,42,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
2fee7b16-dfea-4ee3-9819-d99793ba1104,WORD,20,249700,PRINTED,0.008759,0.889181,0.868748,0.04898,"[{'X': 0.8891811370849609, 'Y': 0.868747532367...",,92,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
e5f68500-4d6b-4dcf-9e4d-2aa3584923bf,WORD,20,990),PRINTED,0.010549,0.882051,0.8833,0.027266,"[{'X': 0.882050633430481, 'Y': 0.8833003044128...",,93,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...
64cb0c1e-03d5-4be6-927b-152de1c43f4e,WORD,20,2008,PRINTED,0.009108,0.911608,0.88348,0.030706,"[{'X': 0.911607563495636, 'Y': 0.8834795951843...",,93,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [42]:
extractors["extractor"].iloc[0].extract_row(test_words, 20, (.383, .8))

0        ISRAEL
1              
2              
3    0 SERVICES
4              
5       249700.
dtype: object

In [43]:
page_20_words.loc[page_20_words["Text"] == "ISRAEL"]

Unnamed: 0_level_0,BlockType,Page,Text,TextType,Height,Left,Top,Width,Polygon,Children,Line_No,File
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
af47b985-6d18-4fac-bd41-7a9293203a14,WORD,20,ISRAEL,PRINTED,0.007724,0.064682,0.383542,0.048381,"[{'X': 0.06468217819929123, 'Y': 0.38354238867...",,42,00057f8f88b60da8bb6be94dc1f3da3012d0c139f46018...


In [44]:
# NOTE(review): unconditional failure — presumably a deliberate stop so that
# "Run All" halts before the batch-extraction cells below; confirm before removing.
assert False

AssertionError: 

## Extracting a batch of filings

In [45]:
# NOTE(review): machine-specific absolute path — point this at the directory
# holding your own Textract JSON outputs.
OCR_OUTPUT_DIR = Path("/mnt/c/Users/ethan/pdf_parsing")

# glob() yields entries in arbitrary filesystem order; sort so that slices of
# this list (and therefore which filings get processed) are reproducible.
ocr_outputs = sorted(OCR_OUTPUT_DIR.glob("EIN_*.json"))

In [46]:
len(ocr_outputs)

291

In [47]:
def open_local_df(path):
    """Load one OCR JSON output file into a flat DataFrame of blocks.

    The file must be a JSON object with a "Blocks" list. Each block becomes a
    row indexed by its "Id"; the nested "Geometry" and "Relationships" fields
    are flattened into plain columns and then dropped.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the JSON file. Its file name is stored in the "File" column.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Id", sorted by File, Page, Line_No and Left, with added columns:
        Height, Left, Top, Width : bounding-box values from Geometry.
        Polygon : the Geometry polygon.
        Children : the "Ids" of the block's first relationship, or None.
        Line_No : 0-99 percentile bucket of Top, computed over the whole file.
        File : the source file name.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` if the Top quantile edges are not unique.
    """
    path = Path(path)  # accept plain strings as well as Path objects

    # Parse first so the file handle is closed before any DataFrame work.
    with open(path, encoding="utf-8") as json_data:
        output = json.load(json_data)

    # Fields that are not needed downstream; tolerated if absent from the JSON.
    unused_fields = [
        "ColumnIndex",
        "ColumnSpan",
        "DocumentType",
        "EntityTypes",
        "Hint",
        "Query",
        "SelectionStatus",
        "RowIndex",
        "RowSpan",
    ]

    def bounding_box(field):
        """Return an assign() callable that pulls one BoundingBox value per block."""
        return lambda df: df["Geometry"].map(
            lambda geometry: geometry["BoundingBox"][field]
        )

    def child_ids(relationships):
        """Ids of the first relationship; None when the block has none."""
        # A block without children may carry an explicit null (None), a missing
        # key (NaN after DataFrame construction) or an empty list.
        if isinstance(relationships, list) and relationships:
            return relationships[0]["Ids"]
        return None

    return (
        pd.DataFrame.from_records(output["Blocks"], index="Id")
        .drop(columns=unused_fields, errors="ignore")
        .assign(
            Height=bounding_box("Height"),
            Left=bounding_box("Left"),
            Top=bounding_box("Top"),
            Width=bounding_box("Width"),
            Polygon=lambda df: df["Geometry"].map(lambda geometry: geometry["Polygon"]),
            Children=lambda df: df["Relationships"].map(child_ids),
            # Bucket blocks into 100 quantiles of Top (document-wide, not per page).
            Line_No=lambda df: pd.qcut(df["Top"], 100, labels=list(range(100))).astype(int),
            File=path.name,
        )
        .drop(columns=["Geometry", "Relationships"])
        .sort_values(by=["File", "Page", "Line_No", "Left"])
    )

In [48]:
# Cap on how many OCR outputs are processed by one run of this cell.
MAX_FILINGS = 50

# Accumulators: one Series per filing, and one DataFrame per filing for each
# Schedule F table that was found.
filing_rows = []
schedule_f_part_i_rows = []
schedule_f_part_ii_rows = []
schedule_f_part_iii_rows = []

# (header regex, table name, accumulator) for each Schedule F table, in the
# order they are extracted for every filing.
schedule_f_tables = [
    (PART_I_HEADER, PART_I_TABLE_NAME, schedule_f_part_i_rows),
    (PART_II_HEADER, PART_II_TABLE_NAME, schedule_f_part_ii_rows),
    (PART_III_HEADER, PART_III_TABLE_NAME, schedule_f_part_iii_rows),
]

for (count, path) in enumerate(ocr_outputs[:MAX_FILINGS]):
    print(count, path.name)
    try:
        filing_data = open_local_df(path)
    except Exception as e:
        # Best effort: report the file that could not be loaded and move on.
        print(path.name)
        print(e)
        continue

    # Per-filing names are prefixed with `filing_` so this loop does not
    # overwrite the `page_map` / `roadmap` / `part_i_table` variables that the
    # single-file exploration cells above created.
    filing_lines = filing_data.loc[filing_data["BlockType"] == "LINE"]
    filing_words = filing_data.loc[filing_data["BlockType"] == "WORD"]
    filing_page_map = find_pages(filing_lines)
    filing_roadmap = create_roadmap(filing_lines, roadmap_df, filing_page_map)

    filing_row = extract_from_roadmap(
        filing_words, filing_lines, filing_roadmap, extractor_df, filing_page_map
    )
    filing_row["file"] = path.name
    filing_rows.append(filing_row)

    filing_pages = filing_lines.groupby("Page")
    for header_regex, table_name, table_rows in schedule_f_tables:
        schedule_f_table = extract_table_data(
            filing_pages, filing_lines, filing_words, header_regex, table_name,
            schedule_f_tablemap_df, schedule_f_table_extractor_df, schedule_f_row_extractor_df,
        )
        # extract_table_data returns None when there is nothing to record.
        if schedule_f_table is not None:
            table_rows.append(schedule_f_table.assign(file=path.name))

# With no successfully loaded filing there is no "file" column to index on;
# fall back to an empty frame with the same index name.
if filing_rows:
    filing_output_df = pd.DataFrame(filing_rows).set_index("file")
else:
    filing_output_df = pd.DataFrame(index=pd.Index([], name="file"))


0 EIN_10690242_YEAR_2020_FORMTYPE_990.json
1 EIN_10797083_YEAR_2020_FORMTYPE_990.json
2 EIN_10797083_YEAR_2021_FORMTYPE_990.json
3 EIN_10962762_YEAR_2020_FORMTYPE_990.json
4 EIN_112849619_YEAR_2019_FORMTYPE_990.json
5 EIN_113150521_YEAR_2020_FORMTYPE_990.json
6 EIN_113150521_YEAR_2021_FORMTYPE_990.json
7 EIN_113489123_YEAR_2020_FORMTYPE_990.json
8 EIN_113515344_YEAR_2020_FORMTYPE_990.json
9 EIN_113691843_YEAR_2019_FORMTYPE_990.json
10 EIN_116078704_YEAR_2020_FORMTYPE_990.json
11 EIN_131624067_YEAR_2020_FORMTYPE_990.json
12 EIN_131628150_YEAR_2020_FORMTYPE_990.json
13 EIN_131740448_YEAR_2020_FORMTYPE_990.json
14 EIN_131945157_YEAR_2020_FORMTYPE_990.json
15 EIN_132574963_YEAR_2020_FORMTYPE_990.json


'(f) Total Expenditures': ('(f) Total Expenditures',)
'(f) Manner': ('(f) Manner',)


16 EIN_132608326_YEAR_2020_FORMTYPE_990.json
17 EIN_133400377_YEAR_2020_FORMTYPE_990.json
18 EIN_133976873_YEAR_2019_FORMTYPE_990.json
19 EIN_135562163_YEAR_2020_FORMTYPE_990.json
20 EIN_135604164_YEAR_2019_FORMTYPE_990.json
21 EIN_160975270_YEAR_2019_FORMTYPE_990.json
22 EIN_161667739_YEAR_2019_FORMTYPE_990.json
23 EIN_200478411_YEAR_2016_FORMTYPE_990.json
24 EIN_200478411_YEAR_2017_FORMTYPE_990.json
25 EIN_200909475_YEAR_2020_FORMTYPE_990.json
26 EIN_201572620_YEAR_2020_FORMTYPE_990.json
27 EIN_201572620_YEAR_2021_FORMTYPE_990.json
28 EIN_201667945_YEAR_2020_FORMTYPE_990.json
29 EIN_202669700_YEAR_2020_FORMTYPE_990.json
30 EIN_202754466_YEAR_2020_FORMTYPE_990.json
31 EIN_202803848_YEAR_2020_FORMTYPE_990.json
32 EIN_202913418_YEAR_2020_FORMTYPE_990.json
33 EIN_202927564_YEAR_2020_FORMTYPE_990.json
34 EIN_203003912_YEAR_2020_FORMTYPE_990.json
35 EIN_203237801_YEAR_2020_FORMTYPE_990.json
36 EIN_203273423_YEAR_2020_FORMTYPE_990.json
37 EIN_203366904_YEAR_2020_FORMTYPE_990.json
38 EIN_203

In [49]:
filing_output_df.head()

field_name,name,address,city,state,zip,website,gross_receipts,year_formation,state_of_domicile,mission,...,activities_per_region_subtotal_number_of_employees,activities_per_region_subtotal_total_expenditure,activities_per_region_continuation_total_number_of_offices,activities_per_region_continuation_total_number_of_employees,activities_per_region_continuation_total_total_expenditure,activities_per_region_totals_number_of_offices,activities_per_region_totals_number_of_employees,activities_per_region_totals_total_expenditure,total_number_recipient_foreign_orgs_listed_as_charities,total_number_other_recipient_foreign_orgs_entities
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EIN_10690242_YEAR_2020_FORMTYPE_990.json,PORTER'S CALL,228 2ND AVE SOUTH,FRANKLIN,TN,37064,WWW.PORTERSCALL.COM,1724569,2001,TN,"TO PROVIDE COUNSEL, ENCOURAGEMENT AND SUPPORT ...",...,,,,,,,,,,
EIN_10797083_YEAR_2020_FORMTYPE_990.json,DESIRING GOD MINISTRIES,2112 BROADWAY STREET NE NO 150,MINNEAPOLIS,MN,55413,WWW.DESIRINGGOD.ORG,9514876,2004,MN,SPREAD A PASSION FOR SUPREMACY OF GOD IN ALL T...,...,19.0,337706.0,0.0,0.0,0.0,19.0,19.0,337706.0,0.0,1.0
EIN_10797083_YEAR_2021_FORMTYPE_990.json,DESIRING GOD MINISTRIES,2112 BROADWAY ST NE 150,MINNEAPOLIS,MN,55402,WWW.DESIRINGGOD.ORG,10318002,2004,MN,SPREAD A PASSION FOR SUPREMACY OF GOD IN ALL T...,...,35.0,286882.0,0.0,0.0,0.0,0.0,35.0,286882.0,,
EIN_10962762_YEAR_2020_FORMTYPE_990.json,HAND IN HAND,9502 19TH AVE SE SUITE F,EVERETT,WA,98208,WWW.HANDINHANDKIDS.ORG,1121748,2010,WA,TO SUPPORT CHILDREN AND FAMILIES TO ENSURE EVE...,...,,,,,,,,,,
EIN_112849619_YEAR_2019_FORMTYPE_990.json,American Friends of Merkaz Hachinuch Habeer Inc,1560-48th Street,Brooklyn,NY,11219,,354211,1987,NY,Provide aid to support and assist tax exempt J...,...,0.0,260415.0,,,,0.0,0.0,260415.0,1.0,


In [50]:
pd.concat(schedule_f_part_i_rows)

field,region,number_offices,number_employees,activities_conducted,specific_type_activity,total_expenditures,file
0,NORTH AMERICA,1.0,,1 PROGRAM SERVICES,TRANSLATIONS,133302,EIN_10797083_YEAR_2020_FORMTYPE_990.json
1,SOUTH ASIA,6.0,,6 PROGRAM SERVICES,TRANSLATIONS,38580,EIN_10797083_YEAR_2020_FORMTYPE_990.json
2,EUROPE,,,GRANTS 1,,12000,EIN_10797083_YEAR_2020_FORMTYPE_990.json
3,SUB-SAHARAN AFRICA,1.0,,1 PROGRAM SERVICES,TRANSLATIONS,11964,EIN_10797083_YEAR_2020_FORMTYPE_990.json
4,EAST ASIA AND THE PACIFIC,,,PROGRAM SERVICES 1,TRANSLATIONS,4570,EIN_10797083_YEAR_2020_FORMTYPE_990.json
5,SOUTH AMERICA,1.0,,1 PROGRAM SERVICES,TRANSLATIONS,4500,EIN_10797083_YEAR_2020_FORMTYPE_990.json
6,SUB-SAHARAN AFRICA,,,GRANTS 1,,4179,EIN_10797083_YEAR_2020_FORMTYPE_990.json
0,MIDDLE EAST AND NORTH AFRICA,0.0,,2 PROGRAM SERVICES,TRANSLATIONS,6160,EIN_10797083_YEAR_2021_FORMTYPE_990.json
1,EUROPE ICELAND (INCLUDING & GREENLAND),0.0,15,PROGRAM SERVICES,TRANSLATIONS,148702,EIN_10797083_YEAR_2021_FORMTYPE_990.json
2,NORTH AMERICA,,,3 PROGRAM SERVICES,TRANSLATIONS,44325,EIN_10797083_YEAR_2021_FORMTYPE_990.json


In [51]:
pd.concat(schedule_f_part_ii_rows)

field,org_name,irs_code,region,grant_purpose,amount_cash,manner_cash,amount_noncash,desc_noncash,method_valuation,file
0,,,"SOUTH ASIA - AFGHANISTAN, BANGLADESH, BHUTAN, ...","EARLY DETECTION & TREATMENT, DISABILITY",255000,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
1,,,"EUROPE (INCLUDING ICELAND & GREENLAND) - ALBANIA,","EARLY DETECTION & TREATMENT; HEALTHY, DEVELOPI...",128020,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
2,,,"SUB-SAHARAN AFRICA - ANGOLA, BENIN, BOTSWANA, ...",EARLY DETECTION & TREATMENT; HEALTHY DEVELOPIN...,9277,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
3,,,"SOUTH ASIA - AFGHANISTAN, BANGLADESH,",DISABILITY PREVENTION AND MANAGEMENTN,248228,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
4,,,"SUB-SAHARAN AFRICA - ANGOLA, BENIN, BOTSWANA, ...","EARLY DETECTION AND TREATMENT, HEALTHY DEVELOP...",206742,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
5,,,"SUB-SAHARAN AFRICA - ANGOLA, BENIN, BOTSWANA, ...","EARLY DETECTION AND TREATMENT, HEALTHY DEVELOP...",35261,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
6,,,SUB-SAHARAN AFRICA,HEALTH SYSTEM STRENGTHENING,11246,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
7,,,SUB-SAHARAN,COMMUNITY,5075,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
8,,,SUB-SAHARAN AFRICA,HEALTH SYSTEM STRENGTHENING,16075,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json
9,,,SUB-SAHARAN,FIELD STAFF,12360,WIRE TRANSFER,,,,EIN_135562163_YEAR_2020_FORMTYPE_990.json


In [53]:
pd.concat(schedule_f_part_iii_rows)

field,type_of_grant_assistance,region,number_recipients,amount_cash_grant,manner_cash_disbursemetn,amount_noncash_assistance,desc_noncash_assistance,method_valuation,file
0,FREEDOM FELLOWSHIP AWARD,CENTRAL AMERICA AND THE CARIBBEAN,1,25000,WIRE/CHECK,,,,EIN_202669700_YEAR_2020_FORMTYPE_990.json
1,"FREEDOM FELLOWSHIP AWARD, MICRO",EAST ASIA AND PACIFIC THE,5,71000,WIRE/CHECK,11747.0,BITCOIN 1.0 BTC BITCOIN PRICE,MARKET,EIN_202669700_YEAR_2020_FORMTYPE_990.json
2,"FREEDOM FELLOWSHIP AWARD, MICRO GRANTS, BITCOI...",EUROPE (INCLUDING ICELAND & GREENLAND),7,50500,WIRE/CHECK,61747.0,BITCOIN 1.0 BTC BITCOIN PRICE,MARKET,EIN_202669700_YEAR_2020_FORMTYPE_990.json
3,"MICRO GRANTS, VENEZUELA AID AWARD",SOUTH AMERICA,8,28350,WIRE/CHECK,,,,EIN_202669700_YEAR_2020_FORMTYPE_990.json
4,FREEDOM FELLOWSHIP,SUB-SAHARAN,5,82500,WIRE/CHECK,,,,EIN_202669700_YEAR_2020_FORMTYPE_990.json
