In [1]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import requests
from bs4 import BeautifulSoup
import time
import lxml
import re

# Parsing XML Files from ProPublica

Last updated: 6/1/23 by Christopher Lum

### Methods

`scrape_website(url, attempts=0)`
- Takes a URL and returns the response object if successful. Retries three times on failure

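`info_grabber(fp, url=True, errors=False)`
- Takes the URL/filepath of an XML file. The `url` parameter indicates whether the given filepath is a URL or local
- With `errors=True`, the `Fundraising Events` value is replaced by the `FundraisingAmt` tag whenever that makes the contribution columns add up to `Total`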
- Returns a dictionary of the row information for section VIII (along with a couple other columns)
- Older (prior to 2013) XML files have different tags and thus can't be read. This results in a row of all zeroes with `'Not found'` in the `ba` column
- Of successfully scraped columns, has ~1.5% error rate for rows (0.13% of cells are inaccurate), only in the `Fundraising Events` column due to inconsistent labeling in XMLs
    - This error rate is lower than the human error rate
- In 2013, they had a different way of tagging zip codes, so all 2013 zips will be 0

`contact_grabber(fp, url=True)`
- Returns a dataframe of each contact in the 990 form, according to section VII
- Incomplete and edge cases aren't tested fully, as we didn't need the function.

`get_xmls(ein)`
- Takes the EIN of a company and returns a list of the URLs for each XML file that is linked on the ProPublica page for that given company

`grabber(eins, verbose=False, clean=False, errors=False)`
- Takes a list of EINs and uses `get_xmls()` to find each XML file, then uses `info_grabber()` to scrape each XML for the relevant information.
- Returns a pandas DataFrame, doesn't automatically save as csv.
- Takes around 22 minutes on my laptop for 71 businesses
- The `clean` parameter automatically cleans the dataframe by removing empty rows

`find_errors(info)`
- The data columns are supposed to add up to the total; if they don't, this method will find the offending rows
- Takes a dataframe of 990 rows from `grabber` and returns a tuple of indices of the rows that didn't properly grab all of the data
    - Preferably indexed on `EIN_YEAR`

#### `scrape_website`

In [2]:
def scrape_website(url, attempts=0):
    '''
    Fetch a URL with a GET request, retrying when the request fails.

    A failure is either a non-200 status code or a network-level error raised by
    `requests` (timeout, dropped connection, ...). The function sleeps for one second
    after every request and for one more second before each retry.

    :param url: The URL to request
    :param attempts: Number of failed attempts already made. Callers normally leave this at 0;
        retrying stops once it reaches 3
    :return: The response object on success, or None once the retries are used up
    '''
    while True:
        try:
            # The timeout keeps one unresponsive server from hanging a long scraping run
            response = requests.get(url, timeout=30)
            failure = response.status_code
        except requests.RequestException as exc:
            # Treat network errors like a bad status code so they are retried too
            response = None
            failure = type(exc).__name__
        time.sleep(1)

        # Check if the request was successful (status code 200)
        if response is not None and response.status_code == 200:
            if attempts > 0:
                print('Success')
            return response

        if attempts >= 3:
            # Out of retries: report the failure and let the caller handle None
            print("Error: ", failure, 'Failed on:', url)
            return None

        print("Error:", failure, "Trying again.")
        time.sleep(1)
        attempts += 1


#### `info_grabber`

In [46]:
def info_grabber(fp, url=True, errors=False):
    '''
    Scrape one 990 XML filing into a dictionary holding a single row of data.

    :param fp: URL or local filepath of the XML file
    :param url: Whether `fp` is a URL (fetched with `scrape_website`) or a local path. Default = True
    :param errors: If True, replace 'Fundraising Events' with the value of the `FundraisingAmt` tag
        whenever that makes the contribution columns add up to 'Total'. Default = False
    :return: A dictionary with the keys 'ba', 'EIN', 'Tax Year', 'Location (Zipcode)', the
        contribution columns and 'Total'. Tags missing from the XML become 0, except 'ba'
        ('Not found') and 'Tax Year' (NaN). An empty dictionary is returned when the file
        could not be downloaded or opened
    '''
    if url:
        response = scrape_website(fp)
        # scrape_website gives back None when every attempt failed, so check before using it
        if response is None:
            return dict()
        response.encoding = 'UTF-8'
        xml = response.text.strip()
    else:
        try:
            with open(fp, 'r') as file:
                xml = file.read()
        except FileNotFoundError:
            print(f"File not found: {fp}")
            return dict()

    soup = BeautifulSoup(xml, 'xml')

    # Column name -> XML tag, listed in the column order of the resulting row
    tags = {
        'ba': 'BusinessName',
        'EIN': 'EIN',
        'Tax Year': 'TaxPeriodBeginDt',
        'Location (Zipcode)': 'USAddress',
        'Federate Campaigns': 'FederatedCampaignsAmt',
        'Membership Dues': 'MembershipDuesAmt',
        'Fundraising Events': 'ContriRptFundraisingEventAmt',
        'Related Organizations': 'RelatedOrganizationsAmt',
        'Government Grants': 'GovernmentGrantsAmt',
        'All Other Contributions': 'AllOtherContributionsAmt',
        'Noncash Contributions': 'NoncashContributionsAmt',
        'Total': 'TotalContributionsAmt',
    }
    found = {column: soup.find(tag) for column, tag in tags.items()}

    # The zip code is nested inside the address element
    if found['Location (Zipcode)'] is not None:
        found['Location (Zipcode)'] = found['Location (Zipcode)'].find('ZIPCd')

    # Reduce every element to its text, keeping None for tags that were not found
    row = {column: (element.text if element is not None else None) for column, element in found.items()}

    row['ba'] = row['ba'].strip() if row['ba'] is not None else 'Not found'
    row['Tax Year'] = pd.Timestamp(row['Tax Year']).year if row['Tax Year'] is not None else np.nan

    if errors:
        sum_keys = ['Federate Campaigns', 'Membership Dues', 'Related Organizations', 'Government Grants', 'All Other Contributions']
        partial_sum = sum(float(row[key]) for key in sum_keys if row[key] is not None)
        fund_element = soup.find('FundraisingAmt')
        fund_amt = int(fund_element.text) if fund_element is not None else 0

        # Only trust FundraisingAmt when it makes the components add up to the reported total
        if row['Total'] is not None and int(partial_sum + fund_amt) == int(row['Total']):
            row['Fundraising Events'] = fund_amt

    # Anything still missing is reported as 0 ('Tax Year' stays NaN because NaN is not None)
    return {column: (value if value is not None else 0) for column, value in row.items()}


#### `contact_grabber`

Old, might not work

In [4]:
def contact_grabber(fp, url=True):
    '''
    Return a dataframe of the important names and their positions in an organization for each year

    :param fp: URL or local filepath of the xml file
    :param url: Whether the fp is a url or not. Default = True
    :return: A dataframe with the columns 'organization', 'year', 'names' and 'titles', one row per
        person that has a `PersonNm` tag. A person without a `TitleTxt` tag gets a title of None.
        An empty dictionary is returned when the file could not be downloaded or opened
    '''
    if url:
        response = scrape_website(fp)
        if response is None:
            return dict()
        response.encoding = 'UTF-8'
        xml = response.text.strip()
    else:
        # A local path cannot be fetched over HTTP; read it from disk the way info_grabber does
        try:
            with open(fp, 'r') as file:
                xml = file.read().strip()
        except FileNotFoundError:
            print(f"File not found: {fp}")
            return dict()
    soup = BeautifulSoup(xml, 'xml')

    # Collect name and title per person so the two columns always stay aligned
    names = []
    titles = []
    for person in soup.find_all('Form990PartVIISectionAGrp'):
        name = person.find('PersonNm')
        if name is None:
            continue
        title = person.find('TitleTxt')
        names.append(name.text)
        titles.append(title.text if title is not None else None)

    business = soup.find('BusinessName')
    organization = [business.text.strip() if business is not None else 'Not found'] * len(names)

    period_start = soup.find('TaxPeriodBeginDt')
    year = [pd.Timestamp(period_start.text).year if period_start is not None else 0] * len(names)

    return pd.DataFrame({'organization': organization, 'year': year, 'names': names, 'titles': titles})

#### `get_xmls`

In [5]:
def get_xmls(ein):
    '''
    Return a list of xml urls from ProPublica based on an organization's EIN

    :param ein: Employer Identification number as String or Int
    :return: List of absolute URLs, one per element with the classes `action xml` whose text
        mentions a 990 filing. Empty when the organization page could not be downloaded
    '''
    url = f'https://projects.propublica.org/nonprofits/organizations/{ein}'
    html = scrape_website(url)
    if html is None:
        return []
    # Name the parser explicitly (lxml is already imported by this notebook) so the result
    # does not depend on which parsers happen to be installed
    soup = BeautifulSoup(html.text, 'lxml')
    base_url = 'https://projects.propublica.org'

    urls = []
    for element in soup.find_all(class_='action xml'):
        if not re.search(r'990\b', element.text):
            continue
        if element.name == 'a':
            href = element.get('href')
        else:
            # Non-anchor elements keep the link on an <option data-href=...> instead
            option = element.select_one('select.action.xml option[data-href]')
            href = option.get('data-href') if option is not None else None
        # Skip elements without a usable link instead of crashing on them
        if href is not None:
            urls.append(base_url + href)
    return urls

#### `grabber`

In [6]:
def grabber(eins, verbose=False, clean=False, errors=False):
    '''
    Scrape every 990 XML filing linked on ProPublica for the given EINs.

    :param eins: A single EIN (int or string) or an iterable of EINs
    :param verbose: If True, print a running counter and the URL of every XML file. Default = False
    :param clean: If True, drop rows containing NaN, cast the 'EIN'..'Total' columns to int and
        index the result on 'EIN_YEAR'. Default = False
    :param errors: Passed on to `info_grabber` as its `errors` argument. Default = False
    :return: A pandas DataFrame with one row per XML file. Nothing is saved to disk
    '''
    # A lone EIN is wrapped in a list so that a string EIN is not iterated digit by digit
    if isinstance(eins, (int, np.integer, str)):
        eins = [eins]
    else:
        eins = list(eins)

    data = []
    overall_index = 0
    for index, ein in enumerate(eins):
        print(str(index) + (' / ') + str(len(eins)))
        for xml in get_xmls(ein):
            if verbose:
                print(f'{overall_index} {xml}')
            data.append(info_grabber(xml, errors=errors))
            overall_index += 1
    print(str(len(eins)) + (' / ') + str(len(eins)))

    info = pd.DataFrame(data)
    # With no scraped columns there is nothing to clean, and the column slice below would fail
    if not clean or info.empty:
        return info

    no_error = info.dropna().reset_index(drop=True)
    no_error.loc[:, 'EIN':'Total'] = no_error.loc[:, 'EIN':'Total'].astype(int)
    no_error.loc[:, 'EIN_YEAR'] = no_error['EIN'].astype(str) + '_' + no_error['Tax Year'].astype(str)
    no_error = no_error.set_index('EIN_YEAR')

    return no_error

#### `find_errors`

In [7]:
def find_errors(info):
    '''
    Find the rows of a 990 dataframe whose contribution columns do not add up to 'Total'.

    :param info: The dataframe of 990 rows
    :return: A tuple with the index labels of the rows that failed the check
    '''
    component_columns = [
        'Federate Campaigns',
        'Membership Dues',
        'Fundraising Events',
        'Related Organizations',
        'Government Grants',
        'All Other Contributions',
    ]
    component_total = info[component_columns].sum(axis=1)
    mismatched = component_total != info['Total']
    return tuple(info.index[mismatched.to_numpy()])

### Using the `grabber()` method

`grabber()` takes a list of EINs. This was curated from the old "990 Manual Pull" spreadsheet by pasting and splitting the entire `EIN` column and putting it into a set.

In [8]:
# EIN column pasted as-is, one EIN per line. Repeated EINs are left in here and are
# removed when the list of EINs is built from this string.
all_eins = '''113723093
113723093
113723093
113723093
113723093
237279074
237279074
237279074
237279074
237279074
237334012
237334012
237334012
237334012
237334012
261712580
261712580
261712580
261712580
261712580
261712580
330006089
330006089
330006089
330006089
330215585
330215585
330215585
330215585
330215585
330317950
330317950
330317950
330317950
330433314
330433314
330433314
330433314
330433314
330492304
330492304
330492304
330492304
330492304
330496092
330496092
330496092
330496092
330623634
330623634
330623634
330623634
330623634
331029843
331029843
331029843
331029843
331146733
331146733
331146733
331146733
465055513
465055513
465055513
465055513
942358632
942358632
942358632
942358632
942358632
951644024
951644024
951644024
951644024
951869806
951869806
951869806
951869806
951869806
951869806
951874073
951874073
951874073
951874073
951874073
951874073
951944230
951944230
951944230
951944230
951944230
952039198
952039198
952039198
952039198
952157559
952157559
952157559
952157559
952157559
952648050
952648050
952648050
952648050
952648050
952648050
952653869
952653869
952653869
952653869
952653869
952833205
952833205
952833205
952833205
952833205
952850121
952850121
952850121
952850121
952850121
952880426
952880426
952880426
952880426
952880426
953244085
953244085
953244085
953244085
953244085
953248148
953248148
953248148
953248148
953248148
953302967
953302967
953302967
953302967
953302967
953302967
953315571
953315571
953315571
953315571
953315571
953649525
953649525
953649525
953649525
956379598
956379598
956379598
956379598
953497926
953497926
953497926
953497926
953497926
953497926
953837714
953837714
953837714
953837714
953837714
953140767
953140767
953140767
953140767
951648184
951648184
951648184
951648184
951648184
952693142
952693142
952693142
952693142
952693142
330217339
330217339
330217339
330217339
330217339
330217339
952794253
952794253
952794253
952794253
952794253
952794253
951945256
951945256
951945256
951945256
951945256
270865318
270865318
270865318
270865318
270865318
331146733
331146733
331146733
331146733
331146733
821946283
821946283
263405689
263405689
263405689
263405689
263405689
263405689
330602842
330602842
330602842
330602842
330602842
330902617
330902617
330902617
330902617
330902617
952457354
952457354
952457354
952457354
952457354
330008269
330008269
330008269
330008269
330008269
953031682
953031682
953031682
953031682
953031682
237161267
237161267
237161267
237161267
330497515
330497515
330497515
330497515
330497515
953750738
953750738
953750738
953750738
953750738
953368020
953368020
953368020
953368020
953368020
953368020
953798088
953798088
953798088
953798088
952111196
952111196
952111196
952111196
952949636
952949636
952949636
952949636
952949636
330618893
330618893
953782164
953782164
953782164
953782164
953782164
953782164
272917644
272917644
272917644
272917644
273390797
273390797
273390797
931008369
931008369
931008369
931008369
931008369
952213995
952213995
952213995
952213995
562613191
562613191
562613191
562613191
562613191
952422704
952422704
952422704
952422704
941676390
941676390
941676390
941676390
941676390
550806460
550806460
550806460
550806460
550806460
550806460
330122462
330122462
330122462
330122462
270447059
270447059
270447059
270447059
270447059
330210280
822363154
953138268
953950196'''

In [9]:
# Space-separated alternative EIN list (with repeats); only referenced by the
# commented-out line in the cell that builds `eins`.
# NOTE(review): the name suggests these are EINs whose rows came out wrong — confirm.
wrong_eins = '''952850121 562613191 952457354 330210280 952877102 330433314 204374795 330602842 465055513 263405689 263405689 330602842 330602842 330602842 330433314 951644024 330553621 952653869 330433314 941676390 953302967 941676390 941676390 941676390 465055513 330602842 952850121 330006089 952111196 330433314 562613191 952880426 953244085
'''

In [10]:
# Split the pasted text on whitespace and deduplicate it. Going through a set means the
# order of `eins` is arbitrary, not the order of the pasted column.
eins = list(set(all_eins.split()))
# eins = list(set(wrong_eins.split()))

<b>Warning</b>: `grabber()` takes a long time. The following cell will take about 20 minutes.

# 990 - Manual Pull Testing

In [11]:
# Scrape every filing of every EIN; verbose=True prints each XML URL as it is fetched.
info990 = grabber(eins, verbose=True)

0 / 71
0 https://projects.propublica.org/nonprofits/download-xml?object_id=202213189349309271
1 https://projects.propublica.org/nonprofits/download-xml?object_id=202113159349305206
2 https://projects.propublica.org/nonprofits/download-xml?object_id=202003219349313685
3 https://projects.propublica.org/nonprofits/download-xml?object_id=201943199349317894
4 https://projects.propublica.org/nonprofits/download-xml?object_id=201802989349301595
5 https://projects.propublica.org/nonprofits/download-xml?object_id=201723119349302897
6 https://projects.propublica.org/nonprofits/download-xml?object_id=201643209349310824
7 https://projects.propublica.org/nonprofits/download-xml?object_id=201403219349311755
8 https://projects.propublica.org/nonprofits/download-xml?object_id=201303199349309400
9 https://projects.propublica.org/nonprofits/download-xml?object_id=201223209349304657
10 https://projects.propublica.org/nonprofits/download-xml?object_id=201123199349309062
1 / 71
11 https://projects.propubli

93 https://projects.propublica.org/nonprofits/download-xml?object_id=202211239349301641
94 https://projects.propublica.org/nonprofits/download-xml?object_id=202032749349300628
10 / 71
95 https://projects.propublica.org/nonprofits/download-xml?object_id=202311159349300301
96 https://projects.propublica.org/nonprofits/download-xml?object_id=202241089349300824
97 https://projects.propublica.org/nonprofits/download-xml?object_id=202101109349300840
98 https://projects.propublica.org/nonprofits/download-xml?object_id=202020739349300702
99 https://projects.propublica.org/nonprofits/download-xml?object_id=201941309349301444
100 https://projects.propublica.org/nonprofits/download-xml?object_id=201830649349300853
101 https://projects.propublica.org/nonprofits/download-xml?object_id=201740459349301999
102 https://projects.propublica.org/nonprofits/download-xml?object_id=201640469349302904
103 https://projects.propublica.org/nonprofits/download-xml?object_id=201520489349301277
104 https://projects

185 https://projects.propublica.org/nonprofits/download-xml?object_id=201613559349300771
186 https://projects.propublica.org/nonprofits/download-xml?object_id=201630429349302213
20 / 71
187 https://projects.propublica.org/nonprofits/download-xml?object_id=202243559349300224
188 https://projects.propublica.org/nonprofits/download-xml?object_id=202103199349328120
189 https://projects.propublica.org/nonprofits/download-xml?object_id=202002539349301195
190 https://projects.propublica.org/nonprofits/download-xml?object_id=201942059349301014
191 https://projects.propublica.org/nonprofits/download-xml?object_id=201802419349300235
192 https://projects.propublica.org/nonprofits/download-xml?object_id=201713179349302491
193 https://projects.propublica.org/nonprofits/download-xml?object_id=201633209349306243
194 https://projects.propublica.org/nonprofits/download-xml?object_id=201512889349301466
195 https://projects.propublica.org/nonprofits/download-xml?object_id=201443219349304499
21 / 71
196 h

277 https://projects.propublica.org/nonprofits/download-xml?object_id=201431359349305868
278 https://projects.propublica.org/nonprofits/download-xml?object_id=201333179349304958
29 / 71
279 https://projects.propublica.org/nonprofits/download-xml?object_id=202213569349301336
280 https://projects.propublica.org/nonprofits/download-xml?object_id=202200619349301205
281 https://projects.propublica.org/nonprofits/download-xml?object_id=202013159349303941
282 https://projects.propublica.org/nonprofits/download-xml?object_id=202030419349301403
283 https://projects.propublica.org/nonprofits/download-xml?object_id=201900269349300605
284 https://projects.propublica.org/nonprofits/download-xml?object_id=201723179349307002
285 https://projects.propublica.org/nonprofits/download-xml?object_id=201741379349300649
286 https://projects.propublica.org/nonprofits/download-xml?object_id=201643209349307094
30 / 71
287 https://projects.propublica.org/nonprofits/download-xml?object_id=202232459349301428
288 h

369 https://projects.propublica.org/nonprofits/download-xml?object_id=201123199349302942
39 / 71
370 https://projects.propublica.org/nonprofits/download-xml?object_id=202330819349301668
371 https://projects.propublica.org/nonprofits/download-xml?object_id=202201369349311415
372 https://projects.propublica.org/nonprofits/download-xml?object_id=202132959349302003
373 https://projects.propublica.org/nonprofits/download-xml?object_id=202021369349300842
374 https://projects.propublica.org/nonprofits/download-xml?object_id=201900369349300815
375 https://projects.propublica.org/nonprofits/download-xml?object_id=201733429349300133
376 https://projects.propublica.org/nonprofits/download-xml?object_id=201613409349300211
377 https://projects.propublica.org/nonprofits/download-xml?object_id=201523079349301157
378 https://projects.propublica.org/nonprofits/download-xml?object_id=201520099349300632
379 https://projects.propublica.org/nonprofits/download-xml?object_id=201333459349300018
380 https://p

461 https://projects.propublica.org/nonprofits/download-xml?object_id=201603199349306620
462 https://projects.propublica.org/nonprofits/download-xml?object_id=201523019349301012
463 https://projects.propublica.org/nonprofits/download-xml?object_id=201432899349300323
464 https://projects.propublica.org/nonprofits/download-xml?object_id=201342739349300369
50 / 71
465 https://projects.propublica.org/nonprofits/download-xml?object_id=202243329349300104
466 https://projects.propublica.org/nonprofits/download-xml?object_id=202142319349300329
467 https://projects.propublica.org/nonprofits/download-xml?object_id=202001769349301120
468 https://projects.propublica.org/nonprofits/download-xml?object_id=201922769349301127
469 https://projects.propublica.org/nonprofits/download-xml?object_id=201822889349300407
470 https://projects.propublica.org/nonprofits/download-xml?object_id=201731289349300318
471 https://projects.propublica.org/nonprofits/download-xml?object_id=201601349349302565
472 https://p

553 https://projects.propublica.org/nonprofits/download-xml?object_id=202011399349301901
554 https://projects.propublica.org/nonprofits/download-xml?object_id=201940919349300829
555 https://projects.propublica.org/nonprofits/download-xml?object_id=201811319349303161
556 https://projects.propublica.org/nonprofits/download-xml?object_id=201731359349311873
557 https://projects.propublica.org/nonprofits/download-xml?object_id=201601379349310830
558 https://projects.propublica.org/nonprofits/download-xml?object_id=201531349349308828
559 https://projects.propublica.org/nonprofits/download-xml?object_id=201441339349305869
560 https://projects.propublica.org/nonprofits/download-xml?object_id=201311359349308641
561 https://projects.propublica.org/nonprofits/download-xml?object_id=201231299349301928
562 https://projects.propublica.org/nonprofits/download-xml?object_id=201121369349306422
60 / 71
563 https://projects.propublica.org/nonprofits/download-xml?object_id=202213199349325486
564 https://p

645 https://projects.propublica.org/nonprofits/download-xml?object_id=201531049349300043
646 https://projects.propublica.org/nonprofits/download-xml?object_id=201400509349300220
647 https://projects.propublica.org/nonprofits/download-xml?object_id=201330889349300118
648 https://projects.propublica.org/nonprofits/download-xml?object_id=201231999349300043
649 https://projects.propublica.org/nonprofits/download-xml?object_id=201121299349302002
69 / 71
650 https://projects.propublica.org/nonprofits/download-xml?object_id=202320899349301857
651 https://projects.propublica.org/nonprofits/download-xml?object_id=202221389349301022
652 https://projects.propublica.org/nonprofits/download-xml?object_id=202110749349300046
653 https://projects.propublica.org/nonprofits/download-xml?object_id=202022669349300022
654 https://projects.propublica.org/nonprofits/download-xml?object_id=201932909349301313
655 https://projects.propublica.org/nonprofits/download-xml?object_id=201833199349312273
656 https://p

### Checking the missing cells

The following two cells should both be 1.0.

The first checks the conditional probability that if the name is not found in a row, the tax year wasn't found.

The second checks the inverse conditional probability. Sometimes, if the request attempt tried and failed three times, the row will be entirely empty, where the name is missing (`NaN`) and not `'Not found'`

In [12]:
# Share of 'Not found' rows whose tax year is missing as well (expected: 1.0)
info990.loc[info990['ba'] == 'Not found', 'Tax Year'].isna().mean()

1.0

In [13]:
# Share of rows without a tax year whose name is 'Not found' as well (expected: 1.0)
info990.loc[info990['Tax Year'].isna(), 'ba'].eq('Not found').mean()

1.0

This cell indicates how many XML files failed to be accessed

In [14]:
# Count the rows that have no business name at all (NaN rather than 'Not found');
# an empty dict returned by info_grabber turns into such an all-NaN row.
info990['ba'].isna().sum()

0

If the following cell returns 0.0, that indicates that every time the tax year is missing, every single financial cell is zero. All of these rows can be dropped and no information is lost.

In [15]:
# Add up every column from 'Federate Campaigns' onward over the rows without a tax year;
# a result of 0 means those rows carry no financial data.
info990.loc[info990['Tax Year'].isna(), 'Federate Campaigns':].sum(axis=1).sum()

0

### Cleaning the raw information

Assuming the prior three checks are all adequate, we can drop rows with any nan values.

In [16]:
# Drop every row that contains a NaN and renumber the remaining rows.
no_error = info990.dropna().reset_index(drop=True)
# Cast the 'EIN'..'Total' columns to int and write them back through .loc.
no_error.loc[:, 'EIN':'Total'] = no_error.loc[:, 'EIN':'Total'].astype(int)
# Build the '<EIN>_<Tax Year>' key and use it as the index.
# NOTE(review): the tables shown further down have keys such as '952457354_2021.0', so
# 'Tax Year' apparently still stringifies as a float here — confirm before relying on the key format.
no_error.loc[:, 'EIN_YEAR'] = no_error['EIN'].astype(str) + '_' + no_error['Tax Year'].astype(str)
no_error = no_error.set_index('EIN_YEAR')

### Export here!

<b>REMEMBER TO EXPORT</b>

`no_error` is the cleaned dataframe

`info990` is the raw dataframe with missing rows

In [17]:
 # Save the cleaned table; its EIN_YEAR index is written out as the first CSV column.
 no_error.to_csv('no_errors.csv')

In [18]:
 # Save the raw table, including the rows that could not be parsed.
 info990.to_csv('full_table.csv')

The following cell appends the new rows to the bottom of the existing `no_errors.csv` export instead of overwriting it.

It writes the result to a new CSV (`no_errors_appended.csv`); once you have confirmed that it was exported successfully, it is recommended to rename it back to `no_errors.csv`.

In [21]:
# Load the previous export with EIN_YEAR as its index so old and new rows share the same key;
# reading it as a plain column and resetting the index would leave the new rows without a key.
extended = pd.concat([pd.read_csv('no_errors.csv', index_col='EIN_YEAR'), no_error])
# Keep the first row per EIN_YEAR, so a filing already in the old export is not added twice.
extended = extended[~extended.index.duplicated(keep='first')]
# Written with the EIN_YEAR index first, the same layout as no_errors.csv.
extended.to_csv('no_errors_appended.csv')

## Testing Zone

In [22]:
# Scratch version of the link extraction for a single organization page.
url = 'https://projects.propublica.org/nonprofits/organizations/263405689'
html = scrape_website(url)
# NOTE(review): scrape_website can return None, in which case html.text raises here.
soup = BeautifulSoup(html.text)
links = soup.find_all('a', class_='action xml')
base_url = 'https://projects.propublica.org'
# Only anchors whose text is exactly '990' are kept in this variant.
urls = [base_url + x.get('href') for x in links if x.text == '990']

In [23]:
# Any element with the classes 'action xml' whose text contains '990' at a word boundary.
test = [x for x in soup.find_all(class_='action xml') if re.search(r'990\b', x.text)]
# Anchors carry the link in href; other elements are searched for an <option data-href=...>.
test2 = [base_url + x.get('href') if x.name == 'a' else base_url + x.select_one('select.action.xml option[data-href]').get('data-href') for x in test]

In [24]:
# Show the raw scraped table.
info990

Unnamed: 0,ba,EIN,Tax Year,Location (Zipcode),Federate Campaigns,Membership Dues,Fundraising Events,Related Organizations,Government Grants,All Other Contributions,Noncash Contributions,Total
0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2021.0,91911,0,0,0,0,44408141,3256460,2136983,47664601
1,Metropolitan Area Advisory Committee on Anti-P...,952457354,2020.0,91911,0,0,0,0,35941636,3522954,2632688,39464590
2,Metropolitan Area Advisory Committee on Anti-P...,952457354,2019.0,91911,0,0,204555,0,35487026,2434789,2298104,38126370
3,Metropolitan Area Advisory Committee on Anti-P...,952457354,2018.0,91911,0,0,35000,0,29811044,2412572,1869374,32258616
4,Metropolitan Area Advisory Committee on Anti-P...,952457354,2017.0,91911,0,0,36800,0,27582015,2269306,2049541,29888121
...,...,...,...,...,...,...,...,...,...,...,...,...
664,ALPHA PROJECT FOR THE HOMELESS,330215585,2015.0,92103,0,0,0,0,2827016,2417961,1086372,5244977
665,ALPHA PROJECT FOR THE HOMELESS,330215585,2014.0,92103,0,0,0,0,4037724,1847458,676252,5885182
666,ALPHA PROJECT FOR THE HOMELESS,330215585,2013.0,0,0,0,0,0,2065058,4124494,469651,6189552
667,Not found,330215585,,0,0,0,0,0,0,0,0,0


In [27]:
# Stack the previous export on top of the freshly cleaned rows and drop exact duplicates.
# NOTE(review): read_csv returns EIN_YEAR as an ordinary column while no_error carries it as
# its index, and reset_index(drop=True) discards that index — check that the appended rows
# keep their key and that drop_duplicates can actually match them.
extended = pd.concat([pd.read_csv('no_errors.csv'), no_error]).reset_index(drop=True).drop_duplicates()

In [28]:
# Cast the 'EIN'..'Total' columns to int, writing them back through .loc.
extended.loc[:, 'EIN':'Total'] = extended.loc[:, 'EIN':'Total'].astype(int)

In [29]:
# Drop exact duplicate rows once more after the cast.
extended = extended.drop_duplicates()

In [30]:
# Reload the cleaned export from disk.
info = pd.read_csv('no_errors.csv')

In [31]:
# Index the reloaded table on EIN_YEAR so flagged rows can be looked up by that key.
info = info.set_index('EIN_YEAR')
# accurate = info[['Federate Campaigns', 'Membership Dues', 'Fundraising Events', 'Related Organizations', 'Government Grants', 'All Other Contributions']].sum(axis=1) == info['Total']

In [32]:
# Show the reloaded, EIN_YEAR-indexed table.
info

Unnamed: 0_level_0,ba,EIN,Tax Year,Location (Zipcode),Federate Campaigns,Membership Dues,Fundraising Events,Related Organizations,Government Grants,All Other Contributions,Noncash Contributions,Total
EIN_YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
952457354_2021.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2021.0,91911,0,0,0,0,44408141,3256460,2136983,47664601
952457354_2020.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2020.0,91911,0,0,0,0,35941636,3522954,2632688,39464590
952457354_2019.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2019.0,91911,0,0,204555,0,35487026,2434789,2298104,38126370
952457354_2018.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2018.0,91911,0,0,35000,0,29811044,2412572,1869374,32258616
952457354_2017.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2017.0,91911,0,0,36800,0,27582015,2269306,2049541,29888121
...,...,...,...,...,...,...,...,...,...,...,...,...
330215585_2017.0,ALPHA PROJECT FOR THE HOMELESS,330215585,2017.0,92103,0,0,0,0,1972019,8974475,1743782,10946494
330215585_2016.0,ALPHA PROJECT FOR THE HOMELESS,330215585,2016.0,92103,0,0,0,0,1251303,6178785,1876303,7430088
330215585_2015.0,ALPHA PROJECT FOR THE HOMELESS,330215585,2015.0,92103,0,0,0,0,2827016,2417961,1086372,5244977
330215585_2014.0,ALPHA PROJECT FOR THE HOMELESS,330215585,2014.0,92103,0,0,0,0,4037724,1847458,676252,5885182


In [33]:
# tuple(accurate[~accurate].index)

In [37]:
# EIN_YEAR labels of the rows whose contribution columns do not add up to 'Total'.
info_EINerrs = find_errors(info)

In [49]:
# Rows of `info` that were flagged by find_errors.
# NOTE(review): the next cell rebinds `info_errs` to a list of EINs, so this frame is lost
# when the notebook is run top to bottom.
info_errs = info[info.index.isin(info_EINerrs)]

In [43]:
# Recover the EIN from each 'EIN_YEAR' label by cutting at the last underscore, rather than
# assuming that the year suffix is always exactly seven characters long.
info_errs = [int(err.rsplit('_', 1)[0]) for err in info_EINerrs]
info_errs

[951874073,
 953368020,
 272917644,
 952039198,
 952039198,
 263405689,
 331146733,
 953497926,
 953302967,
 550806460]

In [45]:
# Re-scrape the flagged organizations with errors=True, which lets info_grabber fall back
# to the FundraisingAmt tag when that makes a row add up.
filtered_info_errs = grabber(info_errs, errors=True)

0 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
1 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
2 / 10
woot woot
woot woot
woot woot
woot woot
3 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
4 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
5 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
6 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
7 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
8 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
9 / 10
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
woot woot
10 / 10


In [50]:
# Same cleaning steps as for the main table: drop rows containing NaN and renumber.
info_no_errors = filtered_info_errs.dropna().reset_index(drop=True)
# Cast the 'EIN'..'Total' columns to int and write them back through .loc.
info_no_errors.loc[:, 'EIN':'Total'] = info_no_errors.loc[:, 'EIN':'Total'].astype(int)
# Build the '<EIN>_<Tax Year>' key and use it as the index.
info_no_errors.loc[:, 'EIN_YEAR'] = info_no_errors['EIN'].astype(str) + '_' + info_no_errors['Tax Year'].astype(str)
info_no_errors = info_no_errors.set_index('EIN_YEAR')

In [53]:
# Write the re-scraped rows to disk, then load them straight back indexed on EIN_YEAR
info_no_errors.to_csv('info_no_errors.csv')
EINinfoerrs = pd.read_csv('info_no_errors.csv').set_index('EIN_YEAR')

In [55]:
# Check which of the re-scraped rows still fail the sum check.
find_errors(EINinfoerrs)

('951874073_2015.0',)

In [57]:
# Inspect the row that is still flagged (list indexer, so the result stays a dataframe).
EINinfoerrs.loc[['951874073_2015.0']]

Unnamed: 0_level_0,ba,EIN,Tax Year,Location (Zipcode),Federate Campaigns,Membership Dues,Fundraising Events,Related Organizations,Government Grants,All Other Contributions,Noncash Contributions,Total
EIN_YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
951874073_2015.0,San Diego Rescue Mission Inc,951874073,2015.0,92138,0,0,265175,0,0,15535669,9304663,15535669


# Testing EINs from Manual Pull

In [58]:
# Read the curated EIN list and keep the non-missing EINs as plain ints
csvfile_eins = pd.read_csv('990 Manual Pull - Cleaned EIN List.csv')
eins = [int(ein) for ein in csvfile_eins.get('EIN').dropna()]
#eins

In [59]:
# Scrape every EIN from the CSV list; the commented-out flag would also print each XML URL.
parsedEINs = grabber(eins)#, verbose=True)

0 / 84
1 / 84
2 / 84
3 / 84
4 / 84
5 / 84
6 / 84
7 / 84
8 / 84
9 / 84
10 / 84
11 / 84
12 / 84
13 / 84
14 / 84
15 / 84
16 / 84
17 / 84
18 / 84
19 / 84
20 / 84
21 / 84
22 / 84
23 / 84
24 / 84
25 / 84
26 / 84
27 / 84
28 / 84
29 / 84
30 / 84
31 / 84
32 / 84
33 / 84
34 / 84
35 / 84
36 / 84
37 / 84
38 / 84
39 / 84
40 / 84
41 / 84
42 / 84
43 / 84
44 / 84
45 / 84
46 / 84
47 / 84
48 / 84
49 / 84
50 / 84
51 / 84
52 / 84
53 / 84
54 / 84
55 / 84
56 / 84
57 / 84
58 / 84
59 / 84
60 / 84
61 / 84
62 / 84
63 / 84
64 / 84
65 / 84
66 / 84
67 / 84
68 / 84
69 / 84
70 / 84
71 / 84
72 / 84
73 / 84
74 / 84
75 / 84
76 / 84
77 / 84
78 / 84
79 / 84
80 / 84
81 / 84
82 / 84
83 / 84
84 / 84


In [60]:
# Same cleaning steps as before: drop rows containing NaN and renumber.
no_errorsEIN = parsedEINs.dropna().reset_index(drop=True)
# Cast the 'EIN'..'Total' columns to int and write them back through .loc.
no_errorsEIN.loc[:, 'EIN':'Total'] = no_errorsEIN.loc[:, 'EIN':'Total'].astype(int)
# Build the '<EIN>_<Tax Year>' key and use it as the index.
no_errorsEIN.loc[:, 'EIN_YEAR'] = no_errorsEIN['EIN'].astype(str) + '_' + no_errorsEIN['Tax Year'].astype(str)
no_errorsEIN = no_errorsEIN.set_index('EIN_YEAR')

In [61]:
# Save the cleaned rows scraped from the CSV-based EIN list.
no_errorsEIN.to_csv('no_errorsEIN.csv')

In [62]:
# NOTE(review): this reads 'no_errors.csv', not the 'no_errorsEIN.csv' written in the
# previous cell — confirm which export is meant to be checked here.
EINinfo = pd.read_csv('no_errors.csv')

In [63]:
# Index the table on EIN_YEAR so flagged rows can be looked up by that key.
EINinfo = EINinfo.set_index('EIN_YEAR')

In [64]:
# Show the EIN_YEAR-indexed table.
EINinfo

Unnamed: 0_level_0,ba,EIN,Tax Year,Location (Zipcode),Federate Campaigns,Membership Dues,Fundraising Events,Related Organizations,Government Grants,All Other Contributions,Noncash Contributions,Total
EIN_YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
952457354_2021.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2021.0,91911,0,0,0,0,44408141,3256460,2136983,47664601
952457354_2020.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2020.0,91911,0,0,0,0,35941636,3522954,2632688,39464590
952457354_2019.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2019.0,91911,0,0,204555,0,35487026,2434789,2298104,38126370
952457354_2018.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2018.0,91911,0,0,35000,0,29811044,2412572,1869374,32258616
952457354_2017.0,Metropolitan Area Advisory Committee on Anti-P...,952457354,2017.0,91911,0,0,36800,0,27582015,2269306,2049541,29888121
...,...,...,...,...,...,...,...,...,...,...,...,...
330215585_2017.0,ALPHA PROJECT FOR THE HOMELESS,330215585,2017.0,92103,0,0,0,0,1972019,8974475,1743782,10946494
330215585_2016.0,ALPHA PROJECT FOR THE HOMELESS,330215585,2016.0,92103,0,0,0,0,1251303,6178785,1876303,7430088
330215585_2015.0,ALPHA PROJECT FOR THE HOMELESS,330215585,2015.0,92103,0,0,0,0,2827016,2417961,1086372,5244977
330215585_2014.0,ALPHA PROJECT FOR THE HOMELESS,330215585,2014.0,92103,0,0,0,0,4037724,1847458,676252,5885182


In [65]:
# EIN_YEAR labels of the rows whose contribution columns do not add up to 'Total'.
EIN_errs = find_errors(EINinfo)
EIN_errs

('951874073_2015.0',
 '953368020_2015.0',
 '272917644_2017.0',
 '952039198_2020.0',
 '952039198_2013.0',
 '263405689_2015.0',
 '331146733_2017.0',
 '953497926_2016.0',
 '953302967_2013.0',
 '550806460_2022.0')

#### Find errors, then prep data to double filter

In [66]:
# Rows flagged in the first pass (scraped without errors=True).
errs1 =EINinfo[EINinfo.index.isin(EIN_errs)]

In [67]:
# Recover the EIN from each 'EIN_YEAR' label by cutting at the last underscore, rather than
# assuming that the year suffix is always exactly seven characters long.
EINerrs = [int(err.rsplit('_', 1)[0]) for err in EIN_errs]

#### Double filter testing

In [68]:
# Second pass: re-scrape only the flagged organizations, this time with errors=True.
parsedErrEINs = grabber(EINerrs, errors=True)

0 / 10
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


In [69]:
# Same cleaning steps as before: drop rows containing NaN and renumber.
no_errors2 = parsedErrEINs.dropna().reset_index(drop=True)
# Cast the 'EIN'..'Total' columns to int and write them back through .loc.
no_errors2.loc[:, 'EIN':'Total'] = no_errors2.loc[:, 'EIN':'Total'].astype(int)
# Build the '<EIN>_<Tax Year>' key and use it as the index.
no_errors2.loc[:, 'EIN_YEAR'] = no_errors2['EIN'].astype(str) + '_' + no_errors2['Tax Year'].astype(str)
no_errors2 = no_errors2.set_index('EIN_YEAR')

In [70]:
# Save the cleaned second-pass rows.
no_errors2.to_csv('no_errors2.csv')

In [71]:
# Reload the second-pass export from disk.
EINErrinfo = pd.read_csv('no_errors2.csv')

In [72]:
# Index the second-pass table on EIN_YEAR.
EINErrinfo = EINErrinfo.set_index('EIN_YEAR')

In [73]:
# Rows that still fail the sum check after the errors=True re-scrape.
EINErr_errs = find_errors(EINErrinfo)
EINErr_errs

('951874073_2015.0',)

In [74]:
# Rows flagged in the second pass.
errs2 = EINErrinfo[EINErrinfo.index.isin(EINErr_errs)]

In [75]:
# EIN_YEAR labels that were flagged in both the first and the second pass.
[err for err in EINErr_errs if err in EIN_errs]

['951874073_2015.0']

In [76]:
# Inner-join the two error tables on all of their shared columns; only rows that are
# identical in both passes survive, and the EIN_YEAR index is not carried over.
pd.merge(errs1, errs2, how='inner')

Unnamed: 0,ba,EIN,Tax Year,Location (Zipcode),Federate Campaigns,Membership Dues,Fundraising Events,Related Organizations,Government Grants,All Other Contributions,Noncash Contributions,Total
0,San Diego Rescue Mission Inc,951874073,2015.0,92138,0,0,265175,0,0,15535669,9304663,15535669
