In [5]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import requests
from bs4 import BeautifulSoup
import time
import lxml
import re
import os
import zipfile

# 990 Parser from IRS Year Indexer

Last updated: 8/7/23 by Nadine Marcus

### Methods

`scrape_website(url, attempts=0)`
- Takes a URL and returns the response object if successful. Retries three times on failure

`info_grabber(fp, url=True)`
- Takes the URL/filepath of an XML file. The `url` parameter indicates whether the given filepath is a URL or local
- Returns a dictionary of the row information for section VIII (along with a couple other columns)
- Older (prior to 2013) XML files have different tags and thus can't be read. Results in a row of all zeroes and `'Not Found'` in the `ba` column
- Of successfully scraped columns, has ~1.5% error rate for rows (0.13% of cells are inaccurate), only in the `Fundraising Events` column due to inconsistent labeling in XMLs
    - This error rate is lower than the human error rate
- In 2013, they had a different way of tagging zip codes, so all 2013 zips will be 0

`contact_grabber(fp, url=True)`
- Returns a dataframe of each contact in the 990 form, according to section VII
- Incomplete and edge cases aren't tested fully, as we didn't need the function.

`get_xmls(ein)`
- Takes the EIN of a company and returns a list of the URLs for each XML file that is linked on the ProPublica page for that given company

`grabber(eins, verbose=False, clean=False)`
- Takes a list of EINs and uses `get_xmls()` to find each XML file, then uses `info_grabber()` to scrape each XML for the relevant information.
- Returns a pandas DataFrame, doesn't automatically save as csv.
- Takes around 22 minutes on my laptop for 71 businesses
- The `clean` parameter automatically cleans the dataframe by removing empty rows

`find_errors(info)`
- Data columns are supposed to add up to the total, if it doesn't this method will find it
- Takes a dataframe of 990 rows from `grabber` and returns a tuple of incides that didn't properly grab all of the data
    - Preferably indiced on `EIN_YEAR`
    
`extract_ein_number(xml_element)`
- Data columns are supposed to add up to the total, if it doesn't this method will find it
- Takes a dataframe of 990 rows from `grabber` and returns a tuple of incides that didn't properly grab all of the data
    - Preferably indiced on `EIN_YEAR`

#### `scrape_website`

In [29]:
def scrape_website(url, attempts=0):
    # Send GET request to the website
    response = requests.get(url)
    time.sleep(1)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        if attempts > 0:
            print('Success')
        return response
    elif attempts < 3:
        print("Error:", response.status_code, "Trying again.")
        time.sleep(1)
        return scrape_website(url, attempts + 1)
    else:
        # Handle the request error
        print("Error: ", response.status_code, 'Failed on:', url)
        return None


#### `ein_grabber`

In [6]:
def ein_grabber(fp,url=False):
    try:
        with open(fp, 'r') as file:
                xml = file.read()
    except FileNotFoundError:
        print(f"File not found: {fp}")
        return dict()
    if xml is None:
        return dict()
    
    soup = BeautifulSoup(xml, 'xml')
    return soup.find('EIN')

#### `get_eins`

In [37]:
def get_eins(folder_path):
    eins = []
    xml_files = [f for f in os.listdir(folder_path) if f.endswith('.xml')]
    ind = 1
    for xml_file in xml_files:
        xml_path = os.path.join(folder_path, xml_file)
        try:
            with open(xml_path, 'r') as file:
                xml_content = file.read()
                ein = ein_grabber(xml_path)
                eins.append(ein)
                print(ind)
        except FileNotFoundError as e:
            print(f"Error reading {xml_file}: {e}")
        except Exception as e:
            print(f"Error processing {xml_file}: {e}")
        ind+=1
    return eins

In [1]:
# folder_eins = get_eins(r"C:\Users\nadin\Downloads\XML_files_irs\2022_TEOS_XML_01B")
# folder_eins

#### `extract_ein_number` : only use if files are downloaded locally

In [2]:
def extract_ein_number(xml_element):
    if xml_element is not None:
        try:
            return xml_element.text.strip("<EIN>").strip("</EIN>")
        except Exception as e:
            pass
    return None

# ein_numbers = list(map(extract_ein_number, folder_eins))
# ein_numbers = [int(ein) for ein in ein_numbers if ein is not None]

# ein_numbers

#### `find_filename` : only use if you're finding the information through IRS site

In [15]:
def find_filename(eins):
    years = [2017, 2018, 2019, 2020, 2021, 2022, 2023]
    ein_data = pd.DataFrame(columns=['EIN', 'Year', 'TaxPayer', 'ObjectID'])
    
    for ein in eins:
        for year in years:
            index_csv = pd.read_csv(f"index_{year}.csv")
            ein_ind = index_csv[index_csv['EIN'] == ein]
            if not ein_ind.empty:
                ein_num = ein_ind['EIN'].iloc[0]
                
                sub_date = ein_ind['SUB_DATE'].iloc[0]
                if len(str(sub_date)) == 4:
                    ein_year = sub_date
                else:
                    ein_year = pd.to_datetime(sub_date).year
                taxpayer = ein_ind['TAXPAYER_NAME'].iloc[0]
                objID = str(ein_ind['OBJECT_ID'].iloc[0]) + '_public'
                ein_data = pd.concat([ein_data, pd.DataFrame({'EIN': [ein_num], 'Year': [ein_year], 'TaxPayer': [taxpayer], 'ObjectID': [objID]})], ignore_index=True)
            
    return ein_data

result = find_filename([480547708])
result

Unnamed: 0,EIN,Year,TaxPayer,ObjectID
0,480547708,2017,NATIONAL SOCIAL SCIENCE HONOR SOCIETY PI GAMMA MU,201613009349300846_public
1,480547708,2018,NATIONAL SOCIAL SCIENCE HONOR SOCIETY PI GAMMA MU,201810159349300801_public
2,480547708,2019,NATIONAL SOCIAL SCIENCE HONOR SOCIETY PI GAMMA MU,201940159349300724_public
3,480547708,2020,NATIONAL SOCIAL SCIENCE HONOR SOCIETY PI GAMMA MU,202020109349301112_public
4,480547708,2021,NATIONAL SOCIAL SCIENCE HONOR SOCIETY PI GAMMA MU,202130149349300138_public
5,480547708,2022,NATIONAL SOCIAL SCIENCE HONOR SOCIETY PI GAMMA MU,202230139349300723_public
6,480547708,2023,NATIONAL SOCIAL SCIENCE HONOR SOCIETY PI GAMMA MU,202300339349301850_public


#### `info_grabber`

In [30]:
def info_grabber(fp, url=True, errors=False):
    if url:
        xml = scrape_website(fp)
        xml.encoding = 'UTF-8'
        xml = xml.text.strip()
    else:
        try:
            with open(fp, 'r') as file:
                xml = file.read()
        except FileNotFoundError:
            print(f"File not found: {fp}")
            return dict()
    if xml is None:
        return dict()
    
    soup = BeautifulSoup(xml, 'xml')

    row = dict()
    row['ba'] = soup.find('BusinessName')
    row['EIN'] = soup.find('EIN')
    row['Tax Year'] = soup.find('TaxPeriodBeginDt')
    row['Location (Zipcode)'] = soup.find('USAddress')
    row['Federate Campaigns'] = soup.find('FederatedCampaignsAmt')
    row['Membership Dues'] = soup.find('MembershipDuesAmt')
    row['Fundraising Events'] = soup.find('z')
    
    row['Related Organizations'] = soup.find('RelatedOrganizationsAmt')
    row['Government Grants'] = soup.find('GovernmentGrantsAmt')
    row['All Other Contributions'] = soup.find('AllOtherContributionsAmt')
    row['Noncash Contributions'] = soup.find('NoncashContributionsAmt')
    row['Total'] = soup.find('TotalContributionsAmt')
    # Can make default=0

    if row['Location (Zipcode)'] is not None:
        row['Location (Zipcode)'] = row['Location (Zipcode)'].find('ZIPCd')
    else:
        row['Location (Zipcode)'] = None

    row = dict(map(lambda item: (item[0], item[1].text if item[1] is not None else None), row.items()))

    if row['ba'] is not None:
        row['ba'] = row['ba'].strip()
    else:
        row['ba'] = 'Not found'
    if row['Tax Year'] is not None:
        row['Tax Year'] = pd.Timestamp(row['Tax Year']).year
    else:
        row['Tax Year'] = np.nan
    
    if errors:
        sumKeys = ['Federate Campaigns', 'Membership Dues', 'Related Organizations', 'Government Grants', 'All Other Contributions']
        sumAmt = sum(float(row[key]) for key in sumKeys if key in row and row[key] is not None)
        fundAmt_element = soup.find('FundraisingAmt')
        fundAmt = int(fundAmt_element.text) if fundAmt_element is not None else 0
        sumAmt += fundAmt

        if sumAmt is None or row['Total'] is None:
            pass
        elif int(sumAmt) == int(row['Total']):
            row['Fundraising Events'] = fundAmt
        else:
            pass

    row = dict(map(lambda item: (item[0], item[1] if item[1] is not None else 0), row.items()))

    return row


#### `contact_grabber`

Old, might not work

In [31]:
def contact_grabber(fp, url=True):
    '''
    Return a dataframe of the important names and their positions in an organization for each year

    :param fp: filepath of the xml file
    :param url: Whether the fp is a url or not. Default = True
    '''
    if url:
        xml = scrape_website(fp)
    else:
        xml = requests.get(fp)
    if xml is None:
        return dict()
    xml.encoding = 'UTF-8'
    xml = xml.text.strip()
    soup = BeautifulSoup(xml, 'xml')

    people = soup.find_all('Form990PartVIISectionAGrp')
    names = [x.find('PersonNm').text for x in people if x.find('PersonNm') is not None]
    titles = [x.find('TitleTxt').text for x in people if x.find('TitleTxt') is not None]
    try:
        organization = [soup.find('BusinessName').text.strip()] * len(names)
    except AttributeError:
        organization = ['Not found'] * len(names)
    try:
        year = [pd.Timestamp(soup.find('TaxPeriodBeginDt').text).year] * len(names)
    except AttributeError:
        year = [0] * len(names)
        
    return pd.DataFrame({'organization': organization, 'year': year, 'names': names, 'titles': titles})

#### `get_xmls`

In [32]:
def get_xmls(ein):
    '''
    Return a list of xml urls from ProRepublica based on an organization's EIN

    :param ein: Employer Identification number as String or Int
    '''
    url = f'https://projects.propublica.org/nonprofits/organizations/{ein}'
    html = scrape_website(url)
    if html is None:
        return []
    soup = BeautifulSoup(html.text)
    links = soup.find_all('a', class_='action xml')
    base_url = 'https://projects.propublica.org'
    # urls = [base_url + x.get('href') for x in links if x.text == '990']
    urls = [x for x in soup.find_all(class_='action xml') if re.search(r'990\b', x.text)]
    urls = [base_url + x.get('href') if x.name == 'a' else base_url + x.select_one('select.action.xml option[data-href]').get('data-href') for x in urls]
    return urls

#### `grabber`

In [1]:
def grabber(eins, verbose=False, clean=False, errors=False):
    if type(eins) == int:
        eins = [eins]
    data = []
    # contacts = pd.DataFrame()
    overall_index = 0
    for index, ein in enumerate(eins):
        print(str(index) + (' / ') + str(len(eins)))
        xmls = get_xmls(ein)
        for xml in xmls:
            if verbose:
                print(f'{overall_index} {xml}')
            if errors:
                data.append(info_grabber(xml,url=False,errors=True))
            else:
                data.append(info_grabber(xml,url=False))
            #contacts = pd.concat([contacts, contact_grabber(xml)])
            overall_index += 1
    print(str(len(eins)) + (' / ') + str(len(eins)))
    info = pd.DataFrame(data)
    if not clean:
        return info

    no_error = info.dropna().reset_index(drop=True)
    no_error.loc[:, 'EIN':'Total'] = no_error.loc[:, 'EIN':'Total'].astype(int)
    no_error.loc[:, 'EIN_YEAR'] = no_error['EIN'].astype(str) + '_' + no_error['Tax Year'].astype(str)
    no_error = no_error.set_index('EIN_YEAR')

    return no_error

#### `find_errors`

In [34]:
def find_errors(info):
    '''
    Takes a dataframe of 990 rows and returns a tuple of the indices where it failed to grab the data properly

    :param info: The dataframe of 990 rows
    '''
    accurate = info[['Federate Campaigns', 'Membership Dues', 'Fundraising Events', 'Related Organizations', 'Government Grants', 'All Other Contributions']].sum(axis=1) == info['Total']
    return tuple(accurate[~accurate].index)

### Using the `grabber()` method

`grabber()` takes a list of EINs. This was curated from the old "990 Manual Pull" spreadsheet by pasting and splitting the entire `EIN` column and putting it into a set.

In [8]:
all_eins = '''113723093
113723093
113723093
113723093
113723093
237279074
237279074
237279074
237279074
237279074
237334012
237334012
237334012
237334012
237334012
261712580
261712580
261712580
261712580
261712580
261712580
330006089
330006089
330006089
330006089
330215585
330215585
330215585
330215585
330215585
330317950
330317950
330317950
330317950
330433314
330433314
330433314
330433314
330433314
330492304
330492304
330492304
330492304
330492304
330496092
330496092
330496092
330496092
330623634
330623634
330623634
330623634
330623634
331029843
331029843
331029843
331029843
331146733
331146733
331146733
331146733
465055513
465055513
465055513
465055513
942358632
942358632
942358632
942358632
942358632
951644024
951644024
951644024
951644024
951869806
951869806
951869806
951869806
951869806
951869806
951874073
951874073
951874073
951874073
951874073
951874073
951944230
951944230
951944230
951944230
951944230
952039198
952039198
952039198
952039198
952157559
952157559
952157559
952157559
952157559
952648050
952648050
952648050
952648050
952648050
952648050
952653869
952653869
952653869
952653869
952653869
952833205
952833205
952833205
952833205
952833205
952850121
952850121
952850121
952850121
952850121
952880426
952880426
952880426
952880426
952880426
953244085
953244085
953244085
953244085
953244085
953248148
953248148
953248148
953248148
953248148
953302967
953302967
953302967
953302967
953302967
953302967
953315571
953315571
953315571
953315571
953315571
953649525
953649525
953649525
953649525
956379598
956379598
956379598
956379598
953497926
953497926
953497926
953497926
953497926
953497926
953837714
953837714
953837714
953837714
953837714
953140767
953140767
953140767
953140767
951648184
951648184
951648184
951648184
951648184
952693142
952693142
952693142
952693142
952693142
330217339
330217339
330217339
330217339
330217339
330217339
952794253
952794253
952794253
952794253
952794253
952794253
951945256
951945256
951945256
951945256
951945256
270865318
270865318
270865318
270865318
270865318
331146733
331146733
331146733
331146733
331146733
821946283
821946283
263405689
263405689
263405689
263405689
263405689
263405689
330602842
330602842
330602842
330602842
330602842
330902617
330902617
330902617
330902617
330902617
952457354
952457354
952457354
952457354
952457354
330008269
330008269
330008269
330008269
330008269
953031682
953031682
953031682
953031682
953031682
237161267
237161267
237161267
237161267
330497515
330497515
330497515
330497515
330497515
953750738
953750738
953750738
953750738
953750738
953368020
953368020
953368020
953368020
953368020
953368020
953798088
953798088
953798088
953798088
952111196
952111196
952111196
952111196
952949636
952949636
952949636
952949636
952949636
330618893
330618893
953782164
953782164
953782164
953782164
953782164
953782164
272917644
272917644
272917644
272917644
273390797
273390797
273390797
931008369
931008369
931008369
931008369
931008369
952213995
952213995
952213995
952213995
562613191
562613191
562613191
562613191
562613191
952422704
952422704
952422704
952422704
941676390
941676390
941676390
941676390
941676390
550806460
550806460
550806460
550806460
550806460
550806460
330122462
330122462
330122462
330122462
270447059
270447059
270447059
270447059
270447059
330210280
822363154
953138268
953950196'''

In [9]:
wrong_eins = '''952850121 562613191 952457354 330210280 952877102 330433314 204374795 330602842 465055513 263405689 263405689 330602842 330602842 330602842 330433314 951644024 330553621 952653869 330433314 941676390 953302967 941676390 941676390 941676390 465055513 330602842 952850121 330006089 952111196 330433314 562613191 952880426 953244085
'''

In [10]:
eins = list(set(all_eins.split()))
# eins = list(set(wrong_eins.split()))

<b>Warning</b>: `grabber()` takes a long time. The following cell will take about 20 minutes.

# 990 - Manual Pull Testing

In [11]:
info990 = grabber(eins, verbose=True)

0 / 71
0 https://projects.propublica.org/nonprofits/download-xml?object_id=202210809349300966
1 https://projects.propublica.org/nonprofits/download-xml?object_id=202121099349301417
2 https://projects.propublica.org/nonprofits/download-xml?object_id=202010669349301306
3 https://projects.propublica.org/nonprofits/download-xml?object_id=201842969349300634


KeyboardInterrupt: 

### Checking the missing cells

The following two cells should both be 1.0.

The first checks the conditional probability that if the name is not found in a row, the tax year wasn't found.

The second checks the inverse conditional probability. Sometimes, if the request attempt tried and failed three times, the row will be entirely empty, where the name is `''` and not `'Not Found'`

In [None]:
# Every time the name isn't found, nothing is found
info990.loc[info990['ba'] == 'Not found']['Tax Year'].isna().mean()

In [None]:
# And every time the tax year isn't found the name isn't found
(info990[['ba', 'Tax Year']].loc[info990['Tax Year'].isna()]['ba'] == 'Not found').mean()

This cell indicates how many XML files failed to be accessed

In [None]:
info990['ba'].isna().sum()

If the following cell returns 0.0, that indicates that every time the tax year is missing, every single financial cell is zero. All of these rows can be dropped and no information is lost.

In [None]:
info990.loc[info990['Tax Year'].isna(), 'Federate Campaigns':].sum(axis=1).sum()

### Cleaning the raw information

Assuming the prior three checks are all adequate, we can drop rows with any nan values.

In [None]:
no_error = info990.dropna().reset_index(drop=True)
no_error.loc[:, 'EIN':'Total'] = no_error.loc[:, 'EIN':'Total'].astype(int)
no_error.loc[:, 'EIN_YEAR'] = no_error['EIN'].astype(str) + '_' + no_error['Tax Year'].astype(str)
no_error = no_error.set_index('EIN_YEAR')

### Export here!

<b>REMEMBER TO EXPORT</b>

`no_error` is the cleaned dataframe

`info990` is the raw dataframe with missing rows

In [None]:
 no_error.to_csv('no_errors.csv')

In [None]:
 info990.to_csv('full_table.csv')

The following line appends the rows to the bottom of the existing dataframe titled `'no_errors.csv'` instead.

It creates a new CSV and it is recommended to rename it back to `no_errors.csv` in the case it is exported successfully

In [None]:
extended = pd.concat([pd.read_csv('no_errors.csv'), no_error]).reset_index(drop=True).drop_duplicates()
extended.to_csv('no_errors_appended.csv')

## Testing Zone

In [None]:
url = 'https://projects.propublica.org/nonprofits/organizations/263405689'
html = scrape_website(url)
soup = BeautifulSoup(html.text)
links = soup.find_all('a', class_='action xml')
base_url = 'https://projects.propublica.org'
urls = [base_url + x.get('href') for x in links if x.text == '990']

In [None]:
test = [x for x in soup.find_all(class_='action xml') if re.search(r'990\b', x.text)]
test2 = [base_url + x.get('href') if x.name == 'a' else base_url + x.select_one('select.action.xml option[data-href]').get('data-href') for x in test]

In [None]:
info990

In [None]:
extended = pd.concat([pd.read_csv('no_errors.csv'), no_error]).reset_index(drop=True).drop_duplicates()

In [None]:
extended.loc[:, 'EIN':'Total'] = extended.loc[:, 'EIN':'Total'].astype(int)

In [None]:
extended = extended.drop_duplicates()

In [None]:
info = pd.read_csv('no_errors.csv')

In [None]:
info = info.set_index('EIN_YEAR')
# accurate = info[['Federate Campaigns', 'Membership Dues', 'Fundraising Events', 'Related Organizations', 'Government Grants', 'All Other Contributions']].sum(axis=1) == info['Total']

In [None]:
info

In [None]:
# tuple(accurate[~accurate].index)

In [None]:
info_EINerrs = find_errors(info)

In [None]:
info_errs = info[info.index.isin(info_EINerrs)]

In [None]:
info_errs = [int(err[:-7]) for (err) in info_EINerrs]
info_errs

In [None]:
filtered_info_errs = grabber(info_errs, errors=True)

In [None]:
info_no_errors = filtered_info_errs.dropna().reset_index(drop=True)
info_no_errors.loc[:, 'EIN':'Total'] = info_no_errors.loc[:, 'EIN':'Total'].astype(int)
info_no_errors.loc[:, 'EIN_YEAR'] = info_no_errors['EIN'].astype(str) + '_' + info_no_errors['Tax Year'].astype(str)
info_no_errors = info_no_errors.set_index('EIN_YEAR')

In [None]:
info_no_errors.to_csv('info_no_errors.csv')
EINinfoerrs = pd.read_csv('info_no_errors.csv')
EINinfoerrs = EINinfoerrs.set_index('EIN_YEAR')

In [None]:
find_errors(EINinfoerrs)

In [None]:
EINinfoerrs.loc[['951874073_2015.0']]

# Testing EINs from Manual Pull

In [71]:
csvfile_eins = pd.read_csv('990 Manual Pull - Cleaned EIN List.csv')
eins = list(csvfile_eins.get('EIN').dropna())
eins = [int(ein) for ein in eins]
eins

[113723093,
 237279074,
 237334012,
 261712580,
 330006089,
 330215585,
 330317950,
 330433314,
 330492304,
 330496092,
 330623634,
 331029843,
 331146733,
 465055513,
 942358632,
 951644024,
 951869806,
 951874073,
 951944230,
 952039198,
 952157559,
 952648050,
 952653869,
 952833205,
 952850121,
 952880426,
 953244085,
 953248148,
 953302967,
 953315571,
 953649525,
 956379598,
 953497926,
 953837714,
 953140767,
 951648184,
 952693142,
 330217339,
 952794253,
 951945256,
 270865318,
 821946283,
 263405689,
 330602842,
 330902617,
 952457354,
 330008269,
 953031682,
 237161267,
 330497515,
 953750738,
 953368020,
 953798088,
 952111196,
 952949636,
 330618893,
 953782164,
 272917644,
 273390797,
 931008369,
 952213995,
 562613191,
 952422704,
 941676390,
 550806460,
 330122462,
 270447059,
 330210280,
 822363154,
 953138268,
 953950196,
 952877102,
 330553621,
 204374795,
 571214920,
 571162424,
 956379598,
 330122462,
 953782164,
 953837714,
 331029843,
 330217339,
 952880426,
 953

In [None]:
parsedEINs = grabber(eins)#, verbose=True)

In [None]:
no_errorsEIN = parsedEINs.dropna().reset_index(drop=True)
no_errorsEIN.loc[:, 'EIN':'Total'] = no_errorsEIN.loc[:, 'EIN':'Total'].astype(int)
no_errorsEIN.loc[:, 'EIN_YEAR'] = no_errorsEIN['EIN'].astype(str) + '_' + no_errorsEIN['Tax Year'].astype(str)
no_errorsEIN = no_errorsEIN.set_index('EIN_YEAR')

In [None]:
no_errorsEIN.to_csv('no_errorsEIN.csv')

In [10]:
EINinfo = pd.read_csv('no_errors.csv')

In [12]:
np.unique(EINinfo.get('EIN'))

array([113723093, 237161267, 237279074, 237334012, 261712580, 263405689,
       270447059, 270865318, 272917644, 273390797, 330006089, 330008269,
       330122462, 330210280, 330215585, 330217339, 330317950, 330433314,
       330492304, 330496092, 330497515, 330602842, 330618893, 330623634,
       330902617, 331029843, 331146733, 465055513, 550806460, 562613191,
       821946283, 822363154, 931008369, 941676390, 942358632, 951644024,
       951648184, 951869806, 951874073, 951944230, 951945256, 952039198,
       952111196, 952157559, 952213995, 952422704, 952457354, 952648050,
       952653869, 952693142, 952794253, 952833205, 952850121, 952880426,
       952949636, 953031682, 953138268, 953140767, 953244085, 953248148,
       953302967, 953315571, 953368020, 953497926, 953649525, 953750738,
       953782164, 953798088, 953837714, 953950196, 956379598], dtype=int64)

In [26]:
EINinfo = EINinfo.set_index('EIN_YEAR')

In [60]:
EIN_Year = EINinfo.groupby('Tax Year').sum().drop(labels = ['ba','EIN','Location (Zipcode)'], axis=1)
EIN_Year.loc[2015.0, 'Fundraising Events'] -= 265175
EIN_Year.to_csv('EIN_Year.csv')

In [None]:
EIN_errs = find_errors(EINinfo)
EIN_errs

#### Find errors, then prep data to double filter

In [None]:
errs1 =EINinfo[EINinfo.index.isin(EIN_errs)]

In [None]:
EINerrs = [int(err[:-7]) for err in EIN_errs]

#### Double filter testing

In [None]:
parsedErrEINs = grabber(EINerrs, errors=True)

In [None]:
no_errors2 = parsedErrEINs.dropna().reset_index(drop=True)
no_errors2.loc[:, 'EIN':'Total'] = no_errors2.loc[:, 'EIN':'Total'].astype(int)
no_errors2.loc[:, 'EIN_YEAR'] = no_errors2['EIN'].astype(str) + '_' + no_errors2['Tax Year'].astype(str)
no_errors2 = no_errors2.set_index('EIN_YEAR')

In [None]:
no_errors2.to_csv('no_errors2.csv')

In [5]:
EINErrinfo = pd.read_csv('no_errors2.csv')

In [9]:
np.unique(EINErrinfo.get('EIN'))

array([263405689, 272917644, 331146733, 550806460, 951874073, 952039198,
       953302967, 953368020, 953497926], dtype=int64)

In [None]:
EINErrinfo = EINErrinfo.set_index('EIN_YEAR')

In [None]:
EINErr_errs = find_errors(EINErrinfo)
EINErr_errs

In [None]:
errs2 = EINErrinfo[EINErrinfo.index.isin(EINErr_errs)]

In [None]:
[err for err in EINErr_errs if err in EIN_errs]

In [None]:
pd.merge(errs1, errs2, how='inner')

In [None]:
get_xmls(951874073)

In [3]:
einyearcsv = pd.read_csv('EIN_Year.csv')

In [4]:
einyearcsv

Unnamed: 0,Tax Year,Federate Campaigns,Membership Dues,Fundraising Events,Related Organizations,Government Grants,All Other Contributions,Noncash Contributions,Total
0,2013.0,14812168,5931,2926753,1992567,272916322,82768266,26026065,398097644
1,2014.0,14690906,10503,3451517,1718545,436176976,85800212,18083102,541848659
2,2015.0,13449748,760,3621489,1151289,490880029,91616746,17711150,600733501
3,2016.0,12270921,11824,3270531,2928524,511590186,104729345,23099826,634806291
4,2017.0,8149406,4925,3523143,2951688,578384972,112406307,24512047,705427271
5,2018.0,7121111,9375,5954813,2737085,662322346,118763690,23556060,796908420
6,2019.0,6651291,9365,5077165,6065219,816941379,147143289,24199956,981887708
7,2020.0,5730305,5050,5559680,11438182,997288792,168776475,30818037,1188337484
8,2021.0,4795227,0,7371420,1437925,923315284,131650133,33697316,1068569989
9,2022.0,0,0,279486,0,505011,7923835,19922,8716739


In [70]:
result = [ein for ein in eins if ein not in ein_numbers]

84