# 990PF 

In [3]:
import pandas as pd
from xml.etree import cElementTree as ElementTree
import xml.etree.ElementTree as et
from tqdm import tqdm
import logging
import re

logging.basicConfig(format='%(asctime)s: %(message)s', filename='analysis.log', level=logging.DEBUG)

In [4]:
class XmlListConfig(list):
    def __init__(self, aList):
        for element in aList:
            if element:
                # treat like dict
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                # treat like list
                elif element[0].tag == element[1].tag:
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)


class XmlDictConfig(dict):
    '''
    Example usage:

    >>> tree = ElementTree.parse('your_file.xml')
    >>> root = tree.getroot()
    >>> xmldict = XmlDictConfig(root)

    Or, if you want to use an XML string:

    >>> root = ElementTree.XML(xml_string)
    >>> xmldict = XmlDictConfig(root)

    And then use xmldict for what it is... a dict.
    '''
    def __init__(self, parent_element):
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            if element:
                # treat like dict - we assume that if the first two tags
                # in a series are different, then they are all different.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                # treat like list - we assume that if the first two tags
                # in a series are the same, then the rest are the same.
                else:
                    # here, we put the list in dictionary; the key is the
                    # tag name the list elements all share in common, and
                    # the value is the list itself 
                    aDict = {element[0].tag: XmlListConfig(element)}
                # if the tag has attributes, add those to the dict
                if element.items():
                    aDict.update(dict(element.items()))
                self.update({element.tag: aDict})
            # this assumes that if you've got an attribute in a tag,
            # you won't be having any text. This may or may not be a 
            # good idea -- time will tell. It works for the way we are
            # currently doing XML configuration files...
            elif element.items():
                self.update({element.tag: dict(element.items())})
            # finally, if there are no child tags and no attributes, extract
            # the text
            else:
                self.update({element.tag: element.text})

def get_field(xmldict, path=None, default=None):
    if path is None: return xmldict
    try:
        for p in path:
            # Ugly namespaces are ugly, let's deal with them here
            xmldict = xmldict['{http://www.irs.gov/efile}' + p]
        return xmldict
    except KeyError:
        # If this key does not exist in the dictionary, 
        return default
        # Note: it may be faster to check ```if p not in dict``` rather
        # than set up this try/except block. I think it depends on how
        # often this fails - the more it fails, the worse the try/except does.

In [45]:
tree = et.parse("data/test/201721229349200737_public.xml")
root = tree.getroot()
#Converting from xml to dictionary
xmldict = XmlDictConfig(root)

get_field(xmldict, ['ReturnData','IRS990ScheduleG','FundraisingEventInformationGrp', 'GrossRevenueTotalEventsAmt'])

'63814'

In [46]:
def analyze_one(path):
    object_id = re.sub("\D", "", path)
    object_list = [[object_id]]
    object_df = pd.DataFrame(object_list, columns =['Object_ID'])
    tree = et.parse(path)
    root = tree.getroot()
    #Converting from xml to dictionary
    xmldict = XmlDictConfig(root)

    
    ##############Header Information 
    #Date Information
    TaxPeriodBeginDt = get_field(xmldict, ['ReturnHeader','TaxPeriodBeginDt'])
    TaxPeriodEndDt = get_field(xmldict, ['ReturnHeader','TaxPeriodEndDt'])
    
    
    #Filer Information
    filer_data = get_field(xmldict, ['ReturnHeader', 'Filer'], {})
    EIN = get_field(filer_data, ['EIN'])
    BusinessName = get_field(filer_data, ['BusinessName', 'BusinessNameLine1Txt'])
    City = get_field(filer_data, ['USAddress', 'CityNm'])
    State = get_field(filer_data, ['USAddress', 'StateAbbreviationCd'])
    ZIPCd = get_field(filer_data, ['USAddress', 'ZIPCd'])
    TaxYr = get_field(xmldict, ['ReturnHeader', 'TaxYr'])
    filer_list = [[EIN, BusinessName,TaxPeriodBeginDt, TaxPeriodEndDt,TaxYr, City, State, ZIPCd]]
    filer_df = pd.DataFrame(filer_list, columns = ['EIN', 'BusinessName','TaxPeriodBeginDt','TaxPeriodEndDt','TaxYr','City', 'State', 'ZIPCd']) 
    
    
    ##############Return Data
    return_data = get_field(xmldict, ['ReturnData', 'IRS990EZ'])
    if return_data is None:
        logging.warning(f'Could not find Return Data for file: {path}')
        return None
    
    #AddressChangeInd
    AddressChangeInd = get_field(return_data, ['AddressChangeInd'], "")
    
    #InitialReturnInd
    InitialReturnInd = get_field(return_data, ['InitialReturnInd'], "")
    
    #InitialReturnFormerPubChrtyInd
    InitialReturnFormerPubChrtyInd = get_field(return_data, ['InitialReturnFormerPubChrtyInd'], "")
    
    #AmendedReturnInd
    AmendedReturnInd = get_field(return_data, ['AmendedReturnInd'], "")
    
    #FinalReturnInd
    FinalReturnInd = get_field(return_data, ['FinalReturnInd'], "")
    
    #NameChange this may only be for paper filers 
    NameChange = get_field(return_data, ['NameChange'], "")
    
    #Organization501c3Ind
    Organization501c3Ind = get_field(return_data, ['Organization501c3Ind'], "")
    
    #Organization501cInd
    Organization501cInd = get_field(return_data, ['Organization501cInd'], "")
    
    #Organization4947a1NotPFInd
    Organization4947a1NotPFInd = get_field(return_data, ['Organization4947a1NotPFInd'], "")
    
    #Organization527Ind
    Organization527Ind = get_field(return_data, ['Organization527Ind'], "")
    
    #TypeOfOrganizationCorpInd
    TypeOfOrganizationCorpInd = get_field(return_data, ['TypeOfOrganizationCorpInd'], "")
    
    #Trust
    TypeOfOrganizationTrustInd = get_field(return_data, ['TypeOfOrganizationTrustInd'], "")
    
    #asso
    TypeOfOrganizationAssocInd = get_field(return_data, ['TypeOfOrganizationAssocInd'], "")
    
    #other
    TypeOfOrganizationOtherInd = get_field(return_data, ['TypeOfOrganizationOtherInd'], "")
    
    #GrossReceiptsAmt
    GrossReceiptsAmt = get_field(return_data, ['GrossReceiptsAmt'], "0")
    
    #InfoInScheduleOPartIInd
    InfoInScheduleOPartIInd = get_field(return_data, ['InfoInScheduleOPartIInd'], "")
    
    #ContributionsGiftsGrantsEtcAmt
    ContributionsGiftsGrantsEtcAmt = get_field(return_data, ['ContributionsGiftsGrantsEtcAmt'], "0")
    
    #MembershipDuesAmt
    ProgramServiceRevenueAmt = get_field(return_data, ['ProgramServiceRevenueAmt'], "0")
    
    #GrossReceiptsAmt
    MembershipDuesAmt = get_field(return_data, ['MembershipDuesAmt'], "0")
    
    #InvestmentIncomeAmt
    InvestmentIncomeAmt = get_field(return_data, ['InvestmentIncomeAmt'], "0")
    
    #GamingGrossIncomeAmt
    GamingGrossIncomeAmt = get_field(return_data, ['GamingGrossIncomeAmt'], "0")
    
    #FundraisingGrossIncomeAmt
    FundraisingGrossIncomeAmt = get_field(return_data, ['FundraisingGrossIncomeAmt'], "0")
    
    #Fundraising from scheduleG
    GrossRevenueTotalEventsAmt= get_field(xmldict, ['ReturnData','IRS990ScheduleG','FundraisingEventInformationGrp', 'GrossRevenueTotalEventsAmt'])
    
    # TotalRevenueAmt
    TotalRevenueAmt = get_field(return_data, ['TotalRevenueAmt'], "0")
    
    #TotalExpensesAmtt
    TotalExpensesAmt = get_field(return_data, ['TotalExpensesAmt'], "0")
    
    #NetAssetsOrFundBalancesBOYAmt
    NetAssetsOrFundBalancesBOYAmt = get_field(return_data, ['NetAssetsOrFundBalancesBOYAmt'], "0")
    
    #NetAssetsOrFundBalancesEOYAmt
    NetAssetsOrFundBalancesEOYAmt = get_field(return_data, ['NetAssetsOrFundBalancesEOYAmt'], "0")
    
    #Form990TotalAssetsGrp
    Form990TotalAssetsGrpBOY = get_field(return_data, ['Form990TotalAssetsGrp', 'BOYAmt'], "0")
    Form990TotalAssetsGrpEOY = get_field(return_data, ['Form990TotalAssetsGrp', 'EOYAmt'], "0")
    
    #SumOfTotalLiabilitiesGrp
    SumOfTotalLiabilitiesGrpBOY = get_field(return_data, ['SumOfTotalLiabilitiesGrp','BOYAmt'], "0")
    SumOfTotalLiabilitiesGrpEOY = get_field(return_data, ['SumOfTotalLiabilitiesGrp','EOYAmt'], "0")
    
    #NetAssetsOrFundBalancesGrp
    NetAssetsOrFundBalancesGrpBOY = get_field(return_data, ['NetAssetsOrFundBalancesGrp','BOYAmt'], "0")
    NetAssetsOrFundBalancesGrpEOY = get_field(return_data, ['NetAssetsOrFundBalancesGrp','EOYAmt'], "0")
    

    return_list = [[    AddressChangeInd,
                        InitialReturnInd,
                        InitialReturnFormerPubChrtyInd,
                        AmendedReturnInd,
                        FinalReturnInd,
                        NameChange,
                        Organization501c3Ind,
                        Organization501cInd,
                        Organization4947a1NotPFInd,
                        Organization527Ind,
                        TypeOfOrganizationCorpInd,
                        TypeOfOrganizationTrustInd,
                        TypeOfOrganizationAssocInd,
                        TypeOfOrganizationOtherInd,
                        GrossReceiptsAmt,
                        InfoInScheduleOPartIInd,
                        ContributionsGiftsGrantsEtcAmt,
                        MembershipDuesAmt,
                        GrossReceiptsAmt,
                        InvestmentIncomeAmt,
                        GamingGrossIncomeAmt,
                        FundraisingGrossIncomeAmt,
                        GrossRevenueTotalEventsAmt,
                        TotalRevenueAmt,
                        TotalExpensesAmt,
                        NetAssetsOrFundBalancesBOYAmt,
                        NetAssetsOrFundBalancesEOYAmt,
                        Form990TotalAssetsGrpBOY,
                        Form990TotalAssetsGrpEOY,
                        SumOfTotalLiabilitiesGrpBOY,
                        SumOfTotalLiabilitiesGrpEOY,
                        NetAssetsOrFundBalancesGrpBOY,
                        NetAssetsOrFundBalancesGrpEOY
                        
                   ]]
    
    return_df = pd.DataFrame(return_list, columns =
                             ['AddressChangeInd',
                            'InitialReturnInd',
                            'InitialReturnFormerPubChrtyInd',
                            'AmendedReturnInd',
                            'FinalReturnInd',
                            'NameChange',
                            'Organization501c3Ind',
                            'Organization501cInd',
                            'Organization4947a1NotPFInd',
                            'Organization527Ind',
                            'TypeOfOrganizationCorpInd',
                            'TypeOfOrganizationTrustInd',
                            'TypeOfOrganizationAssocInd',
                            'TypeOfOrganizationOtherInd',
                            'GrossReceiptsAmt',
                            'InfoInScheduleOPartIInd',
                            'ContributionsGiftsGrantsEtcAmt',
                            'MembershipDuesAmt',
                            'GrossReceiptsAmt',
                            'InvestmentIncomeAmt',
                            'GamingGrossIncomeAmt',
                            'FundraisingGrossIncomeAmt',
                            'GrossRevenueTotalEventsAmt',
                            'TotalRevenueAmt',
                            'TotalExpensesAmt',
                            'NetAssetsOrFundBalancesBOYAmt',
                            'NetAssetsOrFundBalancesEOYAmt',
                            'Form990TotalAssetsGrpBOY',
                            'Form990TotalAssetsGrpEOY',
                            'SumOfTotalLiabilitiesGrpBOY',
                            'SumOfTotalLiabilitiesGrpEOY',
                            'NetAssetsOrFundBalancesGrpBOY',
                            'NetAssetsOrFundBalancesGrpEOY'])

    #Concatenating header information and contributions
    return pd.concat([object_df, filer_df, return_df], axis=1, ignore_index=False)
    

In [47]:
#This is the method I used to get all the files in the directory

import os

'''
    For the given path, get the List of all files in the directory tree 
'''
def getListOfFiles(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
                
    return allFiles


dirName = 'data/test';
files = getListOfFiles(dirName)

In [48]:
import pandas as pd
from IPython.display import display
pd.options.display.max_columns = None

df = pd.concat([one_row for name in tqdm(files) if (one_row := analyze_one(name)) is not None])
df




  0%|          | 0/37 [00:00<?, ?it/s][A[A[A


100%|██████████| 37/37 [00:00<00:00, 229.64it/s][A[A[A


Unnamed: 0,Object_ID,EIN,BusinessName,TaxPeriodBeginDt,TaxPeriodEndDt,TaxYr,City,State,ZIPCd,AddressChangeInd,InitialReturnInd,InitialReturnFormerPubChrtyInd,AmendedReturnInd,FinalReturnInd,NameChange,Organization501c3Ind,Organization501cInd,Organization4947a1NotPFInd,Organization527Ind,TypeOfOrganizationCorpInd,TypeOfOrganizationTrustInd,TypeOfOrganizationAssocInd,TypeOfOrganizationOtherInd,GrossReceiptsAmt,InfoInScheduleOPartIInd,ContributionsGiftsGrantsEtcAmt,MembershipDuesAmt,GrossReceiptsAmt.1,InvestmentIncomeAmt,GamingGrossIncomeAmt,FundraisingGrossIncomeAmt,GrossRevenueTotalEventsAmt,TotalRevenueAmt,TotalExpensesAmt,NetAssetsOrFundBalancesBOYAmt,NetAssetsOrFundBalancesEOYAmt,Form990TotalAssetsGrpBOY,Form990TotalAssetsGrpEOY,SumOfTotalLiabilitiesGrpBOY,SumOfTotalLiabilitiesGrpEOY,NetAssetsOrFundBalancesGrpBOY,NetAssetsOrFundBalancesGrpEOY
0,201721229349200732,474352583,ROUTE 247 FOUNDATION INC,2016-01-01,2016-12-31,2016,VIRGINIA BEACH,VA,23455,,,,,,,"{'referenceDocumentId': '00000002', 'reference...",,,,X,,,,24070,X,24070,0,24070,0,0,0,,24070,48745,32793,8118,35155,8202,2362,84,32793,8118
0,201721229349200737,223527625,FESTIVAL COMMITTEE OF CHATSWORTH,2016-01-01,2016-12-31,2016,CHATSWORTH,NJ,8109,,,,,,,{'referenceDocumentId': 'IRS990ScheduleA'},,,,X,,,,73103,X,5210,0,73103,4079,0,{'referenceDocumentId': 'IRS990ScheduleG'},63814.0,22648,23593,133226,132281,134725,134044,1499,1763,133226,132281
0,201721229349200742,942761387,SAN PABLO POLICE EMPLOYEES ASSOCIATION,2016-01-01,2016-12-31,2016,SAN PABLO,CA,94806,,,,,,,,{'organization501cTypeTxt': '5'},,,,,X,,94239,X,94239,0,94239,0,0,0,,94239,106766,148647,136120,148647,136120,0,0,148647,136120
0,201721229349200747,311256830,ASSOCIATION OF FUNDRAISING PROFESSIONALS,2016-01-01,2016-12-31,2016,ARLINGTON,VA,222034168,,,,,,,{'referenceDocumentId': 'RetDoc1039100001'},,,,X,,,,70818,X,1077,4725,70818,3697,0,0,,70818,84722,99516,85612,99516,85612,0,0,99516,85612
0,201721229349200802,261315302,MAINELY RAT RESCUE,2016-01-01,2016-12-31,2016,FALMOUTH,ME,4105,,,,,,,{'referenceDocumentId': '1'},,,,,,,,67972,,48392,0,67972,0,0,0,,60507,46150,5158,19515,5158,19515,0,0,5158,19515
0,201721229349200807,463490807,DR KEN AND GLENDA COX,2016-01-01,2016-12-31,2016,MILL HALL,PA,17751,,X,,,,,{'referenceDocumentId': '1'},,,,X,,,,10380,X,10380,0,10380,0,0,0,,10380,8274,5083,7189,5083,15463,0,8274,5083,7189
0,201721229349200812,222481718,HARVARD CLASS OF 1953 ASSOCIATION,2016-01-01,2016-12-31,2016,BOSTON,MA,2109,,,,,,,{'referenceDocumentId': 'RetDoc1039100001'},,,,,,X,,2060,X,0,0,2060,2059,0,0,,2060,624,69431,71782,69431,71782,0,0,69431,71782
0,201721229349200817,950751488,Free and Accepted Masons Culver City-Foshay No...,2016-01-01,2016-12-31,2016,Culver City,CA,902322625,,,,,,,,{'organization501cTypeTxt': '10'},,,,,,X,83251,X,339,35572,83251,41522,0,0,,83251,38510,95736,143831,95736,144553,0,722,95736,143831
