# 990PF 

In [8]:
import pandas as pd
from xml.etree import cElementTree as ElementTree
import xml.etree.ElementTree as et
from tqdm import tqdm
import logging
import re

logging.basicConfig(format='%(asctime)s: %(message)s', filename='analysis.log', level=logging.DEBUG)

In [9]:
class XmlListConfig(list):
    def __init__(self, aList):
        for element in aList:
            if element:
                # treat like dict
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                # treat like list
                elif element[0].tag == element[1].tag:
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)


class XmlDictConfig(dict):
    '''
    Example usage:

    >>> tree = ElementTree.parse('your_file.xml')
    >>> root = tree.getroot()
    >>> xmldict = XmlDictConfig(root)

    Or, if you want to use an XML string:

    >>> root = ElementTree.XML(xml_string)
    >>> xmldict = XmlDictConfig(root)

    And then use xmldict for what it is... a dict.
    '''
    def __init__(self, parent_element):
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            if element:
                # treat like dict - we assume that if the first two tags
                # in a series are different, then they are all different.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                # treat like list - we assume that if the first two tags
                # in a series are the same, then the rest are the same.
                else:
                    # here, we put the list in dictionary; the key is the
                    # tag name the list elements all share in common, and
                    # the value is the list itself 
                    aDict = {element[0].tag: XmlListConfig(element)}
                # if the tag has attributes, add those to the dict
                if element.items():
                    aDict.update(dict(element.items()))
                self.update({element.tag: aDict})
            # this assumes that if you've got an attribute in a tag,
            # you won't be having any text. This may or may not be a 
            # good idea -- time will tell. It works for the way we are
            # currently doing XML configuration files...
            elif element.items():
                self.update({element.tag: dict(element.items())})
            # finally, if there are no child tags and no attributes, extract
            # the text
            else:
                self.update({element.tag: element.text})

def get_field(xmldict, path=None, default=None):
    if path is None: return xmldict
    try:
        for p in path:
            # Ugly namespaces are ugly, let's deal with them here
            xmldict = xmldict['{http://www.irs.gov/efile}' + p]
        return xmldict
    except KeyError:
        # If this key does not exist in the dictionary, 
        return default
        # Note: it may be faster to check ```if p not in dict``` rather
        # than set up this try/except block. I think it depends on how
        # often this fails - the more it fails, the worse the try/except does.

In [7]:
tree = et.parse("data/test/201722789349300202_public.xml")
root = tree.getroot()
#Converting from xml to dictionary
xmldict = XmlDictConfig(root)


In [27]:
def analyze_one(path):
    object_id = re.sub("\D", "", path)
    object_list = [[object_id]]
    object_df = pd.DataFrame(object_list, columns =['Object_ID'])
    tree = et.parse(path)
    root = tree.getroot()
    #Converting from xml to dictionary
    xmldict = XmlDictConfig(root)

    
    ##############Header Information 
    #Date Information
    TaxPeriodBeginDt = get_field(xmldict, ['ReturnHeader','TaxPeriodBeginDt'])
    TaxPeriodEndDt = get_field(xmldict, ['ReturnHeader','TaxPeriodEndDt'])
    
    
    #Filer Information
    filer_data = get_field(xmldict, ['ReturnHeader', 'Filer'], {})
    EIN = get_field(filer_data, ['EIN'])
    BusinessName = get_field(filer_data, ['BusinessName', 'BusinessNameLine1Txt'])
    City = get_field(filer_data, ['USAddress', 'CityNm'])
    State = get_field(filer_data, ['USAddress', 'StateAbbreviationCd'])
    ZIPCd = get_field(filer_data, ['USAddress', 'ZIPCd'])
    TaxYr = get_field(xmldict, ['ReturnHeader', 'TaxYr'])
    filer_list = [[EIN, BusinessName,TaxPeriodBeginDt, TaxPeriodEndDt,TaxYr, City, State, ZIPCd]]
    filer_df = pd.DataFrame(filer_list, columns = ['EIN', 'BusinessName','TaxPeriodBeginDt','TaxPeriodEndDt','TaxYr','City', 'State', 'ZIPCd']) 
    
    
    ##############Return Data
    return_data = get_field(xmldict, ['ReturnData', 'IRS990PF'])
    if return_data is None:
        logging.warning(f'Could not find Return Data for file: {path}')
        return None
    
    #AddressChangeInd
    AddressChangeInd = get_field(return_data, ['AddressChangeInd'], "")
    
    #InitialReturnInd
    InitialReturnInd = get_field(return_data, ['InitialReturnInd'], "")
    
    #InitialReturnFormerPubChrtyInd
    InitialReturnFormerPubChrtyInd = get_field(return_data, ['InitialReturnFormerPubChrtyInd'], "")
    
    #AmendedReturnInd
    AmendedReturnInd = get_field(return_data, ['AmendedReturnInd'], "")
    
    #FinalReturnInd
    FinalReturnInd = get_field(return_data, ['FinalReturnInd'], "")
    
    #NameChange this may only be for paper filers 
    NameChange = get_field(return_data, ['NameChange'], "")
    
    #PFStatusTermSect507b1AInd
    PFStatusTermSect507b1AInd = get_field(return_data, ['PFStatusTermSect507b1AInd'], "")
    
    #Organization501c3ExemptPFInd
    Organization501c3ExemptPFInd = get_field(return_data, ['Organization501c3ExemptPFInd'], "")
    
    #Organization4947a1TrtdPFInd
    Organization4947a1TrtdPFInd = get_field(return_data, ['Organization4947a1TrtdPFInd'], "")
    
    #Organization501c3TaxablePFInd
    Organization501c3TaxablePFInd = get_field(return_data, ['Organization501c3TaxablePFInd'], "")
    
     #End of year assets "FMV of Assets at End of Year (required)"
    FMVAssetsEOYAmt = get_field(return_data, ['FMVAssetsEOYAmt'], "NA")
    
    #AdjustedNetIncomeAmt
    AdjustedNetIncomeAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','AdjustedNetIncomeAmt'], "")

    #TotalRevAndExpnssAmt
    TotalRevAndExpnssAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','TotalRevAndExpnssAmt'], "")
    
    #TotalNetInvstIncmAmt
    TotalNetInvstIncmAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','TotalNetInvstIncmAmt'], "")

    #TotalAdjNetIncmAmt
    TotalAdjNetIncmAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','TotalAdjNetIncmAmt'], "")
    
    #TotalExpensesRevAndExpnssAmt 'Form990PFBalanceSheetsGrp',
    TotalExpensesRevAndExpnssAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','TotalExpensesRevAndExpnssAmt'], "")
    
    #TotalExpensesNetInvstIncmAmt
    TotalExpensesNetInvstIncmAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','TotalExpensesNetInvstIncmAmt'], "")
    
    #TotalExpensesAdjNetIncmAmt
    TotalExpensesAdjNetIncmAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','TotalExpensesAdjNetIncmAmt'], "")
    
    #TotalExpensesDsbrsChrtblAmt
    TotalExpensesDsbrsChrtblAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','TotalExpensesDsbrsChrtblAmt'], "")
    
    #ExcessRevenueOverExpensesAmt
    ExcessRevenueOverExpensesAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','ExcessRevenueOverExpensesAmt'], "")
    
    #NetInvestmentIncomeAmt
    NetInvestmentIncomeAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','NetInvestmentIncomeAmt'], "")
    
    #AdjustedNetIncomeAmt
    AdjustedNetIncomeAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','AdjustedNetIncomeAmt'], "")
    
    #TotalAssetsBOYAmt
    TotalAssetsBOYAmt = get_field(return_data, ['AnalysisOfRevenueAndExpenses','AdjustedNetIncomeAmt'], "")
    
    #TotalAssetsEOYAmt
    TotalAssetsEOYAmt = get_field(return_data, ['Form990PFBalanceSheetsGrp','TotalAssetsEOYAmt'], "")
    
    #TotalAssetsEOYFMVAmt
    TotalAssetsEOYFMVAmt = get_field(return_data, ['Form990PFBalanceSheetsGrp','TotalAssetsEOYFMVAmt'], "")
    
    #TotalLiabilitiesBOYAmt
    TotalLiabilitiesBOYAmt = get_field(return_data, ['Form990PFBalanceSheetsGrp','TotalLiabilitiesBOYAmt'], "")
    
    #TotalLiabilitiesEOYAmt
    TotalLiabilitiesEOYAmt = get_field(return_data, ['Form990PFBalanceSheetsGrp','TotalLiabilitiesEOYAmtt'], "")
    
    #TotNetAstOrFundBalancesBOYAmt
    TotNetAstOrFundBalancesBOYAmt = get_field(return_data, ['Form990PFBalanceSheetsGrp','TotNetAstOrFundBalancesBOYAmt'], "")
    
    #TotNetAstOrFundBalancesEOYAmt
    TotNetAstOrFundBalancesEOYAmt = get_field(return_data, ['Form990PFBalanceSheetsGrp','TotNetAstOrFundBalancesEOYAmt'], "")
    
    #AdjustedQualifyingDistriAmt
    AdjustedQualifyingDistriAmt = get_field(return_data, ['QualifyingDistriPartXIIGrp','AdjustedQualifyingDistriAmt'], "")
    
    #ExcessDistriCyovToNextYrAmt
    try:
        ExcessDistriCyovToNextYrAmt = get_field(return_data, ['UndistributedIncomeGrp','ExcessDistriCyovToNextYrAmt'], "")
        
    except TypeError:
        ExcessDistriCyovToNextYrAmt = "" 
        
        
    

    return_list = [[    AddressChangeInd,
                        InitialReturnInd,
                        InitialReturnFormerPubChrtyInd,
                        AmendedReturnInd,
                        FinalReturnInd,
                        NameChange,
                        PFStatusTermSect507b1AInd,
                        Organization501c3ExemptPFInd,
                        Organization4947a1TrtdPFInd,
                        Organization501c3TaxablePFInd,
                        FMVAssetsEOYAmt,
                        AdjustedNetIncomeAmt,
                        TotalRevAndExpnssAmt,
                        TotalNetInvstIncmAmt,
                        TotalAdjNetIncmAmt,
                        TotalExpensesRevAndExpnssAmt,
                        TotalExpensesNetInvstIncmAmt,
                        TotalExpensesAdjNetIncmAmt,
                        TotalExpensesDsbrsChrtblAmt,
                        ExcessRevenueOverExpensesAmt,
                        NetInvestmentIncomeAmt,
                        AdjustedNetIncomeAmt,
                        TotalAssetsBOYAmt,
                        TotalAssetsEOYAmt,
                        TotalAssetsEOYFMVAmt,
                        TotalLiabilitiesBOYAmt,
                        TotalLiabilitiesEOYAmt,
                        TotNetAstOrFundBalancesBOYAmt,
                        TotNetAstOrFundBalancesEOYAmt,
                        AdjustedQualifyingDistriAmt,
                        ExcessDistriCyovToNextYrAmt
                   ]]
    
    return_df = pd.DataFrame(return_list, columns =
                             ['AddressChangeInd',
                            'InitialReturnInd',
                            'InitialReturnFormerPubChrtyInd',
                            'AmendedReturnInd',
                            'FinalReturnInd',
                            'NameChange',
                            'PFStatusTermSect507b1AInd',
                            'Organization501c3ExemptPFInd',
                            'Organization4947a1TrtdPFInd',
                            'Organization501c3TaxablePFInd',
                            'FMVAssetsEOYAmt',
                            'AdjustedNetIncomeAmt',
                            'TotalRevAndExpnssAmt',
                            'TotalNetInvstIncmAmt',
                            'TotalAdjNetIncmAmt',
                            'TotalExpensesRevAndExpnssAmt',
                            'TotalExpensesNetInvstIncmAmt',
                            'TotalExpensesAdjNetIncmAmt',
                            'TotalExpensesDsbrsChrtblAmt',
                            'ExcessRevenueOverExpensesAmt',
                            'NetInvestmentIncomeAmt',
                            'AdjustedNetIncomeAmt',
                            'TotalAssetsBOYAmt',
                            'TotalAssetsEOYAmt',
                            'TotalAssetsEOYFMVAmt',
                            'TotalLiabilitiesBOYAmt',
                            'TotalLiabilitiesEOYAmt',
                            'TotNetAstOrFundBalancesBOYAmt',
                            'TotNetAstOrFundBalancesEOYAmt',
                            'AdjustedQualifyingDistriAmt',
                            'ExcessDistriCyovToNextYrAmt'])

    #Concatenating header information and contributions
    return pd.concat([object_df, filer_df, return_df], axis=1, ignore_index=False)
    

In [13]:
#This is the method I used to get all the files in the directory

import os

'''
    For the given path, get the List of all files in the directory tree 
'''
def getListOfFiles(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
                
    return allFiles


dirName = 'C:/Users/Administrator/Desktop/irs_2019';
files = getListOfFiles(dirName)
files = files[0:70750]

In [14]:
df = pd.concat([one_row for name in tqdm(files) if (one_row := analyze_one(name)) is not None])
df


  0%|          | 0/70750 [00:00<?, ?it/s][A
  0%|          | 34/70750 [00:00<03:31, 334.49it/s][A
  0%|          | 62/70750 [00:00<03:47, 311.07it/s][A
  0%|          | 96/70750 [00:00<03:41, 318.73it/s][A
  0%|          | 152/70750 [00:00<03:13, 365.70it/s][A
  0%|          | 195/70750 [00:00<03:04, 382.40it/s][A
  0%|          | 230/70750 [00:00<04:11, 280.58it/s][A
  0%|          | 268/70750 [00:00<03:51, 304.02it/s][A
  0%|          | 307/70750 [00:00<03:37, 324.25it/s][A
  0%|          | 346/70750 [00:01<03:28, 337.59it/s][A
  1%|          | 384/70750 [00:01<03:22, 346.82it/s][A
  1%|          | 420/70750 [00:01<03:48, 308.08it/s][A
  1%|          | 466/70750 [00:01<03:25, 341.51it/s][A
  1%|          | 503/70750 [00:01<03:22, 346.23it/s][A
  1%|          | 542/70750 [00:01<03:16, 357.75it/s][A
  1%|          | 579/70750 [00:01<04:14, 275.60it/s][A
  1%|          | 624/70750 [00:01<03:45, 310.99it/s][A
  1%|          | 679/70750 [00:01<03:16, 357.17it/s][A
  1%|

Unnamed: 0,Object_ID,EIN,BusinessName,TaxPeriodBeginDt,TaxPeriodEndDt,TaxYr,City,State,ZIPCd,AddressChangeInd,...,AdjustedNetIncomeAmt,TotalAssetsBOYAmt,TotalAssetsEOYAmt,TotalAssetsEOYFMVAmt,TotalLiabilitiesBOYAmt,TotalLiabilitiesEOYAmt,TotNetAstOrFundBalancesBOYAmt,TotNetAstOrFundBalancesEOYAmt,AdjustedQualifyingDistriAmt,ExcessDistriCyovToNextYrAmt
0,2019201900079349100000,134011572,THE WICKHAM FOUNDATION INC,2017-06-01,2018-05-31,2017,NEW YORK,NY,10017,,...,,,15997,15997,0,,39093,15997,38773,259914
0,2019201900079349100050,812525454,A&M FELDMAN FOUNDATION INC,2017-07-01,2018-06-30,2017,CHERRY HILL,NJ,08003,,...,,,390280,399871,0,,362043,390280,63176,72563
0,2019201900079349100100,232439490,HAROLD AND RENEE BERGER FOUNDATION,2017-01-01,2017-12-31,2017,PHILADELPHIA,PA,191193505,,...,,,199241,237966,0,,131062,199241,43723,603989
0,2019201900079349100150,136220799,The Joelson Foundation,2017-04-01,2018-03-31,2017,PRINCETON,NJ,08540,,...,,,15438686,17890243,0,,16426585,15438686,1434450,1959349
0,2019201900079349100200,954718400,THE RONALD AND JANE OLSON FOUNDATION,2017-12-01,2018-11-30,2017,LOS ANGELES,CA,90071,,...,79020,79020,1647095,4181547,0,,1647223,1636332,199210,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2019201902989349101020,226034130,FLO OKIN CANCER RELIEF INC,2018-06-01,2019-05-31,2018,SPRINGFIELD,NJ,07081,,...,0,0,250090,250090,0,,252896,250090,8673,87233
0,2019201902989349101025,204189683,TAMAASUS,2018-01-01,2018-12-31,2018,LOS ANGELES,CA,90036,,...,,,14664,19872,0,,16801,14664,30804,304422
0,2019201902989349101100,223674497,MICHAEL JOHN BROWN MEMORIAL FUND,2018-09-01,2019-08-31,2018,OCEAN CITY,NJ,08226,,...,4980,4980,75360,74763,,,71835,75360,1685,4998
0,2019201902989349101105,273726563,HANNA AND MATTHEW FOUNDATION,2018-01-01,2018-12-31,2018,NEW YORK,NY,10017,,...,,,10444681,10415766,,,10551308,10444681,446785,


In [15]:
df.to_csv("C:/Users/Administrator/Desktop/Impact_Capital/data/pf/2019_objects_part1.csv")

In [22]:
files = getListOfFiles(dirName)
files = files[70850:77774]
df = pd.concat([one_row for name in tqdm(files) if (one_row := analyze_one(name)) is not None])
df.to_csv("C:/Users/Administrator/Desktop/Impact_Capital/data/pf/2019_objects_part2.csv")

  0%|          | 46/404774 [07:20<1077:34:20,  9.58s/it]
100%|██████████| 6924/6924 [00:22<00:00, 312.83it/s]


In [29]:
files = getListOfFiles(dirName)
files = files[77874:475574]
df = pd.concat([one_row for name in tqdm(files) if (one_row := analyze_one(name)) is not None])
df.to_csv("C:/Users/Administrator/Desktop/Impact_Capital/data/pf/2019_objects_part3.csv")

100%|██████████| 397700/397700 [21:12<00:00, 312.55it/s] 
