# Converting XML files to CSV 

In [3]:
import pandas as pd
import xml.etree.ElementTree as et


# Use IRS 2017 index https://s3.amazonaws.com/irs-form-990/index_2017.csv to subset data

In [9]:
#990 2017 index from the IRS
index = pd.read_csv('data/index_2017.csv')

#Index cleaning
index["filename"] = index["OBJECT_ID"].astype(str) + '_public.xml'
print(index.info())
index


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489013 entries, 0 to 489012
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   RETURN_ID      489013 non-null  int64 
 1   FILING_TYPE    489013 non-null  object
 2   EIN            489013 non-null  int64 
 3   TAX_PERIOD     489013 non-null  int64 
 4   SUB_DATE       489013 non-null  object
 5   TAXPAYER_NAME  489013 non-null  object
 6   RETURN_TYPE    489013 non-null  object
 7   DLN            489013 non-null  int64 
 8   OBJECT_ID      489013 non-null  int64 
 9   filename       489013 non-null  object
dtypes: int64(5), object(5)
memory usage: 37.3+ MB
None


Unnamed: 0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID,filename
0,14054169,EFILE,42662873,201603,1/4/2017 10:27:37 AM,ELKS BUILDING CORP OF NORWOOD,990O,93493243000066,201612439349300006,201612439349300006_public.xml
1,14056200,EFILE,42964630,201512,1/4/2017 8:17:38 PM,NEIGHBORHOOD OF AFFORDABLE HOUSING INC,990,93493243000266,201612439349300026,201612439349300026_public.xml
2,14055992,EFILE,382912028,201512,1/4/2017 6:07:44 PM,RELEAF MICHIGAN INC,990,93493243003416,201612439349300341,201612439349300341_public.xml
3,14056203,EFILE,200509226,201605,1/4/2017 8:17:39 PM,ST MICHAEL ALBERTVILLE FOOTBALL BOOSTER CLUB,990,93493243005166,201612439349300516,201612439349300516_public.xml
4,14057332,EFILE,202699020,201512,1/4/2017 11:23:48 PM,KARLA SMITH FOUNDATION,990,93493243005466,201612439349300546,201612439349300546_public.xml
...,...,...,...,...,...,...,...,...,...,...
489008,15064384,EFILE,272948627,201512,12/29/2017 8:50:35 AM,CONSTELLA FESTIVAL OF MUSIC AND FINE ARTS,990EZ,93492320001487,201733209349200148,201733209349200148_public.xml
489009,15060607,EFILE,200745749,201512,12/28/2017 12:35:11 PM,CHRIST CHURCH INTERNATIONAL INC,990,93493319184457,201703199349318445,201703199349318445_public.xml
489010,15057955,EFILE,263520140,201608,12/27/2017 11:57:28 PM,REBELS SOCCER CLUB,990EZ,93492319074127,201723199349207412,201723199349207412_public.xml
489011,15065145,EFILE,770572762,201707,12/29/2017 12:23:10 PM,RISING FARMWORKER DREAM FUND,990PF,93491317015447,201743179349101544,201743179349101544_public.xml


In [1]:
#Save just a list of the file names 
filenames = index[['filename']]

#for some reason when I try to save it as a .txt file it changes some digits to zero! So what I did was save it as a csv, copy and paste the column into notepad and save
filenames.to_csv(r'data/filenames_990_2017.csv', header=None, index=None, sep=' ', mode='a')

#At this point I run the batch file to move all of the files to a 2017 only file. 


NameError: name 'index' is not defined

# Here is the batch file I created to move 2017 files to its own folder, next time try without 'type

# Fields to pull from IRS 990



# Function to convert XML files to Python Dictionaries 

In [10]:
from xml.etree import cElementTree as ElementTree

class XmlListConfig(list):
    def __init__(self, aList):
        for element in aList:
            if element:
                # treat like dict
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                # treat like list
                elif element[0].tag == element[1].tag:
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)


class XmlDictConfig(dict):
    '''
    Example usage:

    >>> tree = ElementTree.parse('your_file.xml')
    >>> root = tree.getroot()
    >>> xmldict = XmlDictConfig(root)

    Or, if you want to use an XML string:

    >>> root = ElementTree.XML(xml_string)
    >>> xmldict = XmlDictConfig(root)

    And then use xmldict for what it is... a dict.
    '''
    def __init__(self, parent_element):
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            if element:
                # treat like dict - we assume that if the first two tags
                # in a series are different, then they are all different.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                # treat like list - we assume that if the first two tags
                # in a series are the same, then the rest are the same.
                else:
                    # here, we put the list in dictionary; the key is the
                    # tag name the list elements all share in common, and
                    # the value is the list itself 
                    aDict = {element[0].tag: XmlListConfig(element)}
                # if the tag has attributes, add those to the dict
                if element.items():
                    aDict.update(dict(element.items()))
                self.update({element.tag: aDict})
            # this assumes that if you've got an attribute in a tag,
            # you won't be having any text. This may or may not be a 
            # good idea -- time will tell. It works for the way we are
            # currently doing XML configuration files...
            elif element.items():
                self.update({element.tag: dict(element.items())})
            # finally, if there are no child tags and no attributes, extract
            # the text
            else:
                self.update({element.tag: element.text})

def get_field(xmldict, path=None, default=None):
    if path is None: return xmldict
    try:
        for p in path:
            # Ugly namespaces are ugly, let's deal with them here
            xmldict = xmldict['{http://www.irs.gov/efile}' + p]
        return xmldict
    except KeyError:
        # If this key does not exist in the dictionary, 
        return default
        # Note: it may be faster to check ```if p not in dict``` rather
        # than set up this try/except block. I think it depends on how
        # often this fails - the more it fails, the worse the try/except does.

# Analyze one file

This function returns a `1 x n` DataFrame of all relevant data for a single file, which can be appended to a larger DataFrame for multi-file analysis.

In [16]:
def analyze_one(path):
    tree = et.parse(path)
    root = tree.getroot()
    #Converting from xml to dictionary
    xmldict = XmlDictConfig(root)

    #These are the parent keys of the dictionary
    #xmldict.keys()

    ##############Return Data
    return_data = get_field(xmldict, ['ReturnData', 'IRS990'])
    if return_data is None:
        return None

    #Related Org
    related = get_field(return_data, ['RelatedOrganizationsAmt'], 0)

    #GovernmentGrantsAmt
    gov_grants = get_field(return_data, ['GovernmentGrantsAmt'], 0)

    #FederatedCampaignsAmt
    fed_camp = get_field(return_data, ['FederatedCampaignsAmt'], 0)

    #MembershipDuesAmt
    membership = get_field(return_data, ['MembershipDuesAmt'], 0)

    #FundraisingAmt
    fundraising = get_field(return_data, ['FundraisingAmt'], 0)

    #AllOtherContributionsAmt
    all_other = get_field(return_data, ['AllOtherContributionsAmt'], 0)

    #TotalContributionsAmt
    total_contri = get_field(return_data, ['TotalContributionsAmt'], 0)

    #MissionDesc
    descri = get_field(return_data, ['MissionDesc'], "")

    #TypeOfOrganizationCorpInd
    corp_ind = get_field(return_data, ['TypeOfOrganizationCorpInd'], "NA")

    
    ##############Header Information 
    filer_data = get_field(xmldict, ['ReturnHeader', 'Filer'], {})
    ein = get_field(filer_data, ['EIN'])
    business_name = get_field(filer_data, ['BusinessName', 'BusinessNameLine1Txt'])
    city = get_field(filer_data, ['USAddress', 'CityNm'])
    state = get_field(filer_data, ['USAddress', 'StateAbbreviationCd'])
    filer_list = [[ein, business_name, city, state]]
    filer_df = pd.DataFrame(filer_list, columns = ['EIN', 'BusinessName', 'City', 'State']) 

    contribution_list = [[related, gov_grants,fed_camp,membership, all_other, total_contri, descri, corp_ind]]
    contribution_df = pd.DataFrame(contribution_list, columns = ['RelatedOrganizationsAmt','GovernmentGrantsAmt','FederatedCampaignsAmt','MembershipDuesAmt',
                                                                 'AllOtherContributionsAmt','TotalContributionsAmt','MissionDesc','TypeOfOrganizationCorpInd'])

    #Concatenating header information and contributions
    return pd.concat([filer_df, contribution_df], axis=1, ignore_index=False)
    

# Example


In [15]:
df = pd.DataFrame([])

one_row = analyze_one('data/test/201602159349301240_public.xml')
df = df.append(one_row) 
df

Unnamed: 0,EIN,BusinessName,City,State,RelatedOrganizationsAmt,GovernmentGrantsAmt,FederatedCampaignsAmt,MembershipDuesAmt,AllOtherContributionsAmt,TotalContributionsAmt,MissionDesc,TypeOfOrganizationCorpInd
0,251753030,Consumer Health Coalition,Pittsburgh,PA,0,0,0,3410,1476893,1480303,To inspire consumer movement to enhance access...,X


# Function to get a list of files in the 990_2017 directory to loop over

In [4]:
import os

'''
    For the given path, get the List of all files in the directory tree 
'''
def getListOfFiles(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
                
    return allFiles

# Use a for loop to create the dataframe with selected field
* Some fields are blank, so you have to create an if else in order to fill it in 

In [6]:
from tqdm import tqdm

dirName = 'data/IRS990_2017';
files = getListOfFiles(dirName)
df = pd.DataFrame([])

for file in tqdm(files):
    one_row = analyze_one(file)
    if one_row is None:
        continue
    
    df = df.append(one_row)

100%|██████████| 489013/489013 [10:04:57<00:00, 13.47it/s]  


In [8]:
df.to_csv("data/just_contributions.csv")