# Converting XML files to CSV 

In [None]:
import os
import xml.etree.ElementTree as et

try:
    # cElementTree was removed in Python 3.9; keep the old import working on
    # older interpreters and fall back to the regular module otherwise.
    from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree

import pandas as pd
from tqdm import tqdm

# Use the IRS 2017 index (https://s3.amazonaws.com/irs-form-990/index_2017.csv) to subset the data

In [None]:
#990 2017 index from the IRS
# Read OBJECT_ID as text so the long numeric identifiers can never be coerced
# to floats (and silently lose trailing digits) if the column contains blanks.
index = pd.read_csv('data/index_2017.csv', dtype={'OBJECT_ID': str})

#Index cleaning: the XML file names used below are "<OBJECT_ID>_public.xml"
index['filename'] = index['OBJECT_ID'].astype(str) + '_public.xml'
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
index.info()
index

In [None]:
#Save just a list of the file names
filenames = index[['filename']]

#NOTE: for some reason, when I tried to save this as a .txt file some digits were changed to zero!
#So what I did was save it as a csv, copy and paste the column into notepad, and save that.
#mode='w' rewrites the list on every run; appending ('a') duplicated every
#file name each time this cell was re-executed.
filenames.to_csv(r'data/filenames_990_2017.csv', header=False, index=False, sep=' ', mode='w')

#At this point I run the batch file to move all of the 2017 files into a 2017-only folder.


# Here is the batch file I created to move the 2017 files to their own folder; next time, try it without `type`

# Fields to pull from IRS 990



# Function to convert XML files to Python Dictionaries 

In [None]:
class XmlListConfig(list):
    """List built from the children of an XML element.

    Each child of ``aList`` contributes at most one item:

    * a child that has children of its own becomes an ``XmlDictConfig`` when
      it has exactly one child or its first two children have different tags,
      and a nested ``XmlListConfig`` when its first two children share a tag;
    * a childless child contributes its whitespace-stripped text, provided
      that text is non-empty;
    * a childless child without usable text is skipped.
    """

    def __init__(self, aList):
        super().__init__()
        for element in aList:
            # Count children explicitly: testing the truth value of an
            # Element is deprecated and will stop meaning "has children".
            if len(element) > 0:
                if len(element) == 1 or element[0].tag != element[1].tag:
                    # treat like dict
                    self.append(XmlDictConfig(element))
                else:
                    # treat like list
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)


class XmlDictConfig(dict):
    '''
    Dictionary view of an XML element.

    The element's attributes become top-level keys, and every child element
    is stored under its tag: as a nested XmlDictConfig, as a dict wrapping an
    XmlListConfig, as a dict of its attributes, or as its text. Children that
    share a tag overwrite one another (the last one wins).

    Example usage:

    >>> tree = ElementTree.parse('your_file.xml')
    >>> root = tree.getroot()
    >>> xmldict = XmlDictConfig(root)

    Or, if you want to use an XML string:

    >>> root = ElementTree.XML(xml_string)
    >>> xmldict = XmlDictConfig(root)

    And then use xmldict for what it is... a dict.
    '''
    def __init__(self, parent_element):
        super().__init__()
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            # Count children explicitly: testing the truth value of an
            # Element is deprecated and will stop meaning "has children".
            if len(element) > 0:
                # treat like dict - we assume that if the first two tags
                # in a series are different, then they are all different.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                # treat like list - we assume that if the first two tags
                # in a series are the same, then the rest are the same.
                else:
                    # here, we put the list in dictionary; the key is the
                    # tag name the list elements all share in common, and
                    # the value is the list itself
                    aDict = {element[0].tag: XmlListConfig(element)}
                # if the tag has attributes, add those to the dict
                if element.items():
                    aDict.update(dict(element.items()))
                self.update({element.tag: aDict})
            # this assumes that if you've got an attribute in a tag,
            # you won't be having any text. This may or may not be a
            # good idea -- time will tell. It works for the way we are
            # currently doing XML configuration files...
            elif element.items():
                self.update({element.tag: dict(element.items())})
            # finally, if there are no child tags and no attributes, extract
            # the text (None for an empty tag)
            else:
                self.update({element.tag: element.text})

def get_field(xmldict, path=None, default=None, namespace='{http://www.irs.gov/efile}'):
    """Walk ``xmldict`` along ``path`` and return the value found there.

    Args:
        xmldict: Nested mapping to search (e.g. an ``XmlDictConfig``).
        path: Sequence of tag names *without* namespace; each one is prefixed
            with ``namespace`` before the lookup. ``None`` returns ``xmldict``
            unchanged.
        default: Value returned when the path cannot be followed.
        namespace: Prefix added to every tag name. Defaults to the IRS e-file
            namespace in Clark notation.

    Returns:
        The value stored at the end of ``path``, or ``default`` when a key is
        missing or an intermediate value cannot be indexed by tag name (for
        example ``None`` from an empty tag, plain text, or a list).

    Note: it may be faster to check ``if key not in dict`` than to use
    try/except; it depends on how often the lookup fails.
    """
    if path is None:
        return xmldict
    node = xmldict
    try:
        for key in path:
            # Ugly namespaces are ugly, let's deal with them here
            node = node[namespace + key]
    except (KeyError, TypeError):
        # KeyError: tag absent. TypeError: we hit a leaf (None / str / list)
        # before the path was exhausted.
        return default
    return node

# Analyze one file

This function returns a `1 x n` DataFrame of all relevant data for a single file, which can be appended to a larger DataFrame for multi-file analysis.

In [None]:
def analyze_one(path):
    """Extract filer and contribution fields from one IRS 990 XML file.

    Args:
        path: Location of the XML file to parse.

    Returns:
        A one-row DataFrame with the filer columns (EIN, BusinessName, City,
        State) followed by the contribution columns, or ``None`` when the
        file has no ``ReturnData/IRS990`` section. Filer values that cannot
        be found are ``None``; contribution values fall back to the defaults
        listed in ``contribution_fields``.
    """
    root = et.parse(path).getroot()
    #Converting from xml to dictionary
    xmldict = XmlDictConfig(root)

    ##############Return Data
    return_data = get_field(xmldict, ['ReturnData', 'IRS990'])
    if return_data is None:
        return None

    # (IRS990 tag == output column, value used when the tag is absent).
    # FundraisingAmt used to be read but was never written to the output.
    contribution_fields = [
        ('RelatedOrganizationsAmt', 0),
        ('GovernmentGrantsAmt', 0),
        ('FederatedCampaignsAmt', 0),
        ('MembershipDuesAmt', 0),
        ('FundraisingAmt', 0),
        ('AllOtherContributionsAmt', 0),
        ('TotalContributionsAmt', 0),
        ('MissionDesc', ""),
        ('TypeOfOrganizationCorpInd', "NA"),
    ]

    ##############Header Information
    filer_data = get_field(xmldict, ['ReturnHeader', 'Filer'], {})
    # (output column, path below ReturnHeader/Filer)
    filer_fields = [
        ('EIN', ['EIN']),
        ('BusinessName', ['BusinessName', 'BusinessNameLine1Txt']),
        ('City', ['USAddress', 'CityNm']),
        ('State', ['USAddress', 'StateAbbreviationCd']),
    ]

    columns = [name for name, _ in filer_fields]
    values = [get_field(filer_data, field_path) for _, field_path in filer_fields]
    for tag, fallback in contribution_fields:
        columns.append(tag)
        values.append(get_field(return_data, [tag], fallback))

    # Header information first, then contributions, all in a single row
    return pd.DataFrame([values], columns=columns)
    

# Example


In [None]:
one_row = analyze_one('data/201602159349301240_public.xml')

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# analyze_one returns None for a file without an IRS990 section, in which case
# we just show an empty frame.
df = pd.concat([one_row]) if one_row is not None else pd.DataFrame([])
df

# Function to get a list of files in the 990_2017 directory to loop over

In [None]:
def getListOfFiles(dirName):
    '''
        For the given path, get the list of all files in the directory tree.

        Entries are visited in the order os.listdir reports them; whenever an
        entry is a directory, the files found beneath it are added at that
        point. Returned paths are built by joining onto dirName.
    '''
    collected = []
    for name in os.listdir(dirName):
        candidate = os.path.join(dirName, name)
        if not os.path.isdir(candidate):
            # Plain entry: keep its full path
            collected.append(candidate)
            continue
        # Sub directory: descend and splice in whatever it contains
        collected.extend(getListOfFiles(candidate))

    return collected

# Use a for loop to create the dataframe with the selected fields
* Some fields are blank, so you have to use an if/else in order to fill them in

In [None]:
# Folder that holds the 2017 filings; collect every file path beneath it
dirName = 'data/IRS990_2017'
files = getListOfFiles(dirName)
len(files)

In [None]:
# Parse the first 10,000 files, skipping those for which analyze_one returns None
rows = [one_row for name in tqdm(files[:10000]) if (one_row := analyze_one(name)) is not None]
# Each single-file frame is indexed 0, so renumber to give every row a unique
# label. pd.concat raises ValueError on an empty list, hence the fallback.
df = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame([])
df

In [None]:
# Write the assembled DataFrame to disk as CSV
df.to_csv("data/just_contributions.csv")