# This is the tutorial located here:
# * https://github.com/jsfenfen/irsx_cookbook

In [198]:
# importing libraries we'll use.
import csv
import os
import pandas as pd

# This tells us where the csv files are located in the system
from irsx.settings import INDEX_DIRECTORY

In [199]:
texas_np = pd.read_csv("data/eo_tx.csv")

In [200]:

# look at the first few lines
#oregon_np.head()
## print the headers as an array
list(texas_np)

['EIN',
 'NAME',
 'ICO',
 'STREET',
 'CITY',
 'STATE',
 'ZIP',
 'GROUP',
 'SUBSECTION',
 'AFFILIATION',
 'CLASSIFICATION',
 'RULING',
 'DEDUCTIBILITY',
 'FOUNDATION',
 'ACTIVITY',
 'ORGANIZATION',
 'STATUS',
 'TAX_PERIOD',
 'ASSET_CD',
 'INCOME_CD',
 'FILING_REQ_CD',
 'PF_FILING_REQ_CD',
 'ACCT_PD',
 'ASSET_AMT',
 'INCOME_AMT',
 'REVENUE_AMT',
 'NTEE_CD',
 'SORT_NAME']

In [201]:

# Ignore some columns for now
tx_np_simplified = texas_np.filter(items=['EIN', 'NAME', 'ICO', 'STREET', 'CITY', 'STATE', 'ZIP', 'INCOME_AMT', 'ASSET_AMT', 'TAX_PERIOD'])
print("total Texas orgs: %s" % len(tx_np_simplified))

# This is a toy filter for a demo -- you'd want something more robust than a perfect text match
atx_orgs = tx_np_simplified.query('CITY == "AUSTIN"')
print("total Austin, TX orgs: %s" % len(atx_orgs))

total Texas orgs: 127566
total Austin, TX orgs: 7756


In [202]:
# Show the top values by income 
atx_orgs.sort_values(by=['INCOME_AMT'], ascending=[0]).head()

Unnamed: 0,EIN,NAME,ICO,STREET,CITY,STATE,ZIP,INCOME_AMT,ASSET_AMT,TAX_PERIOD
60225,742615873,TEXAS WORKERS COMPENSATION,% TEXAS MUTUAL INSURANCE COMPANY,2200 ALDRICH STREET,AUSTIN,TX,78723-3474,6632507000.0,7177542000.0,201812.0
58455,742481167,SOUTHWEST KEY PROGRAMS INC,,6002 JAIN LN,AUSTIN,TX,78721-3104,409257800.0,165450900.0,201808.0
25555,364336415,THE MICHAEL AND SUSAN DELL FOUNDATION,% MICH,4417 WESTLAKE DR,AUSTIN,TX,78746-1437,281163200.0,1612382000.0,201812.0
59837,742587416,ELECTRIC RELIABILITY COUNCIL OF TEXAS INC 10-1...,,7620 METRO CENTER DR,AUSTIN,TX,78744-1613,244963900.0,1529764000.0,201812.0
50747,741109641,ST EDWARDS UNIVERSITY,% KIMBERLY KVAAL,3001 S CONGRESS AVE,AUSTIN,TX,78704-6489,225910400.0,382111700.0,201806.0


In [203]:
# this is from the index file we dowloaded at the start
pd.set_option("display.max_rows", 10)
pd.set_option('display.min_rows', 10)
INDEX_2017= os.path.join(INDEX_DIRECTORY, 'index_2017.csv')
np_2017 = pd.read_csv(INDEX_2017)
np_2017

Unnamed: 0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID
0,14054169,EFILE,42662873,201603,1/4/2017 10:27:37 AM,ELKS BUILDING CORP OF NORWOOD,990O,93493243000066,201612439349300006
1,14056200,EFILE,42964630,201512,1/4/2017 8:17:38 PM,NEIGHBORHOOD OF AFFORDABLE HOUSING INC,990,93493243000266,201612439349300026
2,14055992,EFILE,382912028,201512,1/4/2017 6:07:44 PM,RELEAF MICHIGAN INC,990,93493243003416,201612439349300341
3,14056203,EFILE,200509226,201605,1/4/2017 8:17:39 PM,ST MICHAEL ALBERTVILLE FOOTBALL BOOSTER CLUB,990,93493243005166,201612439349300516
4,14057332,EFILE,202699020,201512,1/4/2017 11:23:48 PM,KARLA SMITH FOUNDATION,990,93493243005466,201612439349300546
...,...,...,...,...,...,...,...,...,...
489008,15064384,EFILE,272948627,201512,12/29/2017 8:50:35 AM,CONSTELLA FESTIVAL OF MUSIC AND FINE ARTS,990EZ,93492320001487,201733209349200148
489009,15060607,EFILE,200745749,201512,12/28/2017 12:35:11 PM,CHRIST CHURCH INTERNATIONAL INC,990,93493319184457,201703199349318445
489010,15057955,EFILE,263520140,201608,12/27/2017 11:57:28 PM,REBELS SOCCER CLUB,990EZ,93492319074127,201723199349207412
489011,15065145,EFILE,770572762,201707,12/29/2017 12:23:10 PM,RISING FARMWORKER DREAM FUND,990PF,93491317015447,201743179349101544


# Now save the list of possible filers who actually filed out to .csv

In [204]:
# Now find orgs that are in portland that filed in 2017. 
# This join requires that both fields be named EIN and be formatted the same

atx_2017_efilers = pd.merge(np_2017,
                 atx_orgs,
                 on='EIN')
print("Found a total of %s Austin 2017 efilers" % len(atx_2017_efilers))
atx_2017_efilers.head()

# sort by income amt, asset amt
atx_2017_efilers = atx_2017_efilers.sort_values(by=['INCOME_AMT', 'ASSET_AMT'], ascending=[0,0])
# Lets write them back out to a file for reference.
atx_2017_efilers.to_csv('data/atxefilers.csv')

# These are the top few for reference
atx_2017_efilers.head()

Found a total of 2046 Austin 2017 efilers


Unnamed: 0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD_x,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID,NAME,ICO,STREET,CITY,STATE,ZIP,INCOME_AMT,ASSET_AMT,TAX_PERIOD_y
362,14179653,EFILE,742615873,201512,3/2/2017 1:11:43 PM,TEXAS MUTUAL INSURANCE COMPANY,990O,93493316016006,201603169349301600,TEXAS WORKERS COMPENSATION,% TEXAS MUTUAL INSURANCE COMPANY,2200 ALDRICH STREET,AUSTIN,TX,78723-3474,6632507000.0,7177542000.0,201812.0
1386,14659051,EFILE,742481167,201608,8/18/2017 9:35:04 PM,SOUTHWEST KEY PROGRAMS INC,990,93493100001117,201711009349300111,SOUTHWEST KEY PROGRAMS INC,,6002 JAIN LN,AUSTIN,TX,78721-3104,409257800.0,165450900.0,201808.0
1879,14927595,EFILE,742587416,201612,11/6/2017 7:57:25 PM,ELECTRIC RELIABILITY COUNCIL OF TEXAS INC,990O,93493262007957,201702629349300795,ELECTRIC RELIABILITY COUNCIL OF TEXAS INC 10-1...,,7620 METRO CENTER DR,AUSTIN,TX,78744-1613,244963900.0,1529764000.0,201812.0
1594,14810761,EFILE,741109641,201606,10/6/2017 11:07:48 AM,ST EDWARDS UNIVERSITY,990,93493129006337,201731299349300633,ST EDWARDS UNIVERSITY,% KIMBERLY KVAAL,3001 S CONGRESS AVE,AUSTIN,TX,78704-6489,225910400.0,382111700.0,201806.0
1580,14775643,EFILE,454364243,201606,9/27/2017 9:40:06 AM,ASCENSION TEXAS,990,93493135124907,201701359349312490,ASCENSION HEALTH TEXAS,% TAX DEPT LANNA DOMANGUE,1345 PHILOMENA STREET,AUSTIN,TX,78723-3210,222690700.0,35706800.0,201906.0


# Dump salaries from schedule J in Austin, TX (continuing from part 1)

In [205]:
import unicodecsv as csv
from irsx.xmlrunner import XMLRunner
import pandas as pd

In [206]:

# read the whole file of orgs with efilings from part 1 here, it's not very long
file_rows = [] 
# We're using the output of part 1
with open('data/atxefilers.csv', 'rb') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        file_rows.append(row)

In [207]:
# the name of the output file
outfilename ="data/employees_simple.csv"
outfile = open(outfilename , 'wb')

# the header rows as they'll appear in the output
headers = ["period", "ein", "object_id", "taxpayer_name", "name", "business_name1", "business_name2", "title", "org_comp", "related_comp"]
# start up a dictwriter, ignore extra rows
dw = csv.DictWriter(outfile, headers, extrasaction='ignore')
dw.writeheader()

In [208]:
# get an XMLRunner -- this is what actually does the parsing
xml_runner = XMLRunner()

In [209]:
DEMO_MAX = 100
num_rows = 0

for row in file_rows:
    num_rows += 1
    this_object_id = row['OBJECT_ID']
    parsed_filing = xml_runner.run_filing(this_object_id)
    
    # if it somehow busted, just note it and continue
    if not parsed_filing:
        print("Skipping filing %s(filings with pre-2013 schemas are skipped)\n row details: %s" % (this_object_id, row))
        continue 
    
    # Is there a Schedule J there at all?
    schedule_list = parsed_filing.list_schedules()
    if 'IRS990ScheduleJ' in schedule_list:
        
        # store the output in this dict
        outputdata = {}
        # assign some initial values from the input csv
        outputdata['period'] = row['TAX_PERIOD_x']
        outputdata['ein'] = row['EIN']
        outputdata['object_id'] = row['OBJECT_ID']
        outputdata['taxpayer_name'] = row['TAXPAYER_NAME']
        
        # some schedules can appear multiple times, but schedule j only appears once
        # so we grab the first one 
        parsed_skedj = parsed_filing.get_parsed_sked('IRS990ScheduleJ')[0] 
        # now use the table name we looked up -- that's where we find it
        try:
            # repeating groups are returned as an array of dicts
            employee_groups = parsed_skedj['groups']['SkdJRltdOrgOffcrTrstKyEmpl']
        except KeyError:
            print("No SkdJRltdOrgOffcrTrstKyEmpl found in %s skipping" % this_object_id)
            continue
          
        # read through each employee and pull out the data we want
        for employee_group in employee_groups:
            
            # That leaves the following values to come from schedule J if there is one
            # "name", "business_name1", "business_name2", "title", "org_comp", "related_comp"
            # those keys come from the headers we gave dictwriter before
            outputdata['name'] = employee_group.get('PrsnNm')
            outputdata['business_name1'] = employee_group.get('BsnssNmLn1Txt')
            outputdata['business_name2'] = employee_group.get('BsnssNmLn2Txt')
            outputdata['title'] = employee_group.get('TtlTxt')
            outputdata['org_comp'] = employee_group.get('TtlCmpnstnFlngOrgAmt') # Part II Column (E) 
            outputdata['related_comp'] = employee_group.get('TtlCmpnstnRltdOrgsAmt')
        
            dw.writerow(outputdata)
        
    else:
        print("No schedule J in filing %s, skipping" % this_object_id)

    # Don't run endlessly during a demo:
    if(num_rows > DEMO_MAX):
        break
    if num_rows%100==0:
        print("Processed %s filings" % num_rows)

No schedule J in filing 201720319349301317, skipping
No schedule J in filing 201622589349301327, skipping
No schedule J in filing 201602369349300115, skipping
No schedule J in filing 201742699349300864, skipping
No schedule J in filing 201602999349300935, skipping
No schedule J in filing 201722499349300542, skipping
No schedule J in filing 201721319349305482, skipping
No schedule J in filing 201740269349300214, skipping
No schedule J in filing 201722709349300347, skipping
No schedule J in filing 201622529349300537, skipping
No schedule J in filing 201612249349302741, skipping
No schedule J in filing 201603209349104055, skipping
No schedule J in filing 201720939349300132, skipping
No schedule J in filing 201720459349301022, skipping
No schedule J in filing 201700469349301095, skipping
No schedule J in filing 201711309349303006, skipping
Processed 100 filings


In [210]:
# close the outfile
outfile.close()

pd.set_option("display.max_rows", 20)
pd.set_option('display.min_rows', 20)
sked_j_atx_efilers = pd.read_csv(outfilename)
sked_j_atx_efilers

Unnamed: 0,period,ein,object_id,taxpayer_name,name,business_name1,business_name2,title,org_comp,related_comp
0,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Richard J Gergasko,,,President & CEO,760251.0,0.0
1,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Perry Michael Barron,,,SVP & CFO,485453.0,0.0
2,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,William Allen McLellan,,,SVP Information Technology,459291.0,0.0
3,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Terrence L Buchheit,,,VP Applications,479005.0,0.0
4,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Leonard Randall Johnson,,,SVP Investments,494329.0,0.0
5,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Mary B Nichols,,,General Counsel & SVP,440911.0,0.0
6,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Steven E Math,,,SVP Underwriting,439297.0,0.0
7,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Terry L Frakes,,,SVP Public Affairs,395498.0,0.0
8,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Benjamin Joel Turner,,,SVP & Chief Actuary,440441.0,0.0
9,201512,742615873,201603169349301600,TEXAS MUTUAL INSURANCE COMPANY,Brenda J Ward,,,SVP Policyholder Services,409371.0,0.0


# Now I'm going to try and repeat this but just for the revenue streams 

In [219]:

# We're creating a dictionary of all the nonprofits 
file_rows = [] 
# Essentially each row becomes an ordered dict 
with open('data/atxefilers.csv', 'rb') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        file_rows.append(row)

In [212]:
# the name of the output file
outfilename ="data/contributions_programsv_rev.csv"
outfile = open(outfilename , 'wb')

# the header rows as they'll appear in the output
headers = ["period", "ein", "object_id", "taxpayer_name", "campaigns", "membership", "fundraising", "related_orgs", "government_grants", "all_other_contributions", "non_cash_contributions", "total_contributions", "program_service_rev"]
# start up a dictwriter, ignore extra rows
dw = csv.DictWriter(outfile, headers, extrasaction='ignore')
dw.writeheader()

In [220]:
# get an XMLRunner -- this is what actually does the parsing
xml_runner = XMLRunner()

In [221]:
for row in file_rows:
    num_rows += 1
    this_object_id = row['OBJECT_ID']
    parsed_filing = xml_runner.run_filing(this_object_id)
    
    # if it somehow busted, just note it and continue
    if not parsed_filing:
        print("Skipping filing %s(filings with pre-2013 schemas are skipped)\n row details: %s" % (this_object_id, row))
        continue 

In [235]:
parsed_990 = parsed_filing.get_parsed_sked('IRS990')[0]
list(parsed_990.keys())
parsed_990['schedule_parts']

{'part_0': {'object_id': '201712699349301281',
  'ein': '741109643',
  'DngBsnssAsNm_BsnssNmLn1Txt': 'Seton Healthcare Network',
  'PrncplOfcrBsnssNm_BsnssNmLn1Txt': 'Scott Herndon',
  'USAddrss_AddrssLn1Txt': '1345 Philomena Street',
  'USAddrss_CtyNm': 'Austin',
  'USAddrss_SttAbbrvtnCd': 'TX',
  'USAddrss_ZIPCd': '78723',
  'GrssRcptsAmt': '2033803591',
  'GrpRtrnFrAffltsInd': 'false',
  'GrpExmptnNm': '0928',
  'Orgnztn501c3Ind': 'X',
  'WbstAddrssTxt': 'http://www.seton.net',
  'OfOrgnztnCrpInd': 'X',
  'FrmtnYr': '1900',
  'LglDmclSttCd': 'TX'},
 'part_i': {'object_id': '201712699349301281',
  'ein': '741109643',
  'ActvtyOrMssnDsc': 'Provision of healthcare services with a special concern for the poor and vulnerable.',
  'VtngMmbrsGvrnngBdyCnt': '12',
  'VtngMmbrsIndpndntCnt': '10',
  'TtlEmplyCnt': '13208',
  'TtlVlntrsCnt': '2000',
  'TtlGrssUBIAmt': '860222',
  'NtUnrltdBsTxblIncmAmt': '-1551389',
  'PYCntrbtnsGrntsAmt': '31675532',
  'CYCntrbtnsGrntsAmt': '26656186',
  'PYPr

In [233]:
part8 = parsed_990['schedule_parts']['part_viii']
part8 = pd.DataFrame(part8.items(), columns=['Variable', 'Value'])
part8

Unnamed: 0,Variable,Value
0,object_id,201712699349301281
1,ein,741109643
2,InfInSkdOPrtVIIIInd,X
3,RltdOrgnztnsAmt,19277811
4,GvrnmntGrntsAmt,4794351
5,AllOthrCntrbtnsAmt,2584024
6,TtlCntrbtnsAmt,26656186
7,TtlOthPrgrmSrvcRv_TtlRvnClmnAmt,3957898
8,TtlOthPrgrmSrvcRv_RltdOrExmptFncIncmAmt,3270942
9,TtlOthPrgrmSrvcRv_UnrltdBsnssRvnAmt,686956


In [214]:
DEMO_MAX = 10
num_rows = 0

for row in file_rows:
    num_rows += 1
    this_object_id = row['OBJECT_ID']
    parsed_filing = xml_runner.run_filing(this_object_id)
    
    # if it somehow busted, just note it and continue
    if not parsed_filing:
        print("Skipping filing %s(filings with pre-2013 schemas are skipped)\n row details: %s" % (this_object_id, row))
        continue 
    
    # Is there a 990 there at all?
    schedule_list = parsed_filing.list_schedules()
    if 'IRS990' in schedule_list:
        
        # store the output in this dict
        outputdata = {}
        # assign some initial values from the input csv
        outputdata['period'] = row['TAX_PERIOD_x']
        outputdata['ein'] = row['EIN']
        outputdata['object_id'] = row['OBJECT_ID']
        outputdata['taxpayer_name'] = row['TAXPAYER_NAME']
        
        # some schedules can appear multiple times, but schedule j only appears once
        # so we grab the first one 
        parsed_990 = parsed_filing.get_parsed_sked('IRS990')[0] 
        # now use the table name we looked up -- that's where we find it
        try:
            # repeating groups are returned as an array of dicts
            program_service_groups = parsed_990['schedule_parts']['part_viii']
        except KeyError:
            print("No part_viii found in %s skipping" % this_object_id)
            continue
          
        # read through each program and pull out the data we want
        for program_service_group in program_service_groups:
            
            # those keys come from the headers we gave dictwriter before
            outputdata["campaigns"] = program_service_group.get('FdrtdCmpgnsAmt')
            outputdata["membership"] = program_service_group.get('MmbrshpDsAmt')
            outputdata["fundraising"] = program_service_group.get('FndrsngAmt')
            outputdata["related_orgs"] = program_service_group.get('RltdOrgnztnsAmt')
            outputdata["government_grants"] = program_service_group.get('GvrnmntGrntsAmt')  
            outputdata["all_other_contributions"] = program_service_group.get('AllOthrCntrbtnsAmt')
            outputdata["non_cash_contributions"] = program_service_group.get('NncshCntrbtnsAmt')
            outputdata["total_contributions"] = program_service_group.get('AllOthrCntrbtnsAmt')
            outputdata["program_service_rev"] = program_service_group.get('TtlOthPrgrmSrvcRv_TtlRvnClmnAmt') #this can be pulled for all columns 
            
            dw.writerow(outputdata)
        
    else:
        print("No 990 in filing %s, skipping" % this_object_id)

    # Don't run endlessly during a demo:
    if(num_rows > DEMO_MAX):
        break
    if num_rows%100==0:
        print("Processed %s filings" % num_rows)

AttributeError: 'str' object has no attribute 'get'

In [152]:
# close the outfile
outfile.close()

pd.set_option("display.max_rows", 1000)
pd.set_option('display.min_rows', 1000)

x990_atx_efilers = pd.read_csv(outfilename)
x990_atx_efilers

Unnamed: 0,period,ein,object_id,taxpayer_name,campaigns,membership,fundraising,related_orgs,government_grants,all_other_contributions,non_cash_contributions,total_contributions,program_service_rev
