In [35]:
import requests
import xml.etree.ElementTree as ET


def _text_or_na(root, path, namespaces, default="N/A"):
    """Return the text of the first element matching ``path``, or ``default`` if it is missing or empty."""
    element = root.find(path, namespaces)
    if element is None or element.text is None:
        return default
    return element.text


def extract_info_from_online_form990(xml_url, timeout=30):
    """Download a Form 990 e-file XML document and pull out selected fields.

    Parameters
    ----------
    xml_url : str
        URL of the XML document to download.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        notebook forever. Defaults to 30 seconds.

    Returns
    -------
    dict or str
        On HTTP 200, a dict with the keys ``"EIN"``, ``"Name"``,
        ``"Address"``, ``"Total Contributions and Grants Amount"``,
        ``"Fundraising Partner Names"``, ``"Independent Contractor Names"``
        and ``"Officer Names"``. Scalar fields that are absent from the
        document (or present but empty) are reported as ``"N/A"``; the three
        name fields are lists and are empty when nothing matches.
        On any other status code, the string ``"Failed to download XML"``.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts).
    xml.etree.ElementTree.ParseError
        If the downloaded body is not well-formed XML.
    """
    response = requests.get(xml_url, timeout=timeout)

    if response.status_code != 200:
        return "Failed to download XML"

    # register_namespace mutates ElementTree's global prefix registry, which is
    # only consulted when serializing; lookups below rely on `namespaces`.
    # Kept so that any later serialization in the notebook behaves as before.
    ET.register_namespace("", "http://www.irs.gov/efile")
    namespaces = {"efile": "http://www.irs.gov/efile"}
    root = ET.fromstring(response.content)

    # Scalar fields: every lookup goes through the same missing-value fallback,
    # so a document without an EIN or business name no longer raises
    # AttributeError on `None.text`.
    ein = _text_or_na(root, './/efile:EIN', namespaces)
    name = _text_or_na(root, './/efile:BusinessName/efile:BusinessNameLine1Txt', namespaces)

    # Filer address, assembled as "<line1>, <city>, <state> <zip>".
    filer_address = './/efile:Filer/efile:USAddress'
    address_line1 = _text_or_na(root, f'{filer_address}/efile:AddressLine1Txt', namespaces)
    city = _text_or_na(root, f'{filer_address}/efile:CityNm', namespaces)
    state = _text_or_na(root, f'{filer_address}/efile:StateAbbreviationCd', namespaces)
    zip_code = _text_or_na(root, f'{filer_address}/efile:ZIPCd', namespaces)
    address = f"{address_line1}, {city}, {state} {zip_code}"

    # Total contributions and grants amount (kept as the raw text value).
    contributions_amount = _text_or_na(
        root, './/efile:IRS990/efile:CYContributionsGrantsAmt', namespaces
    )

    # Repeating groups: collect the text of every matching element.
    fundraising_partners = root.findall(
        './/efile:IRS990ScheduleG/efile:FundraiserActivityInfoGrp'
        '/efile:OrganizationBusinessName/efile:BusinessNameLine1Txt',
        namespaces,
    )
    fundraising_partner_names = [partner.text for partner in fundraising_partners]

    independent_contractors = root.findall(
        './/efile:IRS990/efile:ContractorCompensationGrp/efile:ContractorName'
        '/efile:BusinessName/efile:BusinessNameLine1Txt',
        namespaces,
    )
    independent_contractor_names = [contractor.text for contractor in independent_contractors]

    officers = root.findall(
        './/efile:IRS990ScheduleJ/efile:RltdOrgOfficerTrstKeyEmplGrp/efile:PersonNm',
        namespaces,
    )
    officer_names = [officer.text for officer in officers]

    return {
        "EIN": ein,
        "Name": name,
        "Address": address,
        "Total Contributions and Grants Amount": contributions_amount,
        "Fundraising Partner Names": fundraising_partner_names,
        "Independent Contractor Names": independent_contractor_names,
        "Officer Names": officer_names,
    }

In [36]:
# NOTE(review): the query string looks like AWS Signature V4 signed-URL
# parameters; given X-Amz-Date=20240327T200535Z and X-Amz-Expires=1800 this
# link has presumably expired — regenerate it before re-running this cell.
# The adjacent literals below concatenate to the exact original URL.
xml_url = (
    'https://pp-990-xml.s3.us-east-1.amazonaws.com/202323259349300717_public.xml'
    '?response-content-disposition=inline'
    '&X-Amz-Algorithm=AWS4-HMAC-SHA256'
    '&X-Amz-Credential=AKIA266MJEJYTM5WAG5Y/20240327/us-east-1/s3/aws4_request'
    '&X-Amz-Date=20240327T200535Z'
    '&X-Amz-Expires=1800'
    '&X-Amz-SignedHeaders=host'
    '&X-Amz-Signature=ce75b7b2d642d97a281d52a306de9a0bbb1e2f6f0f1e020c700ac12855cbe8d7'
)

# Download and parse the filing, then show the result as the cell output.
extracted_info = extract_info_from_online_form990(xml_url)
extracted_info

{'EIN': '521885088',
 'Name': 'JUDICIAL WATCH INC',
 'Address': '425 3RD STREET SW 800, WASHINGTON, DC 20024',
 'Total Contributions and Grants Amount': '102070631',
 'Fundraising Partner Names': ['TARGETED VICTORY LLC'],
 'Independent Contractor Names': ['PRODUCTION MANAGEMENT GROUP',
  'PLANET DIRECT',
  'TARGETED VICTORY LLC',
  'CONRAD DIRECT',
  'POLITICAL MEDIA INC'],
 'Officer Names': ['THOMAS J FITTON',
  'PAUL J ORFANEDES',
  'CHRISTOPHER FARRELL',
  'ROBERT PATRICK STICHT',
  'STEVEN ANDERSEN',
  'ROBERT POPPER',
  'JOHN ALBERTELLA',
  'THORNTON NOBEL',
  'MICHAEL BEKESHA']}