# Notes

- New data were downloaded in April 2025
- Former name:
    - *IRS 990 e-File Data (5c) -- DOWNLOAD AND EXTRACT ZIPPED IRS 990 Filings and Associated Schedules.ipynb*
- See also:
    - *IRS 990 e-File Data -- IRS Files (1) -- DOWNLOAD AND EXTRACT ZIPPED IRS 990 Filings and Associated Schedules (WITH EXTRA CODE).ipynb*

# Download Location
https://www.irs.gov/charities-non-profits/form-990-series-downloads

### Overview

This is a modified version of the <a href="http://social-metrics.org/irs-990-e-file-data-part-4/">fourth in a series of tutorials</a> that illustrate how to download the IRS 990 e-file data available at https://aws.amazon.com/public-data-sets/irs-990/

Specifically, in this notebook we will download into a MongoDB table not just the IRS 990 filings for our sample of 5 organizations, but also the associated "schedules." The first half of the tutorial is the same as part (4), so you can skip over it if you're comfortable with the material.

### Load Packages

In [1]:
%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')

CPU times: total: 0 ns
Wall time: 0 ns
Current date and time :  2025-04-06 14:16:02 



In [None]:
import sys
import time
import json

In [None]:
import zipfile, os

In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [64]:
#http://pandas.pydata.org/pandas-docs/stable/options.html
pd.set_option('display.max_columns', None)  #Set PANDAS to show all columns in DataFrame
pd.set_option('max_colwidth', 500)

In [None]:
#cd '/Users/gsaxton/Dropbox/990 e-file data'

In [None]:
pwd

### Get list of files to download

In [4]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import ren

In [9]:
req = Request("https://www.irs.gov/charities-non-profits/form-990-series-downloads")
html_page = urlopen(req)

#soup = BeautifulSoup(html_page, "lxml")
soup = BeautifulSoup(html_page, "html")

links = []
for link in soup.findAll('a'):
    links.append(link.get('href'))
print(len(links))
links = [l for l in links if l]
print(len(links))
links = [l for l in links if 'zip' in l]
print(len(links))
print(links)

270
266
46
['https://apps.irs.gov/pub/epostcard/990/xml/2025/2025_TEOS_XML_01A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2025/2025_TEOS_XML_02A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_01A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_02A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_03A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_04A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_05A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_06A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_07A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_08A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_09A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_10A.zip', 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_11A.zip', 'https://apps.irs.gov/pub/epostcard/990

### Loop over all files and download *new* files
` conda install -c anaconda wget`

In [11]:
import os
import wget

# Define your download folder
download_folder = "D:\\IRS 990 Filings (zipped)\\"

# List to store filenames that were already downloaded
already_downloaded = []

# Loop through the links
for url in links:
    # Extract the filename from the URL
    filename = os.path.basename(url)
    destination = os.path.join(download_folder, filename)

    if os.path.exists(destination):
        print(f"Already downloaded: {filename}")
        already_downloaded.append(filename)
        continue

    print(f"Downloading: {filename}")
    wget.download(url, out=destination)
    print()  # Clean line after download

# Optional: Print or save the list of already downloaded files
print("\nFiles already present:")
for f in already_downloaded:
    print(f)

Downloading: 2025_TEOS_XML_01A.zip
100% [......................................................................] 103669610 / 103669610
Downloading: 2025_TEOS_XML_02A.zip
100% [......................................................................] 239929743 / 239929743
Already downloaded: 2024_TEOS_XML_01A.zip
Downloading: 2024_TEOS_XML_02A.zip
100% [......................................................................] 243689721 / 243689721
Downloading: 2024_TEOS_XML_03A.zip
100% [......................................................................] 209335079 / 209335079
Downloading: 2024_TEOS_XML_04A.zip
100% [......................................................................] 350268985 / 350268985
Downloading: 2024_TEOS_XML_05A.zip
100% [......................................................................] 977460330 / 977460330
Downloading: 2024_TEOS_XML_06A.zip
100% [......................................................................] 163872638 / 163872638
Downloading: 

In [16]:
already_downloaded

['2024_TEOS_XML_01A.zip',
 '2023_TEOS_XML_01A.zip',
 '2023_TEOS_XML_02A.zip',
 '2023_TEOS_XML_03A.zip',
 '2023_TEOS_XML_04A.zip',
 '2023_TEOS_XML_05A.zip',
 '2023_TEOS_XML_06A.zip',
 '2023_TEOS_XML_07A.zip',
 '2023_TEOS_XML_08A.zip',
 '2023_TEOS_XML_09A.zip',
 '2023_TEOS_XML_10A.zip',
 '2023_TEOS_XML_11A.zip',
 '2023_TEOS_XML_12A.zip',
 '2022_TEOS_XML_01A.zip',
 '2021_TEOS_XML_01A.zip',
 '2020_TEOS_XML_CT1.zip',
 'download990xml_2020_1.zip',
 'download990xml_2020_2.zip',
 'download990xml_2020_3.zip',
 'download990xml_2020_4.zip',
 'download990xml_2020_5.zip',
 'download990xml_2020_6.zip',
 'download990xml_2020_7.zip',
 'download990xml_2020_8.zip',
 '2019_TEOS_XML_CT1.zip',
 'download990xml_2019_1.zip',
 'download990xml_2019_2.zip',
 'download990xml_2019_3.zip',
 'download990xml_2019_4.zip',
 'download990xml_2019_5.zip',
 'download990xml_2019_6.zip',
 'download990xml_2019_7.zip',
 'download990xml_2019_8.zip']

In [17]:
links

['https://apps.irs.gov/pub/epostcard/990/xml/2025/2025_TEOS_XML_01A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2025/2025_TEOS_XML_02A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_01A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_02A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_03A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_04A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_05A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_06A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_07A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_08A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_09A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_10A.zip',
 'https://apps.irs.gov/pub/epostcard/990/xml/2024/2024_TEOS_XML_11A.zip',
 'https://apps.irs.gov/pub/epostcard/9

In [19]:
# Get the filenames from the links
link_filenames = [os.path.basename(url) for url in links]

# Convert both to sets and subtract
downloaded_filenames = set(link_filenames) - set(already_downloaded)

print("Already downloaded:", len(already_downloaded))
print("Total links:", len(links))
print("Just downloaded:", len(downloaded_filenames))

Already downloaded: 33
Total links: 46
Just downloaded: 13


### Extract the Zipped Files

#### Version Putting Unzipped XML Files into Separate Sub-Directories
This is preferable for checking later whether the IRS changes the number of filings in a given year retroactively, or when a recent zipped file was incomplete.

Create list of all zipped files in the folder

In [23]:
dir_name = 'D:\\IRS 990 Filings (zipped)\\'
extension = ".zip"
new_dir = 'D:\\IRS 990 Filings (XML) with folders\\'
entries = os.listdir(dir_name)[:]
print(len(entries))
entries

86


['2017_TEOS_XML_CT1.zip',
 '2018_TEOS_XML_CT1.zip',
 '2018_TEOS_XML_CT2.zip',
 '2018_TEOS_XML_CT3.zip',
 '2019_TEOS_XML_CT1.zip',
 '2020_TEOS_XML_CT1.zip',
 '2021_TEOS_XML_01A.zip',
 '2021_TEOS_XML_01B.zip',
 '2021_TEOS_XML_01C.zip',
 '2021_TEOS_XML_01D.zip',
 '2021_TEOS_XML_01E.zip',
 '2021_TEOS_XML_01F.zip',
 '2021_TEOS_XML_01G.zip',
 '2021_TEOS_XML_01H.zip',
 '2022_TEOS_XML_01A.zip',
 '2022_TEOS_XML_01B.zip',
 '2022_TEOS_XML_01C.zip',
 '2022_TEOS_XML_01D.zip',
 '2022_TEOS_XML_01E.zip',
 '2022_TEOS_XML_01F.zip',
 '2022_TEOS_XML_11A.zip',
 '2022_TEOS_XML_11B.zip',
 '2022_TEOS_XML_11C.zip',
 '2023_TEOS_XML_01A.zip',
 '2023_TEOS_XML_02A.zip',
 '2023_TEOS_XML_03A.zip',
 '2023_TEOS_XML_04A.zip',
 '2023_TEOS_XML_05A.zip',
 '2023_TEOS_XML_05B.zip',
 '2023_TEOS_XML_06A.zip',
 '2023_TEOS_XML_07A.zip',
 '2023_TEOS_XML_08A.zip',
 '2023_TEOS_XML_09A.zip',
 '2023_TEOS_XML_10A.zip',
 '2023_TEOS_XML_11A.zip',
 '2023_TEOS_XML_11B (1).zip',
 '2023_TEOS_XML_11B - Copy.zip',
 '2023_TEOS_XML_11B.zip',
 

<br>Define function to loop over the zipped files and extract each one in its own folder

Below code block used to fix an error: 
https://stackoverflow.com/questions/17664262/python-zipfile-module-erroneously-thinks-i-have-a-zipfile-that-spans-multiple-di

<br>
Updated: `When extracting, detect and flatten the extra layer immediately after unzipping, so the structure is consistent.`

In [29]:
import shutil
import zipfile_deflate64 as zipfile

def extract_zip_Files():
    os.chdir(dir_name)
    for item in entries:
        if item not in downloaded_filenames:
            continue

        print('Extracting:', item)    
        new_folder = os.path.splitext(item)[0]
        output_dir = os.path.join(new_dir, new_folder)
        os.makedirs(output_dir, exist_ok=True)

        if item.endswith(extension):
            file_path = os.path.abspath(item)
            with zipfile.ZipFile(file_path) as zip_ref:
                zip_ref.extractall(path=output_dir)

            # Check if the extracted folder has a single subfolder — flatten it
            contents = os.listdir(output_dir)
            if len(contents) == 1:
                possible_subfolder = os.path.join(output_dir, contents[0])
                if os.path.isdir(possible_subfolder):
                    for filename in os.listdir(possible_subfolder):
                        shutil.move(os.path.join(possible_subfolder, filename), output_dir)
                    os.rmdir(possible_subfolder)

In [30]:
%%time
# Extract all zip files
extract_zip_Files() 

CPU times: total: 11min 5s
Wall time: 14min 52s


#### Check number of extracted filings in each folder

#### 2025 Download

In [33]:
import os

directory = 'D:\\IRS 990 Filings (XML) with folders'
new_folder_names = [os.path.splitext(f)[0] for f in downloaded_filenames]
subfolders_with_paths = [os.path.join(directory, name) for name in new_folder_names if os.path.isdir(os.path.join(directory, name))]

print('# of newly added sub-folders:', len(subfolders_with_paths))

total_xml_files = 0

for folder_path in subfolders_with_paths:
    folder_name = os.path.basename(folder_path)
    contents = os.listdir(folder_path)

    # Default path to count files in
    target_path = folder_path

    # If there's exactly one subfolder inside, and it's a folder, dive into it
    if len(contents) == 1:
        possible_subfolder = os.path.join(folder_path, contents[0])
        if os.path.isdir(possible_subfolder):
            target_path = possible_subfolder

    # Count only XML files
    xml_files = [f for f in os.listdir(target_path) if f.endswith('.xml')]
    num_xml_files = len(xml_files)
    print(f"{folder_name}  # of XML files in folder: {num_xml_files}")
    total_xml_files += num_xml_files

print('\nTotal # of Filings:', total_xml_files)

# of newly added sub-folders: 13
2024_TEOS_XML_03A  # of XML files in folder: 38418
2024_TEOS_XML_07A  # of XML files in folder: 50144
2025_TEOS_XML_01A  # of XML files in folder: 17044
2025_TEOS_XML_02A  # of XML files in folder: 41855
2024_TEOS_XML_10A  # of XML files in folder: 43392
2024_TEOS_XML_02A  # of XML files in folder: 40284
2024_TEOS_XML_09A  # of XML files in folder: 44108
2024_TEOS_XML_06A  # of XML files in folder: 29408
2024_TEOS_XML_04A  # of XML files in folder: 59487
2024_TEOS_XML_11A  # of XML files in folder: 186632
2024_TEOS_XML_12A  # of XML files in folder: 21676
2024_TEOS_XML_08A  # of XML files in folder: 41687
2024_TEOS_XML_05A  # of XML files in folder: 156237

Total # of Filings: 770372


### MongoDB
Depending on the project, I will store the data in SQLite or MongoDB. This time I'll use MongoDB -- it's great for storing JSON data where each observation could have different variables. Before we get to the interesting part the following code blocks set up the MongoDB environment and the new database we'll be using. 

**_Note:_** In a terminal we'll have to start MongoDB by running the command *mongod* or *sudo mongod*. Then we run the following code block here to access MongoDB.

In [6]:
import pymongo
from pymongo import MongoClient
client = MongoClient()

In [7]:
db = client['irs_990_db']
filings_990 = db['filings_990']
filings_990_x = db['filings_990_x']
filings_990_y = db['filings_990_y']

In [58]:
#Set a unique constraint on the collection based on *URL*. This will avert duplicates from being inserted.
#db.filings_990_x.create_index([('URL', pymongo.ASCENDING)], unique=True)
list(db.filings_990.index_information())

['_id_', 'URL_1']

In [59]:
list(db.filings_990_x.index_information())

['_id_', 'URL_1']

In [60]:
list(db.filings_990_y.index_information())

['_id_', 'URL_1']

In [61]:
filings_990.estimated_document_count()

2192435

In [62]:
filings_990_x.estimated_document_count()

217259

In [63]:
filings_990_y.estimated_document_count()

891980

In [64]:
%%time

def insert_unique_docs_by_url(source_collection, target_collection):
    print(f"\n⏳ Processing collection: {source_collection.name}...")

    # Get existing URLs in the target collection
    existing_urls = set(
        doc['URL'] for doc in target_collection.find({}, {'URL': 1})
    )

    new_docs = []
    for doc in source_collection.find():
        if doc['URL'] not in existing_urls:
            new_docs.append(doc)

    if new_docs:
        target_collection.insert_many(new_docs)
        print(f"✅ Inserted {len(new_docs)} new documents from {source_collection.name}")
    else:
        print(f"✅ No new documents to insert from {source_collection.name}")

# Run the merge
insert_unique_docs_by_url(filings_990_x, filings_990)
insert_unique_docs_by_url(filings_990_y, filings_990)


⏳ Processing collection: filings_990_x...
✅ Inserted 217259 new documents from filings_990_x

⏳ Processing collection: filings_990_y...
✅ Inserted 674721 new documents from filings_990_y
CPU times: total: 24min 26s
Wall time: 36min 24s


<br>Here is my expected updated count for `filings_990`

In [65]:
2192435+217259+891980

3301674

In [66]:
filings_990.estimated_document_count()

3084415

In [67]:
filings_990_x.estimated_document_count()

217259

In [68]:
filings_990_y.estimated_document_count()

891980

<br>Connect to our database that contains the filing index data we downloaded earlier.

In [10]:
#from bson.son import SON
#pipeline = [ {"$group": {"_id": "$FormType", "count": {"$sum": 1}}} ]
#list(file_list_2021.aggregate(pipeline))

### Loop over List of Filings, grab e-file data, and insert into second database
First we'll write a function to turn an ordered dictionary (which is what is returned by *xmltodict*) into a normal Python dictionary so that we can combine it with the filing data gathered above.

What I'm doing in the next five or so blocks of code is looping over the *list* of 3 million plus filings that I've previously downloaded, and then downloading the full e-file 990 filings for all orgs that match one of our 8,304 EINs.

In [8]:
from json import loads, dumps
from collections import OrderedDict

def to_dict(input_ordered_dict):
    return loads(dumps(input_ordered_dict))

<br>In brief, what we want to do is write code that will loop over all relevant rows in our MongoDB collection, visit the respective URL where the 990 data are located, grab those data, convert them to a dictionary, and then insert into a new MongoDB collection. 
 

In [9]:
list(db.filings_990.index_information())

['_id_', 'URL_1']

#### Get list of downloaded filings

In [10]:
filings_990.estimated_document_count()

3469008

In [12]:
import pandas as pd
import numpy as np

In [13]:
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
dff = pd.DataFrame(list(filings_990.find({}, {'URL':1, 
    '_id':0})))
print("Number of columns:", len(dff.columns))
print("Number of observations:", len(dff))
dff[:1]

Current date and time :  2025-04-06 14:18:03 

Number of columns: 1
Number of observations: 3469008
CPU times: total: 6.23 s
Wall time: 1h 53min 59s


Unnamed: 0,URL
0,https://s3.amazonaws.com/irs-form-990/20111313...


In [16]:
pwd

'D:\\IRS 990 Filings (zipped)'

In [17]:
%%time
dff.to_pickle('C:\\Users\\Gregory\\IRS 990 Control Variables\\URLs of all filings up to 2025.pkl.gz', compression='gzip')

CPU times: total: 40.4 s
Wall time: 42 s


In [None]:
#cd '/Users/gsaxton/Dropbox/990 e-file data'

In [23]:
"""
%%time
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')
dff = pd.read_pickle('C:\\Users\\Gregory\\IRS 990 Control Variables\\URLs of all filings pre-2024.pkl.gz', compression='gzip')
print("Number of columns:", len(dff.columns))
print("Number of observations:", len(dff))
dff[:1]
"""

Current date and time :  2024-01-13 00:23:47 

Number of columns: 1
Number of observations: 2192435
CPU times: total: 797 ms
Wall time: 850 ms


Unnamed: 0,URL
9362,https://s3.amazonaws.com/irs-form-990/201013993493000040_public.xml


In [18]:
dff[-5:]

Unnamed: 0,URL
3158451,https://s3.amazonaws.com/irs-form-990/20254051...
3158452,https://s3.amazonaws.com/irs-form-990/20254051...
3158453,https://s3.amazonaws.com/irs-form-990/20254051...
3158454,https://s3.amazonaws.com/irs-form-990/20254051...
3158455,https://s3.amazonaws.com/irs-form-990/20254051...


In [19]:
%%time
dff=dff.sort_values('URL', ascending=1)
dff[:5]

CPU times: total: 3.81 s
Wall time: 4 s


Unnamed: 0,URL
9362,https://s3.amazonaws.com/irs-form-990/20101399...
8619,https://s3.amazonaws.com/irs-form-990/20101399...
9851,https://s3.amazonaws.com/irs-form-990/20101399...
8775,https://s3.amazonaws.com/irs-form-990/20101399...
9005,https://s3.amazonaws.com/irs-form-990/20101399...


In [20]:
##### Generate and save a list of downloaded 990 filings
mongo_files = dff['URL'].tolist()
print(len(mongo_files))
print(len(set(mongo_files)))
mongo_files = [f for f in mongo_files if pd.isnull(f) == False]# and x != 'nan']
print(len(mongo_files))
print(len(set(mongo_files)))
mongo_files[:2]

3469008
3469008
3469008
3469008


['https://s3.amazonaws.com/irs-form-990/201013993493000040_public.xml',
 'https://s3.amazonaws.com/irs-form-990/201013993493001130_public.xml']

### Download IRS 990 Data and Available Schedules
Now let's run the loop for all five EINs, grab the 33 IRS 990 filings, and insert them into our new MongoDB table. This block has some additional code that I won't discuss in detail (see comments below for further details). The short answer is that we are looping over each row in our database, visiting the URL that contains the 990 data, and then grabbing all of the data returned by the <code>IRS990</code> key. For convenience purposes, we then combine this new data with the associated filing index data from our first database, and then insert the combined data into our new <code>filings</code> collection.

Note that this code block will only work for organizations that have the <code>IRS990</code> key. The check is found in the following line of code:

&nbsp; &nbsp; <code>if 'IRS990' in data['Return']['ReturnData']:</code>

This means that organizations filing *990EZ* or *990PF* will be skipped. However, the code block could easily be modified to grab 990EZ or 990PF filings. 

Compared to the <a href="http://social-metrics.org/irs-990-e-file-data-part-4/">simpler version of this tutorials</a>, we are also adding columns for five keys that are nested under the ['Return'] key. If you're not familiar with Python this line of code from the code block we'll be using will seem perplexing:

&nbsp; &nbsp; <code>return_info = {k:v for k,v in datax['Return'].iteritems() if k not in ('ReturnData')}</code>

What this code does is assign to a new dictionary called *return_info* every key and value nested under &nbsp; <code>datax['Return']</code> &nbsp;  except for <code>datax['Return]['ReturnData']</code>. &nbsp; The latter contains our 990 data, so we are going to deal with that separately. So, *return_info* will contain one column for each of the following keys: &nbsp; <code>'@xmlns', '@xmlns:xsi', '@xsi:schemaLocation', '@returnVersion', 'ReturnHeader'</code>. &nbsp; We are not likely to use these data but it will be good to have them handy in case we need them. 

Similarly, with the following line we create a new dictionary that will contain all of the keys nested under &nbsp; <code>datax['Return']['ReturnData']</code> &nbsp; except for &nbsp; <code>datax['Return]['ReturnData']['IRS990]</code>. &nbsp; 

&nbsp; &nbsp; <code>schedules = {k:v for k,v in datax['Return']['ReturnData'].iteritems() if k not in ('IRS990')}</code>

We then combine the 9 columns from our existing MongoDB collection, the 5 columns from *return_info*, the available columns in the dictionary *schedules*, and the 200+ columns from &nbsp; <code>datax['Return]['ReturnData']['IRS990]</code>. &nbsp;  This combined dictionary is then added to our new MongoDB *filings* collection.

What we are doing here is deciding not to "flatten" the data available in the schedules. Instead, the focus of this dataset is the actual core 990 data. Our resultant dataset will have a dozen or so "background" or filing detail columns, a half-dozen or so columns containing data on any available schedules, and then 200+ columns containing the 990 data. For most research purposes this will suffice and will minimize the need to flatten keys with nested data. But whenever we have the need the data will be there waiting for us. 

In [21]:
from json import loads, dumps
from collections import OrderedDict

def to_dict(input_ordered_dict):
    return loads(dumps(input_ordered_dict))

In [24]:
import datetime
print ("Current date and time : ", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')

Current date and time :  2025-04-06 23:24:09 



##### To get a list of all sub-directories

In [25]:
import os 
directory = 'D:\IRS 990 Filings (XML) with folders'
os.walk(directory)

<generator object _walk at 0x000002083ED59CB0>

<br>We have 67 sub-directories to loop over

In [76]:
#list_subfolders_with_paths = [f.path for f in os.scandir(directory) if f.is_dir()]
#print(len(list_subfolders_with_paths))
#list_subfolders_with_paths

In [80]:
new_dir = 'D:\\IRS 990 Filings (XML) with folders\\'
new_folder_names = [os.path.splitext(f)[0] for f in downloaded_filenames]
new_subfolders = [os.path.join(new_dir, folder) for folder in new_folder_names if os.path.isdir(os.path.join(new_dir, folder))]
print(len(new_subfolders))
new_subfolders

13


['D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_03A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_07A',
 'D:\\IRS 990 Filings (XML) with folders\\2025_TEOS_XML_01A',
 'D:\\IRS 990 Filings (XML) with folders\\2025_TEOS_XML_02A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_10A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_02A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_09A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_06A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_04A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_11A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_12A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_08A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_05A']

In [34]:
new_subfolders = ['D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_03A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_07A',
 'D:\\IRS 990 Filings (XML) with folders\\2025_TEOS_XML_01A',
 'D:\\IRS 990 Filings (XML) with folders\\2025_TEOS_XML_02A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_10A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_02A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_09A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_06A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_04A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_11A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_12A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_08A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_05A']
new_subfolders

['D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_03A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_07A',
 'D:\\IRS 990 Filings (XML) with folders\\2025_TEOS_XML_01A',
 'D:\\IRS 990 Filings (XML) with folders\\2025_TEOS_XML_02A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_10A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_02A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_09A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_06A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_04A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_11A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_12A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_08A',
 'D:\\IRS 990 Filings (XML) with folders\\2024_TEOS_XML_05A']

#### Search for Filings > 16MB

In [82]:
%%time

# Base directory and size limit in bytes (MongoDB limit ~16MB)
base_dir = "D:\\IRS 990 Filings (XML) with folders"
size_limit_bytes = 16_000_000  # 16 MB (WITH SAFETY BUFFER)
#size_limit_bytes = 16_777_216 # EXACTLY 16MB


# Store oversized files
oversized_files = []

for path, subdirs, files in os.walk(base_dir):
    for name in files:
        file_path = os.path.join(path, name)
        try:
            file_size = os.path.getsize(file_path)
            if file_size >= size_limit_bytes:
                oversized_files.append((file_path, file_size))
        except Exception as e:
            print(f"Error checking file {file_path}: {e}")

# Report
print(f"\n🔍 Found {len(oversized_files)} oversized files:")
for fpath, fsize in oversized_files:
    print(f"📁 {fpath} — {fsize / (1024 * 1024):.2f} MB")


🔍 Found 9 oversized files:
📁 D:\IRS 990 Filings (XML) with folders\2024_TEOS_XML_02A\202410469349300741_public.xml — 26.29 MB
📁 D:\IRS 990 Filings (XML) with folders\2024_TEOS_XML_02A\202430459349302913_public.xml — 49.66 MB
📁 D:\IRS 990 Filings (XML) with folders\2024_TEOS_XML_04A\202411039349301716_public.xml — 35.33 MB
📁 D:\IRS 990 Filings (XML) with folders\2024_TEOS_XML_05A\202411309349100126_public.xml — 40.59 MB
📁 D:\IRS 990 Filings (XML) with folders\2024_TEOS_XML_05A\202441359349309439_public.xml — 15.93 MB
📁 D:\IRS 990 Filings (XML) with folders\2024_TEOS_XML_05A\202441369349301334_public.xml — 54.88 MB
📁 D:\IRS 990 Filings (XML) with folders\2025_TEOS_XML_01A\202530159349302758_public.xml — 29.94 MB
📁 D:\IRS 990 Filings (XML) with folders\download990xml_2018_3\201813199349310531_public.xml — 15.34 MB
📁 D:\IRS 990 Filings (XML) with folders\download990xml_2019_8\201943169349302844_public.xml — 15.28 MB
CPU times: total: 14.9 s
Wall time: 16min 3s


# Insert New Filings into MongoDB

In [26]:
filings_990.estimated_document_count()

3469008

In [65]:
#os.listdir(folder)

In [None]:
from json import loads, dumps
from collections import OrderedDict

In [95]:
# Loop over each newly extracted subfolder
for folder in new_subfolders[:1]:
    print("\n📁 Processing folder:", os.path.basename(folder))


📁 Processing folder: 2024_TEOS_XML_03A


# 4/5/2025 - Main Updated Loop 
Note: `existing_urls = set(doc['URL'] for doc in filings_990.find({}, {'URL': 1}))` and `if url in existing_urls` instead of `if filings_990.count_documents({'URL': url}, limit=1) > 0:`        

#### `existing_urls` can be processed in the next code block but I have commented it out in order to see how long it takes

In [28]:
%%time
existing_urls = set(doc['URL'] for doc in filings_990.find({}, {'URL': 1}))

CPU times: total: 29.5 s
Wall time: 11min 13s


In [29]:
print(len(existing_urls))

3469008


In [30]:
list(existing_urls)[:5]

['https://s3.amazonaws.com/irs-form-990/201301619349300115_public.xml',
 'https://s3.amazonaws.com/irs-form-990/201411059349300011_public.xml',
 'https://s3.amazonaws.com/irs-form-990/202103079349302950_public.xml',
 'https://s3.amazonaws.com/irs-form-990/202002289349300705_public.xml',
 'https://s3.amazonaws.com/irs-form-990/202441359349300639_public.xml']

In [62]:
%%time

import datetime
print("Current date and time:", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), '\n')

import os
import xml.etree.ElementTree as ET
import xmltodict
import json
from pymongo import MongoClient, errors
#from IPython.display import clear_output
from tqdm.notebook import tqdm


#ET.register_namespace('', 'http://www.irs.gov/efile')

# Mongo setup
# client = MongoClient()
# db = client['irs_990_db']
# filings_990 = db['filings_990']

def to_dict(input_ordered_dict):
    return json.loads(json.dumps(input_ordered_dict))

start_count = filings_990.estimated_document_count()
counter = 0
duplicates = 0
non_990 = 0
inserted = 0

#existing_urls = set(doc['URL'] for doc in filings_990.find({}, {'URL': 1}))

# Loop over each newly extracted subfolder
for folder in new_subfolders[12:]:
#for folder in sampled_folders[:]:  #THIS IS FOR THE SAMPLED FOLDERS - lIKELY NO LONGER NEEDED
    print("\n Processing folder:", os.path.basename(folder))
    
    #for filename in os.listdir(folder):   
    for filename in tqdm(os.listdir(folder)):
        if not filename.endswith('.xml'):
            continue

        counter += 1
        url = 'https://s3.amazonaws.com/irs-form-990/' + filename    

        #if filings_990.count_documents({'URL': url}, limit=1) > 0:        
        if url in existing_urls:
            duplicates += 1
            #clear_output()
            #print("Already in *filings_990*")
            #print(('counter: ', counter, 'duplicates: ', duplicates, 'non-990: ', non_990), '\n')
            continue

        file_path = os.path.join(folder, filename)

        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            xmlstr = ET.tostring(root, encoding='utf-8', method='xml')
            datax = dict(xmltodict.parse(xmlstr))
        except Exception as e:
            #print("Could not parse XML:", filename, "(", str(e), ")")
            continue

        try:
            return_data = datax['Return']['ReturnData']
            if 'IRS990' in return_data:
                return_info = {k: v for k, v in datax['Return'].items() if k != 'ReturnData'}
                schedules = {k: v for k, v in return_data.items() if k != 'IRS990'}
                data = to_dict(return_data['IRS990'])

                record = dict(**return_info, **schedules, **data)
                record['URL'] = url
                record.pop('_id', None)

                filings_990.insert_one(record)
                inserted += 1

            else:
                non_990 += 1
        except KeyError:
            non_990 += 1
            continue

            
            
        # Print status every file
        #clear_output()
        #print(('counter: ', counter, 'duplicates: ', duplicates, 'non-990: ', non_990), '\n')

# Final summary
print("\n📊 Final Summary")
print("📂 Total subfolders processed:", len(new_subfolders))
print("📄 Total XMLs processed:", counter)
print("✅ Filings inserted:", inserted)
print("🔁 Duplicates skipped:", duplicates)
print("📄 Non-990s skipped:", non_990)
print("🗃️ Total filings now in database:", filings_990.estimated_document_count())
print("➕ New filings added this run:", filings_990.estimated_document_count() - start_count)

Current date and time: 2025-04-09 22:32:37 


 Processing folder: download990xml_2017_7


  0%|          | 0/26515 [00:00<?, ?it/s]


 Processing folder: 2021_TEOS_XML_01H


  0%|          | 0/29904 [00:00<?, ?it/s]


 Processing folder: download990xml_2019_8


  0%|          | 0/28954 [00:00<?, ?it/s]


📊 Final Summary
📂 Total subfolders processed: 13
📄 Total XMLs processed: 85373
✅ Filings inserted: 0
🔁 Duplicates skipped: 49420
📄 Non-990s skipped: 35953
🗃️ Total filings now in database: 3469008
➕ New filings added this run: 0
CPU times: total: 2min 22s
Wall time: 16min 52s


#### Now look at a few random previously extracted folders and re-run main block

In [49]:
import os
import random

# Get all folders in new_dir
all_subfolders = [os.path.join(new_dir, f) for f in os.listdir(new_dir) 
                  if os.path.isdir(os.path.join(new_dir, f))]

# Get folders not in new_subfolders
remaining_folders = list(set(all_subfolders) - set(new_subfolders))

# Sample 2 at random (if at least 2 remain)
sampled_folders = random.sample(remaining_folders, 2) if len(remaining_folders) >= 2 else remaining_folders

# Output
print("Total folders in new_dir:", len(all_subfolders))
print("Already processed folders:", len(new_subfolders))
print("Remaining folders:", len(remaining_folders))
print("Sampled folders:", sampled_folders)

Total folders in new_dir: 80
Already processed folders: 13
Remaining folders: 67
Sampled folders: ['D:\\IRS 990 Filings (XML) with folders\\2023_TEOS_XML_08A', 'D:\\IRS 990 Filings (XML) with folders\\2023_TEOS_XML_10A']


In [51]:
print(len(all_subfolders))
print(all_subfolders)

80
['D:\\IRS 990 Filings (XML) with folders\\2017_TEOS_XML_CT1', 'D:\\IRS 990 Filings (XML) with folders\\2018_TEOS_XML_CT1', 'D:\\IRS 990 Filings (XML) with folders\\2018_TEOS_XML_CT2', 'D:\\IRS 990 Filings (XML) with folders\\2018_TEOS_XML_CT3', 'D:\\IRS 990 Filings (XML) with folders\\2019_TEOS_XML_CT1', 'D:\\IRS 990 Filings (XML) with folders\\2020_TEOS_XML_CT1', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01A', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01B', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01C', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01D', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01E', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01F', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01G', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01H', 'D:\\IRS 990 Filings (XML) with folders\\2022_TEOS_XML_01A', 'D:\\IRS 990 Filings (XML) with folders\\2022_TEOS_XML_01B', 'D:\\IRS 990 Filings

In [53]:
print(len(remaining_folders))
print(remaining_folders)

67
['D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01E', 'D:\\IRS 990 Filings (XML) with folders\\download990xml_2018_2', 'D:\\IRS 990 Filings (XML) with folders\\download990xml_2020_7', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01A', 'D:\\IRS 990 Filings (XML) with folders\\download990xml_2017_5', 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01B', 'D:\\IRS 990 Filings (XML) with folders\\download990xml_2019_2', 'D:\\IRS 990 Filings (XML) with folders\\2020_TEOS_XML_CT1', 'D:\\IRS 990 Filings (XML) with folders\\download990xml_2020_4', 'D:\\IRS 990 Filings (XML) with folders\\2022_TEOS_XML_01F', 'D:\\IRS 990 Filings (XML) with folders\\download990xml_2019_3', 'D:\\IRS 990 Filings (XML) with folders\\2023_TEOS_XML_04A', 'D:\\IRS 990 Filings (XML) with folders\\2023_TEOS_XML_05A', 'D:\\IRS 990 Filings (XML) with folders\\2018_TEOS_XML_CT2', 'D:\\IRS 990 Filings (XML) with folders\\2023_TEOS_XML_09A', 'D:\\IRS 990 Filings (XML) with folders\\download990xml_2

In [54]:
sampled_folders

['D:\\IRS 990 Filings (XML) with folders\\2023_TEOS_XML_08A',
 'D:\\IRS 990 Filings (XML) with folders\\2023_TEOS_XML_10A']

In [61]:
sampled_folders = random.sample(remaining_folders, 3) if len(remaining_folders) >= 2 else remaining_folders
sampled_folders

['D:\\IRS 990 Filings (XML) with folders\\download990xml_2017_7',
 'D:\\IRS 990 Filings (XML) with folders\\2021_TEOS_XML_01H',
 'D:\\IRS 990 Filings (XML) with folders\\download990xml_2019_8']