# Sandbox to explore SEC N-PORT fund holdings information

## author:
- **David W. Hogg** (NYU)

## projects / notes:
- Trying to scrape SEC for data. This gets EVERYTHING for the quarter and then searches it.
- This downloads and reads a huge file! Probably we should reformat it into a `sqlite` file or something?
- Leaning heavily on `claude.ai` for help.

## bugs / to-do items:
- Used to work off of a union of search terms (not good); the search now requires ALL keywords to match, but it can still match more than one fund.
- Doesn't figure out the right accession number: it just takes the first matching row. How to find the right one automagically?
- Finds and reads the data but does nothing with it.

In [None]:
# !pip install pandas

In [None]:
import os
import io
import requests
import hashlib
import zipfile
import pandas as pd

In [None]:
# Change this to your name and email address; and adjust paths if you want.
# The User-Agent identifies the requester; SEC requires this.
HEADERS = {"User-Agent": "David W. Hogg dwhogg@gmail.com"}
# Directory that receives the output CSV files.
DATA_DIR = os.path.expanduser("../data")
# Download cache, kept inside DATA_DIR so that the two paths move together.
CACHE_DIR = os.path.join(DATA_DIR, "cache")

In [None]:
# Fund selection: a fund matches only if its name contains EVERY keyword below
# (case-insensitive substring match); the keywords are and-ed, not or-ed.
# Other example keyword sets:
# KEYWORDS = {"Vanguard", "social"}
# KEYWORDS = {"college", "social choice"}
# KEYWORDS = {"Nuveen Large Cap", "Responsible"}
KEYWORDS = {"Nuveen Large Cap"}
# Reporting period of the SEC data set to download.
YEAR = 2025
QUARTER = 3  # 1 = Jan-Mar, 2 = Apr-Jun, 3 = Jul-Sep, 4 = Oct-Dec

In [None]:
def get_with_cache(url, headers=None, timeout=180):
    """Return the body of `url` as bytes, using an on-disk cache.

    The cache file lives at CACHE_DIR/<last 2 hex chars>/<md5 hex digest of url>.
    MD5 is used here only to derive a file name from the URL.

    Parameters:
        url: address to fetch; also the cache key.
        headers: optional dict of HTTP headers passed to `requests.get`.
        timeout: timeout in seconds passed to `requests.get`.

    Returns:
        The cached or freshly downloaded content as `bytes`.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Use MD5 hex digest of URL as filename; last 2 chars as subdir
    digest = hashlib.md5(url.encode()).hexdigest()
    subdir = os.path.join(CACHE_DIR, digest[-2:])
    cache_path = os.path.join(subdir, digest)

    if os.path.exists(cache_path):
        print(f"Cache hit: {cache_path}")
        with open(cache_path, "rb") as f:
            return f.read()

    print(f"Downloading: {url}")
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    content = r.content
    print(f"Downloaded {len(content)/1e6:.1f} MB")

    os.makedirs(subdir, exist_ok=True)
    # Write to a temporary sibling file and rename it into place, so that an
    # interrupted write can never leave a truncated file that a later call
    # would treat as a valid cache hit.
    tmp_path = f"{cache_path}.{os.getpid()}.part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(content)
        os.replace(tmp_path, cache_path)
    finally:
        # After a successful rename the temp file is gone and this is a no-op.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return content

In [None]:
# Quarterly N-PORT data set zip on sec.gov, selected by YEAR and QUARTER.
url = (
    "https://www.sec.gov/files/dera/data/form-n-port-data-sets/"
    f"{YEAR}q{QUARTER}_nport.zip"
)
r = get_with_cache(url, headers=HEADERS)  # raw zip bytes (not a Response object)

# Open the archive from memory; the tables are read from `z` further down.
z = zipfile.ZipFile(io.BytesIO(r))
print("Files in zip:", z.namelist())

In [None]:
def read_tsv(z, name_fragment):
    """Read one tab-separated member of an open zip archive into a DataFrame.

    Members are selected by case-insensitive substring match on their name.
    When several members match, one whose base name (without extension)
    equals `name_fragment` is preferred; otherwise the first match is used
    and a warning is printed.

    Parameters:
        z: an open `zipfile.ZipFile`.
        name_fragment: text that must occur in the member name.

    Returns:
        A `pandas.DataFrame` in which every column is read as `str`.

    Raises:
        FileNotFoundError: if no member name contains `name_fragment`.
    """
    wanted = name_fragment.upper()
    candidates = [f for f in z.namelist() if wanted in f.upper()]
    if not candidates:
        raise FileNotFoundError(f"No file matching '{name_fragment}'. Available: {z.namelist()}")

    # A substring match can hit more than one member; an exact base-name
    # match is the unambiguous choice when one exists.
    exact = [
        f for f in candidates
        if os.path.splitext(os.path.basename(f))[0].upper() == wanted
    ]
    chosen = exact[0] if exact else candidates[0]
    if len(candidates) > 1 and not exact:
        print(f"  Warning: '{name_fragment}' matches several files {candidates}; using the first")

    print(f"  Reading {chosen} ...")
    with z.open(chosen) as f:
        return pd.read_csv(f, sep="\t", dtype=str, low_memory=False)

# Load the three tables used for locating a fund.
sub = read_tsv(z, "SUBMISSION")
reg = read_tsv(z, "REGISTRANT")
fund = read_tsv(z, "FUND_REPORTED_INFO")

# Show the available columns of each table.
for label, table in (
    ("SUBMISSION", sub),
    ("REGISTRANT", reg),
    ("FUND_REPORTED_INFO", fund),
):
    print(f"{label} columns:", table.columns.tolist())

In [None]:
# Search all fund names in FUND_REPORTED_INFO for keywords.
# A row matches only when EVERY keyword occurs in its SERIES_NAME
# (case-insensitive); missing names are treated as empty strings.
keywords_upper = [kw.upper() for kw in KEYWORDS]  # upper-case once, not per row
mask = fund["SERIES_NAME"].fillna("").str.upper().apply(
    lambda s: all(kw in s for kw in keywords_upper)
)
Nmatch = mask.sum()  # number of matching rows
if Nmatch < 1:
    # Raise a real exception instead of `assert False`: assertions are
    # stripped under `python -O`, and the message travels with the error.
    raise ValueError("NO MATCHES FOUND (recall that we are and-ing the KEYWORDS)")
if Nmatch > 1:
    print("Uh-oh, found", Nmatch, "matches ...")
print(fund[mask][["ACCESSION_NUMBER", "SERIES_NAME"]].drop_duplicates().to_string())

In [None]:
# It is also possible to search registrant for related CIKs
# mask2 = reg["REGISTRANT_NAME"].str.upper().str.contains("VANGUARD", na=False)
# print(reg[mask2][["CIK", "REGISTRANT_NAME", "ACCESSION_NUMBER"]].drop_duplicates("CIK").to_string())

In [None]:
# Take the first matching row; nothing smarter is done when several rows match.
first_match = fund.loc[mask, ["ACCESSION_NUMBER", "SERIES_NAME"]].iloc[0]
accession, series_name = first_match["ACCESSION_NUMBER"], first_match["SERIES_NAME"]
print("Choosing (stupidly) to work with", accession, series_name)

In [None]:
# Load the holdings table and keep only the rows of the chosen accession number.
frh = read_tsv(z, "FUND_REPORTED_HOLDING")
is_chosen_filing = frh["ACCESSION_NUMBER"] == accession
holdings = frh.loc[is_chosen_filing].copy()  # copy so later column edits don't touch frh
print("holdings rows:", holdings.shape[0])
print("holdings columns:", holdings.columns.tolist())

In [None]:
# weird issues with the PERCENTAGE column
# (the tables are loaded with every column as text, so convert before ranking)
holdings["PERCENTAGE"] = holdings["PERCENTAGE"].astype(float)
top_holdings = holdings.nlargest(10, "PERCENTAGE")
print(top_holdings)

In [None]:
# Sum of the PERCENTAGE column over all selected holdings.
pct_total = holdings["PERCENTAGE"].sum()
print("total of all percentages:", pct_total)

In [None]:
# write out a CSV file
# The file name is built from the fund name: spaces become underscores, and
# path separators are replaced so the name cannot point into another directory.
safe_name = series_name.replace(" ", "_").replace("/", "-").replace("\\", "-")
os.makedirs(DATA_DIR, exist_ok=True)  # the output directory may not exist yet
filename = os.path.join(DATA_DIR, safe_name + ".csv")
holdings.to_csv(filename, index=False)
print("wrote", filename)