# Sandbox to explore TIAA-CREF fund holdings

## author:
- **David W. Hogg** (NYU)

## notes:
- Trying to pull fund-holdings data from the SEC's quarterly Form N-PORT data sets.
- Leaning heavily on `claude.ai` for help.

## bugs / to-do items:
- The accession number is hard-coded rather than looked up. How can it be found automatically?
- Not sure if CUSIP numbers are defined for all of our interesting targets. Might need to resolve / translate.

In [None]:
# !pip install pandas

In [None]:
import os
import io
import requests
import hashlib
import zipfile
import pandas as pd

In [None]:
# --- Fund identification ---------------------------------------------------
# CIK of the TIAA-CREF Funds trust (the umbrella trust that contains TISCX).
# To verify, see:
# https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=tiaa-cref+funds&type=NPORT-P
TRUST_CIK = "0001084380"
# Fragment of the series name to look for (intended as a case-insensitive match).
# NOTE(review): neither TRUST_CIK nor SERIES_NAME_FRAGMENT is referenced by the
# cells visible here -- confirm whether later cells use them.
SERIES_NAME_FRAGMENT = "SOCIAL"

# --- Reporting period ------------------------------------------------------
# YEAR is a four-digit integer and QUARTER an integer 1-4
# (1 = Jan-Mar, 2 = Apr-Jun, 3 = Jul-Sep, 4 = Oct-Dec).
# The SEC posts data ~60 days after quarter end.
YEAR = 2024
QUARTER = 3

# --- Output ----------------------------------------------------------------
OUTPUT_CSV = "tiscx_holdings.csv"

In [None]:
CACHE_DIR = os.path.expanduser("./cache")  # or wherever you like


def get_with_cache(url, headers=None, timeout=180):
    """Return the body of ``url`` as bytes, using a simple on-disk cache.

    The cache file for a URL lives at
    ``CACHE_DIR/<last two hex chars of digest>/<digest>``, where ``digest`` is
    the MD5 hex digest of the URL string. On a cache hit the stored bytes are
    returned without touching the network. On a miss the URL is fetched with
    ``requests.get``, the body is saved to the cache, and the body is returned.

    Parameters
    ----------
    url : str
        Address to fetch; also the cache key.
    headers : dict or None
        Passed through to ``requests.get``.
    timeout : int or float
        Passed through to ``requests.get``.

    Returns
    -------
    bytes
        The cached or freshly downloaded response body.

    Raises
    ------
    requests.HTTPError
        Raised by ``raise_for_status()`` when the server returns an error
        status; nothing is written to the cache in that case.
    """
    # MD5 is used only to derive a filename from the URL, not for security.
    digest = hashlib.md5(url.encode()).hexdigest()
    subdir = os.path.join(CACHE_DIR, digest[-2:])
    cache_path = os.path.join(subdir, digest)

    if os.path.exists(cache_path):
        print(f"Cache hit: {cache_path}")
        with open(cache_path, "rb") as f:
            return f.read()

    print(f"Downloading: {url}")
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    print(f"Downloaded {len(r.content)/1e6:.1f} MB")

    os.makedirs(subdir, exist_ok=True)
    # Write to a temporary sibling file and rename it into place, so that an
    # interrupted write can never leave a truncated file at cache_path that
    # later calls would treat as a valid cache hit.
    tmp_path = f"{cache_path}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            f.write(r.content)
        os.replace(tmp_path, cache_path)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up when the write or rename failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return r.content

In [None]:
# Fetch (or load from the local cache) the N-PORT data-set zip for the
# configured YEAR / QUARTER and open it in memory.
headers = {"User-Agent": "David W. Hogg dwhogg@gmail.com"}   # SEC requires this
url = f"https://www.sec.gov/files/dera/data/form-n-port-data-sets/{YEAR}q{QUARTER}_nport.zip"
r = get_with_cache(url, headers=headers)

# `r` is the raw zip bytes; wrap it in a file-like object for ZipFile.
zip_buffer = io.BytesIO(r)
z = zipfile.ZipFile(zip_buffer)
print("Files in zip:", z.namelist())

In [None]:
def read_tsv(z, name_fragment):
    """Load one tab-separated member of an open zip archive as a DataFrame.

    The member is the first entry of ``z.namelist()`` whose name contains
    ``name_fragment``, compared case-insensitively. Every column is read as a
    string.

    Parameters
    ----------
    z : zipfile.ZipFile
        Open archive to read from.
    name_fragment : str
        Substring identifying the wanted member.

    Returns
    -------
    pandas.DataFrame
        Contents of the selected member.

    Raises
    ------
    FileNotFoundError
        If no member name contains ``name_fragment``.
    """
    members = z.namelist()
    # Take the first matching member, in archive order.
    chosen = next(
        (member for member in members if name_fragment.upper() in member.upper()),
        None,
    )
    if chosen is None:
        raise FileNotFoundError(f"No file matching '{name_fragment}'. Available: {members}")

    print(f"  Reading {chosen} ...")
    with z.open(chosen) as handle:
        table = pd.read_csv(handle, sep="\t", dtype=str, low_memory=False)
    return table

# Load the three tables used below from the zip archive.
sub = read_tsv(z, "SUBMISSION")
reg = read_tsv(z, "REGISTRANT")
fund = read_tsv(z, "FUND_REPORTED_INFO")

# Show which columns each table provides.
print("SUBMISSION columns:", list(sub.columns))
print("REGISTRANT columns:", list(reg.columns))
print("FUND_REPORTED_INFO columns:", list(fund.columns))

In [None]:
# List every distinct (accession number, series name) pair in
# FUND_REPORTED_INFO whose series name contains "social choice"
# (compared in upper case; missing names never match).
series_upper = fund["SERIES_NAME"].str.upper()
mask = series_upper.str.contains("SOCIAL CHOICE", na=False)
social_choice_rows = fund.loc[mask, ["ACCESSION_NUMBER", "SERIES_NAME"]]
print(social_choice_rows.drop_duplicates().to_string())

In [None]:
# List registrants whose name mentions TIAA, CREF, or TEACHERS
# (compared in upper case), keeping one row per CIK.
registrant_upper = reg["REGISTRANT_NAME"].str.upper()
mask2 = registrant_upper.str.contains("TIAA|CREF|TEACHERS", na=False)
tiaa_rows = reg.loc[mask2, ["CIK", "REGISTRANT_NAME", "ACCESSION_NUMBER"]]
print(tiaa_rows.drop_duplicates(subset="CIK").to_string())

In [None]:
# TODO: the accession number is hard-coded here instead of being looked up
# from the tables loaded above.
accession = "0001752724-24-196431"

# Keep only the holdings rows that belong to that filing.
frh = read_tsv(z, "FUND_REPORTED_HOLDING")
in_filing = frh["ACCESSION_NUMBER"] == accession
holdings = frh.loc[in_filing].copy()
print("holdings rows:", holdings.shape[0])
print("holdings columns:", holdings.columns.tolist())

In [None]:
# Preview the first ten holdings rows.
print(holdings.head(10))