In [1]:
import requests
import json
import pandas as pd
from tqdm import tqdm

In [2]:
# Custom headers (SEC requires contact info in User-Agent)
HEADERS = {
    'User-Agent': 'Gayathri Jayaraman (gayathrij@uchicago.edu)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'data.sec.gov'
}

BASE_URL = "https://data.sec.gov/submissions/CIK{}.json"

In [4]:
def fetch_company_filings(cik):
    """Fetch submissions for a company based on CIK"""
    cik = str(cik).zfill(10)  # Ensure CIK is zero-padded
    url = BASE_URL.format(cik) 
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data for CIK {cik}: {response.status_code}")
        return None


def extract_filings(cik_list, filing_type="10-K"):
    """Extract specified filing type for multiple CIKs"""
    all_filings = []

    for cik in tqdm(cik_list, desc="Fetching SEC filings"):
        data = fetch_company_filings(cik)
        if data:
            filings = data.get('filings', {}).get('recent', {})
            for form, accession, report_date in zip(filings.get('form', []),
                                                    filings.get('accessionNumber', []),
                                                    filings.get('reportDate', [])):
                if form == filing_type:
                    filing_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{accession.replace('-', '')}/{accession}-index.html"
                    all_filings.append({
                        "CIK": cik,
                        "Form": form,
                        "AccessionNumber": accession,
                        "ReportDate": report_date,
                        "FilingURL": filing_url
                    })

    return pd.DataFrame(all_filings)

In [5]:
# Example CIKs (Apple, Microsoft, Amazon)
cik_list = ['0000320193', '0000789019', '0001018724']  

# Fetch 10-K filings
filings_df = extract_filings(cik_list)
# filings_df.to_csv("sec_10k_filings.csv", index=False)
print(filings_df.head())

Fetching SEC filings: 100%|██████████| 3/3 [00:00<00:00,  6.84it/s]

          CIK  Form       AccessionNumber  ReportDate  \
0  0000320193  10-K  0000320193-24-000123  2024-09-28   
1  0000320193  10-K  0000320193-23-000106  2023-09-30   
2  0000320193  10-K  0000320193-22-000108  2022-09-24   
3  0000320193  10-K  0000320193-21-000105  2021-09-25   
4  0000320193  10-K  0000320193-20-000096  2020-09-26   

                                           FilingURL  
0  https://www.sec.gov/Archives/edgar/data/320193...  
1  https://www.sec.gov/Archives/edgar/data/320193...  
2  https://www.sec.gov/Archives/edgar/data/320193...  
3  https://www.sec.gov/Archives/edgar/data/320193...  
4  https://www.sec.gov/Archives/edgar/data/320193...  





In [None]:
from bs4 import BeautifulSoup

def parse_filing_text(filing_url):
    """Parse and return filing text from SEC filing URL"""
    response = requests.get(filing_url, headers=HEADERS)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    else:
        print(f"Failed to parse filing at {filing_url}")
        return ""

# Parse a sample filing
sample_text = parse_filing_text(filings_df.iloc[0]['FilingURL'])
with open("data/processed/sample_10k_filing.txt", "w", encoding="utf-8") as file:
    file.write(sample_text)

Failed to parse filing at https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123-index.html


FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/sample_10k_filing.txt'

In [7]:
# Second try: extract 10k document
def get_10k_filing_text(filing_url):
    """Finds the correct 10-K document link and extracts its text"""
    response = requests.get(filing_url, headers=HEADERS)
    
    if response.status_code != 200:
        print(f"Failed to fetch filing page: {filing_url}")
        return ""

    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find the 10-K document row based on "Description"
    ten_k_link = None
    for row in soup.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) >= 2:  # Ensure the row has enough columns
            description = columns[1].get_text(strip=True)
            if "10-K" in description:  # Match the Description
                ten_k_link = columns[2].find('a')['href']  # Extract the Document URL
                break

    if not ten_k_link:
        print(f"No 10-K document found in {filing_url}")
        return ""

    # Construct the full URL to the 10-K document
    base_url = "/".join(filing_url.split("/")[:-1])  # Remove the last part of the URL
    ten_k_url = f"https://www.sec.gov{ten_k_link}"

    print(f"Fetching 10-K document: {ten_k_url}")

    # Request the 10-K document
    response = requests.get(ten_k_url, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch 10-K document: {ten_k_url}")
        return ""

    # Extract and return text content
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text(separator=" ", strip=True)


ten_k_text = get_10k_filing_text(filings_df.iloc[0]['FilingURL'])
if ten_k_text:
    with open("data/processed/sample_10k_filing.txt", "w", encoding="utf-8") as file:
        file.write(ten_k_text)

    print("10-K filing text successfully saved.")
else:
    print("Failed to extract 10-K filing text.")

Failed to fetch filing page: https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123-index.html
Failed to extract 10-K filing text.
