In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import re

In [2]:
cik = '0000066740'
url = f'https://data.sec.gov/submissions/CIK{cik}.json'
user = ''


request = requests.get(url, headers = {"User-Agent":user})

In [3]:
forms = request.json()["filings"]["recent"]['form']
primaryDocs = request.json()["filings"]["recent"]["primaryDocument"]
accessionNumber = request.json()["filings"]["recent"]["accessionNumber"]
filingDate = request.json()["filings"]["recent"]["filingDate"]
companyName = request.json()['name']

In [4]:
import pandas as pd
earningsDF = pd.DataFrame(columns=['companyName', 'accessionNumber', 'document', 'filingDate'])

for i in range(len(forms)):
    if forms[i] == '10-Q':
        earningsDF.loc[len(earningsDF)] = [companyName, accessionNumber[i], primaryDocs[i], filingDate[i]]


earningsDF.head(10)

Unnamed: 0,companyName,accessionNumber,document,filingDate
0,3M CO,0000066740-25-000089,mmm-20250930.htm,2025-10-21
1,3M CO,0000066740-25-000063,mmm-20250630.htm,2025-07-18
2,3M CO,0000066740-25-000039,mmm-20250331.htm,2025-04-22
3,3M CO,0000066740-24-000101,mmm-20240930.htm,2024-10-22
4,3M CO,0000066740-24-000080,mmm-20240630.htm,2024-07-26
5,3M CO,0000066740-24-000053,mmm-20240331.htm,2024-04-30
6,3M CO,0000066740-23-000092,mmm-20230930.htm,2023-10-24
7,3M CO,0000066740-23-000058,mmm-20230630.htm,2023-07-25
8,3M CO,0000066740-23-000028,mmm-20230331.htm,2023-04-25
9,3M CO,0000066740-22-000076,mmm-20220930.htm,2022-10-25


In [5]:
#general format for SEC filings archive: 
#https://www.sec.gov/Archives/edgar/data/{CIK}/{accessionNumber}/{primaryDocument} 
#https://www.sec.gov/Archives/edgar/data/0000066740/000006674025000089/mmm-20250930.htm 

def build_filing_url(row):
    # Remove dashes from accession number for the URL path
    accession_clean = row['accessionNumber'].replace('-', '')
    return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_clean}/{row['document']}"

earningsDF['url'] = earningsDF.apply(build_filing_url, axis=1)
earningsDF.head()

Unnamed: 0,companyName,accessionNumber,document,filingDate,url
0,3M CO,0000066740-25-000089,mmm-20250930.htm,2025-10-21,https://www.sec.gov/Archives/edgar/data/000006...
1,3M CO,0000066740-25-000063,mmm-20250630.htm,2025-07-18,https://www.sec.gov/Archives/edgar/data/000006...
2,3M CO,0000066740-25-000039,mmm-20250331.htm,2025-04-22,https://www.sec.gov/Archives/edgar/data/000006...
3,3M CO,0000066740-24-000101,mmm-20240930.htm,2024-10-22,https://www.sec.gov/Archives/edgar/data/000006...
4,3M CO,0000066740-24-000080,mmm-20240630.htm,2024-07-26,https://www.sec.gov/Archives/edgar/data/000006...


In [6]:
def parse_10q_text(url, user_agent):
    """
    Fetch and parse a 10-Q filing to extract clean text.
    
    Args:
        url: The URL to the 10-Q HTML filing
        user_agent: Your email for SEC compliance
    
    Returns:
        Dictionary with extracted text and metadata
    """
    time.sleep(0.1)
    
    try:
        headers = {'User-Agent': user_agent}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        for script in soup(["script", "style"]):
            script.decompose()
        
        text = soup.get_text()
    
        lines = (line.strip() for line in text.splitlines())
        
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        
        text = '\n'.join(chunk for chunk in chunks if chunk)
        
        return {
            'success': True,
            'text': text,
            'length': len(text),
            'url': url
        }
    
    except Exception as e:
        return {
            'success': False,
            'error': str(e),
            'url': url
        }

In [7]:
most_recent_url = earningsDF.iloc[0]['url']
print(f"Parsing: {most_recent_url}")

result = parse_10q_text(most_recent_url, user)

if result['success']:
    print(f"\nSuccessfully parsed {result['length']:,} characters")
    print(f"\nFirst 1000 characters:\n")
    print(result['text'][:1000])
else:
    print(f"Error: {result['error']}")

Parsing: https://www.sec.gov/Archives/edgar/data/0000066740/000006674025000089/mmm-20250930.htm

Successfully parsed 378,668 characters

First 1000 characters:

mmm-202509300000066740FALSE--12-312025MMMCHXCommon Stock, Par Value $.01 Per ShareQ3http://fasb.org/us-gaap/2025#CostOfRevenuehttp://fasb.org/us-gaap/2025#CostOfRevenuehttp://fasb.org/us-gaap/2025#SellingGeneralAndAdministrativeExpensehttp://fasb.org/us-gaap/2025#SellingGeneralAndAdministrativeExpensehttp://www.mmm.com/20250930#ResearchDevelopmentAndRelatedExpenseshttp://www.mmm.com/20250930#ResearchDevelopmentAndRelatedExpenseshttp://fasb.org/us-gaap/2025#AccountsPayableCurrenthttp://fasb.org/us-gaap/2025#AccountsPayableCurrenthttp://fasb.org/us-gaap/2025#LongTermDebtNoncurrenthttp://fasb.org/us-gaap/2025#LongTermDebtNoncurrentP20Yxbrli:sharesiso4217:USDiso4217:USDxbrli:sharesxbrli:puremmm:positioniso4217:EURmmm:plaintiffmmm:casemmm:lawsuitmmm:partymmm:respiratormmm:actionmmm:citymmm:facilitymmm:claimmmm:Sitemmm:perfluorinated

In [8]:
def extract_mda_section(url, user_agent):
    """
    Extract the Management's Discussion and Analysis section from a 10-Q.
    """
    time.sleep(0.1)
    
    headers = {'User-Agent': user_agent}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    for element in soup(["script", "style"]):
        element.decompose()
    text = soup.get_text()
    
    pattern = r'Item\s*2[\.:]\s*Management[\'\’]s\s+Discussion\s+and\s+Analysis\s+of\s+Financial\s+Condition\s+and\s+Results\s+of\s+Operations'
    
    matches = list(re.finditer(pattern, text, re.IGNORECASE))
    
    if len(matches) >= 2:
        second_match = matches[1]
        start_pos = second_match.start()
        
        text_before = text[:start_pos]
        actual_start_pos = start_pos
        
        next_item_pattern = r'ITEM\s*[3][.:]'
        next_match = re.search(next_item_pattern, text[start_pos + 100:], re.IGNORECASE)
        
        if next_match:
            end_pos = start_pos + 100 + next_match.start()
            mda_text = text[actual_start_pos:end_pos]
        else:
            mda_text = text[actual_start_pos:start_pos + 50000]
        
        return mda_text
    elif len(matches) == 1:
        return "Only one instance of Item 2 MD&A found (expected second instance)"
    else:
        return "MD&A section not found"

In [9]:
mda_text = extract_mda_section(most_recent_url, user)
print(f"MD&A section length: {len(mda_text):,} characters")
print(f"\nFirst 500 characters:\n")
print(mda_text[:500])

MD&A section length: 55,850 characters

First 500 characters:

Item 2. Management’s Discussion and Analysis of Financial Condition and Results of OperationsManagement’s Discussion and Analysis of Financial Condition and Results of Operations (MD&A) is designed to provide a reader of 3M’s financial statements with a narrative from the perspective of management. The MD&A should be read in conjunction with 3M's consolidated financial statements and the accompanying notes to the consolidated financial statements. 3M’s MD&A is presented in the following sections


In [10]:
print(mda_text)

Item 2. Management’s Discussion and Analysis of Financial Condition and Results of OperationsManagement’s Discussion and Analysis of Financial Condition and Results of Operations (MD&A) is designed to provide a reader of 3M’s financial statements with a narrative from the perspective of management. The MD&A should be read in conjunction with 3M's consolidated financial statements and the accompanying notes to the consolidated financial statements. 3M’s MD&A is presented in the following sections:•Overview•Results of Operations•Performance by Business Segment•Financial Condition and Liquidity•Cautionary Note Concerning Factors That May Affect Future ResultsForward-looking statements in Part I, Item 2 may involve risks and uncertainties that could cause results to differ materially from those projected (refer to the section entitled Cautionary Note Concerning Factors That May Affect Future Results in Part I, Item 2 and the risk factors provided in Part II, Item 1A for discussion of these

In [11]:
earningsDF

Unnamed: 0,companyName,accessionNumber,document,filingDate,url
0,3M CO,0000066740-25-000089,mmm-20250930.htm,2025-10-21,https://www.sec.gov/Archives/edgar/data/000006...
1,3M CO,0000066740-25-000063,mmm-20250630.htm,2025-07-18,https://www.sec.gov/Archives/edgar/data/000006...
2,3M CO,0000066740-25-000039,mmm-20250331.htm,2025-04-22,https://www.sec.gov/Archives/edgar/data/000006...
3,3M CO,0000066740-24-000101,mmm-20240930.htm,2024-10-22,https://www.sec.gov/Archives/edgar/data/000006...
4,3M CO,0000066740-24-000080,mmm-20240630.htm,2024-07-26,https://www.sec.gov/Archives/edgar/data/000006...
5,3M CO,0000066740-24-000053,mmm-20240331.htm,2024-04-30,https://www.sec.gov/Archives/edgar/data/000006...
6,3M CO,0000066740-23-000092,mmm-20230930.htm,2023-10-24,https://www.sec.gov/Archives/edgar/data/000006...
7,3M CO,0000066740-23-000058,mmm-20230630.htm,2023-07-25,https://www.sec.gov/Archives/edgar/data/000006...
8,3M CO,0000066740-23-000028,mmm-20230331.htm,2023-04-25,https://www.sec.gov/Archives/edgar/data/000006...
9,3M CO,0000066740-22-000076,mmm-20220930.htm,2022-10-25,https://www.sec.gov/Archives/edgar/data/000006...
