In [None]:
import requests
import os
import time
from bs4 import BeautifulSoup
import hashlib
from urllib.parse import urljoin
import logging
import shutil
import re

In [None]:
# Root logger: emit everything from DEBUG upwards, prefixed with a timestamp
# and the severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Request headers used by download_with_cache when is_xml is False.
# NOTE(review): the User-Agent value looks like a placeholder contact string;
# presumably it should be replaced with real contact details before use.
HTML_HEADERS = {
    'User-Agent': 'Your Name yourname@example.com',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.sec.gov',
}

# Request headers used by download_with_cache when is_xml is True.
# The long values are split across adjacent string literals purely for line
# length; Python joins them into the same single string at compile time.
XML_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/128.0.0.0 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Sec-Ch-Ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
}

def ensure_directory_exists(directory):
    """Create *directory*, including missing parents, if it is not there yet.

    Passing ``exist_ok=True`` makes the call idempotent and removes the gap
    between a separate existence check and the creation, during which another
    process could create the directory and cause ``FileExistsError``.

    Args:
        directory: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created. This includes
            ``FileExistsError`` when the path exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

def get_cached_filename(url, directory):
    """Return the cache file path for *url* inside *directory*.

    The file name is the hexadecimal MD5 digest of the encoded URL followed by
    ``.xml``. The same suffix is used for every URL.
    """
    digest = hashlib.md5(url.encode()).hexdigest()
    return os.path.join(directory, f"{digest}.xml")

def download_with_cache(url, cache_dir='sec_cache', is_xml=False):
    """Return the body of *url* as text, backed by an on-disk cache.

    If a cache file for the URL already exists in *cache_dir* it is read and
    returned without any network access. Otherwise the URL is fetched, the
    response text is stored in the cache, and the function sleeps for one
    second before returning.

    Args:
        url: Address to fetch.
        cache_dir: Directory holding the cache files; created if missing.
        is_xml: When true the request is sent with ``XML_HEADERS``, otherwise
            with ``HTML_HEADERS``.

    Returns:
        The response text, or ``None`` if the request failed (the error is
        logged).

    Raises:
        OSError: If the cache file cannot be read or written.
    """
    ensure_directory_exists(cache_dir)
    cached_file = get_cached_filename(url, cache_dir)

    if os.path.exists(cached_file):
        logging.info(f"Loading cached file for {url}")
        with open(cached_file, 'r', encoding='utf-8') as file:
            return file.read()

    logging.info(f"Downloading {url}")
    headers = XML_HEADERS if is_xml else HTML_HEADERS
    try:
        # The context manager closes the session and its pooled connections
        # instead of leaving them open until garbage collection.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            content = response.text
    except requests.exceptions.RequestException as e:
        logging.error(f"Error downloading {url}: {e}")
        return None

    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated file that later calls would treat as a
    # valid cache entry.
    temp_file = cached_file + '.tmp'
    with open(temp_file, 'w', encoding='utf-8') as file:
        file.write(content)
    os.replace(temp_file, cached_file)

    time.sleep(1)  # Respectful delay after a new download
    return content

def get_cik(ticker):
    """Look up the CIK for *ticker* in the SEC ticker list.

    The list is fetched (or read from the ``sec_data`` cache) through
    ``download_with_cache`` and parsed as tab-separated ``ticker<TAB>cik``
    lines. The comparison is case-insensitive.

    Args:
        ticker: Ticker symbol to look up.

    Returns:
        The CIK left-padded with zeros to 10 characters, or ``None`` if the
        list could not be loaded or the ticker is not in it (an error is
        logged in that case).
    """
    ticker = ticker.upper()
    url = "https://www.sec.gov/include/ticker.txt"
    content = download_with_cache(url, 'sec_data')
    if content:
        for line in content.splitlines():
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Blank or malformed line: skip it rather than letting the
                # tuple unpacking abort the whole lookup with ValueError.
                continue
            symbol, cik = fields
            if symbol.upper() == ticker:
                return cik.zfill(10)
    logging.error(f"CIK not found for ticker {ticker}")
    return None

def get_filings_url(cik, filing_type):
    """Build the EDGAR ``browse-edgar`` URL for a company's filings.

    *cik* and *filing_type* are inserted into the query string as-is (no URL
    escaping); the remaining parameters are fixed.
    """
    query = "&".join([
        "action=getcompany",
        f"CIK={cik}",
        f"type={filing_type}",
        "dateb=",
        "owner=exclude",
        "count=40",
    ])
    return "https://www.sec.gov/cgi-bin/browse-edgar?" + query

def get_xml_url(index_url):
    """Find the XBRL instance document linked from a filing index page.

    The page is loaded through ``download_with_cache`` and its table rows are
    scanned for the first row with at least three cells whose second cell
    contains ``XBRL INSTANCE DOCUMENT`` and whose third cell holds a link.

    Returns:
        ``(xml_url, index_url)`` on success, where ``xml_url`` is the link
        resolved against ``https://www.sec.gov``; ``(None, None)`` if the page
        could not be loaded or no such row exists.
    """
    page = download_with_cache(index_url)
    if not page:
        return None, None

    for table_row in BeautifulSoup(page, 'html.parser').find_all('tr'):
        columns = table_row.find_all('td')
        if len(columns) < 3:
            continue
        if 'XBRL INSTANCE DOCUMENT' not in columns[1].text:
            continue
        anchor = columns[2].find('a')
        if anchor and anchor.has_attr('href'):
            return urljoin("https://www.sec.gov", anchor['href']), index_url

    logging.warning(f"No XBRL instance document found in {index_url}")
    return None, None

def get_accession_number(url):
    """Extract an accession number from *url*.

    The accession number is the first ``/``-preceded run shaped like
    ``NNNNNNNNNN-NN-NNNNNN`` (10, 2 and 6 digits). Each step is logged at
    DEBUG level.

    Returns:
        The accession number string, or ``None`` when the URL has none.
    """
    logging.debug(f"Attempting to extract accession number from URL: {url}")
    found = re.search(r'/(\d{10}-\d{2}-\d{6})', url)
    if found is None:
        logging.debug("Failed to extract accession number")
        return None

    accession_number = found.group(1)
    logging.debug(f"Extracted accession number: {accession_number}")
    return accession_number
    
def download_xml(ticker, filing_type='10-Q', num_filings=5):
    """Download the XBRL instance documents of a company's filings.

    Resolves the ticker to a CIK, loads the EDGAR filing list for
    *filing_type*, takes the first *num_filings* links whose href contains
    both ``Archives`` and ``/data/``, and for each one locates and downloads
    the XBRL instance document into the ``xml_cache_<ticker>`` directory.

    Args:
        ticker: Ticker symbol of the company.
        filing_type: Form type passed to the EDGAR filing list query.
        num_filings: Maximum number of filing index pages to inspect.

    Returns:
        A list of ``(cached_filename, xml_url, index_url)`` tuples, one per
        successfully downloaded document. The list is empty if the CIK or
        the filing list could not be obtained or no filing links were found.
    """
    cik = get_cik(ticker)
    if not cik:
        logging.error(f"Failed to get CIK for {ticker}")
        return []

    filings_url = get_filings_url(cik, filing_type)
    content = download_with_cache(filings_url)
    if not content:
        logging.error(f"Failed to get filings for {ticker}")
        return []

    soup = BeautifulSoup(content, 'html.parser')
    filing_links = [
        urljoin("https://www.sec.gov", link['href'])
        for link in soup.find_all('a', href=True)
        if 'Archives' in link['href'] and '/data/' in link['href']
    ]

    if not filing_links:
        logging.error(f"No filing links found for {ticker}")
        return []

    cache_dir = f'xml_cache_{ticker}'
    xml_files = []
    for index_url in filing_links[:num_filings]:
        # get_xml_url returns (None, None) on failure. Discard its second
        # value so the loop variable still holds the real index URL for the
        # warning below instead of being overwritten with None.
        xml_url, _ = get_xml_url(index_url)
        if not xml_url:
            logging.warning(f"No XML URL found for {index_url}")
            continue

        xml_content = download_with_cache(xml_url, cache_dir, is_xml=True)
        if not xml_content:
            logging.error(f"Failed to download XML from {xml_url}")
            continue

        cached_filename = get_cached_filename(xml_url, cache_dir)
        xml_files.append((cached_filename, xml_url, index_url))
        logging.debug(f"Added file: {cached_filename}, XML URL: {xml_url}, Index URL: {index_url}")

    return xml_files

def rename_and_store_xml_files(xml_files, ticker, destination_folder='xbrls'):
    """Copy cached XML files into *destination_folder* under descriptive names.

    Each file is copied to
    ``<ticker lower-cased>-<accession prefix>-<accession rest>-<date>.xml``.
    The accession number comes from the index URL, and the date is the
    8-digit run following a hyphen in the XML URL (``unknown_date`` if there
    is none). Entries whose index URL has no accession number are skipped
    with a warning.

    Args:
        xml_files: Iterable of ``(cached_file, xml_url, index_url)`` tuples.
        ticker: Ticker symbol used as the file name prefix.
        destination_folder: Directory to copy into; created if missing.

    Returns:
        A list of the destination paths that were copied successfully.
    """
    ensure_directory_exists(destination_folder)
    renamed_files = []
    ticker_prefix = ticker.lower()

    for cached_file, xml_url, index_url in xml_files:
        logging.debug(f"Processing file: {cached_file}")
        full_accession_number = get_accession_number(index_url)
        if not full_accession_number:
            logging.warning(f"Could not extract accession number for {index_url}")
            continue

        # The leading 10-digit segment of the accession number is what the
        # log message below labels "CIK".
        cik, unique_part = full_accession_number.split('-', 1)
        logging.debug(f"CIK: {cik}, Unique part: {unique_part}")

        # Accept '.' after the date too, so file names such as
        # "aapl-20180630.xml" yield a date instead of "unknown_date".
        date_match = re.search(r'-(\d{8})[-_.]', xml_url)
        date = date_match.group(1) if date_match else "unknown_date"
        logging.debug(f"Extracted date: {date}")

        new_filename = f"{ticker_prefix}-{cik}-{unique_part}-{date}.xml"
        new_filepath = os.path.join(destination_folder, new_filename)
        logging.debug(f"New filepath: {new_filepath}")

        try:
            shutil.copy2(cached_file, new_filepath)
        except OSError as e:
            # Covers missing source, permission problems and
            # shutil.SameFileError; one bad file must not stop the batch.
            logging.error(f"Failed to copy file: {e}")
        else:
            renamed_files.append(new_filepath)
            logging.info(f"Stored {new_filepath}")

    return renamed_files

In [None]:
# Run parameters: which company, which form type, and how many filing index
# pages to inspect.
ticker = "AAPL"
form_type = "10-Q"
num_filings = 5

# Download (or load from cache) the XBRL instance documents and report each.
xml_files = download_xml(ticker, form_type, num_filings)
print(f"Downloaded XML files for {ticker}:")
for file, xml_url, index_url in xml_files:
    # One print call emits the three detail lines plus the trailing blank line.
    print(
        f"File: {file}",
        f"XML URL: {xml_url}",
        f"Index URL: {index_url}",
        "",
        sep="\n",
    )

# Copy the cached files into the destination folder under readable names.
renamed_files = rename_and_store_xml_files(xml_files, ticker)
print("\nRenamed and stored XML files:")
for file in renamed_files:
    print(file)

2024-09-21 06:59:27,845 - INFO - Loading cached file for https://www.sec.gov/include/ticker.txt
2024-09-21 06:59:27,846 - INFO - Loading cached file for https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000320193&type=10-Q&dateb=&owner=exclude&count=40
2024-09-21 06:59:27,887 - INFO - Loading cached file for https://www.sec.gov/Archives/edgar/data/320193/000032019324000081/0000320193-24-000081-index.htm
2024-09-21 06:59:27,890 - INFO - Loading cached file for https://www.sec.gov/Archives/edgar/data/320193/000032019324000081/aapl-20240629_htm.xml
2024-09-21 06:59:27,892 - INFO - Loading cached file for https://www.sec.gov/Archives/edgar/data/320193/000032019324000069/0000320193-24-000069-index.htm
2024-09-21 06:59:27,896 - INFO - Loading cached file for https://www.sec.gov/Archives/edgar/data/320193/000032019324000069/aapl-20240330_htm.xml
2024-09-21 06:59:27,897 - INFO - Loading cached file for https://www.sec.gov/Archives/edgar/data/320193/000032019324000006/0000320193-2

Downloaded XML files for AAPL:
File: xml_cache_AAPL/cfdd4f144c3320fc781346b8b99caf59.xml
XML URL: https://www.sec.gov/Archives/edgar/data/320193/000032019324000081/aapl-20240629_htm.xml
Index URL: https://www.sec.gov/Archives/edgar/data/320193/000032019324000081/0000320193-24-000081-index.htm

File: xml_cache_AAPL/2da07f5a9a13e809f11888d987a3babd.xml
XML URL: https://www.sec.gov/Archives/edgar/data/320193/000032019324000069/aapl-20240330_htm.xml
Index URL: https://www.sec.gov/Archives/edgar/data/320193/000032019324000069/0000320193-24-000069-index.htm

File: xml_cache_AAPL/6efe51076e8ee7b5fa84035d406d202c.xml
XML URL: https://www.sec.gov/Archives/edgar/data/320193/000032019324000006/aapl-20231230_htm.xml
Index URL: https://www.sec.gov/Archives/edgar/data/320193/000032019324000006/0000320193-24-000006-index.htm

File: xml_cache_AAPL/6c906f55166e3152702067fdac2ab7e4.xml
XML URL: https://www.sec.gov/Archives/edgar/data/320193/000032019323000077/aapl-20230701_htm.xml
Index URL: https://www