In [3]:
import requests
import os
import time
import hashlib
import logging
import re
import json
import pandas as pd  # For DataFrame and CSV handling

# Configure logging: INFO and above, prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Headers for SEC API requests (JSON responses).
API_HEADERS = {
    'User-Agent': 'Your Name yourname@example.com',  # Replace with your information
    'Accept': 'application/json',
}

# Headers targeting the www.sec.gov host with compressed transfer encoding.
# The User-Agent is shared with API_HEADERS so it only has to be edited once.
HTML_HEADERS = {
    'User-Agent': API_HEADERS['User-Agent'],
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.sec.gov',
}

def ensure_directory_exists(directory):
    """Create ``directory`` (including missing parents) unless it already exists.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process may create the directory
    # between the check and the makedirs() call.
    os.makedirs(directory, exist_ok=True)

def get_cached_filename(url, directory):
    """Return the cache-file path for ``url`` inside ``directory``.

    The file name is the hexadecimal MD5 digest of the encoded URL followed
    by a ``.json`` suffix.
    """
    digest = hashlib.md5(url.encode()).hexdigest()
    file_name = digest + '.json'
    return os.path.join(directory, file_name)

def download_with_cache(url, cache_dir='sec_cache', headers=None):
    """Return the text body of ``url``, using an on-disk cache.

    If a cache file for ``url`` already exists in ``cache_dir`` its content is
    returned without any network access. Otherwise the URL is fetched, the
    response text is stored in the cache, and the text is returned.

    Args:
        url: Address to fetch.
        cache_dir: Directory holding the cache files; created if missing.
        headers: HTTP headers for the request. Defaults to ``API_HEADERS``.

    Returns:
        The response text, or ``None`` if the request failed.

    Raises:
        OSError: If the cache directory or cache file cannot be read/written.
    """
    ensure_directory_exists(cache_dir)
    cached_file = get_cached_filename(url, cache_dir)

    if os.path.exists(cached_file):
        logging.info(f"Loading cached file for {url}")
        with open(cached_file, 'r', encoding='utf-8') as file:
            return file.read()

    if headers is None:
        headers = API_HEADERS

    logging.info(f"Downloading {url}")
    try:
        # The context manager closes the session (and its connection pool)
        # even when the request raises.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            content = response.text
    except requests.exceptions.RequestException as e:
        logging.error(f"Error downloading {url}: {e}")
        return None

    # Write to a temporary file and rename it into place so that an
    # interrupted write never leaves a truncated file that later calls
    # would treat as a valid cache entry.
    temp_file = cached_file + '.tmp'
    with open(temp_file, 'w', encoding='utf-8') as file:
        file.write(content)
    os.replace(temp_file, cached_file)

    time.sleep(0.2)  # Respectful delay after a new download
    return content

def get_cik(ticker):
    """Look up the CIK for ``ticker`` in the SEC ticker list.

    The list is fetched (or read from cache) via ``download_with_cache`` and
    is parsed as one tab-separated ``ticker<TAB>cik`` pair per line.

    Args:
        ticker: Ticker symbol; matched case-insensitively.

    Returns:
        The CIK as a string left-padded with zeros to 10 characters, or
        ``None`` if the list could not be loaded or the ticker is not in it.
    """
    ticker = ticker.upper()
    url = "https://www.sec.gov/include/ticker.txt"
    content = download_with_cache(url, 'sec_data')
    if content:
        for line in content.splitlines():
            fields = line.strip().split('\t')
            # Skip blank or malformed lines instead of failing to unpack.
            if len(fields) != 2:
                continue
            symbol, cik = fields
            if symbol.upper() == ticker:
                return cik.zfill(10)
    logging.error(f"CIK not found for ticker {ticker}")
    return None

def get_financial_data(ticker, concepts):
    """Fetch us-gaap company-concept facts for ``ticker``.

    For every name in ``concepts`` the matching company-concept JSON document
    is loaded through ``download_with_cache`` and flattened into a list of
    fact dictionaries with the keys ``date``, ``value``, ``unit``,
    ``concept``, ``form``, ``filed``, ``fy`` and ``fp``.

    Args:
        ticker: Ticker symbol used to resolve the CIK.
        concepts: Iterable of concept names to fetch.

    Returns:
        A dict mapping each successfully loaded concept to its list of facts,
        sorted by ``date`` in descending order, or ``None`` if the CIK could
        not be resolved. Concepts that fail to download or parse are logged
        and omitted.
    """
    cik = get_cik(ticker)
    if not cik:
        logging.error(f"Failed to get CIK for {ticker}")
        return None

    financial_data = {}
    for concept in concepts:
        url = f"https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/{concept}.json"
        content = download_with_cache(url, cache_dir='sec_data', headers=API_HEADERS)
        if not content:
            logging.error(f"Failed to get data for {ticker} and concept {concept}")
            continue

        try:
            data = json.loads(content)
        except json.JSONDecodeError as e:
            # A corrupt payload for one concept must not abort the others.
            logging.error(f"Invalid JSON for {ticker} and concept {concept}: {e}")
            continue

        concept_data = []
        for unit, facts in data.get('units', {}).items():
            for fact in facts:
                concept_data.append({
                    'date': fact.get('end'),
                    'value': fact.get('val'),
                    'unit': unit,
                    'concept': concept,
                    'form': fact.get('form'),
                    'filed': fact.get('filed'),
                    'fy': fact.get('fy'),
                    'fp': fact.get('fp'),
                })

        # Sort by date, newest first. A missing date is treated as the empty
        # string so it sorts last instead of raising TypeError when compared
        # with a real date string.
        concept_data.sort(key=lambda fact: fact['date'] or '', reverse=True)
        financial_data[concept] = concept_data
    return financial_data

def save_financial_data_to_csv(ticker, financial_data, concepts):
    """Write ``financial_data`` to a CSV file with one row per date.

    Each row holds ``date``, then for every name in ``concepts`` a value
    column and a ``<concept>_unit`` column, followed by ``form``, ``filed``,
    ``fy`` and ``fp``. When a concept has several facts for the same date the
    first one in its list is used. The metadata columns are taken from the
    first concept (in ``concepts`` order) that has a fact for the date, and
    are empty when none does.

    Args:
        ticker: Ticker symbol; used to resolve the CIK and build the file name.
        financial_data: Mapping of concept name to a list of fact dicts with
            the keys ``date``, ``value``, ``unit``, ``form``, ``filed``,
            ``fy`` and ``fp``.
        concepts: Concept names to emit as columns, in column order.

    Returns:
        The path of the written CSV file, or ``None`` if the CIK could not be
        resolved.
    """
    cik = get_cik(ticker)
    if not cik:
        logging.error(f"Failed to get CIK for {ticker}")
        return None

    # Create 'fundamentals' folder if it doesn't exist
    ensure_directory_exists('fundamentals')

    # Index every concept by date once, keeping the first fact per date, so
    # building the rows does not rescan each fact list for every date.
    facts_by_concept = {}
    for concept, items in financial_data.items():
        by_date = {}
        for item in items:
            by_date.setdefault(item['date'], item)
        facts_by_concept[concept] = by_date

    # Collect all dates, newest first; a missing date sorts last rather than
    # raising TypeError when compared with a date string.
    all_dates = set()
    for by_date in facts_by_concept.values():
        all_dates.update(by_date)
    all_dates = sorted(all_dates, key=lambda d: d or '', reverse=True)

    # Build rows
    rows = []
    for date in all_dates:
        row = {'date': date}
        metadata = None
        for concept in concepts:
            item = facts_by_concept.get(concept, {}).get(date)
            row[concept] = item['value'] if item is not None else None
            # Include unit for each concept
            row[f'{concept}_unit'] = item['unit'] if item is not None else None
            # Remember the first fact found for this date; previously the
            # metadata was overwritten by every concept, so only the last
            # concept's (often missing) values reached the row.
            if metadata is None and item is not None:
                metadata = item
        # Include additional metadata (only if available)
        for key in ('form', 'filed', 'fy', 'fp'):
            row[key] = metadata[key] if metadata is not None else None
        rows.append(row)

    # Create DataFrame
    df = pd.DataFrame(rows)

    # Save to CSV
    filename = f"fundamentals/{ticker.lower()}-{cik}-EPS.csv"
    df.to_csv(filename, index=False)
    logging.info(f"Saved financial data to {filename}")
    return filename

# Main code: fetch the selected concepts for one ticker and export them to CSV.
ticker = "AAPL"  # Replace with your desired ticker

# Define the financial concepts to fetch
concepts = [
    'EarningsPerShareBasic',
    'EarningsPerShareDiluted',
    'NetIncomeLoss',
    'OperatingIncomeLoss',
    'GrossProfit',
    'Revenues',  # Also known as SalesRevenueNet for some companies
]

financial_data = get_financial_data(ticker, concepts)

if not financial_data:
    print("Failed to retrieve financial data.")
else:
    filename = save_financial_data_to_csv(ticker, financial_data, concepts)
    # save_financial_data_to_csv returns None on failure; report that instead
    # of claiming the data was saved to "None".
    if filename:
        print(f"Financial data saved to {filename}")
    else:
        print("Failed to save financial data.")


2024-09-21 07:57:04,516 - INFO - Loading cached file for https://www.sec.gov/include/ticker.txt
2024-09-21 07:57:04,519 - INFO - Loading cached file for https://data.sec.gov/api/xbrl/companyconcept/CIK0000320193/us-gaap/EarningsPerShareBasic.json
2024-09-21 07:57:04,521 - INFO - Loading cached file for https://data.sec.gov/api/xbrl/companyconcept/CIK0000320193/us-gaap/EarningsPerShareDiluted.json
2024-09-21 07:57:04,522 - INFO - Downloading https://data.sec.gov/api/xbrl/companyconcept/CIK0000320193/us-gaap/NetIncomeLoss.json
2024-09-21 07:57:05,730 - INFO - Downloading https://data.sec.gov/api/xbrl/companyconcept/CIK0000320193/us-gaap/OperatingIncomeLoss.json
2024-09-21 07:57:06,762 - INFO - Downloading https://data.sec.gov/api/xbrl/companyconcept/CIK0000320193/us-gaap/GrossProfit.json
2024-09-21 07:57:07,988 - INFO - Downloading https://data.sec.gov/api/xbrl/companyconcept/CIK0000320193/us-gaap/Revenues.json
2024-09-21 07:57:09,064 - INFO - Loading cached file for https://www.sec.gov/

Financial data saved to fundamentals/aapl-0000320193-EPS.csv
