In [6]:
import requests
import os
import time
import hashlib
import logging
import json
import pandas as pd
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

# Emit INFO-and-above records as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Default request headers; download_with_cache falls back to these when the
# caller passes no headers. They ask for a JSON response.
API_HEADERS = {
    "User-Agent": "Your Name yourname@example.com",  # Replace with your information
    "Accept": "application/json",
}

# Request headers pinned to the www.sec.gov host; used for the filing
# directory pages and the XML documents linked from them.
HTML_HEADERS = {
    "User-Agent": "Your Name yourname@example.com",  # Replace with your information
    "Accept-Encoding": "gzip, deflate",
    "Host": "www.sec.gov",
}

def ensure_directory_exists(directory):
    """Create *directory* (and any missing parents) if it is not already there.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created, e.g. because a
            non-directory already occupies the path (FileExistsError).
    """
    # exist_ok=True folds the "does it exist?" check into the creation call,
    # so another process creating the directory in between cannot make this fail.
    os.makedirs(directory, exist_ok=True)

def get_cached_filename(url, directory):
    """Return the cache file path used for *url* inside *directory*.

    The file name is the MD5 hex digest of the full URL (so URLs that differ
    only in their query string get distinct cache entries) followed by the
    extension of the URL's path component, if it has one.

    Args:
        url: The URL being cached.
        directory: Directory that holds the cache files.

    Returns:
        ``directory`` joined with ``<md5-of-url><extension>``.
    """
    from urllib.parse import urlsplit

    filename = hashlib.md5(url.encode()).hexdigest()
    # Take the extension from the path only; splitting the raw URL would pull
    # any "?query" or "#fragment" into the extension and thus the file name.
    ext = os.path.splitext(urlsplit(url).path)[1]
    return os.path.join(directory, filename + ext)

def download_with_cache(url, cache_dir='sec_cache', headers=None, as_text=True):
    """Fetch *url*, serving it from an on-disk cache when available.

    On a cache miss the response body is downloaded, stored under
    ``cache_dir`` (file name from ``get_cached_filename``) and returned.
    A 0.2 second pause follows every fresh download.

    Args:
        url: The URL to fetch.
        cache_dir: Directory holding the cache files; created if missing.
        headers: Request headers; ``API_HEADERS`` is used when None.
        as_text: If True, return (and cache) the decoded text as ``str``;
            otherwise return (and cache) the raw ``bytes``.

    Returns:
        The content as ``str`` or ``bytes``, or None if the request failed
        (the failure is logged).

    Raises:
        OSError: If the cache file cannot be read or written.
    """
    ensure_directory_exists(cache_dir)
    cached_file = get_cached_filename(url, cache_dir)
    # Text entries are always stored as UTF-8 so the cache neither depends on
    # the platform's default encoding nor fails on characters it cannot encode.
    encoding = 'utf-8' if as_text else None

    if os.path.exists(cached_file):
        logging.info(f"Loading cached file for {url}")
        with open(cached_file, 'r' if as_text else 'rb', encoding=encoding) as file:
            return file.read()

    logging.info(f"Downloading {url}")
    if headers is None:
        headers = API_HEADERS
    try:
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            content = response.text if as_text else response.content
    except requests.exceptions.RequestException as e:
        logging.error(f"Error downloading {url}: {e}")
        return None

    # Write to a scratch file and rename it into place, so an interrupted
    # write can never leave a truncated entry that later calls would trust.
    partial_file = cached_file + '.part'
    with open(partial_file, 'w' if as_text else 'wb', encoding=encoding) as file:
        file.write(content)
    os.replace(partial_file, cached_file)

    time.sleep(0.2)  # Respectful delay after a new download
    return content

def get_cik(ticker):
    """Look up the CIK for *ticker* in the SEC ticker listing.

    The listing is fetched (through the cache) from
    https://www.sec.gov/include/ticker.txt and read as one tab-separated
    ``ticker<TAB>cik`` pair per line. Matching is case-insensitive.

    Args:
        ticker: Ticker symbol to look up.

    Returns:
        The CIK left-padded with zeros to 10 characters, or None if the
        listing could not be downloaded or has no entry for the ticker
        (an error is logged).
    """
    ticker = ticker.upper()
    url = "https://www.sec.gov/include/ticker.txt"
    content = download_with_cache(url, 'sec_data', as_text=True)
    if content:
        for line in content.splitlines():
            fields = line.strip().split('\t')
            # Skip blank or malformed lines instead of failing to unpack them.
            if len(fields) != 2:
                continue
            symbol, cik = fields
            if symbol.upper() == ticker:
                return cik.zfill(10)
    logging.error(f"CIK not found for ticker {ticker}")
    return None

# Output column -> element path relative to a <nonDerivativeTransaction>,
# listed in the order the columns appear in each result row.
_TRANSACTION_FIELD_PATHS = (
    ('transaction_date', 'transactionDate/value'),
    ('transaction_code', 'transactionCoding/transactionCode'),
    ('transaction_acquired_disposed_code', 'transactionAmounts/transactionAcquiredDisposedCode/value'),
    ('transaction_amount', 'transactionAmounts/transactionShares/value'),
    ('transaction_price', 'transactionAmounts/transactionPricePerShare/value'),
    ('securities_title', 'securityTitle/value'),
    ('direct_or_indirect', 'ownershipNature/directOrIndirectOwnership/value'),
)


def _select_ownership_filings(recent_filings, forms_of_interest):
    """Return one metadata dict per recent filing whose form is in *forms_of_interest*."""

    def cell(column, index):
        # The submissions data stores parallel lists; treat a missing column
        # or a short list as an empty value rather than failing.
        try:
            return recent_filings[column][index]
        except (KeyError, IndexError):
            return ''

    filings = []
    for index, accession_number in enumerate(recent_filings.get('accessionNumber', [])):
        form_type = cell('form', index)
        if form_type not in forms_of_interest:
            continue
        filings.append({
            'accessionNumber': accession_number,
            'reportDate': cell('reportDate', index),
            'filingDate': cell('filingDate', index),
            'form': form_type,
            'primaryDocument': cell('primaryDocument', index),
            'primaryDocDescription': cell('primaryDocDescription', index),
        })
    return filings


def _find_xml_href(directory_html):
    """Return the href of the first non-XSL ``.xml`` link in a directory page, or None."""
    soup = BeautifulSoup(directory_html, 'html.parser')
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.endswith('.xml') and 'xsl' not in href.lower():
            return href
    return None


def _parse_ownership_rows(xml_content, filing, cik, ticker):
    """Build one row per (reporting owner, non-derivative transaction) pair of a filing.

    Raises ET.ParseError if *xml_content* is not well-formed XML.
    """
    root = ET.fromstring(xml_content)

    # Transactions do not depend on the owner, so extract them once.
    # findtext(..., default='') yields '' for both missing and empty elements.
    transactions = [
        {column: node.findtext(path, default='') for column, path in _TRANSACTION_FIELD_PATHS}
        for node in root.findall('.//nonDerivativeTransaction')
    ]
    filing_fields = {
        'report_date': filing.get('reportDate', ''),
        'filing_date': filing['filingDate'],
        'form_type': filing['form'],
        'company_cik': cik,
        'company_ticker': ticker.upper(),
        'accession_number': filing['accessionNumber'],
    }

    rows = []
    for owner in root.findall('reportingOwner'):
        owner_fields = {
            'owner_cik': owner.findtext('reportingOwnerId/rptOwnerCik', default=''),
            'owner_name': owner.findtext('reportingOwnerId/rptOwnerName', default=''),
            'owner_title': owner.findtext('reportingOwnerRelationship/officerTitle', default=''),
        }
        # Every owner listed on the filing is paired with every transaction.
        for transaction in transactions:
            rows.append({**owner_fields, **transaction, **filing_fields})
    return rows


def get_insider_trading_data(ticker, num_filings=100):
    """Collect non-derivative transactions from a company's recent Form 3/4/5 filings.

    Resolves the ticker to a CIK, loads the company's submissions JSON from
    data.sec.gov, keeps the filings whose form is '3', '4' or '5' (at most
    ``num_filings`` of them, in the order listed), and for each one downloads
    the filing directory page, locates its XML document and extracts the
    transactions. Filings whose directory page or XML cannot be fetched or
    parsed are logged and skipped.

    Args:
        ticker: Ticker symbol of the company.
        num_filings: Maximum number of ownership filings to process.

    Returns:
        A list of dicts, one per (reporting owner, transaction) pair, which
        may be empty; or None if the CIK lookup, the submissions download or
        its decoding fails, or no ownership filings are listed.
    """
    from urllib.parse import urljoin

    cik = get_cik(ticker)
    if not cik:
        logging.error(f"Failed to get CIK for {ticker}")
        return None

    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    content = download_with_cache(url, cache_dir='sec_data', headers=API_HEADERS, as_text=True)
    if not content:
        logging.error(f"Failed to get submissions data for CIK {cik}")
        return None

    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        # A corrupt or non-JSON payload (e.g. an error page) must not crash the run.
        logging.error(f"Invalid submissions JSON for CIK {cik}: {e}")
        return None

    # Get recent filings
    recent_filings = data.get('filings', {}).get('recent', {})
    if not recent_filings:
        logging.error(f"No recent filings found for CIK {cik}")
        return None

    filings_list = _select_ownership_filings(recent_filings, {'3', '4', '5'})
    if not filings_list:
        logging.error(f"No ownership filings found for CIK {cik}")
        return None

    # Limit to num_filings
    filings_list = filings_list[:num_filings]

    insider_data = []
    for filing in filings_list:
        accession_number = filing['accessionNumber']
        accession_number_no_dashes = accession_number.replace('-', '')
        # Construct the URL to the filing's directory page
        directory_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{accession_number_no_dashes}/"
        directory_content = download_with_cache(directory_url, cache_dir='sec_data', headers=HTML_HEADERS, as_text=True)
        if not directory_content:
            logging.error(f"Failed to download directory page for {directory_url}")
            continue

        xml_filename = _find_xml_href(directory_content)
        if not xml_filename:
            logging.error(f"No XML file found for filing {accession_number}")
            continue

        # urljoin handles root-relative, directory-relative and absolute hrefs;
        # for a root-relative href it yields "https://www.sec.gov" + href.
        xml_url = urljoin(directory_url, xml_filename)
        xml_content = download_with_cache(xml_url, cache_dir='sec_data', headers=HTML_HEADERS, as_text=False)
        if not xml_content:
            logging.error(f"Failed to download XML content for {xml_url}")
            continue

        try:
            insider_data.extend(_parse_ownership_rows(xml_content, filing, cik, ticker))
        except ET.ParseError as e:
            logging.error(f"XML parsing error for {xml_url}: {e}")
            continue

        # Respectful delay
        time.sleep(0.2)

    return insider_data

def save_insider_data_to_csv(ticker, insider_data):
    """Write *insider_data* to a CSV file under the 'insider_trading' folder.

    The file is named ``<ticker-lowercase>-<cik>-insider-trading.csv`` and is
    written without an index column.

    Args:
        ticker: Ticker symbol; used for the CIK lookup and the file name.
        insider_data: Row data accepted by ``pandas.DataFrame``.

    Returns:
        The path of the written CSV file, or None if the CIK lookup fails.
    """
    company_cik = get_cik(ticker)
    if not company_cik:
        logging.error(f"Failed to get CIK for {ticker}")
        return None

    # The output folder is created on first use.
    ensure_directory_exists('insider_trading')

    csv_path = f"insider_trading/{ticker.lower()}-{company_cik}-insider-trading.csv"
    pd.DataFrame(insider_data).to_csv(csv_path, index=False)

    logging.info(f"Saved insider trading data to {csv_path}")
    return csv_path

# Script entry: fetch the ownership filings for one ticker and export them.
ticker = "UPS"  # Replace with your desired ticker
num_filings = 50  # Number of recent Form 3, 4, and 5 filings to process

insider_data = get_insider_trading_data(ticker, num_filings=num_filings)

# None and an empty list both count as "nothing retrieved".
if not insider_data:
    print("Failed to retrieve insider trading data.")
else:
    filename = save_insider_data_to_csv(ticker, insider_data)
    print(f"Insider trading data saved to {filename}")


2024-09-21 08:24:05,396 - INFO - Loading cached file for https://www.sec.gov/include/ticker.txt
2024-09-21 08:24:05,397 - INFO - Loading cached file for https://data.sec.gov/submissions/CIK0001090727.json
2024-09-21 08:24:05,400 - INFO - Downloading https://www.sec.gov/Archives/edgar/data/1090727/000122520824007958/
2024-09-21 08:24:06,284 - INFO - Downloading https://www.sec.gov/Archives/edgar/data/1090727/000122520824007958/doc4.xml
2024-09-21 08:24:07,131 - INFO - Downloading https://www.sec.gov/Archives/edgar/data/1090727/000122520824007957/
2024-09-21 08:24:07,958 - INFO - Downloading https://www.sec.gov/Archives/edgar/data/1090727/000122520824007957/doc4.xml
2024-09-21 08:24:09,111 - INFO - Downloading https://www.sec.gov/Archives/edgar/data/1090727/000122520824007564/
2024-09-21 08:24:09,969 - INFO - Downloading https://www.sec.gov/Archives/edgar/data/1090727/000122520824007564/doc4.xml
2024-09-21 08:24:10,816 - INFO - Downloading https://www.sec.gov/Archives/edgar/data/1090727/

Insider trading data saved to insider_trading/ups-0001090727-insider-trading.csv
