In [None]:
import pandas as pd
import requests
import os
import time
import hashlib
import logging
import json
import datetime

# Configure logging: root logger at INFO, each record formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Headers for SEC API requests. download_with_cache() falls back to this
# dict when the caller does not pass its own headers.
# NOTE(review): the User-Agent value is a placeholder; SEC presumably expects
# a real name/e-mail here -- confirm against the SEC fair-access guidance
# before running against the live service.
API_HEADERS = {
    'User-Agent': 'Your Name yourname@example.com',  # Replace with your information
    'Accept': 'application/json',
}

def ensure_directory_exists(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True removes the check-then-create race of
    # os.path.exists() followed by os.makedirs().
    os.makedirs(directory, exist_ok=True)

def get_cached_filename(url, directory):
    """Return the cache file path for ``url`` inside ``directory``.

    The file name is the hexadecimal MD5 digest of the encoded URL followed
    by ``.json``, so a given URL always maps to the same path.
    """
    url_digest = hashlib.md5(url.encode())
    file_name = url_digest.hexdigest() + '.json'
    return os.path.join(directory, file_name)

def download_with_cache(url, cache_dir='sec_cache', headers=None):
    """Return the body of ``url`` as text, using an on-disk cache.

    If a cache file for ``url`` already exists in ``cache_dir`` its content
    is returned without any network access. Otherwise the URL is fetched,
    the response text is stored in the cache and then returned.

    Args:
        url: URL to fetch.
        cache_dir: Directory holding the cache files; created if missing.
        headers: HTTP headers for the request. Defaults to ``API_HEADERS``.

    Returns:
        The response text, or ``None`` if the HTTP request failed.

    Raises:
        OSError: If the cache file cannot be read or written.
    """
    ensure_directory_exists(cache_dir)
    cached_file = get_cached_filename(url, cache_dir)

    if os.path.exists(cached_file):
        logging.info(f"Loading cached file for {url}")
        with open(cached_file, 'r', encoding='utf-8') as file:
            return file.read()

    if headers is None:
        headers = API_HEADERS

    logging.info(f"Downloading {url}")
    try:
        # The context manager closes the session (and its pooled
        # connections) instead of leaking one per download.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            content = response.text
    except requests.exceptions.RequestException as e:
        logging.error(f"Error downloading {url}: {e}")
        return None

    # Write to a temporary file and rename it into place so an interrupted
    # write can never leave a truncated file that later looks like a valid
    # cache entry.
    temp_file = f"{cached_file}.{os.getpid()}.tmp"
    try:
        with open(temp_file, 'w', encoding='utf-8') as file:
            file.write(content)
        os.replace(temp_file, cached_file)
    except OSError:
        try:
            os.remove(temp_file)
        except OSError:
            pass  # Nothing to clean up, or cleanup itself failed.
        raise

    time.sleep(0.2)  # Respectful delay after a new download
    return content

def get_cik(ticker):
    """Look up the SEC CIK for ``ticker`` in the SEC ticker table.

    The table is fetched through ``download_with_cache`` and is read as
    tab-separated ``ticker<TAB>cik`` lines. Matching is case-insensitive.

    Args:
        ticker: Ticker symbol to look up.

    Returns:
        The CIK as a string left-padded with zeros to 10 characters, or
        ``None`` if the ticker is not a string, the table could not be
        downloaded, or the ticker is not listed.
    """
    if not isinstance(ticker, str):
        # E.g. NaN from a blank cell in the components file.
        logging.error(f"CIK not found for ticker {ticker}")
        return None

    ticker = ticker.strip().upper()
    url = "https://www.sec.gov/include/ticker.txt"
    content = download_with_cache(url, 'sec_data')
    if content:
        for line in content.splitlines():
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Blank or malformed line: skip it rather than let the
                # tuple unpacking raise ValueError and abort the lookup.
                continue
            symbol, cik = fields
            if symbol.strip().upper() == ticker:
                return cik.strip().zfill(10)
    logging.error(f"CIK not found for ticker {ticker}")
    return None

def get_financial_data(ticker, concepts):
    """Fetch the last 20 years of us-gaap facts for ``ticker``.

    For each concept the SEC ``companyconcept`` JSON is downloaded (through
    the cache) and every fact whose ``end`` date is within the last 20
    calendar years is kept.

    Args:
        ticker: Ticker symbol of the company.
        concepts: Iterable of us-gaap concept names to fetch.

    Returns:
        A dict mapping each successfully fetched concept to a list of fact
        dicts with the keys ``date``, ``value``, ``unit``, ``concept``,
        ``form``, ``filed``, ``fy`` and ``fp``, sorted by ``date``
        descending. Concepts whose download or JSON parsing fails are
        omitted. Returns ``None`` if the CIK cannot be resolved.
    """
    cik = get_cik(ticker)
    if not cik:
        logging.error(f"Failed to get CIK for {ticker}")
        return None

    # Cutoff is the same calendar day 20 years ago. A fixed 365 * 20 days
    # would ignore leap days and land a few days short of 20 years.
    today = datetime.date.today()
    try:
        cutoff = today.replace(year=today.year - 20)
    except ValueError:
        # Today is Feb 29 and the target year is not a leap year.
        cutoff = today.replace(year=today.year - 20, day=28)
    cutoff_str = cutoff.isoformat()

    financial_data = {}
    for concept in concepts:
        url = f"https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/{concept}.json"
        content = download_with_cache(url, cache_dir='sec_data', headers=API_HEADERS)
        if not content:
            logging.error(f"Failed to get data for {ticker} and concept {concept}")
            continue

        try:
            data = json.loads(content)
        except json.JSONDecodeError as e:
            # One unreadable response must not abort the remaining concepts.
            logging.error(f"Invalid JSON for {ticker} and concept {concept}: {e}")
            continue

        concept_data = []
        for unit, facts in data.get('units', {}).items():
            for fact in facts:
                fact_end_date = fact.get('end')
                # ISO dates (YYYY-MM-DD) compare correctly as plain strings.
                if fact_end_date and fact_end_date >= cutoff_str:
                    concept_data.append({
                        'date': fact_end_date,
                        'value': fact.get('val'),
                        'unit': unit,
                        'concept': concept,
                        'form': fact.get('form'),
                        'filed': fact.get('filed'),
                        'fy': fact.get('fy'),
                        'fp': fact.get('fp'),
                    })
        # Newest period first
        concept_data.sort(key=lambda x: x['date'], reverse=True)
        financial_data[concept] = concept_data
    return financial_data

def save_financial_data_to_csv(ticker, financial_data, concepts):
    """Write the facts in ``financial_data`` to a per-ticker CSV file.

    The CSV has one row per distinct fact date (newest first). Each row
    holds, for every concept in ``concepts``, the value and unit of the
    first fact listed for that date, followed by ``form``, ``filed``,
    ``fy`` and ``fp``. Those four metadata columns are taken from the first
    concept (in ``concepts`` order) that has a fact for the date, and are
    empty only when none of the concepts has one.

    Args:
        ticker: Ticker symbol; used for the CIK lookup and the file name.
        financial_data: Mapping of concept name to a list of fact dicts as
            produced by ``get_financial_data``.
        concepts: Concept names that become CSV columns, in order.

    Returns:
        The path of the written CSV file, or ``None`` if the CIK cannot be
        resolved.
    """
    cik = get_cik(ticker)
    if not cik:
        logging.error(f"Failed to get CIK for {ticker}")
        return None

    # Create 'fundamentals' folder if it doesn't exist
    ensure_directory_exists('fundamentals')

    # Index the first fact per date for every concept so each cell is a
    # dict lookup instead of a scan over the concept's whole fact list.
    facts_by_concept = {}
    for concept, items in financial_data.items():
        first_by_date = {}
        for item in items:
            first_by_date.setdefault(item['date'], item)
        facts_by_concept[concept] = first_by_date

    all_dates = sorted(
        {date for first_by_date in facts_by_concept.values() for date in first_by_date},
        reverse=True,
    )

    metadata_keys = ('form', 'filed', 'fy', 'fp')
    rows = []
    for date in all_dates:
        row = {'date': date}
        # Initialised once per row (not per concept) so a later concept
        # without data cannot wipe metadata found for an earlier one, and
        # an empty ``concepts`` list cannot leave these names undefined.
        metadata = dict.fromkeys(metadata_keys)
        metadata_found = False
        for concept in concepts:
            item = facts_by_concept.get(concept, {}).get(date)
            if item is None:
                row[concept] = None
                row[f'{concept}_unit'] = None
                continue
            row[concept] = item['value']
            row[f'{concept}_unit'] = item['unit']
            if not metadata_found:
                metadata = {key: item.get(key) for key in metadata_keys}
                metadata_found = True
        row.update(metadata)
        rows.append(row)

    # Create DataFrame
    df = pd.DataFrame(rows)

    # Save to CSV
    filename = f"fundamentals/{ticker.lower()}-{cik}-EPS.csv"
    df.to_csv(filename, index=False)
    logging.info(f"Saved financial data to {filename}")
    return filename

# Read the IWM components file
iwm_df = pd.read_csv('data/IWM_Components.txt', sep='\t')

# Save the IWM components to a CSV file
iwm_df.to_csv('data/iwm_components.csv', index=False)
logging.info("Saved IWM components to data/iwm_components.csv")

# Extract the list of tickers. Blank cells are read as NaN and are dropped,
# surrounding whitespace is removed, and duplicates are collapsed (keeping
# first-seen order) so no ticker is fetched twice.
tickers = list(dict.fromkeys(
    symbol
    for symbol in iwm_df['Ticker'].dropna().astype(str).str.strip()
    if symbol
))

# Define the financial concepts to fetch
concepts = [
    'EarningsPerShareBasic',
    'EarningsPerShareDiluted',
    'NetIncomeLoss',
    'OperatingIncomeLoss',
    'GrossProfit',
    'Revenues'  # Also known as SalesRevenueNet for some companies
]

# Create 'fundamentals' folder if it doesn't exist
ensure_directory_exists('fundamentals')

# Iterate over tickers
for ticker in tickers:
    logging.info(f"Processing {ticker}")
    try:
        financial_data = get_financial_data(ticker, concepts)
        if financial_data:
            filename = save_financial_data_to_csv(ticker, financial_data, concepts)
            logging.info(f"Financial data for {ticker} saved to {filename}")
        else:
            logging.warning(f"No financial data found for {ticker}")
    except Exception as e:
        # Top-level boundary: keep going with the next ticker, but record
        # the traceback so the failure can be diagnosed.
        logging.error(f"An error occurred while processing {ticker}: {e}", exc_info=True)
    # Respectful delay between tickers
    time.sleep(0.5)