In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
import os
import json
from datetime import datetime

# List of tickers to process
# Single ordered table of (ticker symbol, macrotrends URL slug) pairs.
# Both public lookups below are derived from it so they cannot drift apart.
_TICKER_SLUGS = (
    ("AAPL", "apple"),
    ("MSFT", "microsoft"),
    ("NVDA", "nvidia"),
    ("GOOGL", "alphabet"),
    ("AMZN", "amazon"),
    ("META", "meta-platforms"),
    ("BRK.B", "berkshire-hathaway"),
    ("AVGO", "broadcom"),
    ("TSLA", "tesla"),
    ("LLY", "eli-lilly"),
    ("WMT", "walmart"),
    ("JPM", "jpmorgan-chase"),
    ("V", "visa"),
    ("MA", "mastercard"),
    ("XOM", "exxon-mobil"),
    ("COST", "costco"),
    ("UNH", "unitedhealth-group"),
    ("HD", "home-depot"),
    ("PG", "procter-gamble"),
    ("JNJ", "johnson-johnson"),
    ("ABBV", "abbvie"),
    ("CRM", "salesforce"),
    ("BAC", "bank-of-america"),
    ("ORCL", "oracle"),
    ("MRK", "merck"),
    ("CVX", "chevron"),
    ("WFC", "wells-fargo"),
    ("KO", "coca-cola"),
    ("CSCO", "cisco-systems"),
    ("ACN", "accenture"),
)

# Tickers in processing order
tickers = [symbol for symbol, _slug in _TICKER_SLUGS]

# Map each ticker to the company slug used in the chart URLs
ticker_to_company = dict(_TICKER_SLUGS)

# Function to get company name from ticker
def get_company_name(ticker):
    """Return the URL slug registered for ``ticker``.

    Tickers missing from ``ticker_to_company`` fall back to the
    lower-cased ticker symbol itself.
    """
    fallback_slug = ticker.lower()
    if ticker in ticker_to_company:
        return ticker_to_company[ticker]
    return fallback_slug

# Create a session for better performance and cookie handling
# Browser-like headers sent with every request made through the session
_BROWSER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
}

session = requests.Session()
session.headers.update(_BROWSER_HEADERS)

# Per-ticker progress CSVs are written under ./data
os.makedirs('data', exist_ok=True)

# Function to save progress
def save_progress(ticker, metric_type, data):
    """Write ``data`` to ``data/<ticker>_<metric>.csv``.

    Nothing is written when ``data`` is ``None`` or an empty DataFrame.

    Returns:
        ``True`` if a file was written, ``False`` otherwise.
    """
    if data is None or data.empty:
        return False

    csv_path = f"data/{ticker}_{metric_type.lower()}.csv"
    data.to_csv(csv_path, index=False)
    row_count = len(data)
    print(f"  Saved {row_count} {metric_type} data points for {ticker} to {csv_path}")
    return True

# Function to load progress
def load_progress(ticker, metric_type):
    """Read back ``data/<ticker>_<metric>.csv`` if it exists.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing or
        cannot be read (the read error is printed, not raised).
    """
    csv_path = f"data/{ticker}_{metric_type.lower()}.csv"
    if not os.path.exists(csv_path):
        return None

    try:
        cached = pd.read_csv(csv_path)
        print(f"  Loaded {len(cached)} {metric_type} data points for {ticker} from {csv_path}")
        return cached
    except Exception as err:
        # Best effort: an unreadable progress file is treated as "no progress"
        print(f"  Error loading {csv_path}: {err}")
    return None

# Function for exponential backoff
def make_request_with_backoff(url, max_retries=5, initial_delay=5, timeout=30):
    """Fetch ``url`` through the shared session, backing off exponentially.

    A 429 response or a transport-level failure counts as one retry: the
    function sleeps for the current delay, doubles it, and tries again.

    Args:
        url: Address to GET.
        max_retries: Number of rate-limited or failed attempts tolerated
            before giving up.
        initial_delay: Seconds to sleep after the first such attempt; the
            delay doubles after every subsequent one.
        timeout: Seconds passed to ``session.get`` so a stalled server
            cannot block the run indefinitely.

    Returns:
        The response for a 200 or for any other non-429 status (callers
        must check ``status_code`` themselves), or ``None`` once
        ``max_retries`` attempts have been used up.
    """
    retries = 0
    delay = initial_delay

    while retries < max_retries:
        try:
            response = session.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Only network/transport failures (including timeouts) are
            # retried; unrelated programming errors propagate immediately.
            print(f"  Request error: {e}")
        else:
            if response.status_code == 200:
                return response
            if response.status_code != 429:
                # Other HTTP errors are not retried; hand them to the caller
                print(f"  HTTP error: {response.status_code}")
                return response
            print(f"  Rate limited (429). Backing off for {delay} seconds...")

        time.sleep(delay)
        delay *= 2  # Exponential backoff
        retries += 1

    print("  Max retries exceeded")
    return None

# Function to scrape PE ratio data
def _scrape_metric_table(ticker, *, metric_type, url_slug, title_pattern,
                         table_label, value_columns):
    """Scrape one macrotrends metric table for ``ticker``.

    Shared implementation behind ``scrape_pe_ratio`` and ``scrape_roe``,
    which differ only in the constants passed here.

    Args:
        ticker: Symbol used in the URL, the output rows and the progress file.
        metric_type: Short label ('PE' / 'ROE') used for progress files and
            log messages.
        url_slug: Last path segment of the chart URL.
        title_pattern: Regex that a ``<th>`` of the table must match.
        table_label: Human-readable table name for the mismatch message.
        value_columns: Output column names for the 2nd, 3rd and 4th cells of
            each row; the last one is the metric itself.

    Returns:
        A DataFrame (loaded from the progress file, or freshly scraped and
        possibly empty), or ``None`` if the page could not be fetched or
        parsed.
    """
    # First check if we already have data for this ticker
    existing_data = load_progress(ticker, metric_type)
    if existing_data is not None:
        return existing_data

    company_name = get_company_name(ticker)
    url = f"https://www.macrotrends.net/stocks/charts/{ticker}/{company_name}/{url_slug}"

    response = make_request_with_backoff(url)
    if response is None or response.status_code != 200:
        print(f"  Failed to fetch {metric_type} data for {ticker}")
        return None

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for div with style-1 ID
        div = soup.find('div', id='style-1')
        if not div:
            print(f"  Could not find style-1 div for {ticker} {metric_type} data")
            return None

        # Find the table within this div
        table = div.find('table', class_='table')
        if not table:
            print(f"  Could not find table in style-1 div for {ticker} {metric_type} data")
            return None

        # Check that this is the expected table
        if not table.find('th', string=re.compile(title_pattern)):
            print(f"  Table does not appear to be {table_label} table for {ticker}")
            return None

        # Extract data from the table
        records = []
        for row in table.find_all('tr')[2:]:  # Skip the first two header rows
            cells = row.find_all('td')
            if len(cells) < 4:
                continue
            date, *values = (cell.text.strip() for cell in cells[:4])
            metric_value = values[-1]
            # Only include rows with a valid metric value
            if metric_value and metric_value != "N/A":
                records.append({
                    'Ticker': ticker,
                    'Date': date,
                    **dict(zip(value_columns, values)),
                })

        result_df = pd.DataFrame(records)
        if not result_df.empty:
            save_progress(ticker, metric_type, result_df)
        return result_df

    except Exception as e:
        # Best effort per ticker: a parsing failure must not abort the run
        print(f"  Error scraping {metric_type} data for {ticker}: {e}")
        return None


# Function to scrape PE ratio data
def scrape_pe_ratio(ticker):
    """Return PE ratio history for ``ticker``.

    Rows carry the columns Ticker, Date, Stock_Price, EPS and PE_Ratio.
    Returns ``None`` when the page cannot be fetched or parsed.
    """
    return _scrape_metric_table(
        ticker,
        metric_type='PE',
        url_slug='pe-ratio',
        title_pattern='PE Ratio',
        table_label='PE ratio',
        value_columns=('Stock_Price', 'EPS', 'PE_Ratio'),
    )


# Function to scrape ROE data
def scrape_roe(ticker):
    """Return return-on-equity history for ``ticker``.

    Rows carry the columns Ticker, Date, Net_Income, Shareholder_Equity
    and ROE. Returns ``None`` when the page cannot be fetched or parsed.
    """
    return _scrape_metric_table(
        ticker,
        metric_type='ROE',
        url_slug='roe',
        title_pattern='Return on Equity',
        table_label='ROE',
        value_columns=('Net_Income', 'Shareholder_Equity', 'ROE'),
    )

# Function to process tickers in batches
def process_in_batches(tickers_list, batch_size=5, batch_delay=60):
    """Scrape PE and ROE data for every ticker, pausing between batches.

    Args:
        tickers_list: Ticker symbols to process, in order.
        batch_size: Number of tickers per batch.
        batch_delay: Seconds to wait between consecutive batches.

    Returns:
        A ``(pe_frames, roe_frames)`` tuple of lists holding one non-empty
        DataFrame per ticker for which data was obtained.
    """
    pe_frames, roe_frames = [], []

    def record(label, frame, sink, ticker):
        # Keep non-empty results and report the outcome either way
        if frame is None or frame.empty:
            print(f"  No {label} data found for {ticker}")
        else:
            sink.append(frame)
            print(f"  Found {len(frame)} {label} data points for {ticker}")

    # Break the list into batches
    batch_starts = range(0, len(tickers_list), batch_size)
    batches = [tickers_list[start:start + batch_size] for start in batch_starts]
    total_batches = len(batches)

    for batch_num, batch in enumerate(batches, 1):
        print(f"Processing batch {batch_num}/{total_batches}")

        for ticker in batch:
            print(f"Processing {ticker}...")

            # NOTE(review): both pauses below run even when the scraper
            # returned cached data without making a request.
            record('PE', scrape_pe_ratio(ticker), pe_frames, ticker)
            # Random delay between the two requests for the same ticker
            time.sleep(random.uniform(5, 10))

            record('ROE', scrape_roe(ticker), roe_frames, ticker)
            # Random delay before moving on to the next ticker
            time.sleep(random.uniform(10, 15))

        if batch_num < total_batches:
            print(f"Batch {batch_num} completed. Waiting {batch_delay} seconds before next batch...")
            time.sleep(batch_delay)

    return pe_frames, roe_frames

# Function to merge all data files
def merge_progress_files():
    """Combine the per-ticker progress CSVs under ``data/`` into two frames.

    Files ending in ``_pe.csv`` feed the PE frame and files ending in
    ``_roe.csv`` feed the ROE frame. Files are read in sorted name order so
    the merged row order is the same on every run. A file that cannot be
    read is reported and skipped.

    Returns:
        A ``(pe_data, roe_data)`` tuple of DataFrames; either is an empty
        DataFrame when no matching file could be read.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducibility
    filenames = sorted(os.listdir('data'))

    def _concat_matching(suffix):
        # Read every file with the given suffix and stack the results
        frames = []
        for file in filenames:
            if not file.endswith(suffix):
                continue
            try:
                frames.append(pd.read_csv(f"data/{file}"))
            except Exception as e:
                # Best effort: one bad progress file must not block the merge
                print(f"Error reading {file}: {e}")
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    return _concat_matching('_pe.csv'), _concat_matching('_roe.csv')

# Main execution
def main():
    """Scrape all tickers, write the combined CSVs and report the runtime.

    Falls back to merging the progress files under ``data/`` when the
    batch run produced no PE frames or no ROE frames.
    """
    start_time = datetime.now()
    print(f"Script started at: {start_time}")

    # Process tickers in batches
    pe_dataframes, roe_dataframes = process_in_batches(tickers, batch_size=5, batch_delay=120)

    if pe_dataframes and roe_dataframes:
        # Combine all data collected in this run
        pe_data = pd.concat(pe_dataframes, ignore_index=True)
        roe_data = pd.concat(roe_dataframes, ignore_index=True)
    else:
        # Alternatively, merge from progress files
        print("Merging progress files...")
        pe_data, roe_data = merge_progress_files()

    # Save to CSV files: (long label, short label, frame, output path)
    outputs = (
        ("PE ratio", "PE", pe_data, 'pe_ratios.csv'),
        ("ROE", "ROE", roe_data, 'roe_data.csv'),
    )
    for long_label, short_label, frame, out_path in outputs:
        if frame.empty:
            print(f"No {long_label} data was collected")
            continue
        frame.to_csv(out_path, index=False)
        print(f"{long_label} data saved to {out_path}")
        print(f"Total tickers with {short_label} data: {frame['Ticker'].nunique()}")
        print(f"Total {short_label} data points: {len(frame)}")

    end_time = datetime.now()
    print(f"Script completed at: {end_time}")
    print(f"Total runtime: {end_time - start_time}")

if __name__ == "__main__":
    main()

Script started at: 2025-04-22 13:41:11.973935
Processing batch 1/6
Processing AAPL...
  Saved 62 PE data points for AAPL to data/AAPL_pe.csv
  Found 62 PE data points for AAPL
  Loaded 61 ROE data points for AAPL from data/AAPL_roe.csv
  Found 61 ROE data points for AAPL
Processing MSFT...
  Loaded 62 PE data points for MSFT from data/MSFT_pe.csv
  Found 62 PE data points for MSFT
  Loaded 61 ROE data points for MSFT from data/MSFT_roe.csv
  Found 61 ROE data points for MSFT
Processing NVDA...
  Loaded 63 PE data points for NVDA from data/NVDA_pe.csv
  Found 63 PE data points for NVDA
  Loaded 62 ROE data points for NVDA from data/NVDA_roe.csv
  Found 62 ROE data points for NVDA
Processing GOOGL...
  Loaded 62 PE data points for GOOGL from data/GOOGL_pe.csv
  Found 62 PE data points for GOOGL
  Loaded 61 ROE data points for GOOGL from data/GOOGL_roe.csv
  Found 61 ROE data points for GOOGL
Processing AMZN...
  Saved 62 PE data points for AMZN to data/AMZN_pe.csv
  Found 62 PE data points for AMZN

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import re

# Create a session for better performance and cookie handling
# Browser-like headers sent with every request made through the session
_BROWSER_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
}

session = requests.Session()
session.headers.update(_BROWSER_HEADERS)

# Per-ticker progress CSVs are written under ./data
os.makedirs('data', exist_ok=True)

# Function to save progress
def save_progress(ticker, metric_type, data):
    """Write ``data`` to ``data/<ticker>_<metric>.csv``.

    Nothing is written when ``data`` is ``None`` or an empty DataFrame.

    Returns:
        ``True`` if a file was written, ``False`` otherwise.
    """
    if data is None or data.empty:
        return False

    csv_path = f"data/{ticker}_{metric_type.lower()}.csv"
    data.to_csv(csv_path, index=False)
    row_count = len(data)
    print(f"  Saved {row_count} {metric_type} data points for {ticker} to {csv_path}")
    return True

# Function for exponential backoff
def make_request_with_backoff(url, max_retries=5, initial_delay=5, timeout=30):
    """Fetch ``url`` through the shared session, backing off exponentially.

    A 429 response or a transport-level failure counts as one retry: the
    function sleeps for the current delay, doubles it, and tries again.

    Args:
        url: Address to GET.
        max_retries: Number of rate-limited or failed attempts tolerated
            before giving up.
        initial_delay: Seconds to sleep after the first such attempt; the
            delay doubles after every subsequent one.
        timeout: Seconds passed to ``session.get`` so a stalled server
            cannot block the run indefinitely.

    Returns:
        The response for a 200 or for any other non-429 status (callers
        must check ``status_code`` themselves), or ``None`` once
        ``max_retries`` attempts have been used up.
    """
    retries = 0
    delay = initial_delay

    while retries < max_retries:
        try:
            response = session.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Only network/transport failures (including timeouts) are
            # retried; unrelated programming errors propagate immediately.
            print(f"  Request error: {e}")
        else:
            if response.status_code == 200:
                return response
            if response.status_code != 429:
                # Other HTTP errors are not retried; hand them to the caller
                print(f"  HTTP error: {response.status_code}")
                return response
            print(f"  Rate limited (429). Backing off for {delay} seconds...")

        time.sleep(delay)
        delay *= 2  # Exponential backoff
        retries += 1

    print("  Max retries exceeded")
    return None

# Function to scrape PE ratio data for BRK.B
def _scrape_brkb_url(url, ticker, *, metric_type, title_pattern, table_label,
                     value_columns, output_file):
    """Fetch one macrotrends URL and extract its metric table.

    Shared per-URL implementation behind ``scrape_pe_for_brkb`` and
    ``scrape_roe_for_brkb``.

    Args:
        url: Page to fetch.
        ticker: Label written to the 'Ticker' column and the progress file.
        metric_type: Short label ('PE' / 'ROE') for files and messages.
        title_pattern: Regex that a ``<th>`` of the table must match.
        table_label: Human-readable table name for the mismatch message.
        value_columns: Output column names for the 2nd, 3rd and 4th cells of
            each row; the last one is the metric itself.
        output_file: Dedicated CSV written in addition to the progress file.

    Returns:
        A non-empty DataFrame on success, otherwise ``None`` (the reason is
        printed).
    """
    print(f"  Trying URL: {url}")
    response = make_request_with_backoff(url)

    if response is None or response.status_code != 200:
        print(f"  Failed to fetch {metric_type} data for {ticker} using {url}")
        return None

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for div with style-1 ID
        div = soup.find('div', id='style-1')
        if not div:
            print(f"  Could not find style-1 div for {ticker} {metric_type} data")
            return None

        # Find the table within this div
        table = div.find('table', class_='table')
        if not table:
            print(f"  Could not find table in style-1 div for {ticker} {metric_type} data")
            return None

        # Check that this is the expected table
        if not table.find('th', string=re.compile(title_pattern)):
            print(f"  Table does not appear to be {table_label} table for {ticker}")
            return None

        # Extract data from the table
        records = []
        for row in table.find_all('tr')[2:]:  # Skip the first two header rows
            cells = row.find_all('td')
            if len(cells) < 4:
                continue
            date, *values = (cell.text.strip() for cell in cells[:4])
            metric_value = values[-1]
            # Only include rows with a valid metric value
            if metric_value and metric_value != "N/A":
                records.append({
                    'Ticker': ticker,
                    'Date': date,
                    **dict(zip(value_columns, values)),
                })

        result_df = pd.DataFrame(records)
        if result_df.empty:
            print(f"  No {metric_type} data found for {ticker} using {url}")
            return None

        save_progress(ticker, metric_type, result_df)
        # Also save to a dedicated file
        result_df.to_csv(output_file, index=False)
        print(f"  Successfully scraped {len(result_df)} {metric_type} data points for {ticker}")
        print(f"  Data saved to {output_file}")
        return result_df

    except Exception as e:
        # Best effort: report the failure so the caller can try another URL
        print(f"  Error scraping {metric_type} data for {ticker} using {url}: {e}")
        return None


# Function to scrape PE ratio data for BRK.B
def scrape_pe_for_brkb():
    """Scrape Berkshire Hathaway PE ratio history from its fixed URL.

    Returns:
        A DataFrame with columns Ticker, Date, Stock_Price, EPS and
        PE_Ratio, or ``None`` on any failure.
    """
    # NOTE(review): rows and the data/ progress file are labelled "BRK-B"
    # while the URL symbol is "BRK.B" — confirm which label consumers expect.
    ticker = "BRK-B"

    print(f"Attempting to scrape PE ratio data for {ticker}...")

    # Use the specific URL provided
    url = "https://www.macrotrends.net/stocks/charts/BRK.B/berkshire-hathaway/pe-ratio"

    return _scrape_brkb_url(
        url,
        ticker,
        metric_type='PE',
        title_pattern='PE Ratio',
        table_label='PE ratio',
        value_columns=('Stock_Price', 'EPS', 'PE_Ratio'),
        output_file='brk_b_pe_data.csv',
    )


# Function to scrape ROE data for BRK.B
def scrape_roe_for_brkb():
    """Scrape Berkshire Hathaway ROE history, trying two URL spellings.

    Returns:
        A DataFrame with columns Ticker, Date, Net_Income,
        Shareholder_Equity and ROE from the first URL that yields data,
        or ``None`` when every URL fails.
    """
    ticker = "BRK-B"
    company_name = "berkshire-hathaway"

    print(f"Attempting to scrape ROE data for {ticker}...")

    # Try both URL formats - with dash and with dot
    urls = [
        f"https://www.macrotrends.net/stocks/charts/{ticker}/{company_name}/roe",
        f"https://www.macrotrends.net/stocks/charts/BRK.B/{company_name}/roe"
    ]

    for url in urls:
        result_df = _scrape_brkb_url(
            url,
            ticker,
            metric_type='ROE',
            title_pattern='Return on Equity',
            table_label='ROE',
            value_columns=('Net_Income', 'Shareholder_Equity', 'ROE'),
            output_file='brk_b_roe_data.csv',
        )
        if result_df is not None:
            return result_df

    print(f"  Could not retrieve ROE data for {ticker} after trying all URLs")
    return None

# Main execution
def main():
    """Fetch BRK.B PE and ROE data and print a success/failure summary."""
    print("Starting BRK.B data retrieval...")

    pe_data = scrape_pe_for_brkb()
    # Wait 10 seconds between the two scrapes to avoid rate limiting
    time.sleep(10)
    roe_data = scrape_roe_for_brkb()

    # Summarize results
    print("\nResults summary:")

    for label, frame in (("PE", pe_data), ("ROE", roe_data)):
        if frame is None or frame.empty:
            print(f"❌ Failed to retrieve {label} data for BRK.B")
        else:
            print(f"✅ Successfully retrieved {len(frame)} {label} data points for BRK.B")

if __name__ == "__main__":
    main()