In [7]:
import pandas as pd
import yfinance as yf
from datetime import datetime
import time
from tqdm import tqdm

In [8]:
def fetch_stock_data(ticker, start="2010-01-01", end=None):
    """Download the price history for one ticker via yfinance.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        start: Start date string forwarded to ``Ticker.history``. Defaults to
            ``"2010-01-01"``.
        end: End date string forwarded to ``Ticker.history``. When ``None``
            (the default) today's date, formatted as ``YYYY-MM-DD``, is used.

    Returns:
        A DataFrame holding the history with its index moved into a regular
        column and an added ``Ticker`` column. If anything raises while
        fetching, the error is printed and an empty DataFrame is returned so
        that one bad symbol does not abort a long-running download.
    """
    if end is None:
        # Resolve the default at call time so long-running sessions use the current date.
        end = datetime.now().strftime("%Y-%m-%d")

    try:
        history = yf.Ticker(ticker).history(start=start, end=end)
        # Turn the index into a column so it survives concat(ignore_index=True) downstream.
        history = history.reset_index()
        history['Ticker'] = ticker
        return history
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the batch going.
        print(f"Error fetching data for {ticker}: {str(e)}")
        return pd.DataFrame()

def process_batch(tickers, batch_size=1800, time_limit=3600):
    """Fetch history for every ticker in ``tickers``, throttling between windows.

    After every ``batch_size`` tickers the function sleeps until ``time_limit``
    seconds have passed since the current window started, then opens a new
    window (presumably to stay under a request-rate limit; confirm against the
    data provider's policy).

    Args:
        tickers: Iterable of ticker symbols.
        batch_size: Number of tickers processed per throttling window.
        time_limit: Minimum duration of one window, in seconds.

    Returns:
        One DataFrame with all non-empty per-ticker frames concatenated under a
        fresh integer index. An empty DataFrame is returned when ``tickers`` is
        empty or no ticker produced any rows.
    """
    frames = []
    window_start = time.time()

    for i, ticker in enumerate(tqdm(tickers)):
        data = fetch_stock_data(ticker)
        # Failed or delisted tickers come back empty; they add no rows, so skip them.
        if not data.empty:
            frames.append(data)

        if (i + 1) % batch_size == 0:
            remaining = time_limit - (time.time() - window_start)
            if remaining > 0:
                time.sleep(remaining)
            window_start = time.time()

    # pd.concat raises ValueError on an empty list, so handle "nothing fetched" explicitly.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def main(input_csv='~/Small-Cap-Scout/raw_data/cik_ticker_pairs.csv', batch_size=1800):
    """Download price history for every ticker listed in a CSV and save it.

    Args:
        input_csv: Path of a CSV file with a ``TICKER`` column.
        batch_size: Number of tickers per batch. The default is slightly under
            2000 to account for potential errors. The same value is forwarded
            to ``process_batch`` so its throttling window matches the batches
            built here.

    Side effects:
        After each batch, writes the cumulative results collected so far to
        ``yahoo_stock_data_since_2010_batch_<n>.csv`` as a checkpoint, and at
        the end writes ``yahoo_stock_data_since_2010_complete.csv``. All files
        go to the current working directory. Progress is printed.
    """
    # Read the CSV file. Tickers are kept as strings and only blank cells count
    # as missing, so a literal symbol such as "NA" is not parsed as NaN.
    df = pd.read_csv(
        input_csv,
        dtype={'TICKER': str},
        keep_default_na=False,
        na_values=[''],
    )

    # Get the list of tickers: drop missing/blank entries and repeated symbols
    # (order preserved) so no symbol is requested twice.
    tickers = (
        df['TICKER']
        .dropna()
        .str.strip()
        .loc[lambda s: s != '']
        .drop_duplicates()
        .tolist()
    )

    # Ceiling division: an exact multiple of batch_size must not report an extra batch.
    total_batches = -(-len(tickers) // batch_size)

    frames = []
    all_data = pd.DataFrame()

    for batch_number, offset in enumerate(range(0, len(tickers), batch_size), start=1):
        batch = tickers[offset:offset + batch_size]
        print(f"Processing batch {batch_number} of {total_batches}")
        batch_data = process_batch(batch, batch_size=batch_size)
        if not batch_data.empty:
            frames.append(batch_data)
        if frames:
            all_data = pd.concat(frames, ignore_index=True)

        # Save intermediate results (cumulative, so the latest file is a full checkpoint)
        all_data.to_csv(f'yahoo_stock_data_since_2010_batch_{batch_number}.csv', index=False)

    # Save final results
    all_data.to_csv('yahoo_stock_data_since_2010_complete.csv', index=False)

    print("Data collection complete. Final results saved to yahoo_stock_data_since_2010_complete.csv")

# Entry-point guard: start the full download only when this code is executed
# directly (__name__ == "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Processing batch 1 of 4


  0%|          | 3/1800 [00:01<10:06,  2.96it/s]$ACET.Q: possibly delisted; no timezone found
  0%|          | 8/1800 [00:03<09:00,  3.32it/s]$BYI: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  1%|          | 17/1800 [00:05<06:17,  4.73it/s]$IDSA: possibly delisted; no timezone found
  2%|▏         | 35/1800 [00:09<06:02,  4.87it/s]$SSI: possibly delisted; no timezone found
  2%|▏         | 38/1800 [00:10<08:40,  3.38it/s]$TREC: possibly delisted; no timezone found
  3%|▎         | 52/1800 [00:14<06:30,  4.48it/s]$AVP: possibly delisted; no timezone found
  3%|▎         | 56/1800 [00:15<07:48,  3.72it/s]$PTVC.B: possibly delisted; no timezone found
  3%|▎         | 58/1800 [00:16<10:45,  2.70it/s]$BCR: possibly delisted; no price data found  (1d 2010-01-01 -> 2024-09-05)
  3%|▎         | 61/1800 [00:17<07:36,  3.81it/s]$ESTE: possibly delisted; no timezone found
  4%|▍         | 73/1800 [00:20<07:05,  4.06it/s]$BWL A: possibly delisted; no timezone found
  4%|