# Fetch the data from yfinance

In [39]:
import pandas as pd
import yfinance as yf
import time
import os
import random
import tqdm

def fetch_data(ticker, start='2015-01-01', end='2025-01-01', wait=True):
    if os.path.exists(f'data/raw/{ticker}.csv'):
        print(f'{ticker} already exists')
        return pd.read_csv(f'data/raw/{ticker}.csv')
    else:
        try:
            data = yf.download(ticker, start=start, end=end, auto_adjust=True)
            data.to_csv(f'data/raw/{ticker}.csv')
            print(f'{ticker} fetched and saved')
            if wait:
                time.sleep(random.randint(62, 300))
            return data
        except Exception as e:
            print(f'Error fetching {ticker}: {e}')
            return None
        
def get_sp500_tickers():
    """
    Get list of S&P 500 tickers from Wikipedia
    """
    try:
        # Get S&P 500 table from Wikipedia
        url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
        table = pd.read_html(url)[0]
        
        # Get tickers from first column
        tickers = table['Symbol'].tolist()
        
        print(f"Retrieved {len(tickers)} S&P 500 tickers")
        return tickers
        
    except Exception as e:
        print(f"Error getting S&P 500 tickers: {e}")
        return None
    
def format_data(data):
    data = data.iloc[2:].copy()
    data['Date'] = pd.to_datetime(data['Price'])
    data.drop('Price', axis=1, inplace=True)
    data.set_index('Date', inplace=True)
    return data
    
    
    
def import_all_data(folder_with_csv):
    all_data = {}
    for file in os.listdir(folder_with_csv):
        data = format_data(pd.read_csv(os.path.join(folder_with_csv, file)))
        filename = file.split('.')[0]
        all_data[filename] = data
    return all_data

In [2]:
sp500_tickers = get_sp500_tickers()

Retrieved 503 S&P 500 tickers


In [3]:
import os 

csv_count = len([f for f in os.listdir('data/raw') if f.endswith('.csv')])
print(f"Number of CSV files in data/raw: {csv_count}")

Number of CSV files in data/raw: 502


In [None]:
for ticker in tqdm.tqdm(sp500_tickers):
    start = '2015-01-01'
    end = '2025-01-01'

    data = fetch_data(ticker, start, end, wait=False)

In [41]:
all_data = import_all_data("data/raw")

In [47]:
def verify_data(data_df, first='2015-01-02', last='2024-12-31'):
    # Check if first date is 2015-01-02 and last date is 2024-12-31
    first_date = data_df.index[0]
    last_date = data_df.index[-1]
    
    expected_first = pd.Timestamp(first)
    expected_last = pd.Timestamp(last)
    
    if first_date != expected_first or last_date != expected_last:
        print("Error")
        print(f"Expected range: {expected_first} to {expected_last}")
        print(f"Actual range: {first_date} to {last_date}")
        return False
        
    return True

In [49]:
for ticker, data in all_data.items():
    try:
        verify_data(data)
    except Exception as e:
        print(f"Error verifying data for {ticker}: {e}")

Error
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2020-12-10 00:00:00 to 2024-12-31 00:00:00
Error verifying data for BF: index 0 is out of bounds for axis 0 with size 0
Error verifying data for BRK: index 0 is out of bounds for axis 0 with size 0
Error
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2020-03-19 00:00:00 to 2024-12-31 00:00:00
Error
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2022-01-19 00:00:00 to 2024-12-31 00:00:00
Error
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2021-04-14 00:00:00 to 2024-12-31 00:00:00
Error
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2019-06-12 00:00:00 to 2024-12-31 00:00:00
Error
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2019-05-24 00:00:00 to 2024-12-31 00:00:00
Error
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2020-12-09 00:00:00 to 2024-12-31 