Project

In [2]:
import yfinance as yf
import pandas as pd
from pandas_datareader import data as pdr
import datetime as dt
import numpy as np
from bs4 import BeautifulSoup
import requests, re

In [5]:
# Scrape the S&P 500 constituent list from stockanalysis.com into `sp500_df`.
url = 'https://stockanalysis.com/list/sp-500-stocks/'
resp = requests.get(url, timeout=30)  # a timeout keeps a stalled request from hanging the kernel
resp.raise_for_status()  # stop on HTTP errors instead of parsing an error page
soup = BeautifulSoup(resp.text, 'html.parser')

# Locate the table by its 'symbol-table' class only. The original selector also
# pinned a 'svelte-…' suffix, which looks build-generated (assumption) and would
# silently stop matching after a site redeploy. Fall back to the first <table>.
table = soup.find('table', class_='symbol-table')
if table is None:
    table = soup.find('table')
if table is None:
    raise RuntimeError(f"No stock table found at {url}; the page layout may have changed.")

# Column names come from the <th> cells of the first row
header_row = table.find('tr')
if header_row is None:
    raise RuntimeError(f"Stock table at {url} has no rows.")
headers = [th.get_text(strip=True) for th in header_row.find_all('th')]

# Extract the data rows, skipping the header row and any row whose cell count
# does not line up with the headers (e.g. spacer rows without <td> cells)
stocks_data = []
for row in table.find_all('tr')[1:]:
    row_data = [cell.get_text(strip=True) for cell in row.find_all('td')]
    if len(row_data) == len(headers):
        stocks_data.append(row_data)

if not stocks_data:
    raise RuntimeError(f"Stock table at {url} contained no usable data rows.")

# Build the DataFrame and use the site's running number ('No.') as the index
sp500_df = pd.DataFrame(stocks_data, columns=headers)
sp500_df = sp500_df.set_index('No.')

print(f"Columns: {list(sp500_df.columns)}")
print("\nFirst 10 rows:")
print(sp500_df.head(10)[['Symbol', 'Market Cap', 'Stock Price', '% Change']])

Columns: ['Symbol', 'Company Name', 'Market Cap', 'Stock Price', '% Change', 'Revenue']

First 10 rows:
    Symbol Market Cap Stock Price % Change
No.                                       
1     NVDA      4.57T      188.15    0.04%
2     AAPL      3.97T      268.47   -0.48%
3     MSFT      3.69T      496.82   -0.06%
4     GOOG      3.37T      279.70   -1.98%
5    GOOGL      3.36T      278.83   -2.08%
6     AMZN      2.61T      244.41    0.56%
7     AVGO      1.65T      349.43   -1.73%
8     META      1.57T      621.71    0.45%
9     TSLA      1.43T      429.52   -3.68%
10   BRK.B      1.08T      499.06    1.20%


In [6]:
# Download daily price history for the S&P 500 constituents into `df`.
# The scraped list writes share classes with a dot (e.g. BRK.B) while Yahoo
# Finance expects a dash (BRK-B), so normalise the symbols before downloading.
# Blank/missing symbols are dropped.
stocks = [
    symbol.strip().replace('.', '-')
    for symbol in sp500_df['Symbol'].tolist()
    if symbol and symbol.strip()
]
# NOTE(review): yfinance documents `end` as exclusive, so 2024-12-31 itself is
# not downloaded even though the message below prints it — confirm intended.
endDate = dt.datetime(2024, 12, 31)
startDate = dt.datetime(2021, 1, 1)

print(f"Downloading data for {len(stocks)} stocks from {startDate.date()} to {endDate.date()}...")
print("This may take a few minutes...")

# Download with error handling
try:
    df = yf.download(stocks, start=startDate, end=endDate, group_by='ticker', auto_adjust=True, prepost=True, threads=True)
    print("✅ Successfully downloaded data!")
    print(f"DataFrame shape: {df.shape}")
    # Indexing an empty result raises here and falls through to the fallback below
    print(f"Date range: {df.index[0]} to {df.index[-1]}")
    display(df.head())
except Exception as e:
    # Deliberate best-effort: report the failure, then retry with a small subset
    print(f"❌ Error downloading data: {e}")
    # Try with a smaller subset if full download fails
    print("Trying with first 10 stocks...")
    df = yf.download(stocks[:10], start=startDate, end=endDate, group_by='ticker', auto_adjust=True)
    display(df.head())

  df = yf.download(stocks, start=startDate, end=endDate, group_by='ticker')


AttributeError: 'Series' object has no attribute 'split'