In [1]:
#Import modules
import pandas as pd
import requests
from collections import defaultdict
import yfinance as yf
from datetime import datetime
import pandas as pd
from time import sleep
from tqdm import tqdm
from io import StringIO

#Extract updated list of SP500 tickers
url_sp500 = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

user_agents = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

response = requests.get(url_sp500, headers=user_agents, timeout=30)
response.raise_for_status()

tables = pd.read_html(response.text)
df_sp500 = tables[0]

sector_to_tickers = defaultdict(list)

for symbol, sector in zip(df_sp500["Symbol"], df_sp500["GICS Sector"]):
    sector_to_tickers[sector].append(symbol)

dict_sp500 = dict(sector_to_tickers)

#Copy dict_sp500 to csv
rows = [
    {"Sector": sector, "Ticker Symbol": ticker}
    for sector, tickers in dict_sp500.items()
    for ticker in tickers
]

df_sp500_output = pd.DataFrame(rows)

#df_sp500_output.to_csv("sp500_tickers.csv", index=False)

# Make a copy to avoid mutating original
df_sp500_enriched = df_sp500_output.copy()

company_names = []
market_caps = []

# tqdm wraps the iterable
for ticker in tqdm(
    df_sp500_enriched["Ticker Symbol"],
    desc="Fetching company metadata",
    unit="ticker"
):
    try:
        tk = yf.Ticker(ticker)
        info = tk.info

        company_name = info.get("shortName") or info.get("longName")
        market_cap = info.get("marketCap")

    except Exception:
        company_name = None
        market_cap = None

    company_names.append(company_name)
    market_caps.append(market_cap)

    sleep(0.05)  # avoid Yahoo throttling

# Add columns
df_sp500_enriched["Company_Name"] = company_names
df_sp500_enriched["Market_Cap"] = market_caps

df_sp500_enriched.to_csv("sp500_tickers.csv", index=False)

# 1️⃣ Extract ticker list
tickers = df_sp500_output["Ticker Symbol"].dropna().unique().tolist()
print(f"Number of tickers: {len(tickers)}")

  tables = pd.read_html(response.text)
Fetching company metadata: 100%|██████████| 503/503 [02:28<00:00,  3.38ticker/s]

Number of tickers: 503





In [None]:
#Import modules
import pandas as pd
import requests
from collections import defaultdict



import yfinance as yf
from datetime import datetime


import pandas as pd
from time import sleep
from tqdm import tqdm







In [4]:
#Extract updated list of SP500 tickers
url_sp500 = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

user_agents = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

response = requests.get(url_sp500, headers=user_agents, timeout=30)
response.raise_for_status()

tables = pd.read_html(response.text)
df_sp500 = tables[0]

sector_to_tickers = defaultdict(list)

for symbol, sector in zip(df_sp500["Symbol"], df_sp500["GICS Sector"]):
    sector_to_tickers[sector].append(symbol)

dict_sp500 = dict(sector_to_tickers)

#Copy dict_sp500 to csv
rows = [
    {"Sector": sector, "Ticker Symbol": ticker}
    for sector, tickers in dict_sp500.items()
    for ticker in tickers
]

df_sp500_output = pd.DataFrame(rows)

df_sp500_output.to_csv("sp500_tickers.csv", index=False)

  tables = pd.read_html(response.text)


In [9]:
import yfinance as yf
import pandas as pd
from time import sleep
from tqdm import tqdm

# Make a copy to avoid mutating original
df_sp500_enriched = df_sp500_output.copy()

company_names = []
market_caps = []

# tqdm wraps the iterable
for ticker in tqdm(
    df_sp500_enriched["Ticker Symbol"],
    desc="Fetching company metadata",
    unit="ticker"
):
    try:
        tk = yf.Ticker(ticker)
        info = tk.info

        company_name = info.get("shortName") or info.get("longName")
        market_cap = info.get("marketCap")

    except Exception:
        company_name = None
        market_cap = None

    company_names.append(company_name)
    market_caps.append(market_cap)

    sleep(0.05)  # avoid Yahoo throttling

# Add columns
df_sp500_enriched["Company_Name"] = company_names
df_sp500_enriched["Market_Cap"] = market_caps


Fetching company metadata: 100%|██████████| 503/503 [04:00<00:00,  2.09ticker/s]


In [8]:
df_sp500_enriched.to_csv(
    "sp500_tickers_enriched.csv",
    index=False
)

In [17]:
import yfinance as yf
import pandas as pd
from datetime import datetime

# -----------------------------
# Assumes df_sp500_output already exists
# Columns:
# Sector | Ticker Symbol | Last_Updated
# -----------------------------

# 1️⃣ Extract ticker list
tickers = df_sp500_output["Ticker Symbol"].dropna().unique().tolist()
print(f"Number of tickers: {len(tickers)}")

# -----------------------------
# 2️⃣ Download 10 years of daily prices (S&P 500 constituents)
# -----------------------------
df_prices = yf.download(
    tickers=tickers,
    period="10y",
    interval="1d",
    group_by="ticker",
    auto_adjust=False,
    threads=True,
    progress=True
)

# -----------------------------
# 3️⃣ Convert MultiIndex → long format
# -----------------------------
df_long = (
    df_prices
    .stack(level=0)
    .reset_index()
    .rename(columns={"level_1": "Ticker"})
)

# Rename Date column if needed
df_long.rename(columns={"Date": "Trade_Date"}, inplace=True)

# -----------------------------
# 4️⃣ Join sector information
# -----------------------------
df_final = df_long.merge(
    df_sp500_output[["Ticker Symbol", "Sector"]],
    left_on="Ticker",
    right_on="Ticker Symbol",
    how="left"
).drop(columns=["Ticker Symbol"])

# -----------------------------
# ✅ 4.5️⃣ Download S&P 500 index (^GSPC), flatten columns if needed, and append
# -----------------------------
df_spx = yf.download(
    tickers="^GSPC",
    period="10y",
    interval="1d",
    auto_adjust=False,
    progress=True
)

# ✅ Flatten MultiIndex columns returned by yfinance (prevents NaN rows on concat)
if isinstance(df_spx.columns, pd.MultiIndex):
    df_spx.columns = df_spx.columns.get_level_values(0)

df_spx = df_spx.reset_index()
df_spx.rename(columns={"Date": "Trade_Date"}, inplace=True)

df_spx["Ticker"] = "^GSPC"
df_spx["Sector"] = "Index"

# Ensure same base columns as df_final (Volume for ^GSPC can be missing; that's OK)
df_spx = df_spx[["Trade_Date", "Ticker", "Sector", "Open", "High", "Low", "Close", "Adj Close", "Volume"]]

# Append to your main dataset
df_final = pd.concat([df_final, df_spx], ignore_index=True)

# -----------------------------
# 5️⃣ Add metadata (optional but nice)
# -----------------------------
df_final["Data_Source"] = "Yahoo Finance (yfinance)"
df_final["Extracted_At"] = datetime.now()

# -----------------------------
# 6️⃣ Reorder columns (Power BI friendly)
# -----------------------------
df_final = df_final[
    [
        "Trade_Date",
        "Ticker",
        "Sector",
        "Open",
        "High",
        "Low",
        "Close",
        "Adj Close",
        "Volume",
        "Data_Source",
        "Extracted_At",
    ]
]

# -----------------------------
# 7️⃣ Save to CSV
# -----------------------------
output_file = "sp500_prices_daily_10y_plus_spx.csv"
df_final.to_csv(output_file, index=False)

print(f"Saved {len(df_final):,} rows to {output_file}")


Number of tickers: 503


[*******************   40%                       ]  199 of 503 completed$BF.B: possibly delisted; no price data found  (period=10y)
[**********************83%***************        ]  419 of 503 completed$BRK.B: possibly delisted; no price data found  (period=10y) (Yahoo error = "No data found, symbol may be delisted")
[*********************100%***********************]  503 of 503 completed

2 Failed downloads:
['BF.B']: possibly delisted; no price data found  (period=10y)
['BRK.B']: possibly delisted; no price data found  (period=10y) (Yahoo error = "No data found, symbol may be delisted")
  .stack(level=0)
[*********************100%***********************]  1 of 1 completed


Saved 1,225,733 rows to sp500_prices_daily_10y_plus_spx.csv


In [18]:
df_final

Unnamed: 0,Trade_Date,Ticker,Sector,Open,High,Low,Close,Adj Close,Volume,Data_Source,Extracted_At
0,2016-01-27,A,Health Care,37.389999,37.919998,36.810001,37.049999,34.208401,1.479400e+06,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
1,2016-01-27,AAPL,Information Technology,24.010000,24.157499,23.334999,23.355000,21.063309,5.334788e+08,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
2,2016-01-27,ABBV,Health Care,58.360001,58.930000,56.450001,57.110001,37.772869,9.077700e+06,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
3,2016-01-27,ABT,Health Care,39.950001,41.490002,39.900002,40.470001,33.563332,1.053070e+07,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
4,2016-01-27,ACGL,Financials,22.030001,22.576668,21.963333,21.963333,20.884901,1.934100e+06,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
...,...,...,...,...,...,...,...,...,...,...,...
1225728,2026-01-20,^GSPC,Index,6865.240234,6871.169922,6789.049805,6796.859863,6796.859863,5.769500e+09,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
1225729,2026-01-21,^GSPC,Index,6810.709961,6910.390137,6804.959961,6875.620117,6875.620117,5.835520e+09,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
1225730,2026-01-22,^GSPC,Index,6914.439941,6934.750000,6893.620117,6913.350098,6913.350098,5.307580e+09,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
1225731,2026-01-23,^GSPC,Index,6907.850098,6932.959961,6895.500000,6915.609863,6915.609863,4.871930e+09,Yahoo Finance (yfinance),2026-01-26 14:44:08.781248
