In [1]:
import sys, os
import pandas as pd
import numpy as np
import yfinance as yf
from pathlib import Path
from datetime import datetime

In [2]:
# Set folder paths relative to notebook location.
# NOTE: Path("..") is resolved against the kernel's current working directory,
# so this assumes the kernel was started from the notebook's own folder.
PROJECT_ROOT = Path("..").resolve()          # from notebook/ go up one level
DATA = PROJECT_ROOT / "data"
RAW = DATA / "raw"

# Make sure the raw folder exists
RAW.mkdir(parents=True, exist_ok=True)

# FRED via pandas_datareader (rates, CPI, unemployment)
# If not installed, run once in a cell:  %pip install pandas_datareader
# The import is optional: HAS_PDR records whether FRED access is available so
# later cells can fall back to other sources instead of failing.
try:
    from pandas_datareader.data import DataReader
    HAS_PDR = True
except Exception as exc:
    # Stay best-effort (any import-time failure disables FRED), but report the
    # reason so a missing or broken install is not silently hidden.
    HAS_PDR = False
    print(f"pandas_datareader unavailable ({type(exc).__name__}: {exc}); FRED downloads will be skipped.")

In [3]:
# Timestamp helper
def ts():
    """Return the current local time as a compact ``YYYYMMDD_HHMMSS`` string."""
    now = datetime.now()
    return f"{now:%Y%m%d_%H%M%S}"

# Save function
def save_csv(df: pd.DataFrame, prefix: str, **meta):
    """Write ``df`` to a timestamped CSV inside ``RAW`` and return its path.

    The file name has the form ``<prefix>_<key>-<value>_..._<timestamp>.csv``;
    the ``key-value`` tags come from ``meta`` in the order they were passed and
    are omitted entirely when no metadata is given.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to save. The index is not written to the file.
    prefix : str
        Leading part of the file name.
    **meta
        Optional descriptors (e.g. ``source="yfinance"``) embedded in the name.

    Returns
    -------
    pathlib.Path
        Location of the written CSV file.
    """
    def _tag_text(value):
        # A path separator inside a tag (e.g. symbol="EUR/USD") would make the
        # file name point into a sub-directory that does not exist.
        return str(value).replace("/", "-").replace("\\", "-")

    tags = [f"{_tag_text(k)}-{_tag_text(v)}" for k, v in meta.items()]
    # Join only the parts that exist so an empty ``meta`` does not leave a
    # doubled underscore between the prefix and the timestamp.
    stem = "_".join([prefix, *tags, ts()])
    path = RAW / f"{stem}.csv"
    df.to_csv(path, index=False)
    print('Saved', path.resolve())
    return path

# Validation function
def validate(df: pd.DataFrame, required):
    """Summarise basic quality checks for ``df``.

    Returns a dict with three entries: ``missing`` (the names from ``required``
    that are not columns of ``df``, in the order given), ``shape`` (the frame's
    ``(rows, columns)`` tuple) and ``na_total`` (count of NA cells in the frame).
    """
    absent = []
    for column in required:
        if column in df.columns:
            continue
        absent.append(column)

    na_per_column = df.isna().sum()
    report = {
        'missing': absent,
        'shape': df.shape,
        'na_total': int(na_per_column.sum()),
    }
    return report

In [4]:
# Project-relevant symbols
# Test data acquisition: pull one year of daily closes per symbol, save each
# pull to data/raw via save_csv, then print a validation summary.
SYMBOLS = {
    "sp500": "^GSPC",  # S&P 500 index
    "vix": "^VIX"      # VIX index (CBOE Volatility Index)
}

data_frames = {}

# Download data from Yahoo Finance
for name, ticker in SYMBOLS.items():
    # auto_adjust is passed explicitly so the 'Close' column is the adjusted
    # close (matching the 'adj_close' name below) regardless of the library
    # default, which has differed between yfinance releases.
    df = yf.download(ticker, period="1y", interval="1d", auto_adjust=True).reset_index()[['Date', 'Close']]
    df.columns = ['date','adj_close']

    # Fail before writing anything: an empty pull would otherwise be saved and
    # only show up later as a zero-row shape in the validation printout.
    if df.empty:
        raise RuntimeError(f"Yahoo Finance returned no rows for {ticker} ({name}).")

    data_frames[name] = df

    # Save using your save_csv function
    save_csv(
        df.sort_values('date'),
        prefix="market",
        source="yfinance",
        symbol=ticker,
        name=name
    )

# Validate both datasets
for name, df in data_frames.items():
    print(name, validate(df, ["date", "adj_close"]))

  df = yf.download(ticker, period="1y", interval="1d").reset_index()[['Date', 'Close']]
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, period="1y", interval="1d").reset_index()[['Date', 'Close']]


Saved /Users/yihanyao/bootcamp_yihan_yao/project/data/raw/market_source-yfinance_symbol-^GSPC_name-sp500_20250820_102321.csv


[*********************100%***********************]  1 of 1 completed

Saved /Users/yihanyao/bootcamp_yihan_yao/project/data/raw/market_source-yfinance_symbol-^VIX_name-vix_20250820_102321.csv
sp500 {'missing': [], 'shape': (251, 2), 'na_total': 0}
vix {'missing': [], 'shape': (251, 2), 'na_total': 0}





In [4]:
# ========= 1) DOWNLOAD DATA =========
PERIOD = "5y"   # use 5y to compute rolling features reliably (change if needed)
INTERVAL = "1d"
# Extra calendar days of FRED history requested before the first trading day,
# so the forward-fill of monthly series has an observation to start from.
MACRO_LOOKBACK_DAYS = 45


def _download_close(ticker, column):
    """Download ``ticker`` from Yahoo Finance as a two-column frame.

    Returns a DataFrame with columns ``date`` and ``column`` (the close price),
    using the module-level ``PERIOD`` and ``INTERVAL``.

    Raises
    ------
    RuntimeError
        If Yahoo Finance returns no rows for the ticker.
    """
    # auto_adjust is explicit so results do not depend on the library default,
    # which has differed between yfinance releases.
    closes = yf.download(ticker, period=PERIOD, interval=INTERVAL, auto_adjust=True)[["Close"]].reset_index()
    closes.columns = ["date", column]
    if closes.empty:
        raise RuntimeError(f"Yahoo Finance returned no rows for {ticker}.")
    return closes


# S&P 500 and VIX from Yahoo
spx = _download_close("^GSPC", "spx_close")
vix = _download_close("^VIX", "vix")

# Try FRED (preferred for macro). If not available, fall back to Yahoo proxies for rates.
macro_cols = {}
fred_ok = False
if HAS_PDR:
    try:
        api_key = os.getenv("FRED_API_KEY", None)
        # Tie the FRED window to the SPX sample instead of relying on the
        # reader's implicit default date range.
        trading_days = pd.DatetimeIndex(pd.to_datetime(spx["date"]), name="date")
        fred_kwargs = {
            "start": trading_days.min() - pd.Timedelta(days=MACRO_LOOKBACK_DAYS),
            "end": trading_days.max(),
        }
        if api_key:
            fred_kwargs["api_key"] = api_key
        # Monthly CPI (index), monthly unemployment rate, daily/weekly rates:
        # DGS10 (10Y Treasury %), FEDFUNDS (Effective Fed Funds %), CPIAUCSL, UNRATE
        dgs10     = DataReader("DGS10", "fred", **fred_kwargs)      # percent
        fedfunds  = DataReader("FEDFUNDS", "fred", **fred_kwargs)   # percent (monthly)
        cpi       = DataReader("CPIAUCSL", "fred", **fred_kwargs)   # index (monthly)
        unrate    = DataReader("UNRATE", "fred", **fred_kwargs)     # percent (monthly)

        # Bring all series onto one date index (outer alignment).
        macro = pd.concat(
            [
                dgs10.rename(columns={"DGS10": "dgs10"}),
                fedfunds.rename(columns={"FEDFUNDS": "fedfunds"}),
                cpi.rename(columns={"CPIAUCSL": "cpi"}),
                unrate.rename(columns={"UNRATE": "unrate"}),
            ],
            axis=1
        ).sort_index()

        # Align macro to the SPX trading calendar.
        # Forward-fill on the UNION of macro and trading dates first, then keep
        # only trading days. Joining onto the calendar before filling would drop
        # every observation dated on a non-trading day (e.g. a monthly value
        # stamped on a weekend or holiday) and leave the previous value in place.
        # NOTE(review): values are aligned on their observation date, not their
        # publication date - confirm this is acceptable for downstream features.
        calendar = pd.DataFrame(index=trading_days)
        macro = (
            macro.reindex(macro.index.union(calendar.index))
            .ffill()
            .reindex(calendar.index)
            .rename_axis("date")
            .reset_index()
        )

        fred_ok = True
        print("Macro downloaded from FRED.")
    except Exception as e:
        print("FRED download failed (will fall back to Yahoo proxies for rates). Reason:", e)

if not fred_ok:
    # Yahoo Finance proxies for rates (no CPI/UNRATE fallback available from Yahoo)
    # ^TNX = 10Y Treasury yield; Yahoo quotes it directly in percent
    #        (e.g., 4.56 means 4.56%), so no rescaling is applied.
    # ^IRX = 13-week T-Bill (already reported in percent)
    tnx = _download_close("^TNX", "dgs10")
    irx = _download_close("^IRX", "irx_pct")

    macro = (
        pd.merge(tnx[["date", "dgs10"]], irx[["date", "irx_pct"]], on="date", how="outer")
        .sort_values("date")
        .ffill()
    )
    macro["fedfunds"] = np.nan  # not available in fallback
    macro["cpi"] = np.nan       # not available in fallback
    macro["unrate"] = np.nan    # not available in fallback
    print("Macro downloaded from Yahoo proxies (rates only). CPI/UNRATE missing.")

# Save the pulls (optional, reproducibility). One shared timestamp keeps the
# three files of a single run identifiable as a set.
raw_stamp = ts()
spx.to_csv(RAW / f"spx_yf_{raw_stamp}.csv", index=False)
vix.to_csv(RAW / f"vix_yf_{raw_stamp}.csv", index=False)
macro.to_csv(RAW / f"macro_{'fred' if fred_ok else 'yahoo'}_{raw_stamp}.csv", index=False)

  spx = yf.download("^GSPC", period=PERIOD, interval=INTERVAL)[["Close"]].reset_index()
[*********************100%***********************]  1 of 1 completed
  vix = yf.download("^VIX", period=PERIOD, interval=INTERVAL)[["Close"]].reset_index()
[*********************100%***********************]  1 of 1 completed


Macro downloaded from FRED.


## Documentation
- **Source**
    - URL/endpoint: yfinance (no direct URL)
    - Ticker: ^GSPC (S&P 500 Index), ^VIX (CBOE Volatility Index) 
    - Params: period='1y', interval='1d' for the test pull; period='5y', interval='1d' for the main download
    - Output: Daily adjusted close prices stored as CSV in data/raw/ with timestamped filenames 
- **Assumptions & risks**
    - yfinance provides sufficient historical market and volatility data for analysis.
    - Required columns exist and are validated after download.
    - Data is stored locally as CSV with a timestamp in the filename, which supports reproducibility.
    - Free data source (yfinance) may be subject to outages or delays
    - VIX and S&P 500 availability depends on Yahoo Finance's API backend, which may change without notice
    - CSV schema is simple, but if yfinance changes column names, downstream validation could fail 
- Confirm `.env` is not committed