# 01 – Data Ingestion & Exploratory Data Analysis

**Objective:**  
- Load historical price data  
- Perform basic cleaning & feature‐engineering  
- Visualize key statistics and detect preliminary outliers

In [12]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [13]:
import os
import pandas as pd
import yfinance as yf
from datetime import datetime

In [14]:
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

In [15]:
TICKERS = ["AAPL", "MSFT", "SPY"]
START_DATE = "2015-01-01"
END_DATE = datetime.today().strftime("%Y-%m-%d")
INTERVAL = "1d"

RAW_DIR = "data/raw"
COMBINED_CSV = os.path.join(RAW_DIR, "combined_price_data.csv")

os.makedirs(RAW_DIR, exist_ok=True)

In [16]:
def fetch_and_save(ticker: str, start: str, end: str, interval: str) -> pd.DataFrame:
    logging.info(f"Fetching {ticker} from {start} to {end} at {interval} interval...")
    df = yf.download(
        tickers=ticker,
        start=start,
        end=end,
        interval=interval,
        progress=False,
        auto_adjust=False,
        actions=False
    )
    if df.empty:
        logging.warning(f"No data for {ticker}, skipping.")
        return None

    out_path = os.path.join(RAW_DIR, f"{ticker}.csv")
    df.to_csv(out_path)
    logging.info(f"Saved {len(df)} rows to {out_path}")
    logging.info("Absolute path: " + os.path.abspath(out_path))


    df = df.reset_index()
    df["Ticker"] = ticker
    return df

In [17]:
all_data = []
for tk in TICKERS:
    df_tk = fetch_and_save(tk, START_DATE, END_DATE, INTERVAL)
    if df_tk is not None:
        all_data.append(df_tk)

if all_data:
    combined = pd.concat(all_data, ignore_index=True)
    combined.to_csv(COMBINED_CSV, index=False)
    logging.info(f"Combined dataset with {len(combined)} rows saved to {COMBINED_CSV}")
else:
    logging.error("No data fetched. Combined file not created.")

2025-06-20 16:10:51,685 INFO Fetching AAPL from 2015-01-01 to 2025-06-20 at 1d interval...
2025-06-20 16:10:51,736 INFO Saved 2631 rows to data/raw/AAPL.csv
2025-06-20 16:10:51,738 INFO Absolute path: /Users/virajchawda/Quant/price-action-anomaly-detection-quant/notebooks/data/raw/AAPL.csv
2025-06-20 16:10:51,740 INFO Fetching MSFT from 2015-01-01 to 2025-06-20 at 1d interval...
2025-06-20 16:10:51,779 INFO Saved 2631 rows to data/raw/MSFT.csv
2025-06-20 16:10:51,779 INFO Absolute path: /Users/virajchawda/Quant/price-action-anomaly-detection-quant/notebooks/data/raw/MSFT.csv
2025-06-20 16:10:51,780 INFO Fetching SPY from 2015-01-01 to 2025-06-20 at 1d interval...
2025-06-20 16:10:51,816 INFO Saved 2631 rows to data/raw/SPY.csv
2025-06-20 16:10:51,816 INFO Absolute path: /Users/virajchawda/Quant/price-action-anomaly-detection-quant/notebooks/data/raw/SPY.csv
2025-06-20 16:10:51,860 INFO Combined dataset with 7893 rows saved to data/raw/combined_price_data.csv


In [18]:
preview = pd.read_csv(COMBINED_CSV)
preview.head()

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Ticker,Adj Close.1,Close.1,High.1,Low.1,Open.1,Volume.1,Adj Close.2,Close.2,High.2,Low.2,Open.2,Volume.2
0,,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,,MSFT,MSFT,MSFT,MSFT,MSFT,MSFT,SPY,SPY,SPY,SPY,SPY,SPY
1,2015-01-02,24.288583755493164,27.332500457763672,27.860000610351562,26.837499618530273,27.84749984741211,212818400.0,AAPL,,,,,,,,,,,,
2,2015-01-05,23.604331970214844,26.5625,27.162500381469727,26.352500915527344,27.072500228881836,257142000.0,AAPL,,,,,,,,,,,,
3,2015-01-06,23.606555938720703,26.565000534057617,26.857500076293945,26.157499313354492,26.635000228881836,263188400.0,AAPL,,,,,,,,,,,,
4,2015-01-07,23.937570571899414,26.9375,27.049999237060547,26.674999237060547,26.799999237060547,160423600.0,AAPL,,,,,,,,,,,,
