In [1]:
import pandas as pd
from pathlib import Path
import sys

# Add the project root (parent of notebooks/) to the Python path so that
# project-local packages can be imported from this notebook.
# NOTE: Path("..") is resolved against the kernel's current working directory,
# so this assumes the notebook is executed from inside notebooks/.
PROJECT_ROOT = Path("..").resolve()
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Data directory layout, all derived from PROJECT_ROOT.
DATA = PROJECT_ROOT / "data"
RAW = DATA / "raw"
PROCESSED = DATA / "processed"

In [2]:
from src.cleaning import fill_missing_median, drop_missing, normalize_data

# Clean every raw CSV in RAW and write the result to PROCESSED as both
# CSV and Parquet. Files are handled in sorted (deterministic) order.
csv_files = sorted(RAW.glob("*.csv"))
if not csv_files:
    raise FileNotFoundError("No CSV files found in data/raw/")

# Make sure the output directory exists before writing: to_parquet / to_csv
# do not create missing parent directories and would fail on a fresh checkout.
PROCESSED.mkdir(parents=True, exist_ok=True)

for csv_path in csv_files:
    print(f"\nProcessing: {csv_path.name}")

    # Load raw data; 'date' is parsed to datetime at read time.
    df_raw = pd.read_csv(csv_path, parse_dates=['date'])
    print("Raw shape:", df_raw.shape)

    # Fill missing values (median for numeric).
    # df_raw is kept untouched under its own name so the raw frame stays
    # available for inspection after the loop.
    df_clean = fill_missing_median(df_raw, columns=['adj_close'])

    # Drop rows with missing critical columns.
    df_clean = drop_missing(df_clean, columns=['date', 'adj_close'])

    # Normalize numeric columns.
    # 'date' is moved into the index for the duration of the call and restored
    # afterwards — presumably to keep it out of the scaled columns.
    # NOTE(review): the assignment assumes normalize_data returns a result
    # that lines up with the single 'adj_close' column — confirm against
    # src.cleaning.
    df_clean = df_clean.set_index('date')
    df_clean[['adj_close']] = normalize_data(df_clean, columns=['adj_close'], method='minmax')
    df_clean = df_clean.reset_index()

    # Save processed dataset (both CSV & Parquet).
    base_name = csv_path.stem + "_processed"
    pq_path = PROCESSED / f"{base_name}.parquet"
    csv_out_path = PROCESSED / f"{base_name}.csv"

    df_clean.to_parquet(pq_path, index=False)
    df_clean.to_csv(csv_out_path, index=False)

    print("Saved:", pq_path.name, "and", csv_out_path.name, "-> shape:", df_clean.shape)


Processing: market_source-yfinance_symbol-^GSPC_name-sp500_20250820_102321.csv
Raw shape: (251, 2)
Saved: market_source-yfinance_symbol-^GSPC_name-sp500_20250820_102321_processed.parquet and market_source-yfinance_symbol-^GSPC_name-sp500_20250820_102321_processed.csv -> shape: (251, 2)

Processing: market_source-yfinance_symbol-^VIX_name-vix_20250820_102321.csv
Raw shape: (251, 2)
Saved: market_source-yfinance_symbol-^VIX_name-vix_20250820_102321_processed.parquet and market_source-yfinance_symbol-^VIX_name-vix_20250820_102321_processed.csv -> shape: (251, 2)


### Assumptions & Rationale during Cleaning

1. **Missing values filled with median**  
   - Assumption: Adjusted close values (`adj_close`) are continuous, and median imputation is robust to outliers.  
   - Rationale: Preserves dataset size while reducing distortion from extreme values compared to mean imputation.  

2. **Dropping rows with missing critical fields**  
   - Assumption: `date` and `adj_close` are required for analysis; rows missing these are unusable.  
   - Rationale: Ensures consistency in time series and prevents errors in downstream volatility modeling.  

3. **Normalization (Min-Max scaling)**  
   - Assumption: Scaling `adj_close` to [0, 1] improves stability for ML models and makes features comparable.  
   - Rationale: Prevents models from being biased by absolute price levels by expressing each value as its position within the series' observed range. Scaling is applied per file, so each series is mapped using its own minimum and maximum.  