In [1]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) before reading the environment.
load_dotenv()

# Data folders come from the environment, with project-relative fallbacks.
RAW = pathlib.Path(os.environ.get('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.environ.get('DATA_DIR_PROCESSED', 'data/processed'))

# Create both folders (and any missing parents) up front; existing folders are fine.
for _data_dir in (RAW, PROC):
    _data_dir.mkdir(parents=True, exist_ok=True)

# Show the absolute locations so it is clear where files will be read and written.
print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

RAW -> /Users/yihanyao/bootcamp_yihan_yao/project/data/raw
PROC -> /Users/yihanyao/bootcamp_yihan_yao/project/data/processed


In [2]:
def validate_loaded(original, reloaded):
    """Run basic sanity checks on a frame that was written out and read back.

    Parameters
    ----------
    original : the frame as it was before being saved.
    reloaded : the frame obtained by reading the saved file.

    Returns
    -------
    dict mapping check name to bool:
        'shape_equal'          - both frames have the same ``.shape``.
        'date_is_datetime'     - ``reloaded`` has a 'date' column with a datetime64 dtype.
        'adj_close_is_numeric' - ``reloaded`` has an 'adj_close' column with a numeric dtype.
    A check on a column that is missing from ``reloaded`` is reported as False.
    """
    dtype_api = pd.api.types
    available = reloaded.columns

    same_shape = original.shape == reloaded.shape
    # Short-circuit so the column is only accessed when it actually exists.
    date_ok = 'date' in available and dtype_api.is_datetime64_any_dtype(reloaded['date'])
    price_ok = 'adj_close' in available and dtype_api.is_numeric_dtype(reloaded['adj_close'])

    return {
        'shape_equal': same_shape,
        'date_is_datetime': date_ok,
        'adj_close_is_numeric': price_ok,
    }

# Pick the first CSV file (in sorted filename order) from the raw folder.
csv_files = sorted(RAW.glob("*.csv"))
if not csv_files:
    # Name the folder that was actually searched: RAW can be overridden via
    # DATA_DIR_RAW, so a hard-coded "data/raw/" could point the reader at the wrong place.
    raise FileNotFoundError(f"No CSV files found in {RAW}/")
csv_path = csv_files[0]

df_csv = pd.read_csv(csv_path, parse_dates=['date'])
print("Loaded CSV:", csv_path.name, df_csv.shape)

# Store as Parquet in the processed folder, reusing the CSV's base name.
pq_path = PROC / (csv_path.stem + ".parquet")
df_csv.to_parquet(pq_path, index=False)
print("Saved Parquet:", pq_path.name)

# Reload and validate.
# The CSV is read from disk a second time so the check compares two independent
# loads; validating df_csv against itself would make 'shape_equal' trivially True.
df_csv_reloaded = pd.read_csv(csv_path, parse_dates=['date'])
print("CSV validation:", validate_loaded(df_csv, df_csv_reloaded))

try:
    df_pq = pd.read_parquet(pq_path)
    print("Parquet validation:", validate_loaded(df_csv, df_pq))
except Exception as e:
    # Deliberately best-effort: report the failure instead of aborting the cell,
    # so the CSV results above remain visible.
    print("Parquet read failed:", e)

Loaded CSV: market_source-yfinance_symbol-^GSPC_name-sp500_20250820_102321.csv (251, 2)
Saved Parquet: market_source-yfinance_symbol-^GSPC_name-sp500_20250820_102321.parquet
CSV validation: {'shape_equal': True, 'date_is_datetime': True, 'adj_close_is_numeric': True}
Parquet validation: {'shape_equal': True, 'date_is_datetime': True, 'adj_close_is_numeric': True}
