In [5]:
from datetime import datetime
from pathlib import Path
import sys
import os
import pandas as pd
# Make the `src` directory next to the current working directory's parent
# importable, so the project-local `config` and `storage` imports below resolve.
# Built with pathlib so the entry is normalised (no ".." segment, no mixed
# separators on Windows), and guarded so re-running this cell does not keep
# appending duplicate entries to sys.path.
_src_dir = str(Path.cwd().parent / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)
from config import load_env, get_path
from storage import write_df, read_df

# Load environment configuration before any data path is resolved
# (presumably reads the project's `.env` file — confirm in `config.load_env`).
load_env()
# Resolve the raw and processed data directories from the DATA_DIR_RAW /
# DATA_DIR_PROCESSED settings. The second argument looks like the fallback used
# when the variable is unset — TODO confirm against `config.get_path`.
RAW = get_path("DATA_DIR_RAW", "data/raw")
PROC = get_path("DATA_DIR_PROCESSED", "data/processed")

# Show both resolved paths as the cell output.
RAW, PROC

(WindowsPath('data/raw'), WindowsPath('data/processed'))

In [6]:
import numpy as np
# Minute-resolution run stamp; it is embedded in the output file names further down.
now = f"{datetime.now():%Y%m%d-%H%M}"

# Toy exposure panel: three counterparties observed on two consecutive days.
# Each record is (counterparty, date, ead, pd), where
#   ead = Exposure at Default
#   pd  = Probability of Default
sample_rows = [
    ("A", "2025-08-01", 10.5, 0.01),
    ("B", "2025-08-01", 20.0, 0.02),
    ("C", "2025-08-01", 15.2, 0.015),
    ("A", "2025-08-02", 11.0, 0.011),
    ("B", "2025-08-02", 19.8, 0.019),
    ("C", "2025-08-02", 15.5, 0.016),
]
df = pd.DataFrame(sample_rows, columns=["counterparty", "date", "ead", "pd"])
# Dates are entered as ISO strings above; convert the column to real datetimes.
df["date"] = pd.to_datetime(df["date"])
df.head()


Unnamed: 0,counterparty,date,ead,pd
0,A,2025-08-01,10.5,0.01
1,B,2025-08-01,20.0,0.02
2,C,2025-08-01,15.2,0.015
3,A,2025-08-02,11.0,0.011
4,B,2025-08-02,19.8,0.019


In [7]:
# Both artefacts share one timestamped stem; only the target directory and the
# file extension differ.
sample_stem = f"sample_{now}"
csv_path = RAW / f"{sample_stem}.csv"
parq_path = PROC / f"{sample_stem}.parquet"

# Persist the same frame once per target path (CSV first, then Parquet).
for out_path in (csv_path, parq_path):
    write_df(df, out_path)

csv_path, parq_path


(WindowsPath('data/raw/sample_20250821-0423.csv'),
 WindowsPath('data/processed/sample_20250821-0423.parquet'))

In [8]:
# Reload both files written earlier. The CSV read asks for `date` to be parsed
# as datetimes via `parse_dates` (presumably forwarded to pandas' CSV reader —
# confirm in `storage.read_df`); the Parquet read passes no such hint.
df_csv = read_df(csv_path, parse_dates=["date"])
df_parq = read_df(parq_path)

# Validation helper
def validate_roundtrip(original: pd.DataFrame, csv_df: pd.DataFrame, parq_df: pd.DataFrame) -> pd.Series:
    """Report whether the reloaded CSV and Parquet frames look like the original.

    Only shapes and the dtypes of the critical columns (``date`` and ``ead``)
    are checked; cell values are not compared.

    Parameters
    ----------
    original : pd.DataFrame
        The frame that was written to disk.
    csv_df : pd.DataFrame
        The frame read back from the CSV file.
    parq_df : pd.DataFrame
        The frame read back from the Parquet file.

    Returns
    -------
    pd.Series
        One boolean per check, indexed by check name, in this order:
        ``shape_csv_matches``, ``shape_parquet_matches``,
        ``date_dtype_csv_is_datetime``, ``date_dtype_parquet_is_datetime``,
        ``ead_is_float_csv``, ``ead_is_float_parquet``.
    """
    # pandas' dtype predicates are used instead of matching on str(dtype):
    # string matching misses float dtypes whose name does not start with a
    # lowercase "float" (e.g. the nullable "Float64" extension dtype).
    is_datetime = pd.api.types.is_datetime64_any_dtype
    is_float = pd.api.types.is_float_dtype

    def _column_passes(frame: pd.DataFrame, column: str, dtype_check) -> bool:
        # A column that went missing in the round trip is a failed check,
        # not a KeyError that hides the rest of the report.
        return column in frame.columns and bool(dtype_check(frame[column]))

    results = {
        "shape_csv_matches": original.shape == csv_df.shape,
        "shape_parquet_matches": original.shape == parq_df.shape,
        # Check dtypes for critical columns
        "date_dtype_csv_is_datetime": _column_passes(csv_df, "date", is_datetime),
        "date_dtype_parquet_is_datetime": _column_passes(parq_df, "date", is_datetime),
        "ead_is_float_csv": _column_passes(csv_df, "ead", is_float),
        "ead_is_float_parquet": _column_passes(parq_df, "ead", is_float),
    }
    return pd.Series(results)

# Run the round-trip checks and fail loudly if any of them is False, so a broken
# save/load cycle stops "Run All" instead of only showing up in the displayed table.
roundtrip_report = validate_roundtrip(df, df_csv, df_parq)
failed_checks = roundtrip_report.index[~roundtrip_report.astype(bool)].tolist()
assert not failed_checks, f"Round-trip validation failed: {failed_checks}"
roundtrip_report


shape_csv_matches                 True
shape_parquet_matches             True
date_dtype_csv_is_datetime        True
date_dtype_parquet_is_datetime    True
ead_is_float_csv                  True
ead_is_float_parquet              True
dtype: bool

## Data Storage Notes
- **Folders:** `data/raw/` for immutable inputs; `data/processed/` for derived outputs.  
- **Formats:** Saved CSV (exchange-friendly) and Parquet (analysis-ready, type-preserving, fast).  
- **Env paths:** `DATA_DIR_RAW` and `DATA_DIR_PROCESSED` from `.env` drive all I/O.  
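- **Validation:** After reloading both files, checked that their shapes match the original and that key dtypes survived (`date` as datetime, `ead` as float); cell values themselves are not compared.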

References: Stage 05 reading (folder conventions, CSV vs Parquet, env-driven paths) and homework tasks (save/load, utilities, validation, documentation).
