# 01_split_by_type outputs

Automated preview of the deterministic pipeline outputs saved under `data/interim/01_split_by_type`. Update the `DATASETS` list whenever the step emits new files.


In [None]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Render every column, and up to 400 rows, when a frame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = 400

# Where this notebook lives relative to the directory it is launched from.
NOTEBOOK_RELATIVE_PATH = Path("notebooks/data/interim/01_split_by_type/preview.ipynb")

# `__file__` only exists when the code runs as a script/module; in an
# interactive session fall back to the working directory plus the relative path.
NOTEBOOK_PATH = (
    Path(globals()["__file__"])
    if "__file__" in globals()
    else Path.cwd() / NOTEBOOK_RELATIVE_PATH
).resolve()


def resolve_project_root(notebook_path: Path) -> Path:
    """Return the nearest ancestor of *notebook_path* that looks like the project root.

    An ancestor qualifies when it contains both a ``data`` and a ``src`` entry.
    Ancestors are checked from the closest outwards, so the innermost match wins.

    Args:
        notebook_path: Path of the notebook (or script) to start searching from.

    Returns:
        The first qualifying ancestor. If none qualifies, the outermost ancestor
        is returned; if the path has no ancestors at all (e.g. a filesystem
        root), *notebook_path* itself is returned.
    """
    # Materialise once: allows a safe emptiness check and avoids relying on
    # negative indexing of `Path.parents`, which older Python versions reject.
    ancestors = tuple(notebook_path.parents)
    for candidate in ancestors:
        if (candidate / "data").exists() and (candidate / "src").exists():
            return candidate
    return ancestors[-1] if ancestors else notebook_path


# Root of the repository, located by walking up from the notebook path.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)

# Number of leading rows shown for each dataset preview.
HEAD_ROWS = 20

# (label, parquet path) pairs for every file emitted by the step.
# Add a stem here when the step starts producing a new file.
DATASETS = [
    (stem, PROJECT_ROOT.joinpath("data", "interim", "01_split_by_type", f"{stem}.parquet"))
    for stem in ("train_kodate", "train_mansion", "test_kodate", "test_mansion")
]


def inspect_dataset(label: str, path: Path, head_rows: int = HEAD_ROWS) -> None:
    """Print a short summary of one parquet file and display its first rows.

    The summary consists of the label, the file path (relative to
    ``PROJECT_ROOT`` when possible), the row/column counts and the dtypes of
    the first 20 columns. The leading rows are displayed transposed.

    Args:
        label: Heading printed above the summary.
        path: Location of the parquet file to preview.
        head_rows: Number of leading rows to display.

    Returns:
        None. A missing file is reported with a warning line instead of raising.
    """
    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return
    df = pd.read_parquet(path)

    # `relative_to` raises ValueError for files outside PROJECT_ROOT; show the
    # path as given in that case rather than aborting the preview.
    try:
        shown_path = path.relative_to(PROJECT_ROOT)
    except ValueError:
        shown_path = path

    print(f"\n=== {label} ===")
    print(f"path: {shown_path}")
    print(f"shape: {df.shape[0]} rows x {df.shape[1]} cols")
    dtype_info = df.dtypes.astype(str)
    print("dtypes (first 20 columns):")
    print(dtype_info.head(20))
    if len(dtype_info) > 20:
        print("... (truncated)")
    sample = df.head(head_rows)
    # Transposed so that columns read as rows.
    display(sample.T)


In [None]:
# Preview each registered dataset in the order listed in DATASETS.
for label, path in DATASETS:
    inspect_dataset(label=label, path=path)
