# Interim Dataset Quick Audit

データパイプライン各ステップが生成した中間 Parquet を一括で点検し、レコード数・`data_id` のユニーク件数・列数・カラム別の欠損率を素早く把握するためのノートブックです。

下のセルを順に実行すると `data/interim` 配下を走査して最新の統計を再計算します。


In [2]:
from __future__ import annotations

from pathlib import Path
from typing import Iterable, Tuple

import pandas as pd
from IPython.display import display

pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)


def find_project_root(start: Path) -> Path:
    """Ascend directories until we hit the git repo boundary."""
    for candidate in [start, *start.parents]:
        if (candidate / ".git").exists():
            return candidate
    raise FileNotFoundError("Could not locate project root (missing .git directory).")


PROJECT_ROOT = find_project_root(Path.cwd())
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
if not INTERIM_DIR.exists():
    raise FileNotFoundError(f"{INTERIM_DIR} not found. Run a data pipeline step first.")

print(f"Project root: {PROJECT_ROOT}")
print(f"Interim data dir: {INTERIM_DIR}")


Project root: /Users/takamiya/work/0000_repos/signate_comp_2nd
Interim data dir: /Users/takamiya/work/0000_repos/signate_comp_2nd/data/interim


In [3]:
from typing import Dict, List

DatasetKey = Tuple[str, str]


def iter_parquet_files(interim_dir: Path) -> Iterable[Tuple[str, str, Path]]:
    """Yield (step_name, relative_dataset_path, absolute_path) tuples."""
    for step_dir in sorted(p for p in interim_dir.iterdir() if p.is_dir()):
        step_name = step_dir.relative_to(interim_dir).as_posix()
        for parquet_path in sorted(step_dir.rglob("*.parquet")):
            dataset_rel_path = parquet_path.relative_to(step_dir).as_posix()
            yield step_name, dataset_rel_path, parquet_path


def compute_dataset_stats(
    step_name: str,
    dataset_rel_path: str,
    parquet_path: Path,
) -> Tuple[dict, pd.DataFrame]:
    """Read a Parquet file and collect high-level summary statistics."""
    df = pd.read_parquet(parquet_path)
    row_count = int(len(df))
    column_count = int(df.shape[1])
    unique_data_ids = (
        int(df["data_id"].nunique(dropna=True)) if "data_id" in df.columns else None
    )
    null_rate_table = (
        df.isna()
        .mean()
        .rename("null_ratio")
        .to_frame()
        .reset_index()
        .rename(columns={"index": "column"})
        .sort_values("null_ratio", ascending=False)
        .reset_index(drop=True)
    )
    del df

    dataset_key = f"{step_name}/{dataset_rel_path}"
    summary = {
        "step": step_name,
        "dataset": dataset_rel_path,
        "dataset_key": dataset_key,
        "rows": row_count,
        "unique_data_ids": unique_data_ids,
        "n_columns": column_count,
        "path": str(parquet_path.relative_to(PROJECT_ROOT)),
    }
    return summary, null_rate_table


def build_interim_catalog(interim_dir: Path) -> Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]:
    summaries: List[dict] = []
    null_rate_tables: Dict[str, pd.DataFrame] = {}

    for step_name, dataset_rel_path, parquet_path in iter_parquet_files(interim_dir):
        summary, null_rates = compute_dataset_stats(step_name, dataset_rel_path, parquet_path)
        summaries.append(summary)
        null_rate_tables[summary["dataset_key"]] = null_rates

    if summaries:
        summary_df = (
            pd.DataFrame(summaries)
            .sort_values(["step", "dataset"])
            .reset_index(drop=True)
        )
    else:
        summary_df = pd.DataFrame(
            columns=["step", "dataset", "dataset_key", "rows", "unique_data_ids", "n_columns", "path"]
        )

    return summary_df, null_rate_tables


In [4]:
summary_df, NULL_RATE_TABLES = build_interim_catalog(INTERIM_DIR)

if summary_df.empty:
    print("No Parquet files found in data/interim.")
else:
    display(summary_df)


Unnamed: 0,step,dataset,dataset_key,rows,unique_data_ids,n_columns,path
0,00_assign_data_id,test.parquet,00_assign_data_id/test.parquet,112437,112437.0,150,data/interim/00_assign_data_id/test.parquet
1,00_assign_data_id,train.parquet,00_assign_data_id/train.parquet,363924,363924.0,150,data/interim/00_assign_data_id/train.parquet
2,00_split_by_type,test_kodate.parquet,00_split_by_type/test_kodate.parquet,52892,,149,data/interim/00_split_by_type/test_kodate.parquet
3,00_split_by_type,test_mansion.parquet,00_split_by_type/test_mansion.parquet,59545,,149,data/interim/00_split_by_type/test_mansion.par...
4,00_split_by_type,train_kodate.parquet,00_split_by_type/train_kodate.parquet,165310,,149,data/interim/00_split_by_type/train_kodate.par...
5,00_split_by_type,train_mansion.parquet,00_split_by_type/train_mansion.parquet,198614,,149,data/interim/00_split_by_type/train_mansion.pa...
6,01_split_by_type,test_kodate.parquet,01_split_by_type/test_kodate.parquet,52892,52892.0,150,data/interim/01_split_by_type/test_kodate.parquet
7,01_split_by_type,test_mansion.parquet,01_split_by_type/test_mansion.parquet,59545,59545.0,150,data/interim/01_split_by_type/test_mansion.par...
8,01_split_by_type,train_kodate.parquet,01_split_by_type/train_kodate.parquet,165310,165310.0,150,data/interim/01_split_by_type/train_kodate.par...
9,01_split_by_type,train_mansion.parquet,01_split_by_type/train_mansion.parquet,198614,198614.0,150,data/interim/01_split_by_type/train_mansion.pa...


In [5]:
def show_null_rates(dataset_key: str, top_n: int | None = 20) -> None:
    """Display the per-column null ratio table for a specific dataset."""
    if dataset_key not in NULL_RATE_TABLES:
        available = "\n".join(sorted(NULL_RATE_TABLES)) or "<none>"
        raise KeyError(
            f"Unknown dataset_key '{dataset_key}'. Available keys:\n{available}"
        )

    table = NULL_RATE_TABLES[dataset_key]
    if top_n is not None:
        display(table.head(top_n))
    else:
        display(table)

print("Use show_null_rates('<step>/<file>.parquet', top_n=20) to inspect columns.")


Use show_null_rates('<step>/<file>.parquet', top_n=20) to inspect columns.


In [6]:
if summary_df.empty:
    print("No datasets to inspect yet.")
else:
    example_key = summary_df.loc[0, "dataset_key"]
    print(f"Example dataset_key: {example_key}")
    show_null_rates(example_key)


Example dataset_key: 00_assign_data_id/test.parquet


Unnamed: 0,column,null_ratio
0,free_rent_gen_timing,1.0
1,traffic_car,1.0
2,free_rent_duration,1.0
3,name_ruby,1.0
4,school_jun_code,1.0
5,school_ele_code,1.0
6,building_name_ruby,1.0
7,money_hoshou_company,0.999635
8,reform_etc,0.999084
9,reform_place_other,0.998052
