# 04_01_join_population_projection outputs

Automated preview of the deterministic pipeline outputs saved under `data/interim/04_01_join_population_projection/`. If the step starts emitting new files, add them to the `DATASETS` list in the first code cell.


In [3]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Show every column, and up to 400 rows, when a DataFrame is rendered.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", None)

# Location of this notebook relative to the directory it is expected to be
# launched from; only used when ``__file__`` is not defined.
NOTEBOOK_RELATIVE_PATH = Path(
    "notebooks",
    "data",
    "interim",
    "04_01_join_population_projection",
    "preview.ipynb",
)
try:
    NOTEBOOK_PATH = Path(__file__).resolve()
except NameError:  # running interactively: no __file__, so derive it from the cwd
    NOTEBOOK_PATH = Path.cwd().joinpath(NOTEBOOK_RELATIVE_PATH).resolve()


def resolve_project_root(notebook_path: Path) -> Path:
    for candidate in notebook_path.parents:
        if (candidate / "data").exists() and (candidate / "src").exists():
            return candidate
    return notebook_path.parents[-1]


# Project root, found by searching upwards from the notebook location.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)

# (label, parquet path) pairs to preview. To register a new output of this
# step, add another (label, filename) pair to the inner tuple.
DATASETS = [
    (
        label,
        PROJECT_ROOT.joinpath(
            "data",
            "interim",
            "04_01_join_population_projection",
            filename,
        ),
    )
    for label, filename in (
        ("train", "train_population_features.parquet"),
        ("test", "test_population_features.parquet"),
    )
]

# Number of leading rows shown in the transposed head preview.
HEAD_ROWS = 20
# Number of random non-null example values collected per column.
COLUMN_OVERVIEW_SAMPLE_SIZE = 5
# Maximum number of overview rows (one per column) rendered per display call.
COLUMN_OVERVIEW_CHUNK_SIZE = 200
# Base seed for the per-column sampling, so repeated runs show the same values.
COLUMN_OVERVIEW_RANDOM_SEED = 314159


def build_column_overview(
    df: pd.DataFrame,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> pd.DataFrame:
    """Summarise every column of ``df`` as one row of an overview table.

    Each output row holds ``column_name``, ``row_count``, ``not_null_count``,
    ``not_null_rate_pct`` (rounded to two decimals), ``min``, ``max`` and
    ``random_sample_1`` .. ``random_sample_<sample_size>``.

    Args:
        df: Frame to summarise.
        sample_size: Number of random non-null values to collect per column.
            Columns with fewer non-null values are padded with ``pd.NA``.
        random_state: Base seed for the sampling. The column position is
            added to it so each column gets its own reproducible draw.
            ``None`` leaves the sampling unseeded.

    Returns:
        One row per column of ``df``, in column order; an empty frame when
        ``df`` has no columns.
    """
    row_count = len(df)
    sample_columns = [f"random_sample_{i + 1}" for i in range(sample_size)]
    overview_records: list[dict[str, object]] = []
    for idx, column in enumerate(df.columns):
        # Select by position: ``df[column]`` returns a DataFrame when the
        # label is duplicated, which breaks the scalar statistics below.
        series = df.iloc[:, idx]
        not_null_count = int(series.notna().sum())
        not_null_rate = (not_null_count / row_count * 100) if row_count else 0.0
        try:
            min_value = series.min(skipna=True)
            max_value = series.max(skipna=True)
        except TypeError:
            # Values that cannot be ordered (e.g. mixed-type object columns)
            # have no min/max; keep the preview going instead of aborting.
            min_value = max_value = pd.NA
        non_null = series.dropna()
        sample_n = min(sample_size, len(non_null))
        samples_list: list[object] = []
        if sample_n > 0:
            seed = None if random_state is None else random_state + idx
            samples_list = non_null.sample(n=sample_n, random_state=seed, replace=False).tolist()
        record: dict[str, object] = {
            "column_name": column,
            "row_count": row_count,
            "not_null_count": not_null_count,
            "not_null_rate_pct": round(not_null_rate, 2),
            "min": min_value,
            "max": max_value,
        }
        # Pad so that every record exposes the same sample columns.
        padding = [pd.NA] * (len(sample_columns) - len(samples_list))
        record.update(zip(sample_columns, samples_list + padding))
        overview_records.append(record)
    return pd.DataFrame(overview_records)


def display_column_overview(
    df: pd.DataFrame | None,
    label: str,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    chunk_size: int = COLUMN_OVERVIEW_CHUNK_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> None:
    """Print a titled per-column summary of ``df`` and render it in chunks.

    A header line containing ``label`` is always printed. If ``df`` is
    ``None`` or has no columns, a notice is printed and nothing is rendered.
    Otherwise the table from ``build_column_overview`` is displayed
    ``chunk_size`` rows at a time, each chunk preceded by its 1-based range.

    Args:
        df: Frame to summarise, or ``None``.
        label: Text shown in the section header.
        sample_size: Forwarded to ``build_column_overview``.
        chunk_size: Number of overview rows per ``display`` call.
        random_state: Forwarded to ``build_column_overview``.
    """
    print(f"\n--- {label}: カラムサマリー ---")
    if df is None:
        print("⚠️ DataFrame が None のためサマリーを表示できません。")
        return

    # Only build the summary when there is at least one column to describe.
    has_columns = df.shape[1] != 0
    summary = (
        build_column_overview(df=df, sample_size=sample_size, random_state=random_state)
        if has_columns
        else None
    )
    if summary is None or summary.empty:
        print("(列が存在しません)")
        return

    n_columns = len(summary)
    print(f"{n_columns} columns")
    for first in range(0, n_columns, chunk_size):
        last = min(first + chunk_size, n_columns)
        print(f"columns {first + 1}-{last} / {n_columns}")
        display(summary.iloc[first:last])


def inspect_dataset(label: str, path: Path, head_rows: int = HEAD_ROWS) -> None:
    """Load one parquet file and print/render a preview of its contents.

    Prints the dataset's path, shape and the dtypes of its first 20 columns,
    renders the first ``head_rows`` rows transposed (one line per column),
    and finishes with the per-column overview. A missing file only produces
    a warning.

    Args:
        label: Name shown in the section headers.
        path: Location of the parquet file.
        head_rows: Number of leading rows to render.
    """
    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return
    df = pd.read_parquet(path)
    try:
        shown_path = path.relative_to(PROJECT_ROOT)
    except ValueError:
        # The file lives outside the project root; show the path as given
        # rather than failing the preview over a cosmetic detail.
        shown_path = path
    print(f"\n=== {label} ===")
    print(f"path: {shown_path}")
    print(f"shape: {df.shape[0]} rows x {df.shape[1]} cols")
    dtype_info = df.dtypes.astype(str)
    print("dtypes (first 20 columns):")
    print(dtype_info.head(20))
    if len(dtype_info) > 20:
        print("... (truncated)")
    sample = df.head(head_rows)
    # Transposed so that wide frames read top-to-bottom, one column per line.
    display(sample.T)
    display_column_overview(df, label=f"{label} dataset")


In [4]:
# Preview every (label, path) entry registered in DATASETS, in list order.
for label, path in DATASETS:
    inspect_dataset(label, path)



=== train ===
path: data/interim/04_01_join_population_projection/train_population_features.parquet
shape: 363924 rows x 6 cols
dtypes (first 20 columns):
data_id                   int64
mesh_id_1km              string
mesh_population_2025    float64
mesh_population_2035    float64
mesh_population_2045    float64
mesh_population_2055    float64
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0
mesh_id_1km,52364550.0,52364581.0,52364581.0,52364700.0,52364700.0,52363762.0,52364609.0,52365526.0,52363753.0,52363752.0,52363752.0,52363790.0,52364700.0,52363689.0,52363752.0,52363752.0,52363752.0,52363752.0,52363678.0,52363679.0
mesh_population_2025,3682.9554,3054.8402,3054.8402,4594.915,4594.915,3554.5238,869.4974,1620.7779,4580.3431,4814.8464,4814.8464,2782.5418,4594.915,8726.9877,4814.8464,4814.8464,4814.8464,4814.8464,1630.4829,6907.0917
mesh_population_2035,3638.1363,2985.6286,2985.6286,4510.4234,4510.4234,3381.6847,834.9253,1478.301,4387.0352,4597.3614,4597.3614,2750.5292,4510.4234,8275.6335,4597.3614,4597.3614,4597.3614,4597.3614,1419.9893,6883.7572
mesh_population_2045,3429.8858,2878.4524,2878.4524,4325.702,4325.702,3154.1957,794.8922,1326.2375,4129.9772,4321.1194,4321.1194,2676.1248,4325.702,7660.0349,4321.1194,4321.1194,4321.1194,4321.1194,1283.4602,6729.2419
mesh_population_2055,3102.9278,2723.2881,2723.2881,4085.6396,4085.6396,2921.3776,760.967,1181.5224,3876.4705,4012.5919,4012.5919,2555.0828,4085.6396,7043.686,4012.5919,4012.5919,4012.5919,4012.5919,1183.8843,6467.9783



--- train dataset: カラムサマリー ---
6 columns
columns 1-6 / 6


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,363924,363924,100.0,0.0,363923.0,84721.0,38497.0,115179.0,68594.0,713.0
1,mesh_id_1km,363924,363924,100.0,39271504.0,65436725.0,57403674.0,53391593.0,52366730.0,53395623.0,50306620.0
2,mesh_population_2025,363924,363905,99.99,0.0,33642.3438,9615.871,1863.7343,6305.5763,9151.9775,2659.7444
3,mesh_population_2035,363924,363905,99.99,0.0,35614.0103,7734.2369,9655.7151,5925.7348,3876.0673,3353.41
4,mesh_population_2045,363924,363905,99.99,0.0,37207.3593,5044.0161,12688.1655,7865.243,18917.4302,17236.8343
5,mesh_population_2055,363924,363905,99.99,0.0,37370.2454,7175.9474,20047.4479,11860.6648,5332.16,5805.7908



=== test ===
path: data/interim/04_01_join_population_projection/test_population_features.parquet
shape: 112437 rows x 6 cols
dtypes (first 20 columns):
data_id                  string
mesh_id_1km              string
mesh_population_2025    float64
mesh_population_2035    float64
mesh_population_2045    float64
mesh_population_2055    float64
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0
mesh_id_1km,52364585.0,52364573.0,52363628.0,52364700.0,52364560.0,52364582.0,52365516.0,52365515.0,52363771.0,52363689.0,52363752.0,52363753.0,52367272.0,53360312.0,53360277.0,50302177.0,53361219.0,52367272.0,52367272.0,53360268.0
mesh_population_2025,3359.3868,3444.6715,3465.9083,4594.915,1950.2567,4052.8434,1598.1301,1283.0887,150.2899,8726.9877,4814.8464,4580.3431,1948.1817,848.0171,886.9833,7155.2983,149.3747,1948.1817,1948.1817,979.6045
mesh_population_2035,3134.8871,3235.3916,2912.5785,4510.4234,1846.3354,4076.9501,1510.0884,1173.2617,128.9409,8275.6335,4597.3614,4387.0352,1879.0298,740.8564,855.4872,7240.5861,122.4596,1879.0298,1879.0298,868.7637
mesh_population_2045,2860.1611,3041.352,2451.729,4325.702,1699.1221,4037.7735,1404.5049,1084.7826,107.8596,7660.0349,4321.1194,4129.9772,1755.7996,638.8948,787.42,7128.8639,100.3472,1755.7996,1755.7996,813.3867
mesh_population_2055,2519.7713,2822.9217,2174.2312,4085.6396,1539.4938,3840.011,1285.2264,968.1698,96.0618,7043.686,4012.5919,3876.4705,1582.3166,542.3827,691.4669,6985.2512,81.3228,1582.3166,1582.3166,736.0981



--- test dataset: カラムサマリー ---
6 columns
columns 1-6 / 6


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,112437,112437,100.0,0.0,99999.0,80075.0,86027.0,5668.0,111827.0,9382.0
1,mesh_id_1km,112437,112437,100.0,39271516.0,66441044.0,53393660.0,53394355.0,53393577.0,52365646.0,64414337.0
2,mesh_population_2025,112437,112435,100.0,0.0,33642.3438,2798.7162,3554.9588,7077.2231,13342.4193,1427.3357
3,mesh_population_2035,112437,112435,100.0,0.0,35614.0103,14681.6744,9297.8871,12453.4843,20174.9648,30118.2202
4,mesh_population_2045,112437,112435,100.0,0.0,37207.3593,67.8886,6682.3368,6363.0047,13297.2222,7140.5047
5,mesh_population_2055,112437,112435,100.0,0.0,37370.2454,1472.0742,7616.2383,11619.463,6129.3371,7981.1152
