# 01_02_join_population_projection outputs

Automated preview of deterministic pipeline outputs saved under `data/interim`. Update the `DATASETS` list if the step emits new files.


In [None]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Location of this notebook, written relative to the directory the notebook
# is expected to be launched from (presumably the repository root - confirm).
NOTEBOOK_RELATIVE_PATH = Path(
    "notebooks/data/interim/01_02_join_population_projection/preview.ipynb"
)
try:
    # Available when the code runs as a regular Python file.
    NOTEBOOK_PATH = Path(__file__).resolve()
except NameError:
    # ``__file__`` is undefined in an interactive session, so rebuild the
    # absolute path from the current working directory instead.
    NOTEBOOK_PATH = Path.cwd().joinpath(NOTEBOOK_RELATIVE_PATH).resolve()

# Rendering limits for displayed frames: no cap on columns, 400 rows at most.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", None)


def resolve_project_root(notebook_path: Path) -> Path:
    for candidate in notebook_path.parents:
        if (candidate / "data").exists() and (candidate / "src").exists():
            return candidate
    return notebook_path.parents[-1]


# Repository root, discovered by walking up from the notebook's location.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)

# (label, parquet path) pairs to preview; add a name to the tuple below when
# the step starts emitting another "<name>.parquet" file.
DATASETS = [
    (
        split_name,
        PROJECT_ROOT.joinpath(
            "data",
            "interim",
            "01_02_join_population_projection",
            f"{split_name}.parquet",
        ),
    )
    for split_name in ("train", "test")
]

# Number of leading rows shown in the transposed preview of each dataset.
HEAD_ROWS = 20
# Number of random non-null example values reported per column.
COLUMN_OVERVIEW_SAMPLE_SIZE = 5
# Number of overview rows (one per column) displayed per chunk.
COLUMN_OVERVIEW_CHUNK_SIZE = 200
# Base seed for sampling example values, so reruns show the same samples.
COLUMN_OVERVIEW_RANDOM_SEED = 314_159


def _column_extreme(series: pd.Series, method: str) -> object:
    """Return ``series.<method>(skipna=True)``, or ``pd.NA`` if it cannot be computed.

    ``method`` is ``"min"`` or ``"max"``. Columns whose values cannot be
    ordered against each other (for example an object column mixing strings
    and numbers) make pandas raise; the overview reports a missing extreme for
    such a column instead of aborting the summary of every other column.
    """
    try:
        return getattr(series, method)(skipna=True)
    except (TypeError, ValueError):
        return pd.NA


def build_column_overview(
    df: pd.DataFrame,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> pd.DataFrame:
    """Summarise every column of ``df`` as one row of an overview table.

    Args:
        df: Frame to describe.
        sample_size: Number of ``random_sample_<k>`` columns to produce, i.e.
            the maximum number of example values drawn per column.
        random_state: Base seed for sampling. The column's position is added
            to it so each column uses its own reproducible seed. ``None``
            leaves the sampling unseeded.

    Returns:
        A frame with one row per column of ``df`` holding ``column_name``,
        ``row_count``, ``not_null_count``, ``not_null_rate_pct`` (percentage
        rounded to two decimals, ``0.0`` for a frame without rows), ``min``,
        ``max`` and ``random_sample_1`` .. ``random_sample_<sample_size>``.
        Sample slots that cannot be filled, and extremes that cannot be
        computed, hold ``pd.NA``. A frame without columns yields an empty
        result.
    """
    row_count = len(df)
    sample_columns = [f"random_sample_{i + 1}" for i in range(sample_size)]
    overview_records: list[dict[str, object]] = []
    for idx, column in enumerate(df.columns):
        # Select by position: ``df[column]`` would return a DataFrame rather
        # than a Series when the same column label occurs more than once.
        series = df.iloc[:, idx]
        non_null = series.dropna()
        not_null_count = len(non_null)
        not_null_rate = (not_null_count / row_count * 100) if row_count else 0.0

        # Never request more samples than there are non-null values, because
        # sampling without replacement would raise otherwise.
        sample_n = min(sample_size, not_null_count)
        samples_list: list[object] = []
        if sample_n > 0:
            seed = None if random_state is None else random_state + idx
            samples_list = non_null.sample(n=sample_n, random_state=seed, replace=False).tolist()

        record: dict[str, object] = {
            "column_name": column,
            "row_count": row_count,
            "not_null_count": not_null_count,
            "not_null_rate_pct": round(not_null_rate, 2),
            "min": _column_extreme(series, "min"),
            "max": _column_extreme(series, "max"),
        }
        # Pad with pd.NA so every record exposes the same sample columns.
        padding = [pd.NA] * (len(sample_columns) - len(samples_list))
        record.update(zip(sample_columns, samples_list + padding))
        overview_records.append(record)
    return pd.DataFrame(overview_records)


def display_column_overview(
    df: pd.DataFrame | None,
    label: str,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    chunk_size: int = COLUMN_OVERVIEW_CHUNK_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> None:
    """Print a header for ``label`` and display the per-column overview of ``df``.

    Args:
        df: Frame to summarise. ``None`` is reported with a warning line.
        label: Name shown in the section header.
        sample_size: Forwarded to ``build_column_overview``.
        chunk_size: Number of overview rows displayed per chunk.
        random_state: Forwarded to ``build_column_overview``.

    A frame without columns, or an empty overview, is reported with a single
    message line instead of a table.
    """
    no_columns_message = "(列が存在しません)"

    print(f"\n--- {label}: カラムサマリー ---")
    if df is None:
        print("⚠️ DataFrame が None のためサマリーを表示できません。")
        return
    if df.shape[1] == 0:
        print(no_columns_message)
        return

    summary = build_column_overview(df=df, sample_size=sample_size, random_state=random_state)
    if summary.empty:
        print(no_columns_message)
        return

    column_total = len(summary)
    print(f"{column_total} columns")
    # Show the overview in consecutive slices of at most ``chunk_size`` rows.
    for first in range(0, column_total, chunk_size):
        last = first + chunk_size
        if last > column_total:
            last = column_total
        print(f"columns {first + 1}-{last} / {column_total}")
        display(summary.iloc[first:last])


def inspect_dataset(label: str, path: Path, head_rows: int = HEAD_ROWS) -> None:
    """Load the parquet file at ``path`` and print/display a preview of it.

    Args:
        label: Name shown in the section header and in the column overview.
        path: Parquet file to read. A missing file is reported with a warning
            line and nothing else is shown.
        head_rows: Number of leading rows included in the transposed preview.

    The output consists of the path relative to ``PROJECT_ROOT``, the shape,
    the dtypes of the first 20 columns, the transposed head of the frame and
    the per-column overview.
    """
    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return

    frame = pd.read_parquet(path)
    n_rows, n_cols = frame.shape

    print(f"\n=== {label} ===")
    print(f"path: {path.relative_to(PROJECT_ROOT)}")
    print(f"shape: {n_rows} rows x {n_cols} cols")

    # Dtypes are rendered as plain strings; only the first 20 are listed.
    dtype_names = frame.dtypes.astype(str)
    print("dtypes (first 20 columns):")
    print(dtype_names.head(20))
    if len(dtype_names) > 20:
        print("... (truncated)")

    # Transposed so that each column of the dataset becomes one displayed row.
    display(frame.head(head_rows).T)
    display_column_overview(frame, label=f"{label} dataset")


In [2]:
# Preview every configured dataset in order; a file that does not exist is
# reported with a warning by inspect_dataset and skipped.
for label, path in DATASETS:
    inspect_dataset(label, path)



=== train ===
path: data/interim/01_02_join_population_projection/train.parquet
shape: 363924 rows x 141 cols
dtypes (first 20 columns):
data_id                       int64
target_ym                     int64
money_room                    int64
building_id                   int64
building_status               int64
building_create_date         string
building_modify_date         string
building_type                 int64
building_name                string
homes_building_name          string
homes_building_name_ruby     string
unit_count                  float64
full_address                 string
lon                         float64
lat                         float64
building_structure          float64
total_floor_area            float64
building_area               float64
floor_count                 float64
basement_floor_count        float64
dtype: object
... (truncated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
target_ym,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901,201901
money_room,13980000,24480000,24480000,16300000,18800000,9000000,9900000,5400000,14500000,11680000,14800000,22800000,16900000,16000000,16800000,16500000,16700000,14480000,32000000,16000000
building_id,206271,83315,140201,216551,134968,39357,88925,87743,175661,43281,136298,166012,123843,61017,28221,167422,101182,101182,203963,47201
building_status,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,1,1
building_create_date,2014-06-27 21:09:41,2014-06-27 21:09:43,2014-06-27 21:09:43,2014-06-27 19:24:11,2014-06-27 19:24:12,2014-06-27 19:24:12,2014-06-27 19:24:12,2014-06-27 21:09:52,2014-06-27 19:24:12,2014-06-27 19:24:12,2014-06-27 19:24:12,2014-06-27 19:24:14,2014-06-27 19:24:14,2014-06-27 19:24:15,2014-06-27 19:24:16,2014-06-27 19:24:16,2014-06-27 19:24:16,2014-06-27 19:24:16,2014-06-27 19:24:17,2014-06-27 19:24:18
building_modify_date,2019-01-24 00:10:08,2019-04-03 00:10:08,2020-06-06 00:10:11,2019-04-26 00:10:08,2025-09-15 02:49:23,2019-05-22 00:10:08,2020-10-26 00:10:05,2019-02-05 00:10:08,2019-02-06 00:10:08,2019-03-19 00:10:09,2019-05-26 00:10:09,2019-03-12 00:10:08,2025-09-15 02:49:23,2022-06-01 00:10:05,2020-03-08 00:10:10,2019-01-19 00:10:09,2025-09-15 02:49:23,2025-09-15 02:49:23,2022-06-01 00:10:05,2020-12-01 00:10:05
building_type,4,4,4,4,1,4,4,999,4,4,999,4,1,4,4,4,1,1,4,4
building_name,,,,,ロイヤル知多寺本,,,,,,,,ユーハウス知多寺本,,,,リビオ巽が丘,リビオ巽が丘,,
homes_building_name,桑名市東正和台7丁目10-11,桑名市松ノ木5丁目,桑名市松ノ木七丁目,中古戸建 知多市八幡字荒井,ロイヤル知多寺本,知多市八幡字大平地,中古戸建 知多市八幡字小根,,中古戸建 知多市八幡字笹廻間,中古戸建 知多市八幡字笹廻間,,中古戸建 知多市八幡堀之内,ユーハウス知多寺本,中古戸建 知多市八幡新町3丁目,知多市西巽が丘一丁目,中古戸建 知多市西巽が丘1丁目,リビオ巽ヶ丘,リビオ巽ヶ丘,中古戸建 知多市長浦1丁目,知多市新知東町



=== test ===
path: data/interim/01_02_join_population_projection/test.parquet
shape: 112437 rows x 141 cols
dtypes (first 20 columns):
data_id                      string
id                            int64
target_ym                     int64
building_id                   int64
building_status               int64
building_create_date         string
building_modify_date         string
building_type                 int64
building_name                string
homes_building_name          string
homes_building_name_ruby     string
unit_count                  float64
full_address                 string
lon                         float64
lat                         float64
building_structure          float64
total_floor_area            float64
building_area               float64
floor_count                 float64
basement_floor_count        float64
dtype: object
... (truncated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
target_ym,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301,202301
building_id,129053,47690,130646,52506,62277,97770,214102,179871,148834,105685,108964,100739,112943,158994,118495,68675,81076,123056,200406,122307
building_status,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,1,1,1,1
building_create_date,2014-06-27 21:09:41,2014-06-27 21:09:42,2014-06-27 19:24:11,2014-06-27 19:24:12,2014-06-27 21:09:46,2014-06-27 21:09:46,2014-06-27 21:09:53,2014-06-27 21:09:54,2014-06-27 19:24:14,2014-06-27 19:24:16,2014-06-27 19:24:16,2014-06-27 19:24:16,2014-06-27 21:11:22,2014-06-27 21:11:23,2014-06-27 21:11:24,2014-06-27 19:24:21,2014-06-27 21:11:25,2014-06-27 21:11:25,2014-06-27 21:11:26,2014-06-27 21:11:26
building_modify_date,2025-10-02 00:45:07,2023-02-11 00:10:05,2025-06-28 00:45:08,2023-03-05 00:10:05,2023-04-06 00:10:05,2023-03-20 00:10:05,2024-03-27 00:10:05,2025-09-15 02:51:02,2023-01-31 00:10:05,2022-10-25 00:10:04,2023-04-25 00:10:05,2025-08-17 00:45:08,2025-03-11 00:45:08,2023-08-25 00:10:06,2023-08-30 00:10:05,2024-12-09 15:30:56,2023-07-04 00:10:05,2025-09-23 00:45:08,2025-09-15 02:52:27,2023-01-07 00:10:05
building_type,1,4,4,4,4,4,4,1,999,4,4,4,4,999,4,4,999,1,1,4
building_name,スペリア桑名三番館,,,,,,,ラドーニ長島風の館,,,,,,,,,,エネッツ米原,サンビレッジ米原,
homes_building_name,スペリア桑名三番館,桑名市松並町二丁目 ミサワホーム中古住宅,知多市南粕谷4丁目 戸建て,,赤尾台八丁目 戸建,桑名市新西方五丁目 定期借地権付ミサワホーム中古住宅,中古戸建 桑名市長島町東殿名字木曽,ラドーニ長島 風の館,,知多市八幡新町三丁目ストックヘーベルハウス,中古戸建 知多市巽が丘1丁目,巽が丘1丁目 中古戸建,米原市入江中古戸建,,米原市坂口 戸建て,糸島市潤 潤2期 新築戸建,,エネッツ米原 1階,サンビレッジ米原 3階,米原市野一色 中古戸建
