# 05_01_join_land_price outputs

Pipeline ステップ `join_land_price` の決定的な出力を素早く点検するためのノート。
`data/interim/05_01_join_land_price/` 配下の train/test Parquet を読み込み、
先頭データと列サマリーを確認する。


In [None]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Show every column and up to 400 rows when pandas renders a frame.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 400)

# Location of this notebook relative to the directory the kernel is
# expected to be started from.
NOTEBOOK_RELATIVE_PATH = Path(
    "notebooks/data/interim/05_01_join_land_price/preview.ipynb"
)

# `__file__` only exists when this code runs as a script/module; in an
# interactive session the path is rebuilt from the current working directory.
NOTEBOOK_PATH = (
    Path(__file__)
    if "__file__" in globals()
    else Path.cwd() / NOTEBOOK_RELATIVE_PATH
).resolve()


def resolve_project_root(notebook_path: Path) -> Path:
    for candidate in notebook_path.parents:
        if (candidate / "data").exists() and (candidate / "src").exists():
            return candidate
    return notebook_path.parents[-1]


# Root of the repository, discovered from the notebook location.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)

# (label, parquet path) pairs for each split written by the pipeline step.
DATASETS = [
    (
        split,
        PROJECT_ROOT.joinpath(
            "data", "interim", "05_01_join_land_price", f"{split}.parquet"
        ),
    )
    for split in ("train", "test")
]
HEAD_ROWS = 20
COLUMN_OVERVIEW_SAMPLE_SIZE = 5
COLUMN_OVERVIEW_CHUNK_SIZE = 200
COLUMN_OVERVIEW_RANDOM_SEED = 314159


def build_column_overview(
    df: pd.DataFrame,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> pd.DataFrame:
    row_count = len(df)
    overview_records: list[dict[str, object]] = []
    sample_columns = [f"random_sample_{i + 1}" for i in range(sample_size)]
    for idx, column in enumerate(df.columns):
        series = df[column]
        not_null_count = int(series.notna().sum())
        not_null_rate = (not_null_count / row_count * 100) if row_count else 0.0
        min_value = series.min(skipna=True)
        max_value = series.max(skipna=True)
        non_null = series.dropna()
        sample_n = min(sample_size, len(non_null))
        samples_list: list[object] = []
        if sample_n > 0:
            seed = None if random_state is None else random_state + idx
            samples_list = non_null.sample(n=sample_n, random_state=seed, replace=False).tolist()
        record: dict[str, object] = {
            "column_name": column,
            "row_count": row_count,
            "not_null_count": not_null_count,
            "not_null_rate_pct": round(not_null_rate, 2),
            "min": min_value,
            "max": max_value,
        }
        for sample_idx, col_name in enumerate(sample_columns):
            record[col_name] = samples_list[sample_idx] if sample_idx < len(samples_list) else pd.NA
        overview_records.append(record)
    return pd.DataFrame(overview_records)


def display_column_overview(
    df: pd.DataFrame | None,
    label: str,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    chunk_size: int = COLUMN_OVERVIEW_CHUNK_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> None:
    """Print a heading for ``label`` and render the column overview of ``df``.

    The overview produced by ``build_column_overview`` is shown in slices of
    ``chunk_size`` rows, each preceded by a line giving the 1-based range of
    columns it covers. A ``None`` frame or a frame without columns only
    produces a short notice.
    """
    no_columns_notice = "(列が存在しません)"

    print(f"\n--- {label}: カラムサマリー ---")
    if df is None:
        print("⚠️ DataFrame が None のためサマリーを表示できません。")
        return
    if not df.shape[1]:
        print(no_columns_notice)
        return

    summary = build_column_overview(
        df=df, sample_size=sample_size, random_state=random_state
    )
    if summary.empty:
        print(no_columns_notice)
        return

    column_total = len(summary)
    print(f"{column_total} columns")
    for offset in range(0, column_total, chunk_size):
        page = summary.iloc[offset : offset + chunk_size]
        last_shown = offset + len(page)
        print(f"columns {offset + 1}-{last_shown} / {column_total}")
        display(page)


def inspect_dataset(label: str, path: Path, head_rows: int = HEAD_ROWS) -> None:
    """Load one Parquet file and print/display a quick inspection of it.

    Prints the path, shape and the dtypes of the first columns, displays the
    transposed first ``head_rows`` rows, then renders the per-column overview
    via ``display_column_overview``.

    Args:
        label: Name used in the printed headings.
        path: Parquet file to read. A missing file is reported with a warning
            line and nothing else is done.
        head_rows: Number of leading rows to display.
    """
    dtype_preview_limit = 20

    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return
    df = pd.read_parquet(path)
    print(f"\n=== {label} ===")
    try:
        shown_path = path.relative_to(PROJECT_ROOT)
    except ValueError:
        # The file lives outside PROJECT_ROOT; show it as given rather than
        # aborting the inspection.
        shown_path = path
    print(f"path: {shown_path}")
    print(f"shape: {df.shape[0]} rows x {df.shape[1]} cols")
    dtype_info = df.dtypes.astype(str)
    print("dtypes (first 20 columns):")
    print(dtype_info.head(dtype_preview_limit))
    if len(dtype_info) > dtype_preview_limit:
        print("... (truncated)")
    sample = df.head(head_rows)
    # Transposed so that each source column becomes one row of the preview.
    display(sample.T)
    display_column_overview(df, label=f"{label} dataset")


In [None]:
# Inspect every (label, path) pair listed in DATASETS, in order.
for label, path in DATASETS:
    inspect_dataset(label, path)



=== train ===
path: data/interim/05_01_join_land_price/train.parquet
shape: 363924 rows x 18 cols
dtypes (first 20 columns):
data_id                    int64
bukken_type                Int64
bukken_type_label         string
2023_land_price          Float64
2022_land_price          Float64
2021_land_price          Float64
2020_land_price          Float64
2019_land_price          Float64
2023_land_usage_code      string
2022_land_usage_code      string
2021_land_usage_code      string
2020_land_usage_code      string
2019_land_usage_code      string
2023_land_distance_km    Float64
2022_land_distance_km    Float64
2021_land_distance_km    Float64
2020_land_distance_km    Float64
2019_land_distance_km    Float64
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
bukken_type,1202,1202,1202,1202,1302,1202,1202,1202,1202,1202,1202,1202,1302,1202,1202,1202,1302,1302,1202,1202
bukken_type_label,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,mansion,mansion,kodate,kodate
2023_land_price,40100.0,52700.0,52700.0,135000.0,135000.0,72400.0,108000.0,40200.0,68500.0,68500.0,68500.0,108000.0,135000.0,108000.0,68500.0,68500.0,83200.0,83200.0,65300.0,85100.0
2022_land_price,40000.0,52500.0,52500.0,122000.0,122000.0,72000.0,104000.0,40600.0,68000.0,68000.0,68000.0,104000.0,122000.0,104000.0,68000.0,68000.0,83200.0,83200.0,65300.0,82500.0
2021_land_price,40000.0,52500.0,52500.0,119000.0,119000.0,70900.0,102000.0,41500.0,67600.0,67600.0,67600.0,102000.0,119000.0,102000.0,67600.0,67600.0,83200.0,83200.0,66200.0,82000.0
2020_land_price,40100.0,52700.0,52700.0,119000.0,119000.0,70900.0,102000.0,42500.0,67800.0,67800.0,67800.0,102000.0,119000.0,102000.0,67800.0,67800.0,83500.0,83500.0,67600.0,82400.0
2019_land_price,40100.0,52700.0,52700.0,121000.0,121000.0,71500.0,103000.0,43600.0,68300.0,68300.0,68300.0,103000.0,121000.0,103000.0,68300.0,68300.0,84500.0,84500.0,69500.0,83000.0
2023_land_usage_code,1中専,1中専,1中専,商業,商業,1低専,1低専,1低専,1中専,1中専,1中専,1低専,商業,1低専,1中専,1中専,近商,近商,1中専,1低専
2022_land_usage_code,1中専,1中専,1中専,商業,商業,1低専,1低専,1低専,1中専,1中専,1中専,1低専,商業,1低専,1中専,1中専,近商,近商,1中専,1低専



--- train dataset: カラムサマリー ---
18 columns
columns 1-18 / 18


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,363924,363924,100.0,0,363923,84721,38497,115179,68594,713
1,bukken_type,363924,363924,100.0,1202,1302,1202,1302,1302,1302,1302
2,bukken_type_label,363924,363924,100.0,kodate,mansion,mansion,kodate,mansion,kodate,mansion
3,2023_land_price,363924,362133,99.51,1530.0,27800000.0,239000.0,91200.0,56300.0,28800.0,85900.0
4,2022_land_price,363924,362133,99.51,1560.0,27300000.0,92300.0,130000.0,52400.0,11800.0,184000.0
5,2021_land_price,363924,362125,99.51,1590.0,27000000.0,272000.0,960000.0,125000.0,66200.0,353000.0
6,2020_land_price,363924,362115,99.5,1630.0,27000000.0,1700000.0,203000.0,26200.0,388000.0,491000.0
7,2019_land_price,363924,362118,99.5,1670.0,26600000.0,81400.0,155000.0,24100.0,73400.0,440000.0
8,2023_land_usage_code,363924,345350,94.9,1中専,近商,1低専,1中専,1低専,1中専,1低専
9,2022_land_usage_code,363924,345219,94.86,1中専,近商,1中専,近商,商業,2住居,商業



=== test ===
path: data/interim/05_01_join_land_price/test.parquet
shape: 112437 rows x 18 cols
dtypes (first 20 columns):
data_id                   string
bukken_type                Int64
bukken_type_label         string
2023_land_price          Float64
2022_land_price          Float64
2021_land_price          Float64
2020_land_price          Float64
2019_land_price          Float64
2023_land_usage_code      string
2022_land_usage_code      string
2021_land_usage_code      string
2020_land_usage_code      string
2019_land_usage_code      string
2023_land_distance_km    Float64
2022_land_distance_km    Float64
2021_land_distance_km    Float64
2020_land_distance_km    Float64
2019_land_distance_km    Float64
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
bukken_type,1302,1202,1202,1202,1202,1202,1202,1302,1202,1202,1202,1202,1202,1202,1202,1202,1202,1302,1302,1202
bukken_type_label,mansion,kodate,kodate,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,kodate,kodate,kodate,kodate,kodate,kodate,mansion,mansion,kodate
2023_land_price,95400.0,95500.0,34800.0,135000.0,52700.0,95500.0,44200.0,44200.0,72400.0,108000.0,94500.0,83200.0,69000.0,20200.0,25400.0,74700.0,5200.0,73500.0,73500.0,18100.0
2022_land_price,94800.0,92200.0,35000.0,122000.0,52500.0,92200.0,45000.0,45000.0,72000.0,104000.0,94500.0,83200.0,68700.0,20600.0,25900.0,68500.0,5350.0,73500.0,73500.0,18100.0
2021_land_price,95000.0,90200.0,35300.0,119000.0,52500.0,90200.0,46200.0,46200.0,70900.0,102000.0,94500.0,83200.0,68700.0,21000.0,26700.0,63500.0,5550.0,73800.0,73800.0,18100.0
2020_land_price,96000.0,89800.0,35800.0,119000.0,52700.0,89800.0,47700.0,47700.0,70900.0,102000.0,96000.0,83500.0,68700.0,21400.0,27500.0,60500.0,5750.0,74200.0,74200.0,18300.0
2019_land_price,96000.0,88800.0,36700.0,121000.0,52700.0,88800.0,48700.0,48700.0,71500.0,103000.0,84500.0,84500.0,68700.0,21900.0,28300.0,58000.0,5900.0,74600.0,74600.0,18700.0
2023_land_usage_code,商業,1低専,,商業,1中専,1低専,近商,近商,1低専,1低専,近商,近商,1住居,1中専,,1住居,,近商,近商,
2022_land_usage_code,商業,1低専,,商業,1中専,1低専,近商,近商,1低専,1低専,近商,近商,1住居,1中専,,1住居,,近商,近商,



--- test dataset: カラムサマリー ---
18 columns
columns 1-18 / 18


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,112437,112437,100.0,0,99999,80075,86027,5668,111827,9382
1,bukken_type,112437,112437,100.0,1202,1302,1302,1202,1302,1202,1302
2,bukken_type_label,112437,112437,100.0,kodate,mansion,kodate,mansion,mansion,kodate,mansion
3,2023_land_price,112437,111961,99.58,2350.0,20300000.0,164000.0,72700.0,332000.0,106000.0,198000.0
4,2022_land_price,112437,111969,99.58,2400.0,19600000.0,189000.0,295000.0,2090000.0,73200.0,88500.0
5,2021_land_price,112437,111971,99.59,2450.0,19500000.0,320000.0,173000.0,270000.0,191000.0,130000.0
6,2020_land_price,112437,111972,99.59,2500.0,23300000.0,170000.0,167000.0,243000.0,43500.0,158000.0
7,2019_land_price,112437,111970,99.58,2220.0,24400000.0,25000.0,30300.0,29000.0,176000.0,525000.0
8,2023_land_usage_code,112437,107034,95.19,1中専,近商,近商,商業,商業,1低専,1住居
9,2022_land_usage_code,112437,107011,95.17,1中専,近商,1低専,商業,近商,1住居,1低専
