# 0002_mansion / 0006_same_unit_id プレビュー

`data/processed/0002_mansion/0006_same_unit_id` の train/test Parquet をざっと検品するノートです。列構成とサンプル値を人間がすぐ確認できるようにしてあります。



In [1]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Widen pandas' display limits so wide previews are not elided:
# no cap on the number of columns, and up to 400 rows.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", None)

# Location of this notebook relative to the working directory used in the
# interactive fallback below.
# NOTE(review): presumably that working directory is the repository root - confirm.
NOTEBOOK_RELATIVE_PATH = (
    Path("notebooks") / "data" / "processed" / "0002_mansion" / "0006_same_unit_id" / "preview.ipynb"
)
try:
    _notebook_location = Path(__file__)
except NameError:  # `__file__` is not defined, e.g. when running interactively
    _notebook_location = Path.cwd() / NOTEBOOK_RELATIVE_PATH
# Absolute, normalised path used as the starting point for locating the project.
NOTEBOOK_PATH = _notebook_location.resolve()


def resolve_project_root(notebook_path: Path) -> Path:
    for candidate in notebook_path.parents:
        if (candidate / "data").exists() and (candidate / "src").exists():
            return candidate
    return notebook_path.parents[-1]


# Project root inferred from the notebook's own location.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)
# Directory holding the processed train/test Parquet files previewed here.
_DATASET_DIR = PROJECT_ROOT / "data" / "processed" / "0002_mansion" / "0006_same_unit_id"
# (label, parquet path) pairs, in the order they are inspected.
DATASETS = [(split, _DATASET_DIR / f"{split}.parquet") for split in ("train", "test")]
# Default number of leading rows shown for each dataset.
HEAD_ROWS = 20
COLUMN_OVERVIEW_SAMPLE_SIZE = 5
COLUMN_OVERVIEW_CHUNK_SIZE = 200
COLUMN_OVERVIEW_RANDOM_SEED = 314159


def build_column_overview(
    df: pd.DataFrame,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> pd.DataFrame:
    row_count = len(df)
    overview_records: list[dict[str, object]] = []
    sample_columns = [f"random_sample_{i + 1}" for i in range(sample_size)]
    for idx, column in enumerate(df.columns):
        series = df[column]
        not_null_count = int(series.notna().sum())
        not_null_rate = (not_null_count / row_count * 100) if row_count else 0.0
        min_value = series.min(skipna=True)
        max_value = series.max(skipna=True)
        non_null = series.dropna()
        sample_n = min(sample_size, len(non_null))
        samples_list: list[object] = []
        if sample_n > 0:
            seed = None if random_state is None else random_state + idx
            samples_list = non_null.sample(n=sample_n, random_state=seed, replace=False).tolist()
        record: dict[str, object] = {
            "column_name": column,
            "row_count": row_count,
            "not_null_count": not_null_count,
            "not_null_rate_pct": round(not_null_rate, 2),
            "min": min_value,
            "max": max_value,
        }
        for sample_idx, col_name in enumerate(sample_columns):
            record[col_name] = samples_list[sample_idx] if sample_idx < len(samples_list) else pd.NA
        overview_records.append(record)
    return pd.DataFrame(overview_records)


def display_column_overview(
    df: pd.DataFrame | None,
    label: str,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    chunk_size: int = COLUMN_OVERVIEW_CHUNK_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> None:
    """Print a heading for ``label`` and display the column overview of ``df``.

    The overview comes from ``build_column_overview`` and is displayed in
    slices of at most ``chunk_size`` rows, each preceded by a line stating
    which columns the slice covers. A ``None`` frame, or one without columns,
    only produces a short notice.
    """
    print(f"\n--- {label}: カラムサマリー ---")
    if df is None:
        print("⚠️ DataFrame が None のためサマリーを表示できません。")
        return

    summary = None
    if df.shape[1] != 0:
        summary = build_column_overview(df, sample_size=sample_size, random_state=random_state)
    if summary is None or summary.empty:
        print("(列が存在しません)")
        return

    column_total = len(summary)
    print(f"{column_total} columns")
    for first in range(0, column_total, chunk_size):
        # Clamp the final slice to the number of overview rows.
        last = min(first + chunk_size, column_total)
        print(f"columns {first + 1}-{last} / {column_total}")
        display(summary.iloc[first:last])


def inspect_dataset(label: str, path: Path, head_rows: int = HEAD_ROWS) -> None:
    """Load one Parquet file and print/display a quick preview of it.

    Prints the path, shape and the dtypes of the first 20 columns, displays
    the first ``head_rows`` rows transposed (one line per column), then shows
    the per-column overview via ``display_column_overview``.

    Args:
        label: Name used in the printed headings.
        path: Parquet file to read. A missing file is reported and skipped.
        head_rows: Number of leading rows to display.
    """
    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return
    df = pd.read_parquet(path)
    print(f"\n=== {label} ===")
    try:
        shown_path = path.relative_to(PROJECT_ROOT)
    except ValueError:
        # The file lies outside PROJECT_ROOT; show the path as given instead of
        # aborting the preview over a purely cosmetic line.
        shown_path = path
    print(f"path: {shown_path}")
    print(f"shape: {df.shape[0]} rows x {df.shape[1]} cols")
    dtype_info = df.dtypes.astype(str)
    dtype_preview_limit = 20
    print(f"dtypes (first {dtype_preview_limit} columns):")
    print(dtype_info.head(dtype_preview_limit))
    if len(dtype_info) > dtype_preview_limit:
        print("... (truncated)")
    sample = df.head(head_rows)
    # Transposed so each column becomes one line of the displayed preview.
    display(sample.T)
    display_column_overview(df, label=f"{label} dataset")



In [2]:
# Preview every configured dataset in the order it is listed.
for label, path in DATASETS:
    inspect_dataset(label=label, path=path)




=== train ===
path: data/processed/0002_mansion/0006_same_unit_id/train.parquet
shape: 54372 rows x 75 cols
dtypes (first 20 columns):
data_id                         int64
money_room                      int64
building_structure            float64
floor_count                   float64
year_built                    float64
building_land_chimoku         float64
land_youto                    float64
land_toshi                    float64
land_chisei                   float64
management_form               float64
room_floor                    float64
balcony_area                  float64
dwelling_unit_window_angle    float64
room_count                    float64
unit_area                     float64
unit_house_area_adjusted      Float64
floor_plan_code               float64
flg_investment                float64
post1                         float64
post_all                       string
dtype: object
... (truncated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,36520,36530,36564,36565,36566,36567,36628,36629,36631,36648,36677,36680,36698,36765,36788,36799,36816,36821,36833,36851
money_room,17800000,16700000,9950000,8480000,8200000,9800000,22800000,17800000,16800000,39980000,13200000,11900000,11800000,5600000,26800000,29800000,4900000,14300000,5800000,20800000
building_structure,4.0,4.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0
floor_count,6.0,5.0,14.0,14.0,14.0,14.0,7.0,5.0,5.0,23.0,11.0,14.0,15.0,7.0,11.0,6.0,4.0,11.0,11.0,11.0
year_built,200703.0,200302.0,199506.0,199810.0,199810.0,199604.0,200702.0,200609.0,200609.0,200403.0,198910.0,199203.0,198710.0,199103.0,198911.0,197303.0,198602.0,200702.0,199003.0,198110.0
building_land_chimoku,,,,,,,,,,,,1.0,,,,,,,,
land_youto,3.0,11.0,2.0,2.0,2.0,2.0,11.0,11.0,11.0,11.0,12.0,12.0,11.0,12.0,5.0,3.0,2.0,5.0,,5.0
land_toshi,1.0,1.0,1.0,,,,1.0,,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0
land_chisei,,,,,,,,,,1.0,1.0,1.0,,,,,,,,
management_form,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0



--- train dataset: カラムサマリー ---
75 columns
columns 1-75 / 75


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,54372,54372,100.0,36520,363923,180876,126185,178010,360806,314282
1,money_room,54372,54372,100.0,4900000,188000000,7900000,10800000,17800000,8800000,6900000
2,building_structure,54372,53576,98.54,1.0,11.0,4.0,5.0,4.0,5.0,4.0
3,floor_count,54372,54365,99.99,0.0,58.0,14.0,11.0,10.0,15.0,13.0
4,year_built,54372,53573,98.53,195709.0,202204.0,198910.0,197811.0,198506.0,200502.0,198307.0
5,building_land_chimoku,54372,5968,10.98,0.0,9.0,1.0,9.0,1.0,1.0,1.0
6,land_youto,54372,47594,87.53,1.0,99.0,7.0,12.0,11.0,11.0,3.0
7,land_toshi,54372,36054,66.31,0.0,4.0,1.0,1.0,1.0,2.0,2.0
8,land_chisei,54372,11246,20.68,0.0,9.0,1.0,1.0,1.0,1.0,1.0
9,management_form,54372,52761,97.04,0.0,3.0,3.0,3.0,3.0,3.0,3.0



=== test ===
path: data/processed/0002_mansion/0006_same_unit_id/test.parquet
shape: 13566 rows x 74 cols
dtypes (first 20 columns):
data_id                        string
building_structure            float64
floor_count                   float64
year_built                    float64
building_land_chimoku         float64
land_youto                    float64
land_toshi                    float64
land_chisei                   float64
management_form               float64
room_floor                    float64
balcony_area                  float64
dwelling_unit_window_angle    float64
room_count                    float64
unit_area                     float64
unit_house_area_adjusted      Float64
floor_plan_code               float64
flg_investment                float64
post1                         float64
post_all                       string
addr1_1                         int64
dtype: object
... (truncated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,17,51,52,110,123,125,138,148,154,164,166,169,170,180,212,219,226,231,235
building_structure,5.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,3.0,4.0,4.0,4.0,5.0
floor_count,14.0,6.0,7.0,7.0,24.0,15.0,14.0,10.0,11.0,6.0,15.0,15.0,15.0,15.0,10.0,3.0,11.0,11.0,7.0,11.0
year_built,199510.0,199201.0,199803.0,199803.0,200401.0,200901.0,199103.0,198901.0,199702.0,197404.0,198710.0,198710.0,198710.0,198710.0,198803.0,197803.0,200208.0,200707.0,199511.0,197809.0
building_land_chimoku,,,,,,,,,,,,,,,,,,,,
land_youto,7.0,11.0,3.0,3.0,11.0,3.0,11.0,12.0,12.0,11.0,11.0,11.0,11.0,11.0,5.0,10.0,5.0,5.0,,5.0
land_toshi,1.0,1.0,,,,1.0,1.0,1.0,,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
land_chisei,,,,,0.0,1.0,,1.0,,,,,,,,,,1.0,,
management_form,3.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
room_floor,4.0,2.0,6.0,3.0,1.0,1.0,3.0,2.0,3.0,6.0,14.0,7.0,2.0,2.0,3.0,3.0,4.0,4.0,2.0,7.0



--- test dataset: カラムサマリー ---
74 columns
columns 1-74 / 74


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,13566,13566,100.0,0,99962,43509,36891,42658,23806,12109
1,building_structure,13566,13429,98.99,3.0,12.0,5.0,5.0,4.0,5.0,5.0
2,floor_count,13566,13566,100.0,0.0,58.0,4.0,8.0,11.0,7.0,4.0
3,year_built,13566,13429,98.99,193101.0,202204.0,202011.0,200112.0,198204.0,200610.0,199105.0
4,building_land_chimoku,13566,1599,11.79,1.0,9.0,1.0,1.0,1.0,1.0,1.0
5,land_youto,13566,12010,88.53,1.0,99.0,4.0,11.0,7.0,11.0,11.0
6,land_toshi,13566,9355,68.96,1.0,4.0,1.0,1.0,1.0,1.0,1.0
7,land_chisei,13566,2769,20.41,0.0,9.0,1.0,1.0,1.0,1.0,1.0
8,management_form,13566,13317,98.16,1.0,3.0,3.0,3.0,3.0,3.0,3.0
9,room_floor,13566,13555,99.92,-1.0,49.0,4.0,4.0,7.0,12.0,8.0
