# 0001_kodate / 0007_same_unit_id プレビュー

`data/processed/0001_kodate/0007_same_unit_id` の train/test Parquet をざっと検品するノートです。列構成とサンプル値を人間がすぐ確認できるようにしてあります。



In [None]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Render up to 400 rows and every column when DataFrames are displayed.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", None)

# Location of this notebook relative to the directory it is launched from.
NOTEBOOK_RELATIVE_PATH = Path("notebooks/data/processed/0001_kodate/0007_same_unit_id/preview.ipynb")
if "__file__" in globals():
    # Executed as a script/module: the file's own location is known.
    NOTEBOOK_PATH = Path(__file__).resolve()
else:
    # Running interactively: derive the location from the working directory.
    NOTEBOOK_PATH = (Path.cwd() / NOTEBOOK_RELATIVE_PATH).resolve()


def resolve_project_root(notebook_path: Path) -> Path:
    for candidate in notebook_path.parents:
        if (candidate / "data").exists() and (candidate / "src").exists():
            return candidate
    return notebook_path.parents[-1]


# Project root inferred from the notebook location.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)
# (label, Parquet path) pairs inspected by this notebook.
DATASETS = [
    (
        "train",
        PROJECT_ROOT.joinpath("data", "processed", "0001_kodate", "0007_same_unit_id", "train.parquet"),
    ),
    (
        "test",
        PROJECT_ROOT.joinpath("data", "processed", "0001_kodate", "0007_same_unit_id", "test.parquet"),
    ),
]
# Default number of leading rows shown for each dataset.
HEAD_ROWS = 20
COLUMN_OVERVIEW_SAMPLE_SIZE = 5
COLUMN_OVERVIEW_CHUNK_SIZE = 200
COLUMN_OVERVIEW_RANDOM_SEED = 314159


def build_column_overview(
    df: pd.DataFrame,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> pd.DataFrame:
    row_count = len(df)
    overview_records: list[dict[str, object]] = []
    sample_columns = [f"random_sample_{i + 1}" for i in range(sample_size)]
    for idx, column in enumerate(df.columns):
        series = df[column]
        not_null_count = int(series.notna().sum())
        not_null_rate = (not_null_count / row_count * 100) if row_count else 0.0
        min_value = series.min(skipna=True)
        max_value = series.max(skipna=True)
        non_null = series.dropna()
        sample_n = min(sample_size, len(non_null))
        samples_list: list[object] = []
        if sample_n > 0:
            seed = None if random_state is None else random_state + idx
            samples_list = non_null.sample(n=sample_n, random_state=seed, replace=False).tolist()
        record: dict[str, object] = {
            "column_name": column,
            "row_count": row_count,
            "not_null_count": not_null_count,
            "not_null_rate_pct": round(not_null_rate, 2),
            "min": min_value,
            "max": max_value,
        }
        for sample_idx, col_name in enumerate(sample_columns):
            record[col_name] = samples_list[sample_idx] if sample_idx < len(samples_list) else pd.NA
        overview_records.append(record)
    return pd.DataFrame(overview_records)


def display_column_overview(
    df: pd.DataFrame | None,
    label: str,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    chunk_size: int = COLUMN_OVERVIEW_CHUNK_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> None:
    """Print a header and display the column overview of *df* in chunks.

    Args:
        df: Frame to summarise; ``None`` only prints a warning.
        label: Text shown in the section header.
        sample_size: Forwarded to ``build_column_overview``.
        chunk_size: Number of overview rows displayed per chunk.
        random_state: Forwarded to ``build_column_overview``.
    """
    print(f"\n--- {label}: カラムサマリー ---")

    if df is None:
        print("⚠️ DataFrame が None のためサマリーを表示できません。")
        return

    # Only build the summary when there is at least one column to describe.
    summary = None
    if df.shape[1] != 0:
        summary = build_column_overview(
            df=df,
            sample_size=sample_size,
            random_state=random_state,
        )
    if summary is None or summary.empty:
        print("(列が存在しません)")
        return

    column_total = summary.shape[0]
    print(f"{column_total} columns")
    for first in range(0, column_total, chunk_size):
        # Clamp the final chunk to the number of available rows.
        last = min(first + chunk_size, column_total)
        print(f"columns {first + 1}-{last} / {column_total}")
        display(summary.iloc[first:last])


def inspect_dataset(label: str, path: Path, head_rows: int = HEAD_ROWS) -> None:
    """Load one Parquet file and print/display a quick inspection report.

    The report contains the file path, the shape, the dtypes of the first 20
    columns, the first ``head_rows`` rows (transposed so columns read as
    rows), and the per-column overview from ``display_column_overview``.

    Args:
        label: Name shown in the section headers.
        path: Location of the Parquet file.
        head_rows: Number of leading rows to display.

    If ``path`` does not exist, a warning is printed and nothing is loaded.
    """
    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return
    df = pd.read_parquet(path)
    # Prefer a project-relative path for readability, but fall back to the
    # path as given: ``relative_to`` raises ValueError for files that live
    # outside PROJECT_ROOT, which would otherwise abort the report.
    try:
        shown_path = path.relative_to(PROJECT_ROOT)
    except ValueError:
        shown_path = path
    print(f"\n=== {label} ===")
    print(f"path: {shown_path}")
    print(f"shape: {df.shape[0]} rows x {df.shape[1]} cols")
    dtype_info = df.dtypes.astype(str)
    print("dtypes (first 20 columns):")
    print(dtype_info.head(20))
    if len(dtype_info) > 20:
        print("... (truncated)")
    sample = df.head(head_rows)
    # Transposed so each column of the dataset becomes one displayed row.
    display(sample.T)
    display_column_overview(df, label=f"{label} dataset")



In [None]:
# Run the inspection report for every configured dataset, in order.
for label, path in DATASETS:
    inspect_dataset(label=label, path=path)




=== train ===
path: data/processed/0001_kodate/0007_same_unit_id/train.parquet
shape: 59205 rows x 78 cols
dtypes (first 20 columns):
data_id                         int64
money_room                      int64
building_structure            float64
total_floor_area              float64
floor_count                   float64
year_built                    float64
years_old                     float64
building_land_area            float64
land_area_all                 float64
building_land_chimoku         float64
land_youto                    float64
land_toshi                    float64
land_chisei                   float64
land_kenpei                   float64
land_youseki                  float64
land_road_cond                float64
balcony_area                  float64
dwelling_unit_window_angle    float64
room_count                    float64
unit_area                     float64
dtype: object
... (truncated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,36516,36521,36526,36531,36537,36540,36542,36544,36548,36549,36553,36555,36557,36586,36589,36591,36595,36596,36598,36599
money_room,23490000,15980000,16000000,32000000,16000000,18000000,11980000,79200000,30800000,19800000,9000000,5000000,10480000,34800000,12800000,7500000,26800000,15800000,14300000,22800000
building_structure,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
total_floor_area,,74.519997,144.899994,112.629997,,138.919998,92.0,,138.0,,,,81.790001,,78.360001,66.919998,,,71.07,
floor_count,2.0,2.0,3.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,
year_built,199506.0,199010.0,200102.0,200812.0,201201.0,197604.0,199607.0,200005.0,200406.0,200508.0,201601.0,197606.0,198410.0,200803.0,198908.0,198611.0,200612.0,200208.0,199001.0,197605.0
years_old,24.082136,28.747433,18.409309,10.579055,7.496235,43.247091,22.997947,19.164956,15.080082,13.913758,3.496235,43.080082,34.746064,11.331964,29.913758,32.66256,12.580424,16.914442,29.494867,43.164956
building_land_area,235.649994,97.190002,170.309998,176.600006,124.389999,253.119995,205.210007,666.690002,449.359985,234.699997,486.0,330.619995,151.949997,150.830002,86.379997,116.089996,120.089996,106.919998,133.220001,366.200012
land_area_all,,,,,,,205.210007,,,,,,151.949997,,,,,,,
building_land_chimoku,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0



--- train dataset: カラムサマリー ---
78 columns
columns 1-78 / 78


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,59205,59205,100.0,36516,363922,310098,163790,41798,112510,54714
1,money_room,59205,59205,100.0,4900000,188000000,29500000,24800000,32800000,16800000,29800000
2,building_structure,59205,55626,93.95,1.0,12.0,1.0,1.0,1.0,3.0,1.0
3,total_floor_area,59205,31574,53.33,1.0,9108.0,109.720001,109.589996,167.160004,97.510002,101.599998
4,floor_count,59205,58785,99.29,0.0,898.0,1.0,1.0,1.0,2.0,1.0
5,year_built,59205,55842,94.32,150001.0,203407.0,199112.0,197403.0,200503.0,198305.0,199705.0
6,years_old,59205,55839,94.31,-13.494867,154.494182,-4.665298,47.419576,26.16564,90.001369,21.831622
7,building_land_area,59205,55618,93.94,10.51,31145.4395,46.509998,167.710007,122.519997,166.470001,100.07
8,land_area_all,59205,14288,24.13,17.700001,18169.6504,453.179993,119.889999,181.740005,264.309998,434.109985
9,building_land_chimoku,59205,54095,91.37,0.0,10.0,1.0,1.0,1.0,1.0,1.0



=== test ===
path: data/processed/0001_kodate/0007_same_unit_id/test.parquet
shape: 14431 rows x 77 cols
dtypes (first 20 columns):
data_id                        string
building_structure            float64
total_floor_area              float64
floor_count                   float64
year_built                    float64
years_old                     float64
building_land_area            float64
land_area_all                 float64
building_land_chimoku         float64
land_youto                    float64
land_toshi                    float64
land_chisei                   float64
land_kenpei                   float64
land_youseki                  float64
land_road_cond                float64
balcony_area                  float64
dwelling_unit_window_angle    float64
room_count                    float64
unit_area                     float64
floor_plan_code               float64
dtype: object
... (truncated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,5,8,11,12,20,22,24,27,38,43,48,54,56,59,66,67,68,69,72,75
building_structure,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
total_floor_area,146.559998,,100.589996,138.919998,116.75,,288.23999,134.899994,99.360001,,121.230003,100.440002,,,80.32,158.910004,158.910004,,,66.5
floor_count,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
year_built,200310.0,,198904.0,197604.0,199107.0,199908.0,198810.0,199308.0,198903.0,200511.0,197503.0,202311.0,200709.0,196506.0,199602.0,199510.0,199510.0,199510.0,200708.0,198209.0
years_old,19.252567,,33.752225,46.75154,31.504449,23.419576,34.250513,29.418207,33.837098,17.166324,47.838467,-0.832307,15.334702,57.585216,26.915811,27.252567,27.252567,27.252567,15.419576,40.334018
building_land_area,344.440002,,110.290001,253.119995,206.529999,174.470001,556.099976,169.0,902.929993,1520.0,227.130005,145.0,280.540009,117.610001,88.410004,368.850006,368.850006,368.850006,125.519997,113.879997
land_area_all,344.440002,,,,,,,169.0,902.929993,,227.130005,145.0,,,,,,,,
building_land_chimoku,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
land_youto,1.0,,11.0,99.0,99.0,12.0,99.0,12.0,99.0,99.0,1.0,1.0,1.0,1.0,99.0,11.0,11.0,1.0,1.0,1.0



--- test dataset: カラムサマリー ---
77 columns
columns 1-77 / 77


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,14431,14431,100.0,100,99993,22655,14565,1651,26284,95695
1,building_structure,14431,13822,95.78,1.0,11.0,1.0,9.0,1.0,10.0,4.0
2,total_floor_area,14431,8129,56.33,12.86,603.140015,97.139999,91.5,107.650002,105.0,95.639999
3,floor_count,14431,14336,99.34,0.0,5.0,2.0,2.0,2.0,2.0,2.0
4,year_built,14431,13879,96.17,186801.0,203407.0,201607.0,200009.0,200212.0,201312.0,198707.0
5,years_old,14431,13879,96.17,-11.000684,155.493498,56.829569,9.913758,28.914442,44.670773,44.167009
6,building_land_area,14431,13814,95.72,18.9,18169.6504,43.470001,206.789993,82.459999,88.919998,253.960007
7,land_area_all,14431,3286,22.77,26.719999,18169.6504,187.0,75.139999,132.399994,135.570007,122.25
8,building_land_chimoku,14431,13522,93.7,1.0,10.0,1.0,1.0,1.0,1.0,1.0
9,land_youto,14431,13425,93.03,1.0,99.0,1.0,12.0,11.0,1.0,99.0
