# 0001_kodate / 0005_targetyear_geo_features プレビュー

`data/processed/0001_kodate/0005_targetyear_geo_features` に出力した train/test Parquet を手早く点検するノートです。形状・スキーマ・先頭データを最低限チェックできるようにしています。



In [1]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Widen pandas' display limits so wide/transposed previews are not elided:
# no cap on displayed columns, and up to 400 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 400)

# Location of this notebook relative to the repository root; used only as a
# fallback when `__file__` is not defined.
NOTEBOOK_RELATIVE_PATH = Path("notebooks/data/processed/0001_kodate/0005_targetyear_geo_features/preview.ipynb")
try:
    NOTEBOOK_PATH = Path(__file__).resolve()
except NameError:  # `__file__` is undefined (e.g. in an interactive/notebook session)
    # The fallback anchors the relative path at the current working directory.
    # NOTE(review): this matches the real notebook location only when the
    # working directory is the repository root — confirm how the kernel is launched.
    NOTEBOOK_PATH = (Path.cwd() / NOTEBOOK_RELATIVE_PATH).resolve()


def resolve_project_root(notebook_path: Path) -> Path:
    """Locate the project root by walking up from ``notebook_path``.

    The nearest ancestor directory that contains both a ``data`` and a ``src``
    entry is treated as the project root.

    Args:
        notebook_path: Path whose ancestors are searched, nearest first.

    Returns:
        The first matching ancestor, or the top-most ancestor of
        ``notebook_path`` when no ancestor matches.
    """
    ancestors = notebook_path.parents
    matching = (
        directory
        for directory in ancestors
        if (directory / "data").exists() and (directory / "src").exists()
    )
    root = next(matching, None)
    if root is None:
        # Nothing looked like a project root: fall back to the outermost ancestor.
        return ancestors[-1]
    return root


# Repository root, detected by walking up from the notebook's location.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)
# (label, parquet path) pairs to inspect, in display order.
DATASETS = [
    (
        "train",
        PROJECT_ROOT / "data" / "processed" / "0001_kodate" / "0005_targetyear_geo_features" / "train.parquet",
    ),
    (
        "test",
        PROJECT_ROOT / "data" / "processed" / "0001_kodate" / "0005_targetyear_geo_features" / "test.parquet",
    ),
]
HEAD_ROWS = 20
COLUMN_OVERVIEW_SAMPLE_SIZE = 5
COLUMN_OVERVIEW_CHUNK_SIZE = 200
COLUMN_OVERVIEW_RANDOM_SEED = 314159


def build_column_overview(
    df: pd.DataFrame,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> pd.DataFrame:
    row_count = len(df)
    overview_records: list[dict[str, object]] = []
    sample_columns = [f"random_sample_{i + 1}" for i in range(sample_size)]
    for idx, column in enumerate(df.columns):
        series = df[column]
        not_null_count = int(series.notna().sum())
        not_null_rate = (not_null_count / row_count * 100) if row_count else 0.0
        min_value = series.min(skipna=True)
        max_value = series.max(skipna=True)
        non_null = series.dropna()
        sample_n = min(sample_size, len(non_null))
        samples_list: list[object] = []
        if sample_n > 0:
            seed = None if random_state is None else random_state + idx
            samples_list = non_null.sample(n=sample_n, random_state=seed, replace=False).tolist()
        record: dict[str, object] = {
            "column_name": column,
            "row_count": row_count,
            "not_null_count": not_null_count,
            "not_null_rate_pct": round(not_null_rate, 2),
            "min": min_value,
            "max": max_value,
        }
        for sample_idx, col_name in enumerate(sample_columns):
            record[col_name] = samples_list[sample_idx] if sample_idx < len(samples_list) else pd.NA
        overview_records.append(record)
    return pd.DataFrame(overview_records)


def display_column_overview(
    df: pd.DataFrame | None,
    label: str,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    chunk_size: int = COLUMN_OVERVIEW_CHUNK_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> None:
    """Print a heading and render the per-column overview of ``df`` in chunks.

    Args:
        df: Frame to summarise; ``None`` prints a warning and returns.
        label: Text used in the printed section heading.
        sample_size: Forwarded to ``build_column_overview``.
        chunk_size: Number of overview rows rendered per displayed table.
        random_state: Forwarded to ``build_column_overview``.
    """
    print(f"\n--- {label}: カラムサマリー ---")
    if df is None:
        print("⚠️ DataFrame が None のためサマリーを表示できません。")
        return

    # Only build the summary when there is at least one column to describe.
    summary = None
    if df.shape[1] != 0:
        summary = build_column_overview(
            df=df,
            sample_size=sample_size,
            random_state=random_state,
        )
    if summary is None or summary.empty:
        print("(列が存在しません)")
        return

    n_columns = summary.shape[0]
    print(f"{n_columns} columns")
    # Render the summary in slices of at most `chunk_size` rows.
    for first in range(0, n_columns, chunk_size):
        last = min(first + chunk_size, n_columns)
        print(f"columns {first + 1}-{last} / {n_columns}")
        display(summary.iloc[first:last])


def inspect_dataset(label: str, path: Path, head_rows: int = HEAD_ROWS) -> None:
    """Load one Parquet dataset and print/display a quick inspection of it.

    Shows the path, shape, the dtypes of the leading columns, the first
    ``head_rows`` rows (transposed so columns read top-to-bottom), and the
    per-column overview.

    Args:
        label: Name used in the printed headings.
        path: Parquet file to read. A missing file prints a warning and the
            function returns without raising.
        head_rows: Number of leading rows to display.
    """
    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return
    df = pd.read_parquet(path)

    # Prefer a project-relative path for readability, but do not fail when the
    # dataset lives outside PROJECT_ROOT (relative_to raises ValueError then).
    try:
        shown_path = path.relative_to(PROJECT_ROOT)
    except ValueError:
        shown_path = path

    print(f"\n=== {label} ===")
    print(f"path: {shown_path}")
    print(f"shape: {df.shape[0]} rows x {df.shape[1]} cols")

    dtype_preview_rows = 20
    dtype_info = df.dtypes.astype(str)
    print(f"dtypes (first {dtype_preview_rows} columns):")
    print(dtype_info.head(dtype_preview_rows))
    if len(dtype_info) > dtype_preview_rows:
        print("... (truncated)")

    sample = df.head(head_rows)
    display(sample.T)
    display_column_overview(df, label=f"{label} dataset")



In [2]:
# Inspect every configured dataset in the order listed in DATASETS.
for label, path in DATASETS:
    inspect_dataset(label, path)




=== train ===
path: data/processed/0001_kodate/0005_targetyear_geo_features/train.parquet
shape: 165310 rows x 51 cols
dtypes (first 20 columns):
data_id                         int64
money_room                      int64
building_structure            float64
total_floor_area              float64
floor_count                   float64
year_built                    float64
years_old                     float64
building_land_area            float64
land_area_all                 float64
building_land_chimoku         float64
land_youto                    float64
land_toshi                    float64
land_chisei                   float64
land_kenpei                   float64
land_youseki                  float64
land_road_cond                float64
balcony_area                  float64
dwelling_unit_window_angle    float64
room_count                    float64
unit_area                     float64
dtype: object
... (truncated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,5,6,7,8,9,10,11,13,14,15,18,19,20,21,22,23
money_room,13980000,24480000,24480000,16300000,9000000,9900000,5400000,14500000,11680000,14800000,22800000,16000000,16800000,16500000,32000000,16000000,18800000,18000000,12980000,17800000
building_structure,1.0,10.0,1.0,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0
total_floor_area,106.82,,,106.809998,78.739998,74.519997,,74.519997,70.160004,,78.0,144.899994,112.620003,105.160004,112.629997,,,138.919998,94.400002,139.110001
floor_count,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0
year_built,199204.0,198108.0,199506.0,200203.0,196605.0,199010.0,,201605.0,198603.0,,201006.0,200102.0,200706.0,198812.0,200812.0,201201.0,,197604.0,198907.0,200302.0
years_old,26.75154,37.418207,23.586585,16.837782,52.670773,28.251882,,2.669405,32.837782,,8.5859,17.913758,11.586585,30.083504,10.083504,7.000684,,42.75154,29.50308,15.915127
building_land_area,188.490005,290.519989,235.649994,169.729996,105.779999,97.190002,,99.370003,112.510002,,165.0,170.309998,141.259995,199.679993,176.600006,124.389999,,253.119995,186.059998,509.089996
land_area_all,188.490005,,,,105.779999,,,,,,,,141.259995,,,,,,186.059998,509.089996
building_land_chimoku,1.0,,,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0



--- train dataset: カラムサマリー ---
51 columns
columns 1-51 / 51


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,165310,165310,100.0,0,363922,214247,287780,81813,212193,309576
1,money_room,165310,165310,100.0,4900000,188000000,16800000,18500000,8800000,23800000,6800000
2,building_structure,165310,153323,92.75,0.0,12.0,1.0,1.0,1.0,1.0,1.0
3,total_floor_area,165310,87685,53.04,1.0,9108.0,119.220001,115.290001,101.779999,228.369995,92.339996
4,floor_count,165310,164026,99.22,0.0,980.0,2.0,2.0,2.0,2.0,2.0
5,year_built,165310,154099,93.22,150001.0,203407.0,201207.0,199901.0,198611.0,201908.0,201406.0
6,years_old,165310,154095,93.22,-13.998631,220.492813,13.409993,29.50308,31.835729,25.409993,44.084873
7,building_land_area,165310,153256,92.71,0.0,219025.0,181.990005,221.490005,78.699997,183.919998,187.429993
8,land_area_all,165310,40907,24.75,0.0,26450.0,140.380005,48.860001,105.050003,200.389999,185.039993
9,building_land_chimoku,165310,148627,89.91,0.0,11.0,1.0,1.0,1.0,1.0,1.0



=== test ===
path: data/processed/0001_kodate/0005_targetyear_geo_features/test.parquet
shape: 52892 rows x 50 cols
dtypes (first 20 columns):
data_id                        string
building_structure            float64
total_floor_area              float64
floor_count                   float64
year_built                    float64
years_old                     float64
building_land_area            float64
land_area_all                 float64
building_land_chimoku         float64
land_youto                    float64
land_toshi                    float64
land_chisei                   float64
land_kenpei                   float64
land_youseki                  float64
land_road_cond                float64
balcony_area                  float64
dwelling_unit_window_angle    float64
room_count                    float64
unit_area                     float64
floor_plan_code               float64
dtype: object
... (truncated)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,19,20,21,22,23
building_structure,1.0,10.0,1.0,1.0,1.0,1.0,,10.0,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,9.0
total_floor_area,171.820007,92.129997,,105.980003,146.559998,,,,100.190002,100.589996,138.919998,,,,,171.5,116.75,80.519997,,116.860001
floor_count,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0
year_built,199206.0,197511.0,201603.0,199411.0,200310.0,195701.0,,200405.0,199509.0,198904.0,197604.0,,198811.0,202307.0,,201212.0,199107.0,198101.0,199908.0,197410.0
years_old,30.584531,47.167693,6.836413,28.167009,19.252567,65.998631,,18.669405,27.334702,33.752225,46.75154,,34.16564,-0.495551,,10.083504,31.504449,41.998631,23.419576,48.251882
building_land_area,197.529999,223.309998,100.589996,150.990005,344.440002,636.940002,,177.729996,140.0,110.290001,253.119995,,203.279999,129.740005,,205.130005,206.529999,276.940002,174.470001,202.830002
land_area_all,197.529999,223.309998,,150.990005,344.440002,,,,,,,,,,,,,276.940002,,202.830002
building_land_chimoku,1.0,1.0,5.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
land_youto,1.0,1.0,3.0,1.0,1.0,99.0,,11.0,11.0,11.0,99.0,,99.0,12.0,,99.0,99.0,14.0,12.0,6.0



--- test dataset: カラムサマリー ---
50 columns
columns 1-50 / 50


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,52892,52892,100.0,1,99999,7987,93416,111517,18416,51128
1,building_structure,52892,48648,91.98,1.0,12.0,9.0,1.0,10.0,3.0,9.0
2,total_floor_area,52892,29349,55.49,11.92,984.960022,131.470001,93.959999,117.660004,99.660004,90.720001
3,floor_count,52892,52533,99.32,0.0,62.0,2.0,1.0,2.0,2.0,1.0
4,year_built,52892,48841,92.34,186801.0,220211.0,197112.0,201806.0,197401.0,200007.0,197303.0
5,years_old,52892,48841,92.34,-179.827515,155.493498,27.164956,16.334018,1.494867,20.580424,48.5859
6,building_land_area,52892,48381,91.47,1.0,166074.0,147.910004,159.660004,57.729999,178.720001,110.120003
7,land_area_all,52892,11998,22.68,19.4,33076.9883,108.769997,205.259995,89.07,201.710007,72.120003
8,building_land_chimoku,52892,47208,89.25,1.0,11.0,1.0,1.0,1.0,1.0,1.0
9,land_youto,52892,47081,89.01,1.0,99.0,11.0,1.0,5.0,11.0,12.0
