# 0001_kodate / 0001_initial プレビュー

`data/processed/0001_kodate/0001_initial` に出力した train/test Parquet を手早く点検するためのノートです。インシデント防止のため、形状・型・先頭サンプルに加えて、列ごとの非欠損率・最小値/最大値・ランダムサンプルを最低限チェックできるようにしてあります。



In [None]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Show every column and up to 400 rows so wide previews are not elided.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 400)

# Location of this notebook relative to the directory it is expected to be
# launched from; used only when ``__file__`` is not defined.
NOTEBOOK_RELATIVE_PATH = Path("notebooks/data/processed/0001_kodate/0001_initial/preview.ipynb")

# ``__file__`` exists when the code runs as a script/module but not in an
# interactive session, where the path is derived from the working directory.
NOTEBOOK_PATH = (
    Path(__file__).resolve()
    if "__file__" in globals()
    else (Path.cwd() / NOTEBOOK_RELATIVE_PATH).resolve()
)


def resolve_project_root(notebook_path: Path) -> Path:
    """Locate the project root by walking up from ``notebook_path``.

    The nearest ancestor directory that contains both a ``data`` and a
    ``src`` entry is treated as the project root.

    Args:
        notebook_path: Path to start from. Only its ancestors are examined,
            not the path itself.

    Returns:
        The nearest matching ancestor. When no ancestor matches, the
        outermost ancestor is returned. When the path has no ancestors at
        all (e.g. ``Path("/")`` or ``Path(".")``), the path itself is
        returned instead of raising ``IndexError``.
    """
    ancestors = list(notebook_path.parents)
    for candidate in ancestors:
        if (candidate / "data").exists() and (candidate / "src").exists():
            return candidate
    # No marker directories found: fall back to the outermost ancestor, or to
    # the path itself when there is nothing above it.
    return ancestors[-1] if ancestors else notebook_path


# Project root discovered from the notebook location.
PROJECT_ROOT = resolve_project_root(NOTEBOOK_PATH)

# (label, parquet path) pairs to inspect, in display order.
DATASETS = [
    (
        split,
        PROJECT_ROOT.joinpath(
            "data", "processed", "0001_kodate", "0001_initial", f"{split}.parquet"
        ),
    )
    for split in ("train", "test")
]

# Number of leading rows shown in the transposed sample table.
HEAD_ROWS = 20
# Number of random non-null values sampled per column in the overview.
COLUMN_OVERVIEW_SAMPLE_SIZE = 5
# Maximum number of overview rows (one per column) displayed per table.
COLUMN_OVERVIEW_CHUNK_SIZE = 200
# Base seed for per-column sampling so repeated runs show the same samples.
COLUMN_OVERVIEW_RANDOM_SEED = 314159


def build_column_overview(
    df: pd.DataFrame,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> pd.DataFrame:
    """Summarise every column of ``df`` as one row of an overview table.

    Args:
        df: Frame to summarise.
        sample_size: Number of ``random_sample_<i>`` columns to produce; each
            holds one randomly drawn non-null value of the described column.
        random_state: Base seed for sampling. Column ``idx`` is sampled with
            ``random_state + idx``; ``None`` leaves sampling unseeded.

    Returns:
        A frame with one row per column of ``df`` and the fields
        ``column_name``, ``row_count``, ``not_null_count``,
        ``not_null_rate_pct``, ``min``, ``max`` followed by the
        ``random_sample_<i>`` columns. Sample slots that cannot be filled
        (fewer non-null values than ``sample_size``) hold ``pd.NA``, as do
        ``min``/``max`` for columns whose values cannot be ordered.
    """
    row_count = len(df)
    sample_columns = [f"random_sample_{i + 1}" for i in range(sample_size)]
    overview_records: list[dict[str, object]] = []
    for idx, column in enumerate(df.columns):
        # Positional access: with duplicated column labels ``df[column]``
        # would return a DataFrame rather than a single Series.
        series = df.iloc[:, idx]
        not_null_count = int(series.notna().sum())
        not_null_rate = (not_null_count / row_count * 100) if row_count else 0.0
        try:
            min_value = series.min(skipna=True)
            max_value = series.max(skipna=True)
        except TypeError:
            # Mixed-type object columns and unordered categoricals have no
            # defined ordering; keep the preview going instead of aborting.
            min_value = max_value = pd.NA
        non_null = series.dropna()
        sample_n = min(sample_size, len(non_null))
        samples_list: list[object] = []
        if sample_n > 0:
            # Offset the seed per column so columns do not all draw the same
            # row positions while the output stays reproducible.
            seed = None if random_state is None else random_state + idx
            samples_list = non_null.sample(n=sample_n, random_state=seed, replace=False).tolist()
        record: dict[str, object] = {
            "column_name": column,
            "row_count": row_count,
            "not_null_count": not_null_count,
            "not_null_rate_pct": round(not_null_rate, 2),
            "min": min_value,
            "max": max_value,
        }
        # Pad with NA so every record exposes the same sample columns.
        padded_samples = samples_list + [pd.NA] * (sample_size - len(samples_list))
        record.update(zip(sample_columns, padded_samples))
        overview_records.append(record)
    return pd.DataFrame(overview_records)


def display_column_overview(
    df: pd.DataFrame | None,
    label: str,
    *,
    sample_size: int = COLUMN_OVERVIEW_SAMPLE_SIZE,
    chunk_size: int = COLUMN_OVERVIEW_CHUNK_SIZE,
    random_state: int | None = COLUMN_OVERVIEW_RANDOM_SEED,
) -> None:
    """Print a header and display the column overview of ``df`` in chunks.

    Args:
        df: Frame to summarise; ``None`` prints a warning and returns.
        label: Text shown in the section header.
        sample_size: Forwarded to ``build_column_overview``.
        chunk_size: Maximum number of overview rows per displayed table.
        random_state: Forwarded to ``build_column_overview``.
    """
    print(f"\n--- {label}: カラムサマリー ---")
    if df is None:
        print("⚠️ DataFrame が None のためサマリーを表示できません。")
        return

    # Only build the overview when there is at least one column to describe.
    has_columns = df.shape[1] != 0
    summary = (
        build_column_overview(df, sample_size=sample_size, random_state=random_state)
        if has_columns
        else None
    )
    if summary is None or summary.empty:
        print("(列が存在しません)")
        return

    column_total = summary.shape[0]
    print(f"{column_total} columns")
    for offset in range(0, column_total, chunk_size):
        stop = min(offset + chunk_size, column_total)
        # Header uses 1-based, inclusive positions.
        print(f"columns {offset + 1}-{stop} / {column_total}")
        display(summary.iloc[offset:stop])


def inspect_dataset(label: str, path: Path, head_rows: int = HEAD_ROWS) -> None:
    """Load one Parquet file and print/display a quick sanity-check report.

    The report consists of the path, the shape, the dtypes of the first 20
    columns, the first ``head_rows`` rows (transposed) and the per-column
    overview.

    Args:
        label: Name shown in the section headers.
        path: Parquet file to read. A missing file prints a warning and the
            function returns without raising.
        head_rows: Number of leading rows to display.
    """
    if not path.exists():
        print(f"⚠️ Missing file: {path}")
        return
    df = pd.read_parquet(path)
    try:
        shown_path = path.relative_to(PROJECT_ROOT)
    except ValueError:
        # The file lives outside the project root; show it as given rather
        # than aborting the report.
        shown_path = path
    print(f"\n=== {label} ===")
    print(f"path: {shown_path}")
    print(f"shape: {df.shape[0]} rows x {df.shape[1]} cols")
    dtype_info = df.dtypes.astype(str)
    print("dtypes (first 20 columns):")
    print(dtype_info.head(20))
    if len(dtype_info) > 20:
        print("... (truncated)")
    sample = df.head(head_rows)
    # Transposed so each column of the dataset becomes one displayed row.
    display(sample.T)
    display_column_overview(df, label=f"{label} dataset")



In [None]:
# Run the inspection report for every configured dataset, in order.
for dataset_label, dataset_path in DATASETS:
    inspect_dataset(dataset_label, dataset_path)




=== train ===
path: data/processed/0001_kodate/0001_initial/train.parquet
shape: 165310 rows x 12 cols
dtypes (first 20 columns):
data_id                   int64
money_room                int64
target_ym                 int64
lon                     float64
lat                     float64
unit_area_max           float64
land_area_all           float64
unit_count              float64
year_built              float64
2023_land_price         Float64
2023_koji_price         Float64
mesh_population_2025    float64
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0.0,1.0,2.0,3.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,13.0,14.0,15.0,18.0,19.0,20.0,21.0,22.0,23.0
money_room,13980000.0,24480000.0,24480000.0,16300000.0,9000000.0,9900000.0,5400000.0,14500000.0,11680000.0,14800000.0,22800000.0,16000000.0,16800000.0,16500000.0,32000000.0,16000000.0,18800000.0,18000000.0,12980000.0,17800000.0
target_ym,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0,201901.0
lon,136.637467,136.639936,136.644708,136.875602,136.905282,136.872669,136.706911,136.913411,136.910391,136.910385,136.879586,136.871552,136.907383,136.907416,136.850106,136.868058,136.865587,136.282109,136.348144,136.376829
lat,35.047688,35.074625,35.072248,35.003174,34.971859,35.000171,35.10079,34.966003,34.965898,34.965281,34.998075,34.989686,34.964279,34.963667,34.979704,34.976485,34.970489,35.311584,35.397429,35.381259
unit_area_max,,,,,,,,,,,,,,,,,,,,
land_area_all,188.490005,,,,105.779999,,,,,,,,141.259995,,,,,,186.059998,509.089996
unit_count,,1.0,1.0,,1.0,,,,,,,,,,,1.0,,,,
year_built,199204.0,198108.0,199506.0,200203.0,196605.0,199010.0,,201605.0,198603.0,,201006.0,200102.0,200706.0,198812.0,200812.0,201201.0,,197604.0,198907.0,200302.0
2023_land_price,40100.0,52700.0,52700.0,135000.0,72400.0,108000.0,40200.0,68500.0,68500.0,68500.0,108000.0,108000.0,68500.0,68500.0,65300.0,85100.0,71400.0,69000.0,25400.0,22000.0



--- train dataset: カラムサマリー ---
12 columns
columns 1-12 / 12


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,165310,165310,100.0,0.0,363922.0,214247.0,287780.0,81813.0,212193.0,309576.0
1,money_room,165310,165310,100.0,4900000.0,188000000.0,16800000.0,18500000.0,8800000.0,23800000.0,6800000.0
2,target_ym,165310,165310,100.0,201901.0,202207.0,202201.0,202101.0,202007.0,202007.0,202001.0
3,lon,165310,165310,100.0,127.6564,144.4441,139.604094,139.66343,135.641388,139.531527,135.832994
4,lat,165310,165310,100.0,26.0901,43.8564,34.658706,35.806187,35.330239,35.562637,35.646147
5,unit_area_max,165310,0,0.0,,,,,,,
6,land_area_all,165310,40907,24.75,0.0,26450.0,132.240005,162.279999,230.839996,245.0,162.0
7,unit_count,165310,40798,24.68,1.0,2000.0,1.0,1.0,1.0,1.0,1.0
8,year_built,165310,154099,93.22,150001.0,203407.0,200610.0,198212.0,196803.0,199601.0,198808.0
9,2023_land_price,165310,163691,99.02,1530.0,7480000.0,45500.0,98200.0,141000.0,45400.0,128000.0



=== test ===
path: data/processed/0001_kodate/0001_initial/test.parquet
shape: 52892 rows x 11 cols
dtypes (first 20 columns):
data_id                  string
target_ym                 int64
lon                     float64
lat                     float64
unit_area_max           float64
land_area_all           float64
unit_count              float64
year_built              float64
2023_land_price         Float64
2023_koji_price         Float64
mesh_population_2025    float64
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,1.0,2.0,3.0,4.0,5.0,6.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,19.0,20.0,21.0,22.0,23.0
target_ym,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0,202301.0
lon,136.673603,136.854324,136.877587,136.627432,136.657815,136.71048,136.899907,136.873301,136.910151,136.912719,136.282109,136.404917,136.347296,130.220809,136.370321,136.352003,136.350385,136.322595,136.291543,130.183089
lat,35.066061,34.937964,35.003429,35.05009,35.071887,35.099212,34.977285,34.989178,34.962984,34.965128,35.311584,35.343238,35.397991,33.563095,35.427892,35.386553,35.387825,35.325219,35.34012,33.549743
unit_area_max,,,,,,,,,,,,,,,,,,,,
land_area_all,197.529999,223.309998,,150.990005,344.440002,,,,,,,,,,,,,276.940002,,202.830002
unit_count,,,1.0,,1.0,1.0,,,,1.0,,,,1.0,,,,,,
year_built,199206.0,197511.0,201603.0,199411.0,200310.0,195701.0,,200405.0,199509.0,198904.0,197604.0,,198811.0,202307.0,,201212.0,199107.0,198101.0,199908.0,197410.0
2023_land_price,95500.0,34800.0,135000.0,52700.0,95500.0,44200.0,72400.0,108000.0,94500.0,83200.0,69000.0,20200.0,25400.0,74700.0,5200.0,18100.0,25400.0,12900.0,58600.0,56000.0
2023_koji_price,88000.0,56100.0,71900.0,41600.0,67400.0,36100.0,64000.0,100000.0,72900.0,72900.0,55000.0,17600.0,23000.0,98000.0,,23000.0,23000.0,39600.0,16000.0,48800.0



--- test dataset: カラムサマリー ---
11 columns
columns 1-11 / 11


Unnamed: 0,column_name,row_count,not_null_count,not_null_rate_pct,min,max,random_sample_1,random_sample_2,random_sample_3,random_sample_4,random_sample_5
0,data_id,52892,52892,100.0,1.0,99999.0,7987.0,93416.0,111517.0,18416.0,51128.0
1,target_ym,52892,52892,100.0,202301.0,202307.0,202307.0,202301.0,202301.0,202301.0,202301.0
2,lon,52892,52892,100.0,127.655876,144.432181,136.232918,138.911572,139.867894,139.171919,130.938142
3,lat,52892,52892,100.0,26.091937,44.116951,34.663658,34.778759,35.492798,34.464803,34.983574
4,unit_area_max,52892,0,0.0,,,,,,,
5,land_area_all,52892,11998,22.68,19.4,33076.9883,177.929993,211.550003,101.110001,58.299999,100.0
6,unit_count,52892,15177,28.69,1.0,2000.0,1.0,1.0,1.0,1.0,1.0
7,year_built,52892,48841,92.34,186801.0,220211.0,200801.0,199106.0,202504.0,201304.0,199401.0
8,2023_land_price,52892,52472,99.21,2350.0,10000000.0,147000.0,62000.0,80000.0,208000.0,145000.0
9,2023_koji_price,52892,51891,98.11,810.0,3450000.0,205000.0,56100.0,47200.0,114000.0,60300.0
