# 03_01_join_koji_price 検証ノート

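公示価格GeoJSONから生成した `train.parquet` / `test.parquet` の内容をざっと確認する。`pd.options.display.max_rows` を増やしたうえで、各ファイルの shape と dtype を出力し、先頭20行を転置表示して列ごとのスケールを把握する。あわせて、公示価格列(2018〜2023年)と前年比成長率列の要約統計量、および `bukken_type_label` の件数も確認する。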


In [10]:
from pathlib import Path

import pandas as pd
from IPython.display import display

pd.options.display.max_rows = 400


def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that looks like the project root.

    A directory qualifies when it contains both a ``data`` and a ``src``
    sub-directory. The search begins at the resolved *start* path itself and
    then walks outward through its parents, so the closest match wins.

    Args:
        start: Path to begin searching from; it is resolved before the search.

    Returns:
        The first qualifying directory found.

    Raises:
        RuntimeError: If neither *start* nor any of its parents qualifies.
    """
    origin = start.resolve()
    # Lazily scan from the innermost path outward; ``next`` stops at the first hit.
    matches = (
        directory
        for directory in (origin, *origin.parents)
        if (directory / "data").is_dir() and (directory / "src").is_dir()
    )
    root = next(matches, None)
    if root is None:
        raise RuntimeError(f"Project root not found from {start}")
    return root


# Pick where the project-root search starts: this file's location when
# ``__file__`` is defined, otherwise (NameError) the current working directory.
try:
    _search_start = Path(__file__)
except NameError:
    _search_start = Path.cwd()
PROJECT_ROOT = find_project_root(_search_start)

# Directory holding the parquet files inspected below.
_JOIN_OUTPUT_DIR = PROJECT_ROOT / "data" / "interim" / "03_01_join_koji_price"

# (label, path) pairs, in the order they are inspected.
ARTIFACTS = [
    ("train", _JOIN_OUTPUT_DIR / "train.parquet"),
    ("test", _JOIN_OUTPUT_DIR / "test.parquet"),
]

# Per-year price columns, newest first.
PRICE_COLUMNS = [
    "2023_koji_price", "2022_koji_price", "2021_koji_price",
    "2020_koji_price", "2019_koji_price", "2018_koji_price",
]

# Growth column(s) summarized separately from the price columns.
GROWTH_COLUMNS = ["koji_price_growth_2023_vs_2022"]


In [11]:
# For each artifact: print its relative path, shape and dtypes, show the first
# 20 rows transposed, then summary statistics for the price and growth column
# groups, and finally the counts of each ``bukken_type_label`` value.
for label, artifact_path in ARTIFACTS:
    rel_path = artifact_path.relative_to(PROJECT_ROOT)
    df = pd.read_parquet(artifact_path)

    # One print call with a newline separator emits the same four lines in order.
    print(
        f"=== {label} ===",
        f"path: {rel_path}",
        f"shape: {df.shape}",
        df.dtypes,
        sep="\n",
    )

    # Transposed so that each column becomes a row.
    display(df.head(20).T)

    # Same describe() call for both column groups, prices first.
    for column_group in (PRICE_COLUMNS, GROWTH_COLUMNS):
        summary = df[column_group].describe(percentiles=[0.25, 0.5, 0.75])
        display(summary.T)

    # dropna=False keeps missing labels as their own row in the counts.
    type_counts = df["bukken_type_label"].value_counts(dropna=False)
    print(type_counts.to_frame("count"))
    print("\n")


=== train ===
path: data/interim/03_01_join_koji_price/train.parquet
shape: (363924, 14)
data_id                                    int64
bukken_type                                Int64
bukken_type_label                 string[python]
2023_koji_price                          Float64
2022_koji_price                          Float64
2021_koji_price                          Float64
2020_koji_price                          Float64
2019_koji_price                          Float64
2018_koji_price                          Float64
koji_usage_code                   string[python]
koji_price_growth_2023_vs_2022           Float64
koji_distance_km                         Float64
koji_usage_status                 string[python]
koji_building_structure           string[python]
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
bukken_type,1202,1202,1202,1202,1302,1202,1202,1202,1202,1202,1202,1202,1302,1202,1202,1202,1302,1302,1202,1202
bukken_type_label,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,mansion,mansion,kodate,kodate
2023_koji_price,41600.0,63700.0,63700.0,71900.0,71900.0,64000.0,62200.0,36100.0,72900.0,72900.0,72900.0,71900.0,90000.0,94000.0,72900.0,72900.0,72900.0,72900.0,52300.0,72600.0
2022_koji_price,41500.0,63400.0,63400.0,71500.0,71500.0,63600.0,61800.0,36700.0,72000.0,72000.0,72000.0,71500.0,85200.0,92900.0,72000.0,72000.0,72000.0,72000.0,52300.0,71700.0
2021_koji_price,41500.0,63200.0,63200.0,71500.0,71500.0,63600.0,61800.0,37600.0,72000.0,72000.0,72000.0,71500.0,83600.0,92300.0,72000.0,72000.0,72000.0,72000.0,52300.0,71700.0
2020_koji_price,41600.0,63200.0,63200.0,72300.0,72300.0,64300.0,62500.0,38600.0,72800.0,72800.0,72800.0,72300.0,84000.0,93100.0,72800.0,72800.0,72800.0,72800.0,52900.0,72500.0
2019_koji_price,41600.0,62900.0,62900.0,72300.0,72300.0,64300.0,63000.0,39300.0,72800.0,72800.0,72800.0,72300.0,83900.0,92200.0,72800.0,72800.0,72800.0,72800.0,53100.0,72500.0
2018_koji_price,41700.0,62700.0,62700.0,73000.0,73000.0,64500.0,63600.0,40100.0,73400.0,73400.0,73400.0,73000.0,83700.0,92200.0,73400.0,73400.0,73400.0,73400.0,53600.0,73000.0
koji_usage_code,1低専,1低専,1低専,1中専,1中専,1中専,1住居,1中専,1中専,1中専,1中専,1中専,準工,1中専,1中専,1中専,1中専,1中専,準工,1低専


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_price,348666.0,285526.338989,438378.742912,3300.0,83600.0,161000.0,313000.0,20000000.0
2022_koji_price,348666.0,270651.258798,420339.459691,0.0,80000.0,155000.0,296000.0,19700000.0
2021_koji_price,348666.0,262836.198683,414830.978973,0.0,76800.0,150000.0,288000.0,19700000.0
2020_koji_price,348666.0,261470.555546,419218.03159,0.0,75400.0,149000.0,285000.0,19800000.0
2019_koji_price,348666.0,244266.623617,379652.096105,0.0,72200.0,144000.0,273000.0,18300000.0
2018_koji_price,348666.0,227840.578347,347553.593413,0.0,68600.0,138000.0,260000.0,16600000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
koji_price_growth_2023_vs_2022,343813.0,0.023093,0.029289,-0.062992,0.00304,0.017937,0.035313,0.3


                    count
bukken_type_label        
mansion            198614
kodate             165310


=== test ===
path: data/interim/03_01_join_koji_price/test.parquet
shape: (112437, 14)
data_id                           string[python]
bukken_type                                Int64
bukken_type_label                 string[python]
2023_koji_price                          Float64
2022_koji_price                          Float64
2021_koji_price                          Float64
2020_koji_price                          Float64
2019_koji_price                          Float64
2018_koji_price                          Float64
koji_usage_code                   string[python]
koji_price_growth_2023_vs_2022           Float64
koji_distance_km                         Float64
koji_usage_status                 string[python]
koji_building_structure           string[python]
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
bukken_type,1302,1202,1202,1202,1202,1202,1202,1302,1202,1202,1202,1202,1202,1202,1202,1202,1202,1302,1302,1202
bukken_type_label,mansion,kodate,kodate,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,kodate,kodate,kodate,kodate,kodate,kodate,mansion,mansion,kodate
2023_koji_price,69600.0,88000.0,56100.0,71900.0,41600.0,67400.0,36100.0,41700.0,64000.0,94000.0,72900.0,72900.0,55000.0,17600.0,,98000.0,,55000.0,55000.0,23000.0
2022_koji_price,69200.0,86400.0,56300.0,71500.0,41500.0,66900.0,36700.0,42400.0,63600.0,92900.0,72000.0,72000.0,55000.0,18300.0,,0.0,,55000.0,55000.0,23500.0
2021_koji_price,69000.0,85600.0,56900.0,71500.0,41500.0,66700.0,37600.0,43500.0,63600.0,92300.0,72000.0,72000.0,55100.0,19000.0,,0.0,,55100.0,55100.0,23700.0
2020_koji_price,69000.0,85600.0,58000.0,72300.0,41600.0,66700.0,38600.0,44700.0,64300.0,93100.0,72800.0,72800.0,55200.0,19600.0,,0.0,,55200.0,55200.0,23900.0
2019_koji_price,68400.0,84100.0,59400.0,72300.0,41600.0,66300.0,39300.0,45700.0,64300.0,92200.0,72800.0,72800.0,55300.0,20000.0,,0.0,,55300.0,55300.0,24100.0
2018_koji_price,68000.0,82700.0,60000.0,73000.0,41700.0,66000.0,40100.0,46700.0,64500.0,92200.0,73400.0,73400.0,55400.0,20300.0,,0.0,,55400.0,55400.0,24200.0
koji_usage_code,1住居,1低専,1低専,1中専,1低専,1低専,1中専,1住居,1中専,1中専,1中専,1中専,,近商,,1低専,,,,


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_price,108074.0,280121.819124,417160.896127,4300.0,81800.0,159000.0,310000.0,20000000.0
2022_koji_price,108074.0,265474.160668,399290.958968,0.0,77700.0,152000.0,293000.0,19700000.0
2021_koji_price,108074.0,257775.849603,393282.332261,0.0,75000.0,147000.0,286000.0,19700000.0
2020_koji_price,108074.0,256376.898884,396989.805684,0.0,73400.0,146000.0,282000.0,19800000.0
2019_koji_price,108074.0,240059.667635,362942.297565,0.0,70400.0,141000.0,271000.0,18300000.0
2018_koji_price,108074.0,224214.939671,332588.515326,0.0,67000.0,135000.0,258000.0,16600000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
koji_price_growth_2023_vs_2022,106599.0,0.023458,0.030735,-0.062992,0.002268,0.017857,0.035573,0.293617


                   count
bukken_type_label       
mansion            59545
kodate             52892


