# 03_01_join_koji_price 検証ノート

公示価格GeoJSONから生成した `train.parquet` / `test.parquet` の内容をざっと確認する。`pd.options.display.max_rows` を増やしたうえで、各データセットの shape と dtypes を出力し、先頭20行を転置表示して列ごとのスケールを把握する。あわせて公示価格列・成長率列の要約統計量（`describe`）と `bukken_type_label` の件数も表示する。


In [7]:
from pathlib import Path

import pandas as pd
from IPython.display import display

pd.options.display.max_rows = 400


def find_project_root(start: Path) -> Path:
    """Locate the closest ancestor of ``start`` that looks like the project root.

    ``start`` is resolved first, then the resolved path itself and each of its
    parents are examined from nearest to farthest. A directory qualifies when
    it contains both a ``data`` and a ``src`` sub-directory.

    Args:
        start: File or directory to begin the upward search from.

    Returns:
        The first qualifying directory.

    Raises:
        RuntimeError: If neither ``start`` nor any of its parents qualifies.
    """
    origin = start.resolve()
    root = next(
        (
            folder
            for folder in (origin, *origin.parents)
            if (folder / "data").is_dir() and (folder / "src").is_dir()
        ),
        None,
    )
    if root is None:
        raise RuntimeError(f"Project root not found from {start}")
    return root


# When run as a script, __file__ is defined and anchors the search; in a
# notebook kernel it is not, so the search starts from the working directory.
PROJECT_ROOT = find_project_root(
    Path(__file__) if "__file__" in globals() else Path.cwd()
)

# (label, parquet path) pairs inspected below, in train -> test order.
ARTIFACTS = [
    (split, PROJECT_ROOT / "data" / "interim" / "03_01_join_koji_price" / f"{split}.parquet")
    for split in ("train", "test")
]
# Per-year price columns, listed from 2023 down to 2018.
PRICE_COLUMNS = [f"{year}_koji_price" for year in range(2023, 2017, -1)]
# Columns summarised in their own describe() table.
GROWTH_COLUMNS = ["koji_price_growth_2023_vs_2022"]


In [8]:
# For each artifact: print a header (label, project-relative path, shape),
# the dtypes, the first 20 rows transposed, describe() tables for the price
# and growth columns, and the bukken_type_label frequency table.
for label, artifact_path in ARTIFACTS:
    rel_path = artifact_path.relative_to(PROJECT_ROOT)
    df = pd.read_parquet(artifact_path)

    print(f"=== {label} ===", f"path: {rel_path}", f"shape: {df.shape}", sep="\n")
    print(df.dtypes)

    # Transposed so every column becomes a row and all of them stay visible.
    display(df.head(20).T)

    # One summary table per column group, with statistics laid out as columns.
    for columns in (PRICE_COLUMNS, GROWTH_COLUMNS):
        display(df[columns].describe(percentiles=[0.25, 0.5, 0.75]).T)

    # dropna=False keeps missing labels in the tally.
    label_counts = df["bukken_type_label"].value_counts(dropna=False)
    print(label_counts.to_frame("count"))
    print("\n")


=== train ===
path: data/interim/03_01_join_koji_price/train.parquet
shape: (363924, 23)
data_id                             int64
bukken_type                         Int64
bukken_type_label          string[python]
2023_koji_price                   Float64
2022_koji_price                   Float64
2021_koji_price                   Float64
2020_koji_price                   Float64
2019_koji_price                   Float64
2018_koji_price                   Float64
2023_koji_usage_code       string[python]
2022_koji_usage_code       string[python]
2021_koji_usage_code       string[python]
2020_koji_usage_code       string[python]
2019_koji_usage_code       string[python]
2018_koji_usage_code       string[python]
2023_koji_distance_km             Float64
2022_koji_distance_km             Float64
2021_koji_distance_km             Float64
2020_koji_distance_km             Float64
2019_koji_distance_km             Float64
2018_koji_distance_km             Float64
koji_usage_status          st

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
bukken_type,1202,1202,1202,1202,1302,1202,1202,1202,1202,1202,1202,1202,1302,1202,1202,1202,1302,1302,1202,1202
bukken_type_label,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,mansion,mansion,kodate,kodate
2023_koji_price,41600.0,63700.0,14700.0,71900.0,71900.0,64000.0,62200.0,36100.0,72900.0,72900.0,72900.0,92100.0,90000.0,100000.0,72900.0,72900.0,72900.0,72900.0,26800.0,108000.0
2022_koji_price,41500.0,63400.0,14600.0,71500.0,71500.0,63600.0,61800.0,36700.0,72000.0,72000.0,72000.0,91600.0,85200.0,99400.0,72000.0,72000.0,72000.0,72000.0,26100.0,107000.0
2021_koji_price,41500.0,63200.0,14500.0,71500.0,71500.0,63600.0,61800.0,37600.0,72000.0,72000.0,72000.0,91600.0,83600.0,99400.0,72000.0,72000.0,72000.0,72000.0,25700.0,107000.0
2020_koji_price,41600.0,63200.0,63200.0,72300.0,72300.0,64300.0,62500.0,38600.0,72800.0,72800.0,72800.0,93000.0,84000.0,101000.0,72800.0,72800.0,72800.0,72800.0,26000.0,109000.0
2019_koji_price,41600.0,62900.0,62900.0,72300.0,72300.0,64300.0,63000.0,39300.0,72800.0,72800.0,72800.0,93000.0,83900.0,101000.0,72800.0,72800.0,72800.0,72800.0,26000.0,109000.0
2018_koji_price,41700.0,62700.0,62700.0,73000.0,73000.0,64500.0,63600.0,40100.0,73400.0,73400.0,73400.0,94400.0,83700.0,101000.0,73400.0,73400.0,73400.0,73400.0,26000.0,109000.0
2023_koji_usage_code,1低専,1低専,1低専,1中専,1中専,1中専,1住居,1中専,1中専,1中専,1中専,準住居,準工,近商,1中専,1中専,1中専,1中専,工専,1中専


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_price,350649.0,320695.812436,570044.339354,570.0,84300.0,162000.0,327000.0,22400000.0
2022_koji_price,350655.0,309087.793586,547181.165779,600.0,83200.0,158000.0,314000.0,22100000.0
2021_koji_price,350624.0,305841.001258,571118.257495,630.0,82800.0,157000.0,308000.0,39300000.0
2020_koji_price,350438.0,308811.064768,588631.472061,660.0,83000.0,157000.0,307000.0,42700000.0
2019_koji_price,350447.0,289476.503252,533859.74896,690.0,82500.0,155000.0,297000.0,42000000.0
2018_koji_price,350396.0,273151.336231,481419.757048,720.0,82100.0,153000.0,287000.0,40100000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_distance_km,350649.0,0.373977,0.254174,2.7e-05,0.198707,0.317248,0.477223,1.499908
2022_koji_distance_km,350655.0,0.373827,0.254023,2.7e-05,0.198816,0.317281,0.476661,1.499909
2021_koji_distance_km,350624.0,0.373772,0.254477,2.7e-05,0.198639,0.317185,0.476166,1.499909
2020_koji_distance_km,350438.0,0.373401,0.254154,2.7e-05,0.198658,0.316885,0.475613,1.499909
2019_koji_distance_km,350447.0,0.373223,0.254191,2.8e-05,0.198573,0.316544,0.47539,1.499923
2018_koji_distance_km,350396.0,0.37321,0.254112,2.8e-05,0.198675,0.316642,0.475363,1.499923


Unnamed: 0,count,unique,top,freq
2023_koji_usage_code,342663,12,1低専,82353
2022_koji_usage_code,342679,12,1低専,82209
2021_koji_usage_code,342631,12,1低専,81962
2020_koji_usage_code,342578,12,1低専,82069
2019_koji_usage_code,342606,12,1低専,81907
2018_koji_usage_code,342536,12,1低専,81785


                    count
bukken_type_label        
mansion            198614
kodate             165310


=== test ===
path: data/interim/03_01_join_koji_price/test.parquet
shape: (112437, 23)
data_id                    string[python]
bukken_type                         Int64
bukken_type_label          string[python]
2023_koji_price                   Float64
2022_koji_price                   Float64
2021_koji_price                   Float64
2020_koji_price                   Float64
2019_koji_price                   Float64
2018_koji_price                   Float64
2023_koji_usage_code       string[python]
2022_koji_usage_code       string[python]
2021_koji_usage_code       string[python]
2020_koji_usage_code       string[python]
2019_koji_usage_code       string[python]
2018_koji_usage_code       string[python]
2023_koji_distance_km             Float64
2022_koji_distance_km             Float64
2021_koji_distance_km             Float64
2020_koji_distance_km             Float64
2019_koji

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
bukken_type,1302,1202,1202,1202,1202,1202,1202,1302,1202,1202,1202,1202,1202,1202,1202,1202,1202,1302,1302,1202
bukken_type_label,mansion,kodate,kodate,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,kodate,kodate,kodate,kodate,kodate,kodate,mansion,mansion,kodate
2023_koji_price,69600.0,88000.0,56100.0,71900.0,41600.0,67400.0,36100.0,41700.0,64000.0,100000.0,72900.0,72900.0,55000.0,17600.0,,98000.0,,55000.0,55000.0,23000.0
2022_koji_price,69200.0,86400.0,56300.0,71500.0,41500.0,66900.0,36700.0,42400.0,63600.0,99400.0,72000.0,72000.0,55000.0,18300.0,,72500.0,,55000.0,55000.0,23500.0
2021_koji_price,69000.0,85600.0,56900.0,71500.0,41500.0,66700.0,37600.0,43500.0,63600.0,99400.0,72000.0,72000.0,55100.0,19000.0,,68500.0,,55100.0,55100.0,23700.0
2020_koji_price,69000.0,85600.0,58000.0,72300.0,41600.0,66700.0,38600.0,44700.0,64300.0,101000.0,72800.0,72800.0,55200.0,19600.0,,65800.0,,55200.0,55200.0,23900.0
2019_koji_price,68400.0,84100.0,59400.0,72300.0,41600.0,66300.0,39300.0,45700.0,64300.0,101000.0,72800.0,72800.0,55300.0,20000.0,,61500.0,,55300.0,55300.0,24100.0
2018_koji_price,68000.0,82700.0,60000.0,73000.0,41700.0,66000.0,40100.0,46700.0,64500.0,101000.0,73400.0,73400.0,55400.0,20300.0,,59000.0,,55400.0,55400.0,24200.0
2023_koji_usage_code,1住居,1低専,1低専,1中専,1低専,1低専,1中専,1住居,1中専,近商,1中専,1中専,,近商,,1低専,,,,


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_price,108735.0,311755.872718,548242.969729,810.0,82100.0,159000.0,322000.0,39500000.0
2022_koji_price,108732.0,300591.843064,527450.816656,830.0,80900.0,156000.0,310000.0,39100000.0
2021_koji_price,108729.0,297555.923903,562398.878588,860.0,80500.0,154000.0,305000.0,39900000.0
2020_koji_price,108679.0,300295.513991,582073.03049,895.0,80900.0,155000.0,304000.0,43300000.0
2019_koji_price,108677.0,282257.365404,535703.367163,930.0,80500.0,152000.0,294000.0,42600000.0
2018_koji_price,108650.0,267074.810769,490983.54375,970.0,79800.0,150000.0,284000.0,40600000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_distance_km,108735.0,0.374348,0.254585,4.2e-05,0.198571,0.316534,0.478054,1.499885
2022_koji_distance_km,108732.0,0.37411,0.254157,4.2e-05,0.198883,0.316719,0.477266,1.499883
2021_koji_distance_km,108729.0,0.374069,0.254529,4.2e-05,0.198674,0.316435,0.476869,1.499883
2020_koji_distance_km,108679.0,0.3738,0.254244,4.2e-05,0.198648,0.316172,0.476661,1.499883
2019_koji_distance_km,108677.0,0.373735,0.25434,3.3e-05,0.198637,0.315975,0.476147,1.499884
2018_koji_distance_km,108650.0,0.373738,0.254188,3.3e-05,0.198715,0.316149,0.47638,1.499884


Unnamed: 0,count,unique,top,freq
2023_koji_usage_code,106296,12,1低専,24215
2022_koji_usage_code,106293,12,1低専,24187
2021_koji_usage_code,106287,12,1低専,24134
2020_koji_usage_code,106261,12,1低専,24152
2019_koji_usage_code,106261,12,1低専,24116
2018_koji_usage_code,106224,12,1低専,24098


                   count
bukken_type_label       
mansion            59545
kodate             52892


