# 03_01_join_koji_price 検証ノート

公示価格GeoJSONから生成した `train.parquet` / `test.parquet` をざっと確認する。`pd.options.display.max_rows` を増やし、先頭20行を転置表示して列ごとのスケールを把握する。あわせて、公示価格列と前年比成長率列の要約統計量、および `bukken_type_label` ごとの件数も表示する。


In [None]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Raise pandas' display row limit to 400 for the dtype listings and
# transposed previews printed further down in this notebook.
pd.options.display.max_rows = 400


def find_project_root(start: Path) -> Path:
    """Return the nearest ancestor of *start* that looks like the project root.

    The resolved *start* path itself is checked first, then each of its
    parents from the closest outwards. A directory qualifies when it has
    both a ``data`` and a ``src`` subdirectory.

    Args:
        start: Path to begin the upward search from.

    Returns:
        The first qualifying directory found.

    Raises:
        RuntimeError: If no directory on the way up qualifies.
    """
    origin = start.resolve()
    matches = (
        directory
        for directory in (origin, *origin.parents)
        if all((directory / marker).is_dir() for marker in ("data", "src"))
    )
    root = next(matches, None)
    if root is None:
        raise RuntimeError(f"Project root not found from {start}")
    return root


# ``__file__`` is not defined when this code runs as a notebook cell; in that
# case the search for the project root starts from the working directory.
try:
    _search_start = Path(__file__)
except NameError:
    _search_start = Path.cwd()
PROJECT_ROOT = find_project_root(_search_start)

# (label, parquet path) pairs inspected by the cell below.
_INTERIM_DIR = PROJECT_ROOT / "data" / "interim" / "03_01_join_koji_price"
ARTIFACTS = [
    ("train", _INTERIM_DIR / "train.parquet"),
    ("test", _INTERIM_DIR / "test.parquet"),
]

# Price columns, newest year first: "2023_koji_price" ... "2018_koji_price".
PRICE_COLUMNS = [f"{year}_koji_price" for year in range(2023, 2017, -1)]

# Growth column(s) summarised alongside the price columns.
GROWTH_COLUMNS = ["koji_price_growth_2023_vs_2022"]


In [None]:
# For each artifact, print its location, shape and dtypes, then show a
# transposed 20-row preview, summary statistics for the price and growth
# columns, and the row count per ``bukken_type_label``.
for label, artifact_path in ARTIFACTS:
    rel_path = artifact_path.relative_to(PROJECT_ROOT)
    df = pd.read_parquet(artifact_path)

    print(f"=== {label} ===", f"path: {rel_path}", f"shape: {df.shape}", sep="\n")
    print(df.dtypes)

    # One column per sampled row, so every field's values read left to right.
    display(df.head(20).T)

    # Same quartile summary for each column group, one table per group.
    for column_group in (PRICE_COLUMNS, GROWTH_COLUMNS):
        display(df[column_group].describe(percentiles=[0.25, 0.5, 0.75]).T)

    # dropna=False keeps missing labels visible as their own row.
    label_counts = df["bukken_type_label"].value_counts(dropna=False)
    print(label_counts.to_frame("count"))
    print("\n")


=== train ===
path: data/interim/03_01_join_koji_price/train.parquet
shape: (363924, 23)
data_id                             int64
bukken_type                         Int64
bukken_type_label          string[python]
2023_koji_price                   Float64
2022_koji_price                   Float64
2021_koji_price                   Float64
2020_koji_price                   Float64
2019_koji_price                   Float64
2018_koji_price                   Float64
2023_koji_usage_code       string[python]
2022_koji_usage_code       string[python]
2021_koji_usage_code       string[python]
2020_koji_usage_code       string[python]
2019_koji_usage_code       string[python]
2018_koji_usage_code       string[python]
2023_koji_distance_km             Float64
2022_koji_distance_km             Float64
2021_koji_distance_km             Float64
2020_koji_distance_km             Float64
2019_koji_distance_km             Float64
2018_koji_distance_km             Float64
koji_usage_status          st

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
bukken_type,1202,1202,1202,1202,1302,1202,1202,1202,1202,1202,1202,1202,1302,1202,1202,1202,1302,1302,1202,1202
bukken_type_label,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,mansion,mansion,kodate,kodate
2023_koji_price,41600.0,63700.0,14700.0,71900.0,71900.0,64000.0,62200.0,36100.0,72900.0,72900.0,72900.0,92100.0,90000.0,100000.0,72900.0,72900.0,72900.0,72900.0,26800.0,108000.0
2022_koji_price,41500.0,63400.0,14600.0,71500.0,71500.0,63600.0,61800.0,36700.0,72000.0,72000.0,72000.0,91600.0,85200.0,99400.0,72000.0,72000.0,72000.0,72000.0,26100.0,107000.0
2021_koji_price,41500.0,63200.0,14500.0,71500.0,71500.0,63600.0,61800.0,37600.0,72000.0,72000.0,72000.0,91600.0,83600.0,99400.0,72000.0,72000.0,72000.0,72000.0,25700.0,107000.0
2020_koji_price,41600.0,63200.0,63200.0,72300.0,72300.0,64300.0,62500.0,38600.0,72800.0,72800.0,72800.0,93000.0,84000.0,101000.0,72800.0,72800.0,72800.0,72800.0,26000.0,109000.0
2019_koji_price,41600.0,62900.0,62900.0,72300.0,72300.0,64300.0,63000.0,39300.0,72800.0,72800.0,72800.0,93000.0,83900.0,101000.0,72800.0,72800.0,72800.0,72800.0,26000.0,109000.0
2018_koji_price,41700.0,62700.0,62700.0,73000.0,73000.0,64500.0,63600.0,40100.0,73400.0,73400.0,73400.0,94400.0,83700.0,101000.0,73400.0,73400.0,73400.0,73400.0,26000.0,109000.0
2023_koji_usage_code,1低専,1低専,1低専,1中専,1中専,1中専,1住居,1中専,1中専,1中専,1中専,準住居,準工,近商,1中専,1中専,1中専,1中専,工専,1中専


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_price,360162.0,313254.336021,564299.926344,570.0,79800.0,157000.0,318000.0,22400000.0
2022_koji_price,360167.0,301956.233161,541664.155377,600.0,78600.0,153000.0,305000.0,22100000.0
2021_koji_price,360171.0,298758.691441,565148.165746,630.0,78100.0,152000.0,300000.0,39300000.0
2020_koji_price,360160.0,301519.263036,582297.926975,660.0,78500.0,152000.0,300000.0,42700000.0
2019_koji_price,360157.0,282711.684501,528198.087515,690.0,78000.0,150000.0,290000.0,42000000.0
2018_koji_price,360171.0,266782.486763,476390.324285,720.0,77500.0,148000.0,282000.0,40100000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_distance_km,360162.0,0.417959,0.372397,2.7e-05,0.202026,0.324335,0.495494,2.999553
2022_koji_distance_km,360167.0,0.417703,0.371812,2.7e-05,0.202079,0.324255,0.494859,2.999556
2021_koji_distance_km,360171.0,0.417764,0.372301,2.7e-05,0.202002,0.324095,0.494609,2.999556
2020_koji_distance_km,360160.0,0.418129,0.373458,2.7e-05,0.202053,0.323957,0.494508,2.999556
2019_koji_distance_km,360157.0,0.417939,0.373557,2.8e-05,0.20201,0.323752,0.494119,2.999549
2018_koji_distance_km,360171.0,0.418224,0.374169,2.8e-05,0.202066,0.323882,0.494199,2.999549


Unnamed: 0,count,unique,top,freq
2023_koji_usage_code,349387,12,1低専,83938
2022_koji_usage_code,349414,12,1低専,83794
2021_koji_usage_code,349394,12,1低専,83548
2020_koji_usage_code,349405,12,1低専,83666
2019_koji_usage_code,349406,12,1低専,83510
2018_koji_usage_code,349386,12,1低専,83399


                    count
bukken_type_label        
mansion            198614
kodate             165310


=== test ===
path: data/interim/03_01_join_koji_price/test.parquet
shape: (112437, 23)
data_id                    string[python]
bukken_type                         Int64
bukken_type_label          string[python]
2023_koji_price                   Float64
2022_koji_price                   Float64
2021_koji_price                   Float64
2020_koji_price                   Float64
2019_koji_price                   Float64
2018_koji_price                   Float64
2023_koji_usage_code       string[python]
2022_koji_usage_code       string[python]
2021_koji_usage_code       string[python]
2020_koji_usage_code       string[python]
2019_koji_usage_code       string[python]
2018_koji_usage_code       string[python]
2023_koji_distance_km             Float64
2022_koji_distance_km             Float64
2021_koji_distance_km             Float64
2020_koji_distance_km             Float64
2019_koji

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
data_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
bukken_type,1302,1202,1202,1202,1202,1202,1202,1302,1202,1202,1202,1202,1202,1202,1202,1202,1202,1302,1302,1202
bukken_type_label,mansion,kodate,kodate,kodate,kodate,kodate,kodate,mansion,kodate,kodate,kodate,kodate,kodate,kodate,kodate,kodate,kodate,mansion,mansion,kodate
2023_koji_price,69600.0,88000.0,56100.0,71900.0,41600.0,67400.0,36100.0,41700.0,64000.0,100000.0,72900.0,72900.0,55000.0,17600.0,23000.0,98000.0,,55000.0,55000.0,23000.0
2022_koji_price,69200.0,86400.0,56300.0,71500.0,41500.0,66900.0,36700.0,42400.0,63600.0,99400.0,72000.0,72000.0,55000.0,18300.0,23500.0,72500.0,,55000.0,55000.0,23500.0
2021_koji_price,69000.0,85600.0,56900.0,71500.0,41500.0,66700.0,37600.0,43500.0,63600.0,99400.0,72000.0,72000.0,55100.0,19000.0,23700.0,68500.0,,55100.0,55100.0,23700.0
2020_koji_price,69000.0,85600.0,58000.0,72300.0,41600.0,66700.0,38600.0,44700.0,64300.0,101000.0,72800.0,72800.0,55200.0,19600.0,23900.0,65800.0,,55200.0,55200.0,23900.0
2019_koji_price,68400.0,84100.0,59400.0,72300.0,41600.0,66300.0,39300.0,45700.0,64300.0,101000.0,72800.0,72800.0,55300.0,20000.0,24100.0,61500.0,,55300.0,55300.0,24100.0
2018_koji_price,68000.0,82700.0,60000.0,73000.0,41700.0,66000.0,40100.0,46700.0,64500.0,101000.0,73400.0,73400.0,55400.0,20300.0,24200.0,59000.0,,55400.0,55400.0,24200.0
2023_koji_usage_code,1住居,1低専,1低専,1中専,1低専,1低専,1中専,1住居,1中専,近商,1中専,1中専,,近商,,1低専,,,,


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_price,111431.0,305162.806849,543207.095706,810.0,78000.0,155000.0,314500.0,39500000.0
2022_koji_price,111430.0,294269.929103,522591.249478,830.0,76500.0,151000.0,303000.0,39100000.0
2021_koji_price,111429.0,291288.724748,556981.076239,860.0,76200.0,149000.0,298000.0,39900000.0
2020_koji_price,111425.0,293854.64236,576297.355109,895.0,76600.0,149000.0,297000.0,43300000.0
2019_koji_price,111426.0,276253.539659,530414.866814,930.0,76000.0,147000.0,289000.0,42600000.0
2018_koji_price,111426.0,261391.269632,486148.201592,970.0,75300.0,145000.0,279000.0,40600000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2023_koji_distance_km,111431.0,0.414447,0.363566,4.2e-05,0.201588,0.32338,0.494531,2.999917
2022_koji_distance_km,111430.0,0.414173,0.363015,4.2e-05,0.201817,0.323446,0.494001,2.999917
2021_koji_distance_km,111429.0,0.414156,0.36335,4.2e-05,0.201715,0.323263,0.493869,2.999917
2020_koji_distance_km,111425.0,0.414476,0.364181,4.2e-05,0.201715,0.323122,0.494019,2.999917
2019_koji_distance_km,111426.0,0.414489,0.364519,3.3e-05,0.201704,0.322887,0.49339,2.999903
2018_koji_distance_km,111426.0,0.414927,0.365528,3.3e-05,0.201784,0.323274,0.493902,2.999903


Unnamed: 0,count,unique,top,freq
2023_koji_usage_code,108174,12,1低専,24643
2022_koji_usage_code,108171,12,1低専,24614
2021_koji_usage_code,108174,12,1低専,24563
2020_koji_usage_code,108160,12,1低専,24579
2019_koji_usage_code,108160,12,1低専,24547
2018_koji_usage_code,108150,12,1低専,24539


                   count
bukken_type_label       
mansion            59545
kodate             52892


