<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%EF%BC%91%EF%BC%97%EF%BC%8E%EF%BC%90%E3%81%AE%E3%82%B3%E3%83%BC%E3%83%89base/re_hasegawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# =========================================================
# 1) 読み込み & 定義（後工程で困らない“土台”）
# =========================================================

# --- core ---
import os
from pathlib import Path
import warnings

import numpy as np
import pandas as pd

# --- geo / model ---
import geopandas as gpd
import lightgbm as lgb
from shapely.geometry import Point

# --- sklearn ---
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree

warnings.filterwarnings("ignore")

# =========================================================
# 0. 乱数・基本設定
# =========================================================
SEED = 2025
np.random.seed(SEED)

# =========================================================
# 1. パス定義（Colab / ローカルでも崩れにくい）
# =========================================================
DATA_DIR = Path("/content")  # 必要ならここだけ変える

TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH  = DATA_DIR / "test.csv"

# 外部データ（あなたの現状コードに合わせた名前）
DID_SHP_PATH          = DATA_DIR / "A16-20_00_DID.shp"
STATION_CSV_PATH      = DATA_DIR / "S12-24_NumberOfPassengers_utf8.csv"
STATION_GEOJSON_PATH  = DATA_DIR / "S12-24_NumberOfPassengers.geojson"
LAND_GEOJSON_PATH     = DATA_DIR / "L01-23.geojson"   # 地価/公示地価系を読んでいたやつ

# =========================================================
# 2. 列名（後工程で参照するのでここで一元管理）
# =========================================================
ID_COL          = "building_id"
TARGET_COL      = "money_room"
YM_COL          = "target_ym"

LON_COL         = "lon"
LAT_COL         = "lat"

# building_type で mansion/house を分けている前提
BUILDING_TYPE_COL = "building_type"

# “首都圏”判定に使っていたコード（あなたの現状に合わせる）
CAPITAL_PREF_CODES = [13, 14, 12, 11]  # 東京・神奈川・千葉・埼玉

# 時系列（README情報を反映）
BASE_YEAR = 2019  # trainの最初が2019年なので基準年にする

# =========================================================
# 3. 読み込みユーティリティ（Shift-JIS前提で安定）
# =========================================================
def read_csv_sjis(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    return pd.read_csv(
        path,
        encoding="shift_jis",
        encoding_errors="replace",
        low_memory=False
    )

def ensure_required_columns(df: pd.DataFrame, required: list, name: str):
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"[{name}] missing columns: {missing}")

In [8]:
# =========================================================
# 4. train/test 読み込み
# =========================================================
train = read_csv_sjis(TRAIN_PATH)
test  = read_csv_sjis(TEST_PATH)

# 必須列チェック（ここで落とすと後が楽）
ensure_required_columns(train, [ID_COL, TARGET_COL, YM_COL, LON_COL, LAT_COL], "train")
ensure_required_columns(test,  [ID_COL, YM_COL, LON_COL, LAT_COL], "test")

In [9]:
# =========================================================
# 5. target_ym 由来の基本列（“定義だけ”ここで用意）
#    ※特徴量として使う/使わないは2でコントロールする
# =========================================================
def add_time_columns(df: pd.DataFrame, ym_col: str = YM_COL, base_year: int = BASE_YEAR) -> pd.DataFrame:
    df = df.copy()
    ym = df[ym_col].astype("Int64")  # 欠損に強い
    year = (ym // 100).astype("Int64")
    month = (ym % 100).astype("Int64")

    df["year"] = year
    df["month"] = month

    # “時間の流れ”を連続量で持たせる（READMEの時系列外挿対策の核）
    # 2019-01 -> 0, 2019-07 -> 6, ... 2023-07 -> 54 みたいな値になる
    df["elapsed_months"] = (df["year"] - base_year) * 12 + (df["month"] - 1)
    return df

train = add_time_columns(train)
test  = add_time_columns(test)

# =========================================================
# 6. 以降で使う“外部データ”読み込み（ここで変数だけ確保）
#    ※2で特徴量化するので、ここでは読み込みと最低限の整形まで
# =========================================================

# --- DID shapefile ---
did_gdf = gpd.read_file(DID_SHP_PATH)

# DID の人口・面積列名（あなたの既存コードに合わせて定義）
DID_POP_COL  = "A16_007"  # ※あなたのノートブックで参照していた列名に合わせる
DID_AREA_COL = "A16_008"  # ※同上

# 必須列が無いと後工程が死ぬので早期チェック
ensure_required_columns(did_gdf, [DID_POP_COL, DID_AREA_COL, "geometry"], "did_gdf")

# --- 駅データ（CSV & GeoJSON） ---
station_df = pd.read_csv(STATION_CSV_PATH)
station_point_gdf = gpd.read_file(STATION_GEOJSON_PATH)
station_point_gdf = station_point_gdf.set_crs("EPSG:4326", allow_override=True)





ここから






# --- 地価/土地系（GeoJSON） ---
land_gdf = gpd.read_file(LAND_GEOJSON_PATH).to_crs(epsg=4326)

# =========================================================
# 7. ここまでの sanity check（軽く）
# =========================================================
print("train shape:", train.shape, " test shape:", test.shape)
print("train ym range:", int(train[YM_COL].min()), "->", int(train[YM_COL].max()))
print("test  ym range:", int(test[YM_COL].min()),  "->", int(test[YM_COL].max()))
print("=== TRAIN COLUMNS ===")
display(pd.DataFrame(train.columns, columns=["train_columns"]))


train shape: (363924, 152)  test shape: (112437, 152)
train ym range: 201901 -> 202207
test  ym range: 202301 -> 202307
=== TRAIN COLUMNS ===


Unnamed: 0,train_columns
0,target_ym
1,money_room
2,building_id
3,building_status
4,building_create_date
...,...
147,free_rent_duration
148,free_rent_gen_timing
149,year
150,month


In [10]:
# =========================================================
# 2-1. 緯度経度 → Point
# =========================================================
def add_point_geometry(df: pd.DataFrame, lon_col=LON_COL, lat_col=LAT_COL):
    df = df.copy()
    df["geometry"] = [Point(xy) for xy in zip(df[lon_col], df[lat_col])]
    return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

train_gdf = add_point_geometry(train)
test_gdf  = add_point_geometry(test)


In [11]:
# =========================================================
# 2-2. DID join
# =========================================================
def add_did_features(df_gdf, did_gdf):
    df = df_gdf.copy()
    joined = gpd.sjoin(df, did_gdf, how="left", predicate="within")

    df["DID_population"] = joined[DID_POP_COL].fillna(0)
    df["DID_area"]       = joined[DID_AREA_COL].fillna(0)

    # 密度（ゼロ割防止）
    df["DID_density"] = df["DID_population"] / (df["DID_area"] + 1e-6)
    return df

train_gdf = add_did_features(train_gdf, did_gdf)
test_gdf  = add_did_features(test_gdf, did_gdf)


In [13]:
print(station_point_gdf.geometry.geom_type.value_counts())


MultiLineString    10531
Name: count, dtype: int64


In [12]:
# =========================================================
# 2-3. 駅距離（最短距離）
# =========================================================
def add_station_distance(df_gdf, station_gdf):
    df = df_gdf.copy()

    station_coords = np.vstack([
        station_gdf.geometry.x.values,
        station_gdf.geometry.y.values
    ]).T

    tree = KDTree(station_coords, leaf_size=40)

    point_coords = np.vstack([df.geometry.x, df.geometry.y]).T
    dist, idx = tree.query(point_coords, k=1)

    df["station_distance"] = dist[:, 0]
    return df

train_gdf = add_station_distance(train_gdf, station_point_gdf)
test_gdf  = add_station_distance(test_gdf, station_point_gdf)


ValueError: x attribute access only provided for Point geometries

In [None]:
# =========================================================
# 2-4. 周辺施設距離（汎用関数）
# =========================================================
def add_nearest_distance(df_gdf, facility_gdf, prefix):
    df = df_gdf.copy()

    coords = np.vstack([facility_gdf.geometry.x, facility_gdf.geometry.y]).T
    tree = KDTree(coords, leaf_size=40)

    pcoords = np.vstack([df.geometry.x, df.geometry.y]).T
    dist, _ = tree.query(pcoords, k=1)

    df[f"{prefix}_distance"] = dist[:, 0]
    return df

# 例（すでに読み込んでいる前提）
# train_gdf = add_nearest_distance(train_gdf, drugstore_gdf, "drugstore")
# test_gdf  = add_nearest_distance(test_gdf, drugstore_gdf, "drugstore")


In [None]:
# =========================================================
# 2-5. 建物年数・修繕年数
# =========================================================
def add_building_age_features(df):
    df = df.copy()

    # 築年
    df["building_create_year"] = pd.to_numeric(
        df["building_create_date"].astype(str).str[:4],
        errors="coerce"
    )

    # 経過年数（elapsed_months と相互作用する）
    df["building_age"] = df["year"] - df["building_create_year"]
    df["building_age"] = df["building_age"].clip(lower=0).fillna(0)

    return df

train_gdf = add_building_age_features(train_gdf)
test_gdf  = add_building_age_features(test_gdf)


In [None]:
# =========================================================
# 2-6. スラッシュ区切り条件
# =========================================================
SLASH_COLS = [
    "statuses",
    "parking_keiyaku",
    "free_rent_gen_timing",
    "money_hoshou_company"
]

def add_slashed_features(train_df, test_df, cols):
    combined = pd.concat([train_df, test_df], axis=0)

    new_features = []
    for col in cols:
        if col not in combined.columns:
            continue

        oh = combined[col].fillna("").str.get_dummies(sep="/")
        oh.columns = [f"{col}_{c}" for c in oh.columns]
        combined = pd.concat([combined, oh], axis=1)

        combined[f"{col}_count"] = (combined[col].fillna("") != "").astype(int) + combined[col].fillna("").str.count("/")

        new_features.extend(list(oh.columns) + [f"{col}_count"])

    train_out = combined.iloc[:len(train_df)].copy()
    test_out  = combined.iloc[len(train_df):].copy()

    return train_out, test_out, new_features

train_gdf, test_gdf, SLASH_FEATURES = add_slashed_features(train_gdf, test_gdf, SLASH_COLS)


In [None]:
# =========================================================
# 2-7. 数値ガード（最終安全装置）
# =========================================================
def final_numeric_guard(df):
    num_cols = df.select_dtypes(include=[np.number]).columns
    df[num_cols] = (
        df[num_cols]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    return df

train_gdf = final_numeric_guard(train_gdf)
test_gdf  = final_numeric_guard(test_gdf)


In [None]:
# =========================================================
# 2-8. 最終特徴量リスト
# =========================================================
DROP_COLS = [
    ID_COL,
    TARGET_COL,
    "geometry"
]

FEATURE_COLS = [c for c in train_gdf.columns if c not in DROP_COLS]

print("num features:", len(FEATURE_COLS))
print("=== FEATURE_COLS CHECK ===")
print(len(FEATURE_COLS))
print([c for c in FEATURE_COLS if c not in train_gdf.columns])


In [None]:
# =========================================================
# 3-1. 学習データ定義
# =========================================================
X_all = train_gdf[FEATURE_COLS]
y_all = train_gdf[TARGET_COL]

X_test = test_gdf[FEATURE_COLS]

# 建物タイプで分割
is_mansion_train = train_gdf[BUILDING_TYPE_COL] == "mansion"
is_house_train   = train_gdf[BUILDING_TYPE_COL] == "house"

is_mansion_test = test_gdf[BUILDING_TYPE_COL] == "mansion"
is_house_test   = test_gdf[BUILDING_TYPE_COL] == "house"


In [None]:
# =========================================================
# 3-2. 時系列CV用インデックス
# =========================================================
def time_based_split(df, valid_year=2022):
    train_idx = df["year"] < valid_year
    valid_idx = df["year"] == valid_year
    return train_idx, valid_idx

train_idx, valid_idx = time_based_split(train_gdf)


In [None]:
# =========================================================
# 3-3. LightGBM 学習関数
# =========================================================
def train_lgb_regressor(X_train, y_train, X_valid, y_valid, seed=SEED):
    params = {
        "objective": "regression",
        "metric": "mae",          # MAPEは評価のみで使用
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 64,
        "max_depth": -1,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "seed": seed,
        "verbosity": -1
    }

    lgb_train = lgb.Dataset(X_train, np.log1p(y_train))
    lgb_valid = lgb.Dataset(X_valid, np.log1p(y_valid))

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        num_boost_round=5000,
        early_stopping_rounds=200,
        verbose_eval=200
    )
    return model


In [None]:
# =========================================================
# 3-4. mansion モデル
# =========================================================
model_mansion = train_lgb_regressor(
    X_all.loc[is_mansion_train & train_idx],
    y_all.loc[is_mansion_train & train_idx],
    X_all.loc[is_mansion_train & valid_idx],
    y_all.loc[is_mansion_train & valid_idx]
)

# =========================================================
# 3-4. house モデル
# =========================================================
model_house = train_lgb_regressor(
    X_all.loc[is_house_train & train_idx],
    y_all.loc[is_house_train & train_idx],
    X_all.loc[is_house_train & valid_idx],
    y_all.loc[is_house_train & valid_idx]
)


In [None]:
# =========================================================
# 3-5. 検証用予測
# =========================================================
valid_pred = np.zeros(len(train_gdf))

valid_pred[is_mansion_train & valid_idx] = np.expm1(
    model_mansion.predict(X_all.loc[is_mansion_train & valid_idx])
)

valid_pred[is_house_train & valid_idx] = np.expm1(
    model_house.predict(X_all.loc[is_house_train & valid_idx])
)

train_gdf["valid_pred"] = valid_pred


In [None]:
# =========================================================
# 3-6. test 予測（④提出用）
# =========================================================
test_pred = np.zeros(len(test_gdf))

test_pred[is_mansion_test] = np.expm1(
    model_mansion.predict(X_test.loc[is_mansion_test])
)

test_pred[is_house_test] = np.expm1(
    model_house.predict(X_test.loc[is_house_test])
)

test_gdf["pred_raw"] = test_pred


In [None]:
# =========================================================
# 3-7. ④用にまとめておく変数
# =========================================================
MODELS = {
    "mansion": model_mansion,
    "house": model_house
}

PRED_COL_RAW = "pred_raw"      # 後処理前
VALID_PRED_COL = "valid_pred" # CV評価用


In [None]:
# =========================================================
# 4) 検証・可視化・後処理・提出
#   - ①②③の整合性チェック
#   - valid(=2022年)で多角的な検証
#   - 次の改善が見える診断（年/月/価格帯/タイプ別の誤差）
#   - 後処理（低価格補正など）
#   - submit.csv 出力
# =========================================================

import matplotlib.pyplot as plt

# =========================================================
# 4-0. 整合性チェック（重大：ここで破綻を早期発見）
# =========================================================
def assert_integrity(train_gdf, test_gdf):
    # 必須列
    must_train = [ID_COL, TARGET_COL, YM_COL, "year", "month", "elapsed_months", BUILDING_TYPE_COL]
    must_test  = [ID_COL, YM_COL, "year", "month", "elapsed_months", BUILDING_TYPE_COL]

    for c in must_train:
        if c not in train_gdf.columns:
            raise KeyError(f"train_gdf missing: {c}")
    for c in must_test:
        if c not in test_gdf.columns:
            raise KeyError(f"test_gdf missing: {c}")

    # 特徴量列の存在
    missing_train_feats = [c for c in FEATURE_COLS if c not in train_gdf.columns]
    missing_test_feats  = [c for c in FEATURE_COLS if c not in test_gdf.columns]
    if missing_train_feats:
        raise KeyError(f"train_gdf missing FEATURE_COLS: {missing_train_feats[:20]} ... total={len(missing_train_feats)}")
    if missing_test_feats:
        raise KeyError(f"test_gdf missing FEATURE_COLS: {missing_test_feats[:20]} ... total={len(missing_test_feats)}")

    # ③で定義した予測列
    if VALID_PRED_COL not in train_gdf.columns:
        raise KeyError(f"train_gdf missing validation pred col: {VALID_PRED_COL}")
    if PRED_COL_RAW not in test_gdf.columns:
        raise KeyError(f"test_gdf missing pred col: {PRED_COL_RAW}")

    # NaN/inf チェック（特徴量＋予測）
    def _check_bad(df, cols, name):
        x = df[cols]
        bad_inf = np.isinf(x.select_dtypes(include=[np.number]).to_numpy()).sum()
        bad_nan = x.isna().sum().sum()
        if bad_inf > 0 or bad_nan > 0:
            raise ValueError(f"[{name}] has NaN/inf in selected columns. nan={bad_nan}, inf={bad_inf}")

    _check_bad(train_gdf, FEATURE_COLS + [VALID_PRED_COL, TARGET_COL], "train_gdf")
    _check_bad(test_gdf,  FEATURE_COLS + [PRED_COL_RAW], "test_gdf")

    # 型の簡易チェック
    if not np.issubdtype(train_gdf[TARGET_COL].dtype, np.number):
        raise TypeError(f"{TARGET_COL} must be numeric, got {train_gdf[TARGET_COL].dtype}")

    print("[OK] integrity check passed.")

assert_integrity(train_gdf, test_gdf)

In [None]:
# =========================================================
# 4-1. 指標（MAPE + 補助指標）
#   ※ MAPEはゼロ割を避けるためEPSを入れる
# =========================================================
EPS = 1e-6

def mape(y_true, y_pred, eps=EPS):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denom)

def mae(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [None]:
# =========================================================
# 4-2. 検証対象（③の設計通り：valid_year=2022）
# =========================================================
VALID_YEAR = 2022
valid_mask = (train_gdf["year"] == VALID_YEAR)

y_valid_true = train_gdf.loc[valid_mask, TARGET_COL].values
y_valid_pred = train_gdf.loc[valid_mask, VALID_PRED_COL].values

print("---- VALID METRICS (year=2022) ----")
print("MAPE:", mape(y_valid_true, y_valid_pred))
print("MAE :", mae(y_valid_true, y_valid_pred))
print("RMSE:", rmse(y_valid_true, y_valid_pred))

In [None]:
# =========================================================
# 4-3. 予測の基本チェック（分布・散布図）
# =========================================================
def plot_pred_vs_true(y_true, y_pred, title):
    plt.figure()
    plt.scatter(y_true, y_pred, s=6, alpha=0.4)
    mn = min(y_true.min(), y_pred.min())
    mx = max(y_true.max(), y_pred.max())
    plt.plot([mn, mx], [mn, mx])
    plt.xlabel("true")
    plt.ylabel("pred")
    plt.title(title)
    plt.show()

def plot_residual_hist(y_true, y_pred, title):
    plt.figure()
    resid = y_pred - y_true
    plt.hist(resid, bins=60)
    plt.xlabel("pred - true")
    plt.ylabel("count")
    plt.title(title)
    plt.show()

def plot_ape_hist(y_true, y_pred, title):
    plt.figure()
    ape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), EPS)
    plt.hist(ape, bins=60)
    plt.xlabel("APE")
    plt.ylabel("count")
    plt.title(title)
    plt.show()

plot_pred_vs_true(y_valid_true, y_valid_pred, f"VALID {VALID_YEAR}: pred vs true")
plot_residual_hist(y_valid_true, y_valid_pred, f"VALID {VALID_YEAR}: residual histogram")
plot_ape_hist(y_valid_true, y_valid_pred, f"VALID {VALID_YEAR}: APE histogram")

In [None]:
# =========================================================
# 4-4. “次の改善が見える”分解検証
#   - 月別
#   - 価格帯別（MAPEで重要）
#   - 建物タイプ別
# =========================================================
valid_df = train_gdf.loc[valid_mask, [TARGET_COL, VALID_PRED_COL, "month", BUILDING_TYPE_COL]].copy()
valid_df["ape"] = np.abs(valid_df[VALID_PRED_COL] - valid_df[TARGET_COL]) / np.maximum(np.abs(valid_df[TARGET_COL]), EPS)

# 月別
month_summary = valid_df.groupby("month").agg(
    n=(TARGET_COL, "size"),
    mape=("ape", "mean"),
    mae_abs=(TARGET_COL, lambda s: np.nan)  # placeholder
)
# MAEを別途
month_summary["mae"] = valid_df.groupby("month").apply(lambda g: mae(g[TARGET_COL].values, g[VALID_PRED_COL].values))
month_summary = month_summary.drop(columns=["mae_abs"])

print("\n---- VALID: month summary ----")
display(month_summary)

plt.figure()
plt.plot(month_summary.index.values, month_summary["mape"].values, marker="o")
plt.xlabel("month")
plt.ylabel("MAPE")
plt.title(f"VALID {VALID_YEAR}: MAPE by month")
plt.show()

# タイプ別
type_summary = valid_df.groupby(BUILDING_TYPE_COL).agg(
    n=(TARGET_COL, "size"),
    mape=("ape", "mean")
)
type_summary["mae"] = valid_df.groupby(BUILDING_TYPE_COL).apply(lambda g: mae(g[TARGET_COL].values, g[VALID_PRED_COL].values))

print("\n---- VALID: building_type summary ----")
display(type_summary)

# 価格帯別（quantile）
valid_df["price_bin"] = pd.qcut(valid_df[TARGET_COL], q=10, duplicates="drop")
bin_summary = valid_df.groupby("price_bin").agg(
    n=(TARGET_COL, "size"),
    true_mean=(TARGET_COL, "mean"),
    pred_mean=(VALID_PRED_COL, "mean"),
    mape=("ape", "mean")
)
print("\n---- VALID: price quantile summary ----")
display(bin_summary)

plt.figure()
plt.plot(np.arange(len(bin_summary)), bin_summary["mape"].values, marker="o")
plt.xlabel("price quantile bin (low -> high)")
plt.ylabel("MAPE")
plt.title(f"VALID {VALID_YEAR}: MAPE by price quantile")
plt.show()

In [None]:
# =========================================================
# 4-5. エラートップの確認（改善ヒント用）
# =========================================================
valid_df2 = train_gdf.loc[valid_mask, [ID_COL, TARGET_COL, VALID_PRED_COL, YM_COL, BUILDING_TYPE_COL]].copy()
valid_df2["ape"] = np.abs(valid_df2[VALID_PRED_COL] - valid_df2[TARGET_COL]) / np.maximum(np.abs(valid_df2[TARGET_COL]), EPS)

print("\n---- Worst 30 APE in VALID ----")
display(valid_df2.sort_values("ape", ascending=False).head(30))

In [None]:
# =========================================================
# 4-6. Feature importance（次の改善の方向を作る）
#   - mansion / house で上位を見る
# =========================================================
def plot_feature_importance(model, feature_cols, topn=40, title="feature importance"):
    imp = pd.DataFrame({
        "feature": feature_cols,
        "importance": model.feature_importance(importance_type="gain")
    }).sort_values("importance", ascending=False).head(topn)

    plt.figure(figsize=(8, max(6, topn * 0.2)))
    plt.barh(imp["feature"][::-1], imp["importance"][::-1])
    plt.xlabel("gain importance")
    plt.title(title)
    plt.show()
    return imp

imp_mansion = plot_feature_importance(MODELS["mansion"], FEATURE_COLS, topn=40, title="Mansion: top gain importance")
imp_house   = plot_feature_importance(MODELS["house"], FEATURE_COLS, topn=40, title="House: top gain importance")

print("\n---- Top features (mansion) ----")
display(imp_mansion.head(20))
print("\n---- Top features (house) ----")
display(imp_house.head(20))

In [None]:
# =========================================================
# 4-7. 後処理（低価格帯補正：あなたの強みを“④で管理”）
#   - まずは “raw” を保持
#   - 補正後を pred_final に格納
#   - VALIDにも同じ補正を適用して、改善有無を確認する
# =========================================================
LOW_TH_MANSION = 9_000_000
LOW_TH_HOUSE   = 9_000_000
LOW_SCALE_MANSION = 0.83
LOW_SCALE_HOUSE   = 0.83

def apply_low_scale(pred, building_type_series, low_th_m, low_th_h, low_scale_m, low_scale_h):
    pred = pred.copy().astype(float)
    is_m = (building_type_series == "mansion").values
    is_h = (building_type_series == "house").values

    # mansion
    m_pred = pred[is_m]
    m_mask = m_pred <= low_th_m
    m_pred[m_mask] *= low_scale_m
    pred[is_m] = m_pred

    # house
    h_pred = pred[is_h]
    h_mask = h_pred <= low_th_h
    h_pred[h_mask] *= low_scale_h
    pred[is_h] = h_pred

    return pred

# VALID後処理評価
valid_pred_post = apply_low_scale(
    y_valid_pred,
    train_gdf.loc[valid_mask, BUILDING_TYPE_COL],
    LOW_TH_MANSION, LOW_TH_HOUSE,
    LOW_SCALE_MANSION, LOW_SCALE_HOUSE
)

print("\n---- VALID METRICS (post-processed) ----")
print("MAPE:", mape(y_valid_true, valid_pred_post))
print("MAE :", mae(y_valid_true, valid_pred_post))
print("RMSE:", rmse(y_valid_true, valid_pred_post))

plot_pred_vs_true(y_valid_true, valid_pred_post, f"VALID {VALID_YEAR}: pred vs true (post-processed)")
plot_ape_hist(y_valid_true, valid_pred_post, f"VALID {VALID_YEAR}: APE histogram (post-processed)")

# TEST後処理
test_pred_post = apply_low_scale(
    test_gdf[PRED_COL_RAW].values,
    test_gdf[BUILDING_TYPE_COL],
    LOW_TH_MANSION, LOW_TH_HOUSE,
    LOW_SCALE_MANSION, LOW_SCALE_HOUSE
)

test_gdf["pred_final"] = test_pred_post

In [None]:
# =========================================================
# 4-8. 提出ファイル作成
#   - id と money_room（=pred_final）で出す
# =========================================================
SUBMIT_PATH = DATA_DIR / "submit.csv"

submit = test_gdf[[ID_COL]].copy()
submit[TARGET_COL] = test_gdf["pred_final"].values

# 念のため負値クリップ（不動産価格）
submit[TARGET_COL] = submit[TARGET_COL].clip(lower=0)

submit.to_csv(SUBMIT_PATH, index=False, encoding="utf-8")
print(f"\n[OK] saved: {SUBMIT_PATH}")
display(submit.head())

In [None]:
# =========================================================
# 4-9. “次の改善が見通せる”診断メモ（自動で目を向ける）
# =========================================================
print("\n==== Next Improvement Hints ====")

# 1) 月別にMAPEが跳ねている月
worst_month = month_summary["mape"].idxmax()
print(f"- Worst month in valid: month={worst_month}, mape={month_summary.loc[worst_month, 'mape']:.4f}")

# 2) 低価格帯（MAPEで重要）のbin
worst_bin = bin_summary["mape"].idxmax()
print(f"- Worst price bin in valid: {worst_bin}, mape={bin_summary.loc[worst_bin, 'mape']:.4f}")

# 3) タイプ別の弱点
worst_type = type_summary["mape"].idxmax()
print(f"- Worse building_type: {worst_type}, mape={type_summary.loc[worst_type, 'mape']:.4f}")

print("\n[Action ideas]")
print("- 低価格帯binのMAPEが高い → LOW_TH / LOW_SCALE をCVで微調整、または低価格帯専用の補正（piecewise）を検討")
print("- monthで偏り → month/elapsed_monthsと相互作用する特徴（条件系one-hot、築年、free_rent系）を強化/再点検")
print("- 特徴量重要度で elapsed_months が弱い → 時間水準特徴（prefecture×year平均など）を②へ追加する余地が大きい")