<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%EF%BC%91%EF%BC%97%EF%BC%8E%EF%BC%90%E3%81%AE%E3%82%B3%E3%83%BC%E3%83%89base/re_hasegawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
# =========================================================
# 1) 読み込み & 定義（完成度重視）
#  - 列名ゆらぎに強い
#  - 2/3/4で使う定数・関数をここで完備
# =========================================================

import os
from pathlib import Path
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# =========================================================
# 乱数・基本設定
# =========================================================
SEED = 2025
np.random.seed(SEED)

# =========================================================
# パス定義
# =========================================================
DATA_DIR = Path("/content")  # Colab想定
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH  = DATA_DIR / "test.csv"

# -------------------------
# 重要列（確定しているもの）
# -------------------------
ID_COL     = "building_id"
TARGET_COL = "money_room"
YM_COL     = "target_ym"
BUILDING_TYPE_COL = "building_type"

# -------------------------
# 時系列設定（README準拠）
# -------------------------
BASE_YEAR = 2019  # elapsed_months の基準

# -------------------------
# distance列のNaN対策（Step2で使う設定）
# -------------------------
DIST_SUFFIX = "_distance"
# 大きな定数埋めの方針：trainの分位点で決める（例：0.99）
DIST_FILL_QUANTILE = 0.99

# =========================================================
# 便利関数：読み込み
# =========================================================
def read_csv_sjis(path: Path) -> pd.DataFrame:
    """Shift-JIS前提の安定読み込み（このコンペに合わせる）"""
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    return pd.read_csv(
        path,
        encoding="shift_jis",
        encoding_errors="replace",
        low_memory=False
    )

def ensure_required_columns(df: pd.DataFrame, required: list[str], name: str):
    missing = [c for c in required if c not in df.columns]
    if missing:
        # ここで止めるのが完成度（後段で謎バグにしない）
        raise KeyError(f"[{name}] missing columns: {missing}")

# =========================================================
# 便利関数：列名ゆらぎ対応（lon/latなどが揺れる前提）
# =========================================================
def pick_first_existing(df: pd.DataFrame, candidates: list[str]) -> str:
    for c in candidates:
        if c in df.columns:
            return c
    raise KeyError(f"None of candidates exist: {candidates}")

# lon/lat はコンペで揺れがちなので候補を多めに持つ
LON_CANDIDATES = ["lon", "longitude", "x", "X", "経度"]
LAT_CANDIDATES = ["lat", "latitude", "y", "Y", "緯度"]

# =========================================================
# 便利関数：時間列追加（Step2以降で使う前提列）
# =========================================================
def add_time_columns(df: pd.DataFrame, ym_col: str = YM_COL, base_year: int = BASE_YEAR) -> pd.DataFrame:
    df = df.copy()
    ym = pd.to_numeric(df[ym_col], errors="coerce")
    year = (ym // 100).astype("Int64")
    month = (ym % 100).astype("Int64")

    df["year"] = year
    df["month"] = month
    df["elapsed_months"] = (df["year"] - base_year) * 12 + (df["month"] - 1)
    return df

# =========================================================
# 便利関数：distance列検出（Step2でNaN処理対象にする）
# =========================================================
def get_distance_cols(df: pd.DataFrame, suffix: str = DIST_SUFFIX) -> list[str]:
    return [c for c in df.columns if c.endswith(suffix)]

In [66]:
# =========================================================
# 読み込み
# =========================================================
train = read_csv_sjis(TRAIN_PATH)
test  = read_csv_sjis(TEST_PATH)

In [67]:
# =========================================================
# 必須列チェック（ここで落とすのが正解）
# =========================================================
ensure_required_columns(train, [ID_COL, TARGET_COL, YM_COL, BUILDING_TYPE_COL], "train")
ensure_required_columns(test,  [ID_COL, YM_COL, BUILDING_TYPE_COL], "test")

# lon/lat は列名が揺れる可能性があるので自動解決
LON_COL = pick_first_existing(train, LON_CANDIDATES)
LAT_COL = pick_first_existing(train, LAT_CANDIDATES)
ensure_required_columns(test, [LON_COL, LAT_COL], "test")  # test側にも同名がある前提で確認

# 時間列を追加（Step2/3/4で使用）
train = add_time_columns(train)
test  = add_time_columns(test)

# distance列（Step2で NaNフラグ+大きな定数埋め の対象）
DISTANCE_COLS = get_distance_cols(train)
# train/testで一致しているかも早期に確認（完成度）
missing_in_test = sorted(list(set(DISTANCE_COLS) - set(test.columns)))
if len(missing_in_test) > 0:
    raise KeyError(f"[test] missing some distance columns present in train: {missing_in_test[:30]} ... total={len(missing_in_test)}")

# =========================================================
# Step3/4で使う“分割マスク”の土台（ここでは定義だけ）
# =========================================================
# building_type が mansion/house 以外の表記なら、ここで気づけるようにしておく
BUILDING_TYPES_TRAIN = train[BUILDING_TYPE_COL].astype(str).value_counts()
BUILDING_TYPES_TEST  = test[BUILDING_TYPE_COL].astype(str).value_counts()

print("train shape:", train.shape, " test shape:", test.shape)
print("lon/lat cols:", LON_COL, LAT_COL)
print("target_ym range train:", int(train[YM_COL].min()), "->", int(train[YM_COL].max()))
print("target_ym range test :", int(test[YM_COL].min()),  "->", int(test[YM_COL].max()))
print("num distance cols:", len(DISTANCE_COLS))
print("building_type(train):")
display(BUILDING_TYPES_TRAIN.head(20))
print("building_type(test):")
display(BUILDING_TYPES_TEST.head(20))

train shape: (363924, 152)  test shape: (112437, 152)
lon/lat cols: lon lat
target_ym range train: 201901 -> 202207
target_ym range test : 202301 -> 202307
num distance cols: 11
building_type(train):


Unnamed: 0_level_0,count
building_type,Unnamed: 1_level_1
1,194587
4,153456
999,14313
5,468
8,358
15,274
9,196
901,126
3,61
2,37


building_type(test):


Unnamed: 0_level_0,count
building_type,Unnamed: 1_level_1
1,58650
4,48594
999,4756
5,157
8,80
15,70
9,47
901,42
3,17
2,10


In [68]:
# =========================================================
# 2) 特徴量作成
#   - distance: NaNフラグ + 大きな定数埋め
#   - log距離
#   - 時系列補助特徴
# =========================================================

# -------------------------
# コピー（破壊的変更を避ける）
# -------------------------
train_feat = train.copy()
test_feat  = test.copy()

In [69]:
# =========================================================
# 2-1. distance 特徴量
#   ・NaN = 「一定距離内に存在しない」
#   ・情報なのでフラグ化
#   ・距離自体は大きな定数で埋める
# =========================================================

DIST_FILL_VALUES = {}  # Step3/4で再現できるよう保存

for c in DISTANCE_COLS:
    # ---- NaNフラグ
    train_feat[f"{c}_is_nan"] = train_feat[c].isna().astype("int8")
    test_feat[f"{c}_is_nan"]  = test_feat[c].isna().astype("int8")

    # ---- 埋め値（trainの分位点で固定）
    fill_value = train_feat[c].quantile(DIST_FILL_QUANTILE)
    DIST_FILL_VALUES[c] = fill_value

    train_feat[c] = train_feat[c].fillna(fill_value)
    test_feat[c]  = test_feat[c].fillna(fill_value)

    # ---- log距離（右裾対策）
    train_feat[f"{c}_log"] = np.log1p(train_feat[c])
    test_feat[f"{c}_log"]  = np.log1p(test_feat[c])

In [70]:
# =========================================================
# 2-2. 時系列特徴（README準拠：掲載時期ズレ対策）
# =========================================================

# year / month / elapsed_months は Step1 ですでに作成済み
# 追加で「周期性」を与える
train_feat["month_sin"] = np.sin(2 * np.pi * train_feat["month"] / 12)
train_feat["month_cos"] = np.cos(2 * np.pi * train_feat["month"] / 12)

test_feat["month_sin"] = np.sin(2 * np.pi * test_feat["month"] / 12)
test_feat["month_cos"] = np.cos(2 * np.pi * test_feat["month"] / 12)

In [71]:
# =========================================================
# 2-3. 建物年数系（壊れにくい最小構成）
# =========================================================

DATE_COLS = [
    "building_create_date",
    "building_modify_date"
]

for col in DATE_COLS:
    if col in train_feat.columns:
        train_feat[col] = pd.to_datetime(train_feat[col], errors="coerce")
        test_feat[col]  = pd.to_datetime(test_feat[col],  errors="coerce")

# 築年数（存在する場合のみ）
if "building_create_date" in train_feat.columns:
    train_feat["building_age"] = train_feat["year"] - train_feat["building_create_date"].dt.year
    test_feat["building_age"]  = test_feat["year"]  - test_feat["building_create_date"].dt.year

# マイナスや異常値を防ぐ
if "building_age" in train_feat.columns:
    train_feat["building_age"] = train_feat["building_age"].clip(lower=0)
    test_feat["building_age"]  = test_feat["building_age"].clip(lower=0)

In [72]:
# =========================================================
# 2-4. カテゴリ列の整理（Step3でそのまま使える形）
# =========================================================

# LightGBMに渡す予定のカテゴリ列
CATEGORICAL_COLS = []

if BUILDING_TYPE_COL in train_feat.columns:
    CATEGORICAL_COLS.append(BUILDING_TYPE_COL)
    train_feat[BUILDING_TYPE_COL] = train_feat[BUILDING_TYPE_COL].astype("category")
    test_feat[BUILDING_TYPE_COL]  = test_feat[BUILDING_TYPE_COL].astype("category")

In [73]:
# =========================================================
# 2-5. 数値列の最終ガード（inf / -inf / NaN）
# =========================================================

NUMERIC_COLS = train_feat.select_dtypes(include=[np.number]).columns.tolist()
NUMERIC_COLS = [c for c in NUMERIC_COLS if c != TARGET_COL]

def final_numeric_guard(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
    df = df.copy()
    df[num_cols] = (
        df[num_cols]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    return df

train_feat = final_numeric_guard(train_feat, NUMERIC_COLS)
test_feat  = final_numeric_guard(test_feat,  NUMERIC_COLS)

In [74]:
# =========================================================
# 2-6. Step3/4 用の列リストを確定
# =========================================================

FEATURE_COLS = [
    c for c in train_feat.columns
    if c not in [TARGET_COL]
    and c in test_feat.columns
]

print("num features:", len(FEATURE_COLS))
print("sample feature cols:", FEATURE_COLS[:20])

num features: 176
sample feature cols: ['target_ym', 'building_id', 'building_status', 'building_create_date', 'building_modify_date', 'building_type', 'building_name', 'building_name_ruby', 'homes_building_name', 'homes_building_name_ruby', 'unit_count', 'full_address', 'lon', 'lat', 'building_structure', 'total_floor_area', 'building_area', 'floor_count', 'basement_floor_count', 'year_built']


In [75]:
# =========================================================
# 3) モデル作成（完成度重視）
#   - 時系列 valid（例：2022）で検証可能な形にする
#   - Step4で必要な変数を全てここで確定させる
# =========================================================

import lightgbm as lgb


In [76]:
# =========================================================
# 3-0. 整合性チェック（Step2の成果物が揃っているか）
# =========================================================
required_step2_vars = ["train_feat", "test_feat", "FEATURE_COLS", "CATEGORICAL_COLS"]
for v in required_step2_vars:
    if v not in globals():
        raise NameError(f"Missing required variable from Step2: {v}")

# targetの存在確認
if TARGET_COL not in train_feat.columns:
    raise KeyError(f"train_feat missing target column: {TARGET_COL}")

In [77]:
# =========================================================
# 3-1. 学習行列の作成（Step4でそのまま使う）
# =========================================================
X_train = train_feat[FEATURE_COLS].copy()
y_train = train_feat[TARGET_COL].astype(float).copy()

X_test  = test_feat[FEATURE_COLS].copy()

# LightGBMに渡すカテゴリ列（存在するものだけ）
CAT_COLS = [c for c in CATEGORICAL_COLS if c in X_train.columns]
for c in CAT_COLS:
    X_train[c] = X_train[c].astype("category")
    X_test[c]  = X_test[c].astype("category")

print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("num cat cols:", len(CAT_COLS), CAT_COLS[:10])

X_train: (363924, 176) X_test: (112437, 176)
num cat cols: 1 ['building_type']


In [78]:
# =========================================================
# 3-2. 時系列 valid の作成（README準拠：過去→未来）
#   デフォルトは 2022 を valid にする
# =========================================================
VALID_YEAR = 2022

if "year" not in train_feat.columns:
    raise KeyError("train_feat must have 'year' (created in Step1)")

valid_mask = (train_feat["year"] == VALID_YEAR)
train_mask = (train_feat["year"] < VALID_YEAR)

if valid_mask.sum() == 0:
    raise ValueError(f"No rows found for VALID_YEAR={VALID_YEAR}. Check train_feat['year'].")

if train_mask.sum() == 0:
    raise ValueError(f"No training rows found for year < {VALID_YEAR}. Check train_feat['year'].")

print("train rows:", int(train_mask.sum()), " valid rows:", int(valid_mask.sum()))

train rows: 274358  valid rows: 89566


In [79]:
def select_numeric_features(df):
    return df.select_dtypes(include=[np.number, bool])

X_train_num = select_numeric_features(X_train)
X_test_num  = select_numeric_features(X_test)

print("使用特徴量数:", X_train_num.shape[1])

# =========================
# 2. 念のため NaN / inf をガード（LightGBM的には必須ではないが安全）
# =========================
def final_numeric_guard(df):
    df = df.copy()
    df = df.replace([np.inf, -np.inf], np.nan)
    return df

X_train_num = final_numeric_guard(X_train_num)
X_test_num  = final_numeric_guard(X_test_num)


[drop non-numeric cols] 44 columns
['building_create_date', 'building_modify_date', 'building_name', 'homes_building_name', 'homes_building_name_ruby', 'full_address', 'land_seigen', 'reform_exterior', 'reform_exterior_other', 'reform_common_area', 'building_tag_id', 'unit_name', 'reform_place', 'reform_place_other', 'reform_wet_area', 'reform_wet_area_other', 'reform_interior', 'reform_interior_other', 'reform_etc', 'renovation_date']
[drop non-numeric cols] 45 columns
['building_create_date', 'building_modify_date', 'building_name', 'homes_building_name', 'homes_building_name_ruby', 'full_address', 'land_seigen', 'reform_exterior', 'reform_exterior_other', 'reform_common_area', 'building_tag_id', 'unit_name', 'reform_place', 'reform_place_other', 'reform_wet_area', 'reform_wet_area_other', 'reform_interior', 'reform_interior_other', 'reform_etc', 'renovation_date']


Unnamed: 0,count
float64,103
int64,11
int8,11
Int64,4
Float64,2
category,1


In [80]:
# =========================================================
# 3-3. 学習関数（log1pで学習、expで戻す）
#   ※評価はStep4でMAPEなど多角的に行う
# =========================================================
def train_lgb_model(X_tr, y_tr, X_va, y_va):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "seed": 42,
        "verbosity": -1,
    }

    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_valid = lgb.Dataset(X_va, y_va, reference=lgb_train)

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=3000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(200),
        ],
    )

    return model

In [81]:
# =========================================================
# 3-4. 単一モデル or タイプ別モデル（切替可能）
#   - まずは安定な「単一モデル」をデフォルトにする
#   - あとで必要ならタイプ別に切り替え可能
# =========================================================
USE_TYPE_SPLIT = False  # まずは False 推奨（完成度・安定性優先）

MODELS = {}  # Step4で使う

# Step4で使う予測保持（raw）
oof_pred_raw = np.full(len(train_feat), np.nan, dtype=float)
test_pred_raw = np.zeros(len(test_feat), dtype=float)

def predict_raw(model: lgb.Booster, X: pd.DataFrame) -> np.ndarray:
    # log1p学習 → expm1で戻す
    pred_log = model.predict(X, num_iteration=model.best_iteration)
    pred = np.expm1(pred_log)
    return pred

if not USE_TYPE_SPLIT:
    # ---- 単一モデル ----
    model_all = train_lgb_model(
        X_train_num.loc[train_mask],
        y_train.loc[train_mask],
        X_train_num.loc[valid_mask],
        y_train.loc[valid_mask],
    )
    MODELS["all"] = model_all

    # valid予測（OOFの一部）
    oof_pred_raw[valid_mask.values] = predict_raw(model_all, X_train.loc[valid_mask])

    # test予測
    y_test_pred = model_all.predict(X_test_num)

else:
    # ---- building_type 別モデル ----
    if BUILDING_TYPE_COL not in train_feat.columns:
        raise KeyError(f"USE_TYPE_SPLIT=True requires {BUILDING_TYPE_COL}")

    for t in sorted(train_feat[BUILDING_TYPE_COL].astype(str).unique()):
        tr_t = train_mask & (train_feat[BUILDING_TYPE_COL].astype(str) == t)
        va_t = valid_mask & (train_feat[BUILDING_TYPE_COL].astype(str) == t)

        if tr_t.sum() == 0 or va_t.sum() == 0:
            print(f"[skip] type={t} has tr={int(tr_t.sum())} va={int(va_t.sum())}")
            continue

        model_t = train_lgb_model(
            X_train.loc[tr_t], y_train.loc[tr_t],
            X_train.loc[va_t], y_train.loc[va_t],
            cat_cols=CAT_COLS,
            seed=SEED + (hash(t) % 1000)
        )
        MODELS[t] = model_t

        oof_pred_raw[va_t.values] = predict_raw(model_t, X_train.loc[va_t])

        # test側も同じtypeのみ予測して埋める
        te_t = (test_feat[BUILDING_TYPE_COL].astype(str) == t)
        if te_t.sum() > 0:
            test_pred_raw[te_t.values] = predict_raw(model_t, X_test_num.loc[te_t])

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: building_create_date: datetime64[ns], building_modify_date: datetime64[ns], building_name: object, homes_building_name: object, homes_building_name_ruby: object, full_address: object, land_seigen: object, reform_exterior: object, reform_exterior_other: object, reform_common_area: object, building_tag_id: object, unit_name: object, reform_place: object, reform_place_other: object, reform_wet_area: object, reform_wet_area_other: object, reform_interior: object, reform_interior_other: object, reform_etc: object, renovation_date: object, renovation_etc: object, unit_tag_id: object, snapshot_create_date: object, new_date: object, snapshot_modify_date: object, timelimit_date: object, empty_contents: object, addr2_name: object, addr3_name: object, rosen_name1: object, eki_name1: object, bus_stop1: object, rosen_name2: object, eki_name2: object, bus_stop2: object, traffic_other: object, money_sonota_str1: object, money_sonota_str2: object, money_sonota_str3: object, parking_memo: object, school_ele_name: object, school_jun_name: object, est_other_name: object, statuses: object

In [None]:
# =========================================================
# 3-5. Step4への受け渡し（列として保存）
# =========================================================
train_feat["pred_valid_raw"] = oof_pred_raw   # valid年（例：2022）だけ値が入り、それ以外はNaN
test_feat["pred_test_raw"]   = test_pred_raw  # test全行

PRED_VALID_COL = "pred_valid_raw"
PRED_TEST_COL  = "pred_test_raw"

print("[OK] Step3 artifacts ready:")
print(" - MODELS keys:", list(MODELS.keys())[:10])
print(" - train_feat[PRED_VALID_COL] non-null:", int(np.isfinite(train_feat[PRED_VALID_COL]).sum()))
print(" - test_feat[PRED_TEST_COL] shape:", test_feat[PRED_TEST_COL].shape)

In [None]:
# =========================================================
# 4) 受け渡し検証・可視化・後処理・提出（完成度重視）
#  - 1/2/3/4 の整合性を最初に厳密チェック
#  - VALID_YEAR(=2022)で多角的検証
#  - NaNフラグ（distance）に関する診断
#  - 後処理（低価格補正など）を「改善したか」まで確認
#  - submit.csv を作成
# =========================================================

import matplotlib.pyplot as plt

In [None]:
# =========================================================
# 4-0. 整合性チェック
# =========================================================
def assert_pipeline_integrity():
    # Step1
    for v in ["train", "test", "ID_COL", "TARGET_COL", "YM_COL", "BUILDING_TYPE_COL",
              "LON_COL", "LAT_COL", "DISTANCE_COLS", "DIST_FILL_QUANTILE"]:
        if v not in globals():
            raise NameError(f"Missing from Step1: {v}")

    # Step2
    for v in ["train_feat", "test_feat", "FEATURE_COLS", "CATEGORICAL_COLS", "DIST_FILL_VALUES"]:
        if v not in globals():
            raise NameError(f"Missing from Step2: {v}")

    # Step3
    for v in ["X_train", "y_train", "X_test", "MODELS", "valid_mask", "train_mask",
              "PRED_VALID_COL", "PRED_TEST_COL"]:
        if v not in globals():
            raise NameError(f"Missing from Step3: {v}")

    # 列存在チェック
    must_train_cols = [ID_COL, TARGET_COL, YM_COL, "year", "month", "elapsed_months", BUILDING_TYPE_COL, PRED_VALID_COL]
    must_test_cols  = [ID_COL, YM_COL, "year", "month", "elapsed_months", BUILDING_TYPE_COL, PRED_TEST_COL]
    for c in must_train_cols:
        if c not in train_feat.columns:
            raise KeyError(f"train_feat missing: {c}")
    for c in must_test_cols:
        if c not in test_feat.columns:
            raise KeyError(f"test_feat missing: {c}")

    # FEATURE_COLS は train/test 両方に存在していること
    missing_train = [c for c in FEATURE_COLS if c not in train_feat.columns]
    missing_test  = [c for c in FEATURE_COLS if c not in test_feat.columns]
    if missing_train:
        raise KeyError(f"train_feat missing FEATURE_COLS: {missing_train[:20]} ... total={len(missing_train)}")
    if missing_test:
        raise KeyError(f"test_feat missing FEATURE_COLS: {missing_test[:20]} ... total={len(missing_test)}")

    # 予測列の NaN 状態：valid年以外はNaNでもOK、valid年は全て埋まっていること
    if train_feat.loc[valid_mask, PRED_VALID_COL].isna().any():
        n = int(train_feat.loc[valid_mask, PRED_VALID_COL].isna().sum())
        raise ValueError(f"Validation predictions contain NaN: {n} rows in VALID_YEAR={VALID_YEAR}")

    # test予測は全行埋まっていること
    if test_feat[PRED_TEST_COL].isna().any():
        n = int(test_feat[PRED_TEST_COL].isna().sum())
        raise ValueError(f"Test predictions contain NaN: {n} rows")

    # 数値のinfチェック（重要）
    def _has_inf(df, cols):
        arr = df[cols].select_dtypes(include=[np.number]).to_numpy()
        return np.isinf(arr).any()

    if _has_inf(train_feat, FEATURE_COLS + [PRED_VALID_COL, TARGET_COL]):
        raise ValueError("train_feat contains inf in features/preds/target")
    if _has_inf(test_feat, FEATURE_COLS + [PRED_TEST_COL]):
        raise ValueError("test_feat contains inf in features/preds")

    print("[OK] Pipeline integrity check passed.")

assert_pipeline_integrity()

In [None]:
# =========================================================
# 4-1. 指標関数
# =========================================================
EPS = 1e-6

def mape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), EPS)
    return np.mean(np.abs(y_true - y_pred) / denom)

def mae(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [None]:
# =========================================================
# 4-2. VALID データセットの切り出し
# =========================================================
valid_df = train_feat.loc[valid_mask, :].copy()

y_true = valid_df[TARGET_COL].values
y_pred_raw = valid_df[PRED_VALID_COL].values

print("\n==== VALID METRICS (RAW) ====")
print("VALID_YEAR:", VALID_YEAR)
print("MAPE:", mape(y_true, y_pred_raw))
print("MAE :", mae(y_true, y_pred_raw))
print("RMSE:", rmse(y_true, y_pred_raw))

In [None]:
# =========================================================
# 4-3. 基本可視化（予測vs真値、残差、APE）
# =========================================================
def plot_pred_vs_true(y_true, y_pred, title):
    plt.figure()
    plt.scatter(y_true, y_pred, s=6, alpha=0.4)
    mn = float(min(y_true.min(), y_pred.min()))
    mx = float(max(y_true.max(), y_pred.max()))
    plt.plot([mn, mx], [mn, mx])
    plt.xlabel("true")
    plt.ylabel("pred")
    plt.title(title)
    plt.show()

def plot_hist(data, bins, title, xlabel):
    plt.figure()
    plt.hist(data, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("count")
    plt.show()

plot_pred_vs_true(y_true, y_pred_raw, f"VALID {VALID_YEAR}: Pred vs True (RAW)")

resid = y_pred_raw - y_true
ape   = np.abs(resid) / np.maximum(np.abs(y_true), EPS)

plot_hist(resid, bins=60, title=f"VALID {VALID_YEAR}: Residual (pred-true)", xlabel="residual")
plot_hist(ape,   bins=60, title=f"VALID {VALID_YEAR}: APE", xlabel="APE")

# ログ空間でも見る（外れの見え方が変わる）
plot_pred_vs_true(np.log1p(y_true), np.log1p(y_pred_raw), f"VALID {VALID_YEAR}: log1p(Pred) vs log1p(True)")

In [None]:
# =========================================================
# 4-4. 分解検証（改善の見通し用）
#   - 月別 / 建物タイプ別 / 価格帯別（MAPEに効く）
# =========================================================
valid_df["ape"] = ape

# 月別
month_summary = valid_df.groupby("month").agg(
    n=(TARGET_COL, "size"),
    mape=("ape", "mean"),
    true_mean=(TARGET_COL, "mean"),
    pred_mean=(PRED_VALID_COL, "mean"),
)
print("\n---- VALID: month summary ----")
display(month_summary)

plt.figure()
plt.plot(month_summary.index.values, month_summary["mape"].values, marker="o")
plt.xlabel("month")
plt.ylabel("MAPE")
plt.title(f"VALID {VALID_YEAR}: MAPE by month")
plt.show()

# building_type別
type_summary = valid_df.groupby(BUILDING_TYPE_COL).agg(
    n=(TARGET_COL, "size"),
    mape=("ape", "mean"),
    true_mean=(TARGET_COL, "mean"),
    pred_mean=(PRED_VALID_COL, "mean"),
)
print("\n---- VALID: building_type summary ----")
display(type_summary)

# 価格帯別（decile）
valid_df["price_decile"] = pd.qcut(valid_df[TARGET_COL], q=10, duplicates="drop")
price_summary = valid_df.groupby("price_decile").agg(
    n=(TARGET_COL, "size"),
    mape=("ape", "mean"),
    true_mean=(TARGET_COL, "mean"),
    pred_mean=(PRED_VALID_COL, "mean"),
)
print("\n---- VALID: price decile summary ----")
display(price_summary)

plt.figure()
plt.plot(np.arange(len(price_summary)), price_summary["mape"].values, marker="o")
plt.xlabel("price decile (low -> high)")
plt.ylabel("MAPE")
plt.title(f"VALID {VALID_YEAR}: MAPE by price decile")
plt.show()


In [None]:
# =========================================================
# 4-5. distance NaNフラグ診断（重大：今回の改善点の核心）
#   - NaN群と非NaN群でMAPEがどう違うか
#   - どの距離列のNaNが特に効いているかが見える
# =========================================================
nan_diag_rows = []
for c in DISTANCE_COLS:
    flag = f"{c}_is_nan"
    if flag not in valid_df.columns:
        continue

    m_nan = valid_df[flag] == 1
    m_non = valid_df[flag] == 0

    # 両方の群が存在しない場合はスキップ
    if m_nan.sum() == 0 or m_non.sum() == 0:
        continue

    y_nan_true = valid_df.loc[m_nan, TARGET_COL].values
    y_nan_pred = valid_df.loc[m_nan, PRED_VALID_COL].values

    y_non_true = valid_df.loc[m_non, TARGET_COL].values
    y_non_pred = valid_df.loc[m_non, PRED_VALID_COL].values

    nan_diag_rows.append({
        "distance_col": c,
        "nan_ratio": float(m_nan.mean()),
        "mape_nan": float(mape(y_nan_true, y_nan_pred)),
        "mape_non": float(mape(y_non_true, y_non_pred)),
        "delta_mape(nan-non)": float(mape(y_nan_true, y_nan_pred) - mape(y_non_true, y_non_pred)),
        "n_nan": int(m_nan.sum()),
        "n_non": int(m_non.sum()),
    })

nan_diag = pd.DataFrame(nan_diag_rows).sort_values("delta_mape(nan-non)", ascending=False)
print("\n---- VALID: distance NaN diagnostic (top) ----")
display(nan_diag.head(30))

# 代表的なものをプロット（上位5つ）
topk = nan_diag.head(5)["distance_col"].tolist()
for c in topk:
    flag = f"{c}_is_nan"
    plt.figure()
    grp = valid_df.groupby(flag)["ape"].mean()
    plt.bar(grp.index.astype(str), grp.values)
    plt.title(f"VALID {VALID_YEAR}: mean APE by {flag}")
    plt.xlabel(f"{flag} (0=exists, 1=missing)")
    plt.ylabel("mean APE")
    plt.show()

In [None]:
# =========================================================
# 4-6. 外れケースの抽出（次の改善が見える）
#   - APE上位を building_id で追える
# =========================================================
worst = valid_df[[ID_COL, YM_COL, BUILDING_TYPE_COL, TARGET_COL, PRED_VALID_COL, "ape"]].sort_values("ape", ascending=False)
print("\n---- Worst 30 APE (VALID) ----")
display(worst.head(30))

In [None]:
# =========================================================
# 4-7. Feature importance（次の改善の方向性）
#   - 単一モデルなら all
#   - type split なら各モデル
# =========================================================
def show_feature_importance(model, feature_cols, topn=40, title="feature importance"):
    imp = pd.DataFrame({
        "feature": feature_cols,
        "gain": model.feature_importance(importance_type="gain")
    }).sort_values("gain", ascending=False).head(topn)

    plt.figure(figsize=(8, max(6, topn * 0.22)))
    plt.barh(imp["feature"][::-1], imp["gain"][::-1])
    plt.xlabel("gain importance")
    plt.title(title)
    plt.show()
    return imp

if "all" in MODELS:
    imp_all = show_feature_importance(MODELS["all"], FEATURE_COLS, topn=40, title="Model(all): top gain importance")
    print("\n---- Top features (all) ----")
    display(imp_all.head(25))
else:
    for k, mdl in MODELS.items():
        imp_k = show_feature_importance(mdl, FEATURE_COLS, topn=30, title=f"Model({k}): top gain importance")
        print(f"\n---- Top features ({k}) ----")
        display(imp_k.head(20))

In [None]:
# =========================================================
# 4-8. 後処理（例：低価格帯補正）＋「改善したか」を検証
#   ※ここはあなたの状況に応じて最適化ポイント
# =========================================================
LOW_TH = 9_000_000
LOW_SCALE = 0.83

def apply_low_price_scale(pred, th=LOW_TH, scale=LOW_SCALE):
    pred2 = pred.copy().astype(float)
    mask = pred2 <= th
    pred2[mask] *= scale
    return pred2

y_pred_post = apply_low_price_scale(y_pred_raw)

print("\n==== VALID METRICS (POST: low-price scale) ====")
print("MAPE:", mape(y_true, y_pred_post))
print("MAE :", mae(y_true, y_pred_post))
print("RMSE:", rmse(y_true, y_pred_post))

plot_pred_vs_true(y_true, y_pred_post, f"VALID {VALID_YEAR}: Pred vs True (POST)")
plot_hist(np.abs(y_pred_post - y_true)/np.maximum(np.abs(y_true), EPS), bins=60,
          title=f"VALID {VALID_YEAR}: APE (POST)", xlabel="APE")

# 価格帯別に「後処理が効いたか」も見える化
valid_df["ape_post"] = np.abs(y_pred_post - y_true) / np.maximum(np.abs(y_true), EPS)
price_post_summary = valid_df.groupby("price_decile").agg(
    mape_raw=("ape", "mean"),
    mape_post=("ape_post", "mean"),
    n=("ape", "size")
)
print("\n---- VALID: price decile (raw vs post) ----")
display(price_post_summary)

plt.figure()
plt.plot(np.arange(len(price_post_summary)), price_post_summary["mape_raw"].values, marker="o", label="raw")
plt.plot(np.arange(len(price_post_summary)), price_post_summary["mape_post"].values, marker="o", label="post")
plt.xlabel("price decile (low -> high)")
plt.ylabel("MAPE")
plt.title(f"VALID {VALID_YEAR}: MAPE by price decile (raw vs post)")
plt.legend()
plt.show()

In [None]:
# =========================================================
# 4-9. submit作成（最終）
#   - building_id と money_room を出す
#   - raw/post どちらを採用するかはここで決められる
# =========================================================
# ----------------------
# test予測（X_test_num 使用）
# ----------------------
y_test_pred = model_all.predict(X_test_num)

# testの後処理も同じものを適用
test_pred_raw = y_test_pred
test_pred_post = apply_low_price_scale(test_pred_raw)

# 最終採用（まずは post を使う。必要なら raw に戻す）
test_feat["pred_final"] = test_pred_post

# clip（価格として負値はありえない）
test_feat["pred_final"] = test_feat["pred_final"].clip(lower=0)

SUBMIT_PATH = DATA_DIR / "submit.csv"
submit = test_feat[[ID_COL]].copy()
submit[TARGET_COL] = test_feat["pred_final"].values

submit.to_csv(SUBMIT_PATH, index=False, encoding="utf-8")
print(f"\n[OK] saved submit: {SUBMIT_PATH}")
display(submit.head())


In [None]:
# =========================================================
# 4-10. 次の改善が見通せる「自動診断まとめ」
# =========================================================
print("\n==== Next Improvement Hints (auto) ====")

# 1) 月別の弱点
worst_month = month_summary["mape"].idxmax()
print(f"- Worst month (valid): month={int(worst_month)}  mape={month_summary.loc[worst_month, 'mape']:.4f}")

# 2) 価格帯の弱点
worst_decile = price_summary["mape"].idxmax()
print(f"- Worst price decile (valid): {worst_decile}  mape={price_summary.loc[worst_decile, 'mape']:.4f}")

# 3) distance NaN が痛い列
if len(nan_diag) > 0:
    top_bad = nan_diag.head(5)[["distance_col", "nan_ratio", "delta_mape(nan-non)"]]
    print("- distance NaN impact (top 5):")
    display(top_bad)
    print("  Action: deltaが大きい列は、log距離に加えて、距離のbin化（例：0-200m,200-500m,...）や相互作用を検討")

# 4) elapsed_months の重要度が低いなら
if "all" in MODELS:
    # 重要度表に elapsed_months があるか確認
    if "elapsed_months" in imp_all["feature"].values:
        rank = int(np.where(imp_all["feature"].values == "elapsed_months")[0][0] + 1)
        print(f"- elapsed_months is in top list (rank~{rank} within shown topn).")
    else:
        print("- elapsed_months not in shown top importance.")
        print("  Action: 時間水準特徴（例：prefecture×year の平均価格など）を追加すると効く余地が大きい")

print("\n[Recommended next experiments]")
print("1) USE_TYPE_SPLIT=True（building_type別モデル）を試す（CVで確認）")
print("2) distance列のbin化＋is_nanの相互作用（例：is_nan * elapsed_months）")
print("3) 時間水準特徴：地域（pref/市区町村）× year の統計量（平均・中央値）を安全に作る（リークなしの形で）")