<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%EF%BC%91%EF%BC%97%EF%BC%8E%EF%BC%90%E3%81%AE%E3%82%B3%E3%83%BC%E3%83%89base/re2_hasegawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import numpy as np
import pandas as pd
import lightgbm as lgb


In [27]:
# =========================
# パス定義
# =========================
TRAIN_PATH = "/content/train.csv"
TEST_PATH  = "/content/test.csv"

# =========================
# CSV 読み込み
# =========================
df_train_raw = pd.read_csv(
    TRAIN_PATH,
    encoding="shift_jis",        # 日本不動産データで最も安全
    encoding_errors="replace",   # 文字化けは落とさない
    low_memory=False             # dtype 分断防止
)

df_test_raw = pd.read_csv(
    TEST_PATH,
    encoding="shift_jis",
    encoding_errors="replace",
    low_memory=False
)

# ---------------------------
#共通ユーティリティ（評価・型・カラム選択）
# ---------------------------

TARGET = "money_room"
TIME_COL = "target_ym"
TYPE_COL = "building_type"

# ---------------------------
# 評価指標（検証用）
# ---------------------------
def regression_metrics(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    mask = np.isfinite(y_true) & np.isfinite(y_pred) & (y_true > 0)
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    mae = float(np.mean(np.abs(y_true - y_pred)))
    rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
    mape = float(np.mean(np.abs((y_true - y_pred) / y_true)) * 100.0)
    return {"MAE": mae, "RMSE": rmse, "MAPE(%)": mape}

# ---------------------------
# カラムが存在するものだけ選ぶ
# ---------------------------
def pick_existing(df, cols):
    return [c for c in cols if c in df.columns]

# ---------------------------
# 数値変換（安全）
# ---------------------------
def to_numeric(df, cols):
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

# ---------------------------
# カテゴリ変換（安全）
# ---------------------------
def to_category(df, cols, na_token="NA"):
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = df[c].astype("object").fillna(na_token).astype("category")
    return df

# ---------------------------
# train: 外れ値は削除 / test: 外れ値はclip
# year_built は「築年数(年)」として入っている前提
# ---------------------------
def filter_train_world(df, age_col="year_built", max_age=150, floor_col="floor_count", max_floor=70):
    df = df.copy()
    if age_col in df.columns:
        df = df.loc[(df[age_col] >= 0) & (df[age_col] <= max_age)]
    if floor_col in df.columns:
        df = df.loc[df[floor_col].isna() | (df[floor_col] <= max_floor)]
    return df

def clip_test_world(df, age_col="year_built", max_age=150, floor_col="floor_count", max_floor=70):
    df = df.copy()
    if age_col in df.columns:
        df[age_col] = df[age_col].clip(0, max_age)
    if floor_col in df.columns:
        df[floor_col] = df[floor_col].clip(0, max_floor)
    return df

# ---------------------------
# 時系列split（未来対応）
# valid_year=2023 → 2022まで学習、2023で検証
# ---------------------------
def temporal_split(df, valid_year=2023):
    df = df.copy()
    # target_ym が int yyyymm の想定
    split_point = valid_year * 100
    tr = df[df[TIME_COL] < split_point].copy()
    va = df[df[TIME_COL] >= split_point].copy()
    return tr, va


In [28]:
# ---------------------------
#前処理（マンション/アパート：building_type 1,3）
# ---------------------------

def preprocess_mansion(df, is_train: bool):
    df = df.copy()

    # 対象を集合住宅に限定
    df = df[df[TYPE_COL].isin([1, 3])].copy()

    # 必須の型（time）
    df[TIME_COL] = pd.to_numeric(df[TIME_COL], errors="coerce").astype("Int64")
    df = df.dropna(subset=[TIME_COL])

    # 数値候補
    num_candidates = [
        "unit_area", "room_count", "room_floor",
        "floor_count", "walk_distance1",
        "lat", "lon",
        "year_built"  # 築年数(年)
    ]
    df = to_numeric(df, pick_existing(df, num_candidates))

    # train/testの世界線制御
    if is_train:
        df = filter_train_world(df, age_col="year_built", max_age=150, floor_col="floor_count", max_floor=70)
    else:
        df = clip_test_world(df, age_col="year_built", max_age=150, floor_col="floor_count", max_floor=70)

    # 欠損補完（意味付き）
    if "walk_distance1" in df.columns:
        df["walk_distance1"] = df["walk_distance1"].fillna(9999)
    for c in ["unit_area", "room_count", "room_floor", "floor_count", "lat", "lon", "year_built"]:
        if c in df.columns:
            df[c] = df[c].fillna(0)

    # 派生（単純・強い・未来不変）
    if "year_built" in df.columns:
        df["age_sq"] = df["year_built"] ** 2
        df["age_log"] = np.log1p(df["year_built"])

    if ("room_floor" in df.columns) and ("floor_count" in df.columns):
        fc = df["floor_count"].replace(0, np.nan)
        df["floor_ratio"] = (df["room_floor"] / fc).fillna(0)

    if "unit_area" in df.columns:
        df["unit_area_log"] = np.log1p(df["unit_area"])

    # カテゴリ候補（存在するものだけ）
    # madori_kind_all がなければ floor_plan_code
    plan_col = "madori_kind_all" if "madori_kind_all" in df.columns else ("floor_plan_code" if "floor_plan_code" in df.columns else None)

    cat_candidates = [
        "building_structure",
        "addr1_1", "addr1_2",
        "post1",
        "land_youto", "land_toshi",
    ]
    if plan_col:
        cat_candidates.append(plan_col)

    # addr1_1 は 2桁ゼロ埋め（カテゴリとして扱う）
    if "addr1_1" in df.columns:
        df["addr1_1"] = df["addr1_1"].astype("object").fillna("NA")
        df.loc[df["addr1_1"] != "NA", "addr1_1"] = df.loc[df["addr1_1"] != "NA", "addr1_1"].astype(str).str.zfill(2)

    df = to_category(df, pick_existing(df, cat_candidates), na_token="NA")

    # 特徴量リスト確定
    feat_cols = [
        "unit_area", "unit_area_log",
        "room_count",
        "room_floor", "floor_count", "floor_ratio",
        "walk_distance1",
        "lat", "lon",
        "year_built", "age_sq", "age_log",
        "building_structure",
        "addr1_1", "addr1_2", "post1",
        "land_youto", "land_toshi",
    ]
    if plan_col:
        feat_cols.append(plan_col)

    feat_cols = pick_existing(df, feat_cols)

    if is_train:
        df = df.dropna(subset=[TARGET])
        X = df[feat_cols].copy()
        y = df[TARGET].copy()
        return df, X, y, feat_cols
    else:
        X = df[feat_cols].copy()
        return df, X, feat_cols


In [29]:
# ---------------------------
#前処理（戸建：building_type 4）
# ---------------------------
def preprocess_house(df, is_train: bool):
    df = df.copy()

    # 対象を戸建に限定
    df = df[df[TYPE_COL] == 4].copy()

    # 必須の型（time）
    df[TIME_COL] = pd.to_numeric(df[TIME_COL], errors="coerce").astype("Int64")
    df = df.dropna(subset=[TIME_COL])

    num_candidates = [
        "house_area", "unit_area",
        "walk_distance1",
        "lat", "lon",
        "year_built",
        "land_kenpei", "land_youseki",
        "building_land_area", "land_area_all"
    ]
    df = to_numeric(df, pick_existing(df, num_candidates))

    # train/testの世界線制御（年齢）
    if is_train:
        df = filter_train_world(df, age_col="year_built", max_age=150, floor_col="floor_count", max_floor=70)
    else:
        df = clip_test_world(df, age_col="year_built", max_age=150, floor_col="floor_count", max_floor=70)

    # 面積の異常（戸建は土地の極端値が混じりやすいので保守的）
    # ここは「削除（train）」だけ。testはclipしない（提出件数保持）。
    if is_train and "house_area" in df.columns:
        df = df.loc[df["house_area"].isna() | (df["house_area"] <= 2000)]

    # 欠損補完（意味付き）
    if "walk_distance1" in df.columns:
        df["walk_distance1"] = df["walk_distance1"].fillna(9999)
    for c in ["house_area", "unit_area", "lat", "lon", "year_built", "land_kenpei", "land_youseki", "building_land_area", "land_area_all"]:
        if c in df.columns:
            df[c] = df[c].fillna(0)

    # 派生
    if "year_built" in df.columns:
        df["age_sq"] = df["year_built"] ** 2
        df["age_log"] = np.log1p(df["year_built"])

    # 面積のlog（戸建でも効く）
    if "house_area" in df.columns:
        df["house_area_log"] = np.log1p(df["house_area"])
    if "unit_area" in df.columns:
        df["unit_area_log"] = np.log1p(df["unit_area"])

    # カテゴリ
    plan_col = "madori_kind_all" if "madori_kind_all" in df.columns else ("floor_plan_code" if "floor_plan_code" in df.columns else None)
    cat_candidates = [
        "building_structure",
        "addr1_1", "addr1_2",
        "post1",
        "land_youto", "land_toshi",
    ]
    if plan_col:
        cat_candidates.append(plan_col)

    if "addr1_1" in df.columns:
        df["addr1_1"] = df["addr1_1"].astype("object").fillna("NA")
        df.loc[df["addr1_1"] != "NA", "addr1_1"] = df.loc[df["addr1_1"] != "NA", "addr1_1"].astype(str).str.zfill(2)

    df = to_category(df, pick_existing(df, cat_candidates), na_token="NA")

    # 特徴量（戸建は土地系を優先）
    feat_cols = [
        "house_area", "house_area_log",
        "unit_area", "unit_area_log",
        "building_land_area", "land_area_all",
        "land_kenpei", "land_youseki",
        "walk_distance1",
        "lat", "lon",
        "year_built", "age_sq", "age_log",
        "building_structure",
        "addr1_1", "addr1_2", "post1",
        "land_youto", "land_toshi",
    ]
    if plan_col:
        feat_cols.append(plan_col)

    feat_cols = pick_existing(df, feat_cols)

    if is_train:
        df = df.dropna(subset=[TARGET])
        X = df[feat_cols].copy()
        y = df[TARGET].copy()
        return df, X, y, feat_cols
    else:
        X = df[feat_cols].copy()
        return df, X, feat_cols


In [30]:
# ---------------------------
#学習関数（fair loss）
# ---------------------------
def train_lgb_fair(X_train, y_train, X_valid, y_valid, cat_cols, fair_c=1.0):

    dtr = lgb.Dataset(
        X_train, y_train,
        categorical_feature=cat_cols,
        free_raw_data=False
    )
    dva = lgb.Dataset(
        X_valid, y_valid,
        categorical_feature=cat_cols,
        free_raw_data=False
    )

    params = {
        "objective": "fair",
        "fair_c": fair_c,
        "metric": "mae",
        "learning_rate": 0.05,
        "num_leaves": 64,
        "min_data_in_leaf": 30,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "verbosity": -1,
        "seed": 42,
    }

    model = lgb.train(
        params,
        dtr,
        valid_sets=[dtr, dva],
        valid_names=["train", "valid"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(50),
        ]
    )

    return model


In [31]:
def run_validation(df_train_raw, valid_year=2023, fair_c=1.0):
    # target_ymはintに寄せる（安全）
    df_train_raw = df_train_raw.copy()
    df_train_raw[TIME_COL] = pd.to_numeric(df_train_raw[TIME_COL], errors="coerce").astype("Int64")

    # -----------------------
    # 1) 集合住宅（1,3）
    # -----------------------
    mansion_df, X_m, y_m, feat_m = preprocess_mansion(df_train_raw, is_train=True)
    tr_m, va_m = temporal_split(mansion_df, valid_year=valid_year)

    # split後にX,yを作り直す（漏れ防止）
    X_tr_m = tr_m[feat_m].copy()
    y_tr_m = tr_m[TARGET].copy()
    X_va_m = va_m[feat_m].copy()
    y_va_m = va_m[TARGET].copy()

    cat_m = list(X_tr_m.select_dtypes(include=["category"]).columns)

    model_m = train_lgb_fair(X_tr_m, y_tr_m, X_va_m, y_va_m, cat_cols=cat_m, fair_c=fair_c)

    pred_m = model_m.predict(X_va_m, num_iteration=model_m.best_iteration)
    met_m = regression_metrics(y_va_m.values, pred_m)

    print("=== Mansion/Apt (building_type in [1,3]) Validation ===")
    for k, v in met_m.items():
        print(f"{k}: {v:,.4f}")

    # -----------------------
    # 2) 戸建（4）
    # -----------------------
    house_df, X_h, y_h, feat_h = preprocess_house(df_train_raw, is_train=True)
    tr_h, va_h = temporal_split(house_df, valid_year=valid_year)

    X_tr_h = tr_h[feat_h].copy()
    y_tr_h = tr_h[TARGET].copy()
    X_va_h = va_h[feat_h].copy()
    y_va_h = va_h[TARGET].copy()

    cat_h = list(X_tr_h.select_dtypes(include=["category"]).columns)

    model_h = train_lgb_fair(X_tr_h, y_tr_h, X_va_h, y_va_h, cat_cols=cat_h, fair_c=fair_c)

    pred_h = model_h.predict(X_va_h, num_iteration=model_h.best_iteration)
    met_h = regression_metrics(y_va_h.values, pred_h)

    print("\n=== House (building_type == 4) Validation ===")
    for k, v in met_h.items():
        print(f"{k}: {v:,.4f}")

    return {
        "mansion_model": model_m,
        "house_model": model_h,
        "mansion_features": feat_m,
        "house_features": feat_h,
        "mansion_metrics": met_m,
        "house_metrics": met_h,
    }

# 実行（例）
results = run_validation(df_train_raw, valid_year=2023, fair_c=1.0)


ValueError: Input data must be 2 dimensional and non empty.

In [None]:
#マンションモデル重要特徴量
import matplotlib.pyplot as plt
import lightgbm as lgb

def plot_importance(model, title, max_num=20):
    plt.figure(figsize=(8, 6))
    lgb.plot_importance(
        model,
        max_num_features=max_num,
        importance_type="gain"
    )
    plt.title(title)
    plt.tight_layout()
    plt.show()

plot_importance(results["mansion_model"], "Mansion Feature Importance")
plot_importance(results["house_model"], "House Feature Importance")


In [None]:
#予測 vs 実測（散布図）
def plot_pred_vs_true(y_true, y_pred, title):
    plt.figure(figsize=(6, 6))
    plt.scatter(y_true, y_pred, s=5, alpha=0.3)
    max_v = max(y_true.max(), y_pred.max())
    plt.plot([0, max_v], [0, max_v], "--", color="gray")
    plt.xlabel("True Price")
    plt.ylabel("Predicted Price")
    plt.title(title)
    plt.tight_layout()
    plt.show()


In [None]:
# mansion valid 再取得
m_df, _, _, feat_m = preprocess_mansion(df_train_raw, is_train=True)
_, m_valid = temporal_split(m_df, valid_year=2023)

X_mv = m_valid[feat_m]
y_mv = m_valid["money_room"]
y_mp = results["mansion_model"].predict(X_mv)

plot_pred_vs_true(y_mv, y_mp, "Mansion: Predicted vs True")


In [None]:
def plot_error_hist(y_true, y_pred, title):
    errors = y_pred - y_true
    plt.figure(figsize=(7, 4))
    plt.hist(errors, bins=100)
    plt.title(title)
    plt.xlabel("Prediction Error")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()


In [None]:
plot_error_hist(y_mv, y_mp, "Mansion Prediction Error Distribution")


In [None]:
def plot_mape_by_price(y_true, y_pred, bins=10, title="MAPE by Price Bin"):
    df = pd.DataFrame({
        "y": y_true,
        "pred": y_pred
    })
    df["abs_pct_err"] = np.abs(df["pred"] - df["y"]) / df["y"]

    df["bin"] = pd.qcut(df["y"], q=bins, duplicates="drop")
    mape_by_bin = df.groupby("bin")["abs_pct_err"].mean() * 100

    plt.figure(figsize=(8, 4))
    mape_by_bin.plot(kind="bar")
    plt.ylabel("MAPE (%)")
    plt.title(title)
    plt.tight_layout()
    plt.show()


In [None]:
plot_mape_by_price(y_mv.values, y_mp, title="Mansion MAPE by Price Range")


In [None]:
# house valid
h_df, _, _, feat_h = preprocess_house(df_train_raw, is_train=True)
_, h_valid = temporal_split(h_df, valid_year=2023)

X_hv = h_valid[feat_h]
y_hv = h_valid["money_room"]
y_hp = results["house_model"].predict(X_hv)

plot_pred_vs_true(y_hv, y_hp, "House: Predicted vs True")
plot_error_hist(y_hv, y_hp, "House Prediction Error Distribution")
plot_mape_by_price(y_hv.values, y_hp, title="House MAPE by Price Range")
