<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%EF%BC%91%EF%BC%97%EF%BC%8E%EF%BC%90%E3%81%AE%E3%82%B3%E3%83%BC%E3%83%89base/16_1_mash_m2_mobile2_hasegawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import lightgbm as lgb
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from glob import glob

In [None]:
# =========================
# 0. Load train / test
# =========================
# Both CSVs are decoded as Shift_JIS; undecodable bytes are replaced instead of raising.
train, test = (
    pd.read_csv(
        csv_path,
        encoding="shift_jis",
        encoding_errors="replace",
        low_memory=False,
    )
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [None]:
# =========================
# 1. Remove implausible floor_count values
# =========================
# Per the original author's note, the tallest condominium in Japan has 64 floors,
# so values above 70 are treated as data errors and removed.
# Rows with a missing floor_count are kept on purpose: `NaN <= 70` evaluates to
# False, so a plain comparison would silently discard them together with the
# outliers even though a missing value is not an abnormal value.
floor_count_ok = train['floor_count'].isna() | (train['floor_count'] <= 70)
train = train.loc[floor_count_ok]

In [None]:
# =========================
# 0. Global settings
# =========================
# Random seeds: the training cell fits one model per seed and the per-seed
# predictions are averaged by ensemble_predict().
SEEDS = [42, 100, 202, 777, 999]


In [None]:
# =========================
# Load the population mesh GeoJSON files
# =========================
# glob() returns paths in arbitrary order. Sorting makes the concatenation order,
# and therefore which row survives drop_duplicates() below, reproducible.
mesh_files = sorted(glob(
    "/content/drive/MyDrive/1km_mesh_2024_GEOJSON/**/*.geojson",
    recursive=True
))

# Fail early with an actionable message (e.g. Drive not mounted) instead of the
# opaque "No objects to concatenate" raised by pd.concat on an empty list.
if not mesh_files:
    raise FileNotFoundError(
        "No population mesh GeoJSON files found under "
        "/content/drive/MyDrive/1km_mesh_2024_GEOJSON"
    )

gdfs = []
for f in mesh_files:
    gdf = gpd.read_file(f)

    # Normalise every file to EPSG:4326; files without a CRS are assumed to
    # already be in EPSG:4326.
    if gdf.crs is None:
        gdf = gdf.set_crs("EPSG:4326")
    else:
        gdf = gdf.to_crs("EPSG:4326")

    gdfs.append(gdf)

pop_mesh = gpd.GeoDataFrame(
    pd.concat(gdfs, ignore_index=True),
    crs="EPSG:4326"
)

# Population columns carried over to train / test.
POP_COLS = [
    "PT02_2025",
    "PT08_2025",
    "PT10_2025",
]

# Keep only what the spatial join needs, one row per distinct mesh polygon.
pop_mesh = pop_mesh[["geometry"] + POP_COLS]
pop_mesh = pop_mesh.drop_duplicates(subset=["geometry"])

def to_gdf(df):
    """Return a copy of ``df`` as a GeoDataFrame of EPSG:4326 points.

    The point geometry is built from the ``lon`` (x) and ``lat`` (y) columns;
    the input frame itself is not modified.
    """
    points = gpd.points_from_xy(df["lon"], df["lat"])
    frame = df.copy()
    return gpd.GeoDataFrame(frame, geometry=points, crs="EPSG:4326")

train_gdf = to_gdf(train)
test_gdf  = to_gdf(test)

# Left join: every listing is kept; listings that fall in no mesh cell get NaN
# population values, which are filled with 0 below.
train = gpd.sjoin(train_gdf, pop_mesh, how="left", predicate="within")
train = train.drop(columns=["geometry", "index_right"])

test = gpd.sjoin(test_gdf, pop_mesh, how="left", predicate="within")
test = test.drop(columns=["geometry", "index_right"])

train[POP_COLS] = train[POP_COLS].fillna(0)
test[POP_COLS]  = test[POP_COLS].fillna(0)

# The join must not duplicate rows (a point matching more than one polygon would).
assert len(train) == len(train_gdf)
assert len(test)  == len(test_gdf)


In [None]:
# =========================
# Prefectural land price survey (2023)
# =========================
# Column holding the surveyed price.
PRICE_COL = "L02_006"

pref_land_gdf = gpd.read_file("/content/L02-23.geojson")

# Use the same CRS as the official land price cell further down.
# NOTE(review): EPSG:6668 (JGD2011) is a geographic CRS, so the KD-tree below
# measures "nearest" in degrees rather than metres — confirm this is acceptable.
pref_land_gdf = pref_land_gdf.to_crs(epsg=6668)

pref_land_gdf = pref_land_gdf[["geometry", PRICE_COL]]
pref_land_gdf = pref_land_gdf.rename(columns={PRICE_COL: "pref_land_price"})

# Convert train / test to GeoDataFrames (adds a "geometry" column to both frames).
train["geometry"] = gpd.points_from_xy(train["lon"], train["lat"])
test["geometry"]  = gpd.points_from_xy(test["lon"], test["lat"])

train_gdf = gpd.GeoDataFrame(train, geometry="geometry", crs="EPSG:4326")
train_gdf = train_gdf.to_crs(epsg=6668)

test_gdf = gpd.GeoDataFrame(test, geometry="geometry", crs="EPSG:4326")
test_gdf = test_gdf.to_crs(epsg=6668)

# Coordinates of the survey points, one (x, y) row per point.
pref_xy = np.column_stack([
    pref_land_gdf.geometry.x.values,
    pref_land_gdf.geometry.y.values,
])

pref_tree = KDTree(pref_xy)
pref_prices = pref_land_gdf["pref_land_price"].values

# Look up the price of the nearest survey point.
def nearest_pref_land_price(pt, tree, prices):
    """Return the element of ``prices`` belonging to the tree's nearest neighbour of ``pt``.

    ``pt`` must expose ``x`` and ``y``; ``tree.query`` is called with a single
    query point and ``k=1``, and the returned index selects from ``prices``.
    """
    query_point = [[pt.x, pt.y]]
    _, neighbour_idx = tree.query(query_point, k=1)
    nearest_row = neighbour_idx[0][0]
    return prices[nearest_row]

# Attach the nearest prefectural survey price to train / test.
# One batched KD-tree query per frame replaces one Python-level query per row
# (Series.apply), which returns the same neighbours but is far slower on large frames.
for points_gdf, frame in ((train_gdf, train), (test_gdf, test)):
    xy = np.column_stack([
        points_gdf.geometry.x.values,
        points_gdf.geometry.y.values,
    ])

    if len(xy):
        _, nn_idx = pref_tree.query(xy, k=1)      # nn_idx has shape (n_rows, 1)
        nearest = pref_prices[nn_idx[:, 0]]
    else:
        # KDTree.query rejects empty input; an empty frame simply gets no values.
        nearest = pref_prices[:0]

    points_gdf["nearest_pref_land_price"] = nearest

    # Written back positionally: the GeoDataFrame rows are in the same order as the frame.
    frame["nearest_pref_land_price"] = nearest

    # Missing-value handling + log feature.
    frame["nearest_pref_land_price"] = frame["nearest_pref_land_price"].fillna(0)
    frame["nearest_pref_land_price_log"] = np.log1p(frame["nearest_pref_land_price"])



In [None]:
# =========================
# Load the DID data
# =========================
# Population / area columns (per the original author's note).
DID_POP_COL  = "A16_005"
DID_AREA_COL = "A16_006"

did_gdf = gpd.read_file("/content/A16-20_00_DID.shp")

# Density; the small epsilon avoids division by zero for zero-area records.
did_gdf["DID_density"] = did_gdf[DID_POP_COL] / (did_gdf[DID_AREA_COL] + 1e-6)

# Bring the DID polygons and the listing points into the same CRS.
did_gdf = did_gdf.to_crs(epsg=3857)

# Convert train / test to GeoDataFrames.
train_gdf = gpd.GeoDataFrame(
    train,
    geometry=gpd.points_from_xy(train["lon"], train["lat"]),
    crs="EPSG:4326",
)
train_gdf = train_gdf.to_crs(epsg=3857)

test_gdf = gpd.GeoDataFrame(
    test,
    geometry=gpd.points_from_xy(test["lon"], test["lat"]),
    crs="EPSG:4326",
)
test_gdf = test_gdf.to_crs(epsg=3857)

# KD-tree over the representative point (centroid) of each DID polygon.
did_centroids = did_gdf.geometry.centroid
did_coords = np.column_stack([did_centroids.x, did_centroids.y])

tree = KDTree(did_coords)

# Assign the nearest DID to every row.
def attach_DID_features(base_gdf, did_gdf, tree):
    """Copy the attributes of the nearest DID record onto ``base_gdf``.

    For each point in ``base_gdf`` the single nearest entry of ``tree`` is
    looked up and the matching row of ``did_gdf`` (by position) provides
    ``DID_population``, ``DID_area`` and ``DID_density``. ``base_gdf`` is
    modified in place and also returned.

    NOTE: there is no distance cut-off, so every row receives the values of
    some DID even when the point is far away from it.
    """
    xy = np.column_stack([base_gdf.geometry.x, base_gdf.geometry.y])

    nearest_pos = tree.query(xy, k=1)[1].flatten()
    nearest_did = did_gdf.iloc[nearest_pos]

    column_map = (
        ("DID_population", DID_POP_COL),
        ("DID_area", DID_AREA_COL),
        ("DID_density", "DID_density"),
    )
    for target, source in column_map:
        base_gdf[target] = nearest_did[source].values

    return base_gdf

# Attach the DID attributes to train / test.
train_gdf = attach_DID_features(train_gdf, did_gdf, tree)
test_gdf  = attach_DID_features(test_gdf,  did_gdf, tree)

# Drop the geometry and go back to plain DataFrames.
train = pd.DataFrame(train_gdf.drop(columns="geometry"))
test  = pd.DataFrame(test_gdf.drop(columns="geometry"))

# Fill missing values with 0 and add a log1p version of each DID column.
for col in ["DID_population", "DID_area", "DID_density"]:
    for frame in (train, test):
        frame[col] = frame[col].fillna(0)
        frame[f"{col}_log"] = np.log1p(frame[col])

In [None]:
# =========================
# 2. Station passenger counts (2019)
# =========================
df = pd.read_csv("S12-24_NumberOfPassengers_utf8.csv")

# Keep rows where both flags equal 1.
# NOTE(review): presumably S12_038 / S12_039 are the 2019 data-availability flags
# and S12_041 the 2019 passenger count — verify against the S12 dataset spec.
has_2019_data = (df["S12_039"] == 1) & (df["S12_038"] == 1)
df_2019 = df[has_2019_data]

# Total passengers per station code.
station_2019 = (
    df_2019
    .groupby("S12_001c", as_index=False)
    .agg(passengers_2019=("S12_041", "sum"))
    .rename(columns={"S12_001c": "station_code"})
)


In [None]:

# =========================
# 3. Station points (lon / lat)
# =========================
station_point_gdf = gpd.read_file("/content/S12-24_NumberOfPassengers.geojson")

# Join keys are compared as strings on both sides.
station_point_gdf["S12_001c"] = station_point_gdf["S12_001c"].astype(str)
station_2019["station_code"] = station_2019["station_code"].astype(str)

# Left merge: stations without a 2019 record keep NaN passengers.
stations = station_point_gdf.merge(station_2019, left_on="S12_001c", right_on="station_code", how="left")

# Project first, then reduce each station geometry to its centroid.
stations = stations.to_crs(epsg=3857)
stations["geometry"] = stations.geometry.centroid

stations_gdf = stations[["S12_001c","passengers_2019","geometry"]].copy()

# Assigning to `.crs` on a frame that already has a CRS is deprecated in
# geopandas; set_crs(..., allow_override=True) is the supported way to pin it.
stations_gdf = stations_gdf.set_crs(epsg=3857, allow_override=True)

In [None]:
# =========================
# 4. Station feature builder
# =========================
def add_station_features(df, stations_gdf, radius=500):
    """Add passenger-count features of the stations near each row of ``df``.

    Every row is turned into a point from its ``lon``/``lat`` columns and joined
    to all stations whose geometry lies within ``radius`` (in EPSG:3857 units).
    The ``passengers_2019`` values of the matched stations are aggregated per
    row into ``station_passengers_{radius}m_sum`` / ``_max`` / ``_mean``; rows
    without any station get 0, and a ``_log`` (log1p) copy of each is added.

    The aggregation is done per input row. Aggregating and merging on
    ``building_id`` instead multiplies the ``_sum`` feature by the number of rows
    that share a building_id, and zeroes out rows whose building_id is missing.

    NOTE(review): EPSG:3857 distances are stretched away from the equator, so
    ``radius`` is not an exact ground distance in metres.

    Parameters
    ----------
    df : DataFrame with ``lon`` and ``lat`` columns. Not modified.
    stations_gdf : GeoDataFrame in EPSG:3857 with a ``passengers_2019`` column.
    radius : search distance passed to the ``dwithin`` spatial join.

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index and the six feature columns added.
    """
    # New object with a unique RangeIndex; it serves as the per-row key below.
    df = df.reset_index(drop=True)

    points = gpd.GeoDataFrame(
        df[["lon", "lat"]],
        geometry=gpd.points_from_xy(df["lon"], df["lat"]),
        crs="EPSG:4326",
    ).to_crs(epsg=3857)

    # Only the columns the join needs; this also keeps any stale "index_right"
    # column on the station frame out of the join.
    station_pts = stations_gdf[["passengers_2019", stations_gdf.geometry.name]]

    # One output row per (listing, station-in-range) pair; the left index repeats.
    joined = gpd.sjoin(points, station_pts, how="left", predicate="dwithin", distance=radius)

    per_row = joined["passengers_2019"].groupby(level=0)
    prefix = f"station_passengers_{radius}m"
    feat = pd.DataFrame({
        f"{prefix}_sum": per_row.sum(),
        f"{prefix}_max": per_row.max(),
        f"{prefix}_mean": per_row.mean(),
    })

    df = df.join(feat)
    for col in feat.columns:
        df[col] = df[col].fillna(0)
        df[col + "_log"] = np.log1p(df[col])
    return df

In [None]:

# =========================
# 5. Attach the station features to train / test
# =========================
# One feature set per search radius.
for radius in (500, 1000):
    train = add_station_features(train, stations_gdf, radius=radius)
    test = add_station_features(test, stations_gdf, radius=radius)

In [None]:
# =========================================================
# 4. Building age (from target_ym and year_built)
# year_built is in YYYYMM format
# =========================================================
for frame in (train, test):
    built_year = frame["year_built"] // 100
    frame["sale_year"] = frame["target_ym"] // 100
    # Age in years, limited to the range 0..100.
    frame["age"] = (frame["sale_year"] - built_year).clip(0, 100)

In [None]:
# =========================
# 7. Official land price data
# =========================
land_gdf = gpd.read_file("/content/L01-23.geojson")
land_gdf = land_gdf.to_crs(epsg=4326)

# Point geometry for every listing (adds a "geometry" column to train / test).
train["geometry"] = gpd.points_from_xy(train["lon"], train["lat"])
test["geometry"]  = gpd.points_from_xy(test["lon"], test["lat"])

train_gdf = gpd.GeoDataFrame(train, geometry="geometry", crs="EPSG:4326")
train_gdf = train_gdf.to_crs(epsg=6668)
test_gdf = gpd.GeoDataFrame(test, geometry="geometry", crs="EPSG:4326")
test_gdf = test_gdf.to_crs(epsg=6668)

# Land price points in the same CRS as the listings.
land_gdf = land_gdf.to_crs(epsg=6668)

# KD-tree over the land price point coordinates, plus the price at each point.
land_xy = np.column_stack([land_gdf.geometry.x.values, land_gdf.geometry.y.values])
tree = KDTree(land_xy)
land_prices = land_gdf['L01_006'].values

def nearest_land_price_fast(pt, tree, land_prices):
    """Return the entry of ``land_prices`` for the tree's nearest neighbour of ``pt``.

    ``pt`` must expose ``x`` and ``y``; the tree is queried with that single
    point and ``k=1``.
    """
    _, neighbour_idx = tree.query([[pt.x, pt.y]], k=1)
    nearest_row = neighbour_idx[0][0]
    return land_prices[nearest_row]

# Attach the price of the nearest official land price point.
# One batched KD-tree query per frame replaces one Python-level query per row
# (Series.apply); the neighbours found are the same, the runtime is far lower.
for points_gdf, frame in ((train_gdf, train), (test_gdf, test)):
    xy = np.column_stack([
        points_gdf.geometry.x.values,
        points_gdf.geometry.y.values,
    ])

    if len(xy):
        _, nn_idx = tree.query(xy, k=1)           # nn_idx has shape (n_rows, 1)
        nearest = land_prices[nn_idx[:, 0]]
    else:
        # KDTree.query rejects empty input; an empty frame simply gets no values.
        nearest = land_prices[:0]

    points_gdf['nearest_land_price'] = nearest
    # Written back positionally: the GeoDataFrame rows are in the same order as the frame.
    frame['final_land_price'] = nearest

In [None]:
# =========================================================
# 6. Interaction features (age x mansion, age x capital region, age x DID)
# =========================================================
CAPITAL_PREF_CODES = [13,14,12,11]  # Tokyo, Kanagawa, Chiba, Saitama


def _is_inside_did(df, did_polygons):
    """Return a 0/1 array: 1 where the row's lon/lat point lies within a DID polygon.

    ``df`` needs ``lon`` and ``lat`` columns (EPSG:4326); ``did_polygons`` is a
    GeoDataFrame with a CRS. The result is aligned with ``df`` by position.
    """
    pts = gpd.GeoDataFrame(
        {"_row": np.arange(len(df))},
        geometry=gpd.points_from_xy(df["lon"], df["lat"]),
        crs="EPSG:4326",
    ).to_crs(did_polygons.crs)

    hits = gpd.sjoin(
        pts,
        did_polygons[[did_polygons.geometry.name]],
        how="left",
        predicate="within",
    )
    # A point may match several polygons; it counts as inside if any matched.
    inside = hits["index_right"].notna().groupby(hits["_row"]).any()
    return inside.sort_index().to_numpy().astype(int)


for df in [train,test]:
    df["is_mansion"] = (df["building_type"] == 1).astype(int)
    df["is_capital"] = df["addr1_1"].isin(CAPITAL_PREF_CODES).astype(int)

    # DID_population is copied from the *nearest* DID for every row, so
    # `DID_population > 0` is true almost everywhere and cannot tell whether a
    # listing is actually inside a DID. Use a point-in-polygon test instead.
    df["is_DID"] = _is_inside_did(df, did_gdf)

    # The "year x ..." interactions are built from age.
    df["year_x_mansion"] = df["age"] * df["is_mansion"]
    df["year_x_capital"] = df["age"] * df["is_capital"]
    df["year_x_DID"] = df["age"] * df["is_DID"]

In [None]:
# =========================
# 1. Split into mansion / detached house
# =========================
# building_type 1 -> mansion, 4 -> detached house; each subset is an independent copy.
train_mansion, train_house = (
    train[train['building_type'] == type_code].copy() for type_code in (1, 4)
)
test_mansion, test_house = (
    test[test['building_type'] == type_code].copy() for type_code in (1, 4)
)

In [None]:
# =========================================================
# 8. Feature lists
# =========================================================
# Features used by both models. Names missing from a frame are skipped later by
# prepare_Xy(), so the lists may mention columns a dataset does not have.
common_features = [
    'lon','lat',
    'drugstore_distance','bank_distance','shopping_street_distance',
    'parking_keiyaku','money_hoshou_company','free_rent_duration','free_rent_gen_timing',
    'addr1','addr2','addr3','post1','post2',"nearest_pref_land_price",
    "nearest_pref_land_price_log"
]
common_features = common_features + POP_COLS + ["year_x_capital","year_x_DID","age"]

# DID, land price and station features shared by the mansion and house models.
_shared_location_features = [
    'DID_population','DID_area','DID_density','final_land_price',
    'station_passengers_500m_sum','station_passengers_500m_max','station_passengers_500m_mean',
    'station_passengers_1000m_sum','station_passengers_1000m_max','station_passengers_1000m_mean',
]

mansion_features = (
    common_features
    + ['house_area','floor','room_count','total_units','building_structure','has_elevator','has_gym','maintenance_fee']
    + _shared_location_features
    + ['year_x_mansion']
)

house_features = (
    common_features
    + ['house_area','land_area','floor_count','room_count','building_structure']
    + _shared_location_features
)


In [None]:
# =========================
# 3. Helpers for building the training matrices
# =========================

# NOTE(review): DROP_COLS is not referenced by any code in the visible cells;
# prepare_Xy() selects columns from the feature lists instead. Note that
# "money_hoshou_company" is also listed in common_features — confirm whether it
# was meant to be excluded.
DROP_COLS = [
    "building_id",
    "money_room",
    "money_hoshou_company"
]

def prepare_Xy(df, features, is_train=True):
    """Build the model input (and optionally the target) from ``df``.

    Only the names in ``features`` that exist in ``df`` are used; a column
    called ``geometry`` and every non-numeric column are then removed.

    Returns ``(X, y)`` with ``y = log1p(df["money_room"])`` when ``is_train``
    is true, otherwise just ``X``.
    """
    available = [name for name in features if name in df.columns]
    X = df[available].copy()

    # Never let a geometry column through, even if it was listed as a feature.
    X = X.drop(columns=["geometry"], errors="ignore")

    # Last line of defence: numeric columns only.
    X = X.select_dtypes(include=[np.number])

    if not is_train:
        return X
    return X, np.log1p(df["money_room"])


# Training matrices and log-price targets, one pair per building type.
X_mansion, y_mansion = prepare_Xy(train_mansion, mansion_features, is_train=True)
X_house, y_house = prepare_Xy(train_house, house_features, is_train=True)

# Test matrices built from the same feature lists (no target).
X_test_mansion = prepare_Xy(test_mansion, mansion_features, is_train=False)
X_test_house = prepare_Xy(test_house, house_features, is_train=False)


In [None]:
# ## Experiment (disabled): scan candidate fair_c values on a hold-out split
# fair_c_values = [0.05, 0.08, 0.1, 0.12, 0.15, 0.2]

# def evaluate_fair_c_split(X, y, category_name='Mansion'):
#     from sklearn.model_selection import train_test_split

#     # train/valid分割
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#     results = []
#     for fair_c in fair_c_values:
#         params = {
#             'objective': 'fair',
#             'metric': 'rmse',
#             'boosting_type': 'gbdt',
#             'num_leaves': 31,
#             'fair_c': fair_c,
#             'verbose': -1,
#             'random_state': 42
#         }

#         train_data = lgb.Dataset(X_train, label=y_train)
#         valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

#         model = lgb.train(
#             params,
#             train_data,
#             num_boost_round=3000,
#             valid_sets=[valid_data],
#             early_stopping_rounds=200,
#             verbose_eval=False
#         )

#         y_pred = model.predict(X_val, num_iteration=model.best_iteration)
#         mape = np.mean(np.abs((y_val - y_pred) / y_val))
#         results.append((fair_c, mape))
#         print(f"{category_name} - fair_c: {fair_c:.2f}, MAPE: {mape:.6f}")

#     # 最良値
#     best_fair_c, best_mape = min(results, key=lambda x: x[1])
#     print(f"==> {category_name} 最適 fair_c: {best_fair_c}, MAPE: {best_mape:.6f}\n")
#     return results, (best_fair_c, best_mape)

# # =========================================================
# # マンション・戸建で評価
# # =========================================================
# print("マンション fair_c スキャン")
# mansion_results, mansion_best = evaluate_fair_c_split(X_mansion, y_mansion, 'マンション')

# print("戸建 fair_c スキャン")
# house_results, house_best = evaluate_fair_c_split(X_house, y_house, '戸建')



In [None]:
# =========================
# 4. Training function
# =========================
def train_lgb(X, y, seed):
    """Fit one LightGBM regressor with early stopping on a random 80/20 split.

    Parameters
    ----------
    X : feature matrix.
    y : target values (the callers in this notebook pass log1p prices).
    seed : used both as the model's ``random_state`` and for the
        train/validation split, so each seed gives a different model and split.

    Returns
    -------
    The fitted ``lgb.LGBMRegressor``.
    """
    model = lgb.LGBMRegressor(
        n_estimators=5000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        # LightGBM only performs row subsampling (bagging) when the bagging
        # frequency is non-zero; with the default subsample_freq=0 the
        # subsample=0.8 setting above is silently ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=seed,   # important: makes each ensemble member different

        objective="fair",
        fair_c=0.1,
        min_child_samples=20,
        reg_alpha=0.1,
        reg_lambda=0.1
    )

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y,
        test_size=0.2,
        random_state=seed     # the split follows the same seed
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="mape",
        callbacks=[
            lgb.early_stopping(200),   # stop after 200 rounds without improvement
            lgb.log_evaluation(0)      # no per-iteration metric logging
        ]
    )

    return model


In [None]:
# =========================
# 5. Model training
# =========================
models_mansion = []
models_house   = []

# One model per seed and per building type. The house models must be trained
# here as well: the prediction cell averages over models_house, and an empty
# list there cannot produce any prediction.
for seed in SEEDS:
    print(f"Training seed={seed}")
    models_mansion.append(train_lgb(X_mansion, y_mansion, seed))
    models_house.append(train_lgb(X_house, y_house, seed))


Training seed=42
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14786
[LightGBM] [Info] Number of data points in the train set: 155660, number of used features: 108
[LightGBM] [Info] Start training from score 16.925227
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's mape: 0.00662085	valid_0's fair: 0.00459029
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078989 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12774
[LightGBM] [Info] Number of data points in the train set: 121743, number of used features: 105
[LightGBM] [Info] Start training from score 16

In [None]:
# =========================
# 1. Remember the columns used at training time
# =========================
# The test matrices are re-indexed to exactly these columns before prediction.
cols_mansion = list(X_mansion.columns)
cols_house   = list(X_house.columns)

# =========================
# Ensemble prediction
# =========================
def ensemble_predict(models, X):
    """Return the element-wise mean of ``model.predict(X)`` over ``models``.

    Raises
    ------
    ValueError
        If ``models`` is empty. Averaging an empty list would otherwise yield
        a bare NaN (with only a RuntimeWarning) that breaks later steps in a
        much less obvious way.
    """
    models = list(models)
    if not models:
        raise ValueError(
            "ensemble_predict() received no models; train at least one model before predicting"
        )
    return np.mean([model.predict(X) for model in models], axis=0)


In [None]:

# =========================
# Split test into three groups by building_type
# =========================
test_mansion = test[test["building_type"] == 1].copy()
test_house   = test[test["building_type"] == 4].copy()

# Everything that is in neither of the two groups above.
assigned_index = test_mansion.index.union(test_house.index)
test_other = test[~test.index.isin(assigned_index)].copy()

In [None]:
# =========================
# Make the test features match the training columns exactly
# =========================
# reindex() keeps the training column order; columns absent from test are created and filled with 0.
X_test_mansion = test_mansion.reindex(columns=cols_mansion, fill_value=0)
X_test_house = test_house.reindex(columns=cols_house, fill_value=0)

# Rows of any other building type are scored with the mansion models, so they use the mansion columns.
X_test_other = test_other.reindex(columns=cols_mansion, fill_value=0)

In [None]:
## Experiment (disabled): search for the best low-price scale factor. The whole cell is commented out.

# import numpy as np
# from sklearn.metrics import mean_absolute_percentage_error

# # =========================
# # 1. 予測値（補正前）を作成
# # =========================
# y_pred_mansion_raw = np.expm1(model_mansion.predict(X_mansion))
# y_pred_house_raw   = np.expm1(model_house.predict(X_house))

# y_true_mansion = np.expm1(y_mansion)
# y_true_house   = np.expm1(y_house)

# LOW_TH_MANSION = 9_000_000
# LOW_TH_HOUSE   = 9_000_000

# # =========================
# # 2. 0.70 ~ 1.00 の範囲でスケールをテスト
# # =========================
# scales = np.arange(0.70, 1.01, 0.01)  # 0.01刻みで細かく探索

# results_mansion = []
# results_house   = []

# for scale in scales:
#     # --- マンション ---
#     y_pred = y_pred_mansion_raw.copy()
#     mask_low = y_pred <= LOW_TH_MANSION
#     y_pred[mask_low] *= scale
#     mape = mean_absolute_percentage_error(y_true_mansion, y_pred)
#     results_mansion.append((scale, mape))

#     # --- 戸建 ---
#     y_pred = y_pred_house_raw.copy()
#     mask_low = y_pred <= LOW_TH_HOUSE
#     y_pred[mask_low] *= scale
#     mape = mean_absolute_percentage_error(y_true_house, y_pred)
#     results_house.append((scale, mape))

# # =========================
# # 3. 結果を確認
# # =========================
# results_mansion = sorted(results_mansion, key=lambda x: x[1])
# results_house   = sorted(results_house, key=lambda x: x[1])

# print("マンション: 最適スケールとMAPE上位5")
# for s, m in results_mansion[:5]:
#     print(f"scale={s:.2f}, MAPE={m:.6f}")

# print("\n戸建: 最適スケールとMAPE上位5")
# for s, m in results_house[:5]:
#     print(f"scale={s:.2f}, MAPE={m:.6f}")


In [None]:
# =========================
# 3. Prediction with low-price correction
# =========================
# Predictions at or below these thresholds are multiplied by the matching scale below.
LOW_TH_MANSION = 9_000_000
LOW_TH_HOUSE   = 9_000_000

# Multipliers applied to the low-price predictions.
LOW_SCALE_MANSION = 0.98
LOW_SCALE_HOUSE   = 0.98


def predict_with_low_scale(model, X, low_th, low_scale):
    """Predict prices with ``model`` and shrink the low ones.

    The model output is treated as log1p(price). Non-finite values are replaced
    before and after the ``expm1`` back-transform, the result is clipped to
    ``[1, 1e9]``, and every prediction ``<= low_th`` is multiplied by
    ``low_scale``.

    NOTE(review): this helper is not called by the visible cells, which apply
    the back-transform and scaling inline without the guards.
    """
    # Guard in log space: NaN / -inf become 0, +inf becomes 20 (upper bound in log space).
    log_pred = np.nan_to_num(model.predict(X), nan=0.0, posinf=20, neginf=0.0)

    # Back to the original price scale, guarding once more.
    price = np.nan_to_num(np.expm1(log_pred), nan=0.0, posinf=1e9, neginf=0.0)

    # Keep prices within [1, 1e9].
    price = np.clip(price, 1, 1e9)

    # Low-price correction.
    is_low = price <= low_th
    price[is_low] = price[is_low] * low_scale

    return price
def _log_to_price(log_pred, low_th, low_scale):
    """Back-transform log1p predictions and scale those at or below ``low_th``."""
    price = np.expm1(log_pred)
    price[price <= low_th] *= low_scale
    return price


# Mansion
y_pred_test_mansion_log = ensemble_predict(models_mansion, X_test_mansion)
y_pred_test_mansion = _log_to_price(
    y_pred_test_mansion_log, LOW_TH_MANSION, LOW_SCALE_MANSION
)

# Detached house
y_pred_test_house_log = ensemble_predict(models_house, X_test_house)
y_pred_test_house = _log_to_price(
    y_pred_test_house_log, LOW_TH_HOUSE, LOW_SCALE_HOUSE
)

# Other building types: scored with the mansion models and mansion settings
y_pred_test_other_log = ensemble_predict(models_mansion, X_test_other)
y_pred_test_other = _log_to_price(
    y_pred_test_other_log, LOW_TH_MANSION, LOW_SCALE_MANSION
)


In [None]:
# =========================
# 4. Write the predictions back to the test DataFrame
# =========================
for subset, prediction in (
    (test_mansion, y_pred_test_mansion),
    (test_house, y_pred_test_house),
    (test_other, y_pred_test_other),
):
    test.loc[subset.index, "money_room"] = prediction

# Any row still without a prediction falls back to the median prediction.
median_price = test["money_room"].median()
test["money_room"] = test["money_room"].fillna(median_price)

In [None]:
# =========================
# 5. Create submit.csv
# =========================
submit = test[["id", "money_room"]].sort_values("id")
submit.to_csv("submit.csv", index=False, header=False)

# NaN check. The file is written without a header row, so it has to be read back
# with header=None; otherwise the first submission row is consumed as column
# names and is never checked.
pd.read_csv("submit.csv", header=None).isna().sum()

Unnamed: 0,0
0.0,0
14027808.173575142,0


In [None]:
# Duplicate check on the submission ids.
# A result of 0 means no id appears more than once.
test["id"].duplicated().sum()


np.int64(0)