<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/main/hasegawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
import pandas as pd
import numpy as np
import geopandas as gpd
import lightgbm as lgb

from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree

In [74]:
# =========================
# 1. train / test 読み込み
# =========================

train = pd.read_csv(
    "/content/train.csv",
    encoding="shift_jis",
    encoding_errors="replace",
    low_memory=False
)

test = pd.read_csv(
    "/content/test.csv",
    encoding="shift_jis",
    encoding_errors="replace",
    low_memory=False
)




In [75]:
# =========================
# 2. 駅乗降客数（2019年）
# =========================
df = pd.read_csv("S12-24_NumberOfPassengers_utf8.csv")

df_2019 = df[
    (df["S12_039"] == 1) &  # データ有
    (df["S12_038"] == 1)    # 重複なし
]

station_2019 = (
    df_2019
    .groupby("S12_001c", as_index=False)
    .agg(passengers_2019=("S12_041", "sum"))
    .rename(columns={"S12_001c": "station_code"})
)

In [76]:
# =========================
# 3. 駅ポイント（lon / lat）
# =========================

station_point_gdf = gpd.read_file("/content/S12-24_NumberOfPassengers.geojson")

# 型合わせ（merge対策）
station_point_gdf["S12_001c"] = station_point_gdf["S12_001c"].astype(str)
station_2019["station_code"] = station_2019["station_code"].astype(str)

# 乗降客数と結合
stations = station_point_gdf.merge(
    station_2019,
    left_on="S12_001c",
    right_on="station_code",
    how="left"
)

#先にメートル系へ変換
stations = stations.to_crs(epsg=3857)

# centroid を Point として使う
stations["geometry"] = stations.geometry.centroid

# これで stations は「駅ポイントの GeoDataFrame」
stations_gdf = stations[[
    "S12_001c",
    "passengers_2019",
    "geometry"
]].copy()

stations_gdf.crs = "EPSG:3857"


In [77]:
def add_station_features(df, stations_gdf, radius=500):

    df = df.copy()

    # 物件をGeoDataFrame化
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["lon"], df["lat"]),
        crs="EPSG:4326"
    ).to_crs(epsg=3857)

    # 空間結合
    joined = gpd.sjoin(
        gdf,
        stations_gdf,
        how="left",
        predicate="dwithin",
        distance=radius
    )


    # 集計
    feat = (
        joined
        .groupby("building_id", as_index=False)
        .agg(
            **{
                f"station_passengers_{radius}m_sum":   ("passengers_2019", "sum"),
                f"station_passengers_{radius}m_max":   ("passengers_2019", "max"),
                f"station_passengers_{radius}m_mean":  ("passengers_2019", "mean"),
            }
        )
    )

    # マージ
    df = df.merge(feat, on="building_id", how="left")

    # 欠損処理 + log
    for col in feat.columns:
        if col != "building_id":
            df[col] = df[col].fillna(0)
            df[col + "_log"] = np.log1p(df[col])

    return df


In [78]:
# =========================
# 5. train / test に付与
# =========================
# 500m
train = add_station_features(train, stations_gdf, radius=500)
test  = add_station_features(test, stations_gdf, radius=500)

# 1000m（追加）
train = add_station_features(train, stations_gdf, radius=1000)
test  = add_station_features(test, stations_gdf, radius=1000)

[c for c in train.columns if "station_passengers" in c]

['station_passengers_500m_sum',
 'station_passengers_500m_max',
 'station_passengers_500m_mean',
 'station_passengers_500m_sum_log',
 'station_passengers_500m_max_log',
 'station_passengers_500m_mean_log',
 'station_passengers_1000m_sum',
 'station_passengers_1000m_max',
 'station_passengers_1000m_mean',
 'station_passengers_1000m_sum_log',
 'station_passengers_1000m_max_log',
 'station_passengers_1000m_mean_log']

In [79]:
# =========================
# 6. 築年 → 築年数
# =========================
for df in [train, test]:
    df["building_create_date"] = pd.to_numeric(
        df["building_create_date"], errors="coerce"
    )
    df["age"] = (2023 - df["building_create_date"]).clip(0, 100)


In [80]:
# =========================
# 7.地価公示データ読み込み
# =========================
land_gdf = gpd.read_file("/content/L01-23.geojson")
land_gdf = land_gdf.to_crs(epsg=4326)  # 緯度経度に統一

# train/testをGeoDataFrame化
for df in [train, test]:
    df["geometry"] = gpd.points_from_xy(df["lon"], df["lat"])
train_gdf = gpd.GeoDataFrame(train, geometry="geometry", crs="EPSG:4326")
test_gdf  = gpd.GeoDataFrame(test, geometry="geometry", crs="EPSG:4326")

# 距離計算用に平面直角座標系に変換（メートル単位）
train_gdf = train_gdf.to_crs(epsg=6668)
test_gdf  = test_gdf.to_crs(epsg=6668)
land_gdf  = land_gdf.to_crs(epsg=6668)

In [81]:
# =========================
# 緯度経度最近傍（距離減衰）
# =========================
# 座標配列（メートル座標系）
land_xy = np.vstack([
    land_gdf.geometry.x.values,
    land_gdf.geometry.y.values
]).T

tree = KDTree(land_xy)

def nearest_land_price_fast(pt, tree, land_prices):
    dist, idx = tree.query([[pt.x, pt.y]], k=1)
    return land_prices[idx[0][0]]

land_prices = land_gdf['L01_006'].values

train_gdf['nearest_land_price'] = train_gdf['geometry'].apply(
    lambda pt: nearest_land_price_fast(pt, tree, land_prices)
)

test_gdf['nearest_land_price'] = test_gdf['geometry'].apply(
    lambda pt: nearest_land_price_fast(pt, tree, land_prices)
)


In [82]:
# =========================
# 最終地価特徴量
# =========================
train_gdf['final_land_price'] = train_gdf['nearest_land_price']
test_gdf['final_land_price']  = test_gdf['nearest_land_price']


In [83]:
# =========================
# train/testに反映
# =========================
# train/test に反映
train['final_land_price'] = train_gdf['final_land_price'].values
test['final_land_price']  = test_gdf['final_land_price'].values

In [84]:
# =========================
# 売買時点の築年数（true_age）
# =========================

def add_true_age(df):
    df = df.copy()

    # 売買年（YYYY）
    df['sale_year'] = df['target_ym'] // 100

    # 建築年を数値化（保険）
    df['building_create_date'] = pd.to_numeric(
        df['building_create_date'], errors='coerce'
    )

    # 売買時点の築年数
    df['true_age'] = df['sale_year'] - df['building_create_date']

    # 異常値対策
    df['true_age'] = df['true_age'].clip(lower=0, upper=100)

    # 非線形補助
    df['true_age_log'] = np.log1p(df['true_age'])

    return df


# train / test に適用
train = add_true_age(train)
test  = add_true_age(test)


In [85]:
# =========================
# 価格帯別に予測
# =========================
def price_band(y):
    if y <= 9_000_000:
        return "low"
    elif y <= 30_000_000:
        return "mid"
    else:
        return "high"

train["price_band"] = train["money_room"].apply(price_band)




In [86]:
# =========================
# 価格帯ごとにモデルを学習する
# =========================
train_low  = train[train["price_band"] == "low"]
train_mid  = train[train["price_band"] == "mid"]
train_high = train[train["price_band"] == "high"]


In [87]:
# =========================
# 8. 特徴量リスト作成
# =========================
features = [
    c for c in train.columns
    if c not in [
        'id',
        'money_room',
        'geometry',
        'money_hoshou_company',
        'building_create_date',
        'age',
        'price_band'   # ← 忘れがち
    ]
    and train[c].dtype != 'object'
]


In [88]:
# =========================
# 9. price_band 作成（trainのみ）
# =========================
def price_band(y):
    if y <= 9_000_000:
        return "low"
    elif y <= 30_000_000:
        return "mid"
    else:
        return "high"

train["price_band"] = train["money_room"].apply(price_band)


In [89]:
# =========================
# 10. 価格帯別に train 分割
# =========================
train_low  = train[train["price_band"] == "low"]
train_mid  = train[train["price_band"] == "mid"]
train_high = train[train["price_band"] == "high"]


In [90]:
# =========================
# 11. price_band 別 学習データ作成
# =========================
X_low  = train_low[features]
X_mid  = train_mid[features]
X_high = train_high[features]

y_low  = np.log1p(train_low["money_room"])
y_mid  = np.log1p(train_mid["money_room"])
y_high = np.log1p(train_high["money_room"])


In [91]:
# =========================
# 12.price_band 別 LightGBM 学習
# =========================
params = dict(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model_low  = lgb.LGBMRegressor(**params)
model_mid  = lgb.LGBMRegressor(**params)
model_high = lgb.LGBMRegressor(**params)

model_low.fit(X_low, y_low)
model_mid.fit(X_mid, y_mid)
model_high.fit(X_high, y_high)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018942 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12277
[LightGBM] [Info] Number of data points in the train set: 37433, number of used features: 98
[LightGBM] [Info] Start training from score 15.764828
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.117333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13681
[LightGBM] [Info] Number of data points in the train set: 225610, number of used features: 98
[LightGBM] [Info] Start training from score 16.725235
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not 

In [92]:
# =========================
# 13.仮予測用（全体モデル）
# =========================
X_all = train[features]
y_all = np.log1p(train["money_room"])

model_all = lgb.LGBMRegressor(**params)
model_all.fit(X_all, y_all)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.179303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13837
[LightGBM] [Info] Number of data points in the train set: 363924, number of used features: 98
[LightGBM] [Info] Start training from score 16.872850


In [93]:
# =========================
# 14. test に仮予測 → price_band 付与
# =========================
test["y_pred_tmp"] = np.expm1(model_all.predict(test[features]))
test["price_band"] = test["y_pred_tmp"].apply(price_band)


In [94]:
# =========================
# 15. price_band 別 本予測
# =========================
test["money_room"] = 0.0

test.loc[test["price_band"]=="low", "money_room"] = np.expm1(
    model_low.predict(test.loc[test["price_band"]=="low", features])
)

test.loc[test["price_band"]=="mid", "money_room"] = np.expm1(
    model_mid.predict(test.loc[test["price_band"]=="mid", features])
)

test.loc[test["price_band"]=="high", "money_room"] = np.expm1(
    model_high.predict(test.loc[test["price_band"]=="high", features])
)


In [95]:
# 学習後（分析用・提出には使わない）
train["y_pred"] = np.expm1(model_all.predict(train[features]))
train["ape"] = np.abs(train["money_room"] - train["y_pred"]) / train["money_room"]


In [96]:
# low帯の予測が9,000,000円を超えたら止める
train.loc[
    (train["price_band"] == "low") &
    (train["y_pred"] > 9_000_000),
    "y_pred"
] = 9_000_000


In [97]:
train.groupby("price_band")["ape"].quantile(
    [0.5, 0.9, 0.99]
)


Unnamed: 0_level_0,Unnamed: 1_level_0,ape
price_band,Unnamed: 1_level_1,Unnamed: 2_level_1
high,0.5,0.091711
high,0.9,0.236944
high,0.99,0.420479
low,0.5,0.170929
low,0.9,0.527803
low,0.99,1.058059
mid,0.5,0.104525
mid,0.9,0.277222
mid,0.99,0.546508


In [98]:
# =========================
# 16. 予測・提出
# =========================
submit = test[["id", "money_room"]]
submit.to_csv("submit.csv", index=False, header=False)
print("submit.csv を出力しました")

submit.csv を出力しました
