<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%E3%83%86%E3%82%B9%E3%83%88/hasegawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
import geopandas as gpd
import lightgbm as lgb

from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree

In [24]:
# =========================
# 1. train / test 読み込み
# =========================
train = pd.read_csv(
    "/content/train.csv",
    encoding="shift_jis",
    encoding_errors="replace",
    low_memory=False
)
test = pd.read_csv(
    "/content/test.csv",
    encoding="shift_jis",
    encoding_errors="replace",
    low_memory=False
)

In [25]:
# =========================
# 2. 欠損値フラグ、値定義
# =========================
target = "money_room"
base_missing_cols = [
    "money_room", "target_ym",
    "lon", "lat",
    "year_built", "house_area", "unit_area",
    "floor_count", "room_kaisuu",
    "walk_distance1", "walk_distance2",
    "convenience_distance", "super_distance",
    "drugstore_distance", "bank_distance",
    "shopping_street_distance"
]


In [26]:
# =========================
# 3. 低価格帯欠損値処理
# =========================
low_th = train["money_room"].quantile(0.2)  # 下位20%
train["is_low_price"] = (train["money_room"] <= low_th).astype(int)
test["is_low_price"] = 0  # testは未知なので0

for c in [
    "shopping_street_distance",
    "bank_distance",
    "drugstore_distance",
    "convenience_distance",
    "super_distance"
]:
    train[f"{c}_low_isna"] = (
        train[c].isna() & (train["is_low_price"] == 1)
    ).astype(int)
    test[f"{c}_low_isna"] = test[c].isna().astype(int) * 0

In [27]:
# =========================
# 4. 駅乗降客数（2019年）
# =========================
df = pd.read_csv("S12-24_NumberOfPassengers_utf8.csv")
df_2019 = df[(df["S12_039"] == 1) & (df["S12_038"] == 1)]
station_2019 = (
    df_2019
    .groupby("S12_001c", as_index=False)
    .agg(passengers_2019=("S12_041", "sum"))
    .rename(columns={"S12_001c": "station_code"})
)

In [28]:
# =========================
# 5. 駅ポイント（lon / lat）
# =========================
station_point_gdf = gpd.read_file("/content/S12-24_NumberOfPassengers.geojson")
station_point_gdf["S12_001c"] = station_point_gdf["S12_001c"].astype(str)
station_2019["station_code"] = station_2019["station_code"].astype(str)

stations = station_point_gdf.merge(
    station_2019,
    left_on="S12_001c",
    right_on="station_code",
    how="left"
)

stations = stations.to_crs(epsg=3857)
stations["geometry"] = stations.geometry.centroid
stations_gdf = stations[["S12_001c","passengers_2019","geometry"]].copy()
stations_gdf.crs = "EPSG:3857"

In [29]:
# =========================
# 6. 駅特徴量作成関数
# =========================
def add_station_features(df, stations_gdf, radius=500):
    df = df.copy()
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["lon"], df["lat"]),
        crs="EPSG:4326"
    ).to_crs(epsg=3857)

    joined = gpd.sjoin(
        gdf,
        stations_gdf,
        how="left",
        predicate="dwithin",
        distance=radius
    )

    feat = (
        joined
        .groupby("building_id", as_index=False)
        .agg(
            **{
                f"station_passengers_{radius}m_sum":   ("passengers_2019", "sum"),
                f"station_passengers_{radius}m_max":   ("passengers_2019", "max"),
                f"station_passengers_{radius}m_mean":  ("passengers_2019", "mean"),
            }
        )
    )

    df = df.merge(feat, on="building_id", how="left")
    for col in feat.columns:
        if col != "building_id":
            df[col] = df[col].fillna(0)
            df[col + "_log"] = np.log1p(df[col])
    return df

In [30]:
# =========================
# 7. train / test に駅特徴量付与
# =========================
train = add_station_features(train, stations_gdf, radius=500)
test  = add_station_features(test, stations_gdf, radius=500)
train = add_station_features(train, stations_gdf, radius=1000)
test  = add_station_features(test, stations_gdf, radius=1000)

In [31]:
# =========================
# 8. 築年 → 築年数
# =========================
for df in [train, test]:
    df["building_create_date"] = pd.to_numeric(df["building_create_date"], errors="coerce")
    df["age"] = (2023 - df["building_create_date"]).clip(0,100)

def add_true_age(df):
  df = df.copy()
  df['sale_year'] = df['target_ym'] // 100
  df['true_age'] = df['sale_year'] - df['building_create_date']
  df['true_age'] = df['true_age'].clip(lower=0, upper=100)
  df['true_age_log'] = np.log1p(df['true_age'])
  return df

train = add_true_age(train)
test  = add_true_age(test)

In [32]:
# =========================
# 9. 価格帯分類（train のみ）
# =========================
def price_band(y):
    if y <= 9_000_000:
        return "low"
    elif y <= 30_000_000:
        return "mid"
    else:
        return "high"

train["price_band"] = train["money_room"].apply(price_band)

In [33]:

# =========================
# 10. 特徴量リスト作成
# =========================
features = [
    c for c in train.columns
    if c not in ['id','money_room','geometry','money_hoshou_company','building_create_date','age','price_band']
    and train[c].dtype != 'object'
]

In [34]:

# =========================
# 11. 価格帯別 train 分割
# =========================
train_low  = train[train["price_band"]=="low"]
train_mid  = train[train["price_band"]=="mid"]
train_high = train[train["price_band"]=="high"]

X_low  = train_low[features]
X_mid  = train_mid[features]
X_high = train_high[features]

y_low  = np.log1p(train_low["money_room"])
y_mid  = np.log1p(train_mid["money_room"])
y_high = np.log1p(train_high["money_room"])

In [35]:
# =========================
# 12. LightGBM 学習
# =========================
params = dict(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model_low  = lgb.LGBMRegressor(**params)
model_mid  = lgb.LGBMRegressor(**params)
model_high = lgb.LGBMRegressor(**params)

model_low.fit(X_low, y_low)
model_mid.fit(X_mid, y_mid)
model_high.fit(X_high, y_high)

In [36]:
# =========================
# 13. 全体モデルで仮予測（test の price_band 用）
# =========================
X_all = train[features]
y_all = np.log1p(train["money_room"])
model_all = lgb.LGBMRegressor(**params)
model_all.fit(X_all, y_all)

y_pred_tmp = np.expm1(model_all.predict(test[features]))

# 仮予測値に下限を設定（train_low の最小値）
y_pred_tmp = np.maximum(y_pred_tmp, train_low["money_room"].min())
test["y_pred_tmp"] = y_pred_tmp

# 価格帯付与
test["price_band"] = test["y_pred_tmp"].apply(price_band)


In [37]:
# =========================
# 14. 価格帯別本予測
# =========================
test["money_room"] = 0.0

test.loc[test["price_band"]=="low", "money_room"] = np.expm1(
    model_low.predict(test.loc[test["price_band"]=="low", features])
)

test.loc[test["price_band"]=="mid", "money_room"] = np.expm1(
    model_mid.predict(test.loc[test["price_band"]=="mid", features])
)

test.loc[test["price_band"]=="high", "money_room"] = np.expm1(
    model_high.predict(test.loc[test["price_band"]=="high", features])
)

In [38]:
# =========================
# 15. low 帯 clip（任意）
# =========================
low_cap = train_low["money_room"].quantile(0.02)
mask_low = test["price_band"]=="low"
test.loc[mask_low, "money_room"] = np.clip(test.loc[mask_low, "money_room"], low_cap, None)


In [39]:
# =========================
# 16. 提出用 CSV 出力
# =========================
submit = test[["id", "money_room"]]
submit.to_csv("submit.csv", index=False, header=False)
print("submit.csv を出力しました")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020987 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12287
[LightGBM] [Info] Number of data points in the train set: 37433, number of used features: 103
[LightGBM] [Info] Start training from score 15.764828
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13693
[LightGBM] [Info] Number of data points in the train set: 225610, number of used features: 104
[LightGBM] [Info] Start training from score 16.725235
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055398 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is no