<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%E3%83%86%E3%82%B9%E3%83%88/hasegawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =========================
# 0. ライブラリ読み込み
# =========================
import pandas as pd
import numpy as np
import geopandas as gpd
import lightgbm as lgb
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree

In [2]:
# =========================
# 1. train / test 読み込み
# =========================
train = pd.read_csv(
    "/content/train.csv",
    encoding="shift_jis",
    encoding_errors="replace",
    low_memory=False
)

test = pd.read_csv(
    "/content/test.csv",
    encoding="shift_jis",
    encoding_errors="replace",
    low_memory=False
)

In [3]:
# =========================
# 2. 駅乗降客数（2019年）
# =========================
df = pd.read_csv("S12-24_NumberOfPassengers_utf8.csv")
df_2019 = df[(df["S12_039"]==1)&(df["S12_038"]==1)]
station_2019 = df_2019.groupby("S12_001c", as_index=False).agg(passengers_2019=("S12_041","sum")).rename(columns={"S12_001c":"station_code"})

In [4]:
# =========================
# 3. 駅ポイント（lon / lat）
# =========================
station_point_gdf = gpd.read_file("/content/S12-24_NumberOfPassengers.geojson")
station_point_gdf["S12_001c"] = station_point_gdf["S12_001c"].astype(str)
station_2019["station_code"] = station_2019["station_code"].astype(str)
stations = station_point_gdf.merge(station_2019, left_on="S12_001c", right_on="station_code", how="left")
stations = stations.to_crs(epsg=3857)
stations["geometry"] = stations.geometry.centroid
stations_gdf = stations[["S12_001c","passengers_2019","geometry"]].copy()
stations_gdf.crs = "EPSG:3857"

In [5]:
# =========================
# 4. 駅特徴量作成関数
# =========================
def add_station_features(df, stations_gdf, radius=500):
    df = df.copy()
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["lon"], df["lat"]), crs="EPSG:4326").to_crs(epsg=3857)
    joined = gpd.sjoin(gdf, stations_gdf, how="left", predicate="dwithin", distance=radius)
    feat = joined.groupby("building_id", as_index=False).agg(
        **{
            f"station_passengers_{radius}m_sum": ("passengers_2019","sum"),
            f"station_passengers_{radius}m_max": ("passengers_2019","max"),
            f"station_passengers_{radius}m_mean": ("passengers_2019","mean")
        }
    )
    df = df.merge(feat, on="building_id", how="left")
    for col in feat.columns:
        if col!="building_id":
            df[col] = df[col].fillna(0)
            df[col+"_log"] = np.log1p(df[col])
    return df

In [6]:
# =========================
# 5. train/testに駅特徴量付与
# =========================
for radius in [500,1000]:
    train = add_station_features(train, stations_gdf, radius)
    test  = add_station_features(test, stations_gdf, radius)

In [7]:
# =========================
# 6. 築年 → 築年数
# =========================
for df in [train,test]:
    df["building_create_date"] = pd.to_numeric(df["building_create_date"], errors="coerce")
    df["age"] = (2023 - df["building_create_date"]).clip(0,100)


In [8]:
# =========================
# 7. 地価公示データ
# =========================
land_gdf = gpd.read_file("/content/L01-23.geojson").to_crs(epsg=4326)
for df in [train,test]:
    df["geometry"] = gpd.points_from_xy(df["lon"], df["lat"])
train_gdf = gpd.GeoDataFrame(train, geometry="geometry", crs="EPSG:4326")
test_gdf  = gpd.GeoDataFrame(test, geometry="geometry", crs="EPSG:4326")
train_gdf = train_gdf.to_crs(epsg=6668)
test_gdf  = test_gdf.to_crs(epsg=6668)
land_gdf  = land_gdf.to_crs(epsg=6668)

land_xy = np.vstack([land_gdf.geometry.x.values, land_gdf.geometry.y.values]).T
tree = KDTree(land_xy)
land_prices = land_gdf['L01_006'].values

def nearest_land_price_fast(pt, tree, land_prices):
    dist, idx = tree.query([[pt.x, pt.y]], k=1)
    return land_prices[idx[0][0]]

train_gdf['nearest_land_price'] = train_gdf['geometry'].apply(lambda pt: nearest_land_price_fast(pt, tree, land_prices))
test_gdf['nearest_land_price']  = test_gdf['geometry'].apply(lambda pt: nearest_land_price_fast(pt, tree, land_prices))
train['final_land_price'] = train_gdf['nearest_land_price'].values
test['final_land_price']  = test_gdf['nearest_land_price'].values

In [9]:
# =========================
# 9. 学習データ作成
# =========================
y = np.log1p(train["money_room"])
# 数値カラムのみを使用
features = train.select_dtypes(include=[np.number]).columns.tolist()

# 目的変数は除外
features = [c for c in features if c not in ["money_room", "money_hoshou_company"]]
X = train[features]
X_test = test[features]

In [10]:

# =========================
# 10. LightGBM 学習
# =========================
model = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="mape")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.136104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13823
[LightGBM] [Info] Number of data points in the train set: 291139, number of used features: 97
[LightGBM] [Info] Start training from score 16.872611


In [11]:
# =========================
# low帯スケーリング補正
# =========================

y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)

LOW_TH = 9_000_000
LOW_SCALE = 0.95  # ← 後で調整

mask_low = y_pred < LOW_TH
y_pred[mask_low] *= LOW_SCALE


In [14]:
# =========================
# 予測後補正
# =========================
target = "money_room"

train["pred"] = np.expm1(model.predict(X_train))
train["ape"] = np.abs(train["pred"] - train["money_room"]) / train["money_room"]
train["price_bin"] = pd.qcut(
    train[target],
    q=5,# まずは5分割
    duplicates="drop"
)

bin_summary = (
    train .groupby("price_bin")
    .apply(lambda df: (df[target] / df["pred"])
    .median())
)

bin_summary

test["pred"] = np.expm1(model.predict(X_test))

test["price_bin"] = pd.qcut( test["pred"], q=5, duplicates="drop" )

test["pred_corrected"] = test.apply( lambda r: r["pred"] * bin_summary.loc[r["price_bin"]], axis=1 )

test["pred_corrected"] = test["pred_corrected"].clip( lower=test["pred"] * 0.9, upper=test["pred"] * 1.1 )

train["pred_corrected"] = train.apply( lambda r: r["pred"] * bin_summary.loc[r["price_bin"]], axis=1 )

train["ape_corrected"] = np.abs( train["pred_corrected"] - train["money_room"] ) / train["money_room"]

train.groupby("price_bin")[["ape", "ape_corrected"]].mean()


ValueError: Length of values (291139) does not match length of index (363924)

In [None]:
# =========================
# 12. 予測・提出
# =========================
pred = np.expm1(model.predict(X_test))
submit = pd.DataFrame({"id": test["id"], "money_room": pred})
submit.to_csv("submit.csv", index=False, header=False)
print("submit.csv を出力しました")