<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/main/hasegawa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import geopandas as gpd
from shapely.geometry import Point


In [None]:
# Load the training and test tables from the Colab workspace.
train = pd.read_csv(filepath_or_buffer="/content/train.csv")
test = pd.read_csv(filepath_or_buffer="/content/test.csv")

  train = pd.read_csv("/content/train.csv")
  test  = pd.read_csv("/content/test.csv")


In [None]:
# Read the National Land Numerical Information passenger table and derive
# per-station passenger counts.
df = pd.read_csv("S12-24_NumberOfPassengers_utf8.csv")

# "2019 filter" (per the original author's label): keep rows where
# S12_039 == 1 (data available) and S12_038 == 1 (not duplicated).
df_2019 = df.loc[df["S12_039"].eq(1) & df["S12_038"].eq(1)]

# Collapse to one row per (station code, station name), summing S12_041.
# The result carries exactly three columns: station_code, station_name and
# passengers_2019.
station_2019 = (
    df_2019
    .groupby(["S12_001c", "S12_001"], as_index=False)["S12_041"]
    .sum()
    .rename(columns={
        "S12_001c": "station_code",
        "S12_001": "station_name",
        "S12_041": "passengers_2019",
    })
)


In [None]:
# Station layer: one point per station, in WGS84 (EPSG:4326).
# NOTE(review): station_2019 is built with only station_code, station_name and
# passengers_2019, so the "lon"/"lat" lookups below look like they will raise
# KeyError -- confirm where the station coordinates are supposed to come from.
stations = station_2019.copy()
stations_gdf = gpd.GeoDataFrame(
    data=stations,
    geometry=gpd.points_from_xy(x=stations["lon"], y=stations["lat"]),
    crs="EPSG:4326",
)

# Property layer: one point per training row, built from train's lon/lat columns.
properties_gdf = gpd.GeoDataFrame(
    data=train,
    geometry=gpd.points_from_xy(x=train["lon"], y=train["lat"]),
    crs="EPSG:4326",
)


In [None]:
# Reproject both layers to a metre-based CRS so that distance thresholds used
# in later cells are expressed in real metres.
#
# EPSG:3857 (Web Mercator) was used here before, but its map units are only
# true metres at the equator: distances are stretched by roughly
# 1 / cos(latitude), so a "500" radius in that CRS covers noticeably less than
# 500 m on the ground at mid-latitudes. A UTM zone estimated from the bounds of
# the property layer keeps distances close to true metres instead.
# NOTE(review): if the data spans several UTM zones, a single zone still adds
# some distortion far from its central meridian -- confirm this is acceptable
# or switch to a CRS chosen for the study area.
metric_crs = properties_gdf.estimate_utm_crs()

# Both layers must share the same CRS for the spatial join to be valid.
stations_gdf = stations_gdf.to_crs(metric_crs)
properties_gdf = properties_gdf.to_crs(metric_crs)


In [None]:
# Attach every station lying within a distance of 500 (in the units of the
# layers' projected CRS) to each property. The left join keeps properties that
# have no station nearby; their station columns are NaN.
joined = gpd.sjoin(
    properties_gdf,
    stations_gdf,
    how="left",
    predicate="dwithin",
    distance=500
)

# Aggregate the passengers of the matched stations per property.
# Named aggregation is used so the result has flat column names; passing a
# list of functions inside a dict would produce MultiIndex columns, which
# breaks the merge on "id" and the column lookup below.
feat_500m = (
    joined
    .groupby("id", as_index=False)
    .agg(
        station_passengers_500m=("passengers_2019", "sum"),
        station_passengers_500m_max=("passengers_2019", "max"),
        station_passengers_500m_mean=("passengers_2019", "mean"),
    )
)

# Join the features back onto the property table. properties_gdf was built
# from `train`, so `train` is the base table here (the name `properties` was
# previously used without ever being defined).
properties = train.merge(
    feat_500m,
    on="id",
    how="left"
)

# Properties without any station in range get 0 for every station feature.
station_feature_cols = [
    "station_passengers_500m",
    "station_passengers_500m_max",
    "station_passengers_500m_mean",
]
properties[station_feature_cols] = properties[station_feature_cols].fillna(0)

# NOTE(review): the modelling cells build X / X_test from `train` and `test`,
# so these columns only reach the model if they are added to both of those
# frames (the same join would have to be repeated for `test`).



In [None]:
# Turn the construction date into a building age.
# Step 1: coerce the column to numbers in both frames (unparseable -> NaN).
for frame in (train, test):
    frame["building_create_date"] = pd.to_numeric(
        frame["building_create_date"], errors="coerce"
    )

# Step 2: age = 2023 minus the numeric value.
# NOTE(review): this is only an age in years if the column holds a plain
# 4-digit year -- confirm the column's format.
for frame in (train, test):
    frame["age"] = 2023 - frame["building_create_date"]


In [None]:
# Numeric training columns minus the target.
# Note: `features` is reassigned from the combined train/test frame in the
# next cell, so this list is superseded there.
features = list(train.select_dtypes(include="number").columns)
features.remove("money_room")


In [None]:
# Target variable: log1p of money_room.
y = np.log1p(train["money_room"])

# Stack train (without the target) on top of test so the feature list is
# derived from the combined frame.
train_without_target = train.drop(columns="money_room")
all_data = pd.concat([train_without_target, test], axis=0)

# Keep numeric columns only.
features = list(all_data.select_dtypes(include="number").columns)

# id is an identifier, not a feature.
features.remove("id")

# Matrices used for fitting and for prediction.
X = train.loc[:, features]
X_test = test.loc[:, features]




In [None]:
# Model configuration: LightGBM regressor with a fixed seed.
model = lgb.LGBMRegressor(
    random_state=42,
    learning_rate=0.05,
    n_estimators=500,
)

# Fit on the log1p-transformed target; the fitted estimator is the cell output.
model.fit(X=X, y=y)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.181726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13577
[LightGBM] [Info] Number of data points in the train set: 363924, number of used features: 96
[LightGBM] [Info] Start training from score 16.872850


In [None]:
# The model was trained on log1p(money_room), so invert with expm1 to get
# predictions on the original scale.
pred = np.expm1(model.predict(X_test))

# Two-column submission table: id, predicted money_room.
submit = test[["id"]].assign(money_room=pred)

# Written without index and without a header row.
submit.to_csv("submit.csv", header=False, index=False)

