<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%E3%83%9E%E3%83%B3%E3%82%B7%E3%83%A7%E3%83%B3%E3%80%81%E6%88%B8%E5%BB%BA%E3%81%A6%E5%88%86%E3%81%91%E3%81%A6/hasegawa2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import lightgbm as lgb
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree

In [None]:
# =========================
# 0. train/test 読み込み
# =========================
# Read both competition CSVs with identical options: Shift-JIS decoding,
# undecodable bytes replaced rather than raising, and whole-file dtype inference.
train, test = (
    pd.read_csv(
        csv_path,
        encoding="shift_jis",
        encoding_errors="replace",
        low_memory=False,
    )
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [None]:
# =========================
# DID
# =========================

import geopandas as gpd
import pandas as pd

# =========================
# 1. DID shp 読み込み
# =========================
# =========================
# 1. Load the DID shapefile
# =========================
did_gdf = gpd.read_file("/content/A16-20_00_DID.shp")  # uploaded shapefile
did_gdf = did_gdf.to_crs(epsg=4326)  # same CRS as the train/test lon/lat points

# Identifier column of a DID polygon
did_id_col = "A16_001"

# Columns the notebook treats as population and area
did_pop_col = "A16_005"  # population
did_area_col = "A16_006"  # area
# The small epsilon keeps the ratio finite when the area is 0.
did_gdf["DID_density"] = did_gdf[did_pop_col] / (did_gdf[did_area_col] + 1e-6)


def _attach_did_polygon_features(df, did_gdf, id_col, pop_col, area_col):
    """Spatially join the DID polygon attributes onto the points of ``df``.

    Parameters
    ----------
    df : DataFrame with ``lon`` / ``lat`` columns (interpreted as EPSG:4326).
    did_gdf : GeoDataFrame of DID polygons in EPSG:4326 that already carries
        a ``DID_density`` column and a geometry column named ``geometry``.
    id_col, pop_col, area_col : names of the DID id / population / area columns.

    Returns
    -------
    GeoDataFrame with one row per input row, the point geometry, the DID id
    column, and ``DID_population`` / ``DID_area`` / ``DID_density``
    (0 where the point lies in no DID polygon).
    """
    points = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["lon"], df["lat"]),
        crs="EPSG:4326",
    )
    joined = gpd.sjoin(
        points,
        did_gdf[[id_col, pop_col, area_col, "DID_density", "geometry"]],
        how="left",
        predicate="within",
    )
    # A left sjoin emits one output row per (point, polygon) match, so a point
    # inside overlapping polygons would be duplicated. Keep the first match so
    # the row count (and therefore the submission ids) stays unchanged.
    if df.index.is_unique:
        joined = joined[~joined.index.duplicated(keep="first")]
    # `index_right` is sjoin bookkeeping; leaving it behind collides with any
    # later sjoin on the same frame.
    joined = joined.drop(columns="index_right", errors="ignore")
    joined = joined.rename(columns={pop_col: "DID_population", area_col: "DID_area"})

    # Points outside every DID polygon get 0 for all three features.
    did_feature_cols = ["DID_population", "DID_area", "DID_density"]
    joined[did_feature_cols] = joined[did_feature_cols].fillna(0)
    return joined


# =========================
# 2-4. Point-in-polygon join + missing-value handling
# =========================
train_gdf = _attach_did_polygon_features(train, did_gdf, did_id_col, did_pop_col, did_area_col)
test_gdf = _attach_did_polygon_features(test, did_gdf, did_id_col, did_pop_col, did_area_col)

# =========================
# 5. Write the result back to train/test as plain DataFrames
# =========================
train = pd.DataFrame(train_gdf.drop(columns="geometry"))
test = pd.DataFrame(test_gdf.drop(columns="geometry"))

# Sanity check
train[["DID_population", "DID_area", "DID_density"]].head()

# =========================
# DID特徴量をtrain/testに結合
# =========================
# 例: DID_population, DID_area, DID_density
# DIDのGeoDataFrameを gpd.read_file("DID.shp") で読み込んでいる前提
# train/testのgeometryはすでにEPSG:4326になっていると仮定

from shapely.geometry import Point

# =========================
# DID特徴量をKDTreeで割り当てる関数（修正版）
# =========================
def add_DID_features(df, df_geometry, did_gdf, did_columns=("A16_005", "A16_006", "A16_010")):
    """Copy attributes of the DID polygon with the nearest centroid onto each row.

    Parameters
    ----------
    df : DataFrame (train or test). It is copied, not modified.
    df_geometry : iterable of point geometries (objects exposing ``.x`` / ``.y``),
        one per row of ``df`` and in the same order. Must be in the same CRS as
        ``did_gdf``; the callers in this notebook pass EPSG:3857.
    did_gdf : GeoDataFrame of DID polygons.
    did_columns : names of the ``did_gdf`` columns to copy.

    Returns
    -------
    A copy of ``df`` with ``did_columns`` added (or overwritten).
    """
    df = df.copy()
    columns = list(did_columns)

    # Compute the centroids once and index them in a KD-tree.
    centroids = did_gdf.geometry.centroid
    tree = KDTree(np.column_stack([centroids.x, centroids.y]))

    # One batched query instead of one tree.query call per point.
    query_xy = np.array([(pt.x, pt.y) for pt in df_geometry], dtype=float).reshape(-1, 2)
    if len(query_xy):
        nearest = tree.query(query_xy, k=1, return_distance=False)[:, 0]
    else:
        # KDTree.query rejects an empty query array; nothing to assign.
        nearest = np.empty(0, dtype=int)

    assigned_df = did_gdf.iloc[nearest][columns].reset_index(drop=True)
    # Re-label with df's index so the column assignment lines up row by row.
    assigned_df.index = df.index
    df[columns] = assigned_df
    return df

# =========================
# EPSG:3857に変換して使用
# =========================
# Reproject points and DID polygons to EPSG:3857 before the centroid lookup.
train_gdf_3857, test_gdf_3857, did_gdf_3857 = (
    frame.to_crs(epsg=3857) for frame in (train_gdf, test_gdf, did_gdf)
)

# Attach the nearest-centroid DID attributes to train and test.
train = add_DID_features(
    train,
    train_gdf_3857.geometry,
    did_gdf_3857,
    did_columns=["A16_005","A16_006","A16_010"],
)
test = add_DID_features(
    test,
    test_gdf_3857.geometry,
    did_gdf_3857,
    did_columns=["A16_005","A16_006","A16_010"],
)



In [None]:
# =========================
# 2. 駅乗降客数（2019年）
# =========================
df = pd.read_csv("S12-24_NumberOfPassengers_utf8.csv")

# Keep only rows where both flag columns equal 1.
# NOTE(review): S12_038 / S12_039 are presumably the 2019 duplicate and
# data-availability flags, and S12_041 the 2019 passenger count — confirm
# against the S12 dataset specification.
df_2019 = df.loc[df["S12_039"].eq(1) & df["S12_038"].eq(1)]

# Total passengers per station code, with the key renamed for the later merge.
station_2019 = (
    df_2019
    .groupby("S12_001c", as_index=False)
    .agg(passengers_2019=("S12_041", "sum"))
    .rename(columns={"S12_001c": "station_code"})
)


In [None]:

# =========================
# 3. 駅ポイント（lon / lat）
# =========================
station_point_gdf = gpd.read_file("/content/S12-24_NumberOfPassengers.geojson")

# Cast both join keys to str so the merge compares like with like.
station_point_gdf["S12_001c"] = station_point_gdf["S12_001c"].astype(str)
station_2019["station_code"] = station_2019["station_code"].astype(str)

# Left-merge the 2019 totals onto the station geometries, then reproject to
# EPSG:3857 so distances are expressed in that CRS's units.
stations = (
    station_point_gdf
    .merge(station_2019, left_on="S12_001c", right_on="station_code", how="left")
    .to_crs(epsg=3857)
)
# Collapse every station geometry to a single representative point.
stations["geometry"] = stations.geometry.centroid

stations_gdf = stations[["S12_001c","passengers_2019","geometry"]].copy()
stations_gdf.crs = "EPSG:3857"

In [None]:
# =========================
# 4. 駅特徴量作成関数
# =========================
def add_station_features(df, stations_gdf, radius=500):
    """Add passenger statistics of the stations within ``radius`` of each row.

    Parameters
    ----------
    df : DataFrame with ``lon`` / ``lat`` columns (interpreted as EPSG:4326).
        It is not modified.
    stations_gdf : GeoDataFrame of station points in EPSG:3857 with a
        ``passengers_2019`` column.
    radius : search distance passed to the ``dwithin`` spatial join, in the
        units of EPSG:3857.

    Returns
    -------
    A copy of ``df`` (index reset to 0..n-1, row order preserved) with
    ``station_passengers_{radius}m_{sum,max,mean}`` and their ``_log``
    (``log1p``) variants. Rows with no station in range get 0.

    Notes
    -----
    Statistics are aggregated per input row. Grouping by ``building_id``
    instead multiplies the ``sum`` feature by the number of rows that share
    the id and zeroes out rows whose id is missing. Columns are assigned
    directly rather than merged, so re-running with the same ``radius``
    overwrites them instead of producing ``_x`` / ``_y`` duplicates.
    """
    df = df.reset_index(drop=True)
    row_ids = np.arange(len(df))

    # Slim point layer: only a positional row id and the geometry, so the join
    # cannot collide with any column of df (including a stale `index_right`).
    points = gpd.GeoDataFrame(
        {"_row_id": row_ids},
        geometry=gpd.points_from_xy(df["lon"], df["lat"]),
        crs="EPSG:4326",
    ).to_crs(epsg=3857)

    station_layer = stations_gdf[["passengers_2019", stations_gdf.geometry.name]]
    joined = gpd.sjoin(points, station_layer, how="left", predicate="dwithin", distance=radius)

    # One row of statistics per input row; unmatched rows become 0.
    stats = (
        joined.groupby("_row_id")["passengers_2019"]
        .agg(["sum", "max", "mean"])
        .reindex(row_ids)
        .fillna(0)
    )

    prefix = f"station_passengers_{radius}m_"
    stat_names = ("sum", "max", "mean")
    for stat in stat_names:
        df[prefix + stat] = stats[stat].to_numpy()
    for stat in stat_names:
        df[prefix + stat + "_log"] = np.log1p(df[prefix + stat])
    return df

In [None]:

# =========================
# 5. train/testに駅特徴量付与
# =========================
# Add one set of station features per search radius to both train and test.
for radius in (500, 1000):
    train, test = (
        add_station_features(frame, stations_gdf, radius)
        for frame in (train, test)
    )

In [None]:

# =========================
# 6. 築年 → 築年数
# =========================
# NOTE(review): this assumes `building_create_date` holds a plain year number.
# Any value that is not parseable as a number becomes NaN — confirm the
# column's format against the dataset description.
for df in (train, test):
    # Overwrite the column with its numeric form (unparseable values -> NaN).
    df["building_create_date"] = pd.to_numeric(df["building_create_date"], errors="coerce")
    # age = 2023 - value, limited to the range [0, 100].
    df["age"] = df["building_create_date"].rsub(2023).clip(lower=0, upper=100)


In [None]:
# =========================
# 7. 地価公示データ
# =========================
land_gdf = gpd.read_file("/content/L01-23.geojson").to_crs(epsg=4326)
for df in [train, test]:
    df["geometry"] = gpd.points_from_xy(df["lon"], df["lat"])
train_gdf = gpd.GeoDataFrame(train, geometry="geometry", crs="EPSG:4326")
test_gdf  = gpd.GeoDataFrame(test, geometry="geometry", crs="EPSG:4326")

# Run the nearest-neighbour search in a projected CRS. EPSG:6668 is a
# geographic CRS (degrees), where a degree of longitude is shorter than a
# degree of latitude, so Euclidean distance there can pick the wrong
# neighbour. EPSG:3857 is conformal (locally the same scale in every
# direction) and is the CRS the rest of this notebook already uses.
train_gdf = train_gdf.to_crs(epsg=3857)
test_gdf  = test_gdf.to_crs(epsg=3857)
land_gdf  = land_gdf.to_crs(epsg=3857)

land_xy = np.column_stack([land_gdf.geometry.x.values, land_gdf.geometry.y.values])
tree = KDTree(land_xy)
# Coerce to numbers so a text-typed price column cannot yield an object-dtype
# feature (which a later numeric-only column selection would silently drop).
land_prices = pd.to_numeric(land_gdf['L01_006'], errors="coerce").to_numpy()


def nearest_land_price_fast(pt, tree, land_prices):
    """Return the ``land_prices`` entry of the tree point nearest to ``pt``.

    Single-point lookup, kept for callers that use it directly; the cell
    itself uses the batched helper below.
    """
    dist, idx = tree.query([[pt.x, pt.y]], k=1)
    return land_prices[idx[0][0]]


def _nearest_land_prices(points, tree, land_prices):
    """Batched lookup: nearest ``land_prices`` entry for every point in ``points``.

    ``points`` is a GeoSeries of points in the same CRS as the tree.
    """
    xy = np.column_stack([points.x, points.y])
    if len(xy) == 0:
        # KDTree.query rejects an empty query array.
        return np.empty(0, dtype=land_prices.dtype)
    nearest = tree.query(xy, k=1, return_distance=False)[:, 0]
    return land_prices[nearest]


# One KD-tree query per frame instead of one per row.
train_gdf['nearest_land_price'] = _nearest_land_prices(train_gdf.geometry, tree, land_prices)
test_gdf['nearest_land_price']  = _nearest_land_prices(test_gdf.geometry, tree, land_prices)
train['final_land_price'] = train_gdf['nearest_land_price'].values
test['final_land_price']  = test_gdf['nearest_land_price'].values

In [None]:
# =========================
# 1. マンション/戸建てに分割
# =========================
# Independent copies per building type: code 1 feeds the "mansion" model and
# code 4 the "house" model. Rows with any other code end up in neither subset.
train_mansion = train.loc[train["building_type"].eq(1)].copy()
train_house = train.loc[train["building_type"].eq(4)].copy()
test_mansion = test.loc[test["building_type"].eq(1)].copy()
test_house = test.loc[test["building_type"].eq(4)].copy()

In [None]:
# =========================
# 2. 共通特徴量
# =========================
# Columns offered to both models.
common_features = [
    "target_ym",
    "lon", "lat",
    "drugstore_distance", "bank_distance", "shopping_street_distance",
    "parking_keiyaku", "money_hoshou_company",
    "free_rent_duration", "free_rent_gen_timing",
    "addr1", "addr2", "addr3", "post1", "post2",
]

# Engineered area features (DID, land price, station passengers) that both
# lists end with.
_area_features = [
    "DID_population", "DID_area", "DID_density",
    "final_land_price",
    "station_passengers_500m_sum",
    "station_passengers_500m_max",
    "station_passengers_500m_mean",
    "station_passengers_1000m_sum",
    "station_passengers_1000m_max",
    "station_passengers_1000m_mean",
]

# Mansion model: common + unit/building attributes + area features.
mansion_features = [
    *common_features,
    "house_area", "floor", "room_count", "total_units",
    "building_structure", "has_elevator", "has_gym", "maintenance_fee",
    *_area_features,
]

# House model: common + house/land attributes + area features.
house_features = [
    *common_features,
    "house_area", "land_area", "floor_count", "room_count", "building_structure",
    *_area_features,
]

In [None]:
# =========================
# 3. 学習用データ作成関数
# =========================
def prepare_Xy(df, features):
    """Build the model inputs from ``df``.

    Parameters
    ----------
    df : DataFrame containing the columns listed in ``features`` and,
        for training data, the target column ``money_room``.
    features : list of column names to consider as predictors.

    Returns
    -------
    (X, y)
        ``X`` is a copy of ``df[features]`` restricted to numeric columns.
        ``y`` is ``log1p(money_room)``, or ``None`` when ``df`` has no
        ``money_room`` column (e.g. a test frame). Returning ``None`` there
        means the same function serves both train and test without raising
        ``KeyError``.
    """
    y = np.log1p(df["money_room"]) if "money_room" in df.columns else None
    X = df[features].copy()
    # Keep numeric columns only; non-numeric features are dropped.
    X = X.select_dtypes(include=[np.number])
    return X, y

# Training matrices and log-targets, one pair per building type.
(X_mansion, y_mansion), (X_house, y_house) = (
    prepare_Xy(train_mansion, mansion_features),
    prepare_Xy(train_house, house_features),
)
# For the test frames only the feature matrix is used; the second value is discarded.
X_test_mansion, _ = prepare_Xy(df=test_mansion, features=mansion_features)
X_test_house, _ = prepare_Xy(df=test_house, features=house_features)

In [None]:
# =========================
# 4. 学習関数
# =========================
def train_lgb(X, y):
    """Fit a LightGBM regressor on an 80/20 split of ``(X, y)``.

    Parameters
    ----------
    X : feature matrix.
    y : target values aligned with ``X``.

    Returns
    -------
    The fitted ``lgb.LGBMRegressor``. The 20% hold-out is used only to log
    the ``mape`` metric every 100 iterations; there is no early stopping, so
    all 2000 trees are always built.
    """
    # NOTE(review): per the LightGBM parameter docs, `subsample` only takes
    # effect when `subsample_freq` (bagging_freq) is > 0; it is left at its
    # default here, so confirm whether row subsampling was intended.
    model = lgb.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        objective="fair",
        fair_c=1
    )
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    # The `verbose` argument of fit() was removed in LightGBM 4.0 and raises
    # TypeError there; periodic logging is requested through a callback instead.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="mape",
        callbacks=[lgb.log_evaluation(period=100)],
    )
    return model

In [None]:
# =========================
# 5. モデル学習
# =========================
# One independent model per building type (mansion first, then house).
model_mansion, model_house = (
    train_lgb(features, target)
    for features, target in ((X_mansion, y_mansion), (X_house, y_house))
)

In [None]:

# =========================
# 6. 低価格帯補正
# =========================
# Predictions at or below LOW_TH are multiplied by LOW_SCALE.
LOW_TH = 9_000_000
LOW_SCALE = 0.83

def predict_with_low_scale(model, X, low_th=LOW_TH, low_scale=LOW_SCALE):
    """Predict on the original scale and shrink the low-valued predictions.

    ``model.predict(X)`` is mapped back from the ``log1p`` scale with
    ``expm1``. Every resulting value that is ``<= low_th`` is multiplied by
    ``low_scale``; the others are returned unchanged.
    """
    prices = np.expm1(model.predict(X))
    return np.where(prices <= low_th, prices * low_scale, prices)

# Corrected predictions for the training and test rows of each building type.
y_pred_train_mansion, y_pred_test_mansion = (
    predict_with_low_scale(model_mansion, matrix)
    for matrix in (X_mansion, X_test_mansion)
)
y_pred_train_house, y_pred_test_house = (
    predict_with_low_scale(model_house, matrix)
    for matrix in (X_house, X_test_house)
)

In [None]:
# =========================
# 7. submit 作成
# =========================
# Stack the (id, prediction) pairs of both models and order them by id.
submit = pd.concat(
    [
        pd.DataFrame({"id": ids, "money_room": preds})
        for ids, preds in (
            (test_mansion["id"], y_pred_test_mansion),
            (test_house["id"], y_pred_test_house),
        )
    ]
).sort_values("id")

# Written without an index column and without a header row.
submit.to_csv("submit.csv", index=False, header=False)
print("submit.csv を出力しました")