<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%E3%83%9E%E3%83%B3%E3%82%B7%E3%83%A7%E3%83%B3%E3%80%81%E6%88%B8%E5%BB%BA%E3%81%A6%E5%88%86%E3%81%91%E3%81%A6/hasegawa2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import lightgbm as lgb
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree

In [2]:
# =========================
# 0. train/test 読み込み
# =========================
train = pd.read_csv("/content/train.csv", encoding="shift_jis", encoding_errors="replace", low_memory=False)
test  = pd.read_csv("/content/test.csv", encoding="shift_jis", encoding_errors="replace", low_memory=False)

In [3]:
# =========================
# DID データ読み込み
# =========================
did_gdf = gpd.read_file("/content/A16-20_00_DID.shp")

# 人口・面積カラム
DID_POP_COL  = "A16_005"
DID_AREA_COL = "A16_006"

# 密度
did_gdf["DID_density"] = did_gdf[DID_POP_COL] / (did_gdf[DID_AREA_COL] + 1e-6)

# CRS統一
did_gdf = did_gdf.to_crs(epsg=3857)

#train / test を GeoDataFrame に変換
train_gdf = gpd.GeoDataFrame(
    train,
    geometry=gpd.points_from_xy(train["lon"], train["lat"]),
    crs="EPSG:4326"
).to_crs(epsg=3857)

test_gdf = gpd.GeoDataFrame(
    test,
    geometry=gpd.points_from_xy(test["lon"], test["lat"]),
    crs="EPSG:4326"
).to_crs(epsg=3857)

#KDTree用の座標準備
# DIDポリゴンの代表点（centroid）
did_coords = np.vstack([
    did_gdf.geometry.centroid.x,
    did_gdf.geometry.centroid.y
]).T

tree = KDTree(did_coords)

#最近傍DIDを割り当てる関数
def attach_DID_features(base_gdf, did_gdf, tree):
    coords = np.vstack([
        base_gdf.geometry.x,
        base_gdf.geometry.y
    ]).T

    _, idx = tree.query(coords, k=1)

    base_gdf["DID_population"] = did_gdf.iloc[idx.flatten()][DID_POP_COL].values
    base_gdf["DID_area"]       = did_gdf.iloc[idx.flatten()][DID_AREA_COL].values
    base_gdf["DID_density"]    = did_gdf.iloc[idx.flatten()]["DID_density"].values

    return base_gdf

#train / test にDID付与
train_gdf = attach_DID_features(train_gdf, did_gdf, tree)
test_gdf  = attach_DID_features(test_gdf,  did_gdf, tree)

#geometryを落として DataFrame に戻す
train = pd.DataFrame(train_gdf.drop(columns="geometry"))
test  = pd.DataFrame(test_gdf.drop(columns="geometry"))

#欠損処理
for col in ["DID_population", "DID_area", "DID_density"]:
    train[col] = train[col].fillna(0)
    test[col]  = test[col].fillna(0)

    train[f"{col}_log"] = np.log1p(train[col])
    test[f"{col}_log"]  = np.log1p(test[col])


#特徴量に追加
did_features = ["DID_population", "DID_area", "DID_density"]

In [4]:
# =========================
# 2. 駅乗降客数（2019年）
# =========================
df = pd.read_csv("S12-24_NumberOfPassengers_utf8.csv")
df_2019 = df[(df["S12_039"]==1)&(df["S12_038"]==1)]
station_2019 = df_2019.groupby("S12_001c", as_index=False).agg(passengers_2019=("S12_041","sum")).rename(columns={"S12_001c":"station_code"})


In [5]:

# =========================
# 3. 駅ポイント（lon / lat）
# =========================
station_point_gdf = gpd.read_file("/content/S12-24_NumberOfPassengers.geojson")
station_point_gdf["S12_001c"] = station_point_gdf["S12_001c"].astype(str)
station_2019["station_code"] = station_2019["station_code"].astype(str)
stations = station_point_gdf.merge(station_2019, left_on="S12_001c", right_on="station_code", how="left")
stations = stations.to_crs(epsg=3857)
stations["geometry"] = stations.geometry.centroid
stations_gdf = stations[["S12_001c","passengers_2019","geometry"]].copy()
stations_gdf.crs = "EPSG:3857"

In [6]:
# =========================
# 4. 駅特徴量作成関数
# =========================
def add_station_features(df, stations_gdf, radius=500):
    df = df.copy()
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["lon"], df["lat"]), crs="EPSG:4326").to_crs(epsg=3857)

    # 既存の index_right を削除
    if "index_right" in gdf.columns:
        gdf = gdf.drop(columns=["index_right"])
    if "index_right" in stations_gdf.columns:
        stations_gdf = stations_gdf.drop(columns=["index_right"])

    joined = gpd.sjoin(gdf, stations_gdf, how="left", predicate="dwithin", distance=radius)

    feat = joined.groupby("building_id", as_index=False).agg(
        **{
            f"station_passengers_{radius}m_sum": ("passengers_2019","sum"),
            f"station_passengers_{radius}m_max": ("passengers_2019","max"),
            f"station_passengers_{radius}m_mean": ("passengers_2019","mean")
        }
    )

    df = df.merge(feat, on="building_id", how="left")
    for col in feat.columns:
        if col != "building_id":
            df[col] = df[col].fillna(0)
            df[col + "_log"] = np.log1p(df[col])
    return df

In [7]:

# =========================
# 5. train/testに駅特徴量付与
# =========================
for radius in [500,1000]:
    train = add_station_features(train, stations_gdf, radius)
    test  = add_station_features(test, stations_gdf, radius)

In [8]:

# =========================
# 6. 築年 → 築年数
# =========================
for df in [train,test]:
    df["building_create_date"] = pd.to_numeric(df["building_create_date"], errors="coerce")
    df["age"] = (2023 - df["building_create_date"]).clip(0,100)


In [9]:
# =========================
# 7. 地価公示データ
# =========================
land_gdf = gpd.read_file("/content/L01-23.geojson").to_crs(epsg=4326)
for df in [train,test]:
    df["geometry"] = gpd.points_from_xy(df["lon"], df["lat"])
train_gdf = gpd.GeoDataFrame(train, geometry="geometry", crs="EPSG:4326")
test_gdf  = gpd.GeoDataFrame(test, geometry="geometry", crs="EPSG:4326")
train_gdf = train_gdf.to_crs(epsg=6668)
test_gdf  = test_gdf.to_crs(epsg=6668)
land_gdf  = land_gdf.to_crs(epsg=6668)

land_xy = np.vstack([land_gdf.geometry.x.values, land_gdf.geometry.y.values]).T
tree = KDTree(land_xy)
land_prices = land_gdf['L01_006'].values

def nearest_land_price_fast(pt, tree, land_prices):
    dist, idx = tree.query([[pt.x, pt.y]], k=1)
    return land_prices[idx[0][0]]

train_gdf['nearest_land_price'] = train_gdf['geometry'].apply(lambda pt: nearest_land_price_fast(pt, tree, land_prices))
test_gdf['nearest_land_price']  = test_gdf['geometry'].apply(lambda pt: nearest_land_price_fast(pt, tree, land_prices))
train['final_land_price'] = train_gdf['nearest_land_price'].values
test['final_land_price']  = test_gdf['nearest_land_price'].values

In [10]:
# =========================
# 1. マンション/戸建てに分割
# =========================
train_mansion = train[train['building_type'] == 1].copy()
train_house   = train[train['building_type'] == 4].copy()
test_mansion  = test[test['building_type'] == 1].copy()
test_house    = test[test['building_type'] == 4].copy()

In [11]:
# =========================
# 2. 共通特徴量
# =========================
common_features = [
    'target_ym','lon','lat',
    'drugstore_distance','bank_distance','shopping_street_distance',
    'parking_keiyaku','money_hoshou_company','free_rent_duration','free_rent_gen_timing',
    'addr1','addr2','addr3','post1','post2'
]

mansion_features = common_features + [
    'house_area','floor','room_count','total_units','building_structure','has_elevator','has_gym','maintenance_fee',
    'DID_population','DID_area','DID_density','final_land_price',
    'station_passengers_500m_sum','station_passengers_500m_max','station_passengers_500m_mean',
    'station_passengers_1000m_sum','station_passengers_1000m_max','station_passengers_1000m_mean'
]

house_features = common_features + [
    'house_area','land_area','floor_count','room_count','building_structure',
    'DID_population','DID_area','DID_density','final_land_price',
    'station_passengers_500m_sum','station_passengers_500m_max','station_passengers_500m_mean',
    'station_passengers_1000m_sum','station_passengers_1000m_max','station_passengers_1000m_mean'
]

features = [
    c for c in train.columns
    if c not in ["id", "money_room"]
    and train[c].dtype != "object"
]


In [12]:
# =========================
# 3. 学習用データ作成関数
# =========================
def prepare_Xy(df, features, is_train=True):
    X = df[[c for c in features if c in df.columns]].copy()
    X = X.select_dtypes(include=[np.number])

    if is_train:
        y = np.log1p(df["money_room"])
        return X, y
    else:
        return X


X_mansion, y_mansion = prepare_Xy(train_mansion, mansion_features)
X_house, y_house     = prepare_Xy(train_house, house_features)
X_test_mansion, _    = prepare_Xy(test_mansion, mansion_features)
X_test_house, _      = prepare_Xy(test_house, house_features)

KeyError: 'money_room'

In [None]:
# =========================
# 4. 学習関数
# =========================
def train_lgb(X, y):
    model = lgb.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        objective="fair",
        fair_c=1
    )
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="mape", verbose=100)
    return model

In [None]:
# =========================
# 5. モデル学習
# =========================
model_mansion = train_lgb(X_mansion, y_mansion)
model_house   = train_lgb(X_house, y_house)

In [None]:

# =========================
# 6. 低価格帯補正
# =========================
LOW_TH = 9_000_000
LOW_SCALE = 0.83

def predict_with_low_scale(model, X, low_th=LOW_TH, low_scale=LOW_SCALE):
    y_pred = np.expm1(model.predict(X))
    mask_low = y_pred <= low_th
    y_pred[mask_low] *= low_scale
    return y_pred

y_pred_train_mansion = predict_with_low_scale(model_mansion, X_mansion)
y_pred_train_house   = predict_with_low_scale(model_house, X_house)
y_pred_test_mansion  = predict_with_low_scale(model_mansion, X_test_mansion)
y_pred_test_house    = predict_with_low_scale(model_house, X_test_house)

In [None]:
# =========================
# 7. submit 作成
# =========================
submit = pd.concat([
    pd.DataFrame({"id": test_mansion["id"], "money_room": y_pred_test_mansion}),
    pd.DataFrame({"id": test_house["id"],   "money_room": y_pred_test_house})
]).sort_values("id")

submit.to_csv("submit.csv", index=False, header=False)
print("submit.csv を出力しました")