<a href="https://colab.research.google.com/github/dondonrocket/kokudo/blob/%E3%83%86%E3%82%B9%E3%83%88/hasegawa5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
import geopandas as gpd
import lightgbm as lgb
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
import matplotlib.pyplot as plt

In [15]:
# =========================
# 0. train/test 読み込み
# =========================
train = pd.read_csv("/content/train.csv", encoding="shift_jis", encoding_errors="replace", low_memory=False)
test  = pd.read_csv("/content/test.csv", encoding="shift_jis", encoding_errors="replace", low_memory=False)

In [16]:
# =========================
# 0-1. 都道府県コードを追加
# =========================
train['prefecture_code'] = train['addr1_1']
test['prefecture_code']  = test['addr1_1']

# =========================
# 0-2. 地域サンプル重み（地方を強める）
# =========================
weights = train['prefecture_code'].value_counts(normalize=True)
train['sample_weight'] = train['prefecture_code'].map(lambda x: 1/weights[x])

# =========================
# 1. year_built（年 or 年月）を正規化
# =========================
def normalize_year_month(x):
    if pd.isna(x):
        return np.nan

    x = float(x)
    year = int(x)
    month = round((x - year) * 100)

    if 1 <= month <= 12:
        return year + month / 12
    else:
        return float(year)

for df in [train, test]:
    df["year_built_adj"] = (
        pd.to_numeric(df["year_built"], errors="coerce")
        .apply(normalize_year_month)
    )

# =========================
# 3. 新耐震フラグ（1981年基準）
# =========================

def classify_seismic(ym):
  if pd.isna(ym):
      return -1
  if ym >= 198106:
      return 2
  elif ym >= 195012:
      return 1
  else:
      return 0

for df in [train, test]:
    df["seismic_class"] = df["year_built_adj"].apply(classify_seismic)

# =========================
# 2. 築年数（age）
# =========================
CURRENT_YEAR = 2023

for df in [train, test]:
    df["age"] = (CURRENT_YEAR - df["year_built_adj"]).clip(0, 100)
    df["age_sq"] = df["age"] ** 2
    df["age_log"] = np.log1p(df["age"])
    df["age_x_seismic"] = df["age"] * df["seismic_class"]

# =========================
# 4. house_area 欠損処理（建物タイプ別）
# =========================
for df in [train, test]:
    if "house_area" in df.columns:
        df["house_area"] = pd.to_numeric(df["house_area"], errors="coerce")

        for bt in df["building_type"].dropna().unique():
            median_area = df.loc[df["building_type"] == bt, "house_area"].median()
            df.loc[
                (df["building_type"] == bt) & (df["house_area"].isna()),
                "house_area"
            ] = median_area

        # 最終ガード
        df["house_area"] = df["house_area"].fillna(df["house_area"].median())


# =========================
# 6. house_area前処理
# =========================
for df in [train, test]:
    df["house_area"] = pd.to_numeric(df["house_area"], errors="coerce")
    df["house_area"] = df["house_area"].fillna(df["house_area"].median())

  df.loc[


In [17]:
# =========================
# 2. 駅乗降客数（2019年）
# =========================
df = pd.read_csv("S12-24_NumberOfPassengers_utf8.csv")
df_2019 = df[(df["S12_039"]==1)&(df["S12_038"]==1)]
station_2019 = df_2019.groupby("S12_001c", as_index=False).agg(passengers_2019=("S12_041","sum")).rename(columns={"S12_001c":"station_code"})


In [18]:

# =========================
# 3. 駅ポイント（lon / lat）
# =========================
station_point_gdf = gpd.read_file("/content/S12-24_NumberOfPassengers.geojson")
station_point_gdf["S12_001c"] = station_point_gdf["S12_001c"].astype(str)
station_2019["station_code"] = station_2019["station_code"].astype(str)
stations = station_point_gdf.merge(station_2019, left_on="S12_001c", right_on="station_code", how="left")
stations = stations.to_crs(epsg=3857)
stations["geometry"] = stations.geometry.centroid
stations_gdf = stations[["S12_001c","passengers_2019","geometry"]].copy()
stations_gdf.crs = "EPSG:3857"

In [19]:
# =========================
# 4. 駅特徴量作成関数
# =========================
def add_station_features(df, stations_gdf, radius=500):
    df = df.copy()
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["lon"], df["lat"]), crs="EPSG:4326").to_crs(epsg=3857)

    # 既存の index_right を削除
    if "index_right" in gdf.columns:
        gdf = gdf.drop(columns=["index_right"])
    if "index_right" in stations_gdf.columns:
        stations_gdf = stations_gdf.drop(columns=["index_right"])

    joined = gpd.sjoin(gdf, stations_gdf, how="left", predicate="dwithin", distance=radius)

    feat = joined.groupby("building_id", as_index=False).agg(
        **{
            f"station_passengers_{radius}m_sum": ("passengers_2019","sum"),
            f"station_passengers_{radius}m_max": ("passengers_2019","max"),
            f"station_passengers_{radius}m_mean": ("passengers_2019","mean")
        }
    )

    df = df.merge(feat, on="building_id", how="left")
    for col in feat.columns:
        if col != "building_id":
          mean_val = df[col].mean()  # 全国平均
          df[col] = df[col].fillna(mean_val)
          df[col + "_log"] = np.log1p(df[col])
    return df

In [20]:
# =========================
# DID データ読み込み
# =========================
did_gdf = gpd.read_file("/content/A16-20_00_DID.shp")

# 人口・面積カラム
DID_POP_COL  = "A16_005"
DID_AREA_COL = "A16_006"

# 密度
did_gdf["DID_density"] = did_gdf[DID_POP_COL] / (did_gdf[DID_AREA_COL] + 1e-6)

# CRS統一
did_gdf = did_gdf.to_crs(epsg=3857)

#train / test を GeoDataFrame に変換
train_gdf = gpd.GeoDataFrame(
    train,
    geometry=gpd.points_from_xy(train["lon"], train["lat"]),
    crs="EPSG:4326"
).to_crs(epsg=3857)

test_gdf = gpd.GeoDataFrame(
    test,
    geometry=gpd.points_from_xy(test["lon"], test["lat"]),
    crs="EPSG:4326"
).to_crs(epsg=3857)


#KDTree用の座標準備
# DIDポリゴンの代表点（centroid）
did_coords = np.vstack([did_gdf.geometry.centroid.x, did_gdf.geometry.centroid.y]).T
tree_did = KDTree(did_coords)

#最近傍DIDを割り当てる関数
def attach_DID_features(base_gdf, did_gdf, tree):
    coords = np.vstack([
        base_gdf.geometry.x,
        base_gdf.geometry.y
    ]).T

    _, idx = tree.query(coords, k=1)

    base_gdf["DID_population"] = did_gdf.iloc[idx.flatten()][DID_POP_COL].values
    base_gdf["DID_area"]       = did_gdf.iloc[idx.flatten()][DID_AREA_COL].values
    base_gdf["DID_density"]    = did_gdf.iloc[idx.flatten()]["DID_density"].values

    return base_gdf

#train / test にDID付与
train_gdf = attach_DID_features(train_gdf, did_gdf, tree_did)
test_gdf  = attach_DID_features(test_gdf, did_gdf, tree_did)

#geometryを落として DataFrame に戻す
train = pd.DataFrame(train_gdf.drop(columns="geometry"))
test  = pd.DataFrame(test_gdf.drop(columns="geometry"))

#欠損処理
for col in ["DID_population", "DID_area", "DID_density"]:
    for df in [train, test]:
        df[col] = df[col].fillna(df[col].mean())
        df[col + "_log"] = np.log1p(df[col])

#house_area × DID
for df in [train, test]:
    df["area_weighted_by_urban"] = df["house_area"] * (1 + df["DID_density_log"])

# -------------------------
# 駅特徴量作成関数（全国対応）
# -------------------------
def add_station_features(df, stations_gdf, radius=500):
    df = df.copy()
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["lon"], df["lat"]),
        crs="EPSG:4326"
    ).to_crs(epsg=3857)

    # sjoin
    joined = gpd.sjoin(
        gdf, stations_gdf, how="left", predicate="dwithin", distance=radius
    )

    # building_idで集計
    feat = joined.groupby("building_id", as_index=False).agg(
        **{
            f"station_passengers_{radius}m_sum": ("passengers_2019","sum"),
            f"station_passengers_{radius}m_max": ("passengers_2019","max"),
            f"station_passengers_{radius}m_mean": ("passengers_2019","mean")
        }
    )

    df = df.merge(feat, on="building_id", how="left")

    # 全国平均で補完＆ログ列作成
    for col in [f"station_passengers_{radius}m_sum",
                f"station_passengers_{radius}m_max",
                f"station_passengers_{radius}m_mean"]:
        if col not in df.columns:
            df[col] = 0
        df[col].fillna(df[col].mean(), inplace=True)
        df[col + "_log"] = np.log1p(df[col])

    return df


#特徴量に追加
did_area_features = [
    "DID_population",
    "DID_area",
    "DID_density",
    "DID_density_log",
    "area_x_log_DID_density",
    "area_weighted_by_urban"
]


In [21]:

# =========================
# 5. train/testに駅特徴量付与
# =========================
for radius in [500,1000]:
    train = add_station_features(train, stations_gdf, radius)
    test  = add_station_features(test, stations_gdf, radius)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

In [22]:
# =========================
# 7. 地価公示データ
# =========================
land_gdf = gpd.read_file("/content/L01-23.geojson").to_crs(epsg=4326)
for df in [train,test]:
    df["geometry"] = gpd.points_from_xy(df["lon"], df["lat"])
train_gdf = gpd.GeoDataFrame(train, geometry="geometry", crs="EPSG:4326").to_crs(epsg=6668)
test_gdf  = gpd.GeoDataFrame(test, geometry="geometry", crs="EPSG:6668").to_crs(epsg=6668)
land_gdf  = land_gdf.to_crs(epsg=6668)

# KDTree
land_xy = np.vstack([land_gdf.geometry.x.values, land_gdf.geometry.y.values]).T
tree_land = KDTree(land_xy)
land_prices = land_gdf['L01_006'].values

def nearest_land_price_fast(pt, tree, land_prices):
    dist, idx = tree.query([[pt.x, pt.y]], k=1)
    return land_prices[idx[0][0]]

train['final_land_price'] = train_gdf['geometry'].apply(lambda pt: nearest_land_price_fast(pt, tree_land, land_prices))
test['final_land_price']  = test_gdf['geometry'].apply(lambda pt: nearest_land_price_fast(pt, tree_land, land_prices))

# 欠損は全国平均で補完
for df in [train, test]:
    df["final_land_price"].fillna(df["final_land_price"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["final_land_price"].fillna(df["final_land_price"].mean(), inplace=True)


In [23]:
# =========================
# 1. マンション/戸建てに分割
# =========================
train_mansion = train[train['building_type'] == 1].copy()
train_house   = train[train['building_type'] == 4].copy()
test_mansion  = test[test['building_type'] == 1].copy()
test_house    = test[test['building_type'] == 4].copy()

In [24]:
# =========================
# 2. 共通特徴量
# =========================
common_features = [
    'target_ym','lon','lat',
    'drugstore_distance','bank_distance','shopping_street_distance',
    'parking_keiyaku','money_hoshou_company','free_rent_duration','free_rent_gen_timing',
        'year_built_adj','age','seismic_class','age','age_sq','age_log','age_x_seismic'
]

mansion_features = common_features + [
    'house_area','floor','room_count','total_units','building_structure','has_elevator','has_gym','maintenance_fee',
    'DID_population','DID_area','DID_density','final_land_price',
    'station_passengers_500m_sum','station_passengers_500m_max','station_passengers_500m_mean',
    'station_passengers_1000m_sum','station_passengers_1000m_max','station_passengers_1000m_mean',"DID_density_log","area_weighted_by_urban"
]

house_features = common_features + [
    'house_area','land_area','floor_count','room_count','building_structure',
    'DID_population','DID_area','DID_density','final_land_price',
    'station_passengers_500m_sum','station_passengers_500m_max','station_passengers_500m_mean',
    'station_passengers_1000m_sum','station_passengers_1000m_max','station_passengers_1000m_mean',"DID_density_log","area_weighted_by_urban"
]


In [26]:
# =========================
# 3. 学習用データ作成関数
# =========================

DROP_COLS = [
    "id",
    "money_room",
    "money_hoshou_company",
    "sample_weight"
]

def make_features(df):
    return [
        c for c in df.columns
        if c not in DROP_COLS
        and df[c].dtype != "object"
    ]

mansion_features = make_features(train_mansion)
house_features   = make_features(train_house)

def prepare_Xy(df, features, is_train=True):
    X = df[[c for c in features if c in df.columns]].copy()

    # geometry が紛れ込んでも必ず落とす
    if "geometry" in X.columns:
        X = X.drop(columns=["geometry"])

    # 数値型だけに限定（最終防衛ライン）
    X = X.select_dtypes(include=[np.number])

    if is_train:
        y = np.log1p(df["money_room"])
        return X, y
    else:
        return X


X_mansion, y_mansion = prepare_Xy(train_mansion, mansion_features)
X_house, y_house     = prepare_Xy(train_house, house_features)

X_test_mansion = test_mansion[mansion_features].copy()
X_test_house   = test_house[house_features].copy()



In [29]:
# =========================
# 4. 学習関数
# =========================
def train_lgb(X, y, sample_weight=None):
    model = lgb.LGBMRegressor(
        n_estimators=5000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        objective="fair",
        fair_c=0.1,
        min_child_samples=20,
        reg_alpha=0.1,
        reg_lambda=0.1
    )

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42
    )

    if sample_weight is not None:
        w_train, w_valid = train_test_split(sample_weight, test_size=0.2, random_state=42)
    else:
        w_train = w_valid = None

    model.fit(
        X_train, y_train,
        sample_weight=w_train,
        eval_set=[(X_valid, y_valid)],
        eval_sample_weight=[w_valid] if sample_weight is not None else None,
        eval_metric="mape",
        callbacks=[
            lgb.early_stopping(200),
            lgb.log_evaluation(200)
        ]
    )

    return model


In [30]:
# =========================
# 5. モデル学習
# =========================
model_mansion = train_lgb(X_mansion, y_mansion, sample_weight=train_mansion['sample_weight'])
model_house   = train_lgb(X_house, y_house, sample_weight=train_house['sample_weight'])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14831
[LightGBM] [Info] Number of data points in the train set: 155669, number of used features: 109
[LightGBM] [Info] Start training from score 16.734532
Training until validation scores don't improve for 200 rounds
[200]	valid_0's mape: 0.00856018	valid_0's fair: 0.00658296
[400]	valid_0's mape: 0.00800346	valid_0's fair: 0.00600873
[600]	valid_0's mape: 0.00775235	valid_0's fair: 0.00575608
[800]	valid_0's mape: 0.00758445	valid_0's fair: 0.00558823
[1000]	valid_0's mape: 0.00746004	valid_0's fair: 0.00546401
[1200]	valid_0's mape: 0.00737258	valid_0's fair: 0.00537747
[1400]	valid_0's mape: 0.00729838	valid_0's fair: 0.00530491
[1600]	valid_0's mape: 0.00723796	valid_0's fair: 0.00524688
[1800]	valid_0's mape: 0.00718054	valid_0's 

In [31]:
# =========================
# 1. 学習時の列を保存
# =========================
cols_mansion = X_mansion.columns.tolist()
cols_house   = X_house.columns.tolist()

In [32]:
# =========================
# 2. test の列を学習時と完全一致させる
# =========================
X_test_mansion = X_test_mansion.reindex(
    columns=cols_mansion,
    fill_value=0
)

X_test_house = X_test_house.reindex(
    columns=cols_house,
    fill_value=0
)

In [33]:
# =========================
# 3. 低価格帯補正つき予測
# =========================
LOW_TH_MANSION = 9_000_000
LOW_TH_HOUSE   = 9_000_000

LOW_SCALE_MANSION = 0.83
LOW_SCALE_HOUSE   = 0.83


def predict_with_low_scale(model, X, low_th, low_scale):
    y_pred = model.predict(X)

    # まず予測値側をガード
    y_pred = np.nan_to_num(
        y_pred,
        nan=0.0,
        posinf=20,   # log空間での上限
        neginf=0.0
    )

    # log → 元スケール
    y_pred = np.expm1(y_pred)

    # 再度ガード
    y_pred = np.nan_to_num(
        y_pred,
        nan=0.0,
        posinf=1e9,
        neginf=0.0
    )

    # 下限クリップ
    y_pred = np.clip(y_pred, 1, 1e9)

    # 低価格帯補正
    mask_low = y_pred <= low_th
    y_pred[mask_low] *= low_scale

    return y_pred



y_pred_test_mansion = predict_with_low_scale(
    model_mansion,
    X_test_mansion,
    LOW_TH_MANSION,
    LOW_SCALE_MANSION
)

y_pred_test_house = predict_with_low_scale(
    model_house,
    X_test_house,
    LOW_TH_HOUSE,
    LOW_SCALE_HOUSE
)

In [34]:
# =========================
# 4. test DataFrame に書き戻す
# =========================
test.loc[test_mansion.index, "money_room"] = y_pred_test_mansion
test.loc[test_house.index,   "money_room"] = y_pred_test_house

mask_other = test["money_room"].isna()

X_other = test.loc[mask_other].reindex(columns=cols_mansion, fill_value=0)

test.loc[mask_other, "money_room"] = predict_with_low_scale(
    model_mansion, X_other, LOW_TH_MANSION, LOW_SCALE_MANSION
)

# =========================
# 8. 最終ガード（超重要）
# =========================
test["money_room"] = (
    test["money_room"]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(test["money_room"].median())
    .clip(1, 1e9)
)

In [35]:
# =========================
# 5. submit.csv 作成
# =========================
submit = test[["id", "money_room"]].sort_values("id")
submit.to_csv("submit.csv", index=False, header=False)

print("submit.csv を出力しました")

submit.csv を出力しました


In [36]:
import pandas as pd

df = pd.read_csv("/content/submit.csv")

df.isna().sum().sum()

np.int64(0)