# 0005 戸建て・マンション unit_id 出現回数集計

`data/interim/01_02_split_by_type/{train,test}_{kodate,mansion}.parquet` を対象に、train/test それぞれで `unit_id` の出現回数を `data_id` ベースで集計する。train 側は「同一 `unit_id` を持つ他レコード数（自分自身を除外）」を 0/1/2/3/4+ のカテゴリにまとめ、test 側は train 内での登場回数を同じカテゴリに丸める。マンションについては `building_id` でも同様の集計を行い、`unit_id` と `building_id` のカテゴリ同士のクロス集計も確認する。


In [16]:
from pathlib import Path
from typing import Tuple

import pandas as pd

pd.options.display.max_rows = 400

COLUMNS = ["data_id", "unit_id", "building_id"]


def find_project_root(start: Path) -> Path:
    """Walk upward from ``start`` and return the repository root.

    The root is the first directory -- ``start`` itself, then each of its
    parents in order -- that contains both a ``data`` and a ``notebooks``
    subdirectory.

    Raises:
        FileNotFoundError: If no directory on the way up qualifies.
    """
    required_dirs = ("data", "notebooks")
    root = next(
        (
            candidate
            for candidate in (start, *start.parents)
            if all((candidate / name).is_dir() for name in required_dirs)
        ),
        None,
    )
    if root is None:
        raise FileNotFoundError("プロジェクトルートを特定できません")
    return root


def load_split(property_type: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the train and test parquet files for one property type.

    Reads ``train_{property_type}.parquet`` and ``test_{property_type}.parquet``
    from ``SPLIT_DIR``, restricted to the columns listed in ``COLUMNS``, and
    keeps one row per ``data_id`` in each frame.

    Returns:
        A ``(train_df, test_df)`` tuple.
    """
    parquet_paths = (
        SPLIT_DIR / f"train_{property_type}.parquet",
        SPLIT_DIR / f"test_{property_type}.parquet",
    )
    # Same read + de-duplication for both splits, train first.
    train_df, test_df = (
        pd.read_parquet(parquet_path, columns=COLUMNS).drop_duplicates(subset="data_id")
        for parquet_path in parquet_paths
    )
    return train_df, test_df


# Locate the repository root from the current working directory, then derive
# the directory holding the per-property-type parquet splits.
PROJECT_ROOT = find_project_root(Path.cwd().resolve())
DATA_DIR = PROJECT_ROOT.joinpath("data")
SPLIT_DIR = DATA_DIR.joinpath("interim", "01_02_split_by_type")

# One (train, test) pair per property type; load_split de-duplicates on data_id,
# so the row counts printed below are data_id counts.
train_kodate, test_kodate = load_split("kodate")
train_mansion, test_mansion = load_split("mansion")

print(f"train戸建て: {len(train_kodate):,} data_id / test戸建て : {len(test_kodate):,} data_id")
print(f"trainマンション: {len(train_mansion):,} data_id / testマンション: {len(test_mansion):,} data_id")


train戸建て: 165,310 data_id / test戸建て : 52,892 data_id
trainマンション: 198,614 data_id / testマンション: 59,545 data_id


## 戸建て (Kodate)


### test → train 内 unit_id 登場回数


In [17]:
def bucketize(count: int) -> str:
    """Turn an occurrence count into one of the labels 0/1/2/3/4+.

    Any count of 4 or more collapses into ``"4+"``; every other value is
    returned as its plain ``str()`` form.
    """
    return "4+" if count >= 4 else str(count)

# Distinct train data_id count per unit_id; rows with a missing unit_id are left out.
train_unit_counts = (
    train_kodate[train_kodate["unit_id"].notna()]
    .groupby("unit_id")["data_id"]
    .nunique()
    .astype(int)
)

# Attach to every test row the train-side count for its unit_id (0 when the
# lookup finds nothing) together with the bucket label for that count.
test_with_counts = test_kodate.assign(
    train_unit_count=lambda df: df["unit_id"].map(train_unit_counts).fillna(0).astype(int),
    train_unit_count_bucket=lambda df: df["train_unit_count"].apply(bucketize),
)

# Fixed display order for the bucket labels.
bucket_order = ["0", "1", "2", "3", "4+"]

# Rows per bucket, arranged in bucket_order with empty buckets shown as 0.
summary = (
    test_with_counts["train_unit_count_bucket"]
    .value_counts()
    .reindex(bucket_order, fill_value=0)
    .to_frame(name="data_id_count")
)

# Share of each bucket relative to the number of distinct test data_id values.
unique_test_data_ids = int(test_kodate["data_id"].nunique())
summary["ratio_pct"] = (
    summary["data_id_count"].div(unique_test_data_ids).mul(100).map(lambda x: f"{x:.2f}%")
)

print(f"test data_id ユニーク件数: {unique_test_data_ids:,}")
print(f"集計合計: {summary['data_id_count'].sum():,}")

summary


test data_id ユニーク件数: 52,892
集計合計: 52,892


Unnamed: 0_level_0,data_id_count,ratio_pct
train_unit_count_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
0,38461,72.72%
1,7655,14.47%
2,2831,5.35%
3,1610,3.04%
4+,2335,4.41%


In [18]:
# Per train row: how many train data_id values carry the row's unit_id
# (0 when the lookup finds nothing, e.g. for a missing unit_id).
train_same_unit_others = train_kodate["unit_id"].map(train_unit_counts).fillna(0).astype(int)
# Take the row itself out of the count: subtract 1 wherever the count is non-zero.
train_same_unit_others = train_same_unit_others.mask(
    train_same_unit_others != 0, train_same_unit_others - 1
)

# Rows per bucket, arranged in bucket_order with empty buckets shown as 0.
train_summary = (
    train_same_unit_others.map(bucketize)
    .value_counts()
    .reindex(bucket_order, fill_value=0)
    .to_frame(name="data_id_count")
)

# Share of each bucket relative to the number of train rows.
train_total = len(train_kodate)
train_summary["ratio_pct"] = (
    train_summary["data_id_count"].div(train_total).mul(100).map(lambda x: f"{x:.2f}%")
)

print(f"train data_id ユニーク件数: {train_total:,}")
print(f"同一unit_id別合計: {train_summary['data_id_count'].sum():,}")
train_summary


train data_id ユニーク件数: 165,310
同一unit_id別合計: 165,310


Unnamed: 0_level_0,data_id_count,ratio_pct
unit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,71225,43.09%
1,42524,25.72%
2,23121,13.99%
3,12576,7.61%
4+,15864,9.60%


## マンション (Mansion)


### test → train 内 unit_id 登場回数


In [21]:
# Distinct train data_id count per unit_id; rows with a missing unit_id are left out.
train_unit_counts_mansion = (
    train_mansion[train_mansion["unit_id"].notna()]
    .groupby("unit_id")["data_id"]
    .nunique()
    .astype(int)
)

# Attach to every test row the train-side count for its unit_id (0 when the
# lookup finds nothing) together with the bucket label for that count.
mansion_test_with_counts = test_mansion.assign(
    train_unit_count=lambda df: (
        df["unit_id"].map(train_unit_counts_mansion).fillna(0).astype(int)
    ),
    train_unit_count_bucket=lambda df: df["train_unit_count"].apply(bucketize),
)

# Rows per bucket, arranged in bucket_order with empty buckets shown as 0.
mansion_test_summary = (
    mansion_test_with_counts["train_unit_count_bucket"]
    .value_counts()
    .reindex(bucket_order, fill_value=0)
    .to_frame(name="data_id_count")
)

# Share of each bucket relative to the number of distinct test data_id values.
unique_test_mansion = int(test_mansion["data_id"].nunique())
mansion_test_summary["ratio_pct"] = (
    mansion_test_summary["data_id_count"]
    .div(unique_test_mansion)
    .mul(100)
    .map(lambda x: f"{x:.2f}%")
)

print(f"test data_id ユニーク件数: {unique_test_mansion:,}")
print(f"集計合計: {mansion_test_summary['data_id_count'].sum():,}")

mansion_test_summary


test data_id ユニーク件数: 59,545
集計合計: 59,545


Unnamed: 0_level_0,data_id_count,ratio_pct
train_unit_count_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
0,45979,77.22%
1,8851,14.86%
2,2553,4.29%
3,1055,1.77%
4+,1107,1.86%


### train → 同一 unit_id の他データ件数


In [22]:
# Per train row: how many train data_id values carry the row's unit_id
# (0 when the lookup finds nothing, e.g. for a missing unit_id).
mansion_train_same_unit_others = (
    train_mansion["unit_id"].map(train_unit_counts_mansion).fillna(0).astype(int)
)
# Take the row itself out of the count: subtract 1 wherever the count is non-zero.
mansion_train_same_unit_others = mansion_train_same_unit_others.mask(
    mansion_train_same_unit_others != 0, mansion_train_same_unit_others - 1
)

# Rows per bucket, arranged in bucket_order with empty buckets shown as 0.
mansion_train_summary = (
    mansion_train_same_unit_others.map(bucketize)
    .value_counts()
    .reindex(bucket_order, fill_value=0)
    .to_frame(name="data_id_count")
)

# Share of each bucket relative to the number of train rows.
mansion_train_total = len(train_mansion)
mansion_train_summary["ratio_pct"] = (
    mansion_train_summary["data_id_count"]
    .div(mansion_train_total)
    .mul(100)
    .map(lambda x: f"{x:.2f}%")
)

print(f"train data_id ユニーク件数: {mansion_train_total:,}")
print(f"同一unit_id別合計: {mansion_train_summary['data_id_count'].sum():,}")

mansion_train_summary


train data_id ユニーク件数: 198,614
同一unit_id別合計: 198,614


Unnamed: 0_level_0,data_id_count,ratio_pct
unit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,107657,54.20%
1,51086,25.72%
2,21096,10.62%
3,9592,4.83%
4+,9183,4.62%


In [23]:
# Distinct train data_id count per building_id; rows with a missing building_id are left out.
train_building_counts_mansion = (
    train_mansion[train_mansion["building_id"].notna()]
    .groupby("building_id")["data_id"]
    .nunique()
    .astype(int)
)

# Attach to every test row the train-side count for its building_id (0 when the
# lookup finds nothing) together with the bucket label for that count.
mansion_test_with_building_counts = test_mansion.assign(
    train_building_count=lambda df: (
        df["building_id"].map(train_building_counts_mansion).fillna(0).astype(int)
    ),
    train_building_count_bucket=lambda df: df["train_building_count"].apply(bucketize),
)

# Rows per bucket, arranged in bucket_order with empty buckets shown as 0.
mansion_test_building_summary = (
    mansion_test_with_building_counts["train_building_count_bucket"]
    .value_counts()
    .reindex(bucket_order, fill_value=0)
    .to_frame(name="data_id_count")
)

# Share of each bucket relative to unique_test_mansion.
mansion_test_building_summary["ratio_pct"] = (
    mansion_test_building_summary["data_id_count"]
    .div(unique_test_mansion)
    .mul(100)
    .map(lambda x: f"{x:.2f}%")
)

print(f"test data_id ユニーク件数: {unique_test_mansion:,}")
print(f"集計合計: {mansion_test_building_summary['data_id_count'].sum():,}")

mansion_test_building_summary


test data_id ユニーク件数: 59,545
集計合計: 59,545


Unnamed: 0_level_0,data_id_count,ratio_pct
train_building_count_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12710,21.35%
1,10615,17.83%
2,8247,13.85%
3,6540,10.98%
4+,21433,35.99%


In [24]:
# Per train row: how many train data_id values carry the row's building_id
# (0 when the lookup finds nothing, e.g. for a missing building_id).
mansion_train_same_building_others = (
    train_mansion["building_id"].map(train_building_counts_mansion).fillna(0).astype(int)
)
# Take the row itself out of the count: subtract 1 wherever the count is non-zero.
mansion_train_same_building_others = mansion_train_same_building_others.mask(
    mansion_train_same_building_others != 0,
    mansion_train_same_building_others - 1,
)

# Rows per bucket, arranged in bucket_order with empty buckets shown as 0.
mansion_train_building_summary = (
    mansion_train_same_building_others.map(bucketize)
    .value_counts()
    .reindex(bucket_order, fill_value=0)
    .to_frame(name="data_id_count")
)

# Share of each bucket relative to mansion_train_total.
mansion_train_building_summary["ratio_pct"] = (
    mansion_train_building_summary["data_id_count"]
    .div(mansion_train_total)
    .mul(100)
    .map(lambda x: f"{x:.2f}%")
)

print(f"train data_id ユニーク件数: {mansion_train_total:,}")
print(f"同一building_id別合計: {mansion_train_building_summary['data_id_count'].sum():,}")

mansion_train_building_summary


train data_id ユニーク件数: 198,614
同一building_id別合計: 198,614


Unnamed: 0_level_0,data_id_count,ratio_pct
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26560,13.37%
1,31076,15.65%
2,28836,14.52%
3,23924,12.05%
4+,88218,44.42%


In [25]:
# One row per test record carrying both bucket labels. The building bucket is
# attached positionally (via .values), not by index alignment.
mansion_test_cross = mansion_test_with_counts[["data_id", "train_unit_count_bucket"]].assign(
    train_building_count_bucket=mansion_test_with_building_counts[
        "train_building_count_bucket"
    ].values
)

# Distinct data_id count per (building bucket, unit bucket) pair, pivoted so
# that building buckets are rows and unit buckets are columns, both in
# bucket_order with absent combinations shown as 0.
mansion_test_cross_table = (
    mansion_test_cross.groupby(["train_building_count_bucket", "train_unit_count_bucket"])[
        "data_id"
    ]
    .nunique()
    .unstack("train_unit_count_bucket", fill_value=0)
    .reindex(index=bucket_order, columns=bucket_order, fill_value=0)
)

# Each cell as a percentage of unique_test_mansion, formatted for display.
mansion_test_cross_pct = (
    mansion_test_cross_table.div(unique_test_mansion).mul(100).map(lambda x: f"{x:.2f}%")
)

print(
    f"クロス集計対象 data_id: {mansion_test_cross_table.values.sum():,} / {unique_test_mansion:,}"
)

mansion_test_cross_pct


クロス集計対象 data_id: 59,545 / 59,545


train_unit_count_bucket,0,1,2,3,4+
train_building_count_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,21.35%,0.00%,0.00%,0.00%,0.00%
1,14.18%,3.65%,0.00%,0.00%,0.00%
2,10.38%,2.51%,0.97%,0.00%,0.00%
3,7.69%,2.08%,0.77%,0.44%,0.00%
4+,23.63%,6.62%,2.55%,1.34%,1.86%


In [26]:
# One row per train record: its data_id plus the bucket labels of the
# "other records sharing unit_id / building_id" counts.
mansion_train_cross = train_mansion[["data_id"]].assign(
    unit_bucket=mansion_train_same_unit_others.map(bucketize),
    building_bucket=mansion_train_same_building_others.map(bucketize),
)

# Distinct data_id count per (building bucket, unit bucket) pair, pivoted so
# that building buckets are rows and unit buckets are columns, both in
# bucket_order with absent combinations shown as 0.
mansion_train_cross_table = (
    mansion_train_cross.groupby(["building_bucket", "unit_bucket"])["data_id"]
    .nunique()
    .unstack("unit_bucket", fill_value=0)
    .reindex(index=bucket_order, columns=bucket_order, fill_value=0)
)

# Each cell as a percentage of mansion_train_total, formatted for display.
mansion_train_cross_pct = (
    mansion_train_cross_table.div(mansion_train_total).mul(100).map(lambda x: f"{x:.2f}%")
)

print(
    f"クロス集計対象 data_id: {mansion_train_cross_table.values.sum():,} / {mansion_train_total:,}"
)

mansion_train_cross_pct


クロス集計対象 data_id: 198,614 / 198,614


unit_bucket,0,1,2,3,4+
building_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,13.37%,0.00%,0.00%,0.00%,0.00%
1,9.84%,5.81%,0.00%,0.00%,0.00%
2,7.65%,4.36%,2.51%,0.00%,0.00%
3,5.71%,3.56%,1.71%,1.07%,0.00%
4+,17.64%,12.00%,6.40%,3.76%,4.62%
