In [None]:
import pandas as pd
from itertools import combinations

people_path = "npb_People_combined.csv"
people = pd.read_csv(people_path, low_memory=False)


def _normalized_name(names):
    """Return `names` stripped and upper-cased, keeping missing entries missing.

    `astype(str)` on its own turns a missing value into the literal text "nan",
    which then compares equal to every other missing name. Masking those
    entries back to NaN (and blank names along with them) means they never
    count as a match, because NaN != NaN.
    """
    cleaned = names.astype(str).str.strip().str.upper()
    return cleaned.where(names.notna() & cleaned.ne(""))


people["nameFirst_norm"] = _normalized_name(people["nameFirst"])
people["nameLast_norm"] = _normalized_name(people["nameLast"])

# Build the grouping key only for rows whose year, month and day are all known.
# Otherwise every person with an unknown birth date would share one "nan" key
# and be compared against every other such person. Rows with a NaN key are
# excluded by groupby.
birth_cols = ["birthYear", "birthMonth", "birthDay"]
has_full_birthdate = people[birth_cols].notna().all(axis=1)
people["birthDate_key"] = (
    people["birthYear"].astype(str).str.zfill(4) + "-" +
    people["birthMonth"].astype(str).str.zfill(2) + "-" +
    people["birthDay"].astype(str).str.zfill(2)
).where(has_full_birthdate)

# The column set is the same for every row, so check for SearchID once.
has_search_id = "SearchID" in people.columns

# Collect every pair of rows that shares a birth date and a first or last name.
records = []
for birthdate, grp in people.groupby("birthDate_key"):
    if len(grp) < 2:
        continue
    for idx1, idx2 in combinations(grp.index, 2):
        # Read both rows from the group rather than re-indexing the full frame.
        row1, row2 = grp.loc[idx1], grp.loc[idx2]
        first_match = row1["nameFirst_norm"] == row2["nameFirst_norm"]
        last_match = row1["nameLast_norm"] == row2["nameLast_norm"]
        if first_match or last_match:
            records.append({
                "ID_1": row1["SearchID"] if has_search_id else pd.NA,
                "PlayerID_1": row1["Player ID"],
                "Name_1": f"{row1['nameFirst']} {row1['nameLast']}",
                "ID_2": row2["SearchID"] if has_search_id else pd.NA,
                "PlayerID_2": row2["Player ID"]})

# Passing the columns explicitly keeps the schema stable when nothing matched.
suspects_df = pd.DataFrame(
    records,
    columns=["ID_1", "PlayerID_1", "Name_1", "ID_2", "PlayerID_2"],
)
print(suspects_df)


        ID_1  PlayerID_1              Name_1      ID_2  PlayerID_2
0   100178.0         NaN  Hirochika Yamamoto  100293.0         NaN
1   100384.0         NaN    Hiroshi Miyazaki  100385.0         NaN
2   101919.0         NaN   Yoshinori Kadooka  102018.0         NaN
3   103449.0         NaN    Hitoshi Hatayama  103944.0         NaN
4   104199.0         NaN          Ken Suzuki  104336.0         NaN
5   105222.0         NaN     Atsushi Okamoto  105386.0         NaN
6   105753.0         NaN        Yohei Oshima       NaN   1305131.0
7   105930.0         NaN           Yudai Ono       NaN  11515133.0
8   105642.0         NaN          Hikaru Ito       NaN  81185116.0
9   106662.0         NaN      Matt Dominguez  107140.0         NaN
10  105841.0         NaN  Yoshitomo Tsutsugo       NaN  41945131.0
11  106382.0         NaN       Taisuke Kondo       NaN  81785132.0
12  106186.0         NaN        Daichi Osera       NaN  61365139.0
13  105947.0         NaN     Shota Dobayashi       NaN  619651

In [None]:
# Duplicate pairs to merge, one tuple per player: (SearchID, Player ID).
# Later cells label the first element "SearchID_keep" (the row that is kept)
# and the second "PlayerID_fill" (the Player ID copied onto that row).
# NOTE(review): presumably each pair is one player recorded under two name
# spellings - confirm against the source data.
pairs = [
    ("105753", "1305131"),   # Yohei Oshima
    ("105930", "11515133"),  # Yudai Ono
    ("105642", "81185116"),  # Hikaru Ito
    ("105841", "41945131"),  # Yoshitomo Tsutsugo
    ("106382", "81785132"),  # Taisuke Kondo
    ("106186", "61365139"),  # Daichi Osera
    ("105947", "61965131"),  # Shota Dobayashi
    ("106178", "71775139"),  # Aren Kuri
    ("106322", "41545130"),  # Takuma Kato
    ("106384", "41545132"),  # Takayuki Kato
    ("106583", "73575132"),  # Chiahao Sung
    ("106618", "21325136"),  # Takumi Oshiro
    ("105854", "81785133"),  # Shunta Goto
    ("106400", "11515132"),  # Koji Oshiro
    ("106003", "41745135"),  # Kensuke Kondo
    ("106567", "21325134"),  # Yusuke Oyama
    ("106522", "71575134"),  # Takuya Kato
    ("106755", "1305138"),   # Shinichi Onuki
    ("106794", "71375138"),  # Hikaru Ota
    ("106595", "51655130"),  # Koya Fujii
    ("106785", "81385138"),  # Yuya Ogo
    ("106823", "41145138"),  # Yukiya Ito
    ("106739", "81785138"),  # Ryusei Sato
    ("106575", "81385130"),  # Kengo Ota
    ("106605", "91795134"),  # Takuya Kori
    ("106677", "23125136"),  # Yudai Yamamoto
    ("106643", "1005134"),   # Koya Takahashi
    ("106740", "71375134"),  # Ryusei Oe
    ("106874", "61465151"),  # Syota Hamaya
    ("106817", "21225136"),  # Koo Nakagawa
    ("106795", "71375136"),  # Atsushi Endo
    ("106774", "81785136"),  # Kosuke Sakaguchi
    ("107110", "51555155"),  # Koki Kajiwara
    ("106931", "1405138"),   # Kouta Hayashi
    ("106940", "61665151"),  # Kouki Matsuoka
    ("106832", "91395138"),  # Ryo Ota
    ("107151", "81785153"),  # Junpei Kawarada
    ("107073", "53155155"),  # Ko Matsukawa
]

In [4]:
import pandas as pd
from pathlib import Path

# Load the combined people table from disk.
people_path = Path("npb_People_combined.csv")
people = pd.read_csv(
    people_path,
    low_memory=False,
)

# Save the (SearchID_keep, PlayerID_fill) pairs as a two-column CSV.
spelling_df = pd.DataFrame(
    data=pairs,
    columns=["SearchID_keep", "PlayerID_fill"],
)
spelling_df.to_csv(
    "npb_People_spelling.csv",
    index=False,
)

def norm(val):
    """Return an ID value as clean text, or `pd.NA` when it is missing.

    The value is converted with `str()` and surrounding whitespace is removed.
    A trailing ".0" (as left behind when an integer ID was parsed as a float)
    is cut off.
    """
    if pd.isna(val):
        return pd.NA
    text = str(val).strip()
    if text.endswith(".0"):
        return text[:-2]
    return text

# Normalise both ID columns element by element so they hold plain text (or
# pd.NA) and can be compared with the string IDs listed in `pairs`.
people["SearchID"] = people["SearchID"].map(norm)
people["Player ID"] = people["Player ID"].map(norm)

In [5]:
# For each (SearchID, Player ID) pair: find the row to keep by SearchID, find
# the duplicate ("twin") row by Player ID, copy the Player ID onto the keeper
# if it has none, and mark the twin for removal.
rows_to_drop = []
skipped_pairs = []  # (SearchID, Player ID, reason) for pairs that were not merged

for sid_keep, pid_fill in pairs:
    pid_fill = str(pid_fill)

    keeper_matches = people.index[people["SearchID"] == sid_keep]
    if keeper_matches.empty:
        skipped_pairs.append((sid_keep, pid_fill, "SearchID not found"))
        continue
    keeper_idx = keeper_matches[0]

    # Once the keeper's Player ID has been filled (for example when this cell
    # is run a second time), the keeper itself matches the Player ID lookup.
    # Exclude it so the keeper can never be selected as its own twin and dropped.
    is_twin = (people["Player ID"] == pid_fill) & (people.index != keeper_idx)
    twin_matches = people.index[is_twin]
    if twin_matches.empty:
        skipped_pairs.append((sid_keep, pid_fill, "no separate row with this Player ID"))
        continue
    twin_idx = twin_matches[0]

    # The keeper was located by its SearchID, so only the Player ID can be missing.
    if pd.isna(people.at[keeper_idx, "Player ID"]):
        people.at[keeper_idx, "Player ID"] = pid_fill

    rows_to_drop.append(twin_idx)

people_clean = people.drop(index=rows_to_drop).reset_index(drop=True)
people_clean.to_csv("npb_People_combined_cleaned.csv", index=False)

# Report pairs that could not be merged instead of skipping them silently.
if skipped_pairs:
    print(f"Skipped {len(skipped_pairs)} of {len(pairs)} pairs:")
    for skipped_sid, skipped_pid, reason in skipped_pairs:
        print(f"  SearchID {skipped_sid} / Player ID {skipped_pid}: {reason}")