# Matching - Fighter IDs

In [1]:
# standard library imports
import os

# third party imports
import pandas as pd

# local imports


# Data lives two levels above the working directory. The previous spelling,
# os.path.dirname("__file__"), operated on a string literal and therefore
# always contributed an empty prefix; this is the same relative path.
data_dir = os.path.join(os.pardir, os.pardir, "data")
# Cleaned, per-site CSV exports
clean_data_dir = os.path.join(data_dir, "clean")

In [2]:
# Keep only events flagged as UFC events (this excludes WEC, Pride, etc.,
# which were merged into the UFC later).
ufcstats_dir = os.path.join(clean_data_dir, "UFC Stats")
ufcstats_events = pd.read_csv(os.path.join(ufcstats_dir, "events.csv"))
is_ufc_event = ufcstats_events["is_ufc_event"] == 1
event_ids = ufcstats_events.loc[is_ufc_event, "id"].unique().tolist()

# Bouts that took place on one of those events
ufcstats_bouts = pd.read_csv(os.path.join(ufcstats_dir, "bouts.csv"))
on_ufc_event = ufcstats_bouts["event_id"].isin(event_ids)
ufcstats_bouts = ufcstats_bouts.loc[on_ufc_event]

# Fighters listed as the red or blue fighter in at least one of those bouts
fighter_ids = set(ufcstats_bouts["red_fighter_id"]).union(
    ufcstats_bouts["blue_fighter_id"]
)
ufcstats_fighters = pd.read_csv(
    os.path.join(ufcstats_dir, "fighters.csv"),
    parse_dates=["date_of_birth"],
)
in_ufc_bout = ufcstats_fighters["id"].isin(fighter_ids)
ufcstats_fighters = ufcstats_fighters.loc[in_ufc_bout]

## Tapology, Sherdog, Best Fight Odds

We can exploit Tapology's aggregation of links to other sites to take a shortcut.

In [3]:
def _without(frame, id_col, used_ids):
    """Return a copy of ``frame`` minus the rows whose ``id_col`` is in ``used_ids``."""
    return frame.loc[~frame[id_col].isin(used_ids)].copy()


def _id_pairs(merged):
    """Reduce a merged frame to its (ufcstats_id, tapology_id) pairs.

    Columns are selected before renaming because the Tapology side already
    carries its own ``ufcstats_id`` column.
    """
    return merged.loc[:, ["id", "tapology_id"]].rename(columns={"id": "ufcstats_id"})


tapology_fighters = pd.read_csv(
    os.path.join(clean_data_dir, "Tapology", "fighters.csv"),
    parse_dates=["date_of_birth"],
).rename(columns={"id": "tapology_id"})
# Nullable integer dtype for the Best Fight Odds ID
tapology_fighters["bestfightodds_id"] = tapology_fighters["bestfightodds_id"].astype(
    "Int64"
)

# Stage 1: Tapology rows that already carry a UFC Stats ID
linked = ufcstats_fighters[["id"]].merge(
    tapology_fighters, left_on="id", right_on="ufcstats_id", how="inner"
)
temp_match1 = linked[["ufcstats_id", "tapology_id"]]

# Stage 2: identical full name among the fighters still unmatched
ufcstats_fighters2 = _without(ufcstats_fighters, "id", temp_match1["ufcstats_id"])
tapology_fighters2 = _without(
    tapology_fighters, "tapology_id", temp_match1["tapology_id"]
)
temp_match2 = _id_pairs(
    ufcstats_fighters2[["id", "name"]].merge(tapology_fighters2, on="name", how="inner")
)

# Stage 3: identical last whitespace-separated token of the name
ufcstats_fighters3 = _without(
    ufcstats_fighters2, "id", temp_match2["ufcstats_id"]
).assign(last_name=lambda df: df["name"].str.split().str[-1])
tapology_fighters3 = _without(
    tapology_fighters2, "tapology_id", temp_match2["tapology_id"]
).assign(last_name=lambda df: df["name"].str.split().str[-1])
temp_match3 = _id_pairs(
    ufcstats_fighters3[["id", "last_name"]].merge(
        tapology_fighters3, on="last_name", how="inner"
    )
)

# Stage 4: identical first whitespace-separated token of the name
ufcstats_fighters4 = _without(
    ufcstats_fighters3, "id", temp_match3["ufcstats_id"]
).assign(first_name=lambda df: df["name"].str.split().str[0])
tapology_fighters4 = _without(
    tapology_fighters3, "tapology_id", temp_match3["tapology_id"]
).assign(first_name=lambda df: df["name"].str.split().str[0])
temp_match4 = _id_pairs(
    ufcstats_fighters4[["id", "first_name"]].merge(
        tapology_fighters4, on="first_name", how="inner"
    )
)

# Stack the four stages into one UFC Stats -> Tapology table
stage_matches = [temp_match1, temp_match2, temp_match3, temp_match4]
all_matches = (
    pd.concat(stage_matches).sort_values("ufcstats_id").reset_index(drop=True)
)

# Bring along the Sherdog and Best Fight Odds IDs that Tapology records
tapology_links = tapology_fighters[["tapology_id", "sherdog_id", "bestfightodds_id"]]
matching1 = all_matches.merge(tapology_links, on="tapology_id", how="left")

## Fight Matrix

Fight Matrix links to Sherdog, so we can use the first matching from above.

In [4]:
# Fight Matrix: restrict to fighters who appear in a bout on an event flagged
# as a UFC event, then attach their Fight Matrix ID via the Sherdog ID that
# matching1 already carries.
fightmatrix_events = pd.read_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "events.csv")
)
event_ids = (
    fightmatrix_events.loc[fightmatrix_events["is_ufc_event"] == 1, "id"]
    .unique()
    .tolist()
)
fightmatrix_bouts = pd.read_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "bouts.csv")
)
fightmatrix_bouts = fightmatrix_bouts.loc[fightmatrix_bouts["event_id"].isin(event_ids)]
fighter_ids = set(fightmatrix_bouts["fighter_1_id"]) | set(
    fightmatrix_bouts["fighter_2_id"]
)
fightmatrix_fighters = pd.read_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "fighters.csv"),
    parse_dates=["pro_debut_date", "ufc_debut_date"],
)
# .copy() turns the filtered subset into an independent frame. Without it the
# column assignment below targets a slice of the full table and pandas emits
# SettingWithCopyWarning.
fightmatrix_fighters = fightmatrix_fighters.loc[
    fightmatrix_fighters["id"].isin(fighter_ids)
].copy()
# NOTE: astype(int) will raise if a selected fighter has a missing Sherdog ID.
fightmatrix_fighters["sherdog_id"] = fightmatrix_fighters["sherdog_id"].astype(int)
fightmatrix_fighters = fightmatrix_fighters.rename(columns={"id": "fightmatrix_id"})

# Left join: every row of matching1 is kept; fighters with no Fight Matrix
# counterpart get a missing fightmatrix_id.
matching2 = matching1.merge(
    fightmatrix_fighters[["fightmatrix_id", "sherdog_id"]], on="sherdog_id", how="left"
)
matching2 = matching2[["ufcstats_id", "fightmatrix_id"]]

## Bet MMA

Bet MMA also links to Sherdog, so we again use the first matching.

In [5]:
# Bet MMA: restrict to fighters who appear in a bout on an event flagged as a
# UFC event, then attach their Bet MMA ID via the Sherdog ID in matching1.
betmma_events = pd.read_csv(os.path.join(clean_data_dir, "Bet MMA", "events.csv"))
event_ids = (
    betmma_events.loc[betmma_events["is_ufc_event"] == 1, "id"].unique().tolist()
)
betmma_bouts = pd.read_csv(os.path.join(clean_data_dir, "Bet MMA", "bouts.csv"))
betmma_bouts = betmma_bouts.loc[betmma_bouts["event_id"].isin(event_ids)]
fighter_ids = set(betmma_bouts["fighter_1_id"]) | set(betmma_bouts["fighter_2_id"])
betmma_fighters = pd.read_csv(os.path.join(clean_data_dir, "Bet MMA", "fighters.csv"))
# .copy() turns the filtered subset into an independent frame. Without it the
# column assignment below targets a slice of the full table and pandas emits
# SettingWithCopyWarning.
betmma_fighters = betmma_fighters.loc[betmma_fighters["id"].isin(fighter_ids)].copy()
# NOTE: astype(int) will raise if a selected fighter has a missing Sherdog ID.
betmma_fighters["sherdog_id"] = betmma_fighters["sherdog_id"].astype(int)
betmma_fighters = betmma_fighters.rename(columns={"id": "betmma_id"})

# Inner join: only fighters that have a Bet MMA counterpart are kept here; the
# final combine step left-joins this table back onto the full fighter list.
matching3 = matching1.merge(
    betmma_fighters[["betmma_id", "sherdog_id"]], on="sherdog_id", how="inner"
)
matching3 = matching3[["ufcstats_id", "betmma_id"]]

## ESPN

In [6]:
# ESPN: map UFC Stats fighters to ESPN fighters through a cascade of keys
# (full name, last name, first name, nickname, date of birth). Each stage only
# considers the fighters left unmatched by the stage before it.
espn_bouts = pd.read_csv(os.path.join(clean_data_dir, "ESPN", "bouts.csv"))
fighter_ids = set(espn_bouts["fighter_1_id"]) | set(espn_bouts["fighter_2_id"])
espn_fighters = pd.read_csv(
    os.path.join(clean_data_dir, "ESPN", "fighters.csv"), parse_dates=["date_of_birth"]
)
espn_fighters = espn_fighters.loc[espn_fighters["id"].isin(fighter_ids)]
espn_fighters = espn_fighters.rename(columns={"id": "espn_id"})

# Match on full name
# keep=False drops every ESPN row whose name is shared, so only names that are
# unique on the ESPN side can match.
espn_fighters1 = espn_fighters.copy()
espn_fighters1 = espn_fighters1.drop_duplicates(subset=["name"], keep=False)
temp_match1 = ufcstats_fighters[["id", "name"]].merge(
    espn_fighters1, on="name", how="inner"
)
temp_match1 = temp_match1[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# Match on last name
# The *_temp frames carry the derived key and the uniqueness filter; the
# un-suffixed frames stay intact so later stages still see every unmatched row.
espn_fighters2 = espn_fighters.loc[
    ~espn_fighters["espn_id"].isin(temp_match1["espn_id"])
].copy()
espn_fighters2_temp = espn_fighters2.copy()
espn_fighters2_temp["last_name"] = espn_fighters2["name"].str.split().str[-1]
espn_fighters2_temp = espn_fighters2_temp.drop_duplicates(
    subset=["last_name"], keep=False
)
ufcstats_fighters2 = ufcstats_fighters.loc[
    ~ufcstats_fighters["id"].isin(temp_match1["ufcstats_id"])
].copy()
ufcstats_fighters2_temp = ufcstats_fighters2.copy()
ufcstats_fighters2_temp["last_name"] = (
    ufcstats_fighters2_temp["name"].str.split().str[-1]
)
ufcstats_fighters2_temp = ufcstats_fighters2_temp.drop_duplicates(
    subset=["last_name"], keep=False
)
temp_match2 = ufcstats_fighters2_temp[["id", "last_name"]].merge(
    espn_fighters2_temp, on="last_name", how="inner"
)
temp_match2 = temp_match2[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# Match on first name
espn_fighters3 = espn_fighters2.loc[
    ~espn_fighters2["espn_id"].isin(temp_match2["espn_id"])
].copy()
espn_fighters3_temp = espn_fighters3.copy()
espn_fighters3_temp["first_name"] = espn_fighters3["name"].str.split().str[0]
espn_fighters3_temp = espn_fighters3_temp.drop_duplicates(
    subset=["first_name"], keep=False
)
ufcstats_fighters3 = ufcstats_fighters2.loc[
    ~ufcstats_fighters2["id"].isin(temp_match2["ufcstats_id"])
].copy()
ufcstats_fighters3_temp = ufcstats_fighters3.copy()
ufcstats_fighters3_temp["first_name"] = ufcstats_fighters3["name"].str.split().str[0]
ufcstats_fighters3_temp = ufcstats_fighters3_temp.drop_duplicates(
    subset=["first_name"], keep=False
)
temp_match3 = ufcstats_fighters3_temp[["id", "first_name"]].merge(
    espn_fighters3_temp, on="first_name", how="inner"
)
temp_match3 = temp_match3[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# Match on nickname
# Rows without a nickname are removed from both sides before joining.
espn_fighters4 = espn_fighters3.loc[
    ~espn_fighters3["espn_id"].isin(temp_match3["espn_id"])
].copy()
espn_fighters4_temp = espn_fighters4.copy()
espn_fighters4_temp = espn_fighters4_temp.dropna(subset=["nickname"])
ufcstats_fighters4 = ufcstats_fighters3.loc[
    ~ufcstats_fighters3["id"].isin(temp_match3["ufcstats_id"])
].copy()
ufcstats_fighters4_temp = ufcstats_fighters4.copy()
ufcstats_fighters4_temp = ufcstats_fighters4_temp.dropna(subset=["nickname"])
temp_match4 = ufcstats_fighters4_temp[["id", "nickname"]].merge(
    espn_fighters4_temp, on="nickname", how="inner"
)
temp_match4 = temp_match4[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# Join on dob
espn_fighters5 = espn_fighters4.loc[
    ~espn_fighters4["espn_id"].isin(temp_match4["espn_id"])
].copy()
ufcstats_fighters5 = ufcstats_fighters4.loc[
    ~ufcstats_fighters4["id"].isin(temp_match4["ufcstats_id"])
].copy()
# pandas merge matches null keys to each other, so rows with a missing date of
# birth are removed from both sides first. Otherwise every UFC Stats fighter
# without a DOB would be paired with every ESPN fighter without a DOB.
temp_match5 = ufcstats_fighters5.loc[
    ufcstats_fighters5["date_of_birth"].notnull(), ["id", "date_of_birth"]
].merge(
    espn_fighters5.loc[espn_fighters5["date_of_birth"].notnull()],
    on="date_of_birth",
    how="inner",
)
temp_match5 = temp_match5[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# Stack all stages into one UFC Stats -> ESPN table
matching4 = (
    pd.concat([temp_match1, temp_match2, temp_match3, temp_match4, temp_match5])
    .sort_values("ufcstats_id")
    .reset_index(drop=True)
)

## FightOdds.io

In [7]:
# FightOdds.io: map UFC Stats fighters to FightOdds.io fighters through a
# cascade of keys (full name, last name, first name, nickname, date of birth,
# reversed name order) followed by a hand-written table. Each stage only
# considers the fighters left unmatched by the stage before it.
#
# Throughout this cell:
#   * drop_duplicates(subset=[key], keep=False) discards every row whose key is
#     shared, so only keys that are unique on each side can match;
#   * the trailing drop_duplicates on ufcstats_id and then fightoddsio_id keep
#     the first row for any ID that still appears more than once.
fightoddsio_fighters = pd.read_csv(
    os.path.join(clean_data_dir, "FightOdds.io", "fighters.csv"),
    parse_dates=["date_of_birth"],
)
fightoddsio_fighters = fightoddsio_fighters.rename(columns={"id": "fightoddsio_id"})

# Match on full name
# Rows with a missing name are removed from both sides before joining.
fightoddsio_fighters1 = fightoddsio_fighters.copy()
ufcstats_fighters1 = ufcstats_fighters.copy()
temp_match1 = (
    ufcstats_fighters1.loc[ufcstats_fighters1["name"].notnull(), ["id", "name"]]
    .drop_duplicates(subset=["name"], keep=False)
    .merge(
        fightoddsio_fighters1.loc[
            fightoddsio_fighters1["name"].notnull()
        ].drop_duplicates(subset=["name"], keep=False),
        on="name",
        how="inner",
    )
)
temp_match1 = (
    temp_match1[["id", "fightoddsio_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["fightoddsio_id"])
)

# Match on last name
# The *_temp frames carry the derived key; the un-suffixed frames stay intact
# so later stages still see every unmatched row.
fightoddsio_fighters2 = fightoddsio_fighters1.loc[
    ~fightoddsio_fighters1["fightoddsio_id"].isin(temp_match1["fightoddsio_id"])
].copy()
ufcstats_fighters2 = ufcstats_fighters1.loc[
    ~ufcstats_fighters1["id"].isin(temp_match1["ufcstats_id"])
].copy()
fightoddsio_fighters2_temp = fightoddsio_fighters2.copy()
ufcstats_fighters2_temp = ufcstats_fighters2.copy()
fightoddsio_fighters2_temp["last_name"] = (
    fightoddsio_fighters2_temp["name"].str.split().str[-1]
)
ufcstats_fighters2_temp["last_name"] = (
    ufcstats_fighters2_temp["name"].str.split().str[-1]
)
# NOTE(review): UFC Stats fighter cdadae5363b66eef is excluded from last-name
# matching -- presumably a known false positive; the reason is not recorded
# here, please confirm.
temp_match2 = (
    ufcstats_fighters2_temp.loc[
        ufcstats_fighters2_temp["id"] != "cdadae5363b66eef", ["id", "last_name"]
    ]
    .drop_duplicates(subset=["last_name"], keep=False)
    .merge(
        fightoddsio_fighters2_temp.drop_duplicates(subset=["last_name"], keep=False),
        on="last_name",
        how="inner",
    )
)
temp_match2 = (
    temp_match2[["id", "fightoddsio_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["fightoddsio_id"])
)

# Match on first name
fightoddsio_fighters3 = fightoddsio_fighters2.loc[
    ~fightoddsio_fighters2["fightoddsio_id"].isin(temp_match2["fightoddsio_id"])
].copy()
ufcstats_fighters3 = ufcstats_fighters2.loc[
    ~ufcstats_fighters2["id"].isin(temp_match2["ufcstats_id"])
].copy()
fightoddsio_fighters3_temp = fightoddsio_fighters3.copy()
ufcstats_fighters3_temp = ufcstats_fighters3.copy()
fightoddsio_fighters3_temp["first_name"] = (
    fightoddsio_fighters3_temp["name"].str.split().str[0]
)
ufcstats_fighters3_temp["first_name"] = (
    ufcstats_fighters3_temp["name"].str.split().str[0]
)
# Two UFC Stats fighters are excluded from first-name matching.
# f2900678e98f6d6a is matched by hand in temp_match7 below.
# NOTE(review): f8f1ec513bcfef43 has no manual entry in this cell -- presumably
# a known false positive with no FightOdds.io counterpart; please confirm.
temp_match3 = (
    ufcstats_fighters3_temp.loc[
        ~ufcstats_fighters3_temp["id"].isin(["f2900678e98f6d6a", "f8f1ec513bcfef43"]),
        ["id", "first_name"],
    ]
    .drop_duplicates(subset=["first_name"], keep=False)
    .merge(
        fightoddsio_fighters3_temp.drop_duplicates(subset=["first_name"], keep=False),
        on="first_name",
        how="inner",
    )
)
temp_match3 = (
    temp_match3[["id", "fightoddsio_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["fightoddsio_id"])
)

# Match on nickname
# Rows with a missing nickname are removed from both sides before joining.
fightoddsio_fighters4 = fightoddsio_fighters3.loc[
    ~fightoddsio_fighters3["fightoddsio_id"].isin(temp_match3["fightoddsio_id"])
].copy()
ufcstats_fighters4 = ufcstats_fighters3.loc[
    ~ufcstats_fighters3["id"].isin(temp_match3["ufcstats_id"])
].copy()
temp_match4 = (
    ufcstats_fighters4.loc[ufcstats_fighters4["nickname"].notnull(), ["id", "nickname"]]
    .drop_duplicates(subset=["nickname"], keep=False)
    .merge(
        fightoddsio_fighters4.loc[
            fightoddsio_fighters4["nickname"].notnull()
        ].drop_duplicates(subset=["nickname"], keep=False),
        on="nickname",
        how="inner",
    )
)
temp_match4 = (
    temp_match4[["id", "fightoddsio_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["fightoddsio_id"])
)

# Match on date of birth
# Rows with a missing date of birth are removed from both sides before joining.
fightoddsio_fighters5 = fightoddsio_fighters4.loc[
    ~fightoddsio_fighters4["fightoddsio_id"].isin(temp_match4["fightoddsio_id"])
].copy()
ufcstats_fighters5 = ufcstats_fighters4.loc[
    ~ufcstats_fighters4["id"].isin(temp_match4["ufcstats_id"])
].copy()
temp_match5 = (
    ufcstats_fighters5.loc[
        ufcstats_fighters5["date_of_birth"].notnull(), ["id", "date_of_birth"]
    ]
    .drop_duplicates(subset=["date_of_birth"], keep=False)
    .merge(
        fightoddsio_fighters5.loc[
            fightoddsio_fighters5["date_of_birth"].notnull()
        ].drop_duplicates(subset=["date_of_birth"], keep=False),
        on="date_of_birth",
        how="inner",
    )
)
temp_match5 = (
    temp_match5[["id", "fightoddsio_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["fightoddsio_id"])
)

# Swap first and last name order
# The FightOdds.io name has its whitespace-separated tokens reversed and is
# compared against the UFC Stats name taken as-is.
fightoddsio_fighters6 = fightoddsio_fighters5.loc[
    ~fightoddsio_fighters5["fightoddsio_id"].isin(temp_match5["fightoddsio_id"])
].copy()
ufcstats_fighters6 = ufcstats_fighters5.loc[
    ~ufcstats_fighters5["id"].isin(temp_match5["ufcstats_id"])
].copy()
fightoddsio_fighters6_temp = fightoddsio_fighters6.copy()
ufcstats_fighters6_temp = ufcstats_fighters6.copy()
# NOTE(review): the lambda slices each split result, so it assumes no
# FightOdds.io name is missing at this stage -- confirm against the data.
fightoddsio_fighters6_temp["last_name_first"] = (
    fightoddsio_fighters6_temp["name"].str.split().apply(lambda x: " ".join(x[::-1]))
)
ufcstats_fighters6_temp["last_name_first"] = ufcstats_fighters6_temp["name"].copy()
temp_match6 = (
    ufcstats_fighters6_temp[["id", "last_name_first"]]
    .drop_duplicates(subset=["last_name_first"], keep=False)
    .merge(
        fightoddsio_fighters6_temp.drop_duplicates(
            subset=["last_name_first"], keep=False
        ),
        on="last_name_first",
        how="inner",
    )
)
temp_match6 = (
    temp_match6[["id", "fightoddsio_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["fightoddsio_id"])
)

# Manually match the rest
# The two lists are positional: entry i of ufcstats_id pairs with entry i of
# fightoddsio_id.
temp_match7 = pd.DataFrame(
    {
        "ufcstats_id": [
            "0a73acff6325c1e2",
            "225def29ecfe0fc1",
            "2296125b6c362355",
            "28d421729451c8ca",
            "7d420039bbfe7c1a",
            "7debc13b36343605",
            "9edf2c9082cc2cd8",
            "a13d755965a4ec9f",
            "adccbc19b22e19af",
            "f2900678e98f6d6a",
        ],
        "fightoddsio_id": [
            "RmlnaHRlck5vZGU6MzgwMTk=",
            "RmlnaHRlck5vZGU6MzAwMjY=",
            "RmlnaHRlck5vZGU6MjM2MzQ=",
            "RmlnaHRlck5vZGU6MTE0NTk=",
            "RmlnaHRlck5vZGU6Mjg5MDU=",
            "RmlnaHRlck5vZGU6MzA2MzE=",
            "RmlnaHRlck5vZGU6NDI5Mzg=",
            "RmlnaHRlck5vZGU6MjYyMjI=",
            "RmlnaHRlck5vZGU6NDU1NzI=",
            "RmlnaHRlck5vZGU6MzUxNTk=",
        ],
    }
)

# Stack all stages into one UFC Stats -> FightOdds.io table
matching5 = (
    pd.concat(
        [
            temp_match1,
            temp_match2,
            temp_match3,
            temp_match4,
            temp_match5,
            temp_match6,
            temp_match7,
        ]
    )
    .sort_values("ufcstats_id")
    .reset_index(drop=True)
)

# Matching stats for paper
# Each line is the fraction of rows in matching5 contributed by one stage, in
# stage order (full name first, manual table last).
print(temp_match1.shape[0] / matching5.shape[0])
print(temp_match2.shape[0] / matching5.shape[0])
print(temp_match3.shape[0] / matching5.shape[0])
print(temp_match4.shape[0] / matching5.shape[0])
print(temp_match5.shape[0] / matching5.shape[0])
print(temp_match6.shape[0] / matching5.shape[0])
print(temp_match7.shape[0] / matching5.shape[0])

0.8938503721112416
0.04778691735213474
0.0282021151586369
0.014884449667058363
0.010184097140618879
0.0011750881316098707
0.0039169604386995694


## MMA Decisions

In [8]:
# MMA Decisions: map UFC Stats fighters to MMA Decisions fighters. The UFC
# Stats side is first narrowed to a candidate subset (see below), then matched
# through a cascade of keys (full name, date of birth, nickname) followed by a
# hand-written table. Each stage only considers fighters left unmatched by the
# stage before it.

# Subset UFC fighters only
mmadecisions_events = pd.read_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "events.csv"), parse_dates=["date"]
)
mmadecisions_events = mmadecisions_events.loc[
    mmadecisions_events["promotion"] == "UFC"
].reset_index(drop=True)
mmadecisions_bouts = pd.read_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "bouts.csv")
)
mmadecisions_bouts = mmadecisions_bouts.loc[
    mmadecisions_bouts["event_id"].isin(mmadecisions_events["id"])
]
fighter_ids = set(mmadecisions_bouts["fighter_1_id"]) | set(
    mmadecisions_bouts["fighter_2_id"]
)
mmadecisions_fighters = pd.read_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "fighters.csv"),
    parse_dates=["date_of_birth"],
)
mmadecisions_fighters = mmadecisions_fighters.loc[
    mmadecisions_fighters["id"].isin(fighter_ids)
].rename(columns={"id": "mmadecisions_id"})

# Find fighters in UFC Stats that had decisions
# Only UFC Stats events that have an MMA Decisions ID in event_mapping.csv are
# considered. Relies on ufcstats_bouts and ufcstats_fighters built in the
# UFC Stats cell at the top of the notebook.
event_mapping = pd.read_csv(os.path.join(clean_data_dir, "event_mapping.csv"))
event_ids_subset = event_mapping.loc[
    event_mapping["mmadecisions_id"].notnull(), "ufcstats_id"
].values.tolist()
ufcstats_bouts_subset = ufcstats_bouts.copy()
# A bout qualifies when its outcome_method contains "Decision" or its
# outcome_method_details contains a literal period (regex r"\.").
# NOTE(review): presumably the period identifies detail text that holds
# scorecards -- confirm against the UFC Stats data.
ufcstats_bouts_subset = ufcstats_bouts_subset.loc[
    (ufcstats_bouts_subset["event_id"].isin(event_ids_subset))
    & (
        (ufcstats_bouts_subset["outcome_method"].str.contains("Decision"))
        | (ufcstats_bouts_subset["outcome_method_details"].str.contains(r"\."))
    )
]
# Candidate fighters: anyone in a qualifying bout, plus one fighter added by
# ID, minus three fighters removed by ID.
# NOTE(review): the reasons for the hard-coded include/exclude IDs are not
# recorded here -- presumably manual corrections; please confirm.
ufcstats_fighters_subset = ufcstats_fighters.loc[
    (
        (ufcstats_fighters["id"].isin(ufcstats_bouts_subset["red_fighter_id"]))
        | (ufcstats_fighters["id"].isin(ufcstats_bouts_subset["blue_fighter_id"]))
        | (ufcstats_fighters["id"] == "429e7d3725852ce9")
    )
    & (
        ~ufcstats_fighters["id"].isin(
            ["326f94d6cfb1bf25", "6cadfd8f1d9e7685", "a54a35a670d8e852"]
        )
    )
].copy()

# Match on full name
# keep=False discards every row whose name is shared, so only names that are
# unique on each side can match. The trailing drop_duplicates calls keep the
# first row for any ID that still appears more than once.
mmadecisions_fighters1 = mmadecisions_fighters.copy()
mmadecisions_fighters1 = mmadecisions_fighters1.drop_duplicates(
    subset=["name"], keep=False
)
temp_match1 = (
    ufcstats_fighters_subset[["id", "name"]]
    .drop_duplicates(subset=["name"], keep=False)
    .merge(mmadecisions_fighters1, on="name", how="inner")
)
temp_match1 = (
    temp_match1[["id", "mmadecisions_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["mmadecisions_id"])
)

# Match on date of birth
# Starts again from the full mmadecisions_fighters table (not the
# name-deduplicated mmadecisions_fighters1), so fighters dropped for sharing a
# name are candidates again. Rows with a missing date of birth are removed
# from both sides before joining.
mmadecisions_fighters2 = mmadecisions_fighters.loc[
    ~mmadecisions_fighters["mmadecisions_id"].isin(temp_match1["mmadecisions_id"])
].copy()
ufcstats_fighters_subset2 = ufcstats_fighters_subset.loc[
    ~ufcstats_fighters_subset["id"].isin(temp_match1["ufcstats_id"])
].copy()
temp_match2 = (
    ufcstats_fighters_subset2.loc[
        ufcstats_fighters_subset2["date_of_birth"].notnull(), ["id", "date_of_birth"]
    ]
    .drop_duplicates(subset=["date_of_birth"], keep=False)
    .merge(
        mmadecisions_fighters2.loc[
            mmadecisions_fighters2["date_of_birth"].notnull()
        ].drop_duplicates(subset=["date_of_birth"], keep=False),
        on="date_of_birth",
        how="inner",
    )
)
temp_match2 = (
    temp_match2[["id", "mmadecisions_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["mmadecisions_id"])
)

# Match on nickname
# The key column is "nickname" on the UFC Stats side and "nicknames" on the
# MMA Decisions side; the join requires the two strings to be exactly equal.
mmadecisions_fighters3 = mmadecisions_fighters2.loc[
    ~mmadecisions_fighters2["mmadecisions_id"].isin(temp_match2["mmadecisions_id"])
].copy()
ufcstats_fighters_subset3 = ufcstats_fighters_subset2.loc[
    ~ufcstats_fighters_subset2["id"].isin(temp_match2["ufcstats_id"])
].copy()
temp_match3 = (
    ufcstats_fighters_subset3.loc[
        ufcstats_fighters_subset3["nickname"].notnull(), ["id", "nickname"]
    ]
    .drop_duplicates(subset=["nickname"], keep=False)
    .merge(
        mmadecisions_fighters3.loc[
            mmadecisions_fighters3["nicknames"].notnull()
        ].drop_duplicates(subset=["nicknames"], keep=False),
        left_on="nickname",
        right_on="nicknames",
        how="inner",
    )
)
temp_match3 = (
    temp_match3[["id", "mmadecisions_id"]]
    .rename(columns={"id": "ufcstats_id"})
    .drop_duplicates(subset=["ufcstats_id"])
    .drop_duplicates(subset=["mmadecisions_id"])
)

# Manually match rest
# The two lists are positional: entry i of ufcstats_id pairs with entry i of
# mmadecisions_id.
temp_match4 = pd.DataFrame(
    {
        "ufcstats_id": [
            "1235b31de15d0c6e",
            "2296125b6c362355",
            "31bbd39c0a075d4e",
            "3ec1e4ba98c9c85a",
            "6cbb7661c3258617",
            "6fd953151d981979",
            "73e09f837f3b5ecc",
            "8b5f9ea38184ded3",
            "8ce87f7e3a9baed2",
            "c4b81cdecd5d6abe",
            "edd02825c29028fe",
        ],
        "mmadecisions_id": [
            5043,
            4552,
            4851,
            3160,
            34,
            4077,
            724,
            2685,
            3062,
            740,
            3643,
        ],
    }
)

# Stack all stages into one UFC Stats -> MMA Decisions table
matching6 = (
    pd.concat([temp_match1, temp_match2, temp_match3, temp_match4])
    .sort_values("ufcstats_id")
    .reset_index(drop=True)
)

## Combine all

In [9]:
# Start from the Tapology-derived table (it also carries the Sherdog and Best
# Fight Odds IDs) and left-join each remaining site's mapping onto it, so every
# row of matching1 is kept whether or not the other sites have a counterpart.
matching = matching1
for site_matching in (matching2, matching3, matching4, matching5, matching6):
    matching = matching.merge(site_matching, on="ufcstats_id", how="left")

# Nullable integer dtype for these two ID columns, which the left joins may
# have left with missing values
matching = matching.astype({"betmma_id": "Int64", "mmadecisions_id": "Int64"})

# Fixed column order for the exported mapping
output_columns = [
    "ufcstats_id",
    "bestfightodds_id",
    "betmma_id",
    "espn_id",
    "fightmatrix_id",
    "fightoddsio_id",
    "mmadecisions_id",
    "sherdog_id",
    "tapology_id",
]
matching = matching[output_columns]

fighter_mapping_path = os.path.join(clean_data_dir, "fighter_mapping.csv")
matching.to_csv(fighter_mapping_path, index=False)