In [289]:
# standard library imports
import os

# third party imports
import pandas as pd

# local imports


# Data locations are resolved relative to the notebook's working directory:
# the shared data folder sits two levels up, with the cleaned CSVs in "clean".
data_dir = os.path.join(os.pardir, os.pardir, "data")
clean_data_dir = os.path.join(data_dir, "clean")

In [290]:
# Load the UFC Stats tables and narrow them down to fighters who appear in a
# bout at an event flagged as a UFC event.
ufcstats_dir = os.path.join(clean_data_dir, "UFC Stats")

# ids of the events flagged with is_ufc_event == 1
ufcstats_events = pd.read_csv(os.path.join(ufcstats_dir, "events.csv"))
is_ufc_event = ufcstats_events["is_ufc_event"] == 1
event_ids = ufcstats_events.loc[is_ufc_event, "id"].unique().tolist()

# bouts held at those events, and every fighter id seen in either corner
ufcstats_bouts = pd.read_csv(os.path.join(ufcstats_dir, "bouts.csv"))
ufcstats_bouts = ufcstats_bouts[ufcstats_bouts["event_id"].isin(event_ids)]
fighter_ids = set(ufcstats_bouts["red_fighter_id"]).union(ufcstats_bouts["blue_fighter_id"])

# fighter profiles restricted to that set of ids
ufcstats_fighters = pd.read_csv(os.path.join(ufcstats_dir, "fighters.csv"), parse_dates=["date_of_birth"])
ufcstats_fighters = ufcstats_fighters[ufcstats_fighters["id"].isin(fighter_ids)]
ufcstats_fighters

Unnamed: 0,id,name,nickname,height_inches,reach_inches,stance,date_of_birth
0,002ca196477ce572,Gabriel Silva,,66.0,71.0,Orthodox,1994-08-26
1,003d82fa384ca1d0,Aalon Cruz,,72.0,78.0,Switch,1989-09-20
2,0052de90691d4a93,Davi Ramos,Tasmanian Devil,66.0,70.0,Orthodox,1986-11-05
6,008dc37cca279def,Sean McCorkle,,79.0,81.0,Orthodox,1976-07-17
7,008ea710276c9606,Jeff Molina,El Jefe,66.0,69.0,Orthodox,1997-07-17
...,...,...,...,...,...,...,...
3856,ffc088e64fab57e9,Quinton Jackson,Rampage,73.0,73.0,Orthodox,1978-06-20
3857,ffc3e6daaa6da0b7,Johnny Bedford,Brutal,70.0,71.0,Orthodox,1983-01-06
3858,ffd3224638c01b57,Jean Matsumoto,,66.0,68.0,Orthodox,1999-09-09
3859,ffdeb4fbea09ce75,Jessica Rakoczy,The Ragin',67.0,,Orthodox,1977-04-14


## Tapology, Sherdog, Best Fight Odds

In [291]:
# Link UFC Stats fighters to Tapology fighters in four passes of decreasing
# reliability. Each pass only looks at fighters (on both sides) that no earlier
# pass has matched. The Tapology rows also carry sherdog_id and
# bestfightodds_id, which are attached to every match at the end.
tapology_fighters = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "fighters.csv"), parse_dates=["date_of_birth"])
# nullable integer dtype so missing Best Fight Odds ids stay <NA>
tapology_fighters["bestfightodds_id"] = tapology_fighters["bestfightodds_id"].astype("Int64")
tapology_fighters = tapology_fighters.rename(columns={"id": "tapology_id"})

# match where ufcstats id is available
temp_match1 = ufcstats_fighters[["id"]].merge(tapology_fighters, left_on="id", right_on="ufcstats_id", how="inner")
temp_match1 = temp_match1[["ufcstats_id", "tapology_id"]]

# match on full name
ufcstats_fighters2 = ufcstats_fighters.loc[~ufcstats_fighters["id"].isin(temp_match1["ufcstats_id"])].copy()
tapology_fighters2 = tapology_fighters.loc[~tapology_fighters["tapology_id"].isin(temp_match1["tapology_id"])].copy()
temp_match2 = ufcstats_fighters2[["id", "name"]].merge(tapology_fighters2, on="name", how="inner")
temp_match2 = temp_match2[["id", "tapology_id"]].rename(columns={"id": "ufcstats_id"})

# match on last name
# A last name alone is a weak key: a shared surname would make the merge emit
# every cross pairing, and null keys match each other in pandas merges. Only
# last names that are present and unique on each side are used, mirroring the
# drop_duplicates(keep=False) guard of the ESPN / FightOdds.io cells.
ufcstats_fighters3 = ufcstats_fighters2.loc[~ufcstats_fighters2["id"].isin(temp_match2["ufcstats_id"])].copy()
ufcstats_fighters3["last_name"] = ufcstats_fighters3["name"].str.split().str[-1]
ufcstats_fighters3_temp = ufcstats_fighters3.dropna(subset=["last_name"]).drop_duplicates(subset=["last_name"], keep=False)
tapology_fighters3 = tapology_fighters2.loc[~tapology_fighters2["tapology_id"].isin(temp_match2["tapology_id"])].copy()
tapology_fighters3["last_name"] = tapology_fighters3["name"].str.split().str[-1]
tapology_fighters3_temp = tapology_fighters3.dropna(subset=["last_name"]).drop_duplicates(subset=["last_name"], keep=False)
temp_match3 = ufcstats_fighters3_temp[["id", "last_name"]].merge(tapology_fighters3_temp, on="last_name", how="inner")
temp_match3 = temp_match3[["id", "tapology_id"]].rename(columns={"id": "ufcstats_id"})

# match on first name (same uniqueness guard as the last-name pass)
ufcstats_fighters4 = ufcstats_fighters3.loc[~ufcstats_fighters3["id"].isin(temp_match3["ufcstats_id"])].copy()
ufcstats_fighters4["first_name"] = ufcstats_fighters4["name"].str.split().str[0]
ufcstats_fighters4_temp = ufcstats_fighters4.dropna(subset=["first_name"]).drop_duplicates(subset=["first_name"], keep=False)
tapology_fighters4 = tapology_fighters3.loc[~tapology_fighters3["tapology_id"].isin(temp_match3["tapology_id"])].copy()
tapology_fighters4["first_name"] = tapology_fighters4["name"].str.split().str[0]
tapology_fighters4_temp = tapology_fighters4.dropna(subset=["first_name"]).drop_duplicates(subset=["first_name"], keep=False)
temp_match4 = ufcstats_fighters4_temp[["id", "first_name"]].merge(tapology_fighters4_temp, on="first_name", how="inner")
temp_match4 = temp_match4[["id", "tapology_id"]].rename(columns={"id": "ufcstats_id"})

# stack the four passes into one ufcstats_id -> tapology_id table
all_matches = pd.concat([temp_match1, temp_match2, temp_match3, temp_match4]).sort_values("ufcstats_id").reset_index(drop=True)

# bring along the Sherdog and Best Fight Odds ids stored on the Tapology rows
matching1 = all_matches.merge(tapology_fighters[["tapology_id", "sherdog_id", "bestfightodds_id"]], on="tapology_id", how="left")

## Fight Matrix

In [292]:
# Link UFC Stats fighters to Fight Matrix fighters through the sherdog_id
# column that matching1 and the Fight Matrix fighter table both carry.
fightmatrix_events = pd.read_csv(os.path.join(clean_data_dir, "Fight Matrix", "events.csv"))
event_ids = fightmatrix_events.loc[fightmatrix_events["is_ufc_event"] == 1, "id"].unique().tolist()
fightmatrix_bouts = pd.read_csv(os.path.join(clean_data_dir, "Fight Matrix", "bouts.csv"))
fightmatrix_bouts = fightmatrix_bouts.loc[fightmatrix_bouts["event_id"].isin(event_ids)]
fighter_ids = set(fightmatrix_bouts["fighter_1_id"]) | set(fightmatrix_bouts["fighter_2_id"])
fightmatrix_fighters = pd.read_csv(os.path.join(clean_data_dir, "Fight Matrix", "fighters.csv"), parse_dates=["pro_debut_date", "ufc_debut_date"])
# .copy() makes the filtered frame independent of the frame read from disk, so
# the column assignment below does not raise pandas' SettingWithCopyWarning
fightmatrix_fighters = fightmatrix_fighters.loc[fightmatrix_fighters["id"].isin(fighter_ids)].copy()
# NOTE: astype(int) raises if any remaining fighter has a missing sherdog_id
fightmatrix_fighters["sherdog_id"] = fightmatrix_fighters["sherdog_id"].astype(int)
fightmatrix_fighters = fightmatrix_fighters.rename(columns={"id": "fightmatrix_id"})

# left join keeps every row of matching1; fighters without a Fight Matrix
# profile get a missing fightmatrix_id
matching2 = matching1.merge(fightmatrix_fighters[["fightmatrix_id", "sherdog_id"]], on="sherdog_id", how="left")
matching2 = matching2[["ufcstats_id", "fightmatrix_id"]]

## Bet MMA

In [293]:
# Link UFC Stats fighters to Bet MMA fighters through the sherdog_id column
# that matching1 and the Bet MMA fighter table both carry.
betmma_events = pd.read_csv(os.path.join(clean_data_dir, "Bet MMA", "events.csv"))
event_ids = betmma_events.loc[betmma_events["is_ufc_event"] == 1, "id"].unique().tolist()
betmma_bouts = pd.read_csv(os.path.join(clean_data_dir, "Bet MMA", "bouts.csv"))
betmma_bouts = betmma_bouts.loc[betmma_bouts["event_id"].isin(event_ids)]
fighter_ids = set(betmma_bouts["fighter_1_id"]) | set(betmma_bouts["fighter_2_id"])
betmma_fighters = pd.read_csv(os.path.join(clean_data_dir, "Bet MMA", "fighters.csv"))
# .copy() makes the filtered frame independent of the frame read from disk, so
# the column assignment below does not raise pandas' SettingWithCopyWarning
betmma_fighters = betmma_fighters.loc[betmma_fighters["id"].isin(fighter_ids)].copy()
# NOTE: astype(int) raises if any remaining fighter has a missing sherdog_id
betmma_fighters["sherdog_id"] = betmma_fighters["sherdog_id"].astype(int)
betmma_fighters = betmma_fighters.rename(columns={"id": "betmma_id"})

# inner join: only fighters that have a Bet MMA profile appear in matching3
matching3 = matching1.merge(betmma_fighters[["betmma_id", "sherdog_id"]], on="sherdog_id", how="inner")
matching3 = matching3[["ufcstats_id", "betmma_id"]]

## ESPN

In [294]:
# Link UFC Stats fighters to ESPN fighters. There is no shared id, so the match
# runs in passes of decreasing reliability (full name, last name, first name,
# nickname, date of birth); each pass only considers fighters left unmatched
# by the previous ones.
espn_bouts = pd.read_csv(os.path.join(clean_data_dir, "ESPN", "bouts.csv"))
fighter_ids = set(espn_bouts["fighter_1_id"]) | set(espn_bouts["fighter_2_id"])
espn_fighters = pd.read_csv(os.path.join(clean_data_dir, "ESPN", "fighters.csv"), parse_dates=["date_of_birth"])
espn_fighters = espn_fighters.loc[espn_fighters["id"].isin(fighter_ids)]
espn_fighters = espn_fighters.rename(columns={"id": "espn_id"})

# match on full name
# keep=False drops every ESPN name that occurs more than once, so an ambiguous
# name is never used as a key
espn_fighters1 = espn_fighters.copy()
espn_fighters1 = espn_fighters1.drop_duplicates(subset=["name"], keep=False)
temp_match1 = ufcstats_fighters[["id", "name"]].merge(espn_fighters1, on="name", how="inner")
temp_match1 = temp_match1[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# match on last name
espn_fighters2 = espn_fighters.loc[~espn_fighters["espn_id"].isin(temp_match1["espn_id"])].copy()
espn_fighters2_temp = espn_fighters2.copy()
espn_fighters2_temp["last_name"] = espn_fighters2["name"].str.split().str[-1]
espn_fighters2_temp = espn_fighters2_temp.drop_duplicates(subset=["last_name"], keep=False)
ufcstats_fighters2 = ufcstats_fighters.loc[~ufcstats_fighters["id"].isin(temp_match1["ufcstats_id"])].copy()
ufcstats_fighters2_temp = ufcstats_fighters2.copy()
ufcstats_fighters2_temp["last_name"] = ufcstats_fighters2_temp["name"].str.split().str[-1]
ufcstats_fighters2_temp = ufcstats_fighters2_temp.drop_duplicates(subset=["last_name"], keep=False)
temp_match2 = ufcstats_fighters2_temp[["id", "last_name"]].merge(espn_fighters2_temp, on="last_name", how="inner")
temp_match2 = temp_match2[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# match on first name
espn_fighters3 = espn_fighters2.loc[~espn_fighters2["espn_id"].isin(temp_match2["espn_id"])].copy()
espn_fighters3_temp = espn_fighters3.copy()
espn_fighters3_temp["first_name"] = espn_fighters3["name"].str.split().str[0]
espn_fighters3_temp = espn_fighters3_temp.drop_duplicates(subset=["first_name"], keep=False)
ufcstats_fighters3 = ufcstats_fighters2.loc[~ufcstats_fighters2["id"].isin(temp_match2["ufcstats_id"])].copy()
ufcstats_fighters3_temp = ufcstats_fighters3.copy()
ufcstats_fighters3_temp["first_name"] = ufcstats_fighters3["name"].str.split().str[0]
ufcstats_fighters3_temp = ufcstats_fighters3_temp.drop_duplicates(subset=["first_name"], keep=False)
temp_match3 = ufcstats_fighters3_temp[["id", "first_name"]].merge(espn_fighters3_temp, on="first_name", how="inner")
temp_match3 = temp_match3[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# match on nickname
# missing nicknames are dropped, and nicknames shared by several fighters are
# excluded so the merge cannot produce cross pairings
espn_fighters4 = espn_fighters3.loc[~espn_fighters3["espn_id"].isin(temp_match3["espn_id"])].copy()
espn_fighters4_temp = espn_fighters4.copy()
espn_fighters4_temp = espn_fighters4_temp.dropna(subset=["nickname"])
espn_fighters4_temp = espn_fighters4_temp.drop_duplicates(subset=["nickname"], keep=False)
ufcstats_fighters4 = ufcstats_fighters3.loc[~ufcstats_fighters3["id"].isin(temp_match3["ufcstats_id"])].copy()
ufcstats_fighters4_temp = ufcstats_fighters4.copy()
ufcstats_fighters4_temp = ufcstats_fighters4_temp.dropna(subset=["nickname"])
ufcstats_fighters4_temp = ufcstats_fighters4_temp.drop_duplicates(subset=["nickname"], keep=False)
temp_match4 = ufcstats_fighters4_temp[["id", "nickname"]].merge(espn_fighters4_temp, on="nickname", how="inner")
temp_match4 = temp_match4[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# join on dob
# pandas merges treat null keys as equal to each other, so rows with a missing
# date of birth must be removed first or every such UFC Stats fighter would be
# paired with every such ESPN fighter. Shared birth dates are excluded too.
espn_fighters5 = espn_fighters4.loc[~espn_fighters4["espn_id"].isin(temp_match4["espn_id"])].copy()
espn_fighters5_temp = espn_fighters5.dropna(subset=["date_of_birth"])
espn_fighters5_temp = espn_fighters5_temp.drop_duplicates(subset=["date_of_birth"], keep=False)
ufcstats_fighters5 = ufcstats_fighters4.loc[~ufcstats_fighters4["id"].isin(temp_match4["ufcstats_id"])].copy()
ufcstats_fighters5_temp = ufcstats_fighters5.dropna(subset=["date_of_birth"])
ufcstats_fighters5_temp = ufcstats_fighters5_temp.drop_duplicates(subset=["date_of_birth"], keep=False)
temp_match5 = ufcstats_fighters5_temp[["id", "date_of_birth"]].merge(espn_fighters5_temp, on="date_of_birth", how="inner")
temp_match5 = temp_match5[["id", "espn_id"]].rename(columns={"id": "ufcstats_id"})

# stack all passes into one ufcstats_id -> espn_id table
matching4 = pd.concat([temp_match1, temp_match2, temp_match3, temp_match4, temp_match5]).sort_values("ufcstats_id").reset_index(drop=True)

## FightOdds.io

In [295]:
import sqlite3

# Link UFC Stats fighters to FightOdds.io fighters. Most pairs come from an
# existing linkage table in a local SQLite database; the remainder is matched
# in passes (full name, last name, first name, date of birth, reversed name),
# each pass only considering fighters left unmatched by the previous ones.

# gonna cheat by using some of the work i've done in the past that's matched up a majority of the fighters
conn = sqlite3.connect("fightoddsio.db")
try:
    df = pd.read_sql_query("SELECT * FROM fightoddsio_fighter_linkage", conn)
finally:
    # release the database handle even when the query fails
    conn.close()
df = df.rename(columns={"UFCSTATS_FIGHTER_ID": "ufcstats_id", "FIGHTODDSIO_FIGHTER_ID": "fightoddsio_pk"})

# the fighters file is read once; the (id, pk) lookup used to translate the
# linkage table's pk into the FightOdds.io id is derived from the same frame
fightoddsio_fighters = pd.read_csv(os.path.join(clean_data_dir, "FightOdds.io", "fighters.csv"), parse_dates=["date_of_birth"])
fightoddsio_fighters = fightoddsio_fighters.rename(columns={"id": "fightoddsio_id"})
fightoddsio_fighters_temp = fightoddsio_fighters[["fightoddsio_id", "pk"]].rename(columns={"pk": "fightoddsio_pk"})

temp_match = df.merge(fightoddsio_fighters_temp, on="fightoddsio_pk", how="inner").drop(columns=["fightoddsio_pk"])

# deal with remaining unmatched fighters

# match on full name
fightoddsio_fighters1 = fightoddsio_fighters.loc[~fightoddsio_fighters["fightoddsio_id"].isin(temp_match["fightoddsio_id"])].copy()
ufcstats_fighters1 = ufcstats_fighters.loc[~ufcstats_fighters["id"].isin(temp_match["ufcstats_id"])].copy()
temp_match1 = ufcstats_fighters1[["id", "name"]].merge(fightoddsio_fighters1, on="name", how="inner")
temp_match1 = temp_match1[["id", "fightoddsio_id"]].rename(columns={"id": "ufcstats_id"})

# match on last name
# keep=False drops every last name that occurs more than once on a side, so an
# ambiguous last name is never used as a key
fightoddsio_fighters2 = fightoddsio_fighters1.loc[~fightoddsio_fighters1["fightoddsio_id"].isin(temp_match1["fightoddsio_id"])].copy()
fightoddsio_fighters2_temp = fightoddsio_fighters2.copy()
fightoddsio_fighters2_temp["last_name"] = fightoddsio_fighters2["name"].str.split().str[-1]
fightoddsio_fighters2_temp = fightoddsio_fighters2_temp.drop_duplicates(subset=["last_name"], keep=False)
ufcstats_fighters2 = ufcstats_fighters1.loc[~ufcstats_fighters1["id"].isin(temp_match1["ufcstats_id"])].copy()
ufcstats_fighters2_temp = ufcstats_fighters2.copy()
ufcstats_fighters2_temp["last_name"] = ufcstats_fighters2_temp["name"].str.split().str[-1]
ufcstats_fighters2_temp = ufcstats_fighters2_temp.drop_duplicates(subset=["last_name"], keep=False)
temp_match2 = ufcstats_fighters2_temp[["id", "last_name"]].merge(fightoddsio_fighters2_temp, on="last_name", how="inner")
temp_match2 = temp_match2[["id", "fightoddsio_id"]].rename(columns={"id": "ufcstats_id"})

# match on first name
fightoddsio_fighters3 = fightoddsio_fighters2.loc[~fightoddsio_fighters2["fightoddsio_id"].isin(temp_match2["fightoddsio_id"])].copy()
fightoddsio_fighters3_temp = fightoddsio_fighters3.copy()
fightoddsio_fighters3_temp["first_name"] = fightoddsio_fighters3["name"].str.split().str[0]
fightoddsio_fighters3_temp = fightoddsio_fighters3_temp.drop_duplicates(subset=["first_name"], keep=False)
ufcstats_fighters3 = ufcstats_fighters2.loc[~ufcstats_fighters2["id"].isin(temp_match2["ufcstats_id"])].copy()
ufcstats_fighters3_temp = ufcstats_fighters3.copy()
ufcstats_fighters3_temp["first_name"] = ufcstats_fighters3["name"].str.split().str[0]
ufcstats_fighters3_temp = ufcstats_fighters3_temp.drop_duplicates(subset=["first_name"], keep=False)
temp_match3 = ufcstats_fighters3_temp[["id", "first_name"]].merge(fightoddsio_fighters3_temp, on="first_name", how="inner")
temp_match3 = temp_match3[["id", "fightoddsio_id"]].rename(columns={"id": "ufcstats_id"})

# join on dob
# pandas merges treat null keys as equal to each other, so rows with a missing
# date of birth must be removed first or every such UFC Stats fighter would be
# paired with every such FightOdds.io fighter. Shared birth dates are excluded too.
fightoddsio_fighters4 = fightoddsio_fighters3.loc[~fightoddsio_fighters3["fightoddsio_id"].isin(temp_match3["fightoddsio_id"])].copy()
fightoddsio_fighters4_temp = fightoddsio_fighters4.dropna(subset=["date_of_birth"])
fightoddsio_fighters4_temp = fightoddsio_fighters4_temp.drop_duplicates(subset=["date_of_birth"], keep=False)
ufcstats_fighters4 = ufcstats_fighters3.loc[~ufcstats_fighters3["id"].isin(temp_match3["ufcstats_id"])].copy()
ufcstats_fighters4_temp = ufcstats_fighters4.dropna(subset=["date_of_birth"])
ufcstats_fighters4_temp = ufcstats_fighters4_temp.drop_duplicates(subset=["date_of_birth"], keep=False)
temp_match4 = ufcstats_fighters4_temp[["id", "date_of_birth"]].merge(fightoddsio_fighters4_temp, on="date_of_birth", how="inner")
temp_match4 = temp_match4[["id", "fightoddsio_id"]].rename(columns={"id": "ufcstats_id"})

# try swapping last name and first name order
# the FightOdds.io name is reversed token by token and compared with the
# unmodified UFC Stats name; rows without a name are skipped because the
# reversal cannot be applied to a missing value
fightoddsio_fighters5 = fightoddsio_fighters4.loc[~fightoddsio_fighters4["fightoddsio_id"].isin(temp_match4["fightoddsio_id"])].copy()
fightoddsio_fighters5_temp = fightoddsio_fighters5.dropna(subset=["name"]).copy()
fightoddsio_fighters5_temp["last_name_first"] = fightoddsio_fighters5_temp["name"].str.split().apply(lambda x: " ".join(x[::-1]))
ufcstats_fighters5 = ufcstats_fighters4.loc[~ufcstats_fighters4["id"].isin(temp_match4["ufcstats_id"])].copy()
ufcstats_fighters5_temp = ufcstats_fighters5.dropna(subset=["name"]).copy()
ufcstats_fighters5_temp["last_name_first"] = ufcstats_fighters5_temp["name"].copy()
temp_match5 = ufcstats_fighters5_temp[["id", "last_name_first"]].merge(fightoddsio_fighters5_temp, on="last_name_first", how="inner")
temp_match5 = temp_match5[["id", "fightoddsio_id"]].rename(columns={"id": "ufcstats_id"})

# stack the linkage-table pairs and all passes into one table
matching5 = pd.concat([temp_match, temp_match1, temp_match2, temp_match3, temp_match4, temp_match5]).sort_values("ufcstats_id").reset_index(drop=True)

## MMA Decisions

In [296]:
# MMA Decisions: keep only fighters from UFC-promoted events, reuse an existing
# bout-level linkage file for most pairs, then fill the gaps with an exact
# name match and a hand-curated list.
mmadecisions_dir = os.path.join(clean_data_dir, "MMA Decisions")

# events promoted by the UFC
mmadecisions_events = pd.read_csv(os.path.join(mmadecisions_dir, "events.csv"), parse_dates=["date"])
mmadecisions_events = mmadecisions_events[mmadecisions_events["promotion"] == "UFC"].reset_index(drop=True)

# bouts at those events and every fighter id that appears in them
mmadecisions_bouts = pd.read_csv(os.path.join(mmadecisions_dir, "bouts.csv"))
mmadecisions_bouts = mmadecisions_bouts[mmadecisions_bouts["event_id"].isin(mmadecisions_events["id"])]
fighter_ids = set(mmadecisions_bouts["fighter_1_id"]).union(mmadecisions_bouts["fighter_2_id"])

mmadecisions_fighters = pd.read_csv(os.path.join(mmadecisions_dir, "fighters.csv"), parse_dates=["date_of_birth"])
mmadecisions_fighters = mmadecisions_fighters[mmadecisions_fighters["id"].isin(fighter_ids)]

# once again, lean on previous work: a file that already pairs the UFC Stats
# and MMA Decisions ids of the red and blue fighter on each row
temp_df = pd.read_csv("mmadecisions_merged_final.csv")
red_columns = {
    "red_ufcstats_fighter_id": "ufcstats_id",
    "red_mmadecisions_fighter_id": "mmadecisions_id",
}
blue_columns = {
    "blue_ufcstats_fighter_id": "ufcstats_id",
    "blue_mmadecisions_fighter_id": "mmadecisions_id",
}
red = temp_df[list(red_columns)].rename(columns=red_columns)
blue = temp_df[list(blue_columns)].rename(columns=blue_columns)
# both corners stacked and reduced to distinct (ufcstats_id, mmadecisions_id) pairs
temp_match = pd.concat([red, blue]).drop_duplicates().reset_index(drop=True)

# deal with remaining unmatched fighters
mmadecisions_fighters1 = mmadecisions_fighters.rename(columns={"id": "mmadecisions_id"})
already_linked = mmadecisions_fighters1["mmadecisions_id"].isin(temp_match["mmadecisions_id"])
mmadecisions_fighters1 = mmadecisions_fighters1.loc[~already_linked]

# match on full name, using only names that are unique on each side
ufcstats_fighters1 = ufcstats_fighters.loc[~ufcstats_fighters["id"].isin(temp_match["ufcstats_id"])].copy()
ufcstats_fighters1_temp = ufcstats_fighters1.drop_duplicates(subset=["name"], keep=False)
mmadecisions_fighters1_temp = mmadecisions_fighters1.drop_duplicates(subset=["name"], keep=False)
temp_match1 = mmadecisions_fighters1_temp[["mmadecisions_id", "name"]].merge(ufcstats_fighters1_temp, on="name", how="inner")
temp_match1 = temp_match1[["id", "mmadecisions_id"]].rename(columns={"id": "ufcstats_id"})

# hardcode the rest
# the two leftover frames are not used further in this cell; they show which
# fighters are still unmatched on each side
ufcstats_fighters2 = ufcstats_fighters1.loc[~ufcstats_fighters1["id"].isin(temp_match1["ufcstats_id"])].copy()
mmadecisions_fighters2 = mmadecisions_fighters1.loc[~mmadecisions_fighters1["mmadecisions_id"].isin(temp_match1["mmadecisions_id"])].copy()

# (ufcstats_id, mmadecisions_id) pairs entered by hand
hardcoded_pairs = [
    ("6cbb7661c3258617", 34),
    ("1c2f2571b18791b6", 141),
    ("2ee09ec2a0695eb9", 158),
    ("0f6528b461e47462", 230),
    ("6aa1cbc1466e9a0b", 425),
    ("821cd80aab70d5f9", 433),
    ("2067a177a2842fbf", 456),
    ("91d73ee59347ac16", 614),
    ("37b1696e6d115957", 641),
    ("1cf1310684a841f5", 659),
    ("6ef65b6a152e49d5", 1939),
    ("915c8d39d4a0e7d1", 2582),
    ("e023fa2e648ea0f1", 3520),
    ("8163cd7de0342652", 3648),
    ("a474aade8eb3a8f0", 6805),
]
hardcoded_ufcstats_ids, hardcoded_mmadecisions_ids = zip(*hardcoded_pairs)
temp_match2 = pd.DataFrame(
    {
        "ufcstats_id": list(hardcoded_ufcstats_ids),
        "mmadecisions_id": list(hardcoded_mmadecisions_ids),
    }
)

# stack the reused linkage, the name matches and the hand-entered pairs
matching6 = pd.concat([temp_match, temp_match1, temp_match2]).sort_values("ufcstats_id").reset_index(drop=True)

In [297]:
# create final matching dataframe
# Start from the Tapology-based table and left-join each per-source table on
# the UFC Stats id, so every row of matching1 is preserved.
matching = matching1
for source_matches in (matching2, matching3, matching4, matching5, matching6):
    matching = matching.merge(source_matches, on="ufcstats_id", how="left")

# nullable integers so fighters without a Bet MMA / MMA Decisions link keep <NA>
matching = matching.astype({"betmma_id": "Int64", "mmadecisions_id": "Int64"})

# column order of the exported linkage file
linkage_columns = [
    "ufcstats_id",
    "bestfightodds_id",
    "betmma_id",
    "espn_id",
    "fightmatrix_id",
    "fightoddsio_id",
    "mmadecisions_id",
    "sherdog_id",
    "tapology_id",
]
matching = matching[linkage_columns]
matching.to_csv(os.path.join(clean_data_dir, "fighters_linkage.csv"), index=False)