# Main notebook for data cleaning

In [1]:
# standard library imports
import os
import zipfile

# third party imports
import numpy as np
import pandas as pd

# local imports


# Data folders, expressed relative to the current working directory:
# two levels up, then into "data" (with "raw" and "clean" beneath it).
data_dir = os.path.join(os.pardir, os.pardir, "data")
raw_data_dir, clean_data_dir = (os.path.join(data_dir, stage) for stage in ("raw", "clean"))

## Best Fight Odds

In [2]:
# Load the Best Fight Odds export (closing odds including props).
bestfightodds_prop_odds = pd.read_csv(
    os.path.join(raw_data_dir, "Best Fight Odds", "closing_with_props.csv"),
    parse_dates=["Card_Date"],
)

# drop bad events
bad_events = [
    "UFC 9: Motor City Madness",
    "UFC 14: Showdown",
    "UFC 15: Collision Course",
    "UFC 18: The Road to the Heavyweight Title",
    'UFC 32: Showdown in the Meadowlands',
    'UFC 35: Throwdown',
    'UFC 48: Payback',
    'UFC 53: Heavy Hitters',
    'UFC 56: Full Force',
]
# .copy() gives an independent frame, so the url fixes below do not write into
# a slice of the raw read (which triggers SettingWithCopyWarning).
bestfightodds_prop_odds = bestfightodds_prop_odds.loc[~bestfightodds_prop_odds["Event"].isin(bad_events)].copy()

# fix event urls: event name -> url to use for that event
fixed_event_urls = {
    "UFC 73: Stacked": "https://www.bestfightodds.com/events/ufc-73-stacked-1",
    "UFC 74: Respect": "https://www.bestfightodds.com/events/ufc-74-respect-7",
    "UFC 76: Knockout": "https://www.bestfightodds.com/events/ufc-76-knockout-12",
    "UFC 84: Ill Will": "https://www.bestfightodds.com/events/ufc-84-ill-will-47",
    "UFC 85: Bedlam": "https://www.bestfightodds.com/events/ufc-85-bedlam-46",
    "UFC 87: Seek and Destroy": "https://www.bestfightodds.com/events/ufc-87-seek-and-destroy-57",
    "UFC 99: The Comeback": "https://www.bestfightodds.com/events/ufc-99-the-comeback-136",
    "UFC 100": "https://www.bestfightodds.com/events/ufc-100-137",
    "UFC 101: Declaration": "https://www.bestfightodds.com/events/ufc-101-declaration-145",
    "UFC 109: Relentless": "https://www.bestfightodds.com/events/ufc-109-relentless-226",
    "UFC 112: Invincible": "https://www.bestfightodds.com/events/ufc-112-invincible-245",
    "UFC 119: Mir vs. Cro Cop": "https://www.bestfightodds.com/events/ufc-119-mir-vs-cro-cop-296",
}
for event_name, event_url in fixed_event_urls.items():
    bestfightodds_prop_odds.loc[bestfightodds_prop_odds["Event"] == event_name, "url"] = event_url

# The event id is the integer after the last "-" in the final path segment of the url.
bestfightodds_prop_odds["event_id"] = bestfightodds_prop_odds["url"].str.split("/").str[-1].str.split("-").str[-1].astype(int)
bestfightodds_prop_odds = bestfightodds_prop_odds.drop(columns=["url", "Card_Date", "Event"])
bestfightodds_prop_odds = bestfightodds_prop_odds.rename(columns={"William_H": "William Hill"})
bestfightodds_prop_odds["Bet"] = bestfightodds_prop_odds["Bet"].str.strip()

In [236]:
# Keep only the prop bets; bet_type is constant after the filter, so drop it.
bout_prop_odds = (
    bestfightodds_prop_odds.loc[bestfightodds_prop_odds["bet_type"] == "Prop"]
    .reset_index(drop=True)
    .drop(columns=["bet_type"])
)
bout_prop_odds["Bet"] = bout_prop_odds["Bet"].str.strip().str.replace("½", ".5", regex=False)

# Strip fighter names before they are compared below: the endswith() and ==
# checks on fighter1/fighter2 would otherwise miss names with stray whitespace.
bout_prop_odds["fighter1"] = bout_prop_odds["fighter1"].str.strip()
bout_prop_odds["fighter2"] = bout_prop_odds["fighter2"].str.strip()

# drop columns with all nan
bout_prop_odds = bout_prop_odds.dropna(axis=1, how="all")

# is_not: flag negated bets ("Not ..." / "Any other result") and give them the
# text of the row directly above.
# NOTE(review): this assumes a negated row always follows its positive counterpart - confirm.
bout_prop_odds["is_not"] = bout_prop_odds["Bet"].apply(lambda x: 1 if x.startswith("Not ") or x == "Any other result" else 0)
bout_prop_odds["Bet"] = bout_prop_odds["Bet"].mask(bout_prop_odds["is_not"] == 1, bout_prop_odds["Bet"].shift())

# reference fighter last name: first word of the bet, unless the bet is not
# about one specific fighter
bout_prop_odds["ref_fighter_last_name"] = bout_prop_odds["Bet"].str.split().str[0].str.strip()
bout_prop_odds.loc[bout_prop_odds["ref_fighter_last_name"].isin(["Fight", "Over", "Under", "Wins"]), "ref_fighter_last_name"] = np.nan
other_edge_cases = ['Both fighters are knocked down',
       'Fighters touch gloves before fight',
       'No glove touch before fight', 'Timeout is called',
       'Timeout is not called', 'Either fighter wins by TKO/KO',
       'Either fighter wins by submission',
       'Either fighter is disqualified',
       'Either fighter is stopped by doctor/corner',
       'Clock stoppage due to eye poke',
       'No clock stoppage due to eye poke',
       'Does not announce retirement', 'Does not pull out of the fight']
bout_prop_odds.loc[bout_prop_odds["Bet"].isin(other_edge_cases), "ref_fighter_last_name"] = np.nan

# william hill weirdness: these two values are treated as missing odds
bout_prop_odds.loc[bout_prop_odds["William Hill"].isin([100000, -10000]), "William Hill"] = np.nan


def _ref_fighter(row):
    """Return which fighter column ("fighter1"/"fighter2") a bet refers to.

    NaN when the bet has no reference last name; "PROBLEM" when the last name
    matches the end of neither fighter's name.
    """
    last_name = row["ref_fighter_last_name"]
    if pd.isna(last_name):
        return np.nan
    if row["fighter1"].endswith(last_name):
        return "fighter1"
    if row["fighter2"].endswith(last_name):
        return "fighter2"
    return "PROBLEM"


def _bet_without_fighter_name(row):
    """Return the bet text with the reference last name removed, stripped and capitalized."""
    if pd.isna(row["ref_fighter"]):
        return row["Bet"]
    return row["Bet"].replace(row["ref_fighter_last_name"], "").strip().capitalize()


# reference fighter
bout_prop_odds["ref_fighter"] = bout_prop_odds.apply(_ref_fighter, axis=1)

# remove joby sanchez vs roberto sanchez fighter-specific props, literally no way of knowing which is which
is_sanchez_vs_sanchez = (
    (bout_prop_odds["fighter1"] == "Joby Sanchez")
    & (bout_prop_odds["fighter2"] == "Roberto Sanchez")
    & (bout_prop_odds["ref_fighter_last_name"] == "Sanchez")
)
bout_prop_odds = bout_prop_odds.loc[~is_sanchez_vs_sanchez]

# remove rows with all nan odds
bout_prop_odds = bout_prop_odds.dropna(how="all", subset=["5Dimes", "BetDSI", "BookMaker", "SportBet", "Bet365", "Bovada", "William Hill", "Pinnacle", "SportsInt", "BetOnline", "Intertops"]).reset_index(drop=True)

# remove fighter names from bets
bout_prop_odds["Bet"] = bout_prop_odds.apply(_bet_without_fighter_name, axis=1)
bout_prop_odds = bout_prop_odds.drop(columns=["ref_fighter_last_name"])

# drop cancelled fights
# Each entry is (event_id, fighter1, fighter2) exactly as the bout appears in the odds data.
cancelled_bouts = [
    (1444, "Al Iaquinta", "Paul Felder"),
    (1289, "Aspen Ladd", "Jessica Eye"),
    (1562, "Brian Kelleher", "Montel Jackson"),
    (1647, "Diego Ferreira", "Francisco Trinaldo"),
    (1713, "Giacomo Lemos", "Tanner Boser"),
    (1849, "Emily Whitmire", "Polyana Viana"),
    (1246, "Brett Johns", "Ian Entwistle"),
    (1680, "John Lineker", "Rob Font"),
    (1673, "Devin Clark", "Ivan Shtyrkov"),
    (1467, "Jessica Aguilar", "Jodie Esquibel"),
    (1478, "Davey Grant", "Manny Bermudez"),
    (1626, "Ian Heinisch", "Tom Breese"),
    (1423, "Nad Narimani", "Nasrat Haqparast"),
    (1687, "Jordan Griffin", "Vince Murdock"),
    (1637, "Jessica Penne", "Jodie Esquibel"),
    (1587, "Andrea Lee", "Jessica-Rose Clark"),
    (485, "Buddy Roberts", "Sean Loeffler"),
    (1647, "Melissa Gatto", "Talita Bernardo"),
]

# Strip names before the equality checks below, so that a name carrying stray
# whitespace cannot slip past the filter.
bout_prop_odds["fighter1"] = bout_prop_odds["fighter1"].str.strip()
bout_prop_odds["fighter2"] = bout_prop_odds["fighter2"].str.strip()

# A row is dropped when it matches any listed bout on all three fields.
is_cancelled = pd.Series(False, index=bout_prop_odds.index)
for cancelled_event_id, cancelled_fighter1, cancelled_fighter2 in cancelled_bouts:
    is_cancelled |= (
        (bout_prop_odds["event_id"] == cancelled_event_id)
        & (bout_prop_odds["fighter1"] == cancelled_fighter1)
        & (bout_prop_odds["fighter2"] == cancelled_fighter2)
    )
bout_prop_odds = bout_prop_odds.loc[~is_cancelled].reset_index(drop=True)

# Distinct (fighter1, fighter2, event_id) combinations present in the prop odds.
fighter_event_df = bout_prop_odds[["fighter1", "fighter2", "event_id"]].drop_duplicates().reset_index(drop=True)

# Tapology reference tables. Their own ids get a "tapology_" prefix, and the
# bestfightodds ids take the plain names used on the odds side.
tapology_events = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "events.csv"), usecols=["id", "bestfightodds_id", "event_order"])
tapology_events = tapology_events.rename(columns={"id": "tapology_event_id", "bestfightodds_id": "event_id"})
tapology_bouts = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "bouts.csv"), usecols=["id", "event_id", "fighter_1_id", "fighter_2_id", "bout_order"])
tapology_bouts = tapology_bouts.rename(columns={"id": "tapology_bout_id", "event_id": "tapology_event_id", "fighter_1_id": "tapology_fighter_1_id", "fighter_2_id": "tapology_fighter_2_id"})
tapology_fighters = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "fighters.csv"), usecols=["id", "bestfightodds_id", "name"])
tapology_fighters = tapology_fighters.rename(columns={"id": "tapology_fighter_id", "bestfightodds_id": "fighter_id"})

# Attach the bestfightodds event id and event order to every bout.
tapology_bouts = tapology_bouts.merge(tapology_events, on="tapology_event_id", how="left").drop(columns=["tapology_event_id"])

# Attach each corner's bestfightodds fighter id and name, then drop the Tapology ids.
for corner in ("1", "2"):
    tapology_corner_id = "tapology_fighter_" + corner + "_id"
    tapology_bouts = tapology_bouts.merge(tapology_fighters, left_on=tapology_corner_id, right_on="tapology_fighter_id", how="left")
    tapology_bouts = tapology_bouts.rename(columns={"fighter_id": "fighter_" + corner + "_id", "name": "fighter" + corner})
    tapology_bouts = tapology_bouts.drop(columns=["tapology_fighter_id", tapology_corner_id])

# Only events that appear in the prop odds are needed.
tapology_bouts = tapology_bouts.loc[tapology_bouts["event_id"].isin(fighter_event_df["event_id"])].reset_index(drop=True)
tapology_bouts[["event_id", "fighter_1_id", "fighter_2_id"]] = tapology_bouts[["event_id", "fighter_1_id", "fighter_2_id"]].astype(int)

# Reduce names to ASCII: NFKD-decompose, then drop whatever cannot be encoded.
for name_column in ("fighter1", "fighter2"):
    tapology_bouts[name_column] = tapology_bouts[name_column].str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")

# Columns that identify one bout on the odds side (unique in fighter_event_df).
_BOUT_KEY = ["event_id", "fighter1", "fighter2"]
# Renaming a Tapology frame with this mapping exchanges its two fighters.
_SWAP_CORNERS = {"fighter_1_id": "fighter_2_id", "fighter_2_id": "fighter_1_id", "fighter1": "fighter2", "fighter2": "fighter1"}


def _full_name(names):
    """Lower-cased full name with periods removed."""
    return names.str.lower().str.replace(".", "", regex=False)


def _core_name(names):
    """Like _full_name, with every "jr" and "dos santos" substring also removed."""
    return _full_name(names).str.replace("jr", "", regex=False).str.strip().str.replace("dos santos", "", regex=False).str.strip()


def _first_name(names):
    """Lower-cased first whitespace-separated word of the name."""
    return names.str.lower().str.split().str[0]


def _first_name_no_dots(names):
    """_first_name with periods removed."""
    return _first_name(names).str.replace(".", "", regex=False)


def _last_name(names):
    """Lower-cased last whitespace-separated word of the name."""
    return names.str.lower().str.split().str[-1]


def _last_name_no_dots(names):
    """_last_name with periods removed."""
    return _last_name(names).str.replace(".", "", regex=False)


def _match_stage(odds_bouts, tapology_pool, name_key, slots=("fighter1", "fighter2"), swap_corners=False, unique_only=True):
    """Run one matching stage between unmatched odds bouts and unmatched Tapology bouts.

    Parameters
    ----------
    odds_bouts : DataFrame with columns fighter1, fighter2, event_id (unique rows).
    tapology_pool : DataFrame of Tapology bouts that are still unmatched.
    name_key : function mapping a Series of names to the Series of join keys.
    slots : which fighter columns are compared ("fighter1", "fighter2" or both).
    swap_corners : when True, Tapology's two fighters (names and ids) are
        exchanged before comparing, so its fighter 2 is matched against the
        odds frame's fighter1 and vice versa.
    unique_only : when True, any (event_id, key) combination that occurs more
        than once on either side is excluded from this stage.

    Returns
    -------
    (matched, remaining_odds_bouts, remaining_tapology_pool)
        matched has the odds bout columns plus the Tapology columns. The two
        "remaining" frames are the inputs minus everything matched here, so
        bouts excluded as ambiguous are carried forward to later stages.
    """
    tapology_side = tapology_pool.copy()
    if swap_corners:
        tapology_side = tapology_side.rename(columns=_SWAP_CORNERS)
    odds_side = odds_bouts.copy()

    key_columns = [slot + "_key" for slot in slots]
    for slot, key_column in zip(slots, key_columns):
        tapology_side[key_column] = name_key(tapology_side[slot])
        odds_side[key_column] = name_key(odds_side[slot])
    # The odds-side spelling of the names is the one kept in the result.
    tapology_side = tapology_side.drop(columns=["fighter1", "fighter2"])

    join_columns = ["event_id"] + key_columns
    if unique_only:
        tapology_side = tapology_side.drop_duplicates(subset=join_columns, keep=False).reset_index(drop=True)
        odds_side = odds_side.drop_duplicates(subset=join_columns, keep=False).reset_index(drop=True)

    merged = odds_side.merge(tapology_side, on=join_columns, how="left")
    matched = merged.loc[merged["tapology_bout_id"].notna()].reset_index(drop=True).drop(columns=key_columns)
    matched[["fighter_1_id", "fighter_2_id"]] = matched[["fighter_1_id", "fighter_2_id"]].astype(int)

    # Find the leftovers by bout key, not with a boolean mask taken from `merged`:
    # once drop_duplicates() removes rows, `merged` no longer lines up with `odds_bouts`.
    matched_keys = matched[_BOUT_KEY].drop_duplicates()
    membership = odds_bouts.merge(matched_keys, on=_BOUT_KEY, how="left", indicator=True)["_merge"]
    remaining_odds_bouts = odds_bouts.loc[(membership == "left_only").to_numpy()].reset_index(drop=True)
    remaining_tapology_pool = tapology_pool.loc[~tapology_pool["tapology_bout_id"].isin(matched["tapology_bout_id"])].reset_index(drop=True)
    return matched, remaining_odds_bouts, remaining_tapology_pool


# Stages run from strictest to loosest; each one only sees what earlier stages left unmatched.

# 1-2: both full names, as listed and with corners swapped
good_matches1, fighter_event_df2, tapology_bouts2 = _match_stage(fighter_event_df, tapology_bouts, _full_name, unique_only=False)
good_matches2, fighter_event_df3, tapology_bouts3 = _match_stage(fighter_event_df2, tapology_bouts2, _full_name, swap_corners=True, unique_only=False)

# 3-4: both first names
good_matches3, fighter_event_df4, tapology_bouts4 = _match_stage(fighter_event_df3, tapology_bouts3, _first_name_no_dots)
good_matches4, fighter_event_df5, tapology_bouts5 = _match_stage(fighter_event_df4, tapology_bouts4, _first_name_no_dots, swap_corners=True)

# 5-6: both last names
good_matches5, fighter_event_df6, tapology_bouts6 = _match_stage(fighter_event_df5, tapology_bouts5, _last_name_no_dots)
good_matches6, fighter_event_df7, tapology_bouts7 = _match_stage(fighter_event_df6, tapology_bouts6, _last_name_no_dots, swap_corners=True)

# 7-10: a single fighter's normalized full name
good_matches7, fighter_event_df8, tapology_bouts8 = _match_stage(fighter_event_df7, tapology_bouts7, _core_name, slots=("fighter1",))
good_matches8, fighter_event_df9, tapology_bouts9 = _match_stage(fighter_event_df8, tapology_bouts8, _core_name, slots=("fighter2",))
good_matches9, fighter_event_df10, tapology_bouts10 = _match_stage(fighter_event_df9, tapology_bouts9, _core_name, slots=("fighter1",), swap_corners=True)
good_matches10, fighter_event_df11, tapology_bouts11 = _match_stage(fighter_event_df10, tapology_bouts10, _core_name, slots=("fighter2",), swap_corners=True)

# 11-13: a single fighter's first or last name
good_matches11, fighter_event_df12, tapology_bouts12 = _match_stage(fighter_event_df11, tapology_bouts11, _first_name, slots=("fighter1",))
good_matches12, fighter_event_df13, tapology_bouts13 = _match_stage(fighter_event_df12, tapology_bouts12, _first_name, slots=("fighter2",))
good_matches13, fighter_event_df14, tapology_bouts14 = _match_stage(fighter_event_df13, tapology_bouts13, _last_name, slots=("fighter2",), swap_corners=True)

# Hand-entered matches, one record per bout, in the same column layout as the
# frames produced by the automatic matching passes.
manual_match_columns = [
    "fighter1",
    "fighter2",
    "event_id",
    "tapology_bout_id",
    "event_order",
    "bout_order",
    "fighter_1_id",
    "fighter_2_id",
]
manual_match_records = [
    ("Danaa Batgerel", "Heili Alateng", 1733, "448800-ufc-on-espn-15-batgerel-danaa-vs-heili-alateng", 491, 2, 9468, 9469),
    ("Ning Guangyou", "Yang Jianping", 835, "146518-ufc-fight-night-48-guangyou-smasher-ning-vs-jianping-tiger-yang", 286, 7, 4699, 4700),
]
good_matches14 = pd.DataFrame(manual_match_records, columns=manual_match_columns)
# Stack the thirteen automatic matching passes and the manual matches into one
# lookup keyed by (event_id, fighter1, fighter2).
all_good_matches = pd.concat(
    [
        good_matches1, good_matches2, good_matches3, good_matches4, good_matches5,
        good_matches6, good_matches7, good_matches8, good_matches9, good_matches10,
        good_matches11, good_matches12, good_matches13, good_matches14,
    ],
    ignore_index=True,
)
all_good_matches[["bout_order", "event_order"]] = all_good_matches[["bout_order", "event_order"]].astype(int)

bout_prop_odds = bout_prop_odds.merge(all_good_matches, on=["event_id", "fighter1", "fighter2"], how="left")

# ref_fighter selects which matched fighter id a bet is attached to:
# "fighter1" -> fighter_1_id, "fighter2" -> fighter_2_id, missing -> no fighter.
# Any other value is a data error; fail here with a clear message instead of
# writing a sentinel string into the id column and crashing later on.
ref_fighter = bout_prop_odds["ref_fighter"]
unexpected_ref = ref_fighter.notna() & ~ref_fighter.isin(["fighter1", "fighter2"])
if unexpected_ref.any():
    raise ValueError(f"unexpected ref_fighter values: {ref_fighter[unexpected_ref].unique().tolist()}")
bout_prop_odds["fighter_id"] = bout_prop_odds["fighter_1_id"].where(
    ref_fighter == "fighter1",
    bout_prop_odds["fighter_2_id"].where(ref_fighter == "fighter2"),
)
bout_prop_odds = bout_prop_odds.drop(columns=["ref_fighter", "fighter_1_id", "fighter_2_id", "fighter1", "fighter2"])

# Wide -> long: one row per (bet, betsite); betsites without a quote are dropped.
bout_prop_odds = bout_prop_odds.melt(id_vars=["tapology_bout_id", "event_id", "event_order", "bout_order", "fighter_id", "Bet", "is_not"], value_vars=["5Dimes", "BetDSI", "BookMaker", "SportBet", "Bet365", "Bovada", "William Hill", "Pinnacle", "SportsInt", "BetOnline", "Intertops"], var_name="betsite", value_name="odds")
bout_prop_odds = bout_prop_odds.dropna(subset=["odds"]).reset_index(drop=True)
# event_order / bout_order are only needed to order the output file.
bout_prop_odds = bout_prop_odds.sort_values(by=["event_order", "bout_order", "fighter_id", "Bet", "is_not", "betsite"], na_position="first").reset_index(drop=True)
bout_prop_odds = bout_prop_odds.drop(columns=["event_order", "bout_order"]).rename(columns={"Bet": "description"})
# Nullable integer: fighter_id is missing for bets that are not tied to a fighter.
bout_prop_odds["fighter_id"] = bout_prop_odds["fighter_id"].astype("Int64")
bout_prop_odds["odds"] = bout_prop_odds["odds"].astype(int)
bout_prop_odds.to_csv(os.path.join(clean_data_dir, "Best Fight Odds", "bout_proposition_odds.csv"), index=False)

In [61]:
# Event-level proposition bets: keep the "Event Prop" rows and drop the columns
# that only apply to bout-level bets.
is_event_prop = bestfightodds_prop_odds["bet_type"] == "Event Prop"
event_prop_odds = bestfightodds_prop_odds.loc[is_event_prop].reset_index(drop=True)
event_prop_odds = event_prop_odds.drop(columns=["bet_type", "fighter1", "fighter2"])

# not bets: an "Any other result" row is rewritten as the negation of the row above it
scraped_bet = event_prop_odds["Bet"]
negation_of_previous = "(NOT) " + scraped_bet.shift()
event_prop_odds["Bet"] = scraped_bet.replace("Any other result", pd.NA).fillna(negation_of_previous)
event_prop_odds["is_not"] = event_prop_odds["Bet"].str.startswith("(NOT) ").astype(int)
event_prop_odds["Bet"] = (
    event_prop_odds["Bet"]
    .str.replace("(NOT) ", "", regex=False)
    .str.replace("½", ".5", regex=False)
)

# melt: every remaining column becomes a betsite; rows with any missing value are dropped
event_prop_odds = event_prop_odds.melt(id_vars=["event_id", "Bet", "is_not"], var_name="betsite", value_name="odds")
event_prop_odds = event_prop_odds.dropna().reset_index(drop=True)
event_prop_odds["odds"] = event_prop_odds["odds"].astype(int)

# tapology events: event_order is borrowed only to sort the output
tapology_events = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "events.csv"))
tapology_events = (
    tapology_events[["bestfightodds_id", "event_order"]]
    .dropna()
    .rename(columns={"bestfightodds_id": "event_id"})
    .reset_index(drop=True)
)
event_prop_odds = (
    event_prop_odds
    .merge(tapology_events, on="event_id", how="left")
    .sort_values(by=["event_order", "Bet", "is_not", "betsite"])
    .reset_index(drop=True)
    .drop(columns=["event_order"])
    .rename(columns={"Bet": "description"})
)

event_prop_odds.to_csv(os.path.join(clean_data_dir, "Best Fight Odds", "event_proposition_odds.csv"), index=False)

In [None]:
# erroneous scrapes
bad_files = [
    "ordinarybet_datatest_UFC_9__Motor_City_Madness.csv",
    "ordinarybet_datatest_UFC_14__Showdown.csv",
    "ordinarybet_datatest_UFC_15__Collision_Course.csv",
    "ordinarybet_datatest_UFC_18__The_Road_to_the_Heavyweight_Title.csv",
    "ordinarybet_datatest_UFC_32__Showdown_in_the_Meadowlands.csv",
    "ordinarybet_datatest_UFC_35__Throwdown.csv",
    "ordinarybet_datatest_UFC_48__Payback.csv",
    "ordinarybet_datatest_UFC_53__Heavy_Hitters.csv",
    "ordinarybet_datatest_UFC_56__Full_Force.csv",
]

moneyline_zip_path = os.path.join(raw_data_dir, "Best Fight Odds", "straight_over_time.zip")
df_list = []

# read every member of the archive except the bad scrapes, tagging each frame
# with an event name rebuilt from its file name ("__" -> ": ", "_" -> " ")
with zipfile.ZipFile(moneyline_zip_path) as z:
    for filename in z.namelist():
        if filename not in bad_files:
            with z.open(filename) as f:
                df = pd.read_csv(f)

            file_stem = filename.split(".csv")[0]
            event_name = file_stem.replace("ordinarybet_datatest_", "")
            event_name = event_name.replace("__", ": ").replace("_", " ")
            df["event_name"] = event_name

            df_list.append(df)

bestfightodds_moneyline_odds = (
    pd.concat(df_list)
    .reset_index(drop=True)
    .drop(columns=["class"])
)

# replace urls: event name -> url to use for that event
# (the trailing number of the url is parsed as the event id further down)
event_url_overrides = {
    "UFC 73: Stacked": "https://www.bestfightodds.com/events/ufc-73-stacked-1",
    "UFC 74: Respect": "https://www.bestfightodds.com/events/ufc-74-respect-7",
    "UFC 76: Knockout": "https://www.bestfightodds.com/events/ufc-76-knockout-12",
    "UFC 84: Ill Will": "https://www.bestfightodds.com/events/ufc-84-ill-will-47",
    "UFC 85: Bedlam": "https://www.bestfightodds.com/events/ufc-85-bedlam-46",
    "UFC 87: Seek and Destroy": "https://www.bestfightodds.com/events/ufc-87-seek-and-destroy-57",
    "UFC 99: The Comeback": "https://www.bestfightodds.com/events/ufc-99-the-comeback-136",
    "UFC 100": "https://www.bestfightodds.com/events/ufc-100-137",
    "UFC 101: Declaration": "https://www.bestfightodds.com/events/ufc-101-declaration-145",
    "UFC 109: Relentless": "https://www.bestfightodds.com/events/ufc-109-relentless-226",
    "UFC 112: Invincible": "https://www.bestfightodds.com/events/ufc-112-invincible-245",
    "UFC 119: Mir vs: Cro Cop": "https://www.bestfightodds.com/events/ufc-119-mir-vs-cro-cop-296",
}
for overridden_event, corrected_url in event_url_overrides.items():
    bestfightodds_moneyline_odds.loc[bestfightodds_moneyline_odds["event_name"] == overridden_event, "url"] = corrected_url

# drop event name and get event id (the token after the last "-" of the last url segment)
url_last_segment = bestfightodds_moneyline_odds["url"].str.split("/").str[-1]
bestfightodds_moneyline_odds["event_id"] = url_last_segment.str.split("-").str[-1].astype("Int64")
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.drop(columns=["event_name", "url"])

# drop cancelled fights: (event_id, fighter1, fighter2) of every bout to remove
cancelled_fights = [
    (1444, "Al Iaquinta", "Paul Felder"),
    (1289, "Aspen Ladd", "Jessica Eye"),
    (1562, "Brian Kelleher", "Montel Jackson"),
    (1647, "Diego Ferreira", "Francisco Trinaldo"),
    (1713, "Giacomo Lemos", "Tanner Boser"),
    (1849, "Emily Whitmire", "Polyana Viana"),
    (1246, "Brett Johns", "Ian Entwistle"),
    (1680, "John Lineker", "Rob Font"),
    (1673, "Devin Clark", "Ivan Shtyrkov"),
    (1467, "Jessica Aguilar", "Jodie Esquibel"),
    (1478, "Davey Grant", "Manny Bermudez"),
    (1626, "Ian Heinisch", "Tom Breese"),
    (1423, "Nad Narimani", "Nasrat Haqparast"),
    (1687, "Jordan Griffin", "Vince Murdock"),
    (1637, "Jessica Penne", "Jodie Esquibel"),
    (1587, "Andrea Lee", "Jessica-Rose Clark"),
    (485, "Buddy Roberts", "Sean Loeffler"),
    (1647, "Melissa Gatto", "Talita Bernardo"),
]
# apply the filters one after another, in the listed order
for cancelled_event_id, cancelled_fighter1, cancelled_fighter2 in cancelled_fights:
    is_cancelled_fight = (
        (bestfightodds_moneyline_odds["event_id"] == cancelled_event_id)
        & (bestfightodds_moneyline_odds["fighter1"] == cancelled_fighter1)
        & (bestfightodds_moneyline_odds["fighter2"] == cancelled_fighter2)
    )
    bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.loc[~is_cancelled_fight]
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.reset_index(drop=True)

# drop fighter 1 and fighter 2 columns (they were only needed for the filter above)
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.drop(columns=["fighter1", "fighter2"])

# rename William Hill (scraped with a non-breaking space and abbreviated)
abbreviated_william_hill = bestfightodds_moneyline_odds["betsite"] == "William\xa0H."
bestfightodds_moneyline_odds["betsite"] = bestfightodds_moneyline_odds["betsite"].mask(abbreviated_william_hill, "William Hill")

# timestamp seconds: "dates" is divided by 1000 and truncated to int
bestfightodds_moneyline_odds["dates"] = (bestfightodds_moneyline_odds["dates"] / 1000).astype(int)
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.rename(columns={"dates": "timestamp"})

# attach the Tapology event id and event order through the Best Fight Odds event id
tapology_events = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "events.csv"))
tapology_events = tapology_events[["id", "bestfightodds_id", "event_order"]].rename(
    columns={"id": "tapology_event_id", "bestfightodds_id": "event_id"}
)
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.merge(tapology_events, on="event_id", how="left")

# Build the (tapology_event_id, fighter name) -> Best Fight Odds fighter id lookup
# used to attach a fighter id to every moneyline row.
tapology_bouts = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "bouts.csv"))[["event_id", "fighter_1_id", "fighter_2_id"]]
# Only events that also appear in the moneyline data are relevant.
tapology_bouts = tapology_bouts.loc[tapology_bouts["event_id"].isin(bestfightodds_moneyline_odds["tapology_event_id"])]
# One row per (event, fighter): stack the fighter_1_id / fighter_2_id columns.
stacked_fighters = tapology_bouts.melt(id_vars="event_id", value_vars=["fighter_1_id", "fighter_2_id"], value_name="fighter_id").drop(columns="variable")
tapology_fighters = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "fighters.csv"))[["id", "bestfightodds_id", "name"]]
# Swap the Tapology fighter id for the Best Fight Odds id and expose the name as
# "Bet", the column the moneyline rows are joined on.
stacked_fighters = stacked_fighters.merge(tapology_fighters, left_on="fighter_id", right_on="id", how="left").drop(columns=["id", "fighter_id"]).rename(columns={"bestfightodds_id": "fighter_id", "event_id": "tapology_event_id", "name": "Bet"})
# Lower-case and strip everything that does not survive an NFKD -> ASCII round trip (e.g. accents).
stacked_fighters["Bet"] = stacked_fighters["Bet"].str.lower().str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
# Literal replacement: without regex=False the result depends on the pandas
# version (before 2.0 the pattern was a regex whose "." matches any character).
stacked_fighters["Bet"] = stacked_fighters["Bet"].str.replace("jr.", "junior", regex=False)
# Nullable integer because not every Tapology fighter has a Best Fight Odds id.
stacked_fighters["fighter_id"] = stacked_fighters["fighter_id"].astype("Int64")
bestfightodds_moneyline_odds["Bet"] = bestfightodds_moneyline_odds["Bet"].str.lower()

# replace duplicate names: spelling seen in the odds data -> single spelling to keep
bet_name_aliases = {
    "paulo borrachinha": "paulo costa",
    "paulo henrique costa": "paulo costa",
    "rodrigo lima": "rodrigo de lima",
    "rodrigo goiana de lima": "rodrigo de lima",
    "t.j. waldburger": "tj waldburger",
    "anthony waldburger": "tj waldburger",
    "luiz henrique da silva": "luis henrique da silva",
    "henrique da silva": "luis henrique da silva",
}
for name_variant, kept_name in bet_name_aliases.items():
    bestfightodds_moneyline_odds.loc[bestfightodds_moneyline_odds["Bet"] == name_variant, "Bet"] = kept_name

# Pass 1: exact match on (tapology_event_id, normalised fighter name).
bestfightodds_moneyline_odds2 = bestfightodds_moneyline_odds.merge(stacked_fighters, on=["tapology_event_id", "Bet"], how="left")
good_matches = bestfightodds_moneyline_odds2.loc[bestfightodds_moneyline_odds2["fighter_id"].notnull()]

# Rows still without a fighter_id go on to the next pass; the empty fighter_id
# column is dropped so the next merge can add it again.
bestfightodds_moneyline_odds3 = bestfightodds_moneyline_odds2.loc[bestfightodds_moneyline_odds2["fighter_id"].isnull()].drop(columns=["fighter_id"])

# select rows where fighter_id and tapology_event_id pair is not in good_matches
matched_tuples = set(zip(good_matches["fighter_id"], good_matches["tapology_event_id"]))
stacked_fighters2 = stacked_fighters.copy()
# temp_tuple is a throw-away (fighter_id, tapology_event_id) key used only for the isin filter.
stacked_fighters2["temp_tuple"] = list(zip(stacked_fighters2["fighter_id"], stacked_fighters2["tapology_event_id"]))
stacked_fighters2 = stacked_fighters2.loc[~stacked_fighters2["temp_tuple"].isin(matched_tuples)]
stacked_fighters2 = stacked_fighters2.drop(columns="temp_tuple")

# Pass 2: the Tapology name with its last space-separated token moved to the
# front is compared against the unchanged odds name.
stacked_fighters2["last_name_first"] = stacked_fighters2["Bet"].str.split(" ").str[-1] + " " + stacked_fighters2["Bet"].str.split(" ").str[:-1].str.join(" ")
# keep=False drops every key that occurs more than once within an event, so a key can match at most one fighter.
stacked_fighters2 = stacked_fighters2.drop_duplicates(subset=["tapology_event_id", "last_name_first"], keep=False)
stacked_fighters2 = stacked_fighters2.drop(columns="Bet")

bestfightodds_moneyline_odds3["last_name_first"] = bestfightodds_moneyline_odds3["Bet"].copy()

bestfightodds_moneyline_odds3 = bestfightodds_moneyline_odds3.merge(stacked_fighters2, on=["tapology_event_id", "last_name_first"], how="left")
good_matches2 = bestfightodds_moneyline_odds3.loc[bestfightodds_moneyline_odds3["fighter_id"].notnull()].drop(columns=["last_name_first"])

# Pass 3: match on (tapology_event_id, first space-separated token of the name).
unmatched_after_pass2 = bestfightodds_moneyline_odds3["fighter_id"].isnull()
bestfightodds_moneyline_odds4 = bestfightodds_moneyline_odds3.loc[unmatched_after_pass2].drop(columns=["fighter_id", "last_name_first"])

# fighters already matched in an earlier pass are not candidates any more
matched_tuples2 = matched_tuples | set(zip(good_matches2["fighter_id"], good_matches2["tapology_event_id"]))
stacked_fighters3 = stacked_fighters.assign(
    temp_tuple=list(zip(stacked_fighters["fighter_id"], stacked_fighters["tapology_event_id"]))
)
stacked_fighters3 = stacked_fighters3.loc[~stacked_fighters3["temp_tuple"].isin(matched_tuples2)].drop(columns="temp_tuple")

# first names that occur more than once within an event are ambiguous and removed (keep=False)
stacked_fighters3 = (
    stacked_fighters3
    .assign(first_name=stacked_fighters3["Bet"].str.split(" ").str.get(0))
    .drop_duplicates(subset=["tapology_event_id", "first_name"], keep=False)
    .drop(columns="Bet")
)

bestfightodds_moneyline_odds4["first_name"] = bestfightodds_moneyline_odds4["Bet"].str.split(" ").str.get(0)
bestfightodds_moneyline_odds4 = bestfightodds_moneyline_odds4.merge(stacked_fighters3, on=["tapology_event_id", "first_name"], how="left")

matched_in_pass3 = bestfightodds_moneyline_odds4["fighter_id"].notnull()
good_matches3 = bestfightodds_moneyline_odds4.loc[matched_in_pass3].drop(columns=["first_name"])

# Pass 4: match on (tapology_event_id, last space-separated token of the name).
unmatched_after_pass3 = bestfightodds_moneyline_odds4["fighter_id"].isnull()
bestfightodds_moneyline_odds5 = bestfightodds_moneyline_odds4.loc[unmatched_after_pass3].drop(columns=["fighter_id", "first_name"])

# fighters already matched in an earlier pass are not candidates any more
matched_tuples3 = matched_tuples2 | set(zip(good_matches3["fighter_id"], good_matches3["tapology_event_id"]))
stacked_fighters4 = stacked_fighters.assign(
    temp_tuple=list(zip(stacked_fighters["fighter_id"], stacked_fighters["tapology_event_id"]))
)
stacked_fighters4 = stacked_fighters4.loc[~stacked_fighters4["temp_tuple"].isin(matched_tuples3)].drop(columns="temp_tuple")

# last names that occur more than once within an event are ambiguous and removed (keep=False)
stacked_fighters4 = (
    stacked_fighters4
    .assign(last_name=stacked_fighters4["Bet"].str.split(" ").str.get(-1))
    .drop_duplicates(subset=["tapology_event_id", "last_name"], keep=False)
    .drop(columns="Bet")
)

bestfightodds_moneyline_odds5["last_name"] = bestfightodds_moneyline_odds5["Bet"].str.split(" ").str.get(-1)
bestfightodds_moneyline_odds5 = bestfightodds_moneyline_odds5.merge(stacked_fighters4, on=["tapology_event_id", "last_name"], how="left")

matched_in_pass4 = bestfightodds_moneyline_odds5["fighter_id"].notnull()
good_matches4 = bestfightodds_moneyline_odds5.loc[matched_in_pass4].drop(columns=["last_name"])

# Pass 5: hand-maintained aliases for the names no automatic pass could match.
unmatched_after_pass4 = bestfightodds_moneyline_odds5["fighter_id"].isnull()
bestfightodds_moneyline_odds6 = bestfightodds_moneyline_odds5.loc[unmatched_after_pass4].drop(columns=["fighter_id", "last_name"])

# fighters already matched in an earlier pass are not candidates any more
matched_tuples4 = matched_tuples3 | set(zip(good_matches4["fighter_id"], good_matches4["tapology_event_id"]))
stacked_fighters5 = stacked_fighters.assign(
    temp_tuple=list(zip(stacked_fighters["fighter_id"], stacked_fighters["tapology_event_id"]))
)
stacked_fighters5 = stacked_fighters5.loc[~stacked_fighters5["temp_tuple"].isin(matched_tuples4)].drop(columns="temp_tuple")

# manually assign: normalised Tapology name -> name used in the odds data
manual_name_alt = {
    "zhumabek tursyn": "jumabieke tuerxun",
    "joe duffy": "joseph duffey",
    "cris cyborg": "cristiane justino",
    "abdul razak alhassan": "razak al-hassan",
    "wulijiburen": "wuliji buren",
    "serghei spivac": "sergey spivak",
    "sumudaerji": "su mudaerji",
    "alatengheili": "heili alateng",
    "leonardo guimaraes": "leonardo augusto leleco",
    "aleksei oleinik": "oleksiy oliynyk",
    "seung woo choi": "seungwoo choi",
    "doo ho choi": "dooho choi",
    "dmitry smolyakov": "dmitrii smoliakov",
    "zubaira tukhugov": "zubair tuhugov",
    "luis henrique barbosa": "luiz henrique",
}
for tapology_name, odds_name in manual_name_alt.items():
    stacked_fighters5.loc[stacked_fighters5["Bet"] == tapology_name, "name_alt"] = odds_name

stacked_fighters5 = stacked_fighters5.drop(columns="Bet")

# the odds side is joined on its own, unchanged name
bestfightodds_moneyline_odds6["name_alt"] = bestfightodds_moneyline_odds6["Bet"].copy()
bestfightodds_moneyline_odds6 = bestfightodds_moneyline_odds6.merge(stacked_fighters5, on=["tapology_event_id", "name_alt"], how="left")

matched_in_pass5 = bestfightodds_moneyline_odds6["fighter_id"].notnull()
good_matches5 = bestfightodds_moneyline_odds6.loc[matched_in_pass5].drop(columns=["name_alt"])

# whatever is left has no fighter id after all five passes
bestfightodds_moneyline_odds7 = bestfightodds_moneyline_odds6.loc[bestfightodds_moneyline_odds6["fighter_id"].isnull()].copy()

# Collect the rows matched by the five passes.
moneyline_match_passes = [good_matches, good_matches2, good_matches3, good_matches4, good_matches5]
all_matches = pd.concat(moneyline_match_passes, axis=0, ignore_index=True)

# Build a (fighter_id, event_id) -> (tapology_bout_id, bout_order) lookup in
# Best Fight Odds ids; bout_order is only needed to sort the output.
tapology_bouts2 = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "bouts.csv"))[["id", "event_id", "fighter_1_id", "fighter_2_id", "bout_order"]]
tapology_fighters2 = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "fighters.csv"))[["id", "bestfightodds_id"]]
tapology_fighters2 = tapology_fighters2.rename(columns={"id": "tapology_fighter_id", "bestfightodds_id": "fighter_id"})
tapology_events2 = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "events.csv"))[["id", "bestfightodds_id"]]
tapology_events2 = tapology_events2.rename(columns={"id": "tapology_event_id", "bestfightodds_id": "event_id"})

stacked = (
    tapology_bouts2
    .melt(id_vars=["id", "event_id", "bout_order"], value_vars=["fighter_1_id", "fighter_2_id"], value_name="tapology_fighter_id")
    .drop(columns="variable")
    .rename(columns={"id": "tapology_bout_id", "event_id": "tapology_event_id"})
    .merge(tapology_fighters2, on="tapology_fighter_id", how="inner")
    .merge(tapology_events2, on="tapology_event_id", how="inner")
    .dropna(subset=["fighter_id", "event_id"])
    .reset_index(drop=True)
    .drop(columns=["tapology_fighter_id", "tapology_event_id"])
    .astype({"fighter_id": int, "event_id": int})
)

all_matches = (
    all_matches
    .merge(stacked, on=["fighter_id", "event_id"], how="left")
    .sort_values(by=["event_order", "bout_order", "fighter_id", "betsite", "timestamp"])
    .reset_index(drop=True)
)

moneyline_output_columns = ["event_id", "fighter_id", "betsite", "timestamp", "odds"]
all_matches = all_matches[moneyline_output_columns]
all_matches.to_csv(os.path.join(clean_data_dir, "Best Fight Odds", "moneyline_odds.csv"), index=False)

## FightOdds.io

- `sportsbooks` and `events` are already cleaned

In [None]:
fightoddsio_bouts = pd.read_csv(os.path.join(raw_data_dir, "FightOdds.io", "bouts.csv"))

# split on the is_cancelled flag; the cancelled bouts are kept in their own frame
cancelled_flag = fightoddsio_bouts["is_cancelled"]
cancelled_bouts = fightoddsio_bouts[cancelled_flag == 1].reset_index(drop=True)
fightoddsio_bouts = fightoddsio_bouts[cancelled_flag == 0].reset_index(drop=True)

# "Roun" is not a usable round time, so it is blanked out
unusable_round_time = fightoddsio_bouts["end_round_time"] == "Roun"
fightoddsio_bouts = (
    fightoddsio_bouts
    .assign(
        weight_class=fightoddsio_bouts["weight_class"].str.title(),
        end_round_time=fightoddsio_bouts["end_round_time"].mask(unusable_round_time),
    )
    # nullable integers so missing values do not force floats
    .astype({
        "weight_lbs": "Int64",
        "end_round": "Int64",
        "fighter_1_odds": "Int64",
        "fighter_2_odds": "Int64",
    })
    .drop(columns=["is_cancelled"])
)

fightoddsio_bouts.to_csv(os.path.join(clean_data_dir, "FightOdds.io", "bouts.csv"), index=False)

In [None]:
fighters_raw_path = os.path.join(raw_data_dir, "FightOdds.io", "fighters.csv")
fightoddsio_fighters = pd.read_csv(fighters_raw_path, parse_dates=["date_of_birth"])

# due to cancellation: ids of fighters to exclude from the cleaned file
fighters_never_fought = {
    "RmlnaHRlck5vZGU6MTA0Mjg=",
    "RmlnaHRlck5vZGU6MTA4NDU=",
    "RmlnaHRlck5vZGU6MTA4NTk=",
    "RmlnaHRlck5vZGU6MTAwNDU=",
    "RmlnaHRlck5vZGU6MTIzMDk=",
    "RmlnaHRlck5vZGU6MTIzMTE=",
    "RmlnaHRlck5vZGU6MTQ1OTY=",
    "RmlnaHRlck5vZGU6MTU0NzE=",
    "RmlnaHRlck5vZGU6MTY1NjM=",
    "RmlnaHRlck5vZGU6MTY4NTE=",
    "RmlnaHRlck5vZGU6MTgzODM=",
    "RmlnaHRlck5vZGU6MTk5NTE=",
    "RmlnaHRlck5vZGU6MTkzMDc=",
    "RmlnaHRlck5vZGU6MjAwNzU=",
    "RmlnaHRlck5vZGU6MjAyMA==",
    "RmlnaHRlck5vZGU6MjAzMzM=",
    "RmlnaHRlck5vZGU6MjE3MA==",
    "RmlnaHRlck5vZGU6MjEzMzk=",
    "RmlnaHRlck5vZGU6MjM2MTE=",
    "RmlnaHRlck5vZGU6MjgxOTA=",
    "RmlnaHRlck5vZGU6MjgyOTk=",
    "RmlnaHRlck5vZGU6Mjk5NTA=",
    "RmlnaHRlck5vZGU6MjkzMjg=",
    "RmlnaHRlck5vZGU6MzAxOTE=",
    "RmlnaHRlck5vZGU6MzU2OA==",
    "RmlnaHRlck5vZGU6MzYyNjI=",
    "RmlnaHRlck5vZGU6Mzc3MA==",
    "RmlnaHRlck5vZGU6NDAyNw==",
    "RmlnaHRlck5vZGU6NDI2Mjc=",
    "RmlnaHRlck5vZGU6NDUxNzM=",
    "RmlnaHRlck5vZGU6NDc4MTI=",
    "RmlnaHRlck5vZGU6NDcwNjg=",
    "RmlnaHRlck5vZGU6NTI1MDE=",
    "RmlnaHRlck5vZGU6NTIxOTE=",
    "RmlnaHRlck5vZGU6NTgzOTQ=",
    "RmlnaHRlck5vZGU6NTk4NA==",
    "RmlnaHRlck5vZGU6NjczMQ==",
    "RmlnaHRlck5vZGU6Njg4Mw==",
    "RmlnaHRlck5vZGU6NzM3Mg==",
    "RmlnaHRlck5vZGU6ODEzNg==",
}

# drop the excluded fighters, then order the rest by pk
has_fought = ~fightoddsio_fighters["id"].isin(fighters_never_fought)
fightoddsio_fighters = fightoddsio_fighters[has_fought]
fightoddsio_fighters = fightoddsio_fighters.sort_values(by="pk").reset_index(drop=True)
# nullable integer so missing leg reach does not force floats
fightoddsio_fighters = fightoddsio_fighters.astype({"leg_reach_inches": "Int64"})

# Collapse the free-text fighting styles onto a small fixed vocabulary.
# canonical label -> every spelling that should be rewritten to it
fighting_style_variants = {
    "Striker": [
        "Stirker",
        "Strker",
        "Power Striker",
        "Pressure Striker",
        "Technical Striker",
    ],
    "Wrestler": [
        "Wrestling",
        "Sambo",
    ],
    "Grappler": [
        "Brazilian Jiu-Jitsu",
        "BJJ Grappler",
        "Judo Grappler",
    ],
    "Striker/Grappler": [
        "BJJ Grappler / Striker",
        "Striker / BJJ Grappler",
        "Striker / Bjj Grappler",
        "Power Striker/Grappler",
        "Striker/ Grappler",
        "Striker / Grappler",
        "Striker / Judo",
    ],
    "Striker/Wrestler": [
        "Striker / Sambo",
        "Stirker/Wrestler",
        "Power Striker/Wrestler",
        "Boxer / Wrestler",
        "Wrestler / Boxing",
        "Striker / Wrestler",
        "Striker/ Wrestler",
    ],
    "Wrestler/Grappler": [
        "Wrestler / BJJ Grappler",
    ],
}
fighting_style_fixes = {
    variant: canonical
    for canonical, variants in fighting_style_variants.items()
    for variant in variants
}
# exact-value replacement; labels not listed above are left untouched
fightoddsio_fighters["fighting_style"] = fightoddsio_fighters["fighting_style"].replace(fighting_style_fixes)

# one fighter gets its style set by id, whatever the scraped value was
fightoddsio_fighters.loc[fightoddsio_fighters["id"] == "RmlnaHRlck5vZGU6NDExMA==", "fighting_style"] = "Grappler"

fightoddsio_fighters["nationality"] = fightoddsio_fighters["nationality"].str.strip()

fighters_clean_path = os.path.join(clean_data_dir, "FightOdds.io", "fighters.csv")
fightoddsio_fighters.to_csv(fighters_clean_path, index=False)

In [149]:
# odds summaries of cancelled bouts are excluded
cancelled_bout_ids = cancelled_bouts["id"].unique()
fightoddsio_moneyline_odds = pd.read_csv(os.path.join(raw_data_dir, "FightOdds.io", "moneyline_odds_summaries.csv"))
belongs_to_cancelled_bout = fightoddsio_moneyline_odds["bout_id"].isin(cancelled_bout_ids)
fightoddsio_moneyline_odds = fightoddsio_moneyline_odds[~belongs_to_cancelled_bout].reset_index(drop=True)

odds_cols = [
    "fighter_1_odds_open",
    "fighter_1_odds_worst",
    "fighter_1_odds_current",
    "fighter_1_odds_best",
    "fighter_2_odds_open",
    "fighter_2_odds_worst",
    "fighter_2_odds_current",
    "fighter_2_odds_best",
]
# a row is only removed when every one of its odds columns is missing
fightoddsio_moneyline_odds = fightoddsio_moneyline_odds.dropna(subset=odds_cols, how="all").reset_index(drop=True)

# nullable integers: individual odds may still be missing
fightoddsio_moneyline_odds[odds_cols] = fightoddsio_moneyline_odds[odds_cols].astype("Int64")

fightoddsio_moneyline_odds.to_csv(os.path.join(clean_data_dir, "FightOdds.io", "moneyline_odds.csv"), index=False)

In [None]:
proposition_raw_path = os.path.join(raw_data_dir, "FightOdds.io", "expected_outcome_summaries.csv")
# fighter_pk is read as a nullable integer
fightoddsio_proposition_odds = pd.read_csv(proposition_raw_path).astype({"fighter_pk": "Int64"})

# ignore totals since the values for these offer types were not scraped
is_total_offer = fightoddsio_proposition_odds["offer_type_id"].isin(["TOTAL_SS", "TOTAL_TD"])
fightoddsio_proposition_odds = fightoddsio_proposition_odds[~is_total_offer].reset_index(drop=True)

proposition_clean_path = os.path.join(clean_data_dir, "FightOdds.io", "proposition_odds.csv")
fightoddsio_proposition_odds.to_csv(proposition_clean_path, index=False)

## Tapology

In [116]:
tapology_events = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "events.csv")).drop(columns="wikipedia_url")

# keep only the token after the last "-" of the scraped bestfightodds_id, as a nullable integer
bestfightodds_id_suffix = tapology_events["bestfightodds_id"].str.split("-").str[-1]
tapology_events["bestfightodds_id"] = bestfightodds_id_suffix.astype("Int64")

# fill in missing bestfightodds ids: tapology event id -> bestfightodds id
manual_bestfightodds_ids = {
    "24353-ufc-179": 855,
    "30831-ufc-193": 1002,
    "37906-ufc-fight-night-87-jacare-vs-belfort": 1070,
    "38668-ufc-199-rockhold-vs-weidman-2": 1081,
    "41373-ufc-207": 1210,
    "43945-ufc-211": 1252,
    "38669-ufc-fight-night-88": 1104,
    "28239-ufc-fight-night-65": 936,
    "44376-ufc-fight-night": 1277,
    "44562-ufc-fight-night-110": 1275,
    "26376-ufc-fight-night-55": 894,
    "45800-ufc-fight-night": 1332,
    "44728-ufc-fight-night": 1310,
    "16683-ufc-on-fox-9": 729,
}
for tapology_event_key, bestfightodds_event_id in manual_bestfightodds_ids.items():
    tapology_events.loc[tapology_events["id"] == tapology_event_key, "bestfightodds_id"] = bestfightodds_event_id

tapology_events.to_csv(os.path.join(clean_data_dir, "Tapology", "events.csv"), index=False)

In [14]:
tapology_bouts = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "bouts.csv"))

# isolate fighter-gym-bout relationships: one frame per corner, sharing column names
temp = tapology_bouts[["id", "fighter_1_id", "fighter_2_id", "fighter_1_gym_info", "fighter_1_gym_ids", "fighter_2_gym_info", "fighter_2_gym_ids"]].copy()

temp_f1 = temp[["fighter_1_id", "id", "fighter_1_gym_ids", "fighter_1_gym_info"]].rename(
    columns={"id": "bout_id", "fighter_1_id": "fighter_id", "fighter_1_gym_ids": "gym_ids", "fighter_1_gym_info": "gym_info"}
)
temp_f2 = temp[["fighter_2_id", "id", "fighter_2_gym_ids", "fighter_2_gym_info"]].rename(
    columns={"id": "bout_id", "fighter_2_id": "fighter_id", "fighter_2_gym_ids": "gym_ids", "fighter_2_gym_info": "gym_info"}
)

# interleave both corners by original bout row, then drop rows with no gym data at all
tapology_fighter_gyms_by_bout = (
    pd.concat([temp_f1, temp_f2])
    .sort_index()
    .reset_index(drop=True)
    .dropna(subset=["gym_info", "gym_ids"], how="all")
)

# fill in missing gym ids only if gym id is missing: first by gym name, then by alternative name
# NOTE: this reads the *clean* gyms.csv, which the Tapology gyms cell further down writes.
tapology_gyms = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "gyms.csv"))
name_to_id = tapology_gyms.set_index("name")["id"].to_dict()
name_alt_to_id = tapology_gyms.set_index("name_alternative")["id"].to_dict()
for _gym_lookup in (name_to_id, name_alt_to_id):
    # recompute the mask each pass so the second lookup only touches ids still missing
    _missing_gym_id = tapology_fighter_gyms_by_bout["gym_ids"].isnull()
    tapology_fighter_gyms_by_bout.loc[_missing_gym_id, "gym_ids"] = (
        tapology_fighter_gyms_by_bout.loc[_missing_gym_id, "gym_info"].map(_gym_lookup)
    )

# rename duplicate gym names
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"].isin(["Ataque Duplo / Team Tavares", "Ataque Duplo / Thiago Tavares Team"]),
    "gym_info",
] = "Ataque Duplo"

# fill in missing gyms (gym_info text -> Tapology gym id); applied to every matching row
manual_gym_ids = {
    "Beverly Hills Jiu-Jitsu Club": "3224-beverly-hills-jiu-jitsu-club",
    "Black House": "779-black-house-mma",
    "Black Tiger Team": "5048-black-tiger-fight-club",
    "Cesar Gracie Jiu-Jitsu": "783-cesar-gracie-jiu-jitsu",
    "Chute Boxe Academy": "5110-chute-boxe-academy",
    "Dragon's Lair MMA": "6182-dragons-lair-melksham",
    "Elite Training Center": "2607-elite-training-center",
    "FFC Gym": "9865-ffc-team",
    "Fisticuff/Purebred Omiya": "5967-fisticuffspurebred",
    "Frankiko Team / Trator Team": "9465-frankiko-team",
    "Gile Ribeiro Team / Noguchi": "9213-gile-ribeiro-team",
    "Gladiators Training Academy": "2544-eugene-jacksons-undisputed-gladiators",
    "HIT Squad": "1170-finneys-hit-squad",
    "Hilti NHB": "522-hilti-bjj",
    "IN FIGHT JAPAN": "4366-infight-japan",
    "Integrated Fighting": "1214-integrated-fighting-academy",
    "Kaisho Martial Arts": "9339-kaisho-kampsport-klubb",
    "Lovato's BJJ": "5236-lovatos-brazilian-jiu-jitsu-and-mma",
    "MMA Clinic": "5247-the-mma-clinic",
    "Next Generation UK": "3263-next-generation-mma-liverpool",
    "Perfect Team": "4933-perfect-team-mma",
    "Pro Athletes": "7684-proathlets",
    "Ralph Gracie": "12357-ralph-gracie-jiu-jitsu",
    "Range": "10526-range-martial-arts-academy",
    "Renyi Fight Camp": "2118-renyi",
    "Ruas Vale Tudo": "4382-ruas-vale-tudo",
    "Sure Grip Vale Tudo": "507-team-sure-grip",
    "Tapout Training Center": "731-tapout-training-center",
    "The Jungle MMA": "117-the-jungle-mma-fitness",
    "Throwdown Elite Training Center": "732-throwdown-training-center",
    "Throwdown Training Center": "732-throwdown-training-center",
    "Todd Medina Freestyle Team": "4478-tod-medinas-fight-school",
    "Trojan Freefighters": "8081-trojan-free-fighters-gloucester",
    "VT-1 Gym": "11234-vt1-martial-arts",
    "VT1 Gym": "11234-vt1-martial-arts",
    "West Coast Fight Team": "1106-west-coast-fight-team",
    "Wolfslair Academy": "3321-wolfslair-mma-academy",
    "Wolfslair MMA Academy": "3321-wolfslair-mma-academy",
    "World Class MMA": "5210-world-class-mma",
    "Xplode MMA": "3389-xplode-mma",
    "Team Xplode MMA": "3389-xplode-mma",
    "Zen Jiu-Jitsu": "4058-zen-jiu-jitsu",
}
for _gym_info, _gym_id in manual_gym_ids.items():
    tapology_fighter_gyms_by_bout.loc[tapology_fighter_gyms_by_bout["gym_info"] == _gym_info, "gym_ids"] = _gym_id

# explode gym ids and gym info into separate rows
for _list_col in ("gym_ids", "gym_info"):
    tapology_fighter_gyms_by_bout[_list_col] = tapology_fighter_gyms_by_bout[_list_col].str.split("; ")

tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout.explode(["gym_ids", "gym_info"]).reset_index(drop=True)

# whatever sits between the outermost parentheses of gym_info is taken as the gym's purpose
tapology_fighter_gyms_by_bout["gym_purpose"] = tapology_fighter_gyms_by_bout["gym_info"].str.extract(r"\((.*)\)")[0]
# "St. Petersburg" is blanked out rather than kept as a purpose
tapology_fighter_gyms_by_bout.loc[tapology_fighter_gyms_by_bout["gym_purpose"] == "St. Petersburg", "gym_purpose"] = np.nan
tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout.rename(columns={"gym_ids": "gym_id"})

# attach the canonical gym name for every resolved gym id
gyms = (
    pd.read_csv(os.path.join(clean_data_dir, "Tapology", "gyms.csv"))[["id", "name"]]
    .rename(columns={"id": "gym_id", "name": "gym_name"})
)
tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout.merge(gyms, on="gym_id", how="left")

# fill gym_name with gym_info if gym_name is missing
_unnamed_gym = tapology_fighter_gyms_by_bout["gym_name"].isnull()
tapology_fighter_gyms_by_bout.loc[_unnamed_gym, "gym_name"] = tapology_fighter_gyms_by_bout.loc[_unnamed_gym, "gym_info"]

tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout[["fighter_id", "bout_id", "gym_id", "gym_name", "gym_purpose"]]

tapology_fighter_gyms_by_bout.to_csv(os.path.join(clean_data_dir, "Tapology", "fighter_gyms.csv"), index=False)

In [479]:
# Load the raw Tapology fighter table; `date_of_birth` is parsed as a date on read.
tapology_fighters = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "fighters.csv"), parse_dates=["date_of_birth"])

def convert_height(height_str):
    """Convert a feet-and-inches height string to total inches.

    Anything from the first "(" onward is ignored and double quotes are
    removed, so a value such as ``5'11" (180cm)`` becomes ``71``.

    Parameters
    ----------
    height_str : str or missing
        Height text in ``feet'inches`` form, or a missing value.

    Returns
    -------
    int or float
        Total inches as an int, or ``np.nan`` when the input is missing.

    Raises
    ------
    ValueError
        If the text does not split into exactly two integer parts on ``'``.
    """
    if pd.isna(height_str):
        return np.nan

    imperial_text, _, _ = height_str.partition("(")
    imperial_text = imperial_text.replace('"', "").strip()

    # exactly two integer tokens are expected: feet and inches
    feet, inches = [int(token) for token in imperial_text.split("'")]
    return 12 * feet + inches

# Store height as whole inches under an explicit unit name; Int64 keeps missing heights as <NA>.
tapology_fighters = tapology_fighters.rename(columns={"height": "height_inches"})
tapology_fighters["height_inches"] = tapology_fighters["height_inches"].apply(convert_height).astype("Int64")

def clean_reach(reach_str):
    """Parse a reach string into a float number of inches.

    Anything from the first "(" onward is ignored and double quotes are
    removed before the remaining text is converted with ``float``.

    Parameters
    ----------
    reach_str : str or missing
        Reach text such as ``72.5" (184cm)``, or a missing value.

    Returns
    -------
    float
        The parsed reach, or ``np.nan`` when the input is missing.
    """
    if not pd.isna(reach_str):
        inches_text, _, _ = reach_str.partition("(")
        return float(inches_text.replace('"', "").strip())

    return np.nan

# Reach is stored in inches under an explicit unit name.
tapology_fighters = tapology_fighters.rename(columns={"reach": "reach_inches"})
tapology_fighters["reach_inches"] = tapology_fighters["reach_inches"].apply(clean_reach)

# keep only the trailing "-"-separated token of the Best Fight Odds slug, as a nullable integer
tapology_fighters["bestfightodds_id"] = (
    tapology_fighters["bestfightodds_id"].str.split("-").str[-1].astype("Int64")
)

# fill in missing bestfightodds ids (Tapology fighter id -> Best Fight Odds id)
fighter_bestfightodds_id_overrides = {
    "70996-fredy-serrano": 5321,
    "43585-levan-makashvili": 5499,
    "24897-sirwan-kakai-zohan": 2503,
    "41568-tony-sims": 5715,
    "20516-andrew-holbrook": 5727,
    "89121-adam-yandiev": 8459,
    "74166-jin-soo-son": 8537,
    "123362-michel-batista": 7819,
    "24429-reginaldo-vieira": 5722,
    "4420-erik-montano": 5985,
    "33349-geane-herrera-la-pulga": 5729,
    "steve-bosse": 5606,
    "60336-james-mulheron": 6427,
    "4430-bojan-mihajlovic": 6191,
    "25001-bharat-khandare-daring": 7727,
    "90671-carls-john-de-tomas-goldenboy": 7126,
    "55567-jesus-pinedo": 8714,
    "24602-joe-meunier": 6502,
    "39663-abdul-kerim-edilov": 7351,
    "98431-khalid-murtazaliev": 8169,
    "mark-scanlon-scanno": 2162,
    "jesse-bongfeldt-water": 2169,
    "65391-kwan-ho-kwak": 6818,
    "19970-yusuke-kasuya": 5793,
    "alex-ricci": 2588,
    "4847-cindy-dandois-battlecat": 5334,
    "11166-wagner-campos-galeto": 3422,
    "delson-heleno-pe-de-chumbo": 329,
    "7793-anistavo-gasparzinho": 3417,
    "31203-pedro-nobre-the-rock": 3819,
    "34691-adam-cella": 4006,
    "37069-justin-jones": 4930,
    "15647-emily-kagen": 3802,
    "42696-bentley-syler": 5320,
    "27661-rocky-lee": 5528,
    "44658-izabela-badurek": 5540,
    "23167-jonavin-webb": 5597,
    "50067-ericka-almeida": 5633,
    "steven-kennedy": 5684,
    "11579-steve-montgomery": 5070,
    "32886-fernando-bruno": 5719,
    "21482-roger-zapata-viva": 5511,
    "55402-bruno-korea": 5922,
    "12613-anthony-christodoulou-tony": 5416,
    "12724-lukasz-sajewski-wookie": 5113,
    "54486-joe-merritt": 5714,
    "32791-anton-zafir": 5970,
    "85356-vernon-ramos": 5989,
    "36474-enrique-marin": 5984,
    "abner-lloveras": 6087,
    "43005-joey-gomez": 6023,
    "10444-mehdi-baghdad": 4650,
    "felipe-olivieri": 5966,
    "51885-kelly-faszholz": 6036,
    "82492-cristina-stanciu": 6269,
    "18345-cody-east-the-freight-train": 5221,
    "46288-jason-novelli": 5779,
    "54222-chris-avila": 5381,
    "16092-leonardo-guimaraes-leleco": 6243,
}
for _fighter_id, _bestfightodds_id in fighter_bestfightodds_id_overrides.items():
    tapology_fighters.loc[tapology_fighters["id"] == _fighter_id, "bestfightodds_id"] = _bestfightodds_id

tapology_fighters.to_csv(os.path.join(clean_data_dir, "Tapology", "fighters.csv"), index=False)

In [None]:
tapology_gyms = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "gyms.csv")).drop(columns=["parent_name"])

# Add parent ids to gyms where applicable
# Diagnostic: parent ids that are referenced but have no gym row of their own.
has_unknown_parent = ~tapology_gyms["parent_id"].isin(tapology_gyms["id"])
parent_ids_not_in_gym_ids = tapology_gyms.loc[has_unknown_parent, "parent_id"].dropna().unique()

# Hand-entered rows for gyms appended below, written as tuples in this column order.
_parent_gym_fields = ("id", "name", "name_alternative", "location", "parent_id")
tapology_parent_gyms_missing = [
    dict(zip(_parent_gym_fields, record))
    for record in [
        ("122-ultimate-athletics", "Ultimate Athletics", None, "Ithaca, New York", None),
        ("166-paraestra-tokyo", "Paraestra Tokyo", "パラエストラ東京", "Tokyo, Japan", None),
        ("1891-ysa", "YSA", "YAMAMOTO SPORTS ACADEMY", "Tokyo, Japan", None),
        ("2988-john-frankl-brazilian-jiu-jitsu", "John Frankl Brazilian Jiu Jitsu", "John Frankl Brazilian Jiu-Jitsu Competition Team, JFBJJ", "Seoul, South Korea", None),
        ("3547-reversal-gym-tokyo-standout", "Reversal Gym Tokyo Standout", "リバーサルジム東京スタンドアウト(STANDOUT)", "Shibuya, Tokyo, Japan", None),
        ("3595-shooto", "Japan Shooto Association", None, "Japan", None),
        ("366-premier-martial-arts-watertown", "Premier Martial Arts Watertown", "PMA MMA", "Watertown, New York", None),
        ("3667-ps-lab-tokyo", "P's Lab Tokyo", "P's LAB東京 ゴールドジム原宿, パンクラスP'sLAB東京", "Shibuya, Tokyo, Japan", None),
        ("3930-nova-unio-argentina", "Nova União Argentina", None, "Argentina", "53-nova-unio"),
        ("4276-ufc-gym", "UFC Gym", None, "Santa Ana, California", None),
        ("615-straight-blast-gym-international", "SBGi Portland", "Straight Blast Gym International Portland", "Portland, Oregon", None),
        ("700-fang-shen-do", "Fang Shen Do", None, "Canada", None),
        ("731-tapout-training-center", "Tapout Training Center Las Vegas", None, "Las Vegas, Nevada", None),
        ("766-10th-planet-jiu-jitsu-van-nuys", "10th Planet Jiu Jitsu Van Nuys", None, "Van Nuys, California", "228-10th-planet-jiu-jitsu"),
        ("9533-chute-boxe", "Chute Boxe", None, "Curitiba, Parana, Brazil", None),
    ]
]

tapology_parent_gyms_df = pd.DataFrame(tapology_parent_gyms_missing)
tapology_gyms = (
    pd.concat([tapology_gyms, tapology_parent_gyms_df], ignore_index=True)
    .sort_values(by="id")
    .reset_index(drop=True)
)
# normalise the typographic apostrophe to a plain one in gym names
tapology_gyms["name"] = tapology_gyms["name"].str.replace("’", "'")

tapology_gyms.to_csv(os.path.join(clean_data_dir, "Tapology", "gyms.csv"), index=False)

In [50]:
# Attach a Tapology fighter id to every community-pick row. The picks are matched
# to the fighters of their bout on the last whitespace-separated token of the name.
tapology_community_picks = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "community_picks.csv"))
tapology_community_picks["fighter_last_name"] = tapology_community_picks["fighter_last_name"].str.split().str[-1]

# Check for identical fighter last names in the same bout, manually assigned fighter ids
identical_names = tapology_community_picks.groupby('bout_id')['fighter_last_name'].transform(lambda x: x.nunique() == 1)
subset_identical_names = tapology_community_picks.loc[identical_names, :].copy()
# NOTE(review): this is a positional assignment. It only works while exactly four rows
# are flagged, in this row order; any other count raises a length-mismatch error.
subset_identical_names.loc[:, "fighter_id"] = ["43895-roberto-sanchez", "28639-joby-sanchez", "69225-karine-silva-killer", "57056-ariane-lipski"]

# otherwise just join with bouts
subset_other = tapology_community_picks.loc[~identical_names, :].copy()
tapology_bouts = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "bouts.csv"))[["id", "fighter_1_id", "fighter_2_id"]]
fighter_1_stuff = tapology_bouts[["id", "fighter_1_id"]].rename(columns={"fighter_1_id": "fighter_id"})
fighter_2_stuff = tapology_bouts[["id", "fighter_2_id"]].rename(columns={"fighter_2_id": "fighter_id"})

# one row per (bout, fighter), carrying the fighter's last name as the join key
bout_to_fighter_stacked = pd.concat([fighter_1_stuff, fighter_2_stuff], ignore_index=True).reset_index(drop=True)
tapology_fighters = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "fighters.csv")).rename(columns={"id": "fighter_id"})
bout_to_fighter_stacked = bout_to_fighter_stacked.merge(tapology_fighters[["fighter_id", "name"]], on="fighter_id", how="left")
bout_to_fighter_stacked["fighter_last_name"] = bout_to_fighter_stacked["name"].str.split().str[-1]
bout_to_fighter_stacked = bout_to_fighter_stacked.rename(columns={"id": "bout_id"}).drop(columns=["name"])

# merge() returns a fresh RangeIndex; restore the original row labels so the two
# subsets can be put back in their original order with sort_index() below
subset_other = subset_other.merge(bout_to_fighter_stacked, on=["bout_id", "fighter_last_name"], how="left").set_axis(subset_other.index, axis=0)


tapology_community_picks_clean = pd.concat([subset_identical_names, subset_other]).sort_index()
# .copy() makes the column subset an independent frame, so the assignment below
# does not raise SettingWithCopyWarning
tapology_community_picks_clean = tapology_community_picks_clean[[
    "bout_id", "fighter_id", "ko_tko_percentage", "submission_percentage", "decision_percentage", "overall_percentage", "num_picks"
]].copy()
tapology_community_picks_clean["overall_percentage"] = tapology_community_picks_clean["overall_percentage"].astype(int)
tapology_community_picks_clean.to_csv(os.path.join(clean_data_dir, "Tapology", "community_picks.csv"), index=False)

In [42]:
tapology_rehydration_weights = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "csac_rehydration_weights.csv"))

# raw column -> cleaned column name carrying the unit
rehydration_lbs_columns = {
    "weigh_in_result": "weigh_in_result_lbs",
    "fight_night_weight": "fight_night_weight_lbs",
    "weight_gain": "weight_gain_lbs",
}

# every weight column gets the same parse: keep the text before "(", drop "lbs", cast to float
for _raw_column in rehydration_lbs_columns:
    tapology_rehydration_weights[_raw_column] = (
        tapology_rehydration_weights[_raw_column]
        .str.split("(")
        .str[0]
        .str.replace("lbs", "")
        .astype(float)
    )

tapology_rehydration_weights = tapology_rehydration_weights.rename(columns=rehydration_lbs_columns)

tapology_rehydration_weights.to_csv(os.path.join(clean_data_dir, "Tapology", "rehydration_weights.csv"), index=False)

In [98]:
# Load the raw bouts and discard the gym columns and the round/time info column.
tapology_bouts = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "bouts.csv")).drop(
    columns=[
        "fighter_1_gym_info",
        "fighter_1_gym_ids",
        "fighter_2_gym_info",
        "fighter_2_gym_ids",
        "end_round_time_info",
    ]
)

# split "method, details" on the first ", " only
tapology_bouts[["outcome_method", "outcome_method_details"]] = (
    tapology_bouts["outcome_method"].str.split(", ", n=1, expand=True)
)

# billing keeps only the text before the first " ("
tapology_bouts["billing"] = (
    tapology_bouts["billing"].str.split(" (", regex=False).str[0].str.strip()
)

def get_final_weight_class_weight_lbs(weight_class_str):
    """Return the weight limit in lbs found in a weight-class string.

    The string is split on " (" and the first piece containing "lbs" is
    used, e.g. ``"Lightweight (155 lbs)"`` gives ``155``. Fractional
    values are truncated by ``int``.

    Parameters
    ----------
    weight_class_str : str or missing
        Weight-class text, or a missing value.

    Returns
    -------
    int or float
        The weight as an int, or ``np.nan`` when the input is missing.

    Raises
    ------
    IndexError
        If no piece of the string contains "lbs".
    """
    if pd.isna(weight_class_str):
        return np.nan

    lbs_values = []
    for piece in weight_class_str.split(" ("):
        if "lbs" in piece:
            lbs_values.append(piece.replace(")", "").replace(" lbs", ""))

    return int(float(lbs_values[0]))

# final (contracted) weight limit for each bout, parsed from the weight-class text
tapology_bouts = tapology_bouts.assign(
    weight_class_final_weight_lbs=lambda bouts: bouts["weight_class"].apply(get_final_weight_class_weight_lbs)
)

def get_original_weight_class_weight_lbs(weight_class_str):
    """Return the weight a bout was re-scheduled from, when the string says so.

    Only strings containing "re-scheduled" are parsed. The first " ("-separated
    piece containing "re-scheduled" is stripped of ")" and of the
    "re-scheduled from " prefix, and the remainder is converted to an int.

    Parameters
    ----------
    weight_class_str : str or missing
        Weight-class text, or a missing value.

    Returns
    -------
    int or float
        The original weight as an int, or ``np.nan`` when the input is
        missing or does not mention a re-scheduling.
    """
    if pd.isna(weight_class_str):
        return np.nan
    if "re-scheduled" not in weight_class_str:
        return np.nan

    rescheduled_values = []
    for piece in weight_class_str.split(" ("):
        if "re-scheduled" in piece:
            rescheduled_values.append(piece.replace(")", "").replace("re-scheduled from ", ""))

    return int(float(rescheduled_values[0]))

# weight limit the bout was re-scheduled from (NaN when the text mentions no re-scheduling)
tapology_bouts = tapology_bouts.assign(
    weight_class_original_weight_lbs=lambda bouts: bouts["weight_class"].apply(get_original_weight_class_weight_lbs)
)

# Parse per-fighter odds and weights. In both cases only the text before the
# first " (" is kept; weights additionally lose their "lbs" suffix.
for _corner in ("fighter_1", "fighter_2"):
    tapology_bouts[f"{_corner}_odds"] = (
        tapology_bouts[f"{_corner}_odds"].str.split(" (", regex=False).str[0].str.strip().astype(float)
    )
for _corner in ("fighter_1", "fighter_2"):
    tapology_bouts[f"{_corner}_weight_lbs"] = (
        tapology_bouts[f"{_corner}_weight"]
        .str.split(" (", regex=False)
        .str[0]
        .str.replace("lbs", "")
        .str.strip()
        .astype(float)
    )

# Select the output columns and cast the integer-valued ones in a single step.
# `astype` returns a new frame, so no value is ever assigned into a column
# subset of the previous frame (which raises SettingWithCopyWarning).
tapology_bouts = tapology_bouts[
    [
        "id",
        "ufcstats_id",
        "event_id",
        "bout_order",
        "fighter_1_id",
        "fighter_2_id",
        "billing",
        "weight_class_final_weight_lbs",
        "weight_class_original_weight_lbs",
        "outcome_method",
        "outcome_method_details",
        "fighter_1_odds",
        "fighter_2_odds",
        "fighter_1_weight_lbs",
        "fighter_2_weight_lbs",
    ]
].astype(
    {
        # nullable integers keep missing values as <NA> instead of forcing floats
        "weight_class_final_weight_lbs": "Int64",
        "weight_class_original_weight_lbs": "Int64",
        "fighter_1_odds": "Int64",
        "fighter_2_odds": "Int64",
    }
)

tapology_bouts.to_csv(os.path.join(clean_data_dir, "Tapology", "bouts.csv"), index=False)

In [None]:
# Load fighter histories, keep only bouts that actually took place, and number
# each fighter's remaining bouts in file order (1-based).
tapology_fighter_histories = (
    pd.read_csv(os.path.join(raw_data_dir, "Tapology", "fighter_histories.csv"))
    .loc[lambda history: ~history["outcome"].isin(["upcoming", "cancelled"])]
    .reset_index(drop=True)
    .assign(
        outcome=lambda history: history["outcome"].str.title(),
        order=lambda history: history.groupby("fighter_id").cumcount() + 1,
    )
    # the raw weight_class text is parked under a temporary name so cleaned
    # columns can reuse the `weight_class` name
    .rename(columns={"weight_class": "weight_class_TEMP", "bout_id_int": "bout_id_integer"})
)

# extract weight class name
def extract_weight_class(weight_class_str):
    """Return the weight-class name from a " · "-separated weight-class string.

    Only the first segment is inspected. It is taken as the class name unless
    it starts with a digit or with "Weigh-In:".

    Parameters
    ----------
    weight_class_str : str or missing
        Raw weight-class text, or a missing value.

    Returns
    -------
    str or None
        The stripped first segment, or ``None`` when the input is missing,
        the first segment is empty, or it does not look like a class name.
    """
    if pd.isna(weight_class_str):
        return None

    first_segment = weight_class_str.split(" · ")[0].strip()

    # an empty leading segment has no first character to inspect
    if not first_segment:
        return None

    if first_segment[0].isdigit() or first_segment.startswith("Weigh-In:"):
        return None

    return first_segment
    
# weight-class name taken from the raw weight-class text (None when it carries no name)
tapology_fighter_histories = tapology_fighter_histories.assign(
    weight_class=lambda history: history["weight_class_TEMP"].apply(extract_weight_class)
)

# extract weight class weight
def extract_weight_class_lbs(weight_class_str):
    """Return the weight limit in lbs from a " · "-separated weight-class string.

    The first segment that starts with a digit and contains an "lbs" part is
    used. Within it, the first " ("-separated part containing "lbs" is
    converted to a float. Segments that are empty or have no "lbs" part are
    skipped.

    Parameters
    ----------
    weight_class_str : str or missing
        Raw weight-class text, or a missing value.

    Returns
    -------
    float or None
        The weight in lbs, or ``None`` when the input is missing or no
        qualifying segment exists.
    """
    if pd.isna(weight_class_str):
        return None

    for segment in weight_class_str.split(" · "):
        segment = segment.strip()

        # a "Weigh-In:" segment starts with a letter, so the digit test already excludes it
        if not segment or not segment[0].isdigit():
            continue

        lbs_parts = [part for part in segment.split(" (") if "lbs" in part]
        if not lbs_parts:
            continue

        return float(lbs_parts[0].replace("lbs", "").replace(")", "").strip())

    return None

# weight limit in lbs taken from the raw weight-class text
tapology_fighter_histories = tapology_fighter_histories.assign(
    weight_class_lbs=lambda history: history["weight_class_TEMP"].apply(extract_weight_class_lbs)
)

def get_weigh_in_result_lbs(row):
    """Return a fighter's weigh-in result in lbs for one history row.

    The ``weight`` field is used when present. Otherwise the value is taken
    from the "Weigh-In:" segment of ``weight_class_TEMP``, if there is one.

    Parameters
    ----------
    row : mapping
        Must provide ``"weight"``; ``"weight_class_TEMP"`` is read only when
        ``weight`` is missing.

    Returns
    -------
    float or None
        The weigh-in weight, or ``None`` when neither field provides one.
    """
    weight_text = row["weight"]
    if not pd.isna(weight_text):
        lbs_part = [part for part in weight_text.split(" (") if "lbs" in part][0]
        return float(lbs_part.replace("lbs", "").replace(")", "").strip())

    # fall back to the "Weigh-In:" segment embedded in the weight-class text
    weight_class_text = row["weight_class_TEMP"]
    if pd.isna(weight_class_text) or "Weigh-In:" not in weight_class_text:
        return None

    weigh_in_segment = [
        segment.strip() for segment in weight_class_text.split(" · ") if "Weigh-In:" in segment
    ][0]
    lbs_part = [part for part in weigh_in_segment.split(" (") if "lbs" in part][0]
    cleaned = lbs_part.replace("lbs", "").replace(")", "").replace("Weigh-In:", "")

    return float(cleaned.strip())

# Row-wise weigh-in result, plus nullable-integer casts for the odds and the
# pick'em percentage (the latter with its "%" sign removed).
tapology_fighter_histories = tapology_fighter_histories.assign(
    weigh_in_result_lbs=lambda history: history.apply(get_weigh_in_result_lbs, axis=1),  # type: ignore
    odds=lambda history: history["odds"].astype("Int64"),
    pick_em_percent=lambda history: history["pick_em"].str.replace("%", "").astype("Int64"),
)

def extract_outcome_stuff(outcome_details_str):
    """Split a " · "-separated outcome string into method, details, round and time.

    Tokens are classified in order of appearance:
      * starts with a digit and contains ":"  -> "M:SS" clock, stored as seconds
      * "R" followed by a digit               -> finishing round number
      * first token matching neither          -> outcome method
      * any later such tokens                 -> joined with " - " as method details

    Parameters
    ----------
    outcome_details_str : str or missing value

    Returns
    -------
    tuple
        (outcome_method, outcome_method_details, end_round, end_round_time_seconds);
        every element is None for a missing input or when the part is absent.

    Raises
    ------
    AssertionError
        If a clock or round token is seen after a round was already parsed, or
        the clock does not split into exactly two parts.
    ValueError
        If the clock or round text is not numeric.
    """
    if pd.isna(outcome_details_str):
        return None, None, None, None

    tokens = [x.strip() for x in outcome_details_str.split(" · ") if x.strip()]
    outcome_method = None
    outcome_method_details = None
    end_round = None
    end_round_time_seconds = None
    for token in tokens:
        if token[0].isdigit() and ":" in token:
            # NOTE(review): this guards on end_round, not end_round_time_seconds;
            # possibly meant to reject a second clock token — confirm before changing.
            assert end_round is None, f"{outcome_details_str}"
            time_split = token.split(":")
            assert len(time_split) == 2, f"{outcome_details_str}"

            end_round_time_seconds = int(time_split[0]) * 60 + int(time_split[1])
        elif token[0] == "R" and token[1:2].isdigit():
            # Slicing instead of indexing keeps a bare "R" token from raising
            # IndexError; such a token falls through to the method/details branches.
            assert end_round is None, f"{outcome_details_str}"
            end_round = int(token.replace("R", ""))
        elif outcome_method is None:
            outcome_method = token
        elif outcome_method_details is None:
            outcome_method_details = token
        else:
            outcome_method_details += " - " + token

    return outcome_method, outcome_method_details, end_round, end_round_time_seconds

# Spread the 4-tuples returned by extract_outcome_stuff over four new columns.
tapology_fighter_histories[
    ["outcome_method", "outcome_method_details", "end_round", "end_round_time_seconds"]
] = tapology_fighter_histories["outcome_details"].apply(extract_outcome_stuff).apply(pd.Series)

# Round and clock are stored as nullable integers.
tapology_fighter_histories = tapology_fighter_histories.astype(
    {"end_round": "Int64", "end_round_time_seconds": "Int64"}
)

# Keep only the cleaned columns, in their output order.
tapology_fighter_histories = tapology_fighter_histories.loc[
    :,
    [
        "fighter_id", "order", "bout_id", "bout_id_integer",
        "event_id", "event_name", "opponent_id", "billing",
        "round_time_format", "weight_class", "weight_class_lbs",
        "outcome", "outcome_method", "outcome_method_details",
        "end_round", "end_round_time_seconds",
        "fighter_record", "opponent_record",
        "weigh_in_result_lbs", "odds", "pick_em_percent",
    ],
]

tapology_fighter_histories.to_csv(
    os.path.join(clean_data_dir, "Tapology", "fighter_histories.csv"),
    index=False,
)

## Fight Matrix

In [6]:
# Load the raw rankings and keep the first row for each (issue date, weight class, fighter).
fightmatrix_rankings = (
    pd.read_csv(os.path.join(raw_data_dir, "Fight Matrix", "rankings.csv"))
    .drop_duplicates(subset=["issue_date", "weight_class", "fighter_id"], keep="first")
    .reset_index(drop=True)
)
fightmatrix_rankings.to_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "rankings.csv"),
    index=False,
)

In [101]:
fightmatrix_bouts = pd.read_csv(os.path.join(raw_data_dir, "Fight Matrix", "bouts.csv"))

# -1 anywhere in the frame becomes NaN
fightmatrix_bouts = fightmatrix_bouts.replace(-1, np.nan)

# a post-fight Glicko of -500 is replaced by the same fighter's pre-fight Glicko
fightmatrix_bouts["fighter_1_glicko_1_post"] = fightmatrix_bouts["fighter_1_glicko_1_post"].mask(
    fightmatrix_bouts["fighter_1_glicko_1_post"] == -500,
    fightmatrix_bouts["fighter_1_glicko_1_pre"],
)
fightmatrix_bouts["fighter_2_glicko_1_post"] = fightmatrix_bouts["fighter_2_glicko_1_post"].mask(
    fightmatrix_bouts["fighter_2_glicko_1_post"] == -500,
    fightmatrix_bouts["fighter_2_glicko_1_pre"],
)

# every rating column (2 fighters x 3 systems x pre/post) is stored as a nullable integer
fightmatrix_bouts = fightmatrix_bouts.astype(
    {
        f"fighter_{slot}_{system}_{stage}": "Int64"
        for slot in (1, 2)
        for system in ("elo_k170", "elo_modified", "glicko_1")
        for stage in ("pre", "post")
    }
)

fightmatrix_bouts.to_csv(os.path.join(clean_data_dir, "Fight Matrix", "bouts.csv"), index=False)

In [4]:
# Load fighters, parse both debut dates, and store sherdog_id as a nullable integer.
fightmatrix_fighters = pd.read_csv(
    os.path.join(raw_data_dir, "Fight Matrix", "fighters.csv"),
    parse_dates=["pro_debut_date", "ufc_debut_date"],
).astype({"sherdog_id": "Int64"})
fightmatrix_fighters.to_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "fighters.csv"),
    index=False,
)

In [None]:
# Load events with parsed dates and store event_order as a nullable integer.
fightmatrix_events = pd.read_csv(
    os.path.join(raw_data_dir, "Fight Matrix", "events.csv"),
    parse_dates=["date"],
).astype({"event_order": "Int64"})
fightmatrix_events.to_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "events.csv"),
    index=False,
)

In [94]:
fightmatrix_fighter_histories = pd.read_csv(os.path.join(raw_data_dir, "Fight Matrix", "fighter_histories.csv"), parse_dates=["date"])

# remove rows whose pre-fight K170 Elo is -1
# The explicit copy detaches the filtered frame from the original, so the column
# assignments below do not write to a slice (avoids SettingWithCopyWarning).
fightmatrix_fighter_histories = fightmatrix_fighter_histories.loc[fightmatrix_fighter_histories["fighter_elo_k170_pre"] != -1].copy()
# provisional 1-based position of each row within its fighter's history, in file order
fightmatrix_fighter_histories["temp_order"] = fightmatrix_fighter_histories.groupby("fighter_id").cumcount() + 1

# replace post-fight Glicko values of -500 with the matching pre-fight value
fightmatrix_fighter_histories.loc[fightmatrix_fighter_histories["fighter_glicko_1_post"] == -500, "fighter_glicko_1_post"] = fightmatrix_fighter_histories["fighter_glicko_1_pre"]
fightmatrix_fighter_histories.loc[fightmatrix_fighter_histories["opponent_glicko_1_post"] == -500, "opponent_glicko_1_post"] = fightmatrix_fighter_histories["opponent_glicko_1_pre"]

# fighter ids that have at least one row with bad_ordering_flag = 1
bad_fighter_ids = fightmatrix_fighter_histories.loc[fightmatrix_fighter_histories["bad_ordering_flag"] == 1, "fighter_id"].unique()
# all other fighters: the provisional file order becomes the final "order"
fighter_histories_good = fightmatrix_fighter_histories.loc[~fightmatrix_fighter_histories["fighter_id"].isin(bad_fighter_ids)]
fighter_histories_good = fighter_histories_good.drop(columns=["bad_ordering_flag"])
fighter_histories_good = fighter_histories_good.rename(columns={"temp_order": "order"})
fighter_histories_good = fighter_histories_good.replace(-1, np.nan)

# correct the order for fighters with bad_ordering_flag = 1
# main idea is to create a DAG, find the source node, and traverse it
fighter_histories_bad = fightmatrix_fighter_histories.loc[fightmatrix_fighter_histories["fighter_id"].isin(bad_fighter_ids)]

# Rebuild the bout order of every flagged fighter by chaining ratings: within a date
# that holds several bouts, the next bout is the one whose three pre-fight ratings
# (elo_k170, elo_modified, glicko_1) equal the previous bout's post-fight ratings.
# `corrected` collects the re-ordered rows (as dicts) of fighters that could be resolved;
# `still_bad_fighter_ids` collects fighters that could not.
corrected = []
still_bad_fighter_ids = []
for fighter_id in bad_fighter_ids:
    history_slice = fighter_histories_bad.loc[fighter_histories_bad["fighter_id"] == fighter_id]
    dates_sorted = sorted(history_slice["date"].unique())

    row_dicts_in_order = []
    for date in dates_sorted:
        rows_for_date = history_slice.loc[history_slice["date"] == date]
        if len(rows_for_date) == 1:
            # A single bout on this date needs no disambiguation.
            row_dicts_in_order.append(rows_for_date.iloc[0].to_dict())
        else:
            # Sets of (elo_k170, elo_modified, glicko_1) triples before and after each bout.
            elo_tuple_pre_set = set(zip(rows_for_date["fighter_elo_k170_pre"], rows_for_date["fighter_elo_modified_pre"], rows_for_date["fighter_glicko_1_pre"]))
            elo_tuple_post_set = set(zip(rows_for_date["fighter_elo_k170_post"], rows_for_date["fighter_elo_modified_post"], rows_for_date["fighter_glicko_1_post"]))
            if len(row_dicts_in_order) == 0:
                # No earlier bout to chain from: first look for the row whose three
                # pre-fight ratings are all 1000 (presumably the initial rating — confirm).
                start_row = rows_for_date.loc[(rows_for_date["fighter_elo_k170_pre"] == 1000) & (rows_for_date["fighter_elo_modified_pre"] == 1000) & (rows_for_date["fighter_glicko_1_pre"] == 1000)]
                if len(start_row) == 1:
                    row_dicts_in_order.append(start_row.iloc[0].to_dict())
                else:
                    # A pre-fight triple that is nobody's post-fight triple marks the chain
                    # start; a post-fight triple that is nobody's pre-fight triple marks its end.
                    start_elo_candidates = list(elo_tuple_pre_set - elo_tuple_post_set)
                    finish_elo_candidates = list(elo_tuple_post_set - elo_tuple_pre_set)
                    if len(start_elo_candidates) == 1 and len(finish_elo_candidates) == 1:
                        # Start row: matches the start triple and none of its post-fight
                        # ratings equals the corresponding finish rating.
                        start_row = rows_for_date.loc[
                            (rows_for_date["fighter_elo_k170_pre"] == start_elo_candidates[0][0]) &
                            (rows_for_date["fighter_elo_modified_pre"] == start_elo_candidates[0][1]) &
                            (rows_for_date["fighter_glicko_1_pre"] == start_elo_candidates[0][2]) &
                            (rows_for_date["fighter_elo_k170_post"] != finish_elo_candidates[0][0]) &
                            (rows_for_date["fighter_elo_modified_post"] != finish_elo_candidates[0][1]) &
                            (rows_for_date["fighter_glicko_1_post"] != finish_elo_candidates[0][2])
                        ]

                        if len(start_row) == 1:
                            row_dicts_in_order.append(start_row.iloc[0].to_dict())
                        else:
                            # Ambiguous start: give up on this fighter (exits the date loop).
                            still_bad_fighter_ids.append(fighter_id)
                            break
                    elif len(rows_for_date) == 2 and len(finish_elo_candidates) == 1 and len(start_elo_candidates) == 0:
                        # Two bouts with a unique finish triple but no unique start triple:
                        # the start is the row that does not produce the finish ratings.
                        start_row = rows_for_date.loc[
                            (rows_for_date["fighter_elo_k170_post"] != finish_elo_candidates[0][0]) &
                            (rows_for_date["fighter_elo_modified_post"] != finish_elo_candidates[0][1]) &
                            (rows_for_date["fighter_glicko_1_post"] != finish_elo_candidates[0][2])
                        ]

                        if len(start_row) == 1:
                            row_dicts_in_order.append(start_row.iloc[0].to_dict())
                        else:
                            still_bad_fighter_ids.append(fighter_id)
                            break
                    else:
                        still_bad_fighter_ids.append(fighter_id)
                        break

                # exclude start row
                rows_for_date = rows_for_date.drop(start_row.index)

            # Consume the remaining same-date rows one at a time.
            while len(rows_for_date) > 0:
                # find row with pre fight elos equal to the last added row's post fight elos
                row_match = rows_for_date.loc[
                    (rows_for_date["fighter_elo_k170_pre"] == row_dicts_in_order[-1]["fighter_elo_k170_post"]) &
                    (rows_for_date["fighter_elo_modified_pre"] == row_dicts_in_order[-1]["fighter_elo_modified_post"]) &
                    (rows_for_date["fighter_glicko_1_pre"] == row_dicts_in_order[-1]["fighter_glicko_1_post"])
                ]

                if len(row_match) == 1:
                    row_dicts_in_order.append(row_match.iloc[0].to_dict())
                    rows_for_date = rows_for_date.drop(row_match.index)
                else:
                    # The chain from the previous bout is broken or ambiguous: fall back to
                    # the same start/finish-candidate heuristic, applied to the rows left.
                    elo_tuple_pre_set2 = set(zip(rows_for_date["fighter_elo_k170_pre"], rows_for_date["fighter_elo_modified_pre"], rows_for_date["fighter_glicko_1_pre"]))
                    elo_tuple_post_set2 = set(zip(rows_for_date["fighter_elo_k170_post"], rows_for_date["fighter_elo_modified_post"], rows_for_date["fighter_glicko_1_post"]))

                    start_elo_candidates2 = list(elo_tuple_pre_set2 - elo_tuple_post_set2)
                    finish_elo_candidates2 = list(elo_tuple_post_set2 - elo_tuple_pre_set2)

                    if len(start_elo_candidates2) == 1 and len(finish_elo_candidates2) == 1:
                        row_match = rows_for_date.loc[
                            (rows_for_date["fighter_elo_k170_pre"] == start_elo_candidates2[0][0]) &
                            (rows_for_date["fighter_elo_modified_pre"] == start_elo_candidates2[0][1]) &
                            (rows_for_date["fighter_glicko_1_pre"] == start_elo_candidates2[0][2]) &
                            (rows_for_date["fighter_elo_k170_post"] != finish_elo_candidates2[0][0]) &
                            (rows_for_date["fighter_elo_modified_post"] != finish_elo_candidates2[0][1]) &
                            (rows_for_date["fighter_glicko_1_post"] != finish_elo_candidates2[0][2])
                        ]

                        if len(row_match) == 1:
                            row_dicts_in_order.append(row_match.iloc[0].to_dict())
                            rows_for_date = rows_for_date.drop(row_match.index)
                        else:
                            # NOTE(review): this `break` (and the ones below) leaves only the
                            # `while` loop; the outer date loop keeps going, so a fighter can be
                            # appended to still_bad_fighter_ids more than once. The membership
                            # test after the date loop still discards the fighter.
                            still_bad_fighter_ids.append(fighter_id)
                            break
                    elif len(rows_for_date) == 2 and len(finish_elo_candidates2) == 1 and len(start_elo_candidates2) == 0:
                        row_match = rows_for_date.loc[
                            (rows_for_date["fighter_elo_k170_post"] != finish_elo_candidates2[0][0]) &
                            (rows_for_date["fighter_elo_modified_post"] != finish_elo_candidates2[0][1]) &
                            (rows_for_date["fighter_glicko_1_post"] != finish_elo_candidates2[0][2])
                        ]

                        if len(row_match) == 1:
                            row_dicts_in_order.append(row_match.iloc[0].to_dict())
                            rows_for_date = rows_for_date.drop(row_match.index)
                        else:
                            still_bad_fighter_ids.append(fighter_id)
                            break
                    else:
                        still_bad_fighter_ids.append(fighter_id)
                        break

    # Only fully resolved fighters contribute rows; they receive a fresh 1-based "order".
    if fighter_id in still_bad_fighter_ids:
        continue
    else:
        for i, row in enumerate(row_dicts_in_order):
            row["order"] = i + 1
        corrected.extend(row_dicts_in_order)

# Column layout shared by the automatically and the manually corrected histories.
fightmatrix_history_columns = [
    "fighter_id", "order", "event_id", "date", "opponent_id",
    "outcome", "outcome_method", "end_round",
    "fighter_elo_k170_pre", "fighter_elo_k170_post",
    "fighter_elo_modified_pre", "fighter_elo_modified_post",
    "fighter_glicko_1_pre", "fighter_glicko_1_post",
    "opponent_elo_k170_pre", "opponent_elo_k170_post",
    "opponent_elo_modified_pre", "opponent_elo_modified_post",
    "opponent_glicko_1_pre", "opponent_glicko_1_post",
]

# Rows whose order was repaired by the loop above.
fighter_histories_bad_corrected = pd.DataFrame(corrected).loc[:, fightmatrix_history_columns]

# Rows of fighters the loop could not resolve (handy for exporting and fixing by hand).
fighter_histories_still_bad = fightmatrix_fighter_histories.loc[
    fightmatrix_fighter_histories["fighter_id"].isin(still_bad_fighter_ids)
]
# fighter_histories_still_bad.to_csv("temporary_still_bad.csv", index=False)

# Manual fixes, read from a CSV located relative to the working directory.
fighter_histories_still_bad_corrected = pd.read_csv(
    "./fightmatrix_histories_edge_case_fixes.csv",
    parse_dates=["date"],
).loc[:, fightmatrix_history_columns]

# Stitch the three parts together, ordered by fighter and bout order, and save.
fightmatrix_fighter_histories_clean = (
    pd.concat(
        [fighter_histories_good, fighter_histories_bad_corrected, fighter_histories_still_bad_corrected],
        ignore_index=True,
    )
    .sort_values(by=["fighter_id", "order"])
    .reset_index(drop=True)
)
fightmatrix_fighter_histories_clean.to_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "fighter_histories.csv"),
    index=False,
)

## MMA Decisions

- `events`, `judges`, and `media_scores` are already cleaned

In [10]:
# Maps a duplicate MMA Decisions fighter ID (key) to the ID it is replaced with (value);
# later cells pass this dict to Series.replace on fighter-ID columns.
dupe_fighter_map = {
    4934: 6373,
    6748: 6394,
}

In [15]:
mmadecisions_fighters = pd.read_csv(os.path.join(raw_data_dir, "MMA Decisions", "fighters.csv"), parse_dates=["date_of_birth"])

# Duplicate fighters
dupes = [4934, 6748]
# Drop the duplicate fighter rows. The explicit copy keeps the later "height" column
# assignment from writing to a filtered slice (avoids SettingWithCopyWarning).
mmadecisions_fighters = mmadecisions_fighters[~mmadecisions_fighters["id"].isin(dupes)].copy()

def convert_height(height_str):
    """Convert a height written as feet'inches" (e.g. 5'11") to total inches.

    Missing values are passed through as NaN; the inches part may be fractional.
    """
    if pd.isna(height_str):
        return np.nan

    pieces = height_str.split("'")
    feet_text, inches_text = pieces[0], pieces[1]
    whole_feet = int(feet_text)
    extra_inches = float(inches_text.replace('"', ''))
    return whole_feet * 12 + extra_inches

# Convert the height strings to inches in place, then rename the column to say so.
height_in_inches = mmadecisions_fighters["height"].map(convert_height)
mmadecisions_fighters["height"] = height_in_inches
mmadecisions_fighters = mmadecisions_fighters.rename(columns={"height": "height_inches"})

mmadecisions_fighters.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "fighters.csv"),
    index=False,
)

In [33]:
mmadecisions_bouts = pd.read_csv(os.path.join(raw_data_dir, "MMA Decisions", "bouts.csv"))

# Point both corner columns of duplicate fighters at their replacement ID.
mmadecisions_bouts = mmadecisions_bouts.assign(
    fighter_1_id=mmadecisions_bouts["fighter_1_id"].replace(dupe_fighter_map),
    fighter_2_id=mmadecisions_bouts["fighter_2_id"].replace(dupe_fighter_map),
)

mmadecisions_bouts.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "bouts.csv"),
    index=False,
)

In [38]:
# Load deductions and point duplicate fighter IDs at their replacement ID.
mmadecisions_deductions = pd.read_csv(os.path.join(raw_data_dir, "MMA Decisions", "deductions.csv"))
mmadecisions_deductions = mmadecisions_deductions.assign(
    fighter_id=mmadecisions_deductions["fighter_id"].replace(dupe_fighter_map),
)
mmadecisions_deductions.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "deductions.csv"),
    index=False,
)

In [42]:
# Load judge scores; the judge ID and both scores are stored as nullable integers.
mmadecisions_judge_scores = pd.read_csv(
    os.path.join(raw_data_dir, "MMA Decisions", "judge_scores.csv")
).astype(
    {
        "judge_id": "Int64",
        "fighter_1_score": "Int64",
        "fighter_2_score": "Int64",
    }
)

mmadecisions_judge_scores.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "judge_scores.csv"),
    index=False,
)

## Bet MMA

- `missed_weights`, `late_replacements`, and `bouts` are already cleaned

In [102]:
# Load fighters, store sherdog_id as a nullable integer, and rename reach to reach_inches.
betmma_fighters = (
    pd.read_csv(os.path.join(raw_data_dir, "Bet MMA", "fighters.csv"))
    .astype({"sherdog_id": "Int64"})
    .rename(columns={"reach": "reach_inches"})
)

def convert_height(height_str):
    """Convert a feet'inches" height string to total inches as a float.

    Missing values are passed through as NaN.
    """
    if pd.isna(height_str):
        return np.nan

    feet_text, inches_text = height_str.replace('"', "").split("'")
    return float(feet_text) * 12 + float(inches_text)

# Height strings -> total inches (new column; the original "height" is dropped by the
# column selection below).
betmma_fighters["height_inches"] = betmma_fighters["height"].apply(convert_height)
# Nationality clean-up: "-" means missing; the remaining lines normalise spellings.
betmma_fighters.loc[betmma_fighters["nationality"] == "-", "nationality"] = np.nan
# Matched by suffix rather than by the full name.
betmma_fighters.loc[betmma_fighters["nationality"].str.endswith("d'Ivoire", na=False), "nationality"] = "Côte d'Ivoire"
# HTML-escaped ampersands ("&amp;") are replaced with "and".
betmma_fighters.loc[betmma_fighters["nationality"] == "Trinidad &amp; Tobago", "nationality"] = "Trinidad and Tobago"
betmma_fighters.loc[betmma_fighters["nationality"] == "Bosnia &amp; Herzegovina", "nationality"] = "Bosnia and Herzegovina"

# Keep only the cleaned columns, in output order.
betmma_fighters = betmma_fighters[
    ['id',
 'ufcstats_id',
 'sherdog_id',
 'name',
 'height_inches',
 'reach_inches',
 'stance',
 'nationality']
]

# Reach values above 100 are blanked out (presumably data-entry errors — confirm threshold).
betmma_fighters.loc[betmma_fighters["reach_inches"] > 100, "reach_inches"] = np.nan

betmma_fighters.to_csv(os.path.join(clean_data_dir, "Bet MMA", "fighters.csv"), index=False)

In [124]:
betmma_fighter_histories = pd.read_csv(os.path.join(raw_data_dir, "Bet MMA", "fighter_histories.csv"))
# Abbreviate outcomes; any value not in the mapping becomes NaN (Series.map semantics).
betmma_fighter_histories["outcome"] = betmma_fighter_histories["outcome"].map({"Won": "W", "Lost": "L", "Draw": "D"})
# An end_round of 0 is treated as missing before casting to nullable integer.
betmma_fighter_histories.loc[betmma_fighter_histories["end_round"] == 0, "end_round"] = np.nan
betmma_fighter_histories["end_round"] = betmma_fighter_histories["end_round"].astype("Int64")
# A clock of "0:00" is likewise treated as missing.
betmma_fighter_histories.loc[betmma_fighter_histories["end_round_time"] == "0:00", "end_round_time"] = np.nan

def convert_time(time_str):
    """Convert an "M:SS" clock string to whole seconds; missing values become NaN."""
    if pd.isna(time_str):
        return np.nan

    minute_text, second_text = time_str.split(":")
    return int(minute_text) * 60 + int(second_text)

# Clock strings -> seconds, stored as a nullable integer.
betmma_fighter_histories["end_round_time_seconds"] = (
    betmma_fighter_histories["end_round_time"]
    .apply(convert_time)
    .astype("Int64")
)

def total_seconds(row):
    """Return the total fight time in seconds for one row, counting 300 s per finished round.

    NaN is returned when either the round clock or the end round is missing.
    """
    round_clock = row["end_round_time_seconds"]
    if pd.isna(round_clock):
        return np.nan

    final_round = row["end_round"]
    if pd.isna(final_round):
        return np.nan

    rounds_completed = final_round - 1
    return rounds_completed * 300 + round_clock

# Total elapsed fight time per row, stored as a nullable integer.
betmma_fighter_histories["total_time_seconds"] = (
    betmma_fighter_histories.apply(total_seconds, axis=1).astype("Int64")
)

# Odds as nullable integer, then keep only the cleaned columns in output order.
betmma_fighter_histories = betmma_fighter_histories.astype({"odds": "Int64"}).loc[
    :,
    [
        "fighter_id", "order", "bout_id", "opponent_id",
        "outcome", "outcome_method",
        "end_round", "end_round_time_seconds", "total_time_seconds",
        "odds",
    ],
]

betmma_fighter_histories.to_csv(
    os.path.join(clean_data_dir, "Bet MMA", "fighter_histories.csv"),
    index=False,
)

In [131]:
betmma_events = pd.read_csv(os.path.join(raw_data_dir, "Bet MMA", "events.csv"))

# Running count of UFC events in file order; rows that are not UFC events get NaN.
betmma_events['temp_event_order'] = (
    betmma_events['is_ufc_event']
    .eq(1)  # Check where the column equals 1
    .cumsum()  # Cumulative sum to count occurrences
    .where(betmma_events['is_ufc_event'] == 1, np.nan)  # Set to NaN where the column is not 1
)
# Offset the running count by 233 (presumably the number of UFC events that precede
# this dataset's first one — TODO confirm), then store as a nullable integer.
betmma_events["event_order"] = 233 + betmma_events["temp_event_order"]
betmma_events["event_order"] = betmma_events["event_order"].astype("Int64")

# Keep only the cleaned columns (this drops temp_event_order).
betmma_events = betmma_events[
    ['id',
 'name',
 'date',
 'location',
 'is_ufc_event',
 'event_order']
]

betmma_events.to_csv(os.path.join(clean_data_dir, "Bet MMA", "events.csv"), index=False)

## ESPN

- `teams` and `venues` are already cleaned

In [18]:
espn_fighters = pd.read_csv(os.path.join(raw_data_dir, "ESPN", "fighters.csv"))
# team_id as nullable integer.
espn_fighters["team_id"] = espn_fighters["team_id"].astype("Int64")
# Birth dates are parsed as day/month/year.
espn_fighters["date_of_birth"] = pd.to_datetime(espn_fighters["date_of_birth"], format="%d/%m/%Y")
# "--" is the placeholder for an unknown stance.
espn_fighters.loc[espn_fighters["stance"] == "--", "stance"] = np.nan
# Reach strings carry a trailing inch mark; strip it and store as float.
espn_fighters["reach_inches"] = espn_fighters["reach"].str.replace('"', "").astype("float")

def convert_height(height_str):
    """Convert a height written like 5' 11" (feet, whitespace, inches) to whole inches.

    Missing values are passed through as NaN.
    """
    if pd.isna(height_str):
        return np.nan

    without_marks = height_str.replace("'", "").replace('"', "")
    feet_text, inches_text = without_marks.split()
    return int(feet_text) * 12 + int(inches_text)

# Height strings -> inches, stored as a nullable integer.
espn_fighters["height_inches"] = espn_fighters["height"].apply(convert_height).astype("Int64")

# Keep only the cleaned columns, in output order.
espn_fighters = espn_fighters.loc[
    :,
    [
        "id", "name", "nickname", "date_of_birth",
        "reach_inches", "height_inches", "stance",
        "team_id", "nationality", "fighting_style",
    ],
]

espn_fighters.to_csv(os.path.join(clean_data_dir, "ESPN", "fighters.csv"), index=False)

In [26]:
espn_events = (
    pd.read_csv(os.path.join(raw_data_dir, "ESPN", "events.csv"))
    .astype({"venue_id": "Int64"})
    .rename(columns={"date": "date_TEMP"})
)

# Parse the raw timestamp once, then split it into a calendar date and an hour.
# date_TEMP itself is discarded by the column selection below.
espn_events["date_TEMP"] = pd.to_datetime(espn_events["date_TEMP"])
espn_events["date"] = espn_events["date_TEMP"].dt.date
espn_events["hour_utc"] = espn_events["date_TEMP"].dt.hour

espn_events = espn_events.loc[:, ["id", "name", "date", "hour_utc", "venue_id", "event_order"]]

espn_events.to_csv(os.path.join(clean_data_dir, "ESPN", "events.csv"), index=False)

In [29]:
# Load bouts and store winner_id as a nullable integer.
espn_bouts = pd.read_csv(
    os.path.join(raw_data_dir, "ESPN", "bouts.csv")
).astype({"winner_id": "Int64"})
espn_bouts.to_csv(os.path.join(clean_data_dir, "ESPN", "bouts.csv"), index=False)

In [45]:
espn_fighter_histories = pd.read_csv(os.path.join(raw_data_dir, "ESPN", "fighter_histories.csv"))
# opponent_id as nullable integer.
espn_fighter_histories["opponent_id"] = espn_fighter_histories["opponent_id"].astype("Int64")

# Keep the raw timestamp under a temporary name and derive a calendar date and an hour
# from it. NOTE(review): the hour is labelled UTC — confirm the raw timestamps are UTC.
espn_fighter_histories = espn_fighter_histories.rename(columns={"date": "date_TEMP"})
espn_fighter_histories["date"] = pd.to_datetime(espn_fighter_histories["date_TEMP"]).dt.date
espn_fighter_histories["hour_utc"] = pd.to_datetime(espn_fighter_histories["date_TEMP"]).dt.hour

# end_round as nullable integer; a value of 0 is treated as missing.
espn_fighter_histories["end_round"] = espn_fighter_histories["end_round"].astype("Int64")
espn_fighter_histories.loc[espn_fighter_histories["end_round"] == 0, "end_round"] = np.nan

def convert_time(time_str):
    """Convert an "M:SS" clock string to whole seconds.

    Missing values and the "-" placeholder both become NaN.
    """
    if pd.isna(time_str):
        return np.nan
    if time_str == "-":
        return np.nan

    minute_text, second_text = time_str.split(":")
    return int(minute_text) * 60 + int(second_text)

# Clock strings -> seconds, stored as a nullable integer.
espn_fighter_histories["end_round_time_seconds"] = (
    espn_fighter_histories["end_round_time"]
    .apply(convert_time)
    .astype("Int64")
)

def calculate_total_time_seconds(row):
    """Return the total fight time in seconds for one row, counting 300 s per finished round.

    NaN is returned when either the end round or the round clock is missing.
    """
    final_round = row["end_round"]
    if pd.isna(final_round):
        return np.nan

    round_clock = row["end_round_time_seconds"]
    if pd.isna(round_clock):
        return np.nan

    rounds_completed = final_round - 1
    return rounds_completed * 300 + round_clock

# Total elapsed fight time per row, stored as a nullable integer.
espn_fighter_histories["total_time_seconds"] = (
    espn_fighter_histories.apply(calculate_total_time_seconds, axis=1).astype("Int64")
)

# Keep only the cleaned columns, in output order.
espn_fighter_histories = espn_fighter_histories.loc[
    :,
    [
        "fighter_id", "order", "bout_id",
        "event_id", "event_name", "date", "hour_utc",
        "opponent_id", "outcome", "outcome_method",
        "end_round", "end_round_time_seconds", "total_time_seconds",
        "is_title_bout",
    ],
]

espn_fighter_histories.to_csv(
    os.path.join(clean_data_dir, "ESPN", "fighter_histories.csv"),
    index=False,
)

In [72]:
espn_bout_stats = pd.read_csv(os.path.join(raw_data_dir, "ESPN", "fighter_bout_statistics.csv"))

# Every column other than the four identifier columns is a statistic.
stat_cols = [
    column
    for column in espn_bout_stats.columns
    if column not in ("fighter_id", "order", "bout_id", "event_id")
]

# Statistics as nullable integers; drop order/event_id; put the identifying columns
# first; sort by bout and fighter.
espn_bout_stats = (
    espn_bout_stats
    .astype({column: "Int64" for column in stat_cols})
    .drop(columns=["order", "event_id"])
    .loc[:, ["bout_id", "fighter_id"] + stat_cols]
    .sort_values(by=["bout_id", "fighter_id"])
    .reset_index(drop=True)
)

espn_bout_stats.to_csv(os.path.join(clean_data_dir, "ESPN", "bout_stats.csv"), index=False)

## UFC Stats

- `fighter_histories` is already cleaned

In [None]:
# Height and reach load as floats because of NaNs; store them as nullable integers instead.
ufcstats_fighters = pd.read_csv(
    os.path.join(raw_data_dir, "UFC Stats", "fighters.csv"),
    parse_dates=["date_of_birth"],
).astype({"height_inches": "Int64", "reach_inches": "Int64"})
ufcstats_fighters.to_csv(
    os.path.join(clean_data_dir, "UFC Stats", "fighters.csv"),
    index=False,
)

In [20]:
# Load events with parsed dates and store event_order as a nullable integer.
ufcstats_events = pd.read_csv(
    os.path.join(raw_data_dir, "UFC Stats", "events.csv"),
    parse_dates=["date"],
).astype({"event_order": "Int64"})
ufcstats_events.to_csv(
    os.path.join(clean_data_dir, "UFC Stats", "events.csv"),
    index=False,
)

In [23]:
# Load round stats and store control_time_seconds as a nullable integer.
ufcstats_round_stats = pd.read_csv(
    os.path.join(raw_data_dir, "UFC Stats", "round_stats.csv")
).astype({"control_time_seconds": "Int64"})
ufcstats_round_stats.to_csv(
    os.path.join(clean_data_dir, "UFC Stats", "round_stats.csv"),
    index=False,
)

In [46]:
# UFC 83 should be used for training cutoff
# These bout IDs were manually checked using Getty Images; for each of them the
# red/blue fighter IDs and outcomes are swapped further down in this cell.
# NOTE(review): the final 13 entries ("b091e021e61f1950" ... "24e14c4824144c64")
# repeat IDs that already appear just above them, in reverse order. The list is
# only used for membership tests (`isin`), so the repeats have no effect.
bout_ids_to_flip = [
    "219bd976b8ca745d",
    "af178adff964d854",
    "920194911d727a38",
    "b675c94f20551631",
    "67948da92e6c9bdc",
    "628d02b6046d5f73",
    "1d00497a573d7f4c",
    "b376a6fb0ca4862e",
    "03d30a534dee8ac5",
    "c5bef3da17e595ee",
    "b976eb99de0f63ca",
    "f8f73efa2d4cc566",
    "4a482ad5b021bb25",
    "0fe3681eb934bdc3",
    "171007deb1cc56aa",
    "ff872fa3e9ec32a9",
    "b3a3341a5db2d484",
    "0700b74037329215",
    "5288ef3edf1eda6e",
    "98a1121ffbc47657",
    "688c8b695e521ce8",
    "98b1ee14ad4ea1c6",
    "bcefcb29bec709d6",
    "66f0238fac31130f",
    "ae674ec42bd0a979",
    "403bed9fe983b10a",
    "51ae1d8a663ae1fe",
    "9f14f080b08e6869",
    "c5e0e4ee11903076",
    "e2a61ec3bc83da98",
    "31362d9ea15127b2",
    "e915c1987050eac8",
    "b2013a8b4ed68c8d",
    "9a596aaa2b9a18db",
    "81734d11cb6a8ebb",
    "976f7f5e7537c62b",
    "b74ecc368d3ecbc4",
    "3e126f482e9a84b8",
    "1992dcae0699200e",
    "2822226962195259",
    "d23b05f33b741c47",
    "505d3d9b006014c8",
    "3818a33c9103c88f",
    "d460961065f1198b",
    "757c11f17278b06a",
    "9b8c29f50d452025",
    "194f8d501f318ada",
    "cc9cabc1fc747aaf",
    "f24db9d7c3636a2a",
    "781017bfa44d4058",
    "cfbeeffd65101ae1",
    "af4cbb9072814a86",
    "27868eac1d141498",
    "958bd435389dd12a",
    "bfd846ad4d597bbb",
    "75577b11361cb645",
    "2f736846aebbf12d",
    "c4e16d57dd9a1b39",
    "d90567f7d4372160",
    "8191b8edfde3b9c3",
    "09b36c7157770516",
    "0b58bd40373b1811",
    "f7cd0a90da3be3b2",
    "86b8419f22f398c0",
    "8add23232cdb36ee",
    "f27bf6f21410547f",
    "02c54d382fb1c347",
    "041dd0fae4650970",
    "20b84b2cbfb92dc7",
    "dd5f7a14f5782f79",
    "396214fd188c2549",
    "cb7cfa8b9900878f",
    "fb7d31b36bebc5af",
    "08f5bdfe45082ec5",
    "e6dd15ef9fd7927e",
    "92e9c53671ac8fe7",
    "afb9d3476c5caafa",
    "e4420b9812501c17",
    "6a663c04dfe5b88e",
    "868833557ae90877",
    "df7f3c60c523ca55",
    "0d395cb5d91f811f",
    "d89b849df7bff79b",
    "fd05ac3ac2f7d042",
    "21b02cb26307ac0f",
    "39b79764bbac93cb",
    "1df12710cfdb60f4",
    "038c472ae3fd7728",
    "ddc233f8ca7c05aa",
    "ae68274ab6f78cd8",
    "c6a5ed1127cbc9b9",
    "226de3e7e71f2b4f",
    "06142a1de3bbae75",
    "793432d042384a02",
    "0bf025d86cc2a409",
    "d29dd21d8881a0e0",
    "5060770f197e2147",
    "b4bf090eb5c57903",
    "f1423b63d81c88ea",
    "eb23f3574b656624",
    "15750dc75b7426f3",
    "440936f9db783997",
    "83314e1ed6f7d94e",
    "11ca5d09db47708b",
    "11871325db7a13ec",
    "036bb5b335e93ca8",
    "55bcf58be822d115",
    "f8896b43f4ecf861",
    "d9c3d7b09d8df3e4",
    "faec13fd40b6d797",
    "ef6a0ee848382391",
    "b71f07f052d0a48b",
    "a9dc59e90779c1e3",
    "28037f9be7e0781a",
    "6a61d45818534b19",
    "b0958ed411ae9675",
    "fd292c2a587bb2f5",
    "fa0d393a452c0809",
    "6ae10aa4c6dc986d",
    "2711e57bbef4d928",
    "e3c9c95b67580b9b",
    "39f07708af6ecfc7",
    "2a23f787fff1d837",
    "0454bfbc49e95c16",
    "7c620723b6840231",
    "c1be2b862f737c18",
    "329a097e0b53daac",
    "a2da03d9c25b49e5",
    "acfa3a3c0d09bb57",
    "2ff09ddaa2b78cc4",
    "ec271393bc31ab72",
    "03c2cebad8d8c012",
    "32ae3f0d43babd86",
    "d77a72e3fa8db853",
    "92975b516e6463b6",
    "6c569785225fd2e7",
    "cb510afff3dafdab",
    "2c07f8ba25640237",
    "bf8164367980fcd3",
    "ef8f31b86469a967",
    "0c6055509d98ed06",
    "119b156ded2af0f1",
    "9fa3f9f7caf429d6",
    "f5affe923c640d0e",
    "f6246d2b9f663e33",
    "7e50ae4dbdd380a1",
    "a29236fcb395f443",
    "24e14c4824144c64",
    "d7f9a09021a9a13c",
    "c99370e3e54bd5fd",
    "c4fa93a4f37a6ca7",
    "19f615a7a5cfd304",
    "be72958d9715757d",
    "504b540805598fa5",
    "1a21263dc5d866b6",
    "20628fd4e19a97e4",
    "adddc6e46da5ca19",
    "840863604b38a33f",
    "5e52b0bf9719f0ae",
    "b091e021e61f1950",
    "ca93e3f69fa3d725",
    "b4d624bdc27dff83",
    "aefca2869c87eb11",
    "0ea087a71863184d",
    "b091e021e61f1950",
    "5e52b0bf9719f0ae",
    "840863604b38a33f",
    "adddc6e46da5ca19",
    "20628fd4e19a97e4",
    "1a21263dc5d866b6",
    "504b540805598fa5",
    "be72958d9715757d",
    "19f615a7a5cfd304",
    "c4fa93a4f37a6ca7",
    "c99370e3e54bd5fd",
    "d7f9a09021a9a13c",
    "24e14c4824144c64",
]
# Each key names a column whose values move into the column named by its value,
# i.e. the red and blue corners trade fighter IDs and outcomes.
column_swap_map = {
    "red_fighter_id": "blue_fighter_id",
    "blue_fighter_id": "red_fighter_id",
    "red_outcome": "blue_outcome",
    "blue_outcome": "red_outcome",
}

ufcstats_bouts = pd.read_csv(os.path.join(raw_data_dir, "UFC Stats", "bouts.csv"))

# flip fighter ids and outcomes for specific bouts
# The swap is done by direct assignment rather than DataFrame.update, because update
# only copies non-NA values: a missing value on one side would not be carried across
# and the other side's value would end up in both columns.
flip_mask = ufcstats_bouts["id"].isin(bout_ids_to_flip)
# Snapshot the source columns first so earlier writes cannot feed later ones.
pre_swap_values = ufcstats_bouts.loc[flip_mask, list(column_swap_map)].copy()
for source_column, target_column in column_swap_map.items():
    # to_numpy() assigns by position, bypassing index/column alignment.
    ufcstats_bouts.loc[flip_mask, target_column] = pre_swap_values[source_column].to_numpy()

ufcstats_bouts.to_csv(os.path.join(clean_data_dir, "UFC Stats", "bouts.csv"), index=False)

## Wikipedia

- `venues` is already cleaned

In [25]:
# Load events with parsed dates and store attendance as a nullable integer.
wikipedia_events = pd.read_csv(
    os.path.join(raw_data_dir, "Wikipedia", "events.csv"),
    parse_dates=["date"],
).astype({"attendance": "Int64"})
wikipedia_events.to_csv(
    os.path.join(clean_data_dir, "Wikipedia", "events.csv"),
    index=False,
)

## Sherdog

In [27]:
# Read the raw Sherdog fighters table (both date columns parsed), store height
# as a nullable integer, and write the frame to the clean data directory.
sherdog_fighters_raw_path = os.path.join(raw_data_dir, "Sherdog", "fighters.csv")
sherdog_fighters = (
    pd.read_csv(sherdog_fighters_raw_path, parse_dates=["date_of_birth", "pro_debut_date"])
    .astype({"height_inches": "Int64"})
)
sherdog_fighters_clean_path = os.path.join(clean_data_dir, "Sherdog", "fighters.csv")
sherdog_fighters.to_csv(sherdog_fighters_clean_path, index=False)

In [38]:
sherdog_events = pd.read_csv(os.path.join(raw_data_dir, "Sherdog", "events.csv"), parse_dates=["date"])

# Flatten embedded line breaks and collapse the double spaces they leave behind.
# Every step goes through the `.str` accessor on purpose: plain
# `Series.replace(..., regex=False)` only matches a cell whose *entire* value
# equals the pattern, so it would leave a "\n" or "  " inside a longer string
# untouched.
sherdog_events["name"] = (
    sherdog_events["name"]
    .str.replace("\n", " ", regex=False)
    .str.replace("  ", " ", regex=False)
    .str.strip()
)
# "\r\n" separators become ", " ("\r" -> "," then "\n" -> " ").
sherdog_events["location"] = (
    sherdog_events["location"]
    .str.replace("\r", ",", regex=False)
    .str.replace("\n", " ", regex=False)
    .str.replace("  ", " ", regex=False)
    .str.strip()
)
# Keep only the text before the first "\r\n" in the country field.
sherdog_events["country"] = sherdog_events["country"].str.split('\r\n').str[0].str.strip()
sherdog_events["event_order"] = sherdog_events["event_order"].astype("Int64")

# Rows dated 2030-01-11 are re-dated to 2022-11-30.
sherdog_events.loc[sherdog_events["date"] == "2030-01-11", "date"] = pd.to_datetime("2022-11-30")
sherdog_events = sherdog_events.sort_values(by=["date", "event_order"]).reset_index(drop=True)

sherdog_events.to_csv(os.path.join(clean_data_dir, "Sherdog", "events.csv"), index=False)

In [145]:
# Parse `date` on load. Further down in this cell a Timestamp is assigned into
# this column and the frame is sorted by it; if the column were left as raw
# strings that would produce a mixed str/Timestamp column, which neither sorts
# chronologically nor serialises in a single format.
sherdog_fighter_histories = pd.read_csv(
    os.path.join(raw_data_dir, "Sherdog", "fighter_histories.csv"),
    parse_dates=["date"],
)
# Nullable integer so rows without an opponent id keep a missing value.
sherdog_fighter_histories["opponent_id"] = sherdog_fighter_histories["opponent_id"].astype("Int64")
# Short outcome codes; any value not listed in the mapping becomes NaN.
sherdog_fighter_histories["outcome"] = sherdog_fighter_histories["outcome"].map({"win": "W", "loss": "L", "draw": "D", "nc": "NC"})

# Broad outcome category: the text before the first opening or closing
# parenthesis of `outcome_method`, with surrounding whitespace removed.
method_prefix = sherdog_fighter_histories["outcome_method"].str.split(r'\(|\)', expand=True).iloc[:, 0]
sherdog_fighter_histories["outcome_method_broad"] = method_prefix.str.strip()

# stupid edge cases
empty_method = sherdog_fighter_histories["outcome_method_broad"] == ""
sherdog_fighter_histories.loc[empty_method, "outcome_method_broad"] = np.nan

# Spelling variants seen in the data, grouped by the category they collapse to.
dqs = ["Disqualification", "Disqualifcation", "Desqualification", "DG", "DQ"]

ko_tkos = ["ТКО", "Tko", "TKP", "ΤΚΟ", "Corner Stoppage", "Corner's towel", "Doctor's Stoppage", "Injury", "Retirement",
           "K.O", "K.O.", "KO", "Ko", "Knockout", "ko", "KO/TKO", "KO/TKO PUNCHES", "TKO Punches", "TKO", "RETIREMENT"]

draws = ["DRAW", "Draw Unanimous", "Drew", "Majority Draw", "Technical Draw"]

decisions = ["Decision", "Decision Unanimous", "Decision unanimous", "Decisions", "Decison", "Decisão Unanime", "Desicion",
             "Desision", "Majority Decision", "Points", "Split Decision", "Split Division", "Split decision", "Technical Decision",
             "Unaminous Decision", "Unanimous", "Unanimous Decision", "Unanimous decision", "de"]

subs = ["Frontal Anaconda Choke", "Guillotine Choke", "Guillotine choke", "Kraken Choke", "Rear Naked Choke", "SUBMISSION",
        "Submision", "Submisison", "Submissio", "Submission", "Submissions", "Submisson", "Submssion", "Tapout",
        "Techinal Submission", "Techincal Submission", "Technial Submission", "Technical Submission", "Triangle Choke",
        "Verbal Submission", "su", "submison", "submission", "ubmission"]

ncs = ["N/C", "NC", "ND", "No Conest", "No Contest", "No Contest - Clements Failed Drug Test",
       "No Contest - Collard Failed Drug Test", "No Contest - Overturned by Commission",
       "No Contest - Overturned by NSAC", "No Contest - Overturned by WTKA", "No Contest - Result overturned by FIGMMA",
       "No Contest - Strikes After The Fight", "No Decision", "No Decision - Overturned by CSAC",
       "No Decision - Overturned by FMMAF", "No Decision - Zappitella Failed Drug Test", "Overtuned",
       "Overturned by Promoter"]

# Apply the groups one after another, in the same order as they are listed.
for broad_label, variant_spellings in [
    ("Disqualification", dqs),
    ("KO/TKO", ko_tkos),
    ("Draw", draws),
    ("Decision", decisions),
    ("Submission", subs),
    ("No Contest", ncs),
]:
    is_variant = sherdog_fighter_histories["outcome_method_broad"].isin(variant_spellings)
    sherdog_fighter_histories.loc[is_variant, "outcome_method_broad"] = broad_label

# end round stuff
# Recode specific round values (0 becomes missing), then store the column as
# nullable integers.
for recorded_round, corrected_round in [(0, np.nan), (12, 1), (22, 2), (30, 3)]:
    sherdog_fighter_histories.loc[sherdog_fighter_histories["end_round"] == recorded_round, "end_round"] = corrected_round
sherdog_fighter_histories["end_round"] = sherdog_fighter_histories["end_round"].astype("Int64")

# end round time
# Pass 1 - literal substring substitutions, applied in this order: stray
# separator characters become ":" and a " min" suffix is dropped.
end_time_text = sherdog_fighter_histories["end_round_time"]
for found, substitute in [
    (";", ":"),
    ('"', ":"),
    (".", ":"),
    (",", ":"),
    ("?", ":"),
    ("L", ":"),
    ("_", ":"),
    (" min", ""),
]:
    end_time_text = end_time_text.str.replace(found, substitute, regex=False)
sherdog_fighter_histories["end_round_time"] = end_time_text

# Pass 2 - whole-value corrections. Order is kept as-is because the later
# substring pass would otherwise change which values match here.
for placeholder in ["M/A", "SUBMISSION"]:
    sherdog_fighter_histories.loc[sherdog_fighter_histories["end_round_time"] == placeholder, "end_round_time"] = np.nan
# A bare minute count gets ":00" appended.
sherdog_fighter_histories.loc[sherdog_fighter_histories["end_round_time"].isin(["1", "2", "3", "5"]), "end_round_time"] += ":00"
for recorded_time, corrected_time in [
    ("05", "5:00"),
    ("38", "0:38"),
    ("51", "0:51"),
    ("021", "0:21"),
    ("215", "2:15"),
    ("311", "3:11"),
    ("201641", "5:00"),
    ("383518", np.nan),
    ("429661", np.nan),
    ("2:30::", "2:30"),
    ("2: 5", "2:05"),
    ("2:26:", "2:26"),
    ("01:15:00", "1:15"),
    ("00:03:50", "3:50"),
    ("05:00:00", "5:00"),
    ("00:02:48", "2:48"),
    ("00:05:00", "5:00"),
    ("04:17:00", "4:17"),
    ("00:01:39", "1:39"),
    ("4:45:00", "4:45"),
    ("1:12:00 AM", "1:12"),
]:
    sherdog_fighter_histories.loc[sherdog_fighter_histories["end_round_time"] == recorded_time, "end_round_time"] = corrected_time

# Pass 3 - remaining literal substitutions: symbols mapped to digits, stray
# letters/apostrophes removed, doubled or space-prefixed colons collapsed.
end_time_text = sherdog_fighter_histories["end_round_time"]
for found, substitute in [
    ("!", "1"),
    ("$", "4"),
    ("&", "7"),
    ("q", ""),
    ("d", ""),
    ("'", ""),
    ("::", ":"),
    (" :", ":"),
]:
    end_time_text = end_time_text.str.replace(found, substitute, regex=False)
sherdog_fighter_histories["end_round_time"] = end_time_text
# end round time seconds
def convert_to_seconds(time_str):
    """Turn a "minutes:seconds" string into a total number of seconds.

    A missing value returns NaN. An empty minutes part (e.g. ":38") counts as
    zero minutes. The string must contain exactly one ":"; anything else, or a
    non-numeric part, raises ValueError.
    """
    if pd.isna(time_str):
        return np.nan

    minute_part, second_part = time_str.split(":")
    whole_minutes = int(minute_part) if minute_part != "" else 0

    return whole_minutes * 60 + int(second_part)

# Convert each cleaned time string to seconds; an exact 0 is treated as missing
# and the column is stored as nullable integers.
history_round_seconds = sherdog_fighter_histories["end_round_time"].apply(convert_to_seconds)
sherdog_fighter_histories["end_round_time_seconds"] = history_round_seconds
is_zero_seconds = sherdog_fighter_histories["end_round_time_seconds"] == 0
sherdog_fighter_histories.loc[is_zero_seconds, "end_round_time_seconds"] = np.nan
sherdog_fighter_histories["end_round_time_seconds"] = (
    sherdog_fighter_histories["end_round_time_seconds"].astype("Int64")
)

# total time seconds
def calculate_total_time_seconds(row):
    """Return the elapsed fight time in seconds for one row.

    Reads `row["end_round"]` and `row["end_round_time_seconds"]`. Each round
    before the final one counts as 300 seconds; the time within the final round
    is added on top. NaN is returned when either input is missing.
    """
    if pd.notna(row["end_round"]) and pd.notna(row["end_round_time_seconds"]):
        full_rounds_before = row["end_round"] - 1
        return full_rounds_before * 300 + row["end_round_time_seconds"]

    return np.nan

# Row-wise total fight time, stored as nullable integers.
history_total_seconds = sherdog_fighter_histories.apply(calculate_total_time_seconds, axis=1)
sherdog_fighter_histories["total_time_seconds"] = history_total_seconds.astype("Int64")

# Every row of event 95098 gets the date 2022-11-30.
# NOTE(review): this writes a Timestamp into `date`; whether that matches the
# column's existing dtype depends on how the CSV was loaded - confirm.
in_event_95098 = sherdog_fighter_histories["event_id"] == 95098
sherdog_fighter_histories.loc[in_event_95098, "date"] = pd.to_datetime("2022-11-30")

# Sort each fighter's rows by date (ties broken by the existing `order`), then
# renumber `order` as 1..n per fighter following that sorted sequence.
sherdog_fighter_histories = (
    sherdog_fighter_histories
    .sort_values(by=["fighter_id", "date", "order"])
    .reset_index(drop=True)
)
sherdog_fighter_histories["order"] = sherdog_fighter_histories.groupby("fighter_id").cumcount() + 1

# Columns written to the clean file, in output order.
fighter_history_columns = [
    'fighter_id',
    'order',
    'event_id',
    'date',
    'opponent_id',
    'outcome',
    'outcome_method',
    'outcome_method_broad',
    'end_round',
    'end_round_time_seconds',
    'total_time_seconds',
]
sherdog_fighter_histories = sherdog_fighter_histories[fighter_history_columns]

fighter_histories_clean_path = os.path.join(clean_data_dir, "Sherdog", "fighter_histories.csv")
sherdog_fighter_histories.to_csv(fighter_histories_clean_path, index=False)

In [55]:
sherdog_bouts = pd.read_csv(os.path.join(raw_data_dir, "Sherdog", "bouts.csv"))
# Fighter ids as nullable integers.
for fighter_id_column in ("fighter_1_id", "fighter_2_id"):
    sherdog_bouts[fighter_id_column] = sherdog_bouts[fighter_id_column].astype("Int64")

# Short outcome codes; "yet to come" and any unlisted value end up missing.
outcome_map = {
    "win": "W",
    "loss": "L",
    "draw": "D",
    "nc": "NC",
    "yet to come": np.nan
}
for fighter_outcome_column in ("fighter_1_outcome", "fighter_2_outcome"):
    sherdog_bouts[fighter_outcome_column] = sherdog_bouts[fighter_outcome_column].map(outcome_map)

# Keep the raw text under a temporary name and derive a cleaned label from it.
sherdog_bouts = sherdog_bouts.rename(columns={"weight_class": "weight_class_TEMP"})


def _weight_class_label(raw_label):
    """Map a raw weight-class string to its label.

    Missing stays missing; anything mentioning "Catchweight" becomes
    "Catchweight"; other strings containing "lb" become missing; everything
    else is returned unchanged.
    """
    if pd.isna(raw_label):
        return np.nan
    if "Catchweight" in raw_label:
        return "Catchweight"
    if "lb" in raw_label:
        return np.nan
    return raw_label


sherdog_bouts["weight_class"] = sherdog_bouts["weight_class_TEMP"].apply(_weight_class_label)
is_pound_for_pound = sherdog_bouts["weight_class"] == "Pound for Pound"
sherdog_bouts.loc[is_pound_for_pound, "weight_class"] = np.nan

def extract_weight_lbs(weight_class_str):
    """Return the weight in pounds described by a raw weight-class string.

    Missing input returns NaN. A string containing "lb" is parsed as a number
    after removing "lb" and "Catchweight" (ValueError if the remainder is not an
    integer). Otherwise the string is looked up in a table of named classes,
    with NaN for names that are not in the table.
    """
    if pd.isna(weight_class_str):
        return np.nan

    if "lb" in weight_class_str:
        numeric_text = weight_class_str.replace("lb", "").replace("Catchweight", "")
        return int(numeric_text)

    pounds_by_class = dict([
        ("Atomweight", 105),
        ("Strawweight", 115),
        ("Flyweight", 125),
        ("Bantamweight", 135),
        ("Featherweight", 145),
        ("Lightweight", 155),
        ("Welterweight", 170),
        ("Middleweight", 185),
        ("Light Heavyweight", 205),
        ("Heavyweight", 265),
    ])
    return pounds_by_class.get(weight_class_str, np.nan)

# Weight in pounds derived from the raw weight-class text, as nullable integers.
bout_weight_lbs = sherdog_bouts["weight_class_TEMP"].apply(extract_weight_lbs)
sherdog_bouts["weight_class_lbs"] = bout_weight_lbs.astype("Int64")


# Broad outcome category: the text before the first opening or closing
# parenthesis of `outcome_method`, with surrounding whitespace removed.
bout_method_prefix = sherdog_bouts["outcome_method"].str.split(r'\(|\)', expand=True).iloc[:, 0]
sherdog_bouts["outcome_method_broad"] = bout_method_prefix.str.strip()

# stupid edge cases
empty_bout_method = sherdog_bouts["outcome_method_broad"] == ""
sherdog_bouts.loc[empty_bout_method, "outcome_method_broad"] = np.nan

# Spelling variants seen in the data, grouped by the category they collapse to.
dqs = ["Disqualification", "Disqualifcation", "Desqualification", "DG", "DQ"]

ko_tkos = ["ТКО", "Tko", "TKP", "ΤΚΟ", "Corner Stoppage", "Corner's towel", "Doctor's Stoppage", "Injury", "Retirement",
           "K.O", "K.O.", "KO", "Ko", "Knockout", "ko", "KO/TKO", "KO/TKO PUNCHES", "TKO Punches", "TKO", "RETIREMENT",
           "Dcotor Stoppade", "Nocaute"]

draws = ["DRAW", "Draw Unanimous", "Drew", "Majority Draw", "Technical Draw"]

decisions = ["Decision", "Decision Unanimous", "Decision unanimous", "Decisions", "Decison", "Decisão Unanime", "Desicion",
             "Desision", "Majority Decision", "Points", "Split Decision", "Split Division", "Split decision", "Technical Decision",
             "Unaminous Decision", "Unanimous", "Unanimous Decision", "Unanimous decision", "de", "Decisionn", "Deicision"]

subs = ["Frontal Anaconda Choke", "Guillotine Choke", "Guillotine choke", "Kraken Choke", "Rear Naked Choke", "SUBMISSION",
        "Submision", "Submisison", "Submissio", "Submission", "Submissions", "Submisson", "Submssion", "Tapout",
        "Techinal Submission", "Techincal Submission", "Technial Submission", "Technical Submission", "Triangle Choke",
        "Verbal Submission", "su", "submison", "submission", "ubmission", "Bulldog Choke", "RNC", "Submissiom",
        "Submission-Guillotine Choke", "Submissoin", "sub"]

ncs = ["N/C", "NC", "ND", "No Conest", "No Contest", "No Contest - Clements Failed Drug Test",
       "No Contest - Collard Failed Drug Test", "No Contest - Overturned by Commission",
       "No Contest - Overturned by NSAC", "No Contest - Overturned by WTKA", "No Contest - Result overturned by FIGMMA",
       "No Contest - Strikes After The Fight", "No Decision", "No Decision - Overturned by CSAC",
       "No Decision - Overturned by FMMAF", "No Decision - Zappitella Failed Drug Test", "Overtuned",
       "Overturned by Promoter", "Illegal Punch", "Unintentional Eye Poke"]

# Apply the groups one after another, in the same order as they are listed.
for broad_label, variant_spellings in [
    ("Disqualification", dqs),
    ("KO/TKO", ko_tkos),
    ("Draw", draws),
    ("Decision", decisions),
    ("Submission", subs),
    ("No Contest", ncs),
]:
    is_variant = sherdog_bouts["outcome_method_broad"].isin(variant_spellings)
    sherdog_bouts.loc[is_variant, "outcome_method_broad"] = broad_label

# end round stuff
# Recode specific round values (0 becomes missing), then store the column as
# nullable integers.
for recorded_round, corrected_round in [(0, np.nan), (12, 1), (22, 2), (30, 3), (31, 1), (4355842, 3)]:
    sherdog_bouts.loc[sherdog_bouts["end_round"] == recorded_round, "end_round"] = corrected_round
sherdog_bouts["end_round"] = sherdog_bouts["end_round"].astype("Int64")

# end round time
# Literal substring substitutions, applied strictly in this order: stray
# separators become ":", symbols become digits, and stray letters/apostrophes
# are removed. The whole-value lists below are spelled as the strings look
# *after* these substitutions.
bout_time_text = sherdog_bouts["end_round_time"]
for found, substitute in [
    (";", ":"),
    ('"', ":"),
    (" :", ":"),
    (")", "0"),
    ("!", "1"),
    ("$", "4"),
    ("&", "7"),
    (".:", ":"),
    (".", ":"),
    ("_", ":"),
    ("?", ":"),
    ("L", ":"),
    ("d", ""),
    ("q", ""),
    ("r", ""),
    ("s", ""),
    ("'", ""),
    (",", ":"),
]:
    bout_time_text = bout_time_text.str.replace(found, substitute, regex=False)
sherdog_bouts["end_round_time"] = bout_time_text

# Values that carry no usable time at all.
nans = [":evon Shab", "Aabek To", "Anew He", "BMISSION:", "Joeph Pe", "KO/TKO: PU", "KO/TKO: ST", "M/A",
        "Nick Kaaze", "SUBMISSION"]
sherdog_bouts.loc[sherdog_bouts["end_round_time"].isin(nans), "end_round_time"] = np.nan

# Whole-value corrections, applied in the listed order.
for recorded_time, corrected_time in [
    ("2 m an 14", "2:14"),
    ("2 min 11e", "2:11"),
    ("2:50 min", "2:50"),
    ("2: 5", "2:05"),
    ("2:4u", "2:40"),
    ("KO/TKO 2:1", "2:10"),
    ("KO/TKO 2:5", "2:50"),
    ("KO/TKO 3:0", "3:00"),
    ("1:12:00 AM", "1:12"),
    ("4:40:00 AM", "4:40"),
    ("00:01:39", "1:39"),
    ("00:02:48", "2:48"),
    ("00:03:02", "3:02"),
    ("00:03:50", "3:50"),
    ("00:05:00", "5:00"),
    ("01:15:00", "1:15"),
    ("04:17:00", "4:17"),
    ("05:00:00", "5:00"),
    ("4:45:00", "4:45"),
    ("0::32", "0:32"),
    ("1::45", "1:45"),
    ("2:26:", "2:26"),
    ("2:30::", "2:30"),
    ("2::58", "2:58"),
    ("05", "5:00"),
    ("215", "2:15"),
    ("021", "0:21"),
    ("241", "2:41"),
    ("250", "2:50"),
    ("311", "3:11"),
    ("135", "1:35"),
]:
    sherdog_bouts.loc[sherdog_bouts["end_round_time"] == recorded_time, "end_round_time"] = corrected_time

# Remaining bare numbers are discarded as missing.
nans2 = [
    "1",
    '110087',
    '135',
    '2',
    '201641',
    '23',
    '3',
    '308073',
    '38',
    '383518',
    '429661',
    '431507',
    '467551',
    '5',
    '51',
]
sherdog_bouts.loc[sherdog_bouts["end_round_time"].isin(nans2), "end_round_time"] = np.nan

# end round time seconds
def convert_to_seconds(time_str):
    """Parse an "M:SS" style string and return the number of seconds.

    Returns NaN for a missing value. A blank minutes field (":SS") is read as
    0 minutes. ValueError is raised when the string does not split into exactly
    two parts on ":" or when a part is not an integer.
    """
    if pd.isna(time_str):
        return np.nan

    pieces = time_str.split(":")
    minutes_text, seconds_text = pieces
    minutes_value = 0 if minutes_text == "" else int(minutes_text)
    seconds_value = int(seconds_text)

    return 60 * minutes_value + seconds_value

# Convert each cleaned time string to seconds; an exact 0 is treated as missing
# and the column is stored as nullable integers.
bout_round_seconds = sherdog_bouts["end_round_time"].apply(convert_to_seconds)
sherdog_bouts["end_round_time_seconds"] = bout_round_seconds
bout_zero_seconds = sherdog_bouts["end_round_time_seconds"] == 0
sherdog_bouts.loc[bout_zero_seconds, "end_round_time_seconds"] = np.nan
sherdog_bouts["end_round_time_seconds"] = (
    sherdog_bouts["end_round_time_seconds"].astype("Int64")
)

# total time seconds
def calculate_total_time_seconds(row):
    """Compute total elapsed seconds from a row's round number and round time.

    Uses `row["end_round"]` and `row["end_round_time_seconds"]`: 300 seconds for
    every round preceding the last one, plus the seconds into the last round.
    If either value is missing the result is NaN.
    """
    has_inputs = pd.notna(row["end_round"]) and pd.notna(row["end_round_time_seconds"])
    if not has_inputs:
        return np.nan

    earlier_rounds = row["end_round"] - 1
    return earlier_rounds * 300 + row["end_round_time_seconds"]

# Row-wise total fight time, stored as nullable integers.
bout_total_seconds = sherdog_bouts.apply(calculate_total_time_seconds, axis=1)
sherdog_bouts["total_time_seconds"] = bout_total_seconds.astype("Int64")

# Columns written to the clean file, in output order (the temporary raw
# weight-class column and the raw time string are left out).
bout_columns = [
    'event_id',
    'bout_order',
    'fighter_1_id',
    'fighter_2_id',
    'fighter_1_outcome',
    'fighter_2_outcome',
    'is_title_bout',
    'weight_class',
    'weight_class_lbs',
    'outcome_method',
    'outcome_method_broad',
    'end_round',
    'end_round_time_seconds',
    'total_time_seconds',
]
sherdog_bouts = sherdog_bouts[bout_columns]

sherdog_bouts_clean_path = os.path.join(clean_data_dir, "Sherdog", "bouts.csv")
sherdog_bouts.to_csv(sherdog_bouts_clean_path, index=False)