# Data Cleaning - Best Fight Odds

In [None]:
# standard library imports
import os
import zipfile

# third party imports
import numpy as np
import pandas as pd

# local imports


# Locate the project's data folders.  "__file__" is a string literal here
# (not the module attribute), so os.path.dirname() returns "" and every
# path below is relative to the current working directory.
_notebook_dir = os.path.dirname("__file__")
data_dir = os.path.join(_notebook_dir, "..", "..", "data")
raw_data_dir, clean_data_dir = (
    os.path.join(data_dir, stage) for stage in ("raw", "clean")
)

In [None]:
# data preparation
# Read the raw Best Fight Odds closing lines, parsing Card_Date as datetimes.
_closing_odds_path = os.path.join(
    raw_data_dir, "Best Fight Odds", "closing_with_props.csv"
)
bestfightodds_prop_odds = pd.read_csv(_closing_odds_path, parse_dates=["Card_Date"])

# drop bad events
# Events listed here are excluded by exact name.
bad_events = [
    "UFC 9: Motor City Madness",
    "UFC 14: Showdown",
    "UFC 15: Collision Course",
    "UFC 18: The Road to the Heavyweight Title",
    "UFC 32: Showdown in the Meadowlands",
    "UFC 35: Throwdown",
    "UFC 48: Payback",
    "UFC 53: Heavy Hitters",
    "UFC 56: Full Force",
]
_is_bad_event = bestfightodds_prop_odds["Event"].isin(bad_events)
bestfightodds_prop_odds = bestfightodds_prop_odds.loc[~_is_bad_event]

# fix event urls
# Event name -> URL to use for that event.  Every row of a listed event has
# its "url" overwritten; the trailing number of the URL is what the event_id
# extraction further down parses.
event_url_fixes = {
    "UFC 73: Stacked": "https://www.bestfightodds.com/events/ufc-73-stacked-1",
    "UFC 74: Respect": "https://www.bestfightodds.com/events/ufc-74-respect-7",
    "UFC 76: Knockout": "https://www.bestfightodds.com/events/ufc-76-knockout-12",
    "UFC 84: Ill Will": "https://www.bestfightodds.com/events/ufc-84-ill-will-47",
    "UFC 85: Bedlam": "https://www.bestfightodds.com/events/ufc-85-bedlam-46",
    "UFC 87: Seek and Destroy": "https://www.bestfightodds.com/events/ufc-87-seek-and-destroy-57",
    "UFC 99: The Comeback": "https://www.bestfightodds.com/events/ufc-99-the-comeback-136",
    "UFC 100": "https://www.bestfightodds.com/events/ufc-100-137",
    "UFC 101: Declaration": "https://www.bestfightodds.com/events/ufc-101-declaration-145",
    "UFC 109: Relentless": "https://www.bestfightodds.com/events/ufc-109-relentless-226",
    "UFC 112: Invincible": "https://www.bestfightodds.com/events/ufc-112-invincible-245",
    "UFC 119: Mir vs. Cro Cop": "https://www.bestfightodds.com/events/ufc-119-mir-vs-cro-cop-296",
}
for _event_name, _fixed_url in event_url_fixes.items():
    bestfightodds_prop_odds.loc[
        bestfightodds_prop_odds["Event"] == _event_name, "url"
    ] = _fixed_url

# The event id is the last "-"-separated token of the URL's final path
# segment, e.g. ".../events/ufc-100-137" -> 137.
_url_slug = bestfightodds_prop_odds["url"].str.split("/").str[-1]
_event_id_text = _url_slug.str.split("-").str[-1]
bestfightodds_prop_odds["event_id"] = _event_id_text.astype(int)

# Drop the url / card date / event name columns and give the William Hill
# column its spaced name.
bestfightodds_prop_odds = bestfightodds_prop_odds.drop(
    columns=["url", "Card_Date", "Event"]
).rename(columns={"William_H": "William Hill"})

# Trim surrounding whitespace from the bet description.
bestfightodds_prop_odds["Bet"] = bestfightodds_prop_odds["Bet"].str.strip()

## Bout Proposition Odds

In [None]:
# Keep only the rows whose bet_type is "Prop"; the column is then dropped.
_is_prop = bestfightodds_prop_odds["bet_type"] == "Prop"
bout_prop_odds = (
    bestfightodds_prop_odds.loc[_is_prop]
    .drop(columns=["bet_type"])
    .reset_index(drop=True)
)

# Write half-point lines as decimals ("1½" -> "1.5").
_bet_text = bout_prop_odds["Bet"].str.strip()
bout_prop_odds["Bet"] = _bet_text.str.replace("½", ".5", regex=False)

# drop columns with all nan
bout_prop_odds = bout_prop_odds.dropna(axis=1, how="all")


# is_not
def _is_negated_bet(bet):
    """Return 1 for a "Not ..." / "Any other result" bet, else 0."""
    return int(bet.startswith("Not ") or bet == "Any other result")


bout_prop_odds["is_not"] = bout_prop_odds["Bet"].apply(_is_negated_bet)

# A negated row takes the bet text of the row directly above it
# (presumably the affirmative side of the same prop -- verify against data).
_previous_bet = bout_prop_odds["Bet"].shift()
bout_prop_odds["Bet"] = bout_prop_odds["Bet"].where(
    bout_prop_odds["is_not"] != 1, _previous_bet
)

# reference fighter last name
# Start from the first whitespace-separated word of the bet text ...
_first_word = bout_prop_odds["Bet"].str.split().str[0].str.strip()
bout_prop_odds["ref_fighter_last_name"] = _first_word

# ... then blank it wherever that word is a generic lead-in rather than a
# surname, or the whole bet is one of the listed bout-level props.
_generic_first_words = ["Fight", "Over", "Under", "Wins"]
other_edge_cases = [
    "Both fighters are knocked down",
    "Fighters touch gloves before fight",
    "No glove touch before fight",
    "Timeout is called",
    "Timeout is not called",
    "Either fighter wins by TKO/KO",
    "Either fighter wins by submission",
    "Either fighter is disqualified",
    "Either fighter is stopped by doctor/corner",
    "Clock stoppage due to eye poke",
    "No clock stoppage due to eye poke",
    "Does not announce retirement",
    "Does not pull out of the fight",
]
_not_fighter_specific = bout_prop_odds["ref_fighter_last_name"].isin(
    _generic_first_words
) | bout_prop_odds["Bet"].isin(other_edge_cases)
bout_prop_odds.loc[_not_fighter_specific, "ref_fighter_last_name"] = np.nan

# william hill weirdness
# Quotes of exactly 100000 or -10000 are blanked out (presumably placeholder
# values rather than real lines -- confirm against the scraper).
_wh_odds = bout_prop_odds["William Hill"]
_suspect_quote = (_wh_odds == 100000) | (_wh_odds == -10000)
bout_prop_odds.loc[_suspect_quote, "William Hill"] = np.nan

# reference fighter
def _resolve_ref_fighter(row):
    """Return which fighter column a prop's leading surname belongs to.

    NaN when the prop has no reference surname, "fighter1" / "fighter2" when
    that fighter's name ends with the surname (fighter1 is checked first),
    and the marker "PROBLEM" when neither name does.
    """
    last_name = row["ref_fighter_last_name"]
    if pd.isna(last_name):
        return np.nan
    if row["fighter1"].endswith(last_name):
        return "fighter1"
    if row["fighter2"].endswith(last_name):
        return "fighter2"
    return "PROBLEM"


bout_prop_odds["ref_fighter"] = bout_prop_odds.apply(_resolve_ref_fighter, axis=1)

# remove joby sanchez vs roberto sanchez fighter-specific props, literally no way of knowing which is which
_ambiguous_sanchez = (
    bout_prop_odds["fighter1"].eq("Joby Sanchez")
    & bout_prop_odds["fighter2"].eq("Roberto Sanchez")
    & bout_prop_odds["ref_fighter_last_name"].eq("Sanchez")
)
bout_prop_odds = bout_prop_odds.loc[~_ambiguous_sanchez]

# remove rows with all nan odds
# A row is kept as long as at least one of these sportsbook columns is set.
_sportsbook_columns = [
    "5Dimes",
    "BetDSI",
    "BookMaker",
    "SportBet",
    "Bet365",
    "Bovada",
    "William Hill",
    "Pinnacle",
    "SportsInt",
    "BetOnline",
    "Intertops",
]
bout_prop_odds = bout_prop_odds.dropna(
    subset=_sportsbook_columns, how="all"
).reset_index(drop=True)

# remove fighter names from bets
def _strip_fighter_name(row):
    """Return the bet text with the reference surname removed.

    Rows without a reference fighter are returned untouched.  Otherwise every
    occurrence of the surname is removed, the result is trimmed, and
    str.capitalize() upper-cases the first character and lower-cases the rest.
    """
    if pd.isna(row["ref_fighter"]):
        return row["Bet"]
    without_name = row["Bet"].replace(row["ref_fighter_last_name"], "")
    return without_name.strip().capitalize()


bout_prop_odds["Bet"] = bout_prop_odds.apply(_strip_fighter_name, axis=1)
bout_prop_odds = bout_prop_odds.drop(columns=["ref_fighter_last_name"])

# drop cancelled fights
# Each entry is (event_id, fighter1, fighter2); every prop row matching all
# three values exactly is removed.  Add new cancellations to this table
# instead of copy-pasting another filter.
cancelled_bouts = [
    (1444, "Al Iaquinta", "Paul Felder"),
    (1289, "Aspen Ladd", "Jessica Eye"),
    (1562, "Brian Kelleher", "Montel Jackson"),
    (1647, "Diego Ferreira", "Francisco Trinaldo"),
    (1713, "Giacomo Lemos", "Tanner Boser"),
    (1849, "Emily Whitmire", "Polyana Viana"),
    (1246, "Brett Johns", "Ian Entwistle"),
    (1680, "John Lineker", "Rob Font"),
    (1673, "Devin Clark", "Ivan Shtyrkov"),
    (1467, "Jessica Aguilar", "Jodie Esquibel"),
    (1478, "Davey Grant", "Manny Bermudez"),
    (1626, "Ian Heinisch", "Tom Breese"),
    (1423, "Nad Narimani", "Nasrat Haqparast"),
    (1687, "Jordan Griffin", "Vince Murdock"),
    (1637, "Jessica Penne", "Jodie Esquibel"),
    (1587, "Andrea Lee", "Jessica-Rose Clark"),
    (485, "Buddy Roberts", "Sean Loeffler"),
    (1647, "Melissa Gatto", "Talita Bernardo"),
]

# OR the per-bout conditions together so the frame is filtered once.
_is_cancelled = pd.Series(False, index=bout_prop_odds.index)
for _event_id, _fighter1, _fighter2 in cancelled_bouts:
    _is_cancelled |= (
        (bout_prop_odds["event_id"] == _event_id)
        & (bout_prop_odds["fighter1"] == _fighter1)
        & (bout_prop_odds["fighter2"] == _fighter2)
    )
bout_prop_odds = bout_prop_odds.loc[~_is_cancelled].reset_index(drop=True)

# Trim stray whitespace from both fighter-name columns.
for _corner in ("fighter1", "fighter2"):
    bout_prop_odds[_corner] = bout_prop_odds[_corner].str.strip()

# One row per distinct (fighter1, fighter2, event_id) combination.
_bout_columns = ["fighter1", "fighter2", "event_id"]
fighter_event_df = (
    bout_prop_odds.loc[:, _bout_columns].drop_duplicates().reset_index(drop=True)
)

# Load the cleaned Tapology tables.  Tapology's own ids get a "tapology_"
# prefix; its bestfightodds_id columns become event_id / fighter_id.
_tapology_dir = os.path.join(clean_data_dir, "Tapology")

tapology_events = pd.read_csv(
    os.path.join(_tapology_dir, "events.csv"),
    usecols=["id", "bestfightodds_id", "event_order"],
).rename(columns={"id": "tapology_event_id", "bestfightodds_id": "event_id"})

tapology_bouts = pd.read_csv(
    os.path.join(_tapology_dir, "bouts.csv"),
    usecols=["id", "event_id", "fighter_1_id", "fighter_2_id", "bout_order"],
).rename(
    columns={
        "id": "tapology_bout_id",
        "event_id": "tapology_event_id",
        "fighter_1_id": "tapology_fighter_1_id",
        "fighter_2_id": "tapology_fighter_2_id",
    }
)

tapology_fighters = pd.read_csv(
    os.path.join(_tapology_dir, "fighters.csv"),
    usecols=["id", "bestfightodds_id", "name"],
).rename(columns={"id": "tapology_fighter_id", "bestfightodds_id": "fighter_id"})

# Attach the event's Best Fight Odds id and event_order to every bout.
tapology_bouts = tapology_bouts.merge(
    tapology_events, on="tapology_event_id", how="left"
).drop(columns=["tapology_event_id"])

# Resolve each corner's Tapology fighter id to a Best Fight Odds fighter id
# (fighter_<n>_id) and a display name (fighter<n>).
for _corner in ("1", "2"):
    tapology_bouts = (
        tapology_bouts.merge(
            tapology_fighters,
            left_on=f"tapology_fighter_{_corner}_id",
            right_on="tapology_fighter_id",
            how="left",
        )
        .rename(
            columns={"fighter_id": f"fighter_{_corner}_id", "name": f"fighter{_corner}"}
        )
        .drop(columns=["tapology_fighter_id", f"tapology_fighter_{_corner}_id"])
    )

# Only bouts on events that actually have prop odds are relevant.
tapology_bouts = tapology_bouts.loc[
    tapology_bouts["event_id"].isin(fighter_event_df["event_id"])
].reset_index(drop=True)

# NOTE: astype(int) raises if any remaining id is missing (NaN).
_id_columns = ["event_id", "fighter_1_id", "fighter_2_id"]
tapology_bouts[_id_columns] = tapology_bouts[_id_columns].astype(int)

# Reduce names to plain ASCII: NFKD splits accented letters into base letter
# plus combining mark, and the ASCII encode with errors="ignore" drops
# whatever is not ASCII.
for _name_column in ("fighter1", "fighter2"):
    tapology_bouts[_name_column] = (
        tapology_bouts[_name_column]
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
    )

# Pass 1: match odds bouts to Tapology bouts on event id plus both fighters'
# lower-cased, period-free full names, fighters in the same order.
fighter_event_df_temp = fighter_event_df.copy()
tapology_bouts_temp = tapology_bouts.copy()
for _frame in (fighter_event_df_temp, tapology_bouts_temp):
    for _corner in ("fighter1", "fighter2"):
        _frame[f"{_corner}_lower"] = (
            _frame[_corner].str.lower().str.replace(".", "", regex=False)
        )
# The Tapology spellings are dropped so the merge keeps the odds-side names.
tapology_bouts_temp = tapology_bouts_temp.drop(columns=["fighter1", "fighter2"])

merged1 = pd.merge(
    fighter_event_df_temp,
    tapology_bouts_temp,
    how="left",
    on=["event_id", "fighter1_lower", "fighter2_lower"],
)

# Rows that found a Tapology bout, without the helper key columns.
good_matches1 = (
    merged1.loc[merged1["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_lower", "fighter2_lower"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches1[_fighter_id_columns] = good_matches1[_fighter_id_columns].astype(int)

# Pass 2: same exact-name match as pass 1, but with Tapology's two corners
# swapped (names and fighter ids), for bouts listed in the opposite order.

# Bouts still unmatched after pass 1.  They are found by anti-joining on the
# bout identity instead of re-using merged1's row mask: that mask is only
# aligned with fighter_event_df when the pass-1 merge was strictly one-to-one.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df2 = (
    fighter_event_df.merge(
        good_matches1[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df2_temp = fighter_event_df2.copy()

# Tapology bouts not consumed by pass 1.
tapology_bouts2 = tapology_bouts.loc[
    ~tapology_bouts["tapology_bout_id"].isin(good_matches1["tapology_bout_id"])
].reset_index(drop=True)
_swap_corners = {
    "fighter_1_id": "fighter_2_id",
    "fighter_2_id": "fighter_1_id",
    "fighter1": "fighter2",
    "fighter2": "fighter1",
}
tapology_bouts2_temp = tapology_bouts2.rename(columns=_swap_corners)

for _frame in (fighter_event_df2_temp, tapology_bouts2_temp):
    for _corner in ("fighter1", "fighter2"):
        _frame[f"{_corner}_lower"] = (
            _frame[_corner].str.lower().str.replace(".", "", regex=False)
        )
tapology_bouts2_temp = tapology_bouts2_temp.drop(columns=["fighter1", "fighter2"])

merged2 = fighter_event_df2_temp.merge(
    tapology_bouts2_temp,
    on=["event_id", "fighter1_lower", "fighter2_lower"],
    how="left",
)
good_matches2 = (
    merged2.loc[merged2["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_lower", "fighter2_lower"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches2[_fighter_id_columns] = good_matches2[_fighter_id_columns].astype(int)

# Pass 3: match on event id plus both fighters' first names (lower-cased,
# periods removed), fighters in the same order.

# Bouts still unmatched after pass 2, found by anti-joining on the bout
# identity rather than re-using merged2's row mask (only valid when that
# merge was strictly one-to-one).
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df3 = (
    fighter_event_df2.merge(
        good_matches2[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df3_temp = fighter_event_df3.copy()

# Tapology bouts not consumed by pass 2.
tapology_bouts3 = tapology_bouts2.loc[
    ~tapology_bouts2["tapology_bout_id"].isin(good_matches2["tapology_bout_id"])
].reset_index(drop=True)
tapology_bouts3_temp = tapology_bouts3.copy()

_name_key = ["event_id", "fighter1_first_name", "fighter2_first_name"]
for _frame in (tapology_bouts3_temp, fighter_event_df3_temp):
    for _corner in ("fighter1", "fighter2"):
        _frame[f"{_corner}_first_name"] = (
            _frame[_corner]
            .str.lower()
            .str.split()
            .str[0]
            .str.replace(".", "", regex=False)
        )

# keep=False removes every row whose key is shared, so a loose key can only
# ever pair one odds bout with one Tapology bout.
tapology_bouts3_temp = (
    tapology_bouts3_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df3_temp = fighter_event_df3_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged3 = fighter_event_df3_temp.merge(tapology_bouts3_temp, on=_name_key, how="left")
good_matches3 = (
    merged3.loc[merged3["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_first_name", "fighter2_first_name"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches3[_fighter_id_columns] = good_matches3[_fighter_id_columns].astype(int)

# Pass 4: first-name match as in pass 3, but with Tapology's two corners
# swapped (names and fighter ids).

# Bouts still unmatched after pass 3.  merged3 was built from a frame that
# had ambiguous rows dropped and its index reset, so its row mask does not
# line up with fighter_event_df3; anti-join on the bout identity instead.
# Rows pass 3 set aside as ambiguous are carried forward as unmatched.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df4 = (
    fighter_event_df3.merge(
        good_matches3[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df4_temp = fighter_event_df4.copy()

# Tapology bouts not consumed by pass 3.
tapology_bouts4 = tapology_bouts3.loc[
    ~tapology_bouts3["tapology_bout_id"].isin(good_matches3["tapology_bout_id"])
].reset_index(drop=True)
_swap_corners = {
    "fighter_1_id": "fighter_2_id",
    "fighter_2_id": "fighter_1_id",
    "fighter1": "fighter2",
    "fighter2": "fighter1",
}
tapology_bouts4_temp = tapology_bouts4.rename(columns=_swap_corners)

_name_key = ["event_id", "fighter1_first_name", "fighter2_first_name"]
for _frame in (tapology_bouts4_temp, fighter_event_df4_temp):
    for _corner in ("fighter1", "fighter2"):
        _frame[f"{_corner}_first_name"] = (
            _frame[_corner]
            .str.lower()
            .str.split()
            .str[0]
            .str.replace(".", "", regex=False)
        )

# keep=False removes every row whose key is shared (ambiguous candidates).
tapology_bouts4_temp = (
    tapology_bouts4_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df4_temp = fighter_event_df4_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged4 = fighter_event_df4_temp.merge(tapology_bouts4_temp, on=_name_key, how="left")
good_matches4 = (
    merged4.loc[merged4["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_first_name", "fighter2_first_name"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches4[_fighter_id_columns] = good_matches4[_fighter_id_columns].astype(int)

# Pass 5: match on event id plus both fighters' last names (lower-cased last
# word, periods removed), fighters in the same order.

# Bouts still unmatched after pass 4.  merged4's row mask is not aligned with
# fighter_event_df4 once ambiguous rows were dropped, so anti-join on the
# bout identity; ambiguous rows are carried forward as unmatched.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df5 = (
    fighter_event_df4.merge(
        good_matches4[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df5_temp = fighter_event_df5.copy()

# Tapology bouts not consumed by pass 4.
tapology_bouts5 = tapology_bouts4.loc[
    ~tapology_bouts4["tapology_bout_id"].isin(good_matches4["tapology_bout_id"])
].reset_index(drop=True)
tapology_bouts5_temp = tapology_bouts5.copy()

_name_key = ["event_id", "fighter1_last_name", "fighter2_last_name"]
for _frame in (tapology_bouts5_temp, fighter_event_df5_temp):
    for _corner in ("fighter1", "fighter2"):
        _frame[f"{_corner}_last_name"] = (
            _frame[_corner]
            .str.lower()
            .str.split()
            .str[-1]
            .str.replace(".", "", regex=False)
        )

# keep=False removes every row whose key is shared (ambiguous candidates).
tapology_bouts5_temp = (
    tapology_bouts5_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df5_temp = fighter_event_df5_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged5 = fighter_event_df5_temp.merge(tapology_bouts5_temp, on=_name_key, how="left")
good_matches5 = (
    merged5.loc[merged5["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_last_name", "fighter2_last_name"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches5[_fighter_id_columns] = good_matches5[_fighter_id_columns].astype(int)

# Pass 6: last-name match as in pass 5, but with Tapology's two corners
# swapped (names and fighter ids).

# Bouts still unmatched after pass 5.  merged5's row mask is not aligned with
# fighter_event_df5 once ambiguous rows were dropped, so anti-join on the
# bout identity; ambiguous rows are carried forward as unmatched.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df6 = (
    fighter_event_df5.merge(
        good_matches5[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df6_temp = fighter_event_df6.copy()

# Tapology bouts not consumed by pass 5.
tapology_bouts6 = tapology_bouts5.loc[
    ~tapology_bouts5["tapology_bout_id"].isin(good_matches5["tapology_bout_id"])
].reset_index(drop=True)
_swap_corners = {
    "fighter_1_id": "fighter_2_id",
    "fighter_2_id": "fighter_1_id",
    "fighter1": "fighter2",
    "fighter2": "fighter1",
}
tapology_bouts6_temp = tapology_bouts6.rename(columns=_swap_corners)

_name_key = ["event_id", "fighter1_last_name", "fighter2_last_name"]
for _frame in (tapology_bouts6_temp, fighter_event_df6_temp):
    for _corner in ("fighter1", "fighter2"):
        _frame[f"{_corner}_last_name"] = (
            _frame[_corner]
            .str.lower()
            .str.split()
            .str[-1]
            .str.replace(".", "", regex=False)
        )

# keep=False removes every row whose key is shared (ambiguous candidates).
tapology_bouts6_temp = (
    tapology_bouts6_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df6_temp = fighter_event_df6_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged6 = fighter_event_df6_temp.merge(tapology_bouts6_temp, on=_name_key, how="left")
good_matches6 = (
    merged6.loc[merged6["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_last_name", "fighter2_last_name"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches6[_fighter_id_columns] = good_matches6[_fighter_id_columns].astype(int)

# Pass 7: match on event id plus fighter1's normalised full name only
# (lower-cased, with periods, "jr" and "dos santos" removed), same order.

# Bouts still unmatched after pass 6.  merged6's row mask is not aligned with
# fighter_event_df6 once ambiguous rows were dropped, so anti-join on the
# bout identity; ambiguous rows are carried forward as unmatched.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df7 = (
    fighter_event_df6.merge(
        good_matches6[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df7_temp = fighter_event_df7.copy()

# Tapology bouts not consumed by pass 6.
tapology_bouts7 = tapology_bouts6.loc[
    ~tapology_bouts6["tapology_bout_id"].isin(good_matches6["tapology_bout_id"])
].reset_index(drop=True)
tapology_bouts7_temp = tapology_bouts7.copy()

_name_key = ["event_id", "fighter1_lower"]
for _frame in (tapology_bouts7_temp, fighter_event_df7_temp):
    _frame["fighter1_lower"] = (
        _frame["fighter1"]
        .str.lower()
        .str.replace(".", "", regex=False)
        .str.replace("jr", "", regex=False)
        .str.strip()
        .str.replace("dos santos", "", regex=False)
        .str.strip()
    )

# keep=False removes every row whose key is shared (ambiguous candidates).
tapology_bouts7_temp = (
    tapology_bouts7_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df7_temp = fighter_event_df7_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged7 = fighter_event_df7_temp.merge(tapology_bouts7_temp, on=_name_key, how="left")
good_matches7 = (
    merged7.loc[merged7["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_lower"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches7[_fighter_id_columns] = good_matches7[_fighter_id_columns].astype(int)

# Pass 8: match on event id plus fighter2's normalised full name only
# (lower-cased, with periods, "jr" and "dos santos" removed), same order.

# Bouts still unmatched after pass 7.  merged7's row mask is not aligned with
# fighter_event_df7 once ambiguous rows were dropped, so anti-join on the
# bout identity; ambiguous rows are carried forward as unmatched.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df8 = (
    fighter_event_df7.merge(
        good_matches7[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df8_temp = fighter_event_df8.copy()

# Tapology bouts not consumed by pass 7.
tapology_bouts8 = tapology_bouts7.loc[
    ~tapology_bouts7["tapology_bout_id"].isin(good_matches7["tapology_bout_id"])
].reset_index(drop=True)
tapology_bouts8_temp = tapology_bouts8.copy()

_name_key = ["event_id", "fighter2_lower"]
for _frame in (tapology_bouts8_temp, fighter_event_df8_temp):
    _frame["fighter2_lower"] = (
        _frame["fighter2"]
        .str.lower()
        .str.replace(".", "", regex=False)
        .str.replace("jr", "", regex=False)
        .str.strip()
        .str.replace("dos santos", "", regex=False)
        .str.strip()
    )

# keep=False removes every row whose key is shared (ambiguous candidates).
tapology_bouts8_temp = (
    tapology_bouts8_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df8_temp = fighter_event_df8_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged8 = fighter_event_df8_temp.merge(tapology_bouts8_temp, on=_name_key, how="left")
good_matches8 = (
    merged8.loc[merged8["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter2_lower"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches8[_fighter_id_columns] = good_matches8[_fighter_id_columns].astype(int)

# Pass 9: fighter1 normalised-name match as in pass 7, but with Tapology's
# two corners swapped (names and fighter ids).

# Bouts still unmatched after pass 8.  merged8's row mask is not aligned with
# fighter_event_df8 once ambiguous rows were dropped, so anti-join on the
# bout identity; ambiguous rows are carried forward as unmatched.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df9 = (
    fighter_event_df8.merge(
        good_matches8[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df9_temp = fighter_event_df9.copy()

# Tapology bouts not consumed by pass 8.
tapology_bouts9 = tapology_bouts8.loc[
    ~tapology_bouts8["tapology_bout_id"].isin(good_matches8["tapology_bout_id"])
].reset_index(drop=True)
_swap_corners = {
    "fighter_1_id": "fighter_2_id",
    "fighter_2_id": "fighter_1_id",
    "fighter1": "fighter2",
    "fighter2": "fighter1",
}
tapology_bouts9_temp = tapology_bouts9.rename(columns=_swap_corners)

_name_key = ["event_id", "fighter1_lower"]
for _frame in (tapology_bouts9_temp, fighter_event_df9_temp):
    _frame["fighter1_lower"] = (
        _frame["fighter1"]
        .str.lower()
        .str.replace(".", "", regex=False)
        .str.replace("jr", "", regex=False)
        .str.strip()
        .str.replace("dos santos", "", regex=False)
        .str.strip()
    )

# keep=False removes every row whose key is shared (ambiguous candidates).
tapology_bouts9_temp = (
    tapology_bouts9_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df9_temp = fighter_event_df9_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged9 = fighter_event_df9_temp.merge(tapology_bouts9_temp, on=_name_key, how="left")
good_matches9 = (
    merged9.loc[merged9["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_lower"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches9[_fighter_id_columns] = good_matches9[_fighter_id_columns].astype(int)

# Pass 10: fighter2 normalised-name match as in pass 8, but with Tapology's
# two corners swapped (names and fighter ids).

# Bouts still unmatched after pass 9.  merged9's row mask is not aligned with
# fighter_event_df9 once ambiguous rows were dropped, so anti-join on the
# bout identity; ambiguous rows are carried forward as unmatched.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df10 = (
    fighter_event_df9.merge(
        good_matches9[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df10_temp = fighter_event_df10.copy()

# Tapology bouts not consumed by pass 9.
tapology_bouts10 = tapology_bouts9.loc[
    ~tapology_bouts9["tapology_bout_id"].isin(good_matches9["tapology_bout_id"])
].reset_index(drop=True)
_swap_corners = {
    "fighter_1_id": "fighter_2_id",
    "fighter_2_id": "fighter_1_id",
    "fighter1": "fighter2",
    "fighter2": "fighter1",
}
tapology_bouts10_temp = tapology_bouts10.rename(columns=_swap_corners)

_name_key = ["event_id", "fighter2_lower"]
for _frame in (tapology_bouts10_temp, fighter_event_df10_temp):
    _frame["fighter2_lower"] = (
        _frame["fighter2"]
        .str.lower()
        .str.replace(".", "", regex=False)
        .str.replace("jr", "", regex=False)
        .str.strip()
        .str.replace("dos santos", "", regex=False)
        .str.strip()
    )

# keep=False removes every row whose key is shared (ambiguous candidates).
tapology_bouts10_temp = (
    tapology_bouts10_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df10_temp = fighter_event_df10_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged10 = fighter_event_df10_temp.merge(
    tapology_bouts10_temp, on=_name_key, how="left"
)
good_matches10 = (
    merged10.loc[merged10["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter2_lower"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches10[_fighter_id_columns] = good_matches10[_fighter_id_columns].astype(int)

# Pass 11: match on event id plus fighter1's lower-cased first name only,
# fighters in the same order.

# Bouts still unmatched after pass 10.  merged10's row mask is not aligned
# with fighter_event_df10 once ambiguous rows were dropped, so anti-join on
# the bout identity; ambiguous rows are carried forward as unmatched.
_bout_key = ["fighter1", "fighter2", "event_id"]
fighter_event_df11 = (
    fighter_event_df10.merge(
        good_matches10[_bout_key].drop_duplicates(),
        on=_bout_key,
        how="left",
        indicator=True,
    )
    .loc[lambda frame: frame["_merge"] == "left_only"]
    .drop(columns="_merge")
    .reset_index(drop=True)
)
fighter_event_df11_temp = fighter_event_df11.copy()

# Tapology bouts not consumed by pass 10.
tapology_bouts11 = tapology_bouts10.loc[
    ~tapology_bouts10["tapology_bout_id"].isin(good_matches10["tapology_bout_id"])
].reset_index(drop=True)
tapology_bouts11_temp = tapology_bouts11.copy()

_name_key = ["event_id", "fighter1_first_name"]
for _frame in (tapology_bouts11_temp, fighter_event_df11_temp):
    _frame["fighter1_first_name"] = _frame["fighter1"].str.lower().str.split().str[0]

# keep=False removes every row whose key is shared (ambiguous candidates).
tapology_bouts11_temp = (
    tapology_bouts11_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=_name_key, keep=False)
    .reset_index(drop=True)
)
fighter_event_df11_temp = fighter_event_df11_temp.drop_duplicates(
    subset=_name_key, keep=False
).reset_index(drop=True)

merged11 = fighter_event_df11_temp.merge(
    tapology_bouts11_temp, on=_name_key, how="left"
)
good_matches11 = (
    merged11.loc[merged11["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter1_first_name"])
)
_fighter_id_columns = ["fighter_1_id", "fighter_2_id"]
good_matches11[_fighter_id_columns] = good_matches11[_fighter_id_columns].astype(int)

fighter_event_df12 = fighter_event_df11.loc[
    pd.isna(merged11["tapology_bout_id"])
].reset_index(drop=True)
# Pass 12: same idea as the previous pass, but keyed on
# (event_id, lower-cased first token of fighter2's name).
fighter_event_df12_temp = fighter_event_df12.copy()
tapology_bouts12 = tapology_bouts11.loc[
    ~tapology_bouts11["tapology_bout_id"].isin(good_matches11["tapology_bout_id"])
].reset_index(drop=True)
tapology_bouts12_temp = tapology_bouts12.copy()
tapology_bouts12_temp["fighter2_first_name"] = (
    tapology_bouts12_temp["fighter2"].str.lower().str.split().str[0]
)
tapology_bouts12_temp = (
    tapology_bouts12_temp.drop(columns=["fighter1", "fighter2"])
    .drop_duplicates(subset=["fighter2_first_name", "event_id"], keep=False)
    .reset_index(drop=True)
)
fighter_event_df12_temp["fighter2_first_name"] = (
    fighter_event_df12_temp["fighter2"].str.lower().str.split().str[0]
)
# Remember which row of fighter_event_df12 each candidate came from. The
# de-duplication below re-indexes the frame, so row positions in the merged
# result cannot be used to address fighter_event_df12 once a row is dropped.
fighter_event_df12_temp["_source_row"] = fighter_event_df12_temp.index
fighter_event_df12_temp = fighter_event_df12_temp.drop_duplicates(
    subset=["fighter2_first_name", "event_id"], keep=False
).reset_index(drop=True)
merged12 = fighter_event_df12_temp.merge(
    tapology_bouts12_temp, on=["event_id", "fighter2_first_name"], how="left"
)
matched_source_rows12 = merged12.loc[
    merged12["tapology_bout_id"].notna(), "_source_row"
]
# The bookkeeping column must not leak into the match tables.
fighter_event_df12_temp = fighter_event_df12_temp.drop(columns=["_source_row"])
merged12 = merged12.drop(columns=["_source_row"])
good_matches12 = merged12.loc[~pd.isna(merged12["tapology_bout_id"])].reset_index(
    drop=True
)
good_matches12 = good_matches12.drop(columns=["fighter2_first_name"])
good_matches12[["fighter_1_id", "fighter_2_id"]] = good_matches12[
    ["fighter_1_id", "fighter_2_id"]
].astype(int)

# Everything that was not matched in this pass (including rows set aside as
# ambiguous) moves on to pass 13. Selecting by source-row label instead of by
# a mask indexed like merged12 keeps this correct when duplicates were dropped.
fighter_event_df13 = fighter_event_df12.loc[
    ~fighter_event_df12.index.isin(matched_source_rows12)
].reset_index(drop=True)
# Pass 13: try the Tapology bouts with their two corners swapped, keyed on
# (event_id, lower-cased last token of fighter2's name).
fighter_event_df13_temp = fighter_event_df13.copy()
claimed_in_pass12 = tapology_bouts12["tapology_bout_id"].isin(
    good_matches12["tapology_bout_id"]
)
tapology_bouts13 = tapology_bouts12.loc[~claimed_in_pass12].reset_index(drop=True)

# Swap ids and names together so each id keeps following its own fighter.
corner_swap = {
    "fighter_1_id": "fighter_2_id",
    "fighter_2_id": "fighter_1_id",
    "fighter1": "fighter2",
    "fighter2": "fighter1",
}
tapology_bouts13_temp = tapology_bouts13.rename(columns=corner_swap)
tapology_bouts13_temp["fighter2_last_name"] = (
    tapology_bouts13_temp["fighter2"].str.lower().str.split().str[-1]
)
tapology_bouts13_temp = tapology_bouts13_temp.drop(columns=["fighter1", "fighter2"])
tapology_bouts13_temp = tapology_bouts13_temp.drop_duplicates(
    subset=["fighter2_last_name", "event_id"], keep=False
).reset_index(drop=True)

fighter_event_df13_temp["fighter2_last_name"] = (
    fighter_event_df13_temp["fighter2"].str.lower().str.split().str[-1]
)
fighter_event_df13_temp = fighter_event_df13_temp.drop_duplicates(
    subset=["fighter2_last_name", "event_id"], keep=False
).reset_index(drop=True)

merged13 = pd.merge(
    fighter_event_df13_temp,
    tapology_bouts13_temp,
    how="left",
    on=["event_id", "fighter2_last_name"],
)
good_matches13 = (
    merged13.loc[merged13["tapology_bout_id"].notna()]
    .reset_index(drop=True)
    .drop(columns=["fighter2_last_name"])
    .astype({"fighter_1_id": int, "fighter_2_id": int})
)

# Pass 14: hand-entered matches, one record per bout.
manual_bout_matches = [
    {
        "fighter1": "Danaa Batgerel",
        "fighter2": "Heili Alateng",
        "event_id": 1733,
        "tapology_bout_id": "448800-ufc-on-espn-15-batgerel-danaa-vs-heili-alateng",
        "event_order": 491,
        "bout_order": 2,
        "fighter_1_id": 9468,
        "fighter_2_id": 9469,
    },
    {
        "fighter1": "Ning Guangyou",
        "fighter2": "Yang Jianping",
        "event_id": 835,
        "tapology_bout_id": (
            "146518-ufc-fight-night-48-guangyou-smasher-ning-vs-jianping-tiger-yang"
        ),
        "event_order": 286,
        "bout_order": 7,
        "fighter_1_id": 4699,
        "fighter_2_id": 4700,
    },
]
good_matches14 = pd.DataFrame(manual_bout_matches)

# Stack the results of every matching pass into one lookup table and make
# the two ordering columns plain integers.
match_passes = [
    good_matches1,
    good_matches2,
    good_matches3,
    good_matches4,
    good_matches5,
    good_matches6,
    good_matches7,
    good_matches8,
    good_matches9,
    good_matches10,
    good_matches11,
    good_matches12,
    good_matches13,
    good_matches14,
]
all_good_matches = pd.concat(match_passes, ignore_index=True).astype(
    {"bout_order": int, "event_order": int}
)

def _resolve_fighter_id(row):
    """Return the fighter id that a prop-bet row refers to.

    ``ref_fighter`` says which corner the bet is about: missing gives NaN,
    "fighter1" / "fighter2" give that corner's id, and any other value gives
    the sentinel string "PROBLEM".
    """
    corner = row["ref_fighter"]
    if pd.isna(corner):
        return np.nan
    if corner == "fighter1":
        return row["fighter_1_id"]
    if corner == "fighter2":
        return row["fighter_2_id"]
    return "PROBLEM"


# Attach the matched Tapology bout to each prop bet, collapse the two corner
# ids into a single fighter_id, then discard the helper columns.
bout_prop_odds = pd.merge(
    bout_prop_odds,
    all_good_matches,
    how="left",
    on=["event_id", "fighter1", "fighter2"],
)
bout_prop_odds["fighter_id"] = bout_prop_odds.apply(_resolve_fighter_id, axis=1)
bout_prop_odds = bout_prop_odds.drop(
    columns=["ref_fighter", "fighter_1_id", "fighter_2_id", "fighter1", "fighter2"]
)

# Reshape from one column per sportsbook to one row per (bet, sportsbook).
bout_bet_key_columns = [
    "tapology_bout_id",
    "event_id",
    "event_order",
    "bout_order",
    "fighter_id",
    "Bet",
    "is_not",
]
bout_betsite_columns = [
    "5Dimes",
    "BetDSI",
    "BookMaker",
    "SportBet",
    "Bet365",
    "Bovada",
    "William Hill",
    "Pinnacle",
    "SportsInt",
    "BetOnline",
    "Intertops",
]
bout_prop_odds = pd.melt(
    bout_prop_odds,
    id_vars=bout_bet_key_columns,
    value_vars=bout_betsite_columns,
    var_name="betsite",
    value_name="odds",
)

# Drop sportsbooks that posted no line, order the rows (missing fighter ids
# first), and remove the columns that were only needed for ordering.
bout_prop_odds = (
    bout_prop_odds.dropna(subset=["odds"])
    .sort_values(
        by=["event_order", "bout_order", "fighter_id", "Bet", "is_not", "betsite"],
        na_position="first",
    )
    .reset_index(drop=True)
    .drop(columns=["event_order", "bout_order"])
    .rename(columns={"Bet": "description"})
)

# Nullable integer for fighter_id (it may be missing); plain int for odds.
bout_prop_odds = bout_prop_odds.astype({"fighter_id": "Int64", "odds": int})

bout_prop_odds_path = os.path.join(
    clean_data_dir, "Best Fight Odds", "bout_proposition_odds2.csv"
)
bout_prop_odds.to_csv(bout_prop_odds_path, index=False)

## Event Proposition Odds

In [None]:
# Event-level props only; the per-fighter columns are not needed for them.
is_event_prop = bestfightodds_prop_odds["bet_type"] == "Event Prop"
event_prop_odds = bestfightodds_prop_odds.loc[is_event_prop].reset_index(drop=True)
event_prop_odds = event_prop_odds.drop(columns=["bet_type", "fighter1", "fighter2"])

# not bets
# An "Any other result" row is rewritten as the "(NOT) "-prefixed label of the
# row directly above it; the prefix is then moved into the is_not flag.
negated_previous_bet = "(NOT) " + event_prop_odds["Bet"].shift()
event_bet_labels = (
    event_prop_odds["Bet"].replace("Any other result", pd.NA).fillna(negated_previous_bet)
)
event_prop_odds["is_not"] = event_bet_labels.str.startswith("(NOT) ").astype(int)
event_prop_odds["Bet"] = event_bet_labels.str.replace(
    "(NOT) ", "", regex=False
).str.replace("½", ".5", regex=False)

# melt
# Every column other than the three id columns becomes a betsite/odds pair.
event_prop_odds = pd.melt(
    event_prop_odds,
    id_vars=["event_id", "Bet", "is_not"],
    var_name="betsite",
    value_name="odds",
)
event_prop_odds = event_prop_odds.dropna().reset_index(drop=True)
event_prop_odds["odds"] = event_prop_odds["odds"].astype(int)

# tapology events
# Only used to borrow Tapology's event_order for sorting the output.
tapology_events_path = os.path.join(clean_data_dir, "Tapology", "events.csv")
tapology_events = pd.read_csv(tapology_events_path)[
    ["bestfightodds_id", "event_order"]
]
tapology_events = tapology_events.dropna()
tapology_events = tapology_events.rename(columns={"bestfightodds_id": "event_id"})
tapology_events = tapology_events.reset_index(drop=True)

event_prop_odds = (
    pd.merge(event_prop_odds, tapology_events, how="left", on="event_id")
    .sort_values(by=["event_order", "Bet", "is_not", "betsite"])
    .reset_index(drop=True)
    .drop(columns=["event_order"])
    .rename(columns={"Bet": "description"})
)

event_prop_odds_path = os.path.join(
    clean_data_dir, "Best Fight Odds", "event_proposition_odds.csv"
)
event_prop_odds.to_csv(event_prop_odds_path, index=False)

## Moneyline Odds

Note: The Tapology data must already be cleaned before running this section, because it reads the cleaned Tapology `events.csv`, `bouts.csv`, and `fighters.csv` files.

In [None]:
# erroneous scrapes
bad_files = [
    "ordinarybet_datatest_UFC_9__Motor_City_Madness.csv",
    "ordinarybet_datatest_UFC_14__Showdown.csv",
    "ordinarybet_datatest_UFC_15__Collision_Course.csv",
    "ordinarybet_datatest_UFC_18__The_Road_to_the_Heavyweight_Title.csv",
    "ordinarybet_datatest_UFC_32__Showdown_in_the_Meadowlands.csv",
    "ordinarybet_datatest_UFC_35__Throwdown.csv",
    "ordinarybet_datatest_UFC_48__Payback.csv",
    "ordinarybet_datatest_UFC_53__Heavy_Hitters.csv",
    "ordinarybet_datatest_UFC_56__Full_Force.csv",
]

# Read every remaining CSV in the archive and tag its rows with the event
# name recovered from the file name ("__" stands for ": ", "_" for a space).
moneyline_zip_path = os.path.join(
    raw_data_dir, "Best Fight Odds", "straight_over_time.zip"
)
df_list = []
with zipfile.ZipFile(moneyline_zip_path) as z:
    for filename in z.namelist():
        if filename not in bad_files:
            with z.open(filename) as f:
                df = pd.read_csv(f)
            file_stem = filename.split(".csv")[0]
            event_name = file_stem.replace("ordinarybet_datatest_", "")
            event_name = event_name.replace("__", ": ").replace("_", " ")
            df["event_name"] = event_name
            df_list.append(df)

bestfightodds_moneyline_odds = pd.concat(df_list, ignore_index=True).drop(
    columns=["class"]
)

# replace urls
# Events whose url is overwritten, keyed by the event name as reconstructed
# from the file name (which is why "vs." shows up as "vs:" below).
moneyline_url_overrides = {
    "UFC 73: Stacked": "https://www.bestfightodds.com/events/ufc-73-stacked-1",
    "UFC 74: Respect": "https://www.bestfightodds.com/events/ufc-74-respect-7",
    "UFC 76: Knockout": "https://www.bestfightodds.com/events/ufc-76-knockout-12",
    "UFC 84: Ill Will": "https://www.bestfightodds.com/events/ufc-84-ill-will-47",
    "UFC 85: Bedlam": "https://www.bestfightodds.com/events/ufc-85-bedlam-46",
    "UFC 87: Seek and Destroy": (
        "https://www.bestfightodds.com/events/ufc-87-seek-and-destroy-57"
    ),
    "UFC 99: The Comeback": (
        "https://www.bestfightodds.com/events/ufc-99-the-comeback-136"
    ),
    "UFC 100": "https://www.bestfightodds.com/events/ufc-100-137",
    "UFC 101: Declaration": (
        "https://www.bestfightodds.com/events/ufc-101-declaration-145"
    ),
    "UFC 109: Relentless": (
        "https://www.bestfightodds.com/events/ufc-109-relentless-226"
    ),
    "UFC 112: Invincible": (
        "https://www.bestfightodds.com/events/ufc-112-invincible-245"
    ),
    "UFC 119: Mir vs: Cro Cop": (
        "https://www.bestfightodds.com/events/ufc-119-mir-vs-cro-cop-296"
    ),
}
for override_event, override_url in moneyline_url_overrides.items():
    bestfightodds_moneyline_odds.loc[
        bestfightodds_moneyline_odds["event_name"] == override_event, "url"
    ] = override_url

# drop event name and get event id
# The id is whatever follows the last "-" in the last path segment of the url.
moneyline_url_slug = bestfightodds_moneyline_odds["url"].str.split("/").str[-1]
moneyline_event_id = moneyline_url_slug.str.split("-").str[-1]
bestfightodds_moneyline_odds["event_id"] = moneyline_event_id.astype("Int64")
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.drop(
    columns=["event_name", "url"]
)

# drop cancelled fights
# One (event_id, fighter1, fighter2) entry per bout to remove.
cancelled_bouts = [
    (1444, "Al Iaquinta", "Paul Felder"),
    (1289, "Aspen Ladd", "Jessica Eye"),
    (1562, "Brian Kelleher", "Montel Jackson"),
    (1647, "Diego Ferreira", "Francisco Trinaldo"),
    (1713, "Giacomo Lemos", "Tanner Boser"),
    (1849, "Emily Whitmire", "Polyana Viana"),
    (1246, "Brett Johns", "Ian Entwistle"),
    (1680, "John Lineker", "Rob Font"),
    (1673, "Devin Clark", "Ivan Shtyrkov"),
    (1467, "Jessica Aguilar", "Jodie Esquibel"),
    (1478, "Davey Grant", "Manny Bermudez"),
    (1626, "Ian Heinisch", "Tom Breese"),
    (1423, "Nad Narimani", "Nasrat Haqparast"),
    (1687, "Jordan Griffin", "Vince Murdock"),
    (1637, "Jessica Penne", "Jodie Esquibel"),
    (1587, "Andrea Lee", "Jessica-Rose Clark"),
    (485, "Buddy Roberts", "Sean Loeffler"),
    (1647, "Melissa Gatto", "Talita Bernardo"),
]
for cancelled_event_id, cancelled_fighter1, cancelled_fighter2 in cancelled_bouts:
    is_cancelled_bout = (
        (bestfightodds_moneyline_odds["event_id"] == cancelled_event_id)
        & (bestfightodds_moneyline_odds["fighter1"] == cancelled_fighter1)
        & (bestfightodds_moneyline_odds["fighter2"] == cancelled_fighter2)
    )
    bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.loc[
        ~is_cancelled_bout
    ]
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.reset_index(drop=True)

# drop fighter 1 and fighter 2 columns
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.drop(
    columns=["fighter1", "fighter2"]
)

# rename William Hill
# The scraped label is abbreviated and contains a non-breaking space.
is_william_hill = bestfightodds_moneyline_odds["betsite"] == "William\xa0H."
bestfightodds_moneyline_odds["betsite"] = bestfightodds_moneyline_odds["betsite"].mask(
    is_william_hill, "William Hill"
)

# timestamp seconds
# Rename first, then divide by 1000 and truncate to an integer.
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.rename(
    columns={"dates": "timestamp"}
)
bestfightodds_moneyline_odds["timestamp"] = (
    bestfightodds_moneyline_odds["timestamp"] / 1000
).astype(int)

# Attach the Tapology event id and event order to every odds row.
tapology_events = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "events.csv"))[
    ["id", "bestfightodds_id", "event_order"]
]
tapology_events = tapology_events.rename(
    columns={"id": "tapology_event_id", "bestfightodds_id": "event_id"}
)
bestfightodds_moneyline_odds = bestfightodds_moneyline_odds.merge(
    tapology_events, on="event_id", how="left"
)

# Build a long table with one row per (Tapology event, fighter) for the
# events that appear in the odds data.
tapology_bouts = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "bouts.csv"))[
    ["event_id", "fighter_1_id", "fighter_2_id"]
]
tapology_bouts = tapology_bouts.loc[
    tapology_bouts["event_id"].isin(bestfightodds_moneyline_odds["tapology_event_id"])
]
stacked_fighters = tapology_bouts.melt(
    id_vars="event_id",
    value_vars=["fighter_1_id", "fighter_2_id"],
    value_name="fighter_id",
).drop(columns="variable")
tapology_fighters = pd.read_csv(
    os.path.join(clean_data_dir, "Tapology", "fighters.csv")
)[["id", "bestfightodds_id", "name"]]
# Swap the Tapology fighter id for the fighter's bestfightodds id and expose
# the fighter's name under "Bet", the column the odds rows are matched on.
stacked_fighters = (
    stacked_fighters.merge(
        tapology_fighters, left_on="fighter_id", right_on="id", how="left"
    )
    .drop(columns=["id", "fighter_id"])
    .rename(
        columns={
            "bestfightodds_id": "fighter_id",
            "event_id": "tapology_event_id",
            "name": "Bet",
        }
    )
)
# Lower-case the names and strip accents by decomposing to NFKD and dropping
# every non-ASCII code point.
stacked_fighters["Bet"] = (
    stacked_fighters["Bet"]
    .str.lower()
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
)
# regex=False: "jr." is a literal suffix. Without it, pandas versions that
# default to regex=True read the dot as "any character" and would also
# rewrite unrelated substrings such as "jr " or "jra".
stacked_fighters["Bet"] = stacked_fighters["Bet"].str.replace(
    "jr.", "junior", regex=False
)
stacked_fighters["fighter_id"] = stacked_fighters["fighter_id"].astype("Int64")
bestfightodds_moneyline_odds["Bet"] = bestfightodds_moneyline_odds["Bet"].str.lower()

# replace duplicate names
# Canonical spelling -> alternative spellings found in the odds data.
bet_name_aliases = {
    "paulo costa": ["paulo borrachinha", "paulo henrique costa"],
    "rodrigo de lima": ["rodrigo lima", "rodrigo goiana de lima"],
    "tj waldburger": ["t.j. waldburger", "anthony waldburger"],
    "luis henrique da silva": ["luiz henrique da silva", "henrique da silva"],
}
for canonical_bet_name, alias_spellings in bet_name_aliases.items():
    bestfightodds_moneyline_odds.loc[
        bestfightodds_moneyline_odds["Bet"].isin(alias_spellings), "Bet"
    ] = canonical_bet_name

# Pass 1: exact match on (Tapology event, normalised fighter name).
bestfightodds_moneyline_odds2 = pd.merge(
    bestfightodds_moneyline_odds,
    stacked_fighters,
    how="left",
    on=["tapology_event_id", "Bet"],
)
resolved_in_pass1 = bestfightodds_moneyline_odds2["fighter_id"].notna()
good_matches = bestfightodds_moneyline_odds2.loc[resolved_in_pass1]

# Unresolved rows lose the empty fighter_id so a later merge can supply it.
bestfightodds_moneyline_odds3 = bestfightodds_moneyline_odds2.loc[
    ~resolved_in_pass1
].drop(columns=["fighter_id"])

# Pass 2: the odds name may be written "last first ...". Only fighters whose
# (fighter_id, tapology_event_id) pair is not in good_matches are candidates.
matched_tuples = set(zip(good_matches["fighter_id"], good_matches["tapology_event_id"]))
fighter_event_pairs2 = pd.Series(
    list(zip(stacked_fighters["fighter_id"], stacked_fighters["tapology_event_id"])),
    index=stacked_fighters.index,
)
stacked_fighters2 = stacked_fighters.loc[
    ~fighter_event_pairs2.isin(matched_tuples)
].copy()

# Rotate each Tapology name so that its last token comes first.
name_tokens2 = stacked_fighters2["Bet"].str.split(" ")
stacked_fighters2["last_name_first"] = (
    name_tokens2.str[-1] + " " + name_tokens2.str[:-1].str.join(" ")
)
# A key that occurs more than once within an event is ambiguous: drop it.
stacked_fighters2 = stacked_fighters2.drop_duplicates(
    subset=["tapology_event_id", "last_name_first"], keep=False
).drop(columns="Bet")

# On the odds side the name is used as-is for the comparison.
bestfightodds_moneyline_odds3["last_name_first"] = bestfightodds_moneyline_odds3[
    "Bet"
].copy()
bestfightodds_moneyline_odds3 = pd.merge(
    bestfightodds_moneyline_odds3,
    stacked_fighters2,
    how="left",
    on=["tapology_event_id", "last_name_first"],
)
resolved_in_pass2 = bestfightodds_moneyline_odds3["fighter_id"].notna()
good_matches2 = bestfightodds_moneyline_odds3.loc[resolved_in_pass2].drop(
    columns=["last_name_first"]
)

# Pass 3: match the remaining odds rows on (event, first token of the name).
unresolved_after_pass2 = bestfightodds_moneyline_odds3["fighter_id"].isna()
bestfightodds_moneyline_odds4 = bestfightodds_moneyline_odds3.loc[
    unresolved_after_pass2
].drop(columns=["fighter_id", "last_name_first"])

# Fighters already claimed by an earlier pass are no longer candidates.
matched_tuples2 = matched_tuples | set(
    zip(good_matches2["fighter_id"], good_matches2["tapology_event_id"])
)
fighter_event_pairs3 = pd.Series(
    list(zip(stacked_fighters["fighter_id"], stacked_fighters["tapology_event_id"])),
    index=stacked_fighters.index,
)
stacked_fighters3 = stacked_fighters.loc[
    ~fighter_event_pairs3.isin(matched_tuples2)
].copy()

bestfightodds_moneyline_odds4["first_name"] = (
    bestfightodds_moneyline_odds4["Bet"].str.split(" ").str[0]
)
stacked_fighters3["first_name"] = stacked_fighters3["Bet"].str.split(" ").str[0]
# A first name shared by several candidates in one event is ambiguous.
stacked_fighters3 = stacked_fighters3.drop_duplicates(
    subset=["tapology_event_id", "first_name"], keep=False
).drop(columns="Bet")

bestfightodds_moneyline_odds4 = pd.merge(
    bestfightodds_moneyline_odds4,
    stacked_fighters3,
    how="left",
    on=["tapology_event_id", "first_name"],
)
resolved_in_pass3 = bestfightodds_moneyline_odds4["fighter_id"].notna()
good_matches3 = bestfightodds_moneyline_odds4.loc[resolved_in_pass3].drop(
    columns=["first_name"]
)

# Pass 4: match the remaining odds rows on (event, last token of the name).
unresolved_after_pass3 = bestfightodds_moneyline_odds4["fighter_id"].isna()
bestfightodds_moneyline_odds5 = bestfightodds_moneyline_odds4.loc[
    unresolved_after_pass3
].drop(columns=["fighter_id", "first_name"])

# Fighters already claimed by an earlier pass are no longer candidates.
matched_tuples3 = matched_tuples2 | set(
    zip(good_matches3["fighter_id"], good_matches3["tapology_event_id"])
)
fighter_event_pairs4 = pd.Series(
    list(zip(stacked_fighters["fighter_id"], stacked_fighters["tapology_event_id"])),
    index=stacked_fighters.index,
)
stacked_fighters4 = stacked_fighters.loc[
    ~fighter_event_pairs4.isin(matched_tuples3)
].copy()

stacked_fighters4["last_name"] = stacked_fighters4["Bet"].str.split(" ").str[-1]
# A last name shared by several candidates in one event is ambiguous.
stacked_fighters4 = stacked_fighters4.drop_duplicates(
    subset=["tapology_event_id", "last_name"], keep=False
).drop(columns="Bet")

bestfightodds_moneyline_odds5["last_name"] = (
    bestfightodds_moneyline_odds5["Bet"].str.split(" ").str[-1]
)
bestfightodds_moneyline_odds5 = pd.merge(
    bestfightodds_moneyline_odds5,
    stacked_fighters4,
    how="left",
    on=["tapology_event_id", "last_name"],
)
resolved_in_pass4 = bestfightodds_moneyline_odds5["fighter_id"].notna()
good_matches4 = bestfightodds_moneyline_odds5.loc[resolved_in_pass4].drop(
    columns=["last_name"]
)

# Pass 5: hand-maintained spellings for whatever is still unmatched.
unresolved_after_pass4 = bestfightodds_moneyline_odds5["fighter_id"].isna()
bestfightodds_moneyline_odds6 = bestfightodds_moneyline_odds5.loc[
    unresolved_after_pass4
].drop(columns=["fighter_id", "last_name"])

# Fighters already claimed by an earlier pass are no longer candidates.
matched_tuples4 = matched_tuples3 | set(
    zip(good_matches4["fighter_id"], good_matches4["tapology_event_id"])
)
fighter_event_pairs5 = pd.Series(
    list(zip(stacked_fighters["fighter_id"], stacked_fighters["tapology_event_id"])),
    index=stacked_fighters.index,
)
stacked_fighters5 = stacked_fighters.loc[
    ~fighter_event_pairs5.isin(matched_tuples4)
].copy()

# manually assign
# Normalised Tapology name -> spelling used in the odds data.
manual_name_alternatives = {
    "zhumabek tursyn": "jumabieke tuerxun",
    "joe duffy": "joseph duffey",
    "cris cyborg": "cristiane justino",
    "abdul razak alhassan": "razak al-hassan",
    "wulijiburen": "wuliji buren",
    "serghei spivac": "sergey spivak",
    "sumudaerji": "su mudaerji",
    "alatengheili": "heili alateng",
    "leonardo guimaraes": "leonardo augusto leleco",
    "aleksei oleinik": "oleksiy oliynyk",
    "seung woo choi": "seungwoo choi",
    "doo ho choi": "dooho choi",
    "dmitry smolyakov": "dmitrii smoliakov",
    "zubaira tukhugov": "zubair tuhugov",
    "luis henrique barbosa": "luiz henrique",
}
for tapology_spelling, odds_spelling in manual_name_alternatives.items():
    stacked_fighters5.loc[
        stacked_fighters5["Bet"] == tapology_spelling, "name_alt"
    ] = odds_spelling

stacked_fighters5 = stacked_fighters5.drop(columns="Bet")

# The odds-side name is compared against the alternative spelling as-is.
bestfightodds_moneyline_odds6["name_alt"] = bestfightodds_moneyline_odds6["Bet"].copy()
bestfightodds_moneyline_odds6 = pd.merge(
    bestfightodds_moneyline_odds6,
    stacked_fighters5,
    how="left",
    on=["tapology_event_id", "name_alt"],
)
resolved_in_pass5 = bestfightodds_moneyline_odds6["fighter_id"].notna()
good_matches5 = bestfightodds_moneyline_odds6.loc[resolved_in_pass5].drop(
    columns=["name_alt"]
)

# Rows that no pass could resolve.
bestfightodds_moneyline_odds7 = bestfightodds_moneyline_odds6.loc[
    ~resolved_in_pass5
].copy()

# Combine the odds rows resolved by all five passes.
moneyline_match_passes = [
    good_matches,
    good_matches2,
    good_matches3,
    good_matches4,
    good_matches5,
]
all_matches = pd.concat(moneyline_match_passes, axis=0, ignore_index=True)

# Build a (bestfightodds fighter id, bestfightodds event id) -> bout lookup
# from the Tapology tables; it supplies bout_order for sorting.
tapology_dir = os.path.join(clean_data_dir, "Tapology")
tapology_bouts2 = pd.read_csv(os.path.join(tapology_dir, "bouts.csv"))[
    ["id", "event_id", "fighter_1_id", "fighter_2_id", "bout_order"]
]
stacked = (
    tapology_bouts2.melt(
        id_vars=["id", "event_id", "bout_order"],
        value_vars=["fighter_1_id", "fighter_2_id"],
        value_name="tapology_fighter_id",
    )
    .drop(columns="variable")
    .rename(columns={"id": "tapology_bout_id", "event_id": "tapology_event_id"})
)
tapology_fighters2 = pd.read_csv(os.path.join(tapology_dir, "fighters.csv"))[
    ["id", "bestfightodds_id"]
]
tapology_fighters2 = tapology_fighters2.rename(
    columns={"id": "tapology_fighter_id", "bestfightodds_id": "fighter_id"}
)
tapology_events2 = pd.read_csv(os.path.join(tapology_dir, "events.csv"))[
    ["id", "bestfightodds_id"]
]
tapology_events2 = tapology_events2.rename(
    columns={"id": "tapology_event_id", "bestfightodds_id": "event_id"}
)
stacked = (
    stacked.merge(tapology_fighters2, on="tapology_fighter_id", how="inner")
    .merge(tapology_events2, on="tapology_event_id", how="inner")
    .dropna(subset=["fighter_id", "event_id"])
    .reset_index(drop=True)
    .drop(columns=["tapology_fighter_id", "tapology_event_id"])
    .astype({"fighter_id": int, "event_id": int})
)

# Order the odds rows, keep only the published columns, and write them out.
all_matches = (
    all_matches.merge(stacked, on=["fighter_id", "event_id"], how="left")
    .sort_values(
        by=["event_order", "bout_order", "fighter_id", "betsite", "timestamp"]
    )
    .reset_index(drop=True)
)
all_matches = all_matches[["event_id", "fighter_id", "betsite", "timestamp", "odds"]]
moneyline_odds_path = os.path.join(
    clean_data_dir, "Best Fight Odds", "moneyline_odds.csv"
)
all_matches.to_csv(moneyline_odds_path, index=False)

## Events

In [None]:
# Reload the raw closing odds so this section does not depend on what earlier
# cells did to the frame.
bestfightodds_prop_odds = pd.read_csv(
    os.path.join(raw_data_dir, "Best Fight Odds", "closing_with_props.csv"),
    parse_dates=["Card_Date"],
)

# drop bad events
bad_events = [
    "UFC 9: Motor City Madness",
    "UFC 14: Showdown",
    "UFC 15: Collision Course",
    "UFC 18: The Road to the Heavyweight Title",
    "UFC 32: Showdown in the Meadowlands",
    "UFC 35: Throwdown",
    "UFC 48: Payback",
    "UFC 53: Heavy Hitters",
    "UFC 56: Full Force",
]
# .copy() turns the filtered selection into an independent frame. The
# assignments below (and any later column assignment) would otherwise write
# into a slice of the CSV frame and trigger pandas' SettingWithCopyWarning.
bestfightodds_prop_odds = bestfightodds_prop_odds.loc[
    ~bestfightodds_prop_odds["Event"].isin(bad_events)
].copy()

# fix event urls
# Event name -> url to store for that event.
event_url_overrides = {
    "UFC 73: Stacked": "https://www.bestfightodds.com/events/ufc-73-stacked-1",
    "UFC 74: Respect": "https://www.bestfightodds.com/events/ufc-74-respect-7",
    "UFC 76: Knockout": "https://www.bestfightodds.com/events/ufc-76-knockout-12",
    "UFC 84: Ill Will": "https://www.bestfightodds.com/events/ufc-84-ill-will-47",
    "UFC 85: Bedlam": "https://www.bestfightodds.com/events/ufc-85-bedlam-46",
    "UFC 87: Seek and Destroy": (
        "https://www.bestfightodds.com/events/ufc-87-seek-and-destroy-57"
    ),
    "UFC 99: The Comeback": (
        "https://www.bestfightodds.com/events/ufc-99-the-comeback-136"
    ),
    "UFC 100": "https://www.bestfightodds.com/events/ufc-100-137",
    "UFC 101: Declaration": (
        "https://www.bestfightodds.com/events/ufc-101-declaration-145"
    ),
    "UFC 109: Relentless": (
        "https://www.bestfightodds.com/events/ufc-109-relentless-226"
    ),
    "UFC 112: Invincible": (
        "https://www.bestfightodds.com/events/ufc-112-invincible-245"
    ),
    "UFC 119: Mir vs. Cro Cop": (
        "https://www.bestfightodds.com/events/ufc-119-mir-vs-cro-cop-296"
    ),
}
for override_event, override_url in event_url_overrides.items():
    bestfightodds_prop_odds.loc[
        bestfightodds_prop_odds["Event"] == override_event, "url"
    ] = override_url

# The event id is whatever follows the last "-" in the last path segment of
# the event url.
event_url_slug = bestfightodds_prop_odds["url"].str.split("/").str[-1]
bestfightodds_prop_odds["event_id"] = (
    event_url_slug.str.split("-").str[-1].astype(int)
)
bestfightodds_prop_odds = bestfightodds_prop_odds.rename(
    columns={"William_H": "William Hill"}
)

# One row per distinct (id, name) pair, ordered by id.
df = bestfightodds_prop_odds[["event_id", "Event"]].rename(
    columns={"event_id": "id", "Event": "name"}
)
df = df.drop_duplicates().sort_values("id").reset_index(drop=True)
events_csv_path = os.path.join(clean_data_dir, "Best Fight Odds", "events.csv")
df.to_csv(events_csv_path, index=False)

## Fighters

Note: The Tapology data must already be cleaned before running this section, because it reads the cleaned Tapology `fighters.csv` file.

In [None]:
# The fighters table is derived from the cleaned Tapology fighters: keep the
# rows that carry a bestfightodds id and publish that id as "id".
tapology_fighters_path = os.path.join(clean_data_dir, "Tapology", "fighters.csv")
bestfightodds_fighters = pd.read_csv(tapology_fighters_path)[
    ["bestfightodds_id", "name", "nickname"]
]
bestfightodds_fighters = (
    bestfightodds_fighters.rename(columns={"bestfightodds_id": "id"})
    .dropna(subset=["id"])
    .astype({"id": int})
    .sort_values("id")
    .reset_index(drop=True)
)
fighters_csv_path = os.path.join(clean_data_dir, "Best Fight Odds", "fighters.csv")
bestfightodds_fighters.to_csv(fighters_csv_path, index=False)