# Data Cleaning - Bet MMA

In [2]:
# standard library imports
import os

# third party imports
import numpy as np
import pandas as pd

# local imports


# NOTE: "__file__" is a quoted string literal, not the module attribute, so
# dirname() returns "" and every path below is resolved relative to the
# current working directory.
notebook_dir = os.path.dirname("__file__")
data_dir = os.path.join(notebook_dir, "..", "..", "data")

# Raw scrapes are read from <data>/raw; cleaned tables are written to <data>/clean.
raw_data_dir, clean_data_dir = (
    os.path.join(data_dir, stage) for stage in ("raw", "clean")
)

In [5]:
# Duplicate fighter: one fighter appears under two ids in the raw data.
# Key = id that gets replaced, value = id that is kept.
gsp_map = {
    848: 1280,
}

## Bouts

In [6]:
bouts_raw_path = os.path.join(raw_data_dir, "Bet MMA", "bouts.csv")
bouts_clean_path = os.path.join(clean_data_dir, "Bet MMA", "bouts.csv")

betmma_bouts = pd.read_csv(bouts_raw_path)

# Replace duplicate: remap the duplicate fighter id in both fighter columns
for fighter_col in ("fighter_1_id", "fighter_2_id"):
    betmma_bouts[fighter_col] = betmma_bouts[fighter_col].replace(gsp_map)

betmma_bouts.to_csv(bouts_clean_path, index=False)

## Events

In [7]:
betmma_events = pd.read_csv(os.path.join(raw_data_dir, "Bet MMA", "events.csv"))

# Get relative order of UFC events only, offset by 233 since history starts later.
# Rows flagged as UFC events get a running 1-based count (in file row order);
# every other row is left missing.
is_ufc_row = betmma_events["is_ufc_event"] == 1
ufc_running_count = is_ufc_row.cumsum().where(is_ufc_row)

# Nullable Int64 keeps the order integral while still allowing missing values
betmma_events["event_order"] = (233 + ufc_running_count).astype("Int64")

event_columns = ["id", "name", "date", "location", "is_ufc_event", "event_order"]
betmma_events = betmma_events[event_columns]

betmma_events.to_csv(os.path.join(clean_data_dir, "Bet MMA", "events.csv"), index=False)

## Fighter Histories

In [9]:
betmma_fighter_histories = pd.read_csv(
    os.path.join(raw_data_dir, "Bet MMA", "fighter_histories.csv")
)

# Replace duplicate fighter
betmma_fighter_histories["fighter_id"] = betmma_fighter_histories[
    "fighter_id"
].replace(gsp_map)

# Fix order for duplicate fighter: after the id merge, bout 2508 of fighter 1280
# is pinned to position 2 of that fighter's history
is_merged_bout = betmma_fighter_histories["fighter_id"].eq(
    1280
) & betmma_fighter_histories["bout_id"].eq(2508)
betmma_fighter_histories.loc[is_merged_bout, "order"] = 2

# Resort so each fighter's rows are contiguous and in history order
betmma_fighter_histories = betmma_fighter_histories.sort_values(
    by=["fighter_id", "order"]
).reset_index(drop=True)

# Map outcomes to letter symbols (any outcome not listed here becomes missing)
outcome_symbols = {"Won": "W", "Lost": "L", "Draw": "D"}
betmma_fighter_histories["outcome"] = betmma_fighter_histories["outcome"].map(
    outcome_symbols
)

# End round and timing clean up/conversions:
# a round of 0 and a time of "0:00" are both treated as missing
raw_end_round = betmma_fighter_histories["end_round"]
betmma_fighter_histories["end_round"] = raw_end_round.mask(raw_end_round == 0).astype(
    "Int64"
)

raw_end_clock = betmma_fighter_histories["end_round_time"]
betmma_fighter_histories["end_round_time"] = raw_end_clock.mask(
    raw_end_clock == "0:00"
)


def convert_time(time_str):
    """Convert a "minutes:seconds" clock string into a number of seconds.

    Missing input (anything ``pd.isna`` reports as missing) is passed through
    as ``np.nan``. A string that does not split into exactly two integer
    fields on ":" raises ``ValueError``.
    """
    if not pd.isna(time_str):
        minute_field, second_field = time_str.split(":")
        return int(minute_field) * 60 + int(second_field)

    return np.nan


# Seconds elapsed within the final round; nullable Int64 so that bouts with
# no recorded time stay missing instead of forcing a float column.
betmma_fighter_histories["end_round_time_seconds"] = (
    betmma_fighter_histories["end_round_time"].apply(convert_time).astype("Int64")
)


def total_seconds(row, round_seconds=300):
    """Return the total elapsed bout time in seconds for one history row.

    Every round before the final one is counted as a full ``round_seconds``
    and the time elapsed in the final round is added on top.

    Args:
        row: Mapping-like row (e.g. a ``pd.Series`` passed by
            ``DataFrame.apply(..., axis=1)``) providing ``"end_round"`` and
            ``"end_round_time_seconds"``.
        round_seconds: Length of one completed round in seconds. Defaults to
            300, the value that was previously hard-coded.

    Returns:
        The total time in seconds, or ``np.nan`` when either input is missing.
    """
    final_round = row["end_round"]
    seconds_into_round = row["end_round_time_seconds"]

    if pd.isna(seconds_into_round) or pd.isna(final_round):
        return np.nan

    # Rounds are 1-based, so (final_round - 1) rounds were fully completed.
    return (final_round - 1) * round_seconds + seconds_into_round


# Total bout duration, computed row by row and stored as nullable Int64
betmma_fighter_histories["total_time_seconds"] = betmma_fighter_histories.apply(
    total_seconds, axis=1
).astype("Int64")

# Convert to Int64
betmma_fighter_histories = betmma_fighter_histories.astype({"odds": "Int64"})

# Select columns (this also fixes the column order of the exported file)
history_columns = [
    "fighter_id",
    "order",
    "bout_id",
    "opponent_id",
    "outcome",
    "outcome_method",
    "end_round",
    "end_round_time_seconds",
    "total_time_seconds",
    "odds",
]
betmma_fighter_histories = betmma_fighter_histories[history_columns]

histories_clean_path = os.path.join(
    clean_data_dir, "Bet MMA", "fighter_histories.csv"
)
betmma_fighter_histories.to_csv(histories_clean_path, index=False)

## Fighters

In [34]:
betmma_fighters = pd.read_csv(os.path.join(raw_data_dir, "Bet MMA", "fighters.csv"))
# Nullable Int64 so fighters without a Sherdog id keep an integer column
betmma_fighters["sherdog_id"] = betmma_fighters["sherdog_id"].astype("Int64")
betmma_fighters = betmma_fighters.rename(columns={"reach": "reach_inches"})

# Ignore duplicate fighter: drop the rows whose id is a key of gsp_map (the ids
# that get remapped elsewhere). The explicit .copy() makes the filtered frame
# independent, so the column assignments that follow do not raise
# SettingWithCopyWarning or write into an ambiguous view.
betmma_fighters = betmma_fighters.loc[
    ~betmma_fighters["id"].isin(list(gsp_map))
].copy()


# Height conversion to inches
def convert_height(height_str):
    """Convert a feet-and-inches height string (e.g. ``5'11"``) to inches.

    Args:
        height_str: Height written as ``<feet>'<inches>"``; the inches part
            may be fractional or absent (``6'`` is read as 6 ft 0 in).
            Missing values are passed through.

    Returns:
        Height in inches as a float, or ``np.nan`` for missing input.

    Raises:
        ValueError: If the string has no ``'`` feet marker or a part is not
            numeric.
    """
    if pd.isna(height_str):
        return np.nan

    feet_part, feet_marker, inches_part = height_str.replace('"', "").partition("'")
    if not feet_marker:
        # Without a feet marker the value is ambiguous; fail loudly rather
        # than silently interpreting the whole number as feet.
        raise ValueError(f"Unrecognised height format: {height_str!r}")

    # An empty inches part (e.g. "6'") previously crashed on float("");
    # treat it as zero inches instead.
    inches_part = inches_part.strip()
    inches = float(inches_part) if inches_part else 0.0

    return float(feet_part) * 12 + inches


betmma_fighters["height_inches"] = betmma_fighters["height"].apply(convert_height)

# Fix nationalities
# A bare "-" is treated as a missing nationality.
betmma_fighters.loc[betmma_fighters["nationality"] == "-", "nationality"] = np.nan
# Match on the suffix only and write the correctly encoded country name.
# The replacement literal was previously mojibake ("C" + "\u00c3\u00b4" + "te"),
# which wrote a corrupted name into the clean data. The \u00f4 escape keeps
# this line safe even if the file itself is re-encoded.
betmma_fighters.loc[
    betmma_fighters["nationality"].str.endswith("d'Ivoire", na=False), "nationality"
] = "C\u00f4te d'Ivoire"
# The raw values carry an HTML-escaped ampersand; spell the names out instead.
betmma_fighters.loc[
    betmma_fighters["nationality"] == "Trinidad &amp; Tobago", "nationality"
] = "Trinidad and Tobago"
betmma_fighters.loc[
    betmma_fighters["nationality"] == "Bosnia &amp; Herzegovina", "nationality"
] = "Bosnia and Herzegovina"

# Select columns
betmma_fighters = betmma_fighters[
    [
        "id",
        "ufcstats_id",
        "sherdog_id",
        "name",
        "height_inches",
        "reach_inches",
        "stance",
        "nationality",
    ]
]

# Erroneous reach stats: anything above 100 inches is discarded as invalid
betmma_fighters.loc[betmma_fighters["reach_inches"] > 100, "reach_inches"] = np.nan

# Fix up all wrong Sherdog IDs and UFC Stats IDs since scrape logic was poorly constructed
# Manual overrides: Bet MMA fighter id -> corrected UFC Stats id
ufcstats_fixes = {
    96: "85d0fafb8b75d634",
    219: "b6452706b373eea1",
    222: "33a331684283900f",
    330: "6d1bffff14897645",
    331: "0d8011111be000b2",
    356: "08af939f41b5a57b",
    384: "f53d4f21d1b5f2dc",
    399: "99df7d0a2a08a8a8",
    414: "4fcf6e0c4e0e6664",
    475: "275aca31f61ba28c",
    569: "a67d071163962af8",
    956: "38c626ca912c7bac",
    1009: "b08012bbe542592a",
    1189: "2f181c0467965b98",
    1230: "d945aae53e3e54e6",
    1284: "99bd51917728c25d",
    1370: "c03520b5c88ed6b4",
    1373: "7447e9f28508106a",
    1391: "1338e2c7480bdf9e",
    1444: "5d1b7e3dd9e11074",
    1476: "3a46b268013afede",
    1483: "5442f1bc4b47eaf3",
    1496: "093e1f5bb73850be",
    1576: "1ebe20ebbfa15e29",
    1539: "b9f28e7045fdfce7",
    1633: "cc8c623cca88f54f",
    1637: "c21f26bbde777573",
    1656: "0d7b51c9d2649a6e",
    1682: "6e15f63b6c2e2c15",
    1691: "f14cf73e51b29254",
    1750: "009341ed974bad72",
    1756: "2cd428e9606856fd",
    1791: "9d83f6da776ff7d6",
    1792: "eae48ff31db420c2",
    1819: "e2f6b2769aaedd6c",
    1859: "beecb672a279223e",
    1894: "64facb6cc564262d",
    1962: "809bd1a871491508",
    1967: "5444c5a201d3ee5a",
    2009: "886264a0c9f4ea5e",
    2013: "2b6fc1c02736833d",
    2040: "351c4ec637380ad5",
    2121: "a54660deb6a8489c",
    2302: "f00ac08ab056af5d",
    2347: "681399317dbf4701",
    2467: "881bf86d4cba8578",
    2530: "9014c02eff8b3d62",
    2666: "764d39074a352e33",
    2723: "a53d30163304aa6e",
    2850: "d549cefc7c54ab78",
    2909: "745fa7b605f8e2da",
    2920: "b62d5280966b2460",
    3156: "41e83a89929d1327",
    3162: "a9e260472d321361",
    3388: "c739c2995a275314",
    3434: "cfc3e7bb44685289",
    3691: "b07aed698fba8624",
    3975: "06e4245d16fc5315",
    4009: "d8c811df0386d5e8",
    4051: "6747ccd6d1acd266",
    4083: "53c10176e3bc7416",
    4235: "4e6738062d469256",
    4236: "6c9b66b43663f2f7",
    4382: "ca28cdf526d6b6e9",
    5228: "b6c37948cb226e8c",
    5352: "18d01f7f8338ae72",
    5359: "6e743a33d56bdaa4",
    5924: "64d47ef881a437a4",
    5972: "ef5dcb10d2bd4b0f",
    5977: "23dec7c47cb418f8",
    5987: "06734ca9d88dec3a",
    7401: "7facc9c45d792985",
}
# Vectorised override instead of a row-wise apply: ids present in the mapping
# take the corrected value; every other row falls back to its scraped value
# (Series.map yields NaN for unmapped ids, which fillna then restores).
betmma_fighters["ufcstats_id"] = (
    betmma_fighters["id"].map(ufcstats_fixes).fillna(betmma_fighters["ufcstats_id"])
)

# Manual overrides: Bet MMA fighter id -> corrected Sherdog id
sherdog_fixes = {
    27: 25830,
    219: 61700,
    261: 42659,
    313: 20522,
    330: 64593,
    331: 30452,
    356: 56583,
    384: 26358,
    399: 25821,
    475: 76836,
    569: 91937,
    956: 56139,
    1009: 161107,
    1036: 155641,
    1087: 151701,
    1189: 201703,
    1204: 80671,
    1230: 130351,
    1284: 103525,
    1370: 157355,
    1373: 137171,
    1391: 56374,
    1444: 101189,
    1476: 108999,
    1483: 48039,
    1496: 86783,
    1538: 103961,
    1576: 186663,
    1539: 94587,
    1633: 223917,
    1637: 170203,
    1656: 146193,
    1682: 83851,
    1691: 184051,
    1750: 97529,
    1756: 102803,
    1791: 232601,
    1792: 213913,
    1819: 179095,
    1859: 171625,
    1894: 230239,
    1962: 232591,
    1967: 280417,
    2009: 81800,
    2013: 240893,
    2040: 115135,
    2121: 182325,
    2302: 149259,
    2347: 272505,
    2424: 238733,
    2461: 228381,
    2467: 217405,
    2497: 280041,
    2530: 84291,
    2666: 287525,
    2723: 85711,
    2850: 133677,
    2909: 300151,
    2920: 208467,
    3001: 173219,
    3156: 114657,
    3162: 311001,
    3262: 188721,
    3326: 303043,
    3388: 252115,
    3434: 187343,
    3652: 115453,
    3691: 202763,
    3975: 297989,
    4009: 386855,
    4051: 313721,
    4083: 135973,
    4093: 144007,
    4235: 288537,
    4236: 333829,
    4382: 274869,
    4651: 272659,
    5125: 263543,
    5228: 226661,
    5352: 139517,
    5359: 296553,
    5924: 217983,
    5972: 229253,
    5977: 100903,
    5987: 309711,
    6200: 319093,
    7401: 189203,
    7446: 362871,
    7525: 202195,
}
# Vectorised override instead of a row-wise apply. Series.map leaves NaN for
# ids without a fix (producing floats), so cast back to nullable Int64 before
# falling back to the scraped value; this keeps the column Int64 rather than
# degrading it to object dtype as the row-wise apply did.
betmma_fighters["sherdog_id"] = (
    betmma_fighters["id"]
    .map(sherdog_fixes)
    .astype("Int64")
    .fillna(betmma_fighters["sherdog_id"])
)

# Remove extra space: overwrite the stored name of fighter 5125 with the
# normalised spelling
has_name_typo = betmma_fighters["id"].eq(5125)
betmma_fighters.loc[has_name_typo, "name"] = "Marco Tulio Silva"

fighters_clean_path = os.path.join(clean_data_dir, "Bet MMA", "fighters.csv")
betmma_fighters.to_csv(fighters_clean_path, index=False)

## Late Replacements

No further cleaning needed

In [11]:
# Pass-through table: loaded from raw and re-written to clean via pandas
# with no transformations applied.
late_replacements_raw_path = os.path.join(
    raw_data_dir, "Bet MMA", "late_replacements.csv"
)
late_replacements_clean_path = os.path.join(
    clean_data_dir, "Bet MMA", "late_replacements.csv"
)

betmma_late_replacements = pd.read_csv(late_replacements_raw_path)
betmma_late_replacements.to_csv(late_replacements_clean_path, index=False)

## Missed Weights

No further cleaning needed

In [12]:
# Pass-through table: loaded from raw and re-written to clean via pandas
# with no transformations applied.
missed_weights_raw_path = os.path.join(raw_data_dir, "Bet MMA", "missed_weights.csv")
missed_weights_clean_path = os.path.join(
    clean_data_dir, "Bet MMA", "missed_weights.csv"
)

betmma_missed_weights = pd.read_csv(missed_weights_raw_path)
betmma_missed_weights.to_csv(missed_weights_clean_path, index=False)