# Data Cleaning - Sherdog

In [1]:
# standard library imports
import os
import re

# third party imports
import numpy as np
import pandas as pd

# local imports


# Data folders, resolved relative to the current working directory.
# The previous base, os.path.dirname("__file__"), took the dirname of a string
# literal (not of the real module path) and therefore always evaluated to "",
# so the path was already working-directory-relative; this spells that out.
data_dir = os.path.join(os.pardir, os.pardir, "data")
raw_data_dir = os.path.join(data_dir, "raw")
clean_data_dir = os.path.join(data_dir, "clean")

## Bouts

In [2]:
# Load the raw Sherdog bout table.
sherdog_bouts = pd.read_csv(os.path.join(raw_data_dir, "Sherdog", "bouts.csv"))

# "Int64" is pandas' nullable integer dtype, so missing ids survive the cast.
for id_column in ("fighter_1_id", "fighter_2_id"):
    sherdog_bouts[id_column] = sherdog_bouts[id_column].astype("Int64")

# Short outcome codes; "yet to come" (and any label not listed) becomes NaN.
outcome_map = {"win": "W", "loss": "L", "draw": "D", "nc": "NC", "yet to come": np.nan}
for outcome_column in ("fighter_1_outcome", "fighter_2_outcome"):
    sherdog_bouts[outcome_column] = sherdog_bouts[outcome_column].map(outcome_map)


def _named_weight_class(raw_label):
    """Return the named weight class for a raw label, or NaN if it has none.

    Missing labels stay NaN, anything mentioning "Catchweight" collapses to
    "Catchweight", any other label containing "lb" (a bare weight limit) has
    no class name and becomes NaN, and every other label is kept as-is.
    """
    if pd.isna(raw_label):
        return np.nan
    if "Catchweight" in raw_label:
        return "Catchweight"
    if "lb" in raw_label:
        return np.nan
    return raw_label


# Keep the raw label under a temporary name; it is parsed again further down
# to derive the numeric weight limit.
sherdog_bouts = sherdog_bouts.rename(columns={"weight_class": "weight_class_TEMP"})
sherdog_bouts["weight_class"] = sherdog_bouts["weight_class_TEMP"].apply(
    _named_weight_class
)

# "Pound for Pound" is not treated as a weight class.
is_pound_for_pound = sherdog_bouts["weight_class"] == "Pound for Pound"
sherdog_bouts.loc[is_pound_for_pound, "weight_class"] = np.nan


def extract_weight_lbs(weight_class_str):
    """Return the weight limit in pounds implied by a raw weight-class label.

    Parameters
    ----------
    weight_class_str : str or missing
        Either a named class (e.g. "Lightweight") or a label containing an
        explicit limit (e.g. "160 lb", "Catchweight 160 lb").

    Returns
    -------
    int or float
        The limit in pounds, or ``np.nan`` when the label is missing, names an
        unknown class, or contains "lb" without any readable number.
    """
    weight_map = {
        "Atomweight": 105,
        "Strawweight": 115,
        "Flyweight": 125,
        "Bantamweight": 135,
        "Featherweight": 145,
        "Lightweight": 155,
        "Welterweight": 170,
        "Middleweight": 185,
        "Light Heavyweight": 205,
        "Heavyweight": 265,
    }

    if pd.isna(weight_class_str):
        return np.nan

    if "lb" in weight_class_str:
        remainder = weight_class_str.replace("lb", "").replace("Catchweight", "")
        try:
            # Fast path: identical to the historical behaviour.
            return int(remainder)
        except ValueError:
            # Fallback for labels the strict parse rejects ("160 lbs",
            # "160.5 lb", extra words): take the whole-number part of the
            # value written directly in front of the unit.
            match = re.search(r"(\d+)(?:\.\d+)?\s*lb", weight_class_str)
            return int(match.group(1)) if match else np.nan

    return weight_map.get(weight_class_str, np.nan)


# Numeric weight limit parsed from the raw label, stored as nullable integers.
sherdog_bouts["weight_class_lbs"] = (
    sherdog_bouts["weight_class_TEMP"].apply(extract_weight_lbs).astype("Int64")
)


# Broad outcome category: the text in front of any parenthesised detail,
# with surrounding whitespace trimmed.
sherdog_bouts["outcome_method_broad"] = (
    sherdog_bouts["outcome_method"]
    .str.split(r"\(|\)", expand=True)
    .iloc[:, 0]
    .str.strip()
)

# Manually clean up edge cases
# An empty label carries no information, so record it as missing.
is_blank_method = sherdog_bouts["outcome_method_broad"] == ""
sherdog_bouts.loc[is_blank_method, "outcome_method_broad"] = np.nan

# Known spellings (typos, casing, translations, over-specific labels) of each
# broad category. Matching is exact, so every variant must be listed.
dqs = ["Disqualification", "Disqualifcation", "Desqualification", "DG", "DQ"]

ko_tkos = [
    "ТКО",  # looks like "TKO" but is typed with non-Latin letters
    "Tko",
    "TKP",
    "ΤΚΟ",  # another non-Latin look-alike of "TKO"
    "Corner Stoppage",
    "Corner's towel",
    "Doctor's Stoppage",
    "Injury",
    "Retirement",
    "K.O",
    "K.O.",
    "KO",
    "Ko",
    "Knockout",
    "ko",
    "KO/TKO",
    "KO/TKO PUNCHES",
    "TKO Punches",
    "TKO",
    "RETIREMENT",
    "Dcotor Stoppade",
    "Nocaute",
]

draws = ["DRAW", "Draw Unanimous", "Drew", "Majority Draw", "Technical Draw"]

decisions = [
    "Decision",
    "Decision Unanimous",
    "Decision unanimous",
    "Decisions",
    "Decison",
    "Decisão Unanime",
    "Desicion",
    "Desision",
    "Majority Decision",
    "Points",
    "Split Decision",
    "Split Division",
    "Split decision",
    "Technical Decision",
    "Unaminous Decision",
    "Unanimous",
    "Unanimous Decision",
    "Unanimous decision",
    "de",
    "Decisionn",
    "Deicision",
]

subs = [
    "Frontal Anaconda Choke",
    "Guillotine Choke",
    "Guillotine choke",
    "Kraken Choke",
    "Rear Naked Choke",
    "SUBMISSION",
    "Submision",
    "Submisison",
    "Submissio",
    "Submission",
    "Submissions",
    "Submisson",
    "Submssion",
    "Tapout",
    "Techinal Submission",
    "Techincal Submission",
    "Technial Submission",
    "Technical Submission",
    "Triangle Choke",
    "Verbal Submission",
    "su",
    "submison",
    "submission",
    "ubmission",
    "Bulldog Choke",
    "RNC",
    "Submissiom",
    "Submission-Guillotine Choke",
    "Submissoin",
    "sub",
]

ncs = [
    "N/C",
    "NC",
    "ND",
    "No Conest",
    "No Contest",
    "No Contest - Clements Failed Drug Test",
    "No Contest - Collard Failed Drug Test",
    "No Contest - Overturned by Commission",
    "No Contest - Overturned by NSAC",
    "No Contest - Overturned by WTKA",
    "No Contest - Result overturned by FIGMMA",
    "No Contest - Strikes After The Fight",
    "No Decision",
    "No Decision - Overturned by CSAC",
    "No Decision - Overturned by FMMAF",
    "No Decision - Zappitella Failed Drug Test",
    "Overtuned",
    "Overturned by Promoter",
    "Illegal Punch",
    "Unintentional Eye Poke",
]

# Rewrite every listed spelling to its canonical label, one category at a time.
for canonical_label, spellings in (
    ("Disqualification", dqs),
    ("KO/TKO", ko_tkos),
    ("Draw", draws),
    ("Decision", decisions),
    ("Submission", subs),
    ("No Contest", ncs),
):
    sherdog_bouts.loc[
        sherdog_bouts["outcome_method_broad"].isin(spellings), "outcome_method_broad"
    ] = canonical_label

# End round
# Hand-picked corrections for recorded round values: 0 becomes missing and the
# remaining entries are rewritten to the round number given here.
# NOTE(review): the replacement values look like manual fixes for specific
# mistyped records - confirm against the raw data if it is re-scraped.
end_round_fixes = {0: np.nan, 12: 1, 22: 2, 30: 3, 31: 1, 4355842: 3}
for recorded_round, corrected_round in end_round_fixes.items():
    sherdog_bouts.loc[
        sherdog_bouts["end_round"] == recorded_round, "end_round"
    ] = corrected_round
sherdog_bouts["end_round"] = sherdog_bouts["end_round"].astype("Int64")

# End round time
# Character-level repairs, applied strictly in this order (later steps and the
# exact-match tables below rely on the output of earlier ones). Each pair is
# (literal text to find, literal replacement).
end_round_time_char_fixes = (
    (";", ":"),
    ('"', ":"),
    (" :", ":"),
    (")", "0"),
    ("!", "1"),
    ("$", "4"),
    ("&", "7"),
    (".:", ":"),
    (".", ":"),
    ("_", ":"),
    ("?", ":"),
    ("L", ":"),
    ("d", ""),
    ("q", ""),
    ("r", ""),
    ("s", ""),
    ("'", ""),
    (",", ":"),
)
for found_text, replacement_text in end_round_time_char_fixes:
    sherdog_bouts["end_round_time"] = sherdog_bouts["end_round_time"].str.replace(
        found_text, replacement_text, regex=False
    )

# Values that are not times at all. They are listed in the form they have
# *after* the character repairs above.
nans = [
    ":evon Shab",
    "Aabek To",
    "Anew He",
    "BMISSION:",
    "Joeph Pe",
    "KO/TKO: PU",
    "KO/TKO: ST",
    "M/A",
    "Nick Kaaze",
    "SUBMISSION",
]
sherdog_bouts.loc[sherdog_bouts["end_round_time"].isin(nans), "end_round_time"] = np.nan

# One-off rewrites of specific malformed values to "M:SS", applied in order.
end_round_time_exact_fixes = {
    "2 m an 14": "2:14",
    "2 min 11e": "2:11",
    "2:50 min": "2:50",
    "2: 5": "2:05",
    "2:4u": "2:40",
    "KO/TKO 2:1": "2:10",
    "KO/TKO 2:5": "2:50",
    "KO/TKO 3:0": "3:00",
    "1:12:00 AM": "1:12",
    "4:40:00 AM": "4:40",
    "00:01:39": "1:39",
    "00:02:48": "2:48",
    "00:03:02": "3:02",
    "00:03:50": "3:50",
    "00:05:00": "5:00",
    "01:15:00": "1:15",
    "04:17:00": "4:17",
    "05:00:00": "5:00",
    "4:45:00": "4:45",
    "0::32": "0:32",
    "1::45": "1:45",
    "2:26:": "2:26",
    "2:30::": "2:30",
    "2::58": "2:58",
    "05": "5:00",
    "215": "2:15",
    "021": "0:21",
    "241": "2:41",
    "250": "2:50",
    "311": "3:11",
    "135": "1:35",
}
for malformed_time, fixed_time in end_round_time_exact_fixes.items():
    sherdog_bouts.loc[
        sherdog_bouts["end_round_time"] == malformed_time, "end_round_time"
    ] = fixed_time

# Remaining colon-less values are discarded as missing.
# ("135" can no longer occur here: it was rewritten to "1:35" just above.)
nans2 = [
    "1",
    "110087",
    "135",
    "2",
    "201641",
    "23",
    "3",
    "308073",
    "38",
    "383518",
    "429661",
    "431507",
    "467551",
    "5",
    "51",
]
sherdog_bouts.loc[sherdog_bouts["end_round_time"].isin(nans2), "end_round_time"] = (
    np.nan
)


# End round time seconds
def convert_to_seconds(time_str):
    """Convert an "M:SS" string to a number of seconds.

    Missing input yields ``np.nan``. An empty minutes part (":32") counts as
    zero minutes. The string must contain exactly one ":"; otherwise the
    unpacking raises ``ValueError``, as do non-numeric parts.
    """
    if pd.isna(time_str):
        return np.nan

    minute_part, second_part = time_str.split(":")
    whole_minutes = int(minute_part) if minute_part != "" else 0
    return 60 * whole_minutes + int(second_part)


# Seconds into the final round; a value of exactly 0 is stored as missing.
sherdog_bouts["end_round_time_seconds"] = (
    sherdog_bouts["end_round_time"]
    .apply(convert_to_seconds)
    .mask(lambda seconds: seconds == 0)
    .astype("Int64")
)


# Total time elapsed in seconds
def calculate_total_time_seconds(row, round_length_seconds=300):
    """Return the total fight time, in seconds, at the moment the bout ended.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``"end_round"`` and ``"end_round_time_seconds"``.
    round_length_seconds : int, default 300
        Duration credited to every round completed before the final one. The
        default keeps the previously hard-coded value of 300 seconds.

    Returns
    -------
    ``np.nan`` when either input is missing, otherwise
    ``(end_round - 1) * round_length_seconds + end_round_time_seconds``.
    """
    if pd.isna(row["end_round"]) or pd.isna(row["end_round_time_seconds"]):
        return np.nan

    # Rounds before the final one count in full; the final round contributes
    # only the time elapsed within it.
    completed_rounds = row["end_round"] - 1
    return completed_rounds * round_length_seconds + row["end_round_time_seconds"]


# Row-wise total elapsed time, stored as nullable integers.
sherdog_bouts["total_time_seconds"] = sherdog_bouts.apply(
    calculate_total_time_seconds, axis=1
).astype("Int64")

# Columns written to the cleaned file, in output order. The raw
# "end_round_time" string and the temporary weight-class column are left out.
bouts_output_columns = [
    "event_id",
    "bout_order",
    "fighter_1_id",
    "fighter_2_id",
    "fighter_1_outcome",
    "fighter_2_outcome",
    "is_title_bout",
    "weight_class",
    "weight_class_lbs",
    "outcome_method",
    "outcome_method_broad",
    "end_round",
    "end_round_time_seconds",
    "total_time_seconds",
]
sherdog_bouts = sherdog_bouts[bouts_output_columns]

bouts_output_path = os.path.join(clean_data_dir, "Sherdog", "bouts.csv")
sherdog_bouts.to_csv(bouts_output_path, index=False)

## Events

In [6]:
# Load the raw Sherdog event table, parsing the event date on read.
sherdog_events = pd.read_csv(
    os.path.join(raw_data_dir, "Sherdog", "events.csv"), parse_dates=["date"]
)

# String cleaning
# Every substring replacement goes through the ``.str`` accessor. Plain
# ``Series.replace`` only swaps cells whose *whole* value equals the target,
# so it cannot remove a newline or a doubled space inside a longer string.
sherdog_events["name"] = (
    sherdog_events["name"]
    .str.replace("\n", " ", regex=False)
    .str.replace(r" {2,}", " ", regex=True)  # collapse runs of spaces
    .str.strip()
)
sherdog_events["location"] = (
    sherdog_events["location"]
    .str.replace("\r", ",", regex=False)  # carriage return becomes a separator
    .str.replace("\n", " ", regex=False)
    .str.replace(r" {2,}", " ", regex=True)  # collapse runs of spaces
    .str.strip()
)
# Keep only the text in front of the first "\r\n" of the country field.
sherdog_events["country"] = (
    sherdog_events["country"].str.split("\r\n").str[0].str.strip()
)
# Nullable integer dtype so a missing event order survives the cast.
sherdog_events["event_order"] = sherdog_events["event_order"].astype("Int64")

# Correct wrong date
# Rows dated 2030-01-11 are re-dated to 2022-11-30.
has_wrong_date = sherdog_events["date"] == "2030-01-11"
sherdog_events.loc[has_wrong_date, "date"] = pd.to_datetime("2022-11-30")

# Sort chronologically (ties broken by event_order), renumber the index, then
# drop duplicate event IDs, keeping the last row of each id in that order.
sherdog_events = (
    sherdog_events.sort_values(by=["date", "event_order"])
    .reset_index(drop=True)
    .drop_duplicates(subset=["id"], keep="last")
)

events_output_path = os.path.join(clean_data_dir, "Sherdog", "events.csv")
sherdog_events.to_csv(events_output_path, index=False)

## Fighter Histories

In [4]:
# Load the raw per-fighter bout histories; opponent ids are cast to the
# nullable "Int64" dtype so missing opponents survive the cast.
sherdog_fighter_histories = pd.read_csv(
    os.path.join(raw_data_dir, "Sherdog", "fighter_histories.csv")
).astype({"opponent_id": "Int64"})

# Standardize outcome symbols (labels not listed here become NaN).
outcome_codes = {"win": "W", "loss": "L", "draw": "D", "nc": "NC"}
sherdog_fighter_histories["outcome"] = sherdog_fighter_histories["outcome"].map(
    outcome_codes
)

# Clean outcome methods
# Broad outcome category: the text in front of any parenthesised detail,
# with surrounding whitespace trimmed.
sherdog_fighter_histories["outcome_method_broad"] = (
    sherdog_fighter_histories["outcome_method"]
    .str.split(r"\(|\)", expand=True)
    .iloc[:, 0]
    .str.strip()
)

# Manually handle edge cases
# An empty label carries no information, so record it as missing.
sherdog_fighter_histories.loc[
    sherdog_fighter_histories["outcome_method_broad"] == "", "outcome_method_broad"
] = np.nan

# Known spellings of each broad category. These lists are kept identical to
# the ones used for the Bouts table, so the same raw label is normalised the
# same way in both outputs (the histories lists previously lacked the most
# recently added variants).
dqs = ["Disqualification", "Disqualifcation", "Desqualification", "DG", "DQ"]

ko_tkos = [
    "ТКО",  # looks like "TKO" but is typed with non-Latin letters
    "Tko",
    "TKP",
    "ΤΚΟ",  # another non-Latin look-alike of "TKO"
    "Corner Stoppage",
    "Corner's towel",
    "Doctor's Stoppage",
    "Injury",
    "Retirement",
    "K.O",
    "K.O.",
    "KO",
    "Ko",
    "Knockout",
    "ko",
    "KO/TKO",
    "KO/TKO PUNCHES",
    "TKO Punches",
    "TKO",
    "RETIREMENT",
    "Dcotor Stoppade",
    "Nocaute",
]

draws = ["DRAW", "Draw Unanimous", "Drew", "Majority Draw", "Technical Draw"]

decisions = [
    "Decision",
    "Decision Unanimous",
    "Decision unanimous",
    "Decisions",
    "Decison",
    "Decisão Unanime",
    "Desicion",
    "Desision",
    "Majority Decision",
    "Points",
    "Split Decision",
    "Split Division",
    "Split decision",
    "Technical Decision",
    "Unaminous Decision",
    "Unanimous",
    "Unanimous Decision",
    "Unanimous decision",
    "de",
    "Decisionn",
    "Deicision",
]

subs = [
    "Frontal Anaconda Choke",
    "Guillotine Choke",
    "Guillotine choke",
    "Kraken Choke",
    "Rear Naked Choke",
    "SUBMISSION",
    "Submision",
    "Submisison",
    "Submissio",
    "Submission",
    "Submissions",
    "Submisson",
    "Submssion",
    "Tapout",
    "Techinal Submission",
    "Techincal Submission",
    "Technial Submission",
    "Technical Submission",
    "Triangle Choke",
    "Verbal Submission",
    "su",
    "submison",
    "submission",
    "ubmission",
    "Bulldog Choke",
    "RNC",
    "Submissiom",
    "Submission-Guillotine Choke",
    "Submissoin",
    "sub",
]

ncs = [
    "N/C",
    "NC",
    "ND",
    "No Conest",
    "No Contest",
    "No Contest - Clements Failed Drug Test",
    "No Contest - Collard Failed Drug Test",
    "No Contest - Overturned by Commission",
    "No Contest - Overturned by NSAC",
    "No Contest - Overturned by WTKA",
    "No Contest - Result overturned by FIGMMA",
    "No Contest - Strikes After The Fight",
    "No Decision",
    "No Decision - Overturned by CSAC",
    "No Decision - Overturned by FMMAF",
    "No Decision - Zappitella Failed Drug Test",
    "Overtuned",
    "Overturned by Promoter",
    "Illegal Punch",
    "Unintentional Eye Poke",
]

# Rewrite every listed spelling to its canonical label, one category at a time.
for canonical_label, spellings in (
    ("Disqualification", dqs),
    ("KO/TKO", ko_tkos),
    ("Draw", draws),
    ("Decision", decisions),
    ("Submission", subs),
    ("No Contest", ncs),
):
    sherdog_fighter_histories.loc[
        sherdog_fighter_histories["outcome_method_broad"].isin(spellings),
        "outcome_method_broad",
    ] = canonical_label

# End round
# Hand-picked corrections for recorded round values: 0 becomes missing and the
# remaining entries are rewritten to the round number given here.
# NOTE(review): the Bouts cell additionally remaps 31 and 4355842 - verify
# whether those values can also occur in the histories data.
history_end_round_fixes = {0: np.nan, 12: 1, 22: 2, 30: 3}
for recorded_round, corrected_round in history_end_round_fixes.items():
    sherdog_fighter_histories.loc[
        sherdog_fighter_histories["end_round"] == recorded_round, "end_round"
    ] = corrected_round
sherdog_fighter_histories["end_round"] = sherdog_fighter_histories["end_round"].astype(
    "Int64"
)

# End round time
# The steps below are order-dependent: the exact-match tables expect the
# strings as they look after the first pass of character repairs, and the
# second pass only runs once those tables have been applied.

# Pass 1 - literal character repairs, in order: (text to find, replacement).
for found_text, replacement_text in (
    (";", ":"),
    ('"', ":"),
    (".", ":"),
    (",", ":"),
    ("?", ":"),
    ("L", ":"),
    ("_", ":"),
    (" min", ""),
):
    sherdog_fighter_histories["end_round_time"] = sherdog_fighter_histories[
        "end_round_time"
    ].str.replace(found_text, replacement_text, regex=False)

# Placeholder texts that are not times become missing.
for placeholder in ("M/A", "SUBMISSION"):
    sherdog_fighter_histories.loc[
        sherdog_fighter_histories["end_round_time"] == placeholder, "end_round_time"
    ] = np.nan

# A bare "1", "2", "3" or "5" gets ":00" appended.
is_bare_minutes = sherdog_fighter_histories["end_round_time"].isin(
    ["1", "2", "3", "5"]
)
sherdog_fighter_histories.loc[is_bare_minutes, "end_round_time"] = (
    sherdog_fighter_histories.loc[is_bare_minutes, "end_round_time"] + ":00"
)

# One-off rewrites of specific malformed values (NaN = discard), in order.
history_time_exact_fixes = {
    "05": "5:00",
    "38": "0:38",
    "51": "0:51",
    "021": "0:21",
    "215": "2:15",
    "311": "3:11",
    "201641": "5:00",
    "383518": np.nan,
    "429661": np.nan,
    "2:30::": "2:30",
    "2: 5": "2:05",
    "2:26:": "2:26",
    "01:15:00": "1:15",
    "00:03:50": "3:50",
    "05:00:00": "5:00",
    "00:02:48": "2:48",
    "00:05:00": "5:00",
    "04:17:00": "4:17",
    "00:01:39": "1:39",
    "4:45:00": "4:45",
    "1:12:00 AM": "1:12",
}
for malformed_time, fixed_time in history_time_exact_fixes.items():
    sherdog_fighter_histories.loc[
        sherdog_fighter_histories["end_round_time"] == malformed_time,
        "end_round_time",
    ] = fixed_time

# Pass 2 - remaining literal character repairs, in order.
for found_text, replacement_text in (
    ("!", "1"),
    ("$", "4"),
    ("&", "7"),
    ("q", ""),
    ("d", ""),
    ("'", ""),
    ("::", ":"),
    (" :", ":"),
):
    sherdog_fighter_histories["end_round_time"] = sherdog_fighter_histories[
        "end_round_time"
    ].str.replace(found_text, replacement_text, regex=False)

# End round time in seconds
def convert_to_seconds(time_str):
    """Convert an "M:SS" string to a number of seconds.

    Redefines the helper of the same name from the Bouts cell. Missing input
    yields ``np.nan``; an empty minutes part (":32") counts as zero minutes.
    The string must contain exactly one ":"; otherwise the unpacking raises
    ``ValueError``, as do non-numeric parts.
    """
    if pd.isna(time_str):
        return np.nan

    minute_part, second_part = time_str.split(":")
    whole_minutes = int(minute_part) if minute_part != "" else 0
    return 60 * whole_minutes + int(second_part)


# Seconds into the final round; a value of exactly 0 is stored as missing.
sherdog_fighter_histories["end_round_time_seconds"] = (
    sherdog_fighter_histories["end_round_time"]
    .apply(convert_to_seconds)
    .mask(lambda seconds: seconds == 0)
    .astype("Int64")
)


# Total time elapsed in seconds
def calculate_total_time_seconds(row, round_length_seconds=300):
    """Return the total fight time, in seconds, at the moment the bout ended.

    Redefines the helper of the same name from the Bouts cell.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``"end_round"`` and ``"end_round_time_seconds"``.
    round_length_seconds : int, default 300
        Duration credited to every round completed before the final one. The
        default keeps the previously hard-coded value of 300 seconds.

    Returns
    -------
    ``np.nan`` when either input is missing, otherwise
    ``(end_round - 1) * round_length_seconds + end_round_time_seconds``.
    """
    if pd.isna(row["end_round"]) or pd.isna(row["end_round_time_seconds"]):
        return np.nan

    # Rounds before the final one count in full; the final round contributes
    # only the time elapsed within it.
    completed_rounds = row["end_round"] - 1
    return completed_rounds * round_length_seconds + row["end_round_time_seconds"]


# Row-wise total elapsed time, stored as nullable integers.
sherdog_fighter_histories["total_time_seconds"] = sherdog_fighter_histories.apply(
    calculate_total_time_seconds, axis=1
).astype("Int64")

# Correct date
# The histories CSV is read without ``parse_dates``, so "date" still holds
# strings at this point. Convert it to datetime before patching: writing a
# Timestamp into a column of strings would leave mixed types in the column
# that is used as a sort key right below and then written to CSV.
# NOTE(review): pd.to_datetime raises on values it cannot parse - confirm the
# raw dates are in a parseable format.
sherdog_fighter_histories["date"] = pd.to_datetime(sherdog_fighter_histories["date"])
sherdog_fighter_histories.loc[
    sherdog_fighter_histories["event_id"] == 95098, "date"
] = pd.to_datetime("2022-11-30")

# Sort and revalidate bout order column
# After sorting each fighter's bouts by date (ties broken by the original
# order), renumber them 1..n within the fighter.
sherdog_fighter_histories = sherdog_fighter_histories.sort_values(
    by=["fighter_id", "date", "order"]
).reset_index(drop=True)
sherdog_fighter_histories["order"] = (
    sherdog_fighter_histories.groupby("fighter_id").cumcount() + 1
)

# Select columns
# Output columns in write order; the raw "end_round_time" string is left out.
history_output_columns = [
    "fighter_id",
    "order",
    "event_id",
    "date",
    "opponent_id",
    "outcome",
    "outcome_method",
    "outcome_method_broad",
    "end_round",
    "end_round_time_seconds",
    "total_time_seconds",
]
sherdog_fighter_histories = sherdog_fighter_histories[history_output_columns]

history_output_path = os.path.join(clean_data_dir, "Sherdog", "fighter_histories.csv")
sherdog_fighter_histories.to_csv(history_output_path, index=False)

## Fighters

In [5]:
# Load the raw fighter table (parsing both date columns), store height as a
# nullable integer, and write the result to the clean folder.
fighters_input_path = os.path.join(raw_data_dir, "Sherdog", "fighters.csv")
sherdog_fighters = pd.read_csv(
    fighters_input_path, parse_dates=["date_of_birth", "pro_debut_date"]
)

# Convert to Int64
sherdog_fighters = sherdog_fighters.astype({"height_inches": "Int64"})

fighters_output_path = os.path.join(clean_data_dir, "Sherdog", "fighters.csv")
sherdog_fighters.to_csv(fighters_output_path, index=False)