# Main notebook for data cleaning

In [8]:
# standard library imports
import os

# third party imports
import numpy as np
import pandas as pd

# local imports


# Data directories, expressed relative to the kernel's working directory
# (two levels up, then into "data"). The former os.path.dirname("__file__")
# call received a string literal rather than a real path, so it always
# returned "" and never anchored these paths to the notebook's location;
# the explicit join below yields exactly the same paths without implying that.
data_dir = os.path.join("..", "..", "data")
raw_data_dir = os.path.join(data_dir, "raw")      # scraped input CSVs are read from here
clean_data_dir = os.path.join(data_dir, "clean")  # cleaned CSVs are written here

## UFC Stats

- `fighter_histories` and `bouts` are already cleaned

In [None]:
# Read the raw UFC Stats fighter table, parsing date_of_birth on load.
ufcstats_fighters_raw_path = os.path.join(raw_data_dir, "UFC Stats", "fighters.csv")
ufcstats_fighters = pd.read_csv(ufcstats_fighters_raw_path, parse_dates=["date_of_birth"])

# Height and reach default to float because of NaNs; cast them to the
# nullable integer dtype so missing values survive the conversion.
ufcstats_fighter_int_columns = ["height_inches", "reach_inches"]
ufcstats_fighters[ufcstats_fighter_int_columns] = (
    ufcstats_fighters[ufcstats_fighter_int_columns].astype("Int64")
)

# Write the cleaned table next to the other cleaned UFC Stats files.
ufcstats_fighters_clean_path = os.path.join(clean_data_dir, "UFC Stats", "fighters.csv")
ufcstats_fighters.to_csv(ufcstats_fighters_clean_path, index=False)

In [20]:
# Read the raw UFC Stats event table, parsing the event date on load.
ufcstats_events_raw_path = os.path.join(raw_data_dir, "UFC Stats", "events.csv")
ufcstats_events = pd.read_csv(ufcstats_events_raw_path, parse_dates=["date"])

# Store event_order as a nullable integer.
ufcstats_event_order = ufcstats_events["event_order"].astype("Int64")
ufcstats_events["event_order"] = ufcstats_event_order

ufcstats_events_clean_path = os.path.join(clean_data_dir, "UFC Stats", "events.csv")
ufcstats_events.to_csv(ufcstats_events_clean_path, index=False)

In [23]:
# Read the raw UFC Stats per-round statistics.
ufcstats_round_stats_raw_path = os.path.join(raw_data_dir, "UFC Stats", "round_stats.csv")
ufcstats_round_stats = pd.read_csv(ufcstats_round_stats_raw_path)

# Store control time as a nullable integer number of seconds.
ufcstats_control_time = ufcstats_round_stats["control_time_seconds"].astype("Int64")
ufcstats_round_stats["control_time_seconds"] = ufcstats_control_time

ufcstats_round_stats_clean_path = os.path.join(clean_data_dir, "UFC Stats", "round_stats.csv")
ufcstats_round_stats.to_csv(ufcstats_round_stats_clean_path, index=False)

## Wikipedia

- `venues` is already cleaned

In [25]:
# Read the raw Wikipedia event table, parsing the event date on load.
wikipedia_events_raw_path = os.path.join(raw_data_dir, "Wikipedia", "events.csv")
wikipedia_events = pd.read_csv(wikipedia_events_raw_path, parse_dates=["date"])

# Store attendance as a nullable integer.
wikipedia_attendance = wikipedia_events["attendance"].astype("Int64")
wikipedia_events["attendance"] = wikipedia_attendance

wikipedia_events_clean_path = os.path.join(clean_data_dir, "Wikipedia", "events.csv")
wikipedia_events.to_csv(wikipedia_events_clean_path, index=False)

## Sherdog

In [27]:
# Read the raw Sherdog fighter table, parsing both date columns on load.
sherdog_fighters_raw_path = os.path.join(raw_data_dir, "Sherdog", "fighters.csv")
sherdog_fighter_date_columns = ["date_of_birth", "pro_debut_date"]
sherdog_fighters = pd.read_csv(sherdog_fighters_raw_path, parse_dates=sherdog_fighter_date_columns)

# Store height as a nullable integer.
sherdog_height_inches = sherdog_fighters["height_inches"].astype("Int64")
sherdog_fighters["height_inches"] = sherdog_height_inches

sherdog_fighters_clean_path = os.path.join(clean_data_dir, "Sherdog", "fighters.csv")
sherdog_fighters.to_csv(sherdog_fighters_clean_path, index=False)

In [38]:
# Read the raw Sherdog event table, parsing the event date on load.
sherdog_events = pd.read_csv(os.path.join(raw_data_dir, "Sherdog", "events.csv"), parse_dates=["date"])

# Flatten embedded line breaks and collapse repeated spaces in the text columns.
# Every step goes through the `.str` accessor on purpose: plain Series.replace
# only swaps cells whose ENTIRE value equals the pattern, so it never touches a
# newline or a double space sitting inside a longer string.
sherdog_events["name"] = (
    sherdog_events["name"]
    .str.replace("\n", " ", regex=False)
    .str.replace(r" {2,}", " ", regex=True)  # collapse any run of spaces to one
    .str.strip()
)
sherdog_events["location"] = (
    sherdog_events["location"]
    .str.replace("\r", ",", regex=False)     # "\r\n" separators become ", " after the next step
    .str.replace("\n", " ", regex=False)
    .str.replace(r" {2,}", " ", regex=True)  # collapse any run of spaces to one
    .str.strip()
)

# Keep only the text before the first "\r\n" in the country field.
sherdog_events["country"] = sherdog_events["country"].str.split('\r\n').str[0].str.strip()

# Store event_order as a nullable integer.
sherdog_events["event_order"] = sherdog_events["event_order"].astype("Int64")

# Rows dated 2030-01-11 in the raw data are overridden to 2022-11-30.
# NOTE(review): presumably a wrong date on the source page - confirm.
sherdog_events.loc[sherdog_events["date"] == "2030-01-11", "date"] = pd.to_datetime("2022-11-30")

# Chronological order, using event_order to break ties within a date.
sherdog_events = sherdog_events.sort_values(by=["date", "event_order"]).reset_index(drop=True)

sherdog_events.to_csv(os.path.join(clean_data_dir, "Sherdog", "events.csv"), index=False)

In [125]:
# Read the raw Sherdog fighter histories (one row per fighter per bout).
sherdog_fighter_histories_raw_path = os.path.join(raw_data_dir, "Sherdog", "fighter_histories.csv")
sherdog_fighter_histories = pd.read_csv(sherdog_fighter_histories_raw_path)

# Store opponent ids as nullable integers.
sherdog_fighter_histories["opponent_id"] = sherdog_fighter_histories["opponent_id"].astype("Int64")

# Abbreviate the outcome labels; any label missing from the mapping becomes NaN.
outcome_abbreviations = {"win": "W", "loss": "L", "draw": "D", "nc": "NC"}
sherdog_fighter_histories["outcome"] = sherdog_fighter_histories["outcome"].map(outcome_abbreviations)

# The broad method is whatever precedes the first parenthesis, with
# surrounding whitespace removed.
outcome_method_pieces = sherdog_fighter_histories["outcome_method"].str.split(r'\(|\)', expand=True)
sherdog_fighter_histories["outcome_method_broad"] = outcome_method_pieces.iloc[:, 0]
sherdog_fighter_histories["outcome_method_broad"] = (
    sherdog_fighter_histories["outcome_method_broad"].str.strip()
)

# Blank broad methods carry no information: treat them as missing.
blank_method_rows = sherdog_fighter_histories["outcome_method_broad"] == ""
sherdog_fighter_histories.loc[blank_method_rows, "outcome_method_broad"] = np.nan

# Known spellings (typos, casing variants, look-alike characters) for each
# canonical broad method.
dqs = ["Disqualification", "Disqualifcation", "Desqualification", "DG", "DQ"]

ko_tkos = [
    "ТКО", "Tko", "TKP", "ΤΚΟ", "Corner Stoppage", "Corner's towel", "Doctor's Stoppage", "Injury",
    "Retirement", "K.O", "K.O.", "KO", "Ko", "Knockout", "ko", "KO/TKO", "KO/TKO PUNCHES",
    "TKO Punches", "TKO", "RETIREMENT",
]

draws = ["DRAW", "Draw Unanimous", "Drew", "Majority Draw", "Technical Draw"]

decisions = [
    "Decision", "Decision Unanimous", "Decision unanimous", "Decisions", "Decison", "Decisão Unanime",
    "Desicion", "Desision", "Majority Decision", "Points", "Split Decision", "Split Division",
    "Split decision", "Technical Decision", "Unaminous Decision", "Unanimous", "Unanimous Decision",
    "Unanimous decision", "de",
]

subs = [
    "Frontal Anaconda Choke", "Guillotine Choke", "Guillotine choke", "Kraken Choke", "Rear Naked Choke",
    "SUBMISSION", "Submision", "Submisison", "Submissio", "Submission", "Submissions", "Submisson",
    "Submssion", "Tapout", "Techinal Submission", "Techincal Submission", "Technial Submission",
    "Technical Submission", "Triangle Choke", "Verbal Submission", "su", "submison", "submission",
    "ubmission",
]

ncs = [
    "N/C", "NC", "ND", "No Conest", "No Contest", "No Contest - Clements Failed Drug Test",
    "No Contest - Collard Failed Drug Test", "No Contest - Overturned by Commission",
    "No Contest - Overturned by NSAC", "No Contest - Overturned by WTKA",
    "No Contest - Result overturned by FIGMMA", "No Contest - Strikes After The Fight", "No Decision",
    "No Decision - Overturned by CSAC", "No Decision - Overturned by FMMAF",
    "No Decision - Zappitella Failed Drug Test", "Overtuned", "Overturned by Promoter",
]

# Rewrite every known spelling to its canonical label, one category at a time.
method_spelling_groups = [
    (dqs, "Disqualification"),
    (ko_tkos, "KO/TKO"),
    (draws, "Draw"),
    (decisions, "Decision"),
    (subs, "Submission"),
    (ncs, "No Contest"),
]
for known_spellings, canonical_method in method_spelling_groups:
    rows_in_group = sherdog_fighter_histories["outcome_method_broad"].isin(known_spellings)
    sherdog_fighter_histories.loc[rows_in_group, "outcome_method_broad"] = canonical_method

# Repair end_round values: 0 becomes missing, and 12 / 22 / 30 are rewritten
# to 1 / 2 / 3.
# NOTE(review): the latter look like data-entry slips in the source - confirm.
end_round_corrections = [(0, np.nan), (12, 1), (22, 2), (30, 3)]
for recorded_round, corrected_round in end_round_corrections:
    rows_to_fix = sherdog_fighter_histories["end_round"] == recorded_round
    sherdog_fighter_histories.loc[rows_to_fix, "end_round"] = corrected_round

# Store the round number as a nullable integer.
sherdog_fighter_histories["end_round"] = sherdog_fighter_histories["end_round"].astype("Int64")

# Normalise end_round_time towards a "M:SS" string. The steps below are
# order-dependent and run in exactly this sequence.

# 1) Characters used in place of the ":" separator.
for stray_separator in [";", '"', ".", ",", "?", "L", "_"]:
    sherdog_fighter_histories["end_round_time"] = (
        sherdog_fighter_histories["end_round_time"].str.replace(stray_separator, ":", regex=False)
    )

# 2) Drop a trailing " min" unit.
sherdog_fighter_histories["end_round_time"] = (
    sherdog_fighter_histories["end_round_time"].str.replace(" min", "", regex=False)
)

# 3) Placeholder texts that are not times at all.
for placeholder_text in ["M/A", "SUBMISSION"]:
    placeholder_rows = sherdog_fighter_histories["end_round_time"] == placeholder_text
    sherdog_fighter_histories.loc[placeholder_rows, "end_round_time"] = np.nan

# 4) A bare minute count gets ":00" appended.
bare_minute_rows = sherdog_fighter_histories["end_round_time"].isin(["1", "2", "3", "5"])
sherdog_fighter_histories.loc[bare_minute_rows, "end_round_time"] = (
    sherdog_fighter_histories.loc[bare_minute_rows, "end_round_time"] + ":00"
)

# 5) One-off values rewritten by hand (np.nan where no time is assigned).
exact_time_corrections = [
    ("05", "5:00"),
    ("38", "0:38"),
    ("51", "0:51"),
    ("021", "0:21"),
    ("215", "2:15"),
    ("311", "3:11"),
    ("201641", "5:00"),
    ("383518", np.nan),
    ("429661", np.nan),
    ("2:30::", "2:30"),
    ("2: 5", "2:05"),
    ("2:26:", "2:26"),
    ("01:15:00", "1:15"),
    ("00:03:50", "3:50"),
    ("05:00:00", "5:00"),
    ("00:02:48", "2:48"),
    ("00:05:00", "5:00"),
    ("04:17:00", "4:17"),
    ("00:01:39", "1:39"),
    ("4:45:00", "4:45"),
    ("1:12:00 AM", "1:12"),
]
for recorded_time, corrected_time in exact_time_corrections:
    rows_to_fix = sherdog_fighter_histories["end_round_time"] == recorded_time
    sherdog_fighter_histories.loc[rows_to_fix, "end_round_time"] = corrected_time

# 6) Remaining character-level substitutions: symbols mapped to digits, stray
#    letters and apostrophes removed, doubled or space-padded colons tidied.
character_substitutions = [
    ("!", "1"),
    ("$", "4"),
    ("&", "7"),
    ("q", ""),
    ("d", ""),
    ("'", ""),
    ("::", ":"),
    (" :", ":"),
]
for unwanted_text, replacement_text in character_substitutions:
    sherdog_fighter_histories["end_round_time"] = (
        sherdog_fighter_histories["end_round_time"].str.replace(unwanted_text, replacement_text, regex=False)
    )

# end round time seconds
def convert_to_seconds(time_str):
    """Convert a ``"M:SS"`` clock string into a whole number of seconds.

    Parameters
    ----------
    time_str : str or missing value
        Text holding minutes and seconds separated by exactly one ``":"``.
        An empty minutes part (for example ``":45"``) counts as zero minutes.

    Returns
    -------
    int or float
        ``minutes * 60 + seconds``, or ``np.nan`` when ``time_str`` is missing.

    Raises
    ------
    ValueError
        If the text does not split into exactly two parts on ``":"`` or a
        part cannot be parsed as an integer.
    """
    if pd.isna(time_str):
        return np.nan

    minute_part, second_part = time_str.split(":")
    whole_minutes = 0 if minute_part == "" else int(minute_part)
    return whole_minutes * 60 + int(second_part)

# Seconds elapsed within the final round, stored as a nullable integer.
sherdog_fighter_histories["end_round_time_seconds"] = (
    sherdog_fighter_histories["end_round_time"]
    .apply(convert_to_seconds)
    .astype("Int64")
)

# total time seconds
def calculate_total_time_seconds(row, round_length_seconds=300):
    """Compute the total elapsed bout time in seconds for one history row.

    Every round before the final one is counted at its full length, then the
    time elapsed in the final round is added.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``"end_round"`` (1-based round number) and
        ``"end_round_time_seconds"`` (seconds elapsed in that round).
    round_length_seconds : int, default 300
        Length assumed for each completed round. The default keeps the
        previous hard-coded 300 seconds; pass another value for bouts fought
        under a different round length.

    Returns
    -------
    number or float
        ``(end_round - 1) * round_length_seconds + end_round_time_seconds``,
        or ``np.nan`` when either input is missing.
    """
    final_round = row["end_round"]
    seconds_into_final_round = row["end_round_time_seconds"]

    # Without both pieces the total cannot be derived.
    if pd.isna(final_round) or pd.isna(seconds_into_final_round):
        return np.nan

    return (final_round - 1) * round_length_seconds + seconds_into_final_round

# Total bout duration per row, stored as a nullable integer.
sherdog_fighter_histories["total_time_seconds"] = (
    sherdog_fighter_histories
    .apply(calculate_total_time_seconds, axis=1)
    .astype("Int64")
)

# Convert the date column to datetimes BEFORE patching and sorting. Writing a
# Timestamp into a column of plain date strings would leave a mixed-type
# column, which cannot be ordered reliably and is serialised in a different
# format for the patched rows.
sherdog_fighter_histories["date"] = pd.to_datetime(sherdog_fighter_histories["date"])

# Bouts of event 95098 get their date overridden to 2022-11-30.
# NOTE(review): presumably the same wrongly dated event that is corrected in
# the Sherdog events table - confirm.
sherdog_fighter_histories.loc[sherdog_fighter_histories["event_id"] == 95098, "date"] = pd.to_datetime("2022-11-30")

# Order each fighter's bouts chronologically (the existing order breaks ties),
# then renumber them 1..n per fighter.
sherdog_fighter_histories = sherdog_fighter_histories.sort_values(by=["fighter_id", "date", "order"]).reset_index(drop=True)
sherdog_fighter_histories["order"] = sherdog_fighter_histories.groupby("fighter_id").cumcount() + 1

# Keep only the output columns, in their final order.
sherdog_fighter_histories = sherdog_fighter_histories[
    ['fighter_id',
    'order',
    'event_id',
    'date',
    'opponent_id',
    'outcome',
    'outcome_method',
    'outcome_method_broad',
    'end_round',
    'end_round_time',
    'end_round_time_seconds',
    'total_time_seconds']
]

sherdog_fighter_histories.to_csv(os.path.join(clean_data_dir, "Sherdog", "fighter_histories.csv"), index=False)

In [127]:
# Read the raw Sherdog bouts table.
sherdog_bouts_raw_path = os.path.join(raw_data_dir, "Sherdog", "bouts.csv")
sherdog_bouts = pd.read_csv(sherdog_bouts_raw_path)

# Store both fighter id columns as nullable integers.
for fighter_id_column in ("fighter_1_id", "fighter_2_id"):
    sherdog_bouts[fighter_id_column] = sherdog_bouts[fighter_id_column].astype("Int64")

# Display the frame as the cell output.
sherdog_bouts

Unnamed: 0,event_id,bout_order,fighter_1_id,fighter_2_id,fighter_1_outcome,fighter_2_outcome,is_title_bout,weight_class,outcome_method,end_round,end_round_time
0,2457,1,292,1234,win,loss,0,,Submission (Rear-Naked Choke),1,11:55
1,3813,1,292,1234,win,loss,0,,Submission (Rear-Naked Choke),1,
2,1604,1,187,7489,win,loss,0,,TKO (Corner Stoppage),1,5:02
3,1604,2,120,7488,draw,draw,0,,Draw,1,20:00
4,1604,3,7646,7490,win,loss,0,,Submission (Rear-Naked Choke),1,1:36
...,...,...,...,...,...,...,...,...,...,...,...
443991,103748,7,81143,247303,win,loss,0,Welterweight,Decision (Unanimous),3,5:00
443992,103748,8,210693,237561,win,loss,0,Middleweight,KO (Punch),1,1:41
443993,103748,9,47065,78731,win,loss,0,Lightweight,Submission (Guillotine Choke),1,2:24
443994,103748,10,172169,174277,win,loss,1,Lightweight,TKO (Punches),2,1:00


In [128]:
# Distinct raw outcome labels recorded for fighter 1, in order of first appearance.
pd.unique(sherdog_bouts["fighter_1_outcome"])

array(['win', 'draw', 'nc', 'yet to come', 'loss'], dtype=object)