# Data Cleaning - UFC Stats

In [None]:
# standard library imports
import os

# third party imports
import pandas as pd

# local imports


# Data directories, expressed relative to the kernel's working directory
# (two levels up, then into "data").
data_dir = os.path.join(os.pardir, os.pardir, "data")
raw_data_dir, clean_data_dir = (
    os.path.join(data_dir, subdirectory) for subdirectory in ("raw", "clean")
)

## Bouts

In [5]:
# Bout IDs to swap red and blue corner distinction, manually checked using Getty Images + YouTube.
# Each ID is listed exactly once, so len(bout_ids_to_flip) is the number of corrected bouts.
bout_ids_to_flip = [
    "219bd976b8ca745d",
    "af178adff964d854",
    "920194911d727a38",
    "b675c94f20551631",
    "67948da92e6c9bdc",
    "628d02b6046d5f73",
    "1d00497a573d7f4c",
    "b376a6fb0ca4862e",
    "03d30a534dee8ac5",
    "c5bef3da17e595ee",
    "b976eb99de0f63ca",
    "f8f73efa2d4cc566",
    "4a482ad5b021bb25",
    "0fe3681eb934bdc3",
    "171007deb1cc56aa",
    "ff872fa3e9ec32a9",
    "b3a3341a5db2d484",
    "0700b74037329215",
    "5288ef3edf1eda6e",
    "98a1121ffbc47657",
    "688c8b695e521ce8",
    "98b1ee14ad4ea1c6",
    "bcefcb29bec709d6",
    "66f0238fac31130f",
    "ae674ec42bd0a979",
    "403bed9fe983b10a",
    "51ae1d8a663ae1fe",
    "9f14f080b08e6869",
    "c5e0e4ee11903076",
    "e2a61ec3bc83da98",
    "31362d9ea15127b2",
    "e915c1987050eac8",
    "b2013a8b4ed68c8d",
    "9a596aaa2b9a18db",
    "81734d11cb6a8ebb",
    "976f7f5e7537c62b",
    "b74ecc368d3ecbc4",
    "3e126f482e9a84b8",
    "1992dcae0699200e",
    "2822226962195259",
    "d23b05f33b741c47",
    "505d3d9b006014c8",
    "3818a33c9103c88f",
    "d460961065f1198b",
    "757c11f17278b06a",
    "9b8c29f50d452025",
    "194f8d501f318ada",
    "cc9cabc1fc747aaf",
    "f24db9d7c3636a2a",
    "781017bfa44d4058",
    "cfbeeffd65101ae1",
    "af4cbb9072814a86",
    "27868eac1d141498",
    "958bd435389dd12a",
    "bfd846ad4d597bbb",
    "75577b11361cb645",
    "2f736846aebbf12d",
    "c4e16d57dd9a1b39",
    "d90567f7d4372160",
    "8191b8edfde3b9c3",
    "09b36c7157770516",
    "0b58bd40373b1811",
    "f7cd0a90da3be3b2",
    "86b8419f22f398c0",
    "8add23232cdb36ee",
    "f27bf6f21410547f",
    "02c54d382fb1c347",
    "041dd0fae4650970",
    "20b84b2cbfb92dc7",
    "dd5f7a14f5782f79",
    "396214fd188c2549",
    "cb7cfa8b9900878f",
    "fb7d31b36bebc5af",
    "08f5bdfe45082ec5",
    "e6dd15ef9fd7927e",
    "92e9c53671ac8fe7",
    "afb9d3476c5caafa",
    "e4420b9812501c17",
    "6a663c04dfe5b88e",
    "868833557ae90877",
    "df7f3c60c523ca55",
    "0d395cb5d91f811f",
    "d89b849df7bff79b",
    "fd05ac3ac2f7d042",
    "21b02cb26307ac0f",
    "39b79764bbac93cb",
    "1df12710cfdb60f4",
    "038c472ae3fd7728",
    "ddc233f8ca7c05aa",
    "ae68274ab6f78cd8",
    "c6a5ed1127cbc9b9",
    "226de3e7e71f2b4f",
    "06142a1de3bbae75",
    "793432d042384a02",
    "0bf025d86cc2a409",
    "d29dd21d8881a0e0",
    "5060770f197e2147",
    "b4bf090eb5c57903",
    "f1423b63d81c88ea",
    "eb23f3574b656624",
    "15750dc75b7426f3",
    "440936f9db783997",
    "83314e1ed6f7d94e",
    "11ca5d09db47708b",
    "11871325db7a13ec",
    "036bb5b335e93ca8",
    "55bcf58be822d115",
    "f8896b43f4ecf861",
    "d9c3d7b09d8df3e4",
    "faec13fd40b6d797",
    "ef6a0ee848382391",
    "b71f07f052d0a48b",
    "a9dc59e90779c1e3",
    "28037f9be7e0781a",
    "6a61d45818534b19",
    "b0958ed411ae9675",
    "fd292c2a587bb2f5",
    "fa0d393a452c0809",
    "6ae10aa4c6dc986d",
    "2711e57bbef4d928",
    "e3c9c95b67580b9b",
    "39f07708af6ecfc7",
    "2a23f787fff1d837",
    "0454bfbc49e95c16",
    "7c620723b6840231",
    "c1be2b862f737c18",
    "329a097e0b53daac",
    "a2da03d9c25b49e5",
    "acfa3a3c0d09bb57",
    "2ff09ddaa2b78cc4",
    "ec271393bc31ab72",
    "03c2cebad8d8c012",
    "32ae3f0d43babd86",
    "d77a72e3fa8db853",
    "92975b516e6463b6",
    "6c569785225fd2e7",
    "cb510afff3dafdab",
    "2c07f8ba25640237",
    "bf8164367980fcd3",
    "ef8f31b86469a967",
    "0c6055509d98ed06",
    "119b156ded2af0f1",
    "9fa3f9f7caf429d6",
    "f5affe923c640d0e",
    "f6246d2b9f663e33",
    "7e50ae4dbdd380a1",
    "a29236fcb395f443",
    "24e14c4824144c64",
    "d7f9a09021a9a13c",
    "c99370e3e54bd5fd",
    "c4fa93a4f37a6ca7",
    "19f615a7a5cfd304",
    "be72958d9715757d",
    "504b540805598fa5",
    "1a21263dc5d866b6",
    "20628fd4e19a97e4",
    "adddc6e46da5ca19",
    "840863604b38a33f",
    "5e52b0bf9719f0ae",
    "b091e021e61f1950",
    "ca93e3f69fa3d725",
    "b4d624bdc27dff83",
    "aefca2869c87eb11",
    "0ea087a71863184d",
]
# Maps each corner-specific column to the opposite corner's column.
column_swap_map = {
    "red_fighter_id": "blue_fighter_id",
    "blue_fighter_id": "red_fighter_id",
    "red_outcome": "blue_outcome",
    "blue_outcome": "red_outcome",
}

ufcstats_bouts = pd.read_csv(os.path.join(raw_data_dir, "UFC Stats", "bouts.csv"))

# Flip fighter ids and outcomes for specific bouts.
# A masked .loc assignment is used instead of DataFrame.update because update
# only copies non-NA values: a missing value on one corner would leave the other
# corner's old value in place, so both corners would end up with the same value.
flip_mask = ufcstats_bouts["id"].isin(bout_ids_to_flip)
if flip_mask.any():
    columns_to_overwrite = list(column_swap_map)
    columns_to_read = [column_swap_map[column] for column in columns_to_overwrite]
    # .to_numpy() drops the column labels so values are assigned by position
    # rather than being re-aligned back onto their original columns.
    ufcstats_bouts.loc[flip_mask, columns_to_overwrite] = ufcstats_bouts.loc[
        flip_mask, columns_to_read
    ].to_numpy()

ufcstats_bouts.to_csv(
    os.path.join(clean_data_dir, "UFC Stats", "bouts.csv"), index=False
)

## Events

In [6]:
# Load the raw events table, parsing the "date" column as datetimes.
events_raw_path = os.path.join(raw_data_dir, "UFC Stats", "events.csv")
ufcstats_events = pd.read_csv(events_raw_path, parse_dates=["date"])

# Cast event_order to the nullable Int64 dtype so it can hold missing values
# while remaining an integer column.
ufcstats_events = ufcstats_events.astype({"event_order": "Int64"})

# Write the cleaned table without the index column.
events_clean_path = os.path.join(clean_data_dir, "UFC Stats", "events.csv")
ufcstats_events.to_csv(events_clean_path, index=False)

## Fighter Histories

No further cleaning is needed; the raw data is loaded and written to the clean directory as-is.

In [4]:
# Read the raw fighter histories and write them to the clean directory
# without any transformation.
fighter_histories_raw_path = os.path.join(
    raw_data_dir, "UFC Stats", "fighter_histories.csv"
)
fighter_histories_clean_path = os.path.join(
    clean_data_dir, "UFC Stats", "fighter_histories.csv"
)

ufcstats_fighter_histories = pd.read_csv(fighter_histories_raw_path)
ufcstats_fighter_histories.to_csv(fighter_histories_clean_path, index=False)

## Fighters

In [None]:
# Load the raw fighters table, parsing "date_of_birth" as datetimes.
fighters_raw_path = os.path.join(raw_data_dir, "UFC Stats", "fighters.csv")
ufcstats_fighters = pd.read_csv(fighters_raw_path, parse_dates=["date_of_birth"])

# Height and reach contain nulls, so use the nullable Int64 dtype for both.
ufcstats_fighters = ufcstats_fighters.astype(
    {"height_inches": "Int64", "reach_inches": "Int64"}
)

# Write the cleaned table without the index column.
fighters_clean_path = os.path.join(clean_data_dir, "UFC Stats", "fighters.csv")
ufcstats_fighters.to_csv(fighters_clean_path, index=False)

## Round Stats

In [8]:
# Load the raw per-round statistics.
round_stats_raw_path = os.path.join(raw_data_dir, "UFC Stats", "round_stats.csv")
ufcstats_round_stats = pd.read_csv(round_stats_raw_path)

# Control time contains nulls, so use the nullable Int64 dtype.
ufcstats_round_stats = ufcstats_round_stats.astype(
    {"control_time_seconds": "Int64"}
)

# Write the cleaned table without the index column.
round_stats_clean_path = os.path.join(
    clean_data_dir, "UFC Stats", "round_stats.csv"
)
ufcstats_round_stats.to_csv(round_stats_clean_path, index=False)