# Data Cleaning - Tapology

In [1]:
# standard library imports
import os

# third party imports
import numpy as np
import pandas as pd

# local imports


data_dir = os.path.join(os.path.dirname("__file__"), "..", "..", "data")
raw_data_dir = os.path.join(data_dir, "raw")
clean_data_dir = os.path.join(data_dir, "clean")

## Bouts

In [4]:
tapology_bouts = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "bouts.csv"))

# We will handle gym/affiliation history separately later
tapology_bouts = tapology_bouts.drop(
    columns=[
        "fighter_1_gym_info",
        "fighter_1_gym_ids",
        "fighter_2_gym_info",
        "fighter_2_gym_ids",
    ]
)

# Basic string cleaning
tapology_bouts[["outcome_method", "outcome_method_details"]] = tapology_bouts[
    "outcome_method"
].str.split(", ", n=1, expand=True)
tapology_bouts = tapology_bouts.drop(columns=["end_round_time_info"])
tapology_bouts["billing"] = (
    tapology_bouts["billing"].str.split(" (", regex=False).str[0].str.strip()
)


# Helper function to extract weight class weight
def get_final_weight_class_weight_lbs(weight_class_str):
    if pd.isna(weight_class_str):
        return np.nan

    weight_class_split = weight_class_str.split(" (")
    weight_class_final_lbs = int(
        float(
            [
                x.replace(")", "").replace(" lbs", "")
                for x in weight_class_split
                if "lbs" in x
            ][0]
        )
    )

    return weight_class_final_lbs


tapology_bouts["weight_class_final_weight_lbs"] = tapology_bouts["weight_class"].apply(
    get_final_weight_class_weight_lbs
)


# Extract original weight class weight if weight class was changed
def get_original_weight_class_weight_lbs(weight_class_str):
    if pd.isna(weight_class_str) or "re-scheduled" not in weight_class_str:
        return np.nan

    weight_class_str_split = weight_class_str.split(" (")
    weight_class_original_lbs = int(
        float(
            [
                x.replace(")", "").replace("re-scheduled from ", "")
                for x in weight_class_str_split
                if "re-scheduled" in x
            ][0]
        )
    )

    return weight_class_original_lbs


tapology_bouts["weight_class_original_weight_lbs"] = tapology_bouts[
    "weight_class"
].apply(get_original_weight_class_weight_lbs)

# Extract numeric part of odds and convert to float
tapology_bouts["fighter_1_odds"] = (
    tapology_bouts["fighter_1_odds"]
    .str.split(" (", regex=False)
    .str[0]
    .str.strip()
    .astype(float)
)
tapology_bouts["fighter_2_odds"] = (
    tapology_bouts["fighter_2_odds"]
    .str.split(" (", regex=False)
    .str[0]
    .str.strip()
    .astype(float)
)

# Extract numeric part of weigh-in results and convert to float
tapology_bouts["fighter_1_weight_lbs"] = (
    tapology_bouts["fighter_1_weight"]
    .str.split(" (", regex=False)
    .str[0]
    .str.replace("lbs", "")
    .str.strip()
    .astype(float)
)
tapology_bouts["fighter_2_weight_lbs"] = (
    tapology_bouts["fighter_2_weight"]
    .str.split(" (", regex=False)
    .str[0]
    .str.replace("lbs", "")
    .str.strip()
    .astype(float)
)

# Select columns
tapology_bouts = tapology_bouts[
    [
        "id",
        "ufcstats_id",
        "event_id",
        "bout_order",
        "fighter_1_id",
        "fighter_2_id",
        "billing",
        "weight_class_final_weight_lbs",
        "weight_class_original_weight_lbs",
        "outcome_method",
        "outcome_method_details",
        "fighter_1_odds",
        "fighter_2_odds",
        "fighter_1_weight_lbs",
        "fighter_2_weight_lbs",
    ]
]

# Convert to Int64
tapology_bouts["weight_class_final_weight_lbs"] = tapology_bouts[
    "weight_class_final_weight_lbs"
].astype("Int64")
tapology_bouts["weight_class_original_weight_lbs"] = tapology_bouts[
    "weight_class_original_weight_lbs"
].astype("Int64")
tapology_bouts["fighter_1_odds"] = tapology_bouts["fighter_1_odds"].astype("Int64")
tapology_bouts["fighter_2_odds"] = tapology_bouts["fighter_2_odds"].astype("Int64")

# Correct bout orders
ordering_fix = {
    "2590-ufc-4-guy-the-sandman-mezger-vs-jason-bonecracker-fairn": 3,
    "2595-ufc-4-royce-gracie-vs-ron-black-dragon-van-clief": 4,
    "890-ufc-4-keith-the-giant-killer-hackney-vs-joe-son": 5,
    "ufc-4-steve-jennum-vs-melton-the-punisher-bowen": 6,
    "905-ufc-4-dan-the-beast-severn-vs-anthony-mad-dog-macias": 7,
    "912-ufc-4-royce-gracie-vs-keith-the-giant-killer-hackney": 8,
    "921-ufc-4-dan-the-beast-severn-vs-marcus-the-grasshopper-bossett": 9,
    "ufc-5-dave-dangerous-beneteau-vs-asbel-cancio": 1,
    "942-ufc-5-guy-the-sandman-mezger-vs-john-dowdy": 2,
    "ufc-7-paul-the-polar-bear-varelans-vs-gerry-harris": 4,
    "1363-ufc-7-mark-the-cobra-hall-vs-harold-howard": 5,
    "1394-ufc-75-david-tank-abbott-vs-steve-jennum": 3,
    "2876-ufc-75-dan-the-beast-severn-vs-paul-the-polar-bear-varelans": 4,
    "2879-ufc-75-marco-the-king-of-the-streets-ruas-vs-keith-the-giant-killer-hackney": 5,
    "2882-ufc-75-oleg-the-russian-bear-taktarov-vs-dave-dangerous-beneteau": 6,
    "2885-ufc-75-dan-the-beast-severn-vs-david-tank-abbott": 7,
    "2890-ufc-7-5-oleg-the-russian-bear-taktarov-vs-marco-the-king-of-the-streets-ruas": 8,
    "ufc-115-mark-the-cobra-hall-vs-felix-mitchell": 1,
    "ufc-115-tai-bowden-vs-jack-the-ripper-nilson": 3,
    "2674-ufc-115-don-the-predator-frye-vs-gary-big-daddy-goodridge-ii": 4,
    "2680-ufc-115-david-tank-abbott-vs-cal-worsham": 5,
    "2686-ufc-115-kimo-leopoldo-vs-paul-the-polar-bear-varelans": 6,
    "ufc-12-scott-the-pit-bull-ferrozzo-vs-jim-mullen": 5,
    "1057-ufc-12-vitor-the-phenom-belfort-vs-tra-trauma-telligman": 6,
    "882-ufc-14-the-smashing-machine-mark-kerr-vs-moti-the-hammer-horenstein": 5,
    "888-ufc-14-dan-the-bull-bobish-vs-brian-fury-johnston": 6,
    "ufc-16-chris-the-westside-strangler-brennan-vs-courtney-turner": 2,
    "2966-ufc-16-the-croation-sensation-pat-miletich-vs-chris-the-westside-strangler-brennan-ii": 6,
    "2969-ufc-16-tsuyoshi-tk-kosaka-vs-kimo-leopoldo": 7,
    "2971-ufc-16-frank-the-legend-shamrock-vs-igor-houdini-zinoviev": 8,
    "ufc-23-katsuhisa-shamoji-fujii-vs-masutatsu-yano": 1,
    "ufc-23-kenichi-yamamoto-vs-katsuhisa-shamoji-fujii": 5,
    "2689-ufc-23-pedro-the-rock-rizzo-vs-tsuyoshi-tk-kosaka": 6,
    "2694-ufc-23-kevin-the-monster-randleman-vs-pete-el-duro-williams": 7,
    "1211-ufc-75-jess-joker-liaudin-vs-anthony-the-crush-torres": 1,
    "1220-ufc-75-dennis-siver-vs-naoyuki-kotani": 2,
    "1223-ufc-75-thiago-silva-vs-tomasz-gorilla-drwal": 3,
    "1229-ufc-75-gleison-tibau-vs-terry-etim": 4,
    "2266-ufc-75-houston-the-assassin-alexander-vs-alessio-legionarius-sakara": 5,
    "2275-ufc-75-the-irish-hand-grenade-marcus-davis-vs-relentless-paul-taylor": 6,
    "2285-ufc-75-cheick-kongo-vs-mirko-cro-cop-filipovic": 7,
    "2292-ufc-75-michael-the-count-bisping-vs-matt-the-hammer-hamill": 8,
    "2301-ufc-75-quinton-rampage-jackson-vs-dan-hendo-henderson": 9,
}
tapology_bouts["bout_order"] = tapology_bouts.apply(
    lambda row: ordering_fix[row["id"]]
    if row["id"] in ordering_fix
    else row["bout_order"],
    axis=1,
)

# Sort by
tapology_bouts = (
    tapology_bouts.groupby("event_id", as_index=False, sort=False)
    .apply(lambda g: g.sort_values(by="bout_order"))
    .reset_index(drop=True)
)

tapology_bouts.to_csv(
    os.path.join(clean_data_dir, "Tapology", "bouts.csv"), index=False
)

  .apply(lambda g: g.sort_values(by="bout_order"))


## Community Picks

In [5]:
tapology_community_picks = pd.read_csv(
    os.path.join(raw_data_dir, "Tapology", "community_picks.csv")
)
tapology_community_picks["fighter_last_name"] = (
    tapology_community_picks["fighter_last_name"].str.split().str[-1]
)

# Check for identical fighter last names in the same bout, manually assigned fighter ids
identical_names = tapology_community_picks.groupby("bout_id")[
    "fighter_last_name"
].transform(lambda x: x.nunique() == 1)
subset_identical_names = tapology_community_picks.loc[identical_names, :].copy()
subset_identical_names.loc[:, "fighter_id"] = [
    "43895-roberto-sanchez",
    "28639-joby-sanchez",
    "69225-karine-silva-killer",
    "57056-ariane-lipski",
]

# Otherwise just join with bouts
subset_other = tapology_community_picks.loc[~identical_names, :].copy()
tapology_bouts = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "bouts.csv"))[
    ["id", "fighter_1_id", "fighter_2_id"]
]
fighter_1_stuff = tapology_bouts[["id", "fighter_1_id"]].rename(
    columns={"fighter_1_id": "fighter_id"}
)
fighter_2_stuff = tapology_bouts[["id", "fighter_2_id"]].rename(
    columns={"fighter_2_id": "fighter_id"}
)

# Match up community pick submissions with fighter IDs, since names can be ambiguous
bout_to_fighter_stacked = pd.concat(
    [fighter_1_stuff, fighter_2_stuff], ignore_index=True
).reset_index(drop=True)
tapology_fighters = pd.read_csv(
    os.path.join(clean_data_dir, "Tapology", "fighters.csv")
).rename(columns={"id": "fighter_id"})
bout_to_fighter_stacked = bout_to_fighter_stacked.merge(
    tapology_fighters[["fighter_id", "name"]], on="fighter_id", how="left"
)
bout_to_fighter_stacked["fighter_last_name"] = (
    bout_to_fighter_stacked["name"].str.split().str[-1]
)
bout_to_fighter_stacked = bout_to_fighter_stacked.rename(
    columns={"id": "bout_id"}
).drop(columns=["name"])

subset_other = subset_other.merge(
    bout_to_fighter_stacked, on=["bout_id", "fighter_last_name"], how="left"
).set_axis(subset_other.index, axis=0)

# Concatenate together and select columns
tapology_community_picks_clean = pd.concat(
    [subset_identical_names, subset_other]
).sort_index()
tapology_community_picks_clean = tapology_community_picks_clean[
    [
        "bout_id",
        "fighter_id",
        "ko_tko_percentage",
        "submission_percentage",
        "decision_percentage",
        "overall_percentage",
        "num_picks",
    ]
]

# Convert overall win percent to integer
tapology_community_picks_clean["overall_percentage"] = tapology_community_picks_clean[
    "overall_percentage"
].astype(int)

tapology_community_picks_clean.to_csv(
    os.path.join(clean_data_dir, "Tapology", "community_picks.csv"), index=False
)

## Events

In [6]:
tapology_events = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "events.csv"))

# Don't need Wikipedia link
tapology_events = tapology_events.drop(columns="wikipedia_url")

# Extract numeric part of Best Fight Odds URL to get ID
tapology_events["bestfightodds_id"] = (
    tapology_events["bestfightodds_id"].str.split("-").str[-1].astype("Int64")
)

# Fill in missing Best Fight Odds IDs manually
tapology_events.loc[tapology_events["id"] == "24353-ufc-179", "bestfightodds_id"] = 855
tapology_events.loc[tapology_events["id"] == "30831-ufc-193", "bestfightodds_id"] = 1002
tapology_events.loc[
    tapology_events["id"] == "37906-ufc-fight-night-87-jacare-vs-belfort",
    "bestfightodds_id",
] = 1070
tapology_events.loc[
    tapology_events["id"] == "38668-ufc-199-rockhold-vs-weidman-2", "bestfightodds_id"
] = 1081
tapology_events.loc[tapology_events["id"] == "41373-ufc-207", "bestfightodds_id"] = 1210
tapology_events.loc[tapology_events["id"] == "43945-ufc-211", "bestfightodds_id"] = 1252
tapology_events.loc[
    tapology_events["id"] == "38669-ufc-fight-night-88", "bestfightodds_id"
] = 1104
tapology_events.loc[
    tapology_events["id"] == "28239-ufc-fight-night-65", "bestfightodds_id"
] = 936
tapology_events.loc[
    tapology_events["id"] == "44376-ufc-fight-night", "bestfightodds_id"
] = 1277
tapology_events.loc[
    tapology_events["id"] == "44562-ufc-fight-night-110", "bestfightodds_id"
] = 1275
tapology_events.loc[
    tapology_events["id"] == "26376-ufc-fight-night-55", "bestfightodds_id"
] = 894
tapology_events.loc[
    tapology_events["id"] == "45800-ufc-fight-night", "bestfightodds_id"
] = 1332
tapology_events.loc[
    tapology_events["id"] == "44728-ufc-fight-night", "bestfightodds_id"
] = 1310
tapology_events.loc[
    tapology_events["id"] == "16683-ufc-on-fox-9", "bestfightodds_id"
] = 729

tapology_events.to_csv(
    os.path.join(clean_data_dir, "Tapology", "events.csv"), index=False
)

## Gyms

In [8]:
tapology_gyms = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "gyms.csv"))
tapology_gyms = tapology_gyms.drop(columns=["parent_name"])

# Add parent ids to gyms where applicable
parent_ids_not_in_gym_ids = (
    tapology_gyms.loc[~tapology_gyms["parent_id"].isin(tapology_gyms["id"])][
        "parent_id"
    ]
    .dropna()
    .unique()
)

tapology_parent_gyms_missing = [
    {
        "id": "122-ultimate-athletics",
        "name": "Ultimate Athletics",
        "name_alternative": None,
        "location": "Ithaca, New York",
        "parent_id": None,
    },
    {
        "id": "166-paraestra-tokyo",
        "name": "Paraestra Tokyo",
        "name_alternative": "パラエストラ東京",
        "location": "Tokyo, Japan",
        "parent_id": None,
    },
    {
        "id": "1891-ysa",
        "name": "YSA",
        "name_alternative": "YAMAMOTO SPORTS ACADEMY",
        "location": "Tokyo, Japan",
        "parent_id": None,
    },
    {
        "id": "2988-john-frankl-brazilian-jiu-jitsu",
        "name": "John Frankl Brazilian Jiu Jitsu",
        "name_alternative": "John Frankl Brazilian Jiu-Jitsu Competition Team, JFBJJ",
        "location": "Seoul, South Korea",
        "parent_id": None,
    },
    {
        "id": "3547-reversal-gym-tokyo-standout",
        "name": "Reversal Gym Tokyo Standout",
        "name_alternative": "リバーサルジム東京スタンドアウト(STANDOUT)",
        "location": "Shibuya, Tokyo, Japan",
        "parent_id": None,
    },
    {
        "id": "3595-shooto",
        "name": "Japan Shooto Association",
        "name_alternative": None,
        "location": "Japan",
        "parent_id": None,
    },
    {
        "id": "366-premier-martial-arts-watertown",
        "name": "Premier Martial Arts Watertown",
        "name_alternative": "PMA MMA",
        "location": "Watertown, New York",
        "parent_id": None,
    },
    {
        "id": "3667-ps-lab-tokyo",
        "name": "P's Lab Tokyo",
        "name_alternative": "P's LAB東京 ゴールドジム原宿, パンクラスP'sLAB東京",
        "location": "Shibuya, Tokyo, Japan",
        "parent_id": None,
    },
    {
        "id": "3930-nova-unio-argentina",
        "name": "Nova União Argentina",
        "name_alternative": None,
        "location": "Argentina",
        "parent_id": "53-nova-unio",
    },
    {
        "id": "4276-ufc-gym",
        "name": "UFC Gym",
        "name_alternative": None,
        "location": "Santa Ana, California",
        "parent_id": None,
    },
    {
        "id": "615-straight-blast-gym-international",
        "name": "SBGi Portland",
        "name_alternative": "Straight Blast Gym International Portland",
        "location": "Portland, Oregon",
        "parent_id": None,
    },
    {
        "id": "700-fang-shen-do",
        "name": "Fang Shen Do",
        "name_alternative": None,
        "location": "Canada",
        "parent_id": None,
    },
    {
        "id": "731-tapout-training-center",
        "name": "Tapout Training Center Las Vegas",
        "name_alternative": None,
        "location": "Las Vegas, Nevada",
        "parent_id": None,
    },
    {
        "id": "766-10th-planet-jiu-jitsu-van-nuys",
        "name": "10th Planet Jiu Jitsu Van Nuys",
        "name_alternative": None,
        "location": "Van Nuys, California",
        "parent_id": "228-10th-planet-jiu-jitsu",
    },
    {
        "id": "9533-chute-boxe",
        "name": "Chute Boxe",
        "name_alternative": None,
        "location": "Curitiba, Parana, Brazil",
        "parent_id": None,
    },
]

tapology_parent_gyms_df = pd.DataFrame(tapology_parent_gyms_missing)
tapology_gyms = (
    pd.concat([tapology_gyms, tapology_parent_gyms_df], ignore_index=True)
    .sort_values(by="id")
    .reset_index(drop=True)
)

# Concatenate missing gyms which were hand collected
missing_gyms = pd.read_csv("tapology_missing_gyms.csv")
tapology_gyms = (
    pd.concat([tapology_gyms, missing_gyms])
    .drop_duplicates("id")
    .reset_index(drop=True)
)

# Clean up name
tapology_gyms["name"] = tapology_gyms["name"].str.replace("’", "'")

tapology_gyms.to_csv(os.path.join(clean_data_dir, "Tapology", "gyms.csv"), index=False)

## Fighter Gyms

In [9]:
tapology_bouts = pd.read_csv(os.path.join(raw_data_dir, "Tapology", "bouts.csv"))

# Isolate fighter-gym-bout relationships
temp = tapology_bouts[
    [
        "id",
        "fighter_1_id",
        "fighter_2_id",
        "fighter_1_gym_info",
        "fighter_1_gym_ids",
        "fighter_2_gym_info",
        "fighter_2_gym_ids",
    ]
].copy()

# Separate fighter 1 and fighter 2
temp_f1 = temp[
    ["fighter_1_id", "id", "fighter_1_gym_ids", "fighter_1_gym_info"]
].rename(
    columns={
        "fighter_1_id": "fighter_id",
        "fighter_1_gym_info": "gym_info",
        "fighter_1_gym_ids": "gym_ids",
        "id": "bout_id",
    }
)
temp_f2 = temp[
    ["fighter_2_id", "id", "fighter_2_gym_ids", "fighter_2_gym_info"]
].rename(
    columns={
        "fighter_2_id": "fighter_id",
        "fighter_2_gym_info": "gym_info",
        "fighter_2_gym_ids": "gym_ids",
        "id": "bout_id",
    }
)

# Concatenate into a stacked dataframe
tapology_fighter_gyms_by_bout = (
    pd.concat([temp_f1, temp_f2]).sort_index().reset_index(drop=True)
)
tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout.dropna(
    subset=["gym_info", "gym_ids"], how="all"
)

# Fill in missing gym IDs only if gym ID is missing
tapology_gyms = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "gyms.csv"))
name_to_id = tapology_gyms.set_index("name")["id"].to_dict()
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_ids"].isnull(), "gym_ids"
] = tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_ids"].isnull(), "gym_info"
].map(name_to_id)

name_alt_to_id = tapology_gyms.set_index("name_alternative")["id"].to_dict()
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_ids"].isnull(), "gym_ids"
] = tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_ids"].isnull(), "gym_info"
].map(name_alt_to_id)

# Rename duplicate gym names
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Ataque Duplo / Team Tavares",
    "gym_info",
] = "Ataque Duplo"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Ataque Duplo / Thiago Tavares Team",
    "gym_info",
] = "Ataque Duplo"

# Fill in missing gyms
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Beverly Hills Jiu-Jitsu Club",
    "gym_ids",
] = "3224-beverly-hills-jiu-jitsu-club"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Black House", "gym_ids"
] = "779-black-house-mma"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Black Tiger Team", "gym_ids"
] = "5048-black-tiger-fight-club"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Cesar Gracie Jiu-Jitsu", "gym_ids"
] = "783-cesar-gracie-jiu-jitsu"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Chute Boxe Academy", "gym_ids"
] = "5110-chute-boxe-academy"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Dragon's Lair MMA", "gym_ids"
] = "6182-dragons-lair-melksham"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Elite Training Center", "gym_ids"
] = "2607-elite-training-center"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "FFC Gym", "gym_ids"
] = "9865-ffc-team"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Fisticuff/Purebred Omiya", "gym_ids"
] = "5967-fisticuffspurebred"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Frankiko Team / Trator Team",
    "gym_ids",
] = "9465-frankiko-team"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Gile Ribeiro Team / Noguchi",
    "gym_ids",
] = "9213-gile-ribeiro-team"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Gladiators Training Academy",
    "gym_ids",
] = "2544-eugene-jacksons-undisputed-gladiators"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "HIT Squad", "gym_ids"
] = "1170-finneys-hit-squad"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Hilti NHB", "gym_ids"
] = "522-hilti-bjj"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "IN FIGHT JAPAN", "gym_ids"
] = "4366-infight-japan"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Integrated Fighting", "gym_ids"
] = "1214-integrated-fighting-academy"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Kaisho Martial Arts", "gym_ids"
] = "9339-kaisho-kampsport-klubb"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Lovato's BJJ", "gym_ids"
] = "5236-lovatos-brazilian-jiu-jitsu-and-mma"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "MMA Clinic", "gym_ids"
] = "5247-the-mma-clinic"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Next Generation UK", "gym_ids"
] = "3263-next-generation-mma-liverpool"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Perfect Team", "gym_ids"
] = "4933-perfect-team-mma"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Pro Athletes", "gym_ids"
] = "7684-proathlets"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Ralph Gracie", "gym_ids"
] = "12357-ralph-gracie-jiu-jitsu"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Range", "gym_ids"
] = "10526-range-martial-arts-academy"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Renyi Fight Camp", "gym_ids"
] = "2118-renyi"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Ruas Vale Tudo", "gym_ids"
] = "4382-ruas-vale-tudo"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Sure Grip Vale Tudo", "gym_ids"
] = "507-team-sure-grip"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Tapout Training Center", "gym_ids"
] = "731-tapout-training-center"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "The Jungle MMA", "gym_ids"
] = "117-the-jungle-mma-fitness"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Throwdown Elite Training Center",
    "gym_ids",
] = "732-throwdown-training-center"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Throwdown Training Center", "gym_ids"
] = "732-throwdown-training-center"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Todd Medina Freestyle Team", "gym_ids"
] = "4478-tod-medinas-fight-school"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Trojan Freefighters", "gym_ids"
] = "8081-trojan-free-fighters-gloucester"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "VT-1 Gym", "gym_ids"
] = "11234-vt1-martial-arts"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "VT1 Gym", "gym_ids"
] = "11234-vt1-martial-arts"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "West Coast Fight Team", "gym_ids"
] = "1106-west-coast-fight-team"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Wolfslair Academy", "gym_ids"
] = "3321-wolfslair-mma-academy"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Wolfslair MMA Academy", "gym_ids"
] = "3321-wolfslair-mma-academy"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "World Class MMA", "gym_ids"
] = "5210-world-class-mma"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Xplode MMA", "gym_ids"
] = "3389-xplode-mma"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Team Xplode MMA", "gym_ids"
] = "3389-xplode-mma"
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_info"] == "Zen Jiu-Jitsu", "gym_ids"
] = "4058-zen-jiu-jitsu"

# Explode gym IDs and gym info into separate rows
tapology_fighter_gyms_by_bout["gym_ids"] = tapology_fighter_gyms_by_bout[
    "gym_ids"
].str.split("; ")
tapology_fighter_gyms_by_bout["gym_info"] = tapology_fighter_gyms_by_bout[
    "gym_info"
].str.split("; ")
tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout.explode(
    ["gym_ids", "gym_info"]
).reset_index(drop=True)
tapology_fighter_gyms_by_bout["gym_purpose"] = tapology_fighter_gyms_by_bout[
    "gym_info"
].str.extract(r"\((.*)\)")[0]
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_purpose"] == "St. Petersburg", "gym_purpose"
] = np.nan
tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout.rename(
    columns={"gym_ids": "gym_id"}
)

# Fill gym_name with gym_info if gym_name is missing
gyms = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "gyms.csv"))[["id", "name"]]
gyms = gyms.rename(columns={"id": "gym_id", "name": "gym_name"})
tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout.merge(
    gyms, on="gym_id", how="left"
)
tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_name"].isnull(), "gym_name"
] = tapology_fighter_gyms_by_bout.loc[
    tapology_fighter_gyms_by_bout["gym_name"].isnull(), "gym_info"
]

# Select columns
tapology_fighter_gyms_by_bout = tapology_fighter_gyms_by_bout[
    ["fighter_id", "bout_id", "gym_id", "gym_name", "gym_purpose"]
]

tapology_fighter_gyms_by_bout.to_csv(
    os.path.join(clean_data_dir, "Tapology", "fighter_gyms.csv"), index=False
)

## Fighter Histories

In [10]:
tapology_fighter_histories = pd.read_csv(
    os.path.join(raw_data_dir, "Tapology", "fighter_histories.csv")
)

tapology_fighter_histories = (
    tapology_fighter_histories.loc[
        ~tapology_fighter_histories["outcome"].isin(["upcoming", "cancelled"])
    ]
    .copy()
    .reset_index(drop=True)
)
tapology_fighter_histories["outcome"] = tapology_fighter_histories[
    "outcome"
].str.title()
tapology_fighter_histories["order"] = (
    tapology_fighter_histories.groupby("fighter_id").cumcount() + 1
)

tapology_fighter_histories = tapology_fighter_histories.rename(
    columns={"weight_class": "weight_class_TEMP", "bout_id_int": "bout_id_integer"}
)


# Extract weight class name
def extract_weight_class(weight_class_str):
    if pd.isna(weight_class_str):
        return None

    weight_class_str_split = [x.strip() for x in weight_class_str.split(" · ")]
    if not weight_class_str_split[0][0].isdigit() and not weight_class_str_split[
        0
    ].startswith("Weigh-In:"):
        return weight_class_str_split[0].strip()
    else:
        return None


tapology_fighter_histories["weight_class"] = tapology_fighter_histories[
    "weight_class_TEMP"
].apply(extract_weight_class)


# Extract weight class weight
def extract_weight_class_lbs(weight_class_str):
    if pd.isna(weight_class_str):
        return None

    weight_lbs = None
    weight_class_str_split = [x.strip() for x in weight_class_str.split(" · ")]
    for x in weight_class_str_split:
        if x[0].isdigit() and not x.startswith("Weigh-In:"):
            weight_str = [y for y in x.split(" (") if "lbs" in y][0]
            weight_lbs = float(weight_str.replace("lbs", "").replace(")", "").strip())
            break

    return weight_lbs


tapology_fighter_histories["weight_class_lbs"] = tapology_fighter_histories[
    "weight_class_TEMP"
].apply(extract_weight_class_lbs)


def get_weigh_in_result_lbs(row):
    if not pd.isna(row["weight"]):
        weight_str = [x for x in row["weight"].split(" (") if "lbs" in x][0]

        return float(weight_str.replace("lbs", "").replace(")", "").strip())
    elif pd.isna(row["weight"]) and not pd.isna(row["weight_class_TEMP"]):
        if "Weigh-In:" not in row["weight_class_TEMP"]:
            return None
        weight_str = [
            x.strip() for x in row["weight_class_TEMP"].split(" · ") if "Weigh-In:" in x
        ][0]
        weight_str = [x for x in weight_str.split(" (") if "lbs" in x][0]
        weight_lbs = float(
            weight_str.replace("lbs", "")
            .replace(")", "")
            .replace("Weigh-In:", "")
            .strip()
        )

        return weight_lbs
    else:
        return None


tapology_fighter_histories["weigh_in_result_lbs"] = tapology_fighter_histories.apply(
    get_weigh_in_result_lbs,  # type: ignore
    axis=1,
)  # type: ignore

tapology_fighter_histories["odds"] = tapology_fighter_histories["odds"].astype("Int64")
tapology_fighter_histories["pick_em_percent"] = (
    tapology_fighter_histories["pick_em"].str.replace("%", "").astype("Int64")
)


def extract_outcome_stuff(outcome_details_str):
    if pd.isna(outcome_details_str):
        return None, None, None, None

    outcome_details_split = [
        x.strip() for x in outcome_details_str.split(" · ") if x.strip()
    ]
    outcome_method = None
    outcome_method_details = None
    end_round = None
    end_round_time_seconds = None
    for x in outcome_details_split:
        if x[0].isdigit() and ":" in x:
            assert end_round is None, f"{outcome_details_str}"
            time_split = x.split(":")
            assert len(time_split) == 2, f"{outcome_details_str}"

            end_round_time_seconds = int(time_split[0]) * 60 + int(time_split[1])
        elif x[0] == "R" and x[1].isdigit():
            assert end_round is None, f"{outcome_details_str}"
            end_round = int(x.replace("R", ""))
        elif outcome_method is None:
            outcome_method = x
        else:
            if outcome_method_details is None:
                outcome_method_details = x
            else:
                outcome_method_details += " - " + x

    return outcome_method, outcome_method_details, end_round, end_round_time_seconds


tapology_fighter_histories[
    ["outcome_method", "outcome_method_details", "end_round", "end_round_time_seconds"]
] = (
    tapology_fighter_histories["outcome_details"]
    .apply(extract_outcome_stuff)
    .apply(pd.Series)
)

tapology_fighter_histories["end_round"] = tapology_fighter_histories[
    "end_round"
].astype("Int64")
tapology_fighter_histories["end_round_time_seconds"] = tapology_fighter_histories[
    "end_round_time_seconds"
].astype("Int64")

tapology_fighter_histories = tapology_fighter_histories[
    [
        "fighter_id",
        "order",
        "bout_id",
        "bout_id_integer",
        "event_id",
        "event_name",
        "opponent_id",
        "billing",
        "round_time_format",
        "weight_class",
        "weight_class_lbs",
        "outcome",
        "outcome_method",
        "outcome_method_details",
        "end_round",
        "end_round_time_seconds",
        "fighter_record",
        "opponent_record",
        "weigh_in_result_lbs",
        "odds",
        "pick_em_percent",
    ]
]

tapology_fighter_histories.to_csv(
    os.path.join(clean_data_dir, "Tapology", "fighter_histories.csv"), index=False
)

## Fighters

In [12]:
tapology_fighters = pd.read_csv(
    os.path.join(raw_data_dir, "Tapology", "fighters.csv"),
    parse_dates=["date_of_birth"],
)


# Convert height string to inches
def convert_height(height_str):
    if pd.isna(height_str):
        return np.nan

    height_str = height_str.split("(")[0].replace('"', "").strip()
    feet, inches = map(int, height_str.split("'"))

    return feet * 12 + inches


tapology_fighters["height"] = tapology_fighters["height"].apply(convert_height)
tapology_fighters = tapology_fighters.rename(columns={"height": "height_inches"})
tapology_fighters["height_inches"] = tapology_fighters["height_inches"].astype("Int64")


# Do same for reach
def clean_reach(reach_str):
    if pd.isna(reach_str):
        return np.nan

    reach_str = reach_str.split("(")[0].replace('"', "").strip()

    return float(reach_str)


tapology_fighters["reach"] = tapology_fighters["reach"].apply(clean_reach)
tapology_fighters = tapology_fighters.rename(columns={"reach": "reach_inches"})

# Fill in missing Best Fight Odds IDs manually
tapology_fighters["bestfightodds_id"] = (
    tapology_fighters["bestfightodds_id"].str.split("-").str[-1].astype("Int64")
)
tapology_fighters.loc[
    tapology_fighters["id"] == "70996-fredy-serrano", "bestfightodds_id"
] = 5321
tapology_fighters.loc[
    tapology_fighters["id"] == "43585-levan-makashvili", "bestfightodds_id"
] = 5499
tapology_fighters.loc[
    tapology_fighters["id"] == "24897-sirwan-kakai-zohan", "bestfightodds_id"
] = 2503
tapology_fighters.loc[
    tapology_fighters["id"] == "41568-tony-sims", "bestfightodds_id"
] = 5715
tapology_fighters.loc[
    tapology_fighters["id"] == "20516-andrew-holbrook", "bestfightodds_id"
] = 5727
tapology_fighters.loc[
    tapology_fighters["id"] == "89121-adam-yandiev", "bestfightodds_id"
] = 8459
tapology_fighters.loc[
    tapology_fighters["id"] == "74166-jin-soo-son", "bestfightodds_id"
] = 8537
tapology_fighters.loc[
    tapology_fighters["id"] == "123362-michel-batista", "bestfightodds_id"
] = 7819
tapology_fighters.loc[
    tapology_fighters["id"] == "24429-reginaldo-vieira", "bestfightodds_id"
] = 5722
tapology_fighters.loc[
    tapology_fighters["id"] == "4420-erik-montano", "bestfightodds_id"
] = 5985
tapology_fighters.loc[
    tapology_fighters["id"] == "33349-geane-herrera-la-pulga", "bestfightodds_id"
] = 5729
tapology_fighters.loc[tapology_fighters["id"] == "steve-bosse", "bestfightodds_id"] = (
    5606
)
tapology_fighters.loc[
    tapology_fighters["id"] == "60336-james-mulheron", "bestfightodds_id"
] = 6427
tapology_fighters.loc[
    tapology_fighters["id"] == "4430-bojan-mihajlovic", "bestfightodds_id"
] = 6191
tapology_fighters.loc[
    tapology_fighters["id"] == "25001-bharat-khandare-daring", "bestfightodds_id"
] = 7727
tapology_fighters.loc[
    tapology_fighters["id"] == "90671-carls-john-de-tomas-goldenboy", "bestfightodds_id"
] = 7126
tapology_fighters.loc[
    tapology_fighters["id"] == "55567-jesus-pinedo", "bestfightodds_id"
] = 8714
tapology_fighters.loc[
    tapology_fighters["id"] == "24602-joe-meunier", "bestfightodds_id"
] = 6502
tapology_fighters.loc[
    tapology_fighters["id"] == "39663-abdul-kerim-edilov", "bestfightodds_id"
] = 7351
tapology_fighters.loc[
    tapology_fighters["id"] == "98431-khalid-murtazaliev", "bestfightodds_id"
] = 8169
tapology_fighters.loc[
    tapology_fighters["id"] == "mark-scanlon-scanno", "bestfightodds_id"
] = 2162
tapology_fighters.loc[
    tapology_fighters["id"] == "jesse-bongfeldt-water", "bestfightodds_id"
] = 2169
tapology_fighters.loc[
    tapology_fighters["id"] == "65391-kwan-ho-kwak", "bestfightodds_id"
] = 6818
tapology_fighters.loc[
    tapology_fighters["id"] == "19970-yusuke-kasuya", "bestfightodds_id"
] = 5793
tapology_fighters.loc[tapology_fighters["id"] == "alex-ricci", "bestfightodds_id"] = (
    2588
)
tapology_fighters.loc[
    tapology_fighters["id"] == "4847-cindy-dandois-battlecat", "bestfightodds_id"
] = 5334
tapology_fighters.loc[
    tapology_fighters["id"] == "11166-wagner-campos-galeto", "bestfightodds_id"
] = 3422
tapology_fighters.loc[
    tapology_fighters["id"] == "delson-heleno-pe-de-chumbo", "bestfightodds_id"
] = 329
tapology_fighters.loc[
    tapology_fighters["id"] == "7793-anistavo-gasparzinho", "bestfightodds_id"
] = 3417
tapology_fighters.loc[
    tapology_fighters["id"] == "31203-pedro-nobre-the-rock", "bestfightodds_id"
] = 3819
tapology_fighters.loc[
    tapology_fighters["id"] == "34691-adam-cella", "bestfightodds_id"
] = 4006
tapology_fighters.loc[
    tapology_fighters["id"] == "37069-justin-jones", "bestfightodds_id"
] = 4930
tapology_fighters.loc[
    tapology_fighters["id"] == "15647-emily-kagen", "bestfightodds_id"
] = 3802
tapology_fighters.loc[
    tapology_fighters["id"] == "42696-bentley-syler", "bestfightodds_id"
] = 5320
tapology_fighters.loc[
    tapology_fighters["id"] == "27661-rocky-lee", "bestfightodds_id"
] = 5528
tapology_fighters.loc[
    tapology_fighters["id"] == "44658-izabela-badurek", "bestfightodds_id"
] = 5540
tapology_fighters.loc[
    tapology_fighters["id"] == "23167-jonavin-webb", "bestfightodds_id"
] = 5597
tapology_fighters.loc[
    tapology_fighters["id"] == "50067-ericka-almeida", "bestfightodds_id"
] = 5633
tapology_fighters.loc[
    tapology_fighters["id"] == "steven-kennedy", "bestfightodds_id"
] = 5684
tapology_fighters.loc[
    tapology_fighters["id"] == "11579-steve-montgomery", "bestfightodds_id"
] = 5070
tapology_fighters.loc[
    tapology_fighters["id"] == "32886-fernando-bruno", "bestfightodds_id"
] = 5719
tapology_fighters.loc[
    tapology_fighters["id"] == "21482-roger-zapata-viva", "bestfightodds_id"
] = 5511
tapology_fighters.loc[
    tapology_fighters["id"] == "55402-bruno-korea", "bestfightodds_id"
] = 5922
tapology_fighters.loc[
    tapology_fighters["id"] == "12613-anthony-christodoulou-tony", "bestfightodds_id"
] = 5416
tapology_fighters.loc[
    tapology_fighters["id"] == "12724-lukasz-sajewski-wookie", "bestfightodds_id"
] = 5113
tapology_fighters.loc[
    tapology_fighters["id"] == "54486-joe-merritt", "bestfightodds_id"
] = 5714
tapology_fighters.loc[
    tapology_fighters["id"] == "32791-anton-zafir", "bestfightodds_id"
] = 5970
tapology_fighters.loc[
    tapology_fighters["id"] == "85356-vernon-ramos", "bestfightodds_id"
] = 5989
tapology_fighters.loc[
    tapology_fighters["id"] == "36474-enrique-marin", "bestfightodds_id"
] = 5984
tapology_fighters.loc[
    tapology_fighters["id"] == "abner-lloveras", "bestfightodds_id"
] = 6087
tapology_fighters.loc[
    tapology_fighters["id"] == "43005-joey-gomez", "bestfightodds_id"
] = 6023
tapology_fighters.loc[
    tapology_fighters["id"] == "10444-mehdi-baghdad", "bestfightodds_id"
] = 4650
tapology_fighters.loc[
    tapology_fighters["id"] == "felipe-olivieri", "bestfightodds_id"
] = 5966
tapology_fighters.loc[
    tapology_fighters["id"] == "51885-kelly-faszholz", "bestfightodds_id"
] = 6036
tapology_fighters.loc[
    tapology_fighters["id"] == "82492-cristina-stanciu", "bestfightodds_id"
] = 6269
tapology_fighters.loc[
    tapology_fighters["id"] == "18345-cody-east-the-freight-train", "bestfightodds_id"
] = 5221
tapology_fighters.loc[
    tapology_fighters["id"] == "46288-jason-novelli", "bestfightodds_id"
] = 5779
tapology_fighters.loc[
    tapology_fighters["id"] == "54222-chris-avila", "bestfightodds_id"
] = 5381
tapology_fighters.loc[
    tapology_fighters["id"] == "16092-leonardo-guimaraes-leleco", "bestfightodds_id"
] = 6243

# Correct wrong Best Fight Odds IDs
tapology_fighters.loc[
    tapology_fighters["id"] == "339509-robelis-despaigne", "bestfightodds_id"
] = 16067
tapology_fighters.loc[
    tapology_fighters["id"] == "4635-kim-dong-hyun", "bestfightodds_id"
] = 6915
tapology_fighters.loc[
    tapology_fighters["id"] == "59819-bruno-silva-blindado", "bestfightodds_id"
] = 11513

tapology_fighters.to_csv(
    os.path.join(clean_data_dir, "Tapology", "fighters.csv"), index=False
)

## Rehydration Weights

In [13]:
tapology_rehydration_weights = pd.read_csv(
    os.path.join(raw_data_dir, "Tapology", "csac_rehydration_weights.csv")
)

# Extract numeric parts of weight strings and convert to float
tapology_rehydration_weights["weigh_in_result"] = (
    tapology_rehydration_weights["weigh_in_result"]
    .str.split("(")
    .str[0]
    .str.replace("lbs", "")
    .astype(float)
)
tapology_rehydration_weights["fight_night_weight"] = (
    tapology_rehydration_weights["fight_night_weight"]
    .str.split("(")
    .str[0]
    .str.replace("lbs", "")
    .astype(float)
)
tapology_rehydration_weights["weight_gain"] = (
    tapology_rehydration_weights["weight_gain"]
    .str.split("(")
    .str[0]
    .str.replace("lbs", "")
    .astype(float)
)

tapology_rehydration_weights = tapology_rehydration_weights.rename(
    columns={
        "weigh_in_result": "weigh_in_result_lbs",
        "fight_night_weight": "fight_night_weight_lbs",
        "weight_gain": "weight_gain_lbs",
    }
)

tapology_rehydration_weights.to_csv(
    os.path.join(clean_data_dir, "Tapology", "rehydration_weights.csv"), index=False
)