# Data Cleaning - MMA Decisions

In [23]:
# standard library imports
import os

# third party imports
import numpy as np
import pandas as pd

# local imports


# NOTE: "__file__" is a quoted string here, not the module attribute, so
# os.path.dirname() returns "" and every path below is resolved relative to
# the current working directory.
data_dir = os.path.join(os.path.dirname("__file__"), os.pardir, os.pardir, "data")
raw_data_dir, clean_data_dir = (
    os.path.join(data_dir, subdir) for subdir in ("raw", "clean")
)

In [24]:
# Fighters that appear under two IDs in the source data (bad data quality).
# Keys are the duplicate IDs; values are the IDs they are replaced with.
dupe_fighter_map = {4934: 6373, 6748: 6394}

## Bouts

In [25]:
# Load the raw bouts and remap both fighter columns so that duplicate fighter
# IDs are replaced by the IDs given in dupe_fighter_map
mmadecisions_bouts = pd.read_csv(
    os.path.join(raw_data_dir, "MMA Decisions", "bouts.csv")
).assign(
    fighter_1_id=lambda bouts: bouts["fighter_1_id"].replace(dupe_fighter_map),
    fighter_2_id=lambda bouts: bouts["fighter_2_id"].replace(dupe_fighter_map),
)

# Write the cleaned table, leaving out the DataFrame index
mmadecisions_bouts.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "bouts.csv"),
    index=False,
)

## Deductions

In [26]:
# Load the raw deductions and replace duplicate fighter IDs with the IDs
# given in dupe_fighter_map
mmadecisions_deductions = pd.read_csv(
    os.path.join(raw_data_dir, "MMA Decisions", "deductions.csv")
).assign(
    fighter_id=lambda deductions: deductions["fighter_id"].replace(
        dupe_fighter_map
    ),
)

# Write the cleaned table, leaving out the DataFrame index
mmadecisions_deductions.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "deductions.csv"),
    index=False,
)

## Events

In [27]:
# Load the raw events, parsing the "date" column as datetimes
mmadecisions_events = pd.read_csv(
    os.path.join(raw_data_dir, "MMA Decisions", "events.csv"),
    parse_dates=["date"],
)

# Event 1503 is a PFL event that the raw data labels as UFC
is_event_1503 = mmadecisions_events["id"].eq(1503)
mmadecisions_events.loc[is_event_1503, "promotion"] = "PFL"

# Write the cleaned table, leaving out the DataFrame index
mmadecisions_events.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "events.csv"),
    index=False,
)

## Fighters

In [28]:
# Load the raw fighters (parsing "date_of_birth" as datetimes) and drop every
# row whose id is one of the duplicate IDs, i.e. a key of dupe_fighter_map
mmadecisions_fighters = pd.read_csv(
    os.path.join(raw_data_dir, "MMA Decisions", "fighters.csv"),
    parse_dates=["date_of_birth"],
).loc[lambda fighters: ~fighters["id"].isin(dupe_fighter_map)]


# Helper function to convert height string to inches
def convert_height(height_str) -> float:
    """Convert a feet-and-inches height string to total inches.

    Parameters
    ----------
    height_str : str or missing value
        Height written as feet, an apostrophe, then inches with an optional
        trailing double quote, e.g. ``5'11"`` or ``5'11.5"``. The inches part
        may be empty (``6'``), in which case it counts as zero.

    Returns
    -------
    float
        ``feet * 12 + inches``, or ``np.nan`` when ``height_str`` is missing
        according to ``pd.isna``.

    Raises
    ------
    IndexError
        If ``height_str`` contains no apostrophe.
    ValueError
        If the feet or inches part is not numeric.
    """
    if pd.isna(height_str):
        return np.nan

    parts = height_str.split("'")
    feet = int(parts[0])

    # Drop the trailing inch mark; what is left may be empty for a height
    # recorded as feet only, which float() would otherwise reject.
    inches_text = parts[1].replace('"', "").strip()
    inches = float(inches_text) if inches_text else 0.0

    return feet * 12 + inches


# Rename the column so its name carries the unit, then convert each height
# string to inches in place (the column keeps its original position)
mmadecisions_fighters = mmadecisions_fighters.rename(
    columns={"height": "height_inches"}
)
mmadecisions_fighters["height_inches"] = mmadecisions_fighters["height_inches"].apply(
    convert_height
)

# Write the cleaned table, leaving out the DataFrame index
mmadecisions_fighters.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "fighters.csv"),
    index=False,
)

## Judge Scores

In [29]:
# Load the raw judge scores and cast the ID and score columns to pandas'
# nullable Int64 dtype in a single pass
# NOTE(review): presumably these columns contain missing values, which is why
# the nullable integer dtype is used - confirm against the raw data
mmadecisions_judge_scores = pd.read_csv(
    os.path.join(raw_data_dir, "MMA Decisions", "judge_scores.csv")
).astype(
    {
        "judge_id": "Int64",
        "fighter_1_score": "Int64",
        "fighter_2_score": "Int64",
    }
)

# Write the cleaned table, leaving out the DataFrame index
mmadecisions_judge_scores.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "judge_scores.csv"),
    index=False,
)

## Judges

No further cleaning needed

In [30]:
# Pass the judges table through unchanged: read it from the raw directory and
# write it to the clean directory without the DataFrame index
judges_relative_path = os.path.join("MMA Decisions", "judges.csv")
mmadecisions_judges = pd.read_csv(os.path.join(raw_data_dir, judges_relative_path))
mmadecisions_judges.to_csv(
    os.path.join(clean_data_dir, judges_relative_path),
    index=False,
)

## Media Scores

In [31]:
# Read from the raw directory, like every other table in this notebook.
# Reading from the clean directory made this cell depend on its own previous
# output, so it could not run on a fresh checkout.
# NOTE(review): assumes raw/MMA Decisions/media_scores.csv exists - confirm
mmadecisions_media_scores = pd.read_csv(
    os.path.join(raw_data_dir, "MMA Decisions", "media_scores.csv")
)

# Create ordering column as auxiliary info: the 1-based position of each row
# within its bout, following the row order of the input file
mmadecisions_media_scores["order"] = (
    mmadecisions_media_scores.groupby("bout_id").cumcount() + 1
)

# Write the cleaned table, leaving out the DataFrame index
mmadecisions_media_scores.to_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "media_scores.csv"), index=False
)