# Data Cleaning - ESPN

In [1]:
# standard library imports
import os

# third party imports
import numpy as np
import pandas as pd

# local imports


# Locate the project's data folders. Note that "__file__" is a string literal
# here (presumably because the real __file__ is not defined inside a notebook
# kernel), so dirname() returns "" and every path below is resolved relative
# to the kernel's working directory.
data_dir = os.path.join(os.path.dirname("__file__"), "..", "..", "data")
raw_data_dir, clean_data_dir = (
    os.path.join(data_dir, stage) for stage in ("raw", "clean")
)

## Bout Stats

In [2]:
# Load the raw per-fighter, per-bout statistics table.
espn_bout_stats = pd.read_csv(
    os.path.join(raw_data_dir, "ESPN", "fighter_bout_statistics.csv")
)

# Every column that is not an identifier/ordering key is treated as a
# statistic and cast to the nullable Int64 dtype, which can hold missing
# values alongside integers.
stat_cols = [
    col
    for col in espn_bout_stats.columns
    if col not in {"fighter_id", "order", "bout_id", "event_id"}
]
espn_bout_stats = espn_bout_stats.astype({col: "Int64" for col in stat_cols})

# Drop the unused keys, put the two remaining keys first, and order the rows
# by bout and then fighter with a fresh 0..n-1 index.
espn_bout_stats = (
    espn_bout_stats.drop(columns=["order", "event_id"])
    .loc[:, ["bout_id", "fighter_id"] + stat_cols]
    .sort_values(by=["bout_id", "fighter_id"])
    .reset_index(drop=True)
)

bout_stats_output = os.path.join(clean_data_dir, "ESPN", "bout_stats.csv")
espn_bout_stats.to_csv(bout_stats_output, index=False)

## Bouts

In [4]:
# Load the raw bouts table, store the winner's fighter ID as a nullable
# integer (Int64 can hold missing values), and replace the red/blue corner
# column names, which were misleading, with neutral fighter_1/fighter_2 names.
espn_bouts = (
    pd.read_csv(os.path.join(raw_data_dir, "ESPN", "bouts.csv"))
    .astype({"winner_id": "Int64"})
    .rename(
        columns={
            "red_fighter_id": "fighter_1_id",
            "blue_fighter_id": "fighter_2_id",
        }
    )
)

espn_bouts.to_csv(
    os.path.join(clean_data_dir, "ESPN", "bouts.csv"),
    index=False,
)

## Events

In [5]:
espn_events = pd.read_csv(os.path.join(raw_data_dir, "ESPN", "events.csv"))

# Parse the raw timestamp column once and derive both the calendar date and
# the hour of day from it.
# NOTE(review): the "hour_utc" name assumes the raw timestamps are expressed
# in UTC — confirm against the data source.
event_timestamps = pd.to_datetime(espn_events["date"])

# Venue IDs become nullable integers; only the listed columns are exported.
espn_events = espn_events.assign(
    venue_id=espn_events["venue_id"].astype("Int64"),
    date=event_timestamps.dt.date,
    hour_utc=event_timestamps.dt.hour,
)[["id", "name", "date", "hour_utc", "venue_id", "event_order"]]

espn_events.to_csv(
    os.path.join(clean_data_dir, "ESPN", "events.csv"),
    index=False,
)

## Fighter Histories

In [6]:
# Load the raw fighter histories and, in one pass:
#   * cast opponent_id and end_round to the nullable Int64 dtype,
#   * park the raw timestamp under "date_TEMP",
#   * derive the calendar date and the hour of day from that timestamp.
# NOTE(review): the "hour_utc" name assumes the raw timestamps are expressed
# in UTC — confirm against the data source.
espn_fighter_histories = (
    pd.read_csv(os.path.join(raw_data_dir, "ESPN", "fighter_histories.csv"))
    .astype({"opponent_id": "Int64", "end_round": "Int64"})
    .rename(columns={"date": "date_TEMP"})
    .assign(
        date=lambda df: pd.to_datetime(df["date_TEMP"]).dt.date,
        hour_utc=lambda df: pd.to_datetime(df["date_TEMP"]).dt.hour,
    )
)

# An end round of 0 is replaced with a missing value.
espn_fighter_histories.loc[
    espn_fighter_histories["end_round"].eq(0), "end_round"
] = np.nan


# Time string conversion
def convert_time(time_str):
    """Convert a ``"minutes:seconds"`` string into a total number of seconds.

    Parameters
    ----------
    time_str : str or missing value
        Clock time such as ``"4:35"``. A missing value or the placeholder
        ``"-"`` is treated as "no time recorded".

    Returns
    -------
    int or float
        ``minutes * 60 + seconds``, or ``np.nan`` when no time is recorded.

    Raises
    ------
    ValueError
        If the string does not consist of exactly two integer parts
        separated by ``":"``.
    """
    no_time_recorded = pd.isna(time_str) or time_str == "-"
    if not no_time_recorded:
        mins, secs = (int(part) for part in time_str.split(":"))
        return 60 * mins + secs

    return np.nan


# Convert each end-of-bout clock string to whole seconds, then store the
# result as nullable Int64 so rows without a recorded time stay missing.
espn_fighter_histories["end_round_time_seconds"] = (
    espn_fighter_histories["end_round_time"].apply(convert_time).astype("Int64")
)


def calculate_total_time_seconds(row, round_length_seconds=300):
    """Return the total elapsed bout time, in seconds, for one history row.

    Every round before the final one is counted at full length and the time
    elapsed in the final round is added on top.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` from ``DataFrame.apply``)
        Must provide ``"end_round"`` (1-based number of the round in which
        the bout ended) and ``"end_round_time_seconds"`` (seconds elapsed in
        that round).
    round_length_seconds : int, default 300
        Length of one full round in seconds. The default keeps the previous
        hard-coded five-minute rounds; pass another value for bouts that use
        a different round length.

    Returns
    -------
    int or float
        ``(end_round - 1) * round_length_seconds + end_round_time_seconds``,
        or ``np.nan`` when either input is missing.
    """
    if pd.isna(row["end_round"]) or pd.isna(row["end_round_time_seconds"]):
        return np.nan

    completed_rounds = row["end_round"] - 1
    return completed_rounds * round_length_seconds + row["end_round_time_seconds"]


# Compute the total elapsed time row by row and keep it as nullable Int64 so
# rows with a missing round or time stay missing.
espn_fighter_histories["total_time_seconds"] = espn_fighter_histories.apply(
    calculate_total_time_seconds, axis=1
).astype("Int64")

# Keep only the exported columns, in their final order.
espn_fighter_histories = espn_fighter_histories.loc[
    :,
    [
        "fighter_id",
        "order",
        "bout_id",
        "event_id",
        "event_name",
        "date",
        "hour_utc",
        "opponent_id",
        "outcome",
        "outcome_method",
        "end_round",
        "end_round_time_seconds",
        "total_time_seconds",
        "is_title_bout",
    ],
]

fighter_histories_output = os.path.join(
    clean_data_dir, "ESPN", "fighter_histories.csv"
)
espn_fighter_histories.to_csv(fighter_histories_output, index=False)

## Fighters

In [7]:
# Load the raw fighters table and, in one pass:
#   * cast team_id to the nullable Int64 dtype,
#   * parse date_of_birth, which is written as day/month/year,
#   * strip the double-quote mark from the reach strings and store the
#     remaining number as a float in reach_inches.
espn_fighters = (
    pd.read_csv(os.path.join(raw_data_dir, "ESPN", "fighters.csv"))
    .astype({"team_id": "Int64"})
    .assign(
        date_of_birth=lambda df: pd.to_datetime(
            df["date_of_birth"], format="%d/%m/%Y"
        ),
        reach_inches=lambda df: df["reach"].str.replace('"', "").astype("float"),
    )
)

# The "--" placeholder in stance is replaced with a missing value.
espn_fighters.loc[espn_fighters["stance"].eq("--"), "stance"] = np.nan


# Convert height string to inches
def convert_height(height_str):
    """Convert a feet-and-inches height string into total inches.

    Parameters
    ----------
    height_str : str or missing value
        Height written as feet and inches separated by whitespace, with
        optional ``'`` and ``"`` marks, e.g. ``5' 11"``.

    Returns
    -------
    int or float
        ``feet * 12 + inches``, or ``np.nan`` when the input is missing.

    Raises
    ------
    ValueError
        If, once the quote marks are removed, the string does not split into
        exactly two integer tokens.
    """
    if not pd.isna(height_str):
        # Remove the foot/inch marks so only the two numbers remain.
        cleaned = height_str.replace("'", "").replace('"', "")
        feet, inches = (int(token) for token in cleaned.split())
        return 12 * feet + inches

    return np.nan


# Convert each height string to whole inches and store it as nullable Int64
# so fighters without a recorded height stay missing.
espn_fighters["height_inches"] = (
    espn_fighters["height"].apply(convert_height).astype("Int64")
)

# Keep only the exported columns, in their final order.
espn_fighters = espn_fighters.loc[
    :,
    [
        "id",
        "name",
        "nickname",
        "date_of_birth",
        "reach_inches",
        "height_inches",
        "stance",
        "team_id",
        "nationality",
        "fighting_style",
    ],
]

espn_fighters.to_csv(
    os.path.join(clean_data_dir, "ESPN", "fighters.csv"),
    index=False,
)

## Teams

No further cleaning is needed: the raw teams table is read and written to the clean directory without any transformations.

In [8]:
# Pass-through step: read the raw teams table and write it to the clean
# directory without applying any transformation.
espn_teams = pd.read_csv(
    filepath_or_buffer=os.path.join(raw_data_dir, "ESPN", "teams.csv")
)
espn_teams.to_csv(
    path_or_buf=os.path.join(clean_data_dir, "ESPN", "teams.csv"),
    index=False,
)

## Venues

No further cleaning is needed: the raw venues table is read and written to the clean directory without any transformations.

In [9]:
# Pass-through step: read the raw venues table and write it to the clean
# directory without applying any transformation.
espn_venues = pd.read_csv(
    filepath_or_buffer=os.path.join(raw_data_dir, "ESPN", "venues.csv")
)
espn_venues.to_csv(
    path_or_buf=os.path.join(clean_data_dir, "ESPN", "venues.csv"),
    index=False,
)