# Data Cleaning - Fight Matrix

In [1]:
# standard library imports
import os

# third party imports
import numpy as np
import pandas as pd

# local imports


# `__file__` is not defined inside a notebook, so the quoted literal is used on
# purpose: its dirname is the empty string, which makes every path below
# relative to the current working directory.
notebook_dir = os.path.dirname("__file__")
data_dir = os.path.join(notebook_dir, os.pardir, os.pardir, "data")
raw_data_dir, clean_data_dir = (
    os.path.join(data_dir, stage) for stage in ("raw", "clean")
)

## Bouts

In [2]:
bouts_raw_path = os.path.join(raw_data_dir, "Fight Matrix", "bouts.csv")
bouts_clean_path = os.path.join(clean_data_dir, "Fight Matrix", "bouts.csv")

# Load the raw bouts and turn every -1 in the frame into NaN
fightmatrix_bouts = pd.read_csv(bouts_raw_path).replace(-1, np.nan)

# Wherever the post-fight Glicko-1 rating is -500, overwrite it with the
# pre-fight rating of the same fighter in the same bout
for pre_col, post_col in (
    ("fighter_1_glicko_1_pre", "fighter_1_glicko_1_post"),
    ("fighter_2_glicko_1_pre", "fighter_2_glicko_1_post"),
):
    is_placeholder = fightmatrix_bouts[post_col] == -500
    fightmatrix_bouts.loc[is_placeholder, post_col] = fightmatrix_bouts[pre_col]

# Store all rating columns as nullable integers (Int64) so the NaNs
# introduced above do not force them to float
rating_columns = [
    "fighter_1_elo_k170_pre",
    "fighter_1_elo_k170_post",
    "fighter_1_elo_modified_pre",
    "fighter_1_elo_modified_post",
    "fighter_1_glicko_1_pre",
    "fighter_1_glicko_1_post",
    "fighter_2_elo_k170_pre",
    "fighter_2_elo_k170_post",
    "fighter_2_elo_modified_pre",
    "fighter_2_elo_modified_post",
    "fighter_2_glicko_1_pre",
    "fighter_2_glicko_1_post",
]
fightmatrix_bouts[rating_columns] = fightmatrix_bouts[rating_columns].astype("Int64")


fightmatrix_bouts.to_csv(bouts_clean_path, index=False)

## Events

In [3]:
events_raw_path = os.path.join(raw_data_dir, "Fight Matrix", "events.csv")
events_clean_path = os.path.join(clean_data_dir, "Fight Matrix", "events.csv")

# Read the raw events with `date` parsed as a datetime, and store
# `event_order` as a nullable integer (Int64)
fightmatrix_events = pd.read_csv(events_raw_path, parse_dates=["date"]).astype(
    {"event_order": "Int64"}
)

fightmatrix_events.to_csv(events_clean_path, index=False)

## Fighter Histories

For context, the biggest issue with this data is that at early MMA events, fighters could fight multiple times in one evening. Fight Matrix does a poor job of keeping track of the order of such fights on each fighter's profile, so we flag these fights with `bad_ordering_flag` = 1 to correct later. My strategy was to deduce the order of these fights from the pre- and post-fight Elo/Glicko ratings when possible, and to manually handle any remaining ambiguous cases.

In [4]:
fightmatrix_fighter_histories = pd.read_csv(
    os.path.join(raw_data_dir, "Fight Matrix", "fighter_histories.csv"),
    parse_dates=["date"],
)

# Remove rows whose pre-fight Elo (K170) is -1.
# The explicit .copy() turns the filtered result into an independent frame, so
# the column assignments below do not write into a slice of the raw frame
# (which triggers pandas' SettingWithCopyWarning).
fightmatrix_fighter_histories = fightmatrix_fighter_histories.loc[
    fightmatrix_fighter_histories["fighter_elo_k170_pre"] != -1
].copy()

# Provisional per-fighter fight number (1, 2, 3, ...) in the order the rows
# appear in the raw file; later renamed to `order` for fighters whose
# ordering is not flagged as bad
fightmatrix_fighter_histories["temp_order"] = (
    fightmatrix_fighter_histories.groupby("fighter_id").cumcount() + 1
)

# Replace -500s: wherever a post-fight Glicko-1 rating is -500, overwrite it
# with the matching pre-fight rating from the same row
fightmatrix_fighter_histories.loc[
    fightmatrix_fighter_histories["fighter_glicko_1_post"] == -500,
    "fighter_glicko_1_post",
] = fightmatrix_fighter_histories["fighter_glicko_1_pre"]
fightmatrix_fighter_histories.loc[
    fightmatrix_fighter_histories["opponent_glicko_1_post"] == -500,
    "opponent_glicko_1_post",
] = fightmatrix_fighter_histories["opponent_glicko_1_pre"]

# Collect the ids of fighters that have at least one fight with
# bad_ordering_flag = 1; their whole history needs to be re-ordered
has_bad_flag = fightmatrix_fighter_histories["bad_ordering_flag"] == 1
bad_fighter_ids = fightmatrix_fighter_histories.loc[has_bad_flag, "fighter_id"].unique()
is_bad_fighter = fightmatrix_fighter_histories["fighter_id"].isin(bad_fighter_ids)

# Every other fighter keeps the provisional order as the final `order`;
# the flag column is dropped and -1s become NaN
fighter_histories_good = (
    fightmatrix_fighter_histories.loc[~is_bad_fighter]
    .drop(columns=["bad_ordering_flag"])
    .rename(columns={"temp_order": "order"})
    .replace(-1, np.nan)
)

# Correct the order for fighters with bad_ordering_flag = 1
# Main idea is to create a DAG (directed acyclic graph) using the Elo/Glicko ratings, find the source node, and traverse it
fighter_histories_bad = fightmatrix_fighter_histories.loc[is_bad_fighter]

# A fighter's three ratings after one bout are the ratings they carry into the
# next bout, so same-day fights can be chained by matching the post-fight
# ratings of one row against the pre-fight ratings of another.
_PRE_RATING_COLS = [
    "fighter_elo_k170_pre",
    "fighter_elo_modified_pre",
    "fighter_glicko_1_pre",
]
_POST_RATING_COLS = [
    "fighter_elo_k170_post",
    "fighter_elo_modified_post",
    "fighter_glicko_1_post",
]
# Pre-fight ratings used to recognise a fighter's very first bout
_INITIAL_RATINGS = (1000, 1000, 1000)


def _rating_mask(rows, columns, values, equal=True):
    """Return a boolean mask over `rows` combining one test per rating column.

    With ``equal=True`` a row is selected when every column equals its value;
    with ``equal=False`` when every column differs from its value (each column
    is tested individually and the results are AND-ed together).
    """
    mask = None
    for column, value in zip(columns, values):
        condition = rows[column] == value if equal else rows[column] != value
        mask = condition if mask is None else mask & condition
    return mask


def _rating_tuples(rows, columns):
    """Return the set of rating triples found in `columns` of `rows`."""
    return set(zip(*(rows[column] for column in columns)))


def _find_first_fight(rows):
    """Identify which of several same-day fights must have happened first.

    A pre-fight rating triple that is nobody's post-fight triple marks the head
    of the chain, and a post-fight triple that is nobody's pre-fight triple
    marks its tail.

    Returns a one-row DataFrame, or None when the head cannot be pinned down
    to exactly one row.
    """
    pre_tuples = _rating_tuples(rows, _PRE_RATING_COLS)
    post_tuples = _rating_tuples(rows, _POST_RATING_COLS)
    head_candidates = list(pre_tuples - post_tuples)
    tail_candidates = list(post_tuples - pre_tuples)

    # Both supported cases need a single, unambiguous tail
    if len(tail_candidates) != 1:
        return None
    not_tail = _rating_mask(rows, _POST_RATING_COLS, tail_candidates[0], equal=False)

    if len(head_candidates) == 1:
        # Unique head ratings: take the row that starts there and is not the tail
        is_head = _rating_mask(rows, _PRE_RATING_COLS, head_candidates[0])
        candidates = rows.loc[is_head & not_tail]
    elif len(head_candidates) == 0 and len(rows) == 2:
        # Two fights with no distinguishable head: the one that is not the
        # tail goes first
        candidates = rows.loc[not_tail]
    else:
        return None

    return candidates if len(candidates) == 1 else None


def _deduce_fight_order(history_slice):
    """Order one fighter's fights chronologically, resolving same-day fights.

    Dates are processed in ascending order. A date with a single fight is
    appended as-is; a date with several fights is ordered by chaining ratings.

    Returns the rows as a list of dicts in fight order, or None as soon as any
    date cannot be ordered unambiguously.
    """
    ordered_rows = []
    for date in sorted(history_slice["date"].unique()):
        remaining = history_slice.loc[history_slice["date"] == date]
        if len(remaining) == 1:
            ordered_rows.append(remaining.iloc[0].to_dict())
            continue

        if not ordered_rows:
            # Nothing to chain from yet: look for the bout entered with the
            # initial ratings, otherwise fall back to the head/tail analysis
            first_fight = remaining.loc[
                _rating_mask(remaining, _PRE_RATING_COLS, _INITIAL_RATINGS)
            ]
            if len(first_fight) != 1:
                first_fight = _find_first_fight(remaining)
                if first_fight is None:
                    return None
            ordered_rows.append(first_fight.iloc[0].to_dict())
            remaining = remaining.drop(first_fight.index)

        while len(remaining) > 0:
            # Find the row whose pre-fight ratings equal the post-fight
            # ratings of the last row added
            previous = ordered_rows[-1]
            next_fight = remaining.loc[
                _rating_mask(
                    remaining,
                    _PRE_RATING_COLS,
                    [previous[column] for column in _POST_RATING_COLS],
                )
            ]
            if len(next_fight) != 1:
                # Chain is broken or ambiguous: restart it from the head of
                # the fights still left for this date
                next_fight = _find_first_fight(remaining)
                if next_fight is None:
                    return None
            ordered_rows.append(next_fight.iloc[0].to_dict())
            remaining = remaining.drop(next_fight.index)

    return ordered_rows


# corrected: row dicts (with a new `order` key) for fighters that were resolved
# still_bad_fighter_ids: fighters whose order could not be deduced; each id is
# recorded once
corrected = []
still_bad_fighter_ids = []
for fighter_id in bad_fighter_ids:
    history_slice = fighter_histories_bad.loc[
        fighter_histories_bad["fighter_id"] == fighter_id
    ]
    row_dicts_in_order = _deduce_fight_order(history_slice)
    if row_dicts_in_order is None:
        still_bad_fighter_ids.append(fighter_id)
        continue

    for position, row in enumerate(row_dicts_in_order, start=1):
        row["order"] = position
    corrected.extend(row_dicts_in_order)

# Columns (and column order) kept for every re-ordered history
history_columns = [
    "fighter_id",
    "order",
    "event_id",
    "date",
    "opponent_id",
    "outcome",
    "outcome_method",
    "end_round",
    "fighter_elo_k170_pre",
    "fighter_elo_k170_post",
    "fighter_elo_modified_pre",
    "fighter_elo_modified_post",
    "fighter_glicko_1_pre",
    "fighter_glicko_1_post",
    "opponent_elo_k170_pre",
    "opponent_elo_k170_post",
    "opponent_elo_modified_pre",
    "opponent_elo_modified_post",
    "opponent_glicko_1_pre",
    "opponent_glicko_1_post",
]

# Histories whose order was deduced automatically
fighter_histories_bad_corrected = pd.DataFrame(corrected)[history_columns]

# Histories that could not be ordered automatically. Not used further in this
# cell; kept as a variable so the remaining cases can be inspected.
fighter_histories_still_bad = fightmatrix_fighter_histories.loc[
    fightmatrix_fighter_histories["fighter_id"].isin(still_bad_fighter_ids)
]

# Manual fixes, read from a CSV located relative to the working directory.
# NOTE(review): nothing here checks that the file covers exactly the fighters
# in still_bad_fighter_ids - confirm when the raw data changes.
fighter_histories_still_bad_corrected = pd.read_csv(
    "./fightmatrix_histories_edge_case_fixes.csv", parse_dates=["date"]
)[history_columns]

# Finally concatenate everything
fightmatrix_fighter_histories_clean = pd.concat(
    [
        fighter_histories_good,
        fighter_histories_bad_corrected,
        fighter_histories_still_bad_corrected,
    ],
    ignore_index=True,
)

# -1 is converted to NaN elsewhere in this notebook (whole bouts frame, and
# fighter_histories_good), but the re-ordered histories still carry it because
# the ordering logic compares raw rating values. Convert it on the combined
# frame so the output uses NaN consistently for every fighter.
fightmatrix_fighter_histories_clean = fightmatrix_fighter_histories_clean.replace(
    -1, np.nan
)

# Sort
fightmatrix_fighter_histories_clean = fightmatrix_fighter_histories_clean.sort_values(
    by=["fighter_id", "order"]
).reset_index(drop=True)

fightmatrix_fighter_histories_clean.to_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "fighter_histories.csv"), index=False
)

## Fighters

In [6]:
fighters_raw_path = os.path.join(raw_data_dir, "Fight Matrix", "fighters.csv")
fighters_clean_path = os.path.join(clean_data_dir, "Fight Matrix", "fighters.csv")

# Read the raw fighters with both debut dates parsed as datetimes, and store
# `sherdog_id` as a nullable integer (Int64)
fightmatrix_fighters = pd.read_csv(
    fighters_raw_path, parse_dates=["pro_debut_date", "ufc_debut_date"]
).astype({"sherdog_id": "Int64"})

# Manually fix incorrect Tapology IDs, keyed by Fight Matrix fighter id
tapology_id_fixes = {
    503: "javier-vazquez-showtime",
    168942: "148375-rafael-gigante",
}
for fightmatrix_id, tapology_id in tapology_id_fixes.items():
    fightmatrix_fighters.loc[
        fightmatrix_fighters["id"] == fightmatrix_id, "tapology_id"
    ] = tapology_id

fightmatrix_fighters.to_csv(fighters_clean_path, index=False)

## Rankings

In [7]:
rankings_raw_path = os.path.join(raw_data_dir, "Fight Matrix", "rankings.csv")
rankings_clean_path = os.path.join(clean_data_dir, "Fight Matrix", "rankings.csv")

# Keep only the first row for each (issue date, weight class, fighter)
# combination and renumber the index afterwards
ranking_key = ["issue_date", "weight_class", "fighter_id"]
fightmatrix_rankings = (
    pd.read_csv(rankings_raw_path)
    .drop_duplicates(subset=ranking_key, keep="first")
    .reset_index(drop=True)
)

fightmatrix_rankings.to_csv(rankings_clean_path, index=False)