# Matching - Event IDs

In [1]:
# standard library imports
import os

# third party imports
import pandas as pd

# local imports


# NOTE: "__file__" is a plain string literal here, not the module attribute, so
# os.path.dirname returns "" and both paths end up relative to the current
# working directory ("../../data" and "../../data/clean").
data_root_parts = (os.path.dirname("__file__"), "..", "..", "data")
data_dir = os.path.join(*data_root_parts)
clean_data_dir = os.path.join(*data_root_parts, "clean")

In [2]:
# For almost all events, IDs can be matched correctly by lining up date order.
# Load the cleaned UFC Stats events, keep only the rows flagged as UFC events,
# and make event_order an integer so later cells can select events by it.
ufcstats_events = (
    pd.read_csv(
        os.path.join(clean_data_dir, "UFC Stats", "events.csv"),
        parse_dates=["date"],
    )
    .loc[lambda events: events["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
    .astype({"event_order": int})
)

## Tapology, Sherdog, Best Fight Odds

Tapology has links to the corresponding UFC Stats, Sherdog, and Best Fight Odds events, so we can take a shortcut and read the Sherdog and Best Fight Odds IDs directly from the Tapology data.

In [3]:
tapology_events = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "events.csv"))

# For available ids, all match up in event order
# No UFC events on same day before 42nd, so this is a sufficient check
#
# The comparison is done by position on NumPy arrays (with an explicit length
# check) rather than on label-aligned Series: the mapping built from these
# frames is positional too, and comparing Series whose filtered indexes differ
# would raise instead of answering the question.
tapology_linked_ids = tapology_events.loc[
    tapology_events["ufcstats_id"].notnull(), "ufcstats_id"
].to_numpy()
ufcstats_ids_from_42nd = ufcstats_events.loc[
    ufcstats_events["event_order"] > 41, "id"
].to_numpy()
ids_line_up = len(tapology_linked_ids) == len(ufcstats_ids_from_42nd) and bool(
    (tapology_linked_ids == ufcstats_ids_from_42nd).all()
)

# Fail loudly: a mismatch here would silently corrupt the positional mapping.
assert ids_line_up, "Tapology ufcstats_id values do not line up with UFC Stats ids"
ids_line_up

True

In [4]:
# Rows are paired purely by position. The UFC Stats id comes from the UFC Stats
# frame; the remaining ids are read from the Tapology frame.
tapology_source_columns = {
    "tapology_id": "id",
    "sherdog_id": "sherdog_id",
    "bestfightodds_id": "bestfightodds_id",
}
matching1 = pd.DataFrame(
    {
        # Observe we reference UFC Stats df
        "ufcstats_id": ufcstats_events["id"].values,
        # Tapology for rest
        **{
            target: tapology_events[source].values
            for target, source in tapology_source_columns.items()
        },
    }
).astype({"bestfightodds_id": "Int64"})  # nullable integer dtype for this id column

## Wikipedia

In [6]:
wikipedia_events_path = os.path.join(clean_data_dir, "Wikipedia", "events.csv")
wikipedia_events = pd.read_csv(wikipedia_events_path, parse_dates=["date"])

# Wikipedia and UFC Stats events are in the same order already, so the two id
# columns are simply placed side by side (paired by position, not by label).
matching2_id_columns = {
    "ufcstats_id": ufcstats_events["id"],
    "wikipedia_id": wikipedia_events["id"],
}
matching2 = pd.DataFrame(
    {column: ids.values for column, ids in matching2_id_columns.items()}
)

## Fight Matrix

In [7]:
fightmatrix_events = pd.read_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "events.csv"), parse_dates=["date"]
)
fightmatrix_events = (
    fightmatrix_events.loc[fightmatrix_events["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
)
fightmatrix_events["event_order"] = fightmatrix_events["event_order"].astype(int)

# Handle a few edge cases from same-day events: each pair of event orders below
# is exchanged so that the Fight Matrix rows line up with UFC Stats by position.
fightmatrix_order_swaps = [(222, 223), (378, 379)]
for first_order, second_order in fightmatrix_order_swaps:
    # Build both masks before writing so the two assignments cannot interfere
    # with each other; this avoids needing a placeholder value.
    is_first = fightmatrix_events["event_order"] == first_order
    is_second = fightmatrix_events["event_order"] == second_order
    fightmatrix_events.loc[is_first, "event_order"] = second_order
    fightmatrix_events.loc[is_second, "event_order"] = first_order

# Re-sort once after all swaps so row position follows the corrected order.
fightmatrix_events = fightmatrix_events.sort_values("event_order").reset_index(
    drop=True
)

# Pair ids by position (pandas raises if the two frames differ in length).
matching3 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events["id"].values,
        "fightmatrix_id": fightmatrix_events["id"].values,
    }
)

## ESPN

In [8]:
espn_events = pd.read_csv(
    os.path.join(clean_data_dir, "ESPN", "events.csv"), parse_dates=["date"]
)

# Handle a few edge cases: each pair of event orders below is exchanged so that
# the ESPN rows line up with UFC Stats by position.
espn_order_swaps = [(278, 279), (286, 287), (293, 294)]
for first_order, second_order in espn_order_swaps:
    # Build both masks before writing so the two assignments cannot interfere
    # with each other; this avoids needing a placeholder value.
    is_first = espn_events["event_order"] == first_order
    is_second = espn_events["event_order"] == second_order
    espn_events.loc[is_first, "event_order"] = second_order
    espn_events.loc[is_second, "event_order"] = first_order

# Re-sort once after all swaps so row position follows the corrected order.
espn_events = espn_events.sort_values("event_order").reset_index(drop=True)

# Pair ids by position (pandas raises if the two frames differ in length).
matching4 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events["id"].values,
        "espn_id": espn_events["id"].values,
    }
)

## FightOdds.io

In [9]:
fightoddsio_events = pd.read_csv(
    os.path.join(clean_data_dir, "FightOdds.io", "events.csv"), parse_dates=["date"]
)

# Handle a few edge cases: each pair of event orders below is exchanged so that
# the FightOdds.io rows line up with UFC Stats by position.
fightoddsio_order_swaps = [(286, 287), (293, 294)]
for first_order, second_order in fightoddsio_order_swaps:
    # Build both masks before writing so the two assignments cannot interfere
    # with each other; this avoids needing a placeholder value.
    is_first = fightoddsio_events["event_order"] == first_order
    is_second = fightoddsio_events["event_order"] == second_order
    fightoddsio_events.loc[is_first, "event_order"] = second_order
    fightoddsio_events.loc[is_second, "event_order"] = first_order

# Re-sort once after all swaps so row position follows the corrected order.
fightoddsio_events = fightoddsio_events.sort_values("event_order").reset_index(
    drop=True
)

# Pair ids by position (pandas raises if the two frames differ in length).
matching5 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events["id"].values,
        "fightoddsio_id": fightoddsio_events["id"].values,
    }
)

## MMA Decisions

In [10]:
mmadecisions_events = pd.read_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "events.csv"), parse_dates=["date"]
)
mmadecisions_events = mmadecisions_events.loc[
    mmadecisions_events["promotion"] == "UFC"
].reset_index(drop=True)

# Handle a few edge cases: each pair of event orders below is exchanged so that
# the MMA Decisions rows line up with UFC Stats by position.
mmadecisions_order_swaps = [(392, 393), (539, 540)]
for first_order, second_order in mmadecisions_order_swaps:
    # Build both masks before writing so the two assignments cannot interfere
    # with each other; this avoids needing a placeholder value.
    is_first = mmadecisions_events["event_order"] == first_order
    is_second = mmadecisions_events["event_order"] == second_order
    mmadecisions_events.loc[is_first, "event_order"] = second_order
    mmadecisions_events.loc[is_second, "event_order"] = first_order

# Re-sort once after all swaps so row position follows the corrected order.
mmadecisions_events = mmadecisions_events.sort_values("event_order").reset_index(
    drop=True
)

# UFC Stats event orders left out of the positional pairing.
# NOTE(review): presumably these events have no MMA Decisions entry - confirm.
event_orders_no_decisions = [1, 2, 3, 4, 6, 13, 45, 93, 296, 616]
ufcstats_events_with_decisions = ufcstats_events.loc[
    ~ufcstats_events["event_order"].isin(event_orders_no_decisions)
].reset_index(drop=True)

# Pair ids by position (pandas raises if the two frames differ in length).
matching6 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events_with_decisions["id"].values,
        "mmadecisions_id": mmadecisions_events["id"].values,
    }
)

## Bet MMA

In [11]:
betmma_events = pd.read_csv(
    os.path.join(clean_data_dir, "Bet MMA", "events.csv"), parse_dates=["date"]
)
betmma_events = (
    betmma_events.loc[betmma_events["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
)
betmma_events["event_order"] = betmma_events["event_order"].astype(int)

# Handle a few edge cases: each pair of event orders below is exchanged so that
# the Bet MMA rows line up with UFC Stats by position.
betmma_order_swaps = [(274, 275), (293, 294)]
for first_order, second_order in betmma_order_swaps:
    # Build both masks before writing so the two assignments cannot interfere
    # with each other; this avoids needing a placeholder value.
    is_first = betmma_events["event_order"] == first_order
    is_second = betmma_events["event_order"] == second_order
    betmma_events.loc[is_first, "event_order"] = second_order
    betmma_events.loc[is_second, "event_order"] = first_order

# Re-sort once after all swaps so row position follows the corrected order.
betmma_events = betmma_events.sort_values("event_order").reset_index(drop=True)

# Only UFC Stats events with event_order above 233 take part in the pairing.
# NOTE(review): presumably Bet MMA coverage starts at that event - confirm.
ufcstats_events_after_233 = ufcstats_events.loc[
    ufcstats_events["event_order"] > 233
].reset_index(drop=True)

# Pair ids by position (pandas raises if the two frames differ in length).
matching7 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events_after_233["id"].values,
        "betmma_id": betmma_events["id"].values,
    }
)

## Combine all

In [12]:
# Create final matching dataframe
# Start from the Tapology-based table and left-join every other per-source table
# on the UFC Stats id. validate="one_to_one" makes pandas raise a MergeError if
# ufcstats_id is duplicated on either side, instead of silently multiplying rows
# in the exported mapping.
matching = matching1
for source_matching in (
    matching2,
    matching3,
    matching4,
    matching5,
    matching6,
    matching7,
):
    matching = matching.merge(
        source_matching, on="ufcstats_id", how="left", validate="one_to_one"
    )

# matching6 and matching7 are built from subsets of the UFC Stats events, so the
# left joins leave missing values in these columns; the nullable Int64 dtype
# keeps the ids as integers alongside those gaps.
matching = matching.astype({"mmadecisions_id": "Int64", "betmma_id": "Int64"})

# Fix the column order of the exported file: UFC Stats id first, then the other
# sources alphabetically.
output_columns = [
    "ufcstats_id",
    "bestfightodds_id",
    "betmma_id",
    "espn_id",
    "fightmatrix_id",
    "fightoddsio_id",
    "mmadecisions_id",
    "sherdog_id",
    "tapology_id",
    "wikipedia_id",
]
matching = matching[output_columns]
matching.to_csv(os.path.join(clean_data_dir, "event_mapping.csv"), index=False)