In [2]:
# standard library imports
import os

# third party imports
import pandas as pd

# local imports


# Both directories are relative paths two levels above the kernel's current
# working directory; nothing here depends on where the notebook file lives.
data_dir = os.path.join(os.pardir, os.pardir, "data")
clean_data_dir = os.path.join(os.pardir, os.pardir, "data", "clean")

In [16]:
# Load the cleaned UFC Stats events, keep only rows flagged is_ufc_event == 1,
# drop the flag column, renumber the index from 0 and cast event_order to int.
ufcstats_events = (
    pd.read_csv(
        os.path.join(clean_data_dir, "UFC Stats", "events.csv"),
        parse_dates=["date"],
    )
    .loc[lambda events: events["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
    .astype({"event_order": int})
)
ufcstats_events

Unnamed: 0,id,name,date,location,event_order
0,6420efac0578988b,UFC 1: The Beginning,1993-11-12,"Denver, Colorado, USA",1
1,a6a9ab5a824e8f66,UFC 2: No Way Out,1994-03-11,"Denver, Colorado, USA",2
2,1a49e0670dfaca31,UFC 3: The American Dream,1994-09-09,"Charlotte, North Carolina, USA",3
3,b60391da771deefe,UFC 4: Revenge of the Warriors,1994-12-16,"Tulsa, Oklahoma, USA",4
4,dedc3bb440d09554,UFC 5: The Return of the Beast,1995-04-07,"Charlotte, North Carolina, USA",5
...,...,...,...,...,...
711,585f9ffdb0cd0466,UFC Fight Night: Magny vs. Prates,2024-11-09,"Las Vegas, Nevada, USA",712
712,daff32bc96d1eabf,UFC 309: Jones vs. Miocic,2024-11-16,"New York City, New York, USA",713
713,e955046551f8c7dd,UFC Fight Night: Yan vs. Figueiredo,2024-11-23,"Macau, China",714
714,ad23903ef3af7406,UFC 310: Pantoja vs. Asakura,2024-12-07,"Las Vegas, Nevada, USA",715


## Tapology, Sherdog, Best Fight Odds

In [47]:
tapology_events = pd.read_csv(os.path.join(clean_data_dir, "Tapology", "events.csv"))

# Tapology rows are paired with UFC Stats rows purely by position, so the
# alignment is verified instead of assumed: the two frames must have the same
# number of rows, and wherever Tapology carries a UFC Stats id it must equal
# the id of the UFC Stats event at the same position.
assert len(tapology_events) == len(ufcstats_events), (
    f"Tapology has {len(tapology_events)} events but UFC Stats has "
    f"{len(ufcstats_events)}; positional matching is not valid"
)
has_ufcstats_id = tapology_events["ufcstats_id"].notna().to_numpy()
tapology_ufcstats_ids = tapology_events.loc[has_ufcstats_id, "ufcstats_id"].to_numpy()
expected_ufcstats_ids = ufcstats_events.loc[has_ufcstats_id, "id"].to_numpy()
assert (tapology_ufcstats_ids == expected_ufcstats_ids).all(), (
    "Tapology ufcstats_id values disagree with the UFC Stats event order"
)

# Tapology's events file also supplies the Sherdog and Best Fight Odds ids,
# so all three are linked to UFC Stats in one frame.
matching1 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events["id"].values,
        "tapology_id": tapology_events["id"].values,
        "sherdog_id": tapology_events["sherdog_id"].values,
        "bestfightodds_id": tapology_events["bestfightodds_id"].values,
    }
)
# Nullable integer dtype so the column can be cast to integers without losing
# missing values.
matching1["bestfightodds_id"] = matching1["bestfightodds_id"].astype("Int64")

## Wikipedia

In [49]:
wikipedia_events = pd.read_csv(
    os.path.join(clean_data_dir, "Wikipedia", "events.csv"),
    parse_dates=["date"],
)

# Row i of the Wikipedia events is paired with row i of the UFC Stats events;
# this relies on both frames already being in the same order.
matching2 = pd.DataFrame(
    {
        column: events["id"].values
        for column, events in {
            "ufcstats_id": ufcstats_events,
            "wikipedia_id": wikipedia_events,
        }.items()
    }
)

## Fight Matrix

In [139]:
# Load the cleaned Fight Matrix events, keep only rows flagged as UFC events
# and cast event_order to int.
fightmatrix_events = (
    pd.read_csv(
        os.path.join(clean_data_dir, "Fight Matrix", "events.csv"),
        parse_dates=["date"],
    )
    .loc[lambda events: events["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
)
fightmatrix_events["event_order"] = fightmatrix_events["event_order"].astype(int)

# Swap event orders 222 and 223 (presumably so the order agrees with UFC
# Stats - confirm). Both masks are computed before either write, so no
# placeholder value is needed and no existing value can be clobbered.
is_order_222 = fightmatrix_events["event_order"] == 222
is_order_223 = fightmatrix_events["event_order"] == 223
fightmatrix_events.loc[is_order_222, "event_order"] = 223
fightmatrix_events.loc[is_order_223, "event_order"] = 222
fightmatrix_events = fightmatrix_events.sort_values("event_order").reset_index(drop=True)

# Rows are paired by position, so the two frames must be the same length.
assert len(fightmatrix_events) == len(ufcstats_events), (
    f"Fight Matrix has {len(fightmatrix_events)} UFC events but UFC Stats has "
    f"{len(ufcstats_events)}; positional matching is not valid"
)

matching3 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events["id"].values,
        "fightmatrix_id": fightmatrix_events["id"].values,
    }
)

## ESPN

In [67]:
espn_events_path = os.path.join(clean_data_dir, "ESPN", "events.csv")
espn_events = pd.read_csv(espn_events_path, parse_dates=["date"])

# ESPN events are paired with UFC Stats events row by row; this relies on the
# two frames already sharing the same event order.
matching4 = pd.DataFrame(
    dict(
        ufcstats_id=ufcstats_events["id"].values,
        espn_id=espn_events["id"].values,
    )
)

## FightOdds.io

In [74]:
fightoddsio_events = pd.read_csv(
    os.path.join(clean_data_dir, "FightOdds.io", "events.csv"),
    parse_dates=["date"],
)

# Positional pairing: the FightOdds.io rows are taken to be in the same order
# as the UFC Stats rows.
matching5 = pd.DataFrame(
    dict(
        zip(
            ("ufcstats_id", "fightoddsio_id"),
            (ufcstats_events["id"].values, fightoddsio_events["id"].values),
        )
    )
)

## MMA Decisions

In [151]:
# Load the cleaned MMA Decisions events and keep only the UFC promotion.
mmadecisions_events = (
    pd.read_csv(
        os.path.join(clean_data_dir, "MMA Decisions", "events.csv"),
        parse_dates=["date"],
    )
    .loc[lambda events: events["promotion"] == "UFC"]
    .reset_index(drop=True)
)

# Swap event orders 392 and 393 (presumably so the order agrees with UFC
# Stats - confirm). Both masks are computed before either write, so no
# placeholder value is needed and no existing value can be clobbered.
is_order_392 = mmadecisions_events["event_order"] == 392
is_order_393 = mmadecisions_events["event_order"] == 393
mmadecisions_events.loc[is_order_392, "event_order"] = 393
mmadecisions_events.loc[is_order_393, "event_order"] = 392
mmadecisions_events = mmadecisions_events.sort_values("event_order").reset_index(drop=True)

# UFC Stats event orders that are left out of the MMA Decisions pairing
# (per the variable name, events with no decisions).
event_orders_no_decisions = [1, 2, 3, 4, 6, 13, 45, 93, 296, 616]
temp = ufcstats_events.loc[~ufcstats_events["event_order"].isin(event_orders_no_decisions)].reset_index(drop=True)

# Rows are paired by position, so the filtered UFC Stats frame and the MMA
# Decisions frame must be the same length.
assert len(mmadecisions_events) == len(temp), (
    f"MMA Decisions has {len(mmadecisions_events)} UFC events but "
    f"{len(temp)} UFC Stats events remain after exclusions; positional "
    "matching is not valid"
)

matching6 = pd.DataFrame(
    {
        "ufcstats_id": temp["id"].values,
        "mmadecisions_id": mmadecisions_events["id"].values,
    }
)

## Bet MMA

In [163]:
# Load the cleaned Bet MMA events, keep only rows flagged as UFC events and
# cast event_order to int.
betmma_events = (
    pd.read_csv(
        os.path.join(clean_data_dir, "Bet MMA", "events.csv"),
        parse_dates=["date"],
    )
    .loc[lambda events: events["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
)
betmma_events["event_order"] = betmma_events["event_order"].astype(int)

# Swap event orders 274 and 275 (presumably so the order agrees with UFC
# Stats - confirm). Both masks are computed before either write, so no
# placeholder value is needed and no existing value can be clobbered.
is_order_274 = betmma_events["event_order"] == 274
is_order_275 = betmma_events["event_order"] == 275
betmma_events.loc[is_order_274, "event_order"] = 275
betmma_events.loc[is_order_275, "event_order"] = 274
betmma_events = betmma_events.sort_values("event_order").reset_index(drop=True)

# Bet MMA rows are paired only with UFC Stats events whose event_order is
# above 233.
temp2 = ufcstats_events.loc[ufcstats_events["event_order"] > 233].reset_index(drop=True)

# Rows are paired by position, so both frames must be the same length.
assert len(betmma_events) == len(temp2), (
    f"Bet MMA has {len(betmma_events)} UFC events but {len(temp2)} UFC Stats "
    "events have event_order > 233; positional matching is not valid"
)

matching7 = pd.DataFrame(
    {
        "ufcstats_id": temp2["id"].values,
        "betmma_id": betmma_events["id"].values,
    }
)

In [180]:
# Create the final matching dataframe: start from matching1 and left-join
# every other per-source frame on ufcstats_id.
# validate="one_to_one" makes a merge raise if ufcstats_id is duplicated on
# either side, instead of silently multiplying rows in the linkage table.
matching = matching1
for source_matching in (matching2, matching3, matching4, matching5, matching6, matching7):
    matching = matching.merge(
        source_matching,
        on="ufcstats_id",
        how="left",
        validate="one_to_one",
    )

# matching6 and matching7 do not include every UFC Stats event, so the left
# joins can leave missing values in these columns; the nullable Int64 dtype
# stores them as integers alongside those missing values.
matching["mmadecisions_id"] = matching["mmadecisions_id"].astype("Int64")
matching["betmma_id"] = matching["betmma_id"].astype("Int64")

# UFC Stats id first, then the remaining source ids in alphabetical order.
matching = matching[
    [
        "ufcstats_id",
        "bestfightodds_id",
        "betmma_id",
        "espn_id",
        "fightmatrix_id",
        "fightoddsio_id",
        "mmadecisions_id",
        "sherdog_id",
        "tapology_id",
        "wikipedia_id",
    ]
]
matching.to_csv(os.path.join(clean_data_dir, "events_linkage.csv"), index=False)