In [90]:
# standard library imports
import os

# third party imports
import pandas as pd

# local imports


# Data folders live two directory levels above the notebook's working directory.
# (The previous spelling took os.path.dirname of the literal string "__file__",
# which is simply "", so the result was always this same relative path.)
data_dir = os.path.join(os.pardir, os.pardir, "data")
clean_data_dir = os.path.join(data_dir, "clean")

In [91]:
# Load the UFC Stats event list, keep only rows flagged is_ufc_event == 1,
# drop the flag column, renumber the index and store event_order as int.
ufcstats_events = (
    pd.read_csv(
        os.path.join(clean_data_dir, "UFC Stats", "events.csv"),
        parse_dates=["date"],
    )
    .loc[lambda frame: frame["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
    .assign(event_order=lambda frame: frame["event_order"].astype(int))
)
ufcstats_events

Unnamed: 0,id,name,date,location,event_order
0,6420efac0578988b,UFC 1: The Beginning,1993-11-12,"Denver, Colorado, USA",1
1,a6a9ab5a824e8f66,UFC 2: No Way Out,1994-03-11,"Denver, Colorado, USA",2
2,1a49e0670dfaca31,UFC 3: The American Dream,1994-09-09,"Charlotte, North Carolina, USA",3
3,b60391da771deefe,UFC 4: Revenge of the Warriors,1994-12-16,"Tulsa, Oklahoma, USA",4
4,dedc3bb440d09554,UFC 5: The Return of the Beast,1995-04-07,"Charlotte, North Carolina, USA",5
...,...,...,...,...,...
711,585f9ffdb0cd0466,UFC Fight Night: Magny vs. Prates,2024-11-09,"Las Vegas, Nevada, USA",712
712,daff32bc96d1eabf,UFC 309: Jones vs. Miocic,2024-11-16,"New York City, New York, USA",713
713,e955046551f8c7dd,UFC Fight Night: Yan vs. Figueiredo,2024-11-23,"Macau, China",714
714,ad23903ef3af7406,UFC 310: Pantoja vs. Asakura,2024-12-07,"Las Vegas, Nevada, USA",715


## Tapology, Sherdog, Best Fight Odds

In [92]:
tapology_events = pd.read_csv(
    os.path.join(clean_data_dir, "Tapology", "events.csv")
)

# Tapology rows are paired with UFC Stats rows purely by position.
# Author's note: wherever Tapology carries a ufcstats_id, it was checked by hand
# (not re-run here) to agree, in order, with the UFC Stats events whose
# event_order is greater than 41.
# The Tapology file also supplies the Sherdog and Best Fight Odds ids.
matching1 = (
    pd.DataFrame({"ufcstats_id": ufcstats_events["id"].values})
    .assign(
        tapology_id=tapology_events["id"].values,
        sherdog_id=tapology_events["sherdog_id"].values,
        bestfightodds_id=tapology_events["bestfightodds_id"].values,
    )
    # nullable integer dtype, so missing Best Fight Odds ids do not force floats
    .astype({"bestfightodds_id": "Int64"})
)

## Wikipedia

In [93]:
wikipedia_events = pd.read_csv(
    os.path.join(clean_data_dir, "Wikipedia", "events.csv"),
    parse_dates=["date"],
)

# Author's note: the Wikipedia and UFC Stats event lists already share the same
# ordering, so the ids are paired row-for-row without any reordering.
matching2 = pd.DataFrame({"ufcstats_id": ufcstats_events["id"].values}).assign(
    wikipedia_id=wikipedia_events["id"].values
)

## Fight Matrix

In [94]:
# Load the Fight Matrix event list and keep only the rows flagged as UFC events.
fightmatrix_events = pd.read_csv(
    os.path.join(clean_data_dir, "Fight Matrix", "events.csv"),
    parse_dates=["date"],
)
fightmatrix_events = (
    fightmatrix_events.loc[fightmatrix_events["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
)
fightmatrix_events["event_order"] = fightmatrix_events["event_order"].astype(int)

# Pairs of event_order values to exchange -- presumably so the Fight Matrix
# sequence lines up with ufcstats_events, since the two are paired by position below.
# Both masks are taken before either write, so no placeholder value is needed,
# and a single sort afterwards is enough.
for first_order, second_order in [(222, 223), (378, 379)]:
    is_first = fightmatrix_events["event_order"] == first_order
    is_second = fightmatrix_events["event_order"] == second_order
    fightmatrix_events.loc[is_first, "event_order"] = second_order
    fightmatrix_events.loc[is_second, "event_order"] = first_order
fightmatrix_events = fightmatrix_events.sort_values("event_order").reset_index(drop=True)

# Positional pairing only makes sense when both sources have the same number of rows.
assert len(fightmatrix_events) == len(ufcstats_events), (
    f"Fight Matrix has {len(fightmatrix_events)} UFC events, "
    f"UFC Stats has {len(ufcstats_events)}"
)

matching3 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events["id"].values,
        "fightmatrix_id": fightmatrix_events["id"].values,
    }
)

## ESPN

In [95]:
# Load the ESPN event list (used as-is; no UFC-only filter is applied here).
espn_events = pd.read_csv(
    os.path.join(clean_data_dir, "ESPN", "events.csv"),
    parse_dates=["date"],
)

# Pairs of event_order values to exchange -- presumably so the ESPN sequence
# lines up with ufcstats_events, since the two are paired by position below.
# Both masks are taken before either write, so no placeholder value is needed,
# and a single sort afterwards is enough.
for first_order, second_order in [(278, 279), (286, 287), (293, 294)]:
    is_first = espn_events["event_order"] == first_order
    is_second = espn_events["event_order"] == second_order
    espn_events.loc[is_first, "event_order"] = second_order
    espn_events.loc[is_second, "event_order"] = first_order
espn_events = espn_events.sort_values("event_order").reset_index(drop=True)

# Positional pairing only makes sense when both sources have the same number of rows.
assert len(espn_events) == len(ufcstats_events), (
    f"ESPN has {len(espn_events)} events, UFC Stats has {len(ufcstats_events)}"
)

matching4 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events["id"].values,
        "espn_id": espn_events["id"].values,
    }
)

## FightOdds.io

In [96]:
# Load the FightOdds.io event list (used as-is; no UFC-only filter is applied here).
fightoddsio_events = pd.read_csv(
    os.path.join(clean_data_dir, "FightOdds.io", "events.csv"),
    parse_dates=["date"],
)

# Pairs of event_order values to exchange -- presumably so the FightOdds.io
# sequence lines up with ufcstats_events, since the two are paired by position below.
# Both masks are taken before either write, so no placeholder value is needed,
# and a single sort afterwards is enough.
for first_order, second_order in [(286, 287), (293, 294)]:
    is_first = fightoddsio_events["event_order"] == first_order
    is_second = fightoddsio_events["event_order"] == second_order
    fightoddsio_events.loc[is_first, "event_order"] = second_order
    fightoddsio_events.loc[is_second, "event_order"] = first_order
fightoddsio_events = fightoddsio_events.sort_values("event_order").reset_index(drop=True)

# Positional pairing only makes sense when both sources have the same number of rows.
assert len(fightoddsio_events) == len(ufcstats_events), (
    f"FightOdds.io has {len(fightoddsio_events)} events, "
    f"UFC Stats has {len(ufcstats_events)}"
)

matching5 = pd.DataFrame(
    {
        "ufcstats_id": ufcstats_events["id"].values,
        "fightoddsio_id": fightoddsio_events["id"].values,
    }
)

## MMA Decisions

In [97]:
# Load the MMA Decisions event list and keep only rows whose promotion is "UFC".
mmadecisions_events = pd.read_csv(
    os.path.join(clean_data_dir, "MMA Decisions", "events.csv"),
    parse_dates=["date"],
)
mmadecisions_events = mmadecisions_events.loc[
    mmadecisions_events["promotion"] == "UFC"
].reset_index(drop=True)

# Pairs of MMA Decisions event_order values to exchange -- presumably so this
# sequence lines up with the UFC Stats subset it is paired with by position below.
# Both masks are taken before either write, so no placeholder value is needed,
# and a single sort afterwards is enough.
for first_order, second_order in [(392, 393), (539, 540)]:
    is_first = mmadecisions_events["event_order"] == first_order
    is_second = mmadecisions_events["event_order"] == second_order
    mmadecisions_events.loc[is_first, "event_order"] = second_order
    mmadecisions_events.loc[is_second, "event_order"] = first_order
mmadecisions_events = mmadecisions_events.sort_values("event_order").reset_index(drop=True)

# UFC Stats event_order values that are left out of the pairing; going by the
# variable name, these are events with no decisions on MMA Decisions.
event_orders_no_decisions = [1, 2, 3, 4, 6, 13, 45, 93, 296, 616]
temp = ufcstats_events.loc[~ufcstats_events["event_order"].isin(event_orders_no_decisions)].reset_index(drop=True)

# Positional pairing only makes sense when both sides have the same number of rows.
assert len(mmadecisions_events) == len(temp), (
    f"MMA Decisions has {len(mmadecisions_events)} UFC events, "
    f"the UFC Stats subset has {len(temp)}"
)

matching6 = pd.DataFrame(
    {
        "ufcstats_id": temp["id"].values,
        "mmadecisions_id": mmadecisions_events["id"].values,
    }
)

## Bet MMA

In [98]:
# Load the Bet MMA event list and keep only the rows flagged as UFC events.
betmma_events = pd.read_csv(
    os.path.join(clean_data_dir, "Bet MMA", "events.csv"),
    parse_dates=["date"],
)
betmma_events = (
    betmma_events.loc[betmma_events["is_ufc_event"] == 1]
    .drop(columns=["is_ufc_event"])
    .reset_index(drop=True)
)
betmma_events["event_order"] = betmma_events["event_order"].astype(int)

# Pairs of Bet MMA event_order values to exchange -- presumably so this sequence
# lines up with the UFC Stats subset it is paired with by position below.
# Both masks are taken before either write, so no placeholder value is needed,
# and a single sort afterwards is enough.
for first_order, second_order in [(274, 275), (293, 294)]:
    is_first = betmma_events["event_order"] == first_order
    is_second = betmma_events["event_order"] == second_order
    betmma_events.loc[is_first, "event_order"] = second_order
    betmma_events.loc[is_second, "event_order"] = first_order
betmma_events = betmma_events.sort_values("event_order").reset_index(drop=True)

# Only UFC Stats events with event_order above 233 are paired with Bet MMA rows.
temp2 = ufcstats_events.loc[ufcstats_events["event_order"] > 233].reset_index(drop=True)

# Positional pairing only makes sense when both sides have the same number of rows.
assert len(betmma_events) == len(temp2), (
    f"Bet MMA has {len(betmma_events)} UFC events, "
    f"the UFC Stats subset has {len(temp2)}"
)

matching7 = pd.DataFrame(
    {
        "ufcstats_id": temp2["id"].values,
        "betmma_id": betmma_events["id"].values,
    }
)

In [99]:
# Build the final linkage table: start from the Tapology/Sherdog/Best Fight Odds
# ids and left-join every other per-source table on the UFC Stats id.
matching = matching1
for source_matching in (matching2, matching3, matching4, matching5, matching6, matching7):
    matching = matching.merge(source_matching, on="ufcstats_id", how="left")

# matching6 and matching7 cover only a subset of UFC Stats events, so the left
# joins leave gaps in these columns; the nullable Int64 dtype keeps them integral.
matching = matching.astype({"mmadecisions_id": "Int64", "betmma_id": "Int64"})

# UFC Stats id first, then the remaining sources in alphabetical order.
linkage_columns = [
    "ufcstats_id",
    "bestfightodds_id",
    "betmma_id",
    "espn_id",
    "fightmatrix_id",
    "fightoddsio_id",
    "mmadecisions_id",
    "sherdog_id",
    "tapology_id",
    "wikipedia_id",
]
matching = matching[linkage_columns]
matching.to_csv(os.path.join(clean_data_dir, "events_linkage.csv"), index=False)

In [100]:
# Show every UFC Stats event whose date is shared with at least one other event
# (keep=False marks all members of each group, not just the repeats).
ufcstats_events[ufcstats_events["date"].duplicated(keep=False)]

Unnamed: 0,id,name,date,location,event_order
273,b757c73f443d4fca,UFC Fight Night: Munoz vs Mousasi,2014-05-31,"Berlin, Germany",274
274,59aaf2730b84698a,UFC Fight Night: Miocic vs Maldonado,2014-05-31,"Sao Paulo, Brazil",275
277,ac9521250dc1a14c,UFC Fight Night: Te Huna vs Marquardt,2014-06-28,"Auckland, New Zealand",278
278,c0ed7b208197e8de,UFC Fight Night: Swanson vs Stephens,2014-06-28,"San Antonio, Texas, USA",279
285,770b9d4813c25902,UFC Fight Night: Bisping vs Le,2014-08-23,"Macau, China",286
286,063649e21bc9d6d5,UFC Fight Night: Henderson vs Dos Anjos,2014-08-23,"Tulsa, Oklahoma, USA",287
292,8dc4f34c1f50d00d,UFC Fight Night: Nelson vs Story,2014-10-04,"Stockholm, Sweden",293
293,0313bf497de9c470,UFC Fight Night: MacDonald vs Saffiedine,2014-10-04,"Halifax, Nova Scotia, Canada",294
377,6810d8d2dd557cf9,UFC Fight Night: Mousasi vs. Hall 2,2016-11-19,"Belfast, Northern Ireland, United Kingdom",378
378,5cde96e0a1a1fffe,UFC Fight Night: Bader vs. Nogueira,2016-11-19,"Sao Paulo, Sao Paulo, Brazil",379
