# Data Cleaning - FightOdds.io

In [2]:
# standard library imports
import os

# third party imports
import numpy as np
import pandas as pd

# local imports


# Locations of the project's data folders.
# NOTE: "__file__" is a string literal here, so os.path.dirname() returns ""
# and the resulting paths are relative to the kernel's working directory
# (presumably a workaround for notebooks not defining __file__ -- confirm).
data_dir = os.path.join(os.path.dirname("__file__"), "..", "..", "data")

# Raw (scraped) inputs and cleaned outputs live side by side under data_dir.
raw_data_dir, clean_data_dir = (
    os.path.join(data_dir, stage) for stage in ("raw", "clean")
)

## Bouts

In [3]:
fightoddsio_bouts = pd.read_csv(os.path.join(raw_data_dir, "FightOdds.io", "bouts.csv"))

# Set the cancelled bouts aside; their ids are used later to discard odds
# rows that belong to bouts which never happened.
cancelled_bouts = fightoddsio_bouts[
    fightoddsio_bouts["is_cancelled"].eq(1)
].reset_index(drop=True)

# Keep only the bouts that actually took place and tidy them in one chain.
fightoddsio_bouts = (
    fightoddsio_bouts[fightoddsio_bouts["is_cancelled"].eq(0)]
    .reset_index(drop=True)
    .assign(
        # Standardize weight class names to title case
        weight_class=lambda bouts: bouts["weight_class"].str.title(),
        # "Roun" is not a valid time (presumably a scraping artifact);
        # replace it with a missing value
        end_round_time=lambda bouts: bouts["end_round_time"].mask(
            bouts["end_round_time"] == "Roun"
        ),
    )
    # Nullable Int64 keeps these columns integral while still allowing NA
    .astype(
        {
            "weight_lbs": "Int64",
            "end_round": "Int64",
            "fighter_1_odds": "Int64",
            "fighter_2_odds": "Int64",
        }
    )
    # The flag is constant (0) after filtering, so it carries no information
    .drop(columns=["is_cancelled"])
)

fightoddsio_bouts.to_csv(
    os.path.join(clean_data_dir, "FightOdds.io", "bouts.csv"), index=False
)

## Events

No further cleaning needed; the raw events file is loaded and re-saved to the clean data directory.

In [4]:
# Source (raw) and destination (clean) locations of the events table
events_raw_path = os.path.join(raw_data_dir, "FightOdds.io", "events.csv")
events_clean_path = os.path.join(clean_data_dir, "FightOdds.io", "events.csv")

# Load with the event date parsed as a datetime, then write it out unchanged
fightoddsio_events = pd.read_csv(events_raw_path, parse_dates=["date"])
fightoddsio_events.to_csv(events_clean_path, index=False)

## Fighters

In [5]:
fightoddsio_fighters = pd.read_csv(
    os.path.join(raw_data_dir, "FightOdds.io", "fighters.csv"),
    parse_dates=["date_of_birth"],
)

# Ids of fighters that never actually fought (cancellations / erroneous data)
fighters_never_fought = {
    "RmlnaHRlck5vZGU6MTA0Mjg=",
    "RmlnaHRlck5vZGU6MTA4NDU=",
    "RmlnaHRlck5vZGU6MTA4NTk=",
    "RmlnaHRlck5vZGU6MTAwNDU=",
    "RmlnaHRlck5vZGU6MTIzMDk=",
    "RmlnaHRlck5vZGU6MTIzMTE=",
    "RmlnaHRlck5vZGU6MTQ1OTY=",
    "RmlnaHRlck5vZGU6MTU0NzE=",
    "RmlnaHRlck5vZGU6MTY1NjM=",
    "RmlnaHRlck5vZGU6MTY4NTE=",
    "RmlnaHRlck5vZGU6MTgzODM=",
    "RmlnaHRlck5vZGU6MTk5NTE=",
    "RmlnaHRlck5vZGU6MTkzMDc=",
    "RmlnaHRlck5vZGU6MjAwNzU=",
    "RmlnaHRlck5vZGU6MjAyMA==",
    "RmlnaHRlck5vZGU6MjAzMzM=",
    "RmlnaHRlck5vZGU6MjE3MA==",
    "RmlnaHRlck5vZGU6MjEzMzk=",
    "RmlnaHRlck5vZGU6MjM2MTE=",
    "RmlnaHRlck5vZGU6MjgxOTA=",
    "RmlnaHRlck5vZGU6MjgyOTk=",
    "RmlnaHRlck5vZGU6Mjk5NTA=",
    "RmlnaHRlck5vZGU6MjkzMjg=",
    "RmlnaHRlck5vZGU6MzAxOTE=",
    "RmlnaHRlck5vZGU6MzU2OA==",
    "RmlnaHRlck5vZGU6MzYyNjI=",
    "RmlnaHRlck5vZGU6Mzc3MA==",
    "RmlnaHRlck5vZGU6NDAyNw==",
    "RmlnaHRlck5vZGU6NDI2Mjc=",
    "RmlnaHRlck5vZGU6NDUxNzM=",
    "RmlnaHRlck5vZGU6NDc4MTI=",
    "RmlnaHRlck5vZGU6NDcwNjg=",
    "RmlnaHRlck5vZGU6NTI1MDE=",
    "RmlnaHRlck5vZGU6NTIxOTE=",
    "RmlnaHRlck5vZGU6NTgzOTQ=",
    "RmlnaHRlck5vZGU6NTk4NA==",
    "RmlnaHRlck5vZGU6NjczMQ==",
    "RmlnaHRlck5vZGU6Njg4Mw==",
    "RmlnaHRlck5vZGU6NzM3Mg==",
    "RmlnaHRlck5vZGU6ODEzNg==",
}

# True for every fighter whose id is NOT in the never-fought set
keep_fighter = ~fightoddsio_fighters["id"].isin(fighters_never_fought)

# Drop the excluded fighters, order by primary key, and store leg reach as
# nullable Int64 so missing values remain representable
fightoddsio_fighters = (
    fightoddsio_fighters[keep_fighter]
    .sort_values("pk")
    .reset_index(drop=True)
    .astype({"leg_reach_inches": "Int64"})
)

# Standardize fighting styles into broader categories + correct typos.
# Each canonical category maps to the raw spellings folded into it. Matching
# is exact and case-sensitive; values not listed here are left untouched.
# No canonical name appears as a variant, so the order of application does
# not affect the result.
fighting_style_variants = {
    "Striker": [
        "Stirker",
        "Strker",
        "Power Striker",
        "Pressure Striker",
        "Technical Striker",
    ],
    "Wrestler": [
        "Wrestling",
        "Sambo",
    ],
    "Grappler": [
        "Brazilian Jiu-Jitsu",
        "BJJ Grappler",
        "Judo Grappler",
    ],
    "Striker/Grappler": [
        "BJJ Grappler / Striker",
        "Striker / BJJ Grappler",
        "Striker / Bjj Grappler",
        "Power Striker/Grappler",
        "Striker/ Grappler",
        "Striker / Grappler",
        "Striker / Judo",
    ],
    "Striker/Wrestler": [
        "Striker / Sambo",
        "Stirker/Wrestler",
        "Power Striker/Wrestler",
        "Boxer / Wrestler",
        "Wrestler / Boxing",
        "Striker / Wrestler",
        "Striker/ Wrestler",
    ],
    "Wrestler/Grappler": [
        "Wrestler / BJJ Grappler",
    ],
}
for canonical_style, raw_variants in fighting_style_variants.items():
    fightoddsio_fighters.loc[
        fightoddsio_fighters["fighting_style"].isin(raw_variants), "fighting_style"
    ] = canonical_style

# Manual override for a single fighter, selected by id rather than by the
# current style value
fightoddsio_fighters.loc[
    fightoddsio_fighters["id"] == "RmlnaHRlck5vZGU6NDExMA==", "fighting_style"
] = "Grappler"

# Trim stray whitespace from both ends of the nationality values
fightoddsio_fighters = fightoddsio_fighters.assign(
    nationality=fightoddsio_fighters["nationality"].str.strip()
)

# Persist the cleaned fighters table
fighters_clean_path = os.path.join(clean_data_dir, "FightOdds.io", "fighters.csv")
fightoddsio_fighters.to_csv(fighters_clean_path, index=False)

## Moneyline Odds

In [6]:
cancelled_bout_ids = cancelled_bouts["id"].unique()
fightoddsio_moneyline_odds = pd.read_csv(
    os.path.join(raw_data_dir, "FightOdds.io", "moneyline_odds_summaries.csv")
)

# Every odds column: open / worst / current / best for each of the two fighters
odds_cols = [
    "fighter_1_odds_open",
    "fighter_1_odds_worst",
    "fighter_1_odds_current",
    "fighter_1_odds_best",
    "fighter_2_odds_open",
    "fighter_2_odds_worst",
    "fighter_2_odds_current",
    "fighter_2_odds_best",
]

# Rows whose bout was cancelled are excluded from the cleaned odds
belongs_to_cancelled_bout = fightoddsio_moneyline_odds["bout_id"].isin(
    cancelled_bout_ids
)

fightoddsio_moneyline_odds = (
    fightoddsio_moneyline_odds[~belongs_to_cancelled_bout]
    # Remove rows that carry no odds data at all (every odds column missing)
    .dropna(subset=odds_cols, how="all")
    .reset_index(drop=True)
    # Nullable Int64 so partially-missing odds stay integral
    .astype({col: "Int64" for col in odds_cols})
)

fightoddsio_moneyline_odds.to_csv(
    os.path.join(clean_data_dir, "FightOdds.io", "moneyline_odds.csv"), index=False
)

## Proposition Odds

In [7]:
# Load the proposition odds and store the fighter key as nullable Int64
fightoddsio_proposition_odds = pd.read_csv(
    os.path.join(raw_data_dir, "FightOdds.io", "expected_outcome_summaries.csv")
).astype({"fighter_pk": "Int64"})

# Prop bets on totals (total sig strikes + takedowns) are ignored because the
# reference values for them were not scraped
ignored_offer_types = ["TOTAL_SS", "TOTAL_TD"]
is_ignored_offer = fightoddsio_proposition_odds["offer_type_id"].isin(
    ignored_offer_types
)
fightoddsio_proposition_odds = fightoddsio_proposition_odds[
    ~is_ignored_offer
].reset_index(drop=True)

fightoddsio_proposition_odds.to_csv(
    os.path.join(clean_data_dir, "FightOdds.io", "proposition_odds.csv"), index=False
)

## Sportsbooks

No further cleaning needed; the raw sportsbooks file is loaded and re-saved to the clean data directory.

In [8]:
# Source (raw) and destination (clean) locations of the sportsbooks table
sportsbooks_raw_path = os.path.join(raw_data_dir, "FightOdds.io", "sportsbooks.csv")
sportsbooks_clean_path = os.path.join(
    clean_data_dir, "FightOdds.io", "sportsbooks.csv"
)

# Load the table and write it out without modification
fightoddsio_sportsbooks = pd.read_csv(sportsbooks_raw_path)
fightoddsio_sportsbooks.to_csv(sportsbooks_clean_path, index=False)