# Get fighter data

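This is a one-time notebook that gathers the data shown on each fighter's profile page. The scraping cells are left commented out so the site is not scraped again; the later cells read the saved results from ../data/fighters.h5. Data on the fights themselves are scraped in get_bouts_data.ipynb.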

In [1]:
import pandas as pd
import numpy as np
import requests

from string import ascii_lowercase
from tqdm import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

### Scrape fighter names and URLs

In [2]:
# Scrape fighter names and corresponding URLs to profile pages

# fighter_url_dict = {}
# fighter_name = ""
# fighter_name_group_urls = [f"http://ufcstats.com/statistics/fighters?char={c}&page=all" for c in ascii_lowercase]

# for url in fighter_name_group_urls:
#     source_code = requests.get(url, allow_redirects=False)
#     plain_text = source_code.text.encode("ascii", "replace")
#     soup = BeautifulSoup(plain_text, "html.parser")
#     table = soup.find("tbody")
#     names = table.findAll("a", {"class": "b-link b-link_style_black"}, href=True)
    
#     for i, name in enumerate(names):
#         if (i + 1) % 3 != 0:
#             if fighter_name == "":
#                 fighter_name = name.text
#             else:
#                 fighter_name = fighter_name + " " + name.text
#         else:
#             fighter_url_dict[fighter_name] = name["href"]
#             fighter_name = ""

In [3]:
# Save everything so I don't need to scrape again in the future

# df = pd.DataFrame(fighter_url_dict.items(), columns=["name", "url"])
# df.to_hdf("../data/fighters.h5", key="urls", mode="w")

### Scrape fighter stats

In [4]:
# Load the saved fighter name / profile-URL table (the "urls" key written by the
# commented-out scraping cells above) instead of scraping it again.
fighter_urls = pd.read_hdf("../data/fighters.h5", key="urls")
# fighter_urls

In [5]:
def scrape(name, url, timeout=30):
    """Scrape one fighter's profile page into a flat list of strings.

    Parameters
    ----------
    name : str
        Fighter name; returned unchanged as the first element of the result.
    url : str
        URL of the fighter's profile page.
    timeout : float, optional
        Seconds to wait on the server before ``requests`` raises a timeout
        error. Without a timeout a single stalled connection would block its
        worker thread forever.

    Returns
    -------
    list of str
        ``[name, record, stat, stat, ...]``. ``record`` is ``"MISSING"`` when
        the page has no record element; one ``stat`` entry is appended for each
        matching list item except the one at index 9.
    """
    # Field labels removed from each list item's text, in this exact order.
    labels = (
        "Height:",
        "Weight:",
        "Reach:",
        "STANCE:",
        "DOB:",
        "SLpM:",
        "Str. Acc.:",
        "SApM:",
        "Str. Def:",
        "TD Avg.:",
        "TD Acc.:",
        "TD Def.:",
        "Sub. Avg.:",
    )

    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Non-ASCII characters are replaced with "?" before parsing.
    plain_text = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(plain_text, "html5lib")
    record = soup.find("span", {"class": "b-content__title-record"})
    items = soup.find_all(
        "li", {"class": "b-list__box-list-item b-list__box-list-item_type_block"}
    )

    info = [name]
    if record:
        info.append(
            record.text.replace("  ", "")
                .replace("\n", "")
                .replace("Record: ", "")
        )
    else:
        info.append("MISSING")

    for i, item in enumerate(items):
        # NOTE(review): index 9 is skipped; presumably an empty spacer item
        # between the two stat columns on the page -- TODO confirm.
        if i == 9:
            continue
        text = item.text.replace("  ", "").replace("\n", "")
        for label in labels:
            text = text.replace(label, "")
        info.append(text)

    return info

In [6]:
# data = []
# with ThreadPoolExecutor(max_workers=10) as executor:
#     for result in tqdm(executor.map(scrape, fighter_urls["name"], fighter_urls["url"]), total=fighter_urls.shape[0], 
#                        desc="Scraping fighter profile stats"):
#         data.append(result)

In [7]:
# fighters = pd.DataFrame(data, columns=["Name", "Record", "Height", "Weight", "Reach", "Stance", "DOB", "SLpM", "Str. Acc.", 
#                                        "SApM", "Str. Def.", "TD Avg.", "TD Acc.", "TD Def.", "Sub. Avg."])
# fighters

In [8]:
# fighters.to_hdf("../data/fighters.h5", key="raw", mode="r+")

### Cleaning time

In [9]:
# Load the raw scraped profile stats stored under the "raw" key (written by the
# commented-out to_hdf cell above); all cleaning below starts from this frame.
df = pd.read_hdf("../data/fighters.h5", key="raw")
# df

In [42]:
# Treat both empty strings and the "--" placeholder as missing values
df2 = df.replace(["", "--"], np.nan)

# Split Record on "-" once; the first piece is wins and the second is losses.
# Anything after that (draws, no contests) is ignored.
record_parts = df2["Record"].str.split("-")
df2["Wins"] = record_parts.str[0].astype(int)
df2["Losses"] = record_parts.str[1].astype(int)

# Convert Height all into inches
def convert_inches(height):
    """Convert a feet/inches height string to total inches.

    ``height`` is expected to be two whitespace-separated tokens, each ending
    in a one-character unit mark (e.g. ``5' 11"``). The last character of each
    token is dropped before the number is parsed.

    Returns the height in inches as an int, or ``None`` when ``height`` is
    missing (NaN / None).
    """
    if pd.isna(height):
        return None
    feet_token, inch_token = height.split()
    return int(feet_token[:-1]) * 12 + int(inch_token[:-1])
    
df2["Height"] = df2["Height"].apply(convert_inches)

# Clean Weight: strip the literal " lbs." suffix. regex=False keeps the "."
# from acting as a match-anything wildcard.
df2["Weight"] = df2["Weight"].str.replace(" lbs.", "", regex=False).astype(float)

# Clean Reach: drop the inch mark and convert to float
df2["Reach"] = df2["Reach"].str.replace('"', "", regex=False).astype(float)

# Stance is okay

# Format DOB into datetime
df2["DOB"] = pd.to_datetime(df2["DOB"])

# Per-minute / per-15-minute averages are plain numbers stored as strings
for rate_col in ["SLpM", "SApM", "TD Avg.", "Sub. Avg."]:
    df2[rate_col] = df2[rate_col].astype(float)

# Percentage columns: strip the "%" sign and rescale to a 0-1 proportion
for pct_col in ["Str. Acc.", "Str. Def.", "TD Acc.", "TD Def."]:
    df2[pct_col] = (
        df2[pct_col].str.replace("%", "", regex=False).astype(float).divide(100)
    )

In [43]:
# Record has been split into Wins/Losses, so drop it and place those two
# columns right after the first column (Wins at position 1, Losses at 2).
df3 = df2.drop(columns=["Record"])
wins, losses = df3.pop("Wins"), df3.pop("Losses")
df3.insert(1, wins.name, wins)
df3.insert(2, losses.name, losses)

df3

Unnamed: 0,Name,Wins,Losses,Height,Weight,Reach,Stance,DOB,SLpM,Str. Acc.,SApM,Str. Def.,TD Avg.,TD Acc.,TD Def.,Sub. Avg.
0,Tom Aaron,5,3,,155.0,,,1978-07-13,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0
1,Danny Abbadi,4,6,71.0,155.0,,Orthodox,1983-07-03,3.29,0.38,4.41,0.57,0.00,0.00,0.77,0.0
2,Nariman Abbasov,28,4,68.0,155.0,66.0,Orthodox,1994-02-01,3.00,0.20,5.67,0.46,0.00,0.00,0.66,0.0
3,David Abbott,10,15,72.0,265.0,,Switch,NaT,1.35,0.30,3.55,0.38,1.07,0.33,0.66,0.0
4,Hamdy Abdelwahab,6,0,74.0,264.0,72.0,Southpaw,1993-01-22,3.87,0.52,3.13,0.59,3.00,0.75,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3912,Dave Zitanick,5,7,,170.0,,,1980-03-05,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0
3913,Alex Zuniga,6,3,,145.0,,,NaT,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0
3914,George Zuniga,3,1,69.0,185.0,,,NaT,7.64,0.38,5.45,0.37,0.00,0.00,1.00,0.0
3915,Allan Zuniga,13,1,67.0,155.0,70.0,Orthodox,1992-04-04,3.93,0.52,1.80,0.61,0.00,0.00,0.57,1.0


In [44]:
# Store the cleaned table under the "clean" key of the same HDF5 file that the
# "urls" and "raw" keys are read from above.
# NOTE(review): Name and Stance are object-dtype columns, which is what triggers
# the PyTables pickling PerformanceWarning shown in this cell's output.
df3.to_hdf("../data/fighters.h5", key="clean", mode="r+")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['Name', 'Stance'], dtype='object')]

  df3.to_hdf("../data/fighters.h5", key="clean", mode="r+")
