# Get bouts data

In [1]:
import pandas as pd
import numpy as np
import requests

from tqdm import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

### Scrape URLs to all bouts

In [2]:
# Load the fighter table from disk; its "URL" column supplies the pages
# that the bout-link scraper below visits.
fighters = pd.read_csv("data/fighters.csv")
fighters

Unnamed: 0,Name,Wins,Losses,Height,Weight,Reach,Stance,DOB,SLpM,Str. Acc.,SApM,Str. Def.,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,URL
0,Tom Aaron,5,3,,155.0,,,1978-07-13,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,http://ufcstats.com/fighter-details/93fe7332d1...
1,Danny Abbadi,4,6,71.0,155.0,,Orthodox,1983-07-03,3.29,0.38,4.41,0.57,0.00,0.00,0.77,0.0,http://ufcstats.com/fighter-details/15df64c02b...
2,Nariman Abbasov,28,4,68.0,155.0,66.0,Orthodox,1994-02-01,3.00,0.20,5.67,0.46,0.00,0.00,0.66,0.0,http://ufcstats.com/fighter-details/59a9d6dac6...
3,David Abbott,10,15,72.0,265.0,,Switch,,1.35,0.30,3.55,0.38,1.07,0.33,0.66,0.0,http://ufcstats.com/fighter-details/b361180739...
4,Hamdy Abdelwahab,5,0,74.0,264.0,72.0,Southpaw,1993-01-22,3.87,0.52,3.13,0.59,3.00,0.75,0.00,0.0,http://ufcstats.com/fighter-details/3329d692ae...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3923,Dave Zitanick,5,7,,170.0,,,1980-03-05,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,http://ufcstats.com/fighter-details/be124bdd60...
3924,Alex Zuniga,6,3,,145.0,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,http://ufcstats.com/fighter-details/02d808afb9...
3925,George Zuniga,3,1,69.0,185.0,,,,7.64,0.38,5.45,0.37,0.00,0.00,1.00,0.0,http://ufcstats.com/fighter-details/1291dd6b8a...
3926,Allan Zuniga,13,1,67.0,155.0,70.0,Orthodox,1992-04-04,3.93,0.52,1.80,0.61,0.00,0.00,0.57,1.0,http://ufcstats.com/fighter-details/523af801b3...


In [3]:
def scrape_fight_urls(url, timeout=30):
    """Collect the completed bouts listed on one fighter page.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without it a stalled connection would block the worker thread
        (and the surrounding ``executor.map``) indefinitely.

    Returns
    -------
    list of list of str
        One entry per matching table row: the ``href`` of every ``a.b-flag``
        link in the row, followed by the stripped text of the row's cells at
        positions 11 and 12 (loaded downstream as ``url``, ``event``, ``date``).
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Lossy on purpose as written: any non-ASCII character becomes "?".
    plain_text = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(plain_text, "html.parser")

    # Ignore upcoming fights
    rows = soup.find_all("tr", {"class": "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"})

    bouts = []
    for row in rows:
        entry = [atag["href"] for atag in row.find_all("a", {"class": "b-flag"}, href=True)]
        cells = row.find_all("p", {"class": "b-fight-details__table-text"})
        # Only cells 11 and 12 are kept; a row with fewer cells simply
        # contributes nothing here, exactly like the index check it replaces.
        entry.extend(cell.text.strip() for cell in cells[11:13])
        bouts.append(entry)

    return bouts

In [4]:
# Fetch every fighter page on a small thread pool and flatten the per-page
# bout lists into one list, keeping the order of fighters["URL"].
bout_urls = []
with ThreadPoolExecutor(max_workers=10) as pool:
    pages = tqdm(pool.map(scrape_fight_urls, fighters["URL"]),
                 total=len(fighters), desc="Scraping bout links")
    for page_bouts in pages:
        bout_urls.extend(page_bouts)

Scraping bout links: 100%|█████████████████████████████████████████████████████████| 3928/3928 [03:07<00:00, 20.98it/s]


In [5]:
# Build the bout table in one pass: drop repeated rows, parse the dates,
# then order by date with the event name as tie-breaker.
df = (
    pd.DataFrame(bout_urls, columns=["url", "event", "date"])
    .drop_duplicates(keep="first")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(by=["date", "event"])
    .reset_index(drop=True)
)
df

Unnamed: 0,url,event,date
0,http://ufcstats.com/fight-details/00b0796724ec...,UFC 1: The Beginning,1993-11-12
1,http://ufcstats.com/fight-details/2d2bbc86e941...,UFC 1: The Beginning,1993-11-12
2,http://ufcstats.com/fight-details/64139d1d505e...,UFC 1: The Beginning,1993-11-12
3,http://ufcstats.com/fight-details/ac7ca2ec38b9...,UFC 1: The Beginning,1993-11-12
4,http://ufcstats.com/fight-details/567a09fd200c...,UFC 1: The Beginning,1993-11-12
...,...,...,...
9391,http://ufcstats.com/fight-details/d6945828f5fa...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9392,http://ufcstats.com/fight-details/b9539d5e379b...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9393,http://ufcstats.com/fight-details/c3af2b6ab126...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9394,http://ufcstats.com/fight-details/8d5b88e0ff25...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11


In [7]:
# df.to_csv("data/test.csv", index=False, header=True)

### Reorder bouts

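The table above is sorted only by date and event name, so bouts within the same event are not in any meaningful order. The cells below visit each event page and take the bouts in the reverse of the order in which that page lists them.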

In [8]:
# Keep one row per event (the first value of each column within the group),
# ordered by date and then event name.
temp = (
    df.groupby("event")
    .first()
    .reset_index(drop=False)
    .sort_values(by=["date", "event"])
    .reset_index(drop=True)
)

In [9]:
def scrape_event_urls(url, timeout=30):
    """Return the event link shown in the title of a bout page.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so one stalled connection cannot hang the thread pool.

    Returns
    -------
    str
        The ``href`` of the ``a.b-link`` inside the page's
        ``h2.b-content__title`` header.

    Raises
    ------
    ValueError
        If the page has no such header or the header has no such link.
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Lossy on purpose as written: any non-ASCII character becomes "?".
    plain_text = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(plain_text, "html.parser")

    header = soup.find("h2", {"class": "b-content__title"})
    event_link = None
    if header is not None:
        event_link = header.find("a", {"class": "b-link"}, href=True)
    if event_link is None:
        # Fail with the offending URL instead of an opaque error on None.
        raise ValueError(f"No event link found on page: {url}")

    return event_link["href"]

In [10]:
# Resolve the event page for each representative bout; results keep the
# order of temp["url"].
event_urls = []
with ThreadPoolExecutor(max_workers=10) as pool:
    event_urls.extend(tqdm(pool.map(scrape_event_urls, temp["url"]), total=len(temp)))

100%|██████████████████████████████████████████████████████████████████████████████| 1072/1072 [01:21<00:00, 13.14it/s]


In [11]:
def scrape_bout_urls_in_order(url, timeout=30):
    """Return the bout links on one event page, in reverse page order.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so one stalled connection cannot hang the thread pool.

    Returns
    -------
    list of str
        The ``href`` of every ``a.b-flag`` link found in the matching table
        rows, reversed so the link that appears last on the page comes first.
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Lossy on purpose as written: any non-ASCII character becomes "?".
    plain_text = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(plain_text, "html.parser")
    rows = soup.find_all("tr", {"class": "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"})

    urls = [
        atag["href"]
        for row in rows
        for atag in row.find_all("a", {"class": "b-flag"}, href=True)
    ]

    return urls[::-1]

In [12]:
# Visit every event page and concatenate the per-event bout lists, keeping
# the order of event_urls.
urls_in_order = []
with ThreadPoolExecutor(max_workers=10) as pool:
    event_pages = tqdm(pool.map(scrape_bout_urls_in_order, event_urls), total=len(event_urls))
    for event_bouts in event_pages:
        urls_in_order.extend(event_bouts)

100%|██████████████████████████████████████████████████████████████████████████████| 1072/1072 [01:04<00:00, 16.56it/s]


In [13]:
# One-column frame of bout URLs in scraped order, keeping only the first
# occurrence of any repeated URL.
order = (
    pd.DataFrame(urls_in_order, columns=["url"])
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

In [14]:
# Left join on "url": the row order comes from `order`, while the event and
# date columns are looked up in `df`. URLs missing from `df` get NaN there.
bout_urls_in_order = order.merge(df, how="left", on="url")
bout_urls_in_order

Unnamed: 0,url,event,date
0,http://ufcstats.com/fight-details/567a09fd200c...,UFC 1: The Beginning,1993-11-12
1,http://ufcstats.com/fight-details/2d2bbc86e941...,UFC 1: The Beginning,1993-11-12
2,http://ufcstats.com/fight-details/cecdc0da5842...,UFC 1: The Beginning,1993-11-12
3,http://ufcstats.com/fight-details/46acd54cc0c9...,UFC 1: The Beginning,1993-11-12
4,http://ufcstats.com/fight-details/ac7ca2ec38b9...,UFC 1: The Beginning,1993-11-12
...,...,...,...
9391,http://ufcstats.com/fight-details/2c58a97f2fe6...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9392,http://ufcstats.com/fight-details/b9539d5e379b...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9393,http://ufcstats.com/fight-details/d6945828f5fa...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9394,http://ufcstats.com/fight-details/3b561a154aa2...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11


In [15]:
# Persist the ordered bout list (header row, no index column) so the stats
# scrape below can start from the file.
bout_urls_in_order.to_csv("data/bout_urls.csv", index=False, header=True)

### Scrape bout stats

In [16]:
# Reload the saved bout list. No date parsing is requested, so "date" comes
# back as plain text. This rebinds `bout_urls`, which was a list earlier.
bout_urls = pd.read_csv("data/bout_urls.csv")
bout_urls

Unnamed: 0,url,event,date
0,http://ufcstats.com/fight-details/567a09fd200c...,UFC 1: The Beginning,1993-11-12
1,http://ufcstats.com/fight-details/2d2bbc86e941...,UFC 1: The Beginning,1993-11-12
2,http://ufcstats.com/fight-details/cecdc0da5842...,UFC 1: The Beginning,1993-11-12
3,http://ufcstats.com/fight-details/46acd54cc0c9...,UFC 1: The Beginning,1993-11-12
4,http://ufcstats.com/fight-details/ac7ca2ec38b9...,UFC 1: The Beginning,1993-11-12
...,...,...,...
9391,http://ufcstats.com/fight-details/2c58a97f2fe6...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9392,http://ufcstats.com/fight-details/b9539d5e379b...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9393,http://ufcstats.com/fight-details/d6945828f5fa...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11
9394,http://ufcstats.com/fight-details/3b561a154aa2...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11


In [17]:
def scrape_bout_stats(url, event, date, timeout=30):
    """Scrape the summary details and totals row of one bout page.

    Parameters
    ----------
    url : str
        Address of the bout page to fetch.
    event, date
        Passed through unchanged into the result.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so one stalled connection cannot hang the thread pool.

    Returns
    -------
    list
        ``[url, event, date]`` followed by, in order: the text of every
        ``h3.b-fight-details__person-name``, the text of every
        ``i.b-fight-details__person-status``, the fight title, the method,
        the label-stripped text of each ``i.b-fight-details__text-item`` in
        the first ``p.b-fight-details__text``, and finally the stat cells of
        the first table row (skipping its first two cells) or 18 ``NaN``
        values when the page has no stats table.

    Raises
    ------
    AttributeError
        If the title, method, or details paragraph is missing from the page
        (``soup.find`` returns ``None``).
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Lossy on purpose as written: any non-ASCII character becomes "?".
    plain_text = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(plain_text, "html.parser")

    bout_details = [url, event, date]
    bout_details += [x.text.strip() for x in soup.find_all("h3", {"class": "b-fight-details__person-name"})]
    bout_details += [x.text.strip() for x in soup.find_all("i", {"class": "b-fight-details__person-status"})]
    bout_details.append(soup.find("i", {"class": "b-fight-details__fight-title"}).text.strip())
    bout_details.append(soup.find("i", {"class": "b-fight-details__text-item_first"}).text.replace("Method:", "").strip())
    # Each remaining item carries its own label; strip whichever one is present.
    for item in soup.find("p", {"class": "b-fight-details__text"}).find_all("i", {"class": "b-fight-details__text-item"}):
        bout_details.append(item.text.replace("Round:", "")
                                    .replace("Time:", "")
                                    .replace("Time format:", "")
                                    .replace("Referee:", "")
                                    .strip())

    tbody = soup.find("tbody", {"class": "b-fight-details__table-body"})
    if tbody is not None:
        row = tbody.find("tr", {"class": "b-fight-details__table-row"})
        stats = row.find_all("p", {"class": "b-fight-details__table-text"})
        # The first two cells are skipped; only the stat cells after them are kept.
        bout_details.extend(s.text.strip() for s in stats[2:])
    else:
        # Pad with 18 NaNs, matching the 18 R_/B_ stat columns used when the
        # results are loaded into a DataFrame, so every row has equal length.
        bout_details.extend([np.nan] * 18)

    return bout_details

In [18]:
# Scrape every bout page on a small thread pool; rows come back in the same
# order as bout_urls.
bout_stats = []
with ThreadPoolExecutor(max_workers=4) as pool:
    jobs = pool.map(scrape_bout_stats, bout_urls["url"], bout_urls["event"], bout_urls["date"])
    bout_stats.extend(tqdm(jobs, total=len(bout_urls), desc="Scraping bout stats"))

Scraping bout stats: 100%|█████████████████████████████████████████████████████████| 9396/9396 [24:31<00:00,  6.39it/s]


In [19]:
# Column layout of one scraped row: 13 descriptive fields, then each stat
# twice, first with the "R_" prefix and then with the "B_" prefix.
info_cols = ["URL", "Event", "Date", "R_Name", "B_Name", "R_Result", "B_Result",
             "Bout Type", "Method", "Round", "Time", "Format", "Referee"]
stat_names = ["KD", "Sig. Str.", "Sig. Str. %", "Total Str.", "TD", "TD %",
              "Sub. Att", "Rev.", "Ctrl"]
cols = info_cols + [f"{side}_{stat}" for stat in stat_names for side in ("R", "B")]
df = pd.DataFrame(bout_stats, columns=cols)
df

Unnamed: 0,URL,Event,Date,R_Name,B_Name,R_Result,B_Result,Bout Type,Method,Round,...,R_TD,B_TD,R_TD %,B_TD %,R_Sub. Att,B_Sub. Att,R_Rev.,B_Rev.,R_Ctrl,B_Ctrl
0,http://ufcstats.com/fight-details/567a09fd200c...,UFC 1: The Beginning,1993-11-12,Gerard Gordeau,Teila Tuli,W,L,Open Weight Bout,KO/TKO,1,...,0 of 0,0 of 1,---,0%,0,0,0,0,--,--
1,http://ufcstats.com/fight-details/2d2bbc86e941...,UFC 1: The Beginning,1993-11-12,Kevin Rosier,Zane Frazier,W,L,Open Weight Bout,KO/TKO,1,...,0 of 0,0 of 0,---,---,0,0,0,0,--,--
2,http://ufcstats.com/fight-details/cecdc0da5842...,UFC 1: The Beginning,1993-11-12,Royce Gracie,Art Jimmerson,W,L,Open Weight Bout,Submission,1,...,1 of 1,0 of 0,100%,---,0,0,0,0,--,--
3,http://ufcstats.com/fight-details/46acd54cc0c9...,UFC 1: The Beginning,1993-11-12,Ken Shamrock,Patrick Smith,W,L,Open Weight Bout,Submission,1,...,1 of 2,0 of 0,50%,---,2,0,0,0,--,--
4,http://ufcstats.com/fight-details/ac7ca2ec38b9...,UFC 1: The Beginning,1993-11-12,Gerard Gordeau,Kevin Rosier,W,L,Open Weight Bout,KO/TKO,1,...,0 of 0,0 of 0,---,---,0,0,0,0,--,--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9391,http://ufcstats.com/fight-details/2c58a97f2fe6...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11,Mario Bautista,Guido Cannetti,W,L,Bantamweight Bout,Submission,1,...,3 of 4,1 of 1,75%,100%,1,0,0,0,2:36,0:16
9392,http://ufcstats.com/fight-details/b9539d5e379b...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11,Said Nurmagomedov,Jonathan Martinez,L,W,Bantamweight Bout,Decision - Unanimous,3,...,3 of 9,0 of 1,33%,0%,2,1,0,0,4:21,6:09
9393,http://ufcstats.com/fight-details/d6945828f5fa...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11,Nikita Krylov,Ryan Spann,W,L,Catch Weight Bout,Submission,1,...,2 of 2,1 of 2,100%,50%,2,1,2,2,2:39,0:32
9394,http://ufcstats.com/fight-details/3b561a154aa2...,UFC Fight Night: Yan vs. Dvalishvili,2023-03-11,Alexander Volkov,Alexandr Romanov,W,L,Heavyweight Bout,KO/TKO,1,...,0 of 0,0 of 5,---,0%,0,0,0,0,1:03,0:38


In [20]:
# Save the raw bout table.
# NOTE(review): unlike the bout_urls.csv export, this call does not pass
# index=False, so the row index is written as an extra unnamed first column.
# Confirm whether downstream readers rely on that before changing it.
df.to_csv("data/bouts_raw.csv")