# Get bouts data

In [1]:
import pandas as pd
import numpy as np
import requests

from tqdm import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

### Scrape URLs to all bouts

In [2]:
# Load the table stored under the "urls" key of ../data/fighters.h5.
fighter_urls = pd.read_hdf("../data/fighters.h5", key="urls")

In [3]:
def scrape(fighter_url, timeout=10):
    """Collect the bout rows listed on one fighter page.

    Parameters
    ----------
    fighter_url : str
        URL of the fighter page to download.
    timeout : float, optional
        Seconds forwarded to ``requests.get``. Without a timeout a single
        stalled connection would block its worker thread indefinitely.

    Returns
    -------
    list of list of str
        One entry per matching table row: the ``href`` of every ``a.b-flag``
        link in the row, followed by the stripped text of the row's
        ``p.b-fight-details__table-text`` cells at positions 11 and 12
        (presumably the event name and date -- confirm against the page).
        Rows with fewer cells simply yield a shorter entry.
    """
    response = requests.get(fighter_url, allow_redirects=False, timeout=timeout)
    # Characters outside ASCII are replaced with "?" before parsing.
    markup = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(markup, "html.parser")

    # Ignore upcoming fights: only rows carrying this exact class string are kept.
    rows = soup.find_all("tr", {"class": "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"})

    bouts = []
    for row in rows:
        entry = [atag["href"] for atag in row.find_all("a", {"class": "b-flag"}, href=True)]
        cells = row.find_all("p", {"class": "b-fight-details__table-text"})
        # Only the cells at index 11 and 12 are recorded.
        entry.extend(cell.text.strip() for cell in cells[11:13])
        bouts.append(entry)

    return bouts

In [4]:
# bout_urls = []
# with ThreadPoolExecutor(max_workers=10) as executor:
#     for result in tqdm(executor.map(scrape, fighter_urls["url"]), total=fighter_urls.shape[0], desc="Scraping bout links"):
#         bout_urls += result

In [5]:
# df = pd.DataFrame(bout_urls, columns=["url", "event", "date"]).drop_duplicates(keep="first")
# df["date"] = pd.to_datetime(df["date"])
# df = df.sort_values(by=["date", "event"]).reset_index(drop=True)
# df

In [6]:
# df.to_hdf("../data/bouts.h5", key="urls", mode="w")

### Reorder bouts

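Dang it: the bout URLs collected from the fighter pages still need to be put in order. The cells below look up each event's page from one of its bouts, scrape the bout links from every event page, and merge that ordering back onto the bout table.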

In [7]:
# bout_urls = pd.read_hdf("../data/bouts.h5", key="urls")
# temp = bout_urls.groupby("event").first().reset_index(drop=False)
# temp = temp.sort_values(by=["date", "event"]).reset_index(drop=True)
# temp

In [8]:
def scrape_event_urls(url, timeout=10):
    """Return the link found inside the page title of the given page.

    Parameters
    ----------
    url : str
        URL of the page to download (the notebook passes bout URLs).
    timeout : float, optional
        Seconds forwarded to ``requests.get`` so a stalled connection cannot
        block a worker thread indefinitely.

    Returns
    -------
    str
        The ``href`` of the first ``a.b-link`` inside the first
        ``h2.b-content__title`` element.

    Raises
    ------
    ValueError
        If the page has no such header or the header contains no such link.
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Characters outside ASCII are replaced with "?" before parsing.
    markup = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(markup, "html.parser")

    header = soup.find("h2", {"class": "b-content__title"})
    event_link = None
    if header is not None:
        event_link = header.find("a", {"class": "b-link"}, href=True)
    if event_link is None:
        # Name the offending URL instead of failing with an opaque error on None.
        raise ValueError("No event link found in the title of {!r}".format(url))

    return event_link["href"]

In [9]:
# event_urls = []
# with ThreadPoolExecutor(max_workers=10) as executor:
#     for result in tqdm(executor.map(scrape_event_urls, temp["url"]), total=temp.shape[0]):
#         event_urls.append(result)

In [10]:
# event_urls

In [11]:
def scrape_bout_urls_in_order(url, timeout=10):
    """Return the bout links on a page, in the reverse of their page order.

    Parameters
    ----------
    url : str
        URL of the page to download (the notebook passes event URLs).
    timeout : float, optional
        Seconds forwarded to ``requests.get`` so a stalled connection cannot
        block a worker thread indefinitely.

    Returns
    -------
    list of str
        The ``href`` of every ``a.b-flag`` link found in the matching table
        rows, reversed relative to the order in which they appear on the page
        (presumably so the first bout of the card comes first -- confirm).
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Characters outside ASCII are replaced with "?" before parsing.
    markup = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(markup, "html.parser")
    rows = soup.find_all("tr", {"class": "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"})

    urls = [
        atag["href"]
        for row in rows
        for atag in row.find_all("a", {"class": "b-flag"}, href=True)
    ]
    urls.reverse()

    return urls

In [12]:
# urls_in_order = []
# with ThreadPoolExecutor(max_workers=10) as executor:
#     for result in tqdm(executor.map(scrape_bout_urls_in_order, event_urls), total=len(event_urls)):
#         urls_in_order += result

In [13]:
# order = pd.DataFrame(urls_in_order, columns=["url"])
# order = order.drop_duplicates(keep="first").reset_index(drop=True)
# order

In [14]:
# bout_urls_in_order = order.merge(bout_urls, how="left", on="url")
# bout_urls_in_order

In [15]:
# bout_urls_in_order.to_hdf("../data/bouts.h5", key="urls_in_order", mode="w")

### Scrape bout stats

In [16]:
# Load the table stored under the "urls_in_order" key of ../data/bouts.h5
# and display it as the cell output.
bout_urls = pd.read_hdf("../data/bouts.h5", key="urls_in_order")
bout_urls

Unnamed: 0,url,event,date
0,http://ufcstats.com/fight-details/567a09fd200c...,UFC 1: The Beginning,1993-11-12
1,http://ufcstats.com/fight-details/2d2bbc86e941...,UFC 1: The Beginning,1993-11-12
2,http://ufcstats.com/fight-details/cecdc0da5842...,UFC 1: The Beginning,1993-11-12
3,http://ufcstats.com/fight-details/46acd54cc0c9...,UFC 1: The Beginning,1993-11-12
4,http://ufcstats.com/fight-details/ac7ca2ec38b9...,UFC 1: The Beginning,1993-11-12
...,...,...,...
9319,http://ufcstats.com/fight-details/2d1ac675e6c1...,UFC 283: Teixeira vs. Hill,2023-01-21
9320,http://ufcstats.com/fight-details/022198293028...,UFC 283: Teixeira vs. Hill,2023-01-21
9321,http://ufcstats.com/fight-details/1c51f405e5c4...,UFC 283: Teixeira vs. Hill,2023-01-21
9322,http://ufcstats.com/fight-details/0462bcc27498...,UFC 283: Teixeira vs. Hill,2023-01-21


In [17]:
def get_bout_stats(url, event, date, timeout=10):
    """Download one bout page and return its header details as a flat list.

    Parameters
    ----------
    url : str
        URL of the bout page to download.
    event, date
        Values copied unchanged into the result after ``url``.
    timeout : float, optional
        Seconds forwarded to ``requests.get`` so a stalled connection cannot
        block a worker thread indefinitely.

    Returns
    -------
    list
        ``[url, event, date]`` followed, in this order, by the stripped text of:
        every ``h3.b-fight-details__person-name``, every
        ``i.b-fight-details__person-status``, the first
        ``i.b-fight-details__fight-title``, the first
        ``i.b-fight-details__text-item_first`` (with the "Method:" label
        removed), and every ``i.b-fight-details__text-item`` (with the
        "Round:", "Time:", "Time format:" and "Referee:" labels removed).

    Raises
    ------
    AttributeError
        If the page lacks the fight-title or method element, because
        ``soup.find`` then returns ``None``.
    """
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    # Characters outside ASCII are replaced with "?" before parsing.
    markup = response.text.encode("ascii", "replace")
    soup = BeautifulSoup(markup, "html.parser")

    bout_details = [url, event, date]

    bout_details += [x.text.strip() for x in soup.find_all("h3", {"class": "b-fight-details__person-name"})]
    bout_details += [x.text.strip() for x in soup.find_all("i", {"class": "b-fight-details__person-status"})]
    bout_details.append(soup.find("i", {"class": "b-fight-details__fight-title"}).text.strip())
    bout_details.append(soup.find("i", {"class": "b-fight-details__text-item_first"}).text.replace("Method:", "").strip())
    for item in soup.find_all("i", {"class": "b-fight-details__text-item"}):
        # Drop the field label so only the value remains.
        bout_details.append(item.text.replace("Round:", "")
                                    .replace("Time:", "")
                                    .replace("Time format:", "")
                                    .replace("Referee:", "")
                                    .strip())

    # Per-fighter statistics are not parsed yet; only the header details are returned.
    return bout_details

In [18]:
# bout_stats = []
# with ThreadPoolExecutor() as executor:
#     for result in tqdm(executor.map(get_bout_stats, bout_urls["url"], bout_urls["event"], bout_urls["date"])):
#         bout_stats += result

In [19]:
# columns = ["URL", "Event", "Date", "R_fighter", "B_fighter", "R_result", "B_result", "Bout Type"]
# df = pd.DataFrame(bout_stats)

In [20]:
# source_code = requests.get("http://ufcstats.com/fight-details/dde7d29cc443e263", allow_redirects=False)
# plain_text = source_code.text.encode("ascii", "replace")
# soup = BeautifulSoup(plain_text, "html.parser")

In [21]:
# bout_details = []
# R_stats = []
# B_stats = []

# bout_details += [x.text.strip() for x in soup.findAll("h3", {"class": "b-fight-details__person-name"})] + \
#                 [x.text.strip() for x in soup.findAll("i", {"class": "b-fight-details__person-status"})]
# bout_details.append(soup.find("i", {"class": "b-fight-details__fight-title"}).text.strip())
# for item in soup.findAll("i", {"class": "b-fight-details__text-item_first"}):
#     bout_details.append(item.text.replace("Method:", "")
#                                 .replace("Details:", "")
#                                 .strip())
# for item in soup.findAll("i", {"class": "b-fight-details__text-item"}):
#     bout_details.append(item.text.replace("Round:", "")
#                                 .replace("Time:", "")
#                                 .replace("Time format:", "")
#                                 .replace("Referee:", "")
#                                 .strip())

# print(bout_details)