In [1]:
import numpy as np
import pandas as pd
import urllib.request as urllib
from bs4 import BeautifulSoup

from datetime import datetime
import json
import copy

In [2]:
# Dates in the fight CSV are parsed with this ISO 'YYYY-MM-DD' format.
DATE_FORMAT = '%Y-%m-%d'


def dateparse(x):
    """Parse a single 'YYYY-MM-DD' string into a datetime."""
    return datetime.strptime(x, DATE_FORMAT)


df_stat_data = pd.read_csv('data/df_stat_data_10_25.csv')

# `date_parser=` is deprecated in pandas 2.0; reading the column as text and
# converting it with `pd.to_datetime` works on both old and new pandas.
df_fight_data = pd.read_csv('data/df_fight_data_10_25.csv')
df_fight_data['date'] = pd.to_datetime(df_fight_data['date'], format=DATE_FORMAT)

with open('data/fighter_hist.json') as json_file:
    fighter_dict = json.load(json_file)

with open('data/fighter_name_to_id.json') as json_file:
    name_dict = json.load(json_file)

In [3]:
# Download one fight-details page and parse it into `soup`; later cells use it
# as a sample input for determine_winner and handle_fight_link.
with urllib.urlopen("http://ufcstats.com/fight-details/bb42336d6417914c") as html:
    soup = BeautifulSoup(html.read())

In [4]:
# Determine the winner of the fight: 0 if it's the first fighter, 1 if it's
# the second, and -1 when the first fighter is flagged as neither 'W' nor 'L'.
def determine_winner(webpage_soup):
    """Return 0, 1 or -1 from the result flag of the first listed fighter.

    The flag is the first non-whitespace character of the text of the first
    'b-fight-details__person' div on the page. Raises IndexError when no such
    div exists or its text is blank.
    """
    first_person = webpage_soup.find_all('div', 'b-fight-details__person')[0]
    result_flag = first_person.get_text().strip()[0]

    if result_flag == 'W':
        return 0
    if result_flag == 'L':
        return 1
    return -1

determine_winner(soup)

1

In [5]:
# Download the listing of all completed events and parse it into `soup_event`,
# which is passed to find_event_names_dates_links further down.
with urllib.urlopen("http://ufcstats.com/statistics/events/completed?page=all") as html_event:
    soup_event = BeautifulSoup(html_event.read())

In [6]:
def get_dates(soup):
    """Collect the event dates shown on one 'events and fights' listing page.

    Returns the stripped text of every 'b-statistics__date' span in page
    order, minus the first one. An empty page yields an empty list.
    """
    date_spans = soup.find_all('span', "b-statistics__date")
    date_texts = [span.get_text().strip() for span in date_spans]

    # The first date is for an upcoming event, which we want to ignore.
    # Slicing an empty list is still an empty list, so no special case is needed.
    return date_texts[1:]


def get_event_names_and_links(soup):
    """Collect event names and urls from one 'events and fights' listing page.

    Returns a tuple (names, links) of two parallel lists built from every
    'b-link b-link_style_black' anchor, in page order: the stripped anchor
    text and the anchor's href attribute.
    """
    names = []
    links = []
    for anchor in soup.find_all('a', "b-link b-link_style_black"):
        names.append(anchor.get_text().strip())
        links.append(anchor.get('href'))

    return names, links


def find_event_names_dates_links(soup_event_list_page):
    """Combine the dates, names and links found on one event listing page.

    Returns a list of dicts, one per event in page order, with keys 'date'
    (a datetime parsed from text such as 'February 12, 2022'), 'name' and
    'link'.

    Raises Exception('Wrong number of elements') when the page yields
    different numbers of dates, names and links.
    """
    dates = get_dates(soup_event_list_page)
    names, links = get_event_names_and_links(soup_event_list_page)

    # All three lists must line up one-to-one before they can be zipped.
    if len({len(dates), len(names), len(links)}) != 1:
        raise Exception('Wrong number of elements')

    events = []
    for date_text, name, link in zip(dates, names, links):
        events.append({'date': datetime.strptime(date_text, '%B %d, %Y'),
                       'name': name,
                       'link': link})
    return events


# event_dates = get_dates(soup_event)
# event_names, event_links = get_event_names_and_links(soup_event)
event_data = find_event_names_dates_links(soup_event)


In [29]:
def get_all_fight_links_between_dates(start_date, end_date='current'):
    """Return a list of (fight_link, date) tuples for all fights between dates.

    start_date and end_date are datetimes and both bounds are inclusive.
    The default end_date of 'current' is replaced by datetime.today().
    Downloads the completed-events listing and then one page per matching
    event, so this performs network requests.
    """
    if end_date == 'current':
        end_date = datetime.today()

    with urllib.urlopen("http://ufcstats.com/statistics/events/completed?page=all") as html_event:
        soup_all_events = BeautifulSoup(html_event.read())

    events_in_range = [event
                       for event in find_event_names_dates_links(soup_all_events)
                       if start_date <= event['date'] <= end_date]

    # Visit each event page and pair every fight link with the event's date.
    dated_fight_links = []
    for event in events_in_range:
        with urllib.urlopen(event['link']) as html_event_page:
            event_page_soup = BeautifulSoup(html_event_page.read())

        dated_fight_links.extend(
            (fight, event['date']) for fight in find_fight_links(event_page_soup))

    return dated_fight_links


fight_links = get_all_fight_links_between_dates(datetime(2022, 1, 1))

In [30]:
def scrape_all_fights_between(initial_state, start_date, end_date='current'):
    """Fold every fight in the date range into the fight state.

    The state is the dict holding df_fight_data, df_stat_data, name_dict,
    fighter_dict and the id counters. Each fight page is downloaded and
    passed to handle_fight_link together with its event date; the state
    returned after the last fight is the result.
    """
    dated_links = get_all_fight_links_between_dates(start_date, end_date)
    # Process the links in the opposite order to how they were collected
    # (presumably so that the oldest fights are handled first - TODO confirm).
    dated_links.reverse()

    state = initial_state
    for fight_link, date in dated_links:
        with urllib.urlopen(fight_link) as html_fight:
            fight_page_soup = BeautifulSoup(html_fight.read())

        state = handle_fight_link(state, fight_page_soup, date)

    return state

# Empty starting state. Both counters start at -1 because handle_fight_link and
# handle_single_fighter_name increment them before use, so the first fight and
# the first fighter both receive id 0. The empty frames reuse the column names
# of the previously loaded CSV data.
data_state_start = {
    'max_fight_id': -1,
    'max_fighter_id': -1,
    'name_dict': {},
    'fighter_dict': {},
    'df_fight_data': pd.DataFrame({col_name: [] for col_name in df_fight_data.columns}),
    'df_stat_data': pd.DataFrame({col_name: [] for col_name in df_stat_data.columns})}

# Scrape every fight from 2022-01-01 up to today (network heavy).
new_state = scrape_all_fights_between(data_state_start, datetime(2022, 1, 1))
        

In [31]:
new_state['df_fight_data'][new_state['df_fight_data'].fight_id == 43]

Unnamed: 0,fight_name,fighter_0,fighter_1,winner,method,round_end,date,fight_id
43,Bobby Green v Nasrat Haqparast 2022-02-12,86,87,0.0,Decision,3.0,2022-02-12,43.0


In [32]:
new_state['df_stat_data'].fight_id.unique()

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
        33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
        44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
        55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
        66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
        77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
        88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
        99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109.])

In [33]:
new_state['df_stat_data'][new_state['df_stat_data'].Seconds != 300]

Unnamed: 0,Fighter,Landed,Out Of,Round,Seconds,Stat,fight_id
390,0.0,11.0,30.0,1.0,227.0,sig_str,5.0
391,0.0,8.0,25.0,1.0,227.0,head,5.0
392,0.0,2.0,3.0,1.0,227.0,body,5.0
393,0.0,1.0,2.0,1.0,227.0,leg,5.0
394,0.0,8.0,27.0,1.0,227.0,distance,5.0
395,0.0,2.0,2.0,1.0,227.0,clinch,5.0
396,0.0,1.0,1.0,1.0,227.0,ground,5.0
397,1.0,16.0,27.0,1.0,227.0,sig_str,5.0
398,1.0,9.0,19.0,1.0,227.0,head,5.0
399,1.0,5.0,5.0,1.0,227.0,body,5.0


In [12]:
new_state['name_dict']

NameError: name 'new_state' is not defined

In [13]:
notable_fighters = list(filter(lambda x: len(x['fighter_names']) > 1, fighter_dict.values()))
notable_fighters

[{'fighter_names': ['Jun Yong Park', 'Junyong Park'],
  'fight_ids': [5194, 5027]},
 {'fighter_names': ['Jacare Souza', 'Ronaldo Souza'],
  'fight_ids': [5162,
   4852,
   4592,
   4369,
   4254,
   3885,
   3816,
   3449,
   3277,
   2957,
   2671,
   2388,
   2210,
   2093]},
 {'fighter_names': ['Jin Soo Son', 'Jinsoo Son'], 'fight_ids': [4962, 4539]},
 {'fighter_names': ['Joseph Duffy', 'Joe Duffy'],
  'fight_ids': [4783, 4122, 3857, 3496, 3294, 3103, 2904]},
 {'fighter_names': ['Nicolae Negumereanu', 'Nick Negumereanu'],
  'fight_ids': [4782]},
 {'fighter_names': ['Carlo Pedersoli', 'Carlo Pedersoli Jr.'],
  'fight_ids': [4747, 4556, 4386]},
 {'fighter_names': ['Dmitry Sosnovskiy', 'Dmitriy Sosnovskiy'],
  'fight_ids': [4314]},
 {'fighter_names': ['Joshua Burkman', 'Josh Burkman'],
  'fight_ids': [4279,
   4019,
   3843,
   3643,
   3457,
   3334,
   3153,
   2999,
   818,
   752,
   683,
   630,
   558,
   500,
   416,
   367,
   344,
   326]},
 {'fighter_names': ['Timothy Johnson

In [14]:
[n['fighter_names'] for n in notable_fighters]

[['Jun Yong Park', 'Junyong Park'],
 ['Jacare Souza', 'Ronaldo Souza'],
 ['Jin Soo Son', 'Jinsoo Son'],
 ['Joseph Duffy', 'Joe Duffy'],
 ['Nicolae Negumereanu', 'Nick Negumereanu'],
 ['Carlo Pedersoli', 'Carlo Pedersoli Jr.'],
 ['Dmitry Sosnovskiy', 'Dmitriy Sosnovskiy'],
 ['Joshua Burkman', 'Josh Burkman'],
 ['Timothy Johnson', 'Tim Johnson'],
 ['Jimmy Wallhead', 'Jim Wallhead'],
 ['Seo Hee Ham', 'Seohee Ham'],
 ['Rafael Feijao', 'Rafael Cavalcante'],
 ['Emily Kagan', 'Emily Peters Kagan'],
 ['Steven Kennedy', 'Steve Kennedy'],
 ['Antonio Rogerio Nogueira', 'Minotauro Nogueira'],
 ['Antonio Dos Santos', 'Antonio Dos Santos Jr.'],
 ['Luiz Dutra Jr.', 'Luiz Dutra'],
 ['Quinton Jackson', 'Rampage Jackson'],
 ['Alexander Torres', 'Alex Torres'],
 ['William Macario', 'William Patolino'],
 ['Rodrigo de Lima', 'Marcos Rogerio de Lima'],
 ['Jianping Yang', 'Yang Jianping'],
 ['Sai Wang', 'Wang Sai'],
 ['Anying Wang', 'An Ying Wang'],
 ['David Galera', 'Dave Galera'],
 ['Jose Tome', 'Jose Mari

In [15]:
new_state['df_fight_data']

NameError: name 'new_state' is not defined

In [16]:
fight_links

NameError: name 'fight_links' is not defined

In [17]:
with urllib.urlopen("http://ufcstats.com/event-details/2a470ad41c22c25a") as html_single_event:
    soup_single_event = BeautifulSoup(html_single_event.read()) 

In [18]:
def find_fight_links(soup_event_page):
    """Return the fight urls found on a single event page.

    The urls are the href attributes of every 'b-flag b-flag_style_green'
    anchor, in page order.
    """
    fight_urls = []
    for anchor in soup_event_page.find_all('a', "b-flag b-flag_style_green"):
        fight_urls.append(anchor.get('href'))
    return fight_urls
    

find_fight_links(soup_single_event)[0]

'http://ufcstats.com/fight-details/4b7ec02b39fc6f70'

In [19]:
def find_method_round_time(fight_soup):
    """Extract how and when a fight ended from a single fight page.

    Reads the first 'b-fight-details__text' element, splits its text on
    whitespace and looks up the token that follows each of the labels
    'Method:', 'Round:' and 'Time:'.

    Returns a dict with keys, in this order:
        'method'          - the single token after 'Method:'
        'round'           - the round number as an int
        'time_last_round' - the m:ss time converted to seconds

    Raises Exception('Invalid fight info') if any label is missing.
    """
    summary_element = fight_soup.find_all(class_="b-fight-details__text")[0]
    tokens = summary_element.get_text().split()

    if any(label not in tokens for label in ('Method:', 'Round:', 'Time:')):
        raise Exception('Invalid fight info')

    method_pos = tokens.index('Method:')
    round_pos = tokens.index('Round:')
    time_pos = tokens.index('Time:')

    # 'm:ss' -> total seconds
    clock_parts = tokens[time_pos + 1].split(':')
    elapsed_seconds = int(clock_parts[0]) * 60 + int(clock_parts[1])

    # Key order matters: callers unpack the values positionally.
    summary = {}
    summary['method'] = tokens[method_pos + 1]
    summary['round'] = int(tokens[round_pos + 1])
    summary['time_last_round'] = elapsed_seconds
    return summary

with urllib.urlopen("http://ufcstats.com/fight-details/a43dcd04788d95c7") as html_fight:
    soup_fight = BeautifulSoup(html_fight.read())
find_method_round_time(soup_fight)

{'method': 'KO/TKO', 'round': 2, 'time_last_round': 134}

In [20]:
def get_sig_strikes(fight_soup, num_rounds):
    """Return round-by-round significant strike counts for both fighters.

    Takes the soup object of an individual fight page and the number of
    rounds fought. The values come from the second 'js-fight-table' table,
    whose text cells alternate between the two fighters. Each round must
    contribute 18 cells (9 columns for each of the 2 fighters).

    Returns a tuple (fighter_0_rounds, fighter_1_rounds) as produced by
    process_fighter_strikes.

    Raises Exception('Unexpected number of stats') on a cell-count mismatch.
    """
    # NOTE(review): the previously saved df_stat_data uses 'sig str' (with a
    # space) where this uses 'sig_str' - confirm before merging the two.
    column_names = ['_', 'sig_str', '_', 'head', 'body', 'leg', 'distance', 'clinch', 'ground']

    tables = fight_soup.find_all(class_="b-fight-details__table js-fight-table")
    cells = tables[1].find_all(class_="b-fight-details__table-text")
    cell_texts = [cell.get_text().strip() for cell in cells]

    if len(cell_texts) != num_rounds * 18:
        raise Exception('Unexpected number of stats')

    # Even positions belong to the first fighter, odd positions to the second.
    return tuple(process_fighter_strikes(cell_texts[start::2], column_names, num_rounds)
                 for start in (0, 1))

def get_other_stats(fight_soup, num_rounds, last_round_sec):
    """Return round-by-round stats other than significant strikes.

    Takes the soup object of an individual fight page, the number of rounds
    fought and the length in seconds of the final round. The values come
    from the first 'js-fight-table' table, whose text cells alternate
    between the two fighters.

    Returns a tuple (fighter_0_rounds, fighter_1_rounds) as produced by
    process_fighter_strikes. Every stat is a two-element list:
        kd, sub_att, rev - [count, -1]
        total_str, td    - [landed, attempted]
        ctrl             - [control seconds, seconds in that round]
    """
    tables = fight_soup.find_all(class_="b-fight-details__table js-fight-table")
    cells = tables[0].find_all(class_="b-fight-details__table-text")
    cell_texts = [cell.get_text().strip() for cell in cells]

    def single_count(raw, _round_number):
        """Wrap a lone count as [count, -1]; -1 marks 'no attempts figure'."""
        return [int(raw), -1]

    def control_time(raw, round_number):
        """Take in m:ss time of control and return [seconds_ctrl, seconds_round].
        """
        clock_parts = raw.split(':')
        controlled = int(clock_parts[0]) * 60 + int(clock_parts[1])
        # Every round but the last is counted as a full 300 seconds.
        round_length = last_round_sec if round_number == num_rounds - 1 else 300
        return [controlled, round_length]

    # NOTE(review): the previously saved df_stat_data uses 'total str' and
    # 'sub att' (with spaces) - confirm before merging the two.
    column_names = ['_', 'kd', '_', '_', 'total_str', 'td', '_', 'sub_att', 'rev', 'ctrl']
    # One converter per kept column, in the same order as the kept columns.
    converters = [single_count, split_of, split_of,
                  single_count, single_count, control_time]

    # Even positions belong to the first fighter, odd positions to the second.
    return tuple(process_fighter_strikes(cell_texts[start::2], column_names, num_rounds, converters)
                 for start in (0, 1))

def split_of(s, round_number=None):
    """Convert '4 of 5' to [4, 5].

    round_number is accepted but unused, so that this function can be
    called with the same (value, round_number) arguments as the other
    converters used by process_fighter_strikes.
    """
    return list(map(int, s.split('of')))


def process_fighter_strikes(fighter_strikes, strike_types, num_rounds, process_method='split_of'):
    """Turn one fighter's flat list of scraped cells into per-round dicts.

    fighter_strikes is the flat list of raw cell strings for one fighter,
    with len(strike_types) consecutive entries per round.
    strike_types must be the ordered list of strike types that appear in the
    raw scraped data. Any ignored scraped values must be represented as '_'.
    process_method is either the string 'split_of' (apply split_of to every
    kept value) or a list of callables, one per kept column in order, each
    called as converter(raw_value, round_index).

    Returns a list with one dict per round mapping each kept strike type to
    its converted value.
    """
    columns_per_round = len(strike_types)

    # Label the cells of each round, then drop the ignored columns.
    labelled_rounds = []
    for round_index in range(num_rounds):
        offset = round_index * columns_per_round
        labelled = {}
        for column, label in enumerate(strike_types):
            labelled[label] = fighter_strikes[offset + column]
        labelled.pop('_', None)
        labelled_rounds.append(labelled)

    if process_method == 'split_of':
        process_method = [split_of] * len(labelled_rounds[0])

    # Convert every raw value with the converter registered for its column.
    converted_rounds = []
    for round_index, labelled in enumerate(labelled_rounds):
        converted = {}
        for position, (label, raw_value) in enumerate(labelled.items()):
            converted[label] = process_method[position](raw_value, round_index)
        converted_rounds.append(converted)

    return converted_rounds

def get_fighter_names(fight_page_soup, num_rounds):
    """Return the two fighter names of a fight page as a tuple.

    The names are the first two text cells of the second 'js-fight-table'
    table. That table must hold exactly num_rounds * 18 text cells.

    Raises Exception('Unexpected number of stats') on a cell-count mismatch.
    """
    tables = fight_page_soup.find_all(class_="b-fight-details__table js-fight-table")
    cells = tables[1].find_all(class_="b-fight-details__table-text")
    cell_texts = [cell.get_text().strip() for cell in cells]

    if len(cell_texts) != num_rounds * 18:
        raise Exception('Unexpected number of stats')

    first_fighter = cell_texts[0]
    second_fighter = cell_texts[1]
    return first_fighter, second_fighter

ss = get_sig_strikes(soup_fight, 2)
os = get_other_stats(soup_fight, 2, 17)

In [21]:
os[0][0]

{'kd': [0, -1],
 'total_str': [7, 13],
 'td': [0, 0],
 'sub_att': [0, -1],
 'rev': [0, -1],
 'ctrl': [0, 300]}

In [22]:
def handle_df_stat_data(data_state, sig_strikes_data, other_stat_data, num_rounds, time_last_round):
    """Return a new data state with the fight's stats added to df_stat_data.

    Parameters:
        data_state       - state dict; 'df_stat_data' and 'max_fight_id' are read.
        sig_strikes_data - pair (fighter 0, fighter 1) of per-round dicts mapping
                           a stat name to [landed, out_of].
        other_stat_data  - same structure, for the remaining stats.
        num_rounds       - number of rounds fought.
        time_last_round  - length in seconds of the final round.

    One row is added per (fighter, round, stat). 'Round' is 1-based and
    'Seconds' is time_last_round for the final round and 300 otherwise.
    Neither the input dict nor its DataFrame is modified.
    """
    full_round_seconds = 300
    fight_id = data_state['max_fight_id']

    # Collect every row first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for data in (sig_strikes_data, other_stat_data):
        for fighter_bin, fighter_data in enumerate(data):
            for round_index, stat_data in enumerate(fighter_data):
                is_last_round = round_index == num_rounds - 1
                for stat, vals in stat_data.items():
                    records.append({
                        'Fighter': fighter_bin,
                        'Landed': vals[0],
                        'Out Of': vals[1],
                        'Round': round_index + 1,
                        'Seconds': time_last_round if is_last_round else full_round_seconds,
                        'Stat': stat,
                        'fight_id': fight_id
                    })

    existing = data_state['df_stat_data']
    if records:
        df_stats = pd.concat([existing, pd.DataFrame(records)], ignore_index=True)
    else:
        # Nothing to add: still hand back an independent copy of the frame.
        df_stats = existing.copy()

    new_state = data_state.copy()
    new_state['df_stat_data'] = df_stats
    return new_state

In [23]:
def handle_df_fight_data(data_state, fighter0_name, fighter1_name, date, winner, method, round_end):
    """Return a new data state with one row for this fight added to df_fight_data.

    Parameters:
        data_state    - state dict; 'name_dict', 'df_fight_data' and
                        'max_fight_id' are read.
        fighter0_name - name of the first fighter; must be a key of name_dict.
        fighter1_name - name of the second fighter; must be a key of name_dict.
        date          - datetime of the fight.
        winner        - winner code stored as-is in the 'winner' column.
        method        - how the fight ended.
        round_end     - round in which the fight ended.

    Raises KeyError if either fighter name is missing from name_dict.
    Neither the input dict nor its DataFrame is modified.
    """
    # The date is stored as 'YYYY-MM-DD' text, both in the row and in fight_name.
    date_str = date.strftime('%Y-%m-%d')
    name_to_id = data_state['name_dict']

    new_row = {
        'fight_name': f'{fighter0_name} v {fighter1_name} {date_str}',
        'fighter_0': name_to_id[fighter0_name],
        'fighter_1': name_to_id[fighter1_name],
        'winner': winner,
        'method': method,
        'round_end': round_end,
        'date': date_str,
        'fight_id': data_state['max_fight_id'],
    }

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement and likewise returns a new frame.
    new_state = data_state.copy()
    new_state['df_fight_data'] = pd.concat(
        [data_state['df_fight_data'], pd.DataFrame([new_row])],
        ignore_index=True)
    return new_state

In [24]:
def handle_single_fighter_name(data_state, fighter_name):
    """Return a new data state that records the current fight for a fighter.

    The current fight id is data_state['max_fight_id']. A fighter whose name
    is already in name_dict gets that id appended to their 'fight_ids'. An
    unseen name is given the next fighter id (max_fighter_id + 1, stored as
    a string) plus fresh entries in both name_dict and fighter_dict.

    The input state and the dicts inside it are left unmodified.
    """
    # Work on copies so the caller's state is never touched.
    fighters = copy.deepcopy(data_state['fighter_dict'])
    names = data_state['name_dict'].copy()
    highest_fighter_id = data_state['max_fighter_id']

    if fighter_name not in names:
        # First appearance: allocate a new id and register the fighter.
        highest_fighter_id += 1
        new_key = str(highest_fighter_id)
        names[fighter_name] = new_key
        fighters[new_key] = {'fighter_names': [fighter_name],
                             'fight_ids': [data_state['max_fight_id']]}
    else:
        # Known fighter: just extend their fight history.
        existing_key = str(names[fighter_name])
        fighters[existing_key]['fight_ids'].append(data_state['max_fight_id'])

    return {**data_state,
            'name_dict': names,
            'fighter_dict': fighters,
            'max_fighter_id': highest_fighter_id}

In [25]:
with urllib.urlopen("http://ufcstats.com/fight-details/388adad23aaa9d39") as html:
    test_soup = BeautifulSoup(html.read())
find_method_round_time(test_soup)

{'method': 'Decision', 'round': 3, 'time_last_round': 300}

In [26]:
def handle_fight_link(data_state, fight_page_soup, fight_date):
    """Return a new data state updated with one scraped fight page.

    Everything is read from the page first (method, round and time, fighter
    names, winner, significant strikes and other stats). The fight id
    counter is then advanced and the fighter bookkeeping, df_fight_data and
    df_stat_data are updated in that order. The input dict is
    shallow-copied and not modified.
    """
    state = data_state.copy()

    # Gather data from the page.
    method, round_num, time = find_method_round_time(fight_page_soup).values()
    fighter0_name, fighter1_name = get_fighter_names(fight_page_soup, round_num)
    fight_winner = determine_winner(fight_page_soup)
    sig_strikes_data = get_sig_strikes(fight_page_soup, round_num)
    other_stat_data = get_other_stats(fight_page_soup, round_num, time)

    # Update the max fight_id so that the correct value is used in the other handlers.
    state['max_fight_id'] += 1

    # Update the data state based on the names of the fighters.
    for fighter_name in (fighter0_name, fighter1_name):
        state = handle_single_fighter_name(state, fighter_name)

    # Update the value of df_fight_data in the data state.
    state = handle_df_fight_data(
        state, fighter0_name, fighter1_name, fight_date, fight_winner, method, round_num)

    # Update the value of df_stat_data in the data state.
    state = handle_df_stat_data(
        state, sig_strikes_data, other_stat_data, round_num, time)

    return state


# handle_fight_link([], soup_fight, 3)[0]

In [27]:
# Starting state for a single-fight trial run. The counters are incremented
# before use, so this fight gets fight_id 1 and its fighters get ids 501 and 502.
data_state_start = {
    'max_fight_id': 0,
    'max_fighter_id': 500,
    'name_dict': {},
    'fighter_dict': {},
    'df_fight_data': pd.DataFrame({col_name: [] for col_name in df_fight_data.columns}),
    'df_stat_data': pd.DataFrame({col_name: [] for col_name in df_stat_data.columns})
}

# Run the full pipeline on the sample fight page fetched earlier into `soup`.
new_state = handle_fight_link(data_state_start, soup, datetime(2021, 12, 18))

In [28]:
new_state['df_fight_data']

Unnamed: 0,fight_name,fighter_0,fighter_1,winner,method,round_end,date,fight_id
0,Derrick Lewis v Tai Tuivasa 2021-12-18,501,502,1.0,KO/TKO,2.0,2021-12-18,1.0


In [22]:
df_stat_data.Stat.unique()

array(['kd', 'sig str', 'total str', 'td', 'sub att', 'pass', 'rev',
       'head', 'body', 'leg', 'distance', 'clinch', 'ground'],
      dtype=object)

In [21]:
new_state['df_stat_data'].Stat.unique()

array(['sig_str', 'head', 'body', 'leg', 'distance', 'clinch', 'ground',
       'kd', 'total_str', 'td', 'sub_att', 'rev', 'ctrl'], dtype=object)