# Scrape UFC events data

1. Compile a list of the past events and write to file
2. Visit each wiki page of the past events and gather matches data
2. Compile a list of the schedule events and write to file
3. Collect the poster images of the past events

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import itertools
import re
pd.set_option('display.max_colwidth', -1)

## Scrape list of past events

In [10]:
# Last visited 4/27/2020
# Collect list of past events
url = 'https://en.wikipedia.org/wiki/List_of_UFC_events'
url_request = requests.get(url).text
soup = BeautifulSoup(url_request, 'html.parser')

In [11]:
events_data = []
table = soup.find('table',{'id':'Past_events'})
table_rows = table.find_all('tr')
for row in table_rows[1:]:
    events_data.append([t.text.strip() for t in row.find_all('td')]) 
            
events_df = pd.DataFrame(events_data, columns=['Index', 'Event', 'Date', 'Venue', 'Location', 'Attendance','Ref'])

In [12]:
links = []
for row in table_rows[1:]:
#     links.append([t.get('href') for t in row.find_all('a')]) 
    links.append("https://en.wikipedia.org" + row.find('a').get('href'))

In [13]:
events_df.insert(2, "wikipage", links) 

In [14]:
print("Total events: " + str(len(events_df)))
events_occurred_df = events_df[events_df['Attendance']!="Cancelled"]
print("Events (excluding cancelled): " + str(len(events_occurred_df)))

Total events: 523
Events (excluding cancelled): 514


In [None]:
# Write list of past UFC events to csv file
events_df.to_csv("list_of_UFC_past_events.csv", index=False)

## Visit each of the past event's wiki page and gather matches data

In [3]:
# Read list of past UFC events
events_df = pd.read_csv("data/list_of_UFC_past_events.csv")
events_occurred_df = events_df[events_df['Attendance']!="Cancelled"]

In [4]:
events_occurred_df = events_occurred_df.reset_index(drop=True)
events_occurred_df

Unnamed: 0,Index,Event,wikipage,Date,Venue,Location,Attendance,Ref
0,513,UFC Fight Night: Lee vs. Oliveira,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Lee_vs._Oliveira,"Mar 14, 2020",Ginásio Nilson Nelson,"Brasília, Brazil",0,[15]
1,512,UFC 248: Adesanya vs. Romero,https://en.wikipedia.org/wiki/UFC_248,"Mar 7, 2020",T-Mobile Arena,"Las Vegas, Nevada, U.S.",15077,[16]
2,511,UFC Fight Night: Benavidez vs. Figueiredo,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Benavidez_vs._Figueiredo,"Feb 29, 2020",Chartway Arena,"Norfolk, Virginia, U.S.",7098,[17]
3,510,UFC Fight Night: Felder vs. Hooker,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Felder_vs._Hooker,"Feb 23, 2020",Spark Arena,"Auckland, New Zealand",10025,[18]
4,509,UFC Fight Night: Anderson vs. Błachowicz 2,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Anderson_vs._B%C5%82achowicz_2,"Feb 15, 2020",Santa Ana Star Center,"Rio Rancho, New Mexico, U.S.",6449,[19]
...,...,...,...,...,...,...,...,...
508,005,UFC 5: The Return of the Beast,https://en.wikipedia.org/wiki/UFC_5,"Apr 7, 1995",Independence Arena,"Charlotte, North Carolina, U.S.",6000,[464]
509,004,UFC 4: Revenge of the Warriors,https://en.wikipedia.org/wiki/UFC_4,"Dec 16, 1994",Expo Square Pavilion,"Tulsa, Oklahoma, U.S.",5857,[465]
510,003,UFC 3: The American Dream,https://en.wikipedia.org/wiki/UFC_3,"Sep 9, 1994",Grady Cole Center,"Charlotte, North Carolina, U.S.",,
511,002,UFC 2: No Way Out,https://en.wikipedia.org/wiki/UFC_2,"Mar 11, 1994",Mammoth Gardens,"Denver, Colorado, U.S.",2000,[466]


In [None]:
# Get event details from each event wikipage
result_df = pd.DataFrame()
for index, row in events_occurred_df.iterrows():
# for index, row in itertools.islice(events_occurred_df.iterrows(), 3):
    event_name = row['Event']
    event_url = row['wikipage']
    event_date = row['Date']
    print(str(index) + "\t" + event_date + "\t" + event_name)
    try:
        url_request = requests.get(event_url).text
        soup = BeautifulSoup(url_request, 'html.parser')
        data = []
        table = soup.find('table',{'class':'toccolours'})
        table_rows = table.find_all('tr')
        for row in table_rows:
            data.append([t.text.strip() for t in row.find_all('td')]) 
        df = pd.DataFrame(data, columns=['Weight class', 'Fighter1', 'Result', 'Fighter2', 'Method', 'Round', 'Time','Note'])
        df = df[~df['Weight class'].isnull()]
        df.insert(0, "Date", event_date) 
        df.insert(1, "Event", event_name)
    except:
        print("Error")
#     df.to_csv("data/"+ event_name + ".csv", index=False)
    result_df = result_df.append(df, ignore_index=True)

In [None]:
# Initial save of incomplete data
# result_df.to_csv("data/ufc_matches.csv", index=False)
# result_df.to_json("data/ufc_matches.json",orient='records')

In [None]:
print(len(result_df))
print(len(result_df['Event'].unique()))

In [None]:
# Need to inspect these pages
inspect_df = pd.DataFrame(np.array([['Apr 20, 2013', 'UFC on Fox: Henderson vs. Melendez','2'],
                                    ['Jan 19, 2013','UFC on FX: Belfort vs. Bisping','1'],
                                    ['Jul 21, 2012','UFC 149: Faber vs. Barão','5'],
                                    ['Jun 8, 2012','UFC on FX: Johnson vs. McCall','4'],
                                    ['May 15, 2012','UFC on Fuel TV: The Korean Zombie vs. Poirier','3'], 
                                    ['Feb 15, 2012','UFC on Fuel TV: Sanchez vs. Ellenberger','2'],
                                    ['Jan 28, 2012','UFC on Fox: Evans vs. Davis','1'], 
                                    ['Apr 5, 2007','UFC Fight Night: Stevenson vs. Guillard','1'], 
                                    ['Aug 6, 2005','UFC Ultimate Fight Night','-1']]), #no wiki page exists
                   columns=['date', 'event_name','table_number'])
supp_result_df = pd.DataFrame()

In [None]:
for index in range(0,8):
    try:
        event_date = inspect_df.iloc[index,0]
        event_name = inspect_df.iloc[index,1]
        table_number = int(inspect_df.iloc[index,2])
        event_url = events_occurred_df[events_occurred_df['Date']==event_date]['wikipage'].values[0]
        print(event_url)
        url_request = requests.get(event_url).text
        soup = BeautifulSoup(url_request, 'html.parser')
        data = []
        table = soup.findAll('table',{'class':'toccolours'})
        table_rows = table[table_number].find_all('tr')
        for row in table_rows:
            data.append([t.text.strip() for t in row.find_all('td')]) 
        df = pd.DataFrame(data, columns=['Weight class', 'Fighter1', 'Result', 'Fighter2', 'Method', 'Round', 'Time','Note'])
        df = df[~df['Weight class'].isnull()]
        df.insert(0, "Date", event_date) 
        df.insert(1, "Event", event_name)
        print(df.iloc[0,0:6])
    except:
        print("Error")
    supp_result_df = supp_result_df.append(df, ignore_index=True)

In [None]:
print(len(supp_result_df))
print(len(supp_result_df['Event'].unique()))

In [None]:
supp_result_df[supp_result_df['Date']=='Feb 15, 2012']

In [None]:
# Find info on this event since no wiki page exists
# Manually add info for this event
event_date = 'Aug 6, 2005'
event_name = 'UFC Ultimate Fight Night'
# https://www.ufc.com/event/UFC-Fight-Night-1
# https://www.sherdog.com/events/UFC-Fight-Night-1-Marquardt-vs-Salaverry-3100
data_array = np.array([
    ['Middleweight', 'Nate Marquardt', 'def.', 'Ivan Salaverry', 'Decision (unanimous) (30-27, 30-27, 29-28)', '3', '5:00', ''],
    ['Middleweight', 'Chris Leben', 'def.', 'Patrick Cote', 'Decision (split) (29-28, 30-27, 27-29)', '3', '5:00', ''],
    ['Light Heavyweight', 'Stephan Bonnar', 'def.', 'Sam Hoger', 'Decision (unanimous) (30-27, 30-27, 30-27)', '3', '5:00', ''],
    ['Middleweight', 'Nate Quarry', 'def.', 'Pete Sell', ' TKO (punch)', '1', '0:42', ''],
    ['Welterweight', 'Josh Koscheck', 'def.', 'Pete Spratt', 'Submission (rear-naked choke)', '1', '1:53', ''],
    ['Middleweight', 'Mike Swick', 'def.', 'Gideon Ray', 'TKO (punches)', '1', '0:22', ''],
    ['Welterweight', 'Kenny Florian', 'def.', 'Alex Karalexis', 'TKO (doctor stoppage)', '2', '2:52', ''],
    ['Welterweight', 'Drew Fickett', 'def.', 'Josh Neer', 'Submission (rear-naked choke)', '1', '1:35', '']])

In [None]:
df = pd.DataFrame(data_array,
                  columns=['Weight class', 'Fighter1', 'Result', 'Fighter2', 'Method', 'Round', 'Time','Note'])
df.insert(0, "Date", event_date) 
df.insert(1, "Event", event_name)
df

In [None]:
supp_result_df = supp_result_df.append(df)
print(len(supp_result_df))
print(len(supp_result_df['Event'].unique()))

In [None]:
result_df = result_df.append(supp_result_df)
print(len(result_df))
print(len(result_df['Event'].unique()))

In [None]:
# Remove any duplicate rows that might have gathered
result_df = result_df.drop_duplicates()

In [None]:
# Save ufc matches to file
result_df.to_csv("data/ufc_matches.csv", index=False)
result_df.to_json("data/ufc_matches.json",orient='records')

## Scrape list of scheduled events

In [62]:
# Collect list of scheduled events
url = 'https://en.wikipedia.org/wiki/List_of_UFC_events'
url_request = requests.get(url).text
soup = BeautifulSoup(url_request, 'html.parser')

scheduled_events_data = []
scheduled_table = soup.find('table',{'id':'Scheduled_events'})
scheduled_table_rows = scheduled_table.find_all('tr')
for row in scheduled_table_rows[1:]:
    scheduled_events_data.append([t.text.strip() for t in row.find_all('td')]) 
            
scheduled_events_df = pd.DataFrame(scheduled_events_data, columns=['Event', 'Original Date', 'Venue', 'Location', 'Ref','Notes'])

In [66]:
scheduled_events_df

Unnamed: 0,Event,Original Date,wikipage,Venue,Location,Ref,Notes
0,UFC Fight Night,"Aug 15, 2020",https://en.wikipedia.org/wiki/3Arena,3Arena,"Dublin, Ireland",[9],Postponed
1,UFC 251 or 252,"Jul 11, 2020",https://en.wikipedia.org/wiki/T-Mobile_Arena,T-Mobile Arena,"Las Vegas, Nevada, U.S.",[9],Postponed
2,UFC on ESPN 11,"Jun 27, 2020",https://en.wikipedia.org/wiki/UFC_on_ESPN_11,Frank Erwin Center,"Austin, Texas, U.S.",[9],Postponed
3,UFC Fight Night: Blaydes vs. Volkov,"Jun 20, 2020",https://en.wikipedia.org/wiki/UFC_Fight_Night:_Blaydes_vs._Volkov,SaskTel Centre,"Saskatoon, Saskatchewan, Canada",[9],Postponed
4,UFC Fight Night 176,"Jun 13, 2020",https://en.wikipedia.org/wiki/UFC_Fight_Night_176,Astana Arena,"Nur-Sultan, Kazakhstan",[9],Postponed
5,UFC 251,"Jun 6, 2020",https://en.wikipedia.org/wiki/UFC_251,Perth Arena,"Perth, Australia",[9],Postponed
6,UFC 250,"Jun 6, 2020",https://en.wikipedia.org/wiki/UFC_250,TBD,"TBD, U.S",[10],Planned
7,UFC Fight Night 175,"May 23, 2020",https://en.wikipedia.org/wiki/UFC_Fight_Night_175,UFC APEX,"Las Vegas, U.S.",[11],Planned
8,UFC on ESPN: Overeem vs. Harris,"May 16, 2020",https://en.wikipedia.org/wiki/UFC_on_ESPN:_Overeem_vs._Harris,VyStar Veterans Memorial Arena,"Jacksonville, Florida, U.S.",[12],Planned
9,UFC on ESPN: Smith vs. Teixeira,"May 13, 2020",https://en.wikipedia.org/wiki/UFC_on_ESPN:_Smith_vs._Teixeira,VyStar Veterans Memorial Arena,"Jacksonville, Florida, U.S.",[12],Planned


In [64]:
links = []
for row in scheduled_table_rows[1:]:
#     links.append([t.get('href') for t in row.find_all('a')]) 
    links.append("https://en.wikipedia.org" + row.find('a').get('href'))
scheduled_events_df.insert(2, "wikipage", links) 

In [None]:
scheduled_events_df.to_csv("data/list_of_UFC_scheduled_events.csv",index=False)

## Get links to posters of past events

In [None]:
# Read list of past UFC events
events_df = pd.read_csv("data/list_of_UFC_past_events.csv")
events_occurred_df = events_df[events_df['Attendance']!="Cancelled"]
# len(events_occurred_df[279:])
events_occurred_df = events_occurred_df.reset_index(drop=True)
events_occurred_df

In [None]:
# Get event posters
posters_df = pd.DataFrame()   
for index, row in events_occurred_df.iterrows():
    event_name = row['Event']
    event_url = row['wikipage']
    event_date = row['Date']
    print(str(index) + "\t" + event_date + "\t" + event_name)
#     print(event_url)
    try:
        url_request = requests.get(event_url).text
        soup = BeautifulSoup(url_request, 'html.parser')
        data = []
#         table = soup.find('table',{'class':'infobox'})
#         images = table.find('img')
        images = soup.find('img')
        image_link = "http:" + images['src']
#         print(image_link)
        data.append(image_link) 
        df = pd.DataFrame(data, columns=['poster_url'])
        df.insert(0, "Date", event_date) 
        df.insert(1, "Event", event_name)
        df.insert(2, "wikipage", event_url)
    except:
        print("Error")
        df = pd.DataFrame(["Not Found"], columns=['poster_url'])
        df.insert(0, "Date", event_date) 
        df.insert(1, "Event", event_name)
        df.insert(2, "wikipage", event_url)
    posters_df = posters_df.append(df, ignore_index=True)

In [None]:
posters_df['Date']= pd.to_datetime(posters_df['Date'],format= '%b %d, %Y')
posters_df = posters_df.sort_values('Date').reset_index(drop=True)
# posters_df[posters_df['poster_url']=="Not Found"]
# posters_df

In [None]:
# posters_df.loc[posters_df['Event'] == "UFC on Fox: Henderson vs. Melendez", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/8/84/UFC_on_Fox_Henderson_vs._Melendez_Poster.gif/220px-UFC_on_Fox_Henderson_vs._Melendez_Poster.gif"
# posters_df.loc[posters_df['Event'] == "UFC on FX: Belfort vs. Bisping", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/f/f5/UFC_on_FX_Belfort_vs._Bisping_poster.jpg/220px-UFC_on_FX_Belfort_vs._Bisping_poster.jpg"
# posters_df.loc[posters_df['Event'] == "UFC 149: Faber vs. Barão", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/f/f4/UFC_149_Faber_vs._Bar%C3%A3o_poster.jpg/220px-UFC_149_Faber_vs._Bar%C3%A3o_poster.jpg"
# posters_df.loc[posters_df['Event'] == "UFC on FX: Johnson vs. McCall", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/e/ea/UFC_on_FX_Johnson_vs._McCall.jpg/220px-UFC_on_FX_Johnson_vs._McCall.jpg"
# posters_df.loc[posters_df['Event'] == "UFC on Fuel TV: The Korean Zombie vs. Poirier", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/4/4c/UFC_on_Fuel_TV_Korean_Zombie_vs._Poirier_poster.jpg/220px-UFC_on_Fuel_TV_Korean_Zombie_vs._Poirier_poster.jpg"
# posters_df.loc[posters_df['Event'] == "UFC on Fuel TV: Sanchez vs. Ellenberger", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/4/4b/UFC_on_Fuel_TV_Sanchez_vs._Ellenberger_poster.jpg/220px-UFC_on_Fuel_TV_Sanchez_vs._Ellenberger_poster.jpg"
# posters_df.loc[posters_df['Event'] == "UFC on Fox: Evans vs. Davis", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/b/b3/UFC_on_Fox_Evans_vs._Davis_poster.jpg/220px-UFC_on_Fox_Evans_vs._Davis_poster.jpg"
# posters_df.loc[posters_df['Event'] == "UFC Fight Night: Stevenson vs. Guillard", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/b/bb/UFCFIGHTNIGHT9.jpg/220px-UFCFIGHTNIGHT9.jpg"

# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: Team Couture vs. Team Liddell Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/8/83/TUF_1_Finale_Poster_-_Fitness_Philippines.jpg/220px-TUF_1_Finale_Poster_-_Fitness_Philippines.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: Team Hughes vs. Team Franklin Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/c/c9/Tuf2Finale.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: Team Ortiz vs. Team Shamrock Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/f/f0/Tuf-3-finale.jpg/220px-Tuf-3-finale.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: The Comeback Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/4/42/UltimateFighter4.jpg/220px-UltimateFighter4.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: Team Pulver vs. Team Penn Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/a/a0/TUF_5_Finale.jpg/220px-TUF_5_Finale.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: Team Hughes vs. Team Serra Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/5/51/Ultimatefighter6.jpg/220px-Ultimatefighter6.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: Team Rampage vs. Team Forrest Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/5/5d/Ultimatefighter7.jpg/220px-Ultimatefighter7.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: United States vs. United Kingdom Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/a/a2/UFCSanchezGuida.jpg"
# posters_df.loc[posters_df['Event'] == "UFC on FX: Sotiropoulos vs. Pearson", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/8/81/The_smashes_finale.png/220px-The_smashes_finale.png"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter Nations Finale: Bisping vs. Kennedy", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/thumb/1/1b/TUF_Nations_Finale_event_poster.jpg/220px-TUF_Nations_Finale_event_poster.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter: A Champion Will Be Crowned Finale", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/1/1a/TUF_20_finale_event_poster.jpg"
# posters_df.loc[posters_df['Event'] == "The Ultimate Fighter Latin America 3 Finale: dos Anjos vs. Ferguson", 'poster_url'] = "http://upload.wikimedia.org/wikipedia/en/3/37/TUF_LA_3_Finale.jpg"

# Missing wiki page
# posters_df.loc[posters_df['Event'] == "UFC Ultimate Fight Night", 'poster_url'] = "https://m.media-amazon.com/images/M/MV5BMTBjMjJjMWEtYTc3Yi00YTIyLWEyMDMtMjg5NmIyYTFlMmJmXkEyXkFqcGdeQXVyNDczMDU5Nw@@._V1_.jpg"

In [None]:
# event_url = posters_df[posters_df['poster_url'].str.contains("Flag")]['wikipage'].iloc[6]
# url_request = requests.get(event_url).text
# soup = BeautifulSoup(url_request, 'html.parser')
# images = soup.find_all('img')
# for image in images:
#     image_link = "http:" + image['src']
#     print(image_link) 

In [None]:
# Writer posters_df to file
posters_df.to_csv("data/ufc_posters.csv", index=True)

In [16]:
posters_df = pd.read_csv("data/ufc_posters.csv")

In [19]:
posters_df.tail()

Unnamed: 0,id,Date,Event,wikipage,poster_url
508,508,2020-02-15,UFC Fight Night: Anderson vs. Błachowicz 2,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Anderson_vs._B%C5%82achowicz_2,http://upload.wikimedia.org/wikipedia/en/thumb/d/da/UFC_on_ESPN%2B_25.jpg/220px-UFC_on_ESPN%2B_25.jpg
509,509,2020-02-23,UFC Fight Night: Felder vs. Hooker,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Felder_vs._Hooker,http://upload.wikimedia.org/wikipedia/en/thumb/3/38/UFC_Fight_Night_Felder_vs._Hooker_Official_Poster.jpg/220px-UFC_Fight_Night_Felder_vs._Hooker_Official_Poster.jpg
510,510,2020-02-29,UFC Fight Night: Benavidez vs. Figueiredo,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Benavidez_vs._Figueiredo,http://upload.wikimedia.org/wikipedia/en/thumb/e/e5/UFC_Fight_Night-_Benavidez_vs_Figueiredo.jpeg/220px-UFC_Fight_Night-_Benavidez_vs_Figueiredo.jpeg
511,511,2020-03-07,UFC 248: Adesanya vs. Romero,https://en.wikipedia.org/wiki/UFC_248,http://upload.wikimedia.org/wikipedia/en/thumb/e/ee/UFC_248_Poster.jpg/220px-UFC_248_Poster.jpg
512,512,2020-03-14,UFC Fight Night: Lee vs. Oliveira,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Lee_vs._Oliveira,http://upload.wikimedia.org/wikipedia/en/thumb/5/5e/UFC_on_ESPN%2B_28.jpg/220px-UFC_on_ESPN%2B_28.jpg
