In [None]:
import sys
# Show the version of the interpreter this notebook is running on
print(sys.version)

In [None]:
# Optional if the packages are already installed; kept to make the BeautifulSoup4 dependency explicit.
# Calling pip through sys.executable installs into the interpreter that runs this kernel.
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install BeautifulSoup4

In [None]:
from bs4 import BeautifulSoup
from time import sleep
import requests
import pandas as pd
import json
import csv
import time
import datetime

In [None]:
# Accumulator filled by extract_row: one dict per scraped table row
Events = list()
# Root of the site; event links are appended to it
base_url = 'https://en.wikipedia.org'
# Page holding the list of UFC events
main_url = '{}/wiki/List_of_UFC_events'.format(base_url)

In [None]:
def perform_http_get(url, max_attempts=3, retry_delay=3, timeout=30):
    """
    Fetch a URL and parse the response body as HTML.

    Note:
        A failed attempt (network error, timeout, or a retryable HTTP status)
        is retried after ``retry_delay`` seconds, up to ``max_attempts``
        attempts in total.

    Args:
        url (str): The URL to request.
        max_attempts (int): Total number of attempts before giving up
            (values below 1 are treated as 1).
        retry_delay (float): Seconds to wait between two attempts.
        timeout (float): Seconds to wait for the server before an attempt
            counts as failed.

    Returns:
        A BeautifulSoup document built from the response content, or None
        when no attempt produced an HTTP 200 response.

    Raises:
        requests.exceptions.RequestException: If the last attempt fails with
            a network-level error.
    """
    max_attempts = max(1, max_attempts)
    for attempt in range(1, max_attempts + 1):
        is_last_attempt = attempt == max_attempts
        try:
            # Without a timeout a stalled connection would block forever
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            if is_last_attempt:
                raise
        else:
            if r.status_code == 200:
                return BeautifulSoup(r.content, 'html.parser')
            # Client errors other than "429 Too Many Requests" will not
            # change on a second try, so give up immediately
            if 400 <= r.status_code < 500 and r.status_code != 429:
                return None
        if not is_last_attempt:
            sleep(retry_delay)
    return None
        

In [None]:
def extract_cell(cells, id_td):
    """
    Return the rendered contents of one cell as a stripped string.

    Args:
        cells: Indexable collection of cell objects exposing ``renderContents()``.
        id_td: Position of the wanted cell inside ``cells``.

    Returns:
        The cell's rendered contents, decoded and with surrounding
        whitespace removed.
    """
    cell = cells[id_td]
    raw_contents = cell.renderContents()
    text = raw_contents.decode()
    return text.strip()

In [None]:
def append_fighter_names(cells_event, info):
    """
    Add the two fighter names of a row to ``info``.

    The names are read from cells 1 and 3. When a cell contains at least one
    ``<a>`` element, the contents of the first one are used; otherwise the
    contents of the whole cell are used.

    Args:
        cells_event: The cells of one table row.
        info (dict): Record updated in place with the keys ``fighter_1``
            and ``fighter_2``.
    """
    def fighter_name(position):
        # Prefer the text of the link when the fighter name is linked
        cell = cells_event[position]
        if len(cell.findAll('a')) != 0:
            return cell.find('a').renderContents().decode().strip()
        return extract_cell(cells_event, position)

    # Resolve both names before touching info, as the record is only
    # updated once both cells were read successfully
    first_name = fighter_name(1)
    second_name = fighter_name(3)

    info.update({"fighter_1": first_name, "fighter_2": second_name})

In [None]:
def extract_row(cells_event, link):
    """
    Build one record from a table row and append it to ``Events``.

    Args:
        cells_event: The cells of one table row; cells 0, 2, 4, 5 and 6 are
            read here, cells 1 and 3 by ``append_fighter_names``.
        link: The anchor of the event; its first child becomes ``event_name``.
    """
    # Field name -> position of the cell it is read from
    cell_positions = (
        ("weight_class", 0),
        ("action", 2),
        ("method", 4),
        ("round", 5),
        ("time", 6),
    )

    info = {"event_name": link.contents[0]}
    for field, position in cell_positions:
        info[field] = extract_cell(cells_event, position)

    append_fighter_names(cells_event, info)
    Events.append(info)

In [None]:
def extract_info_individual_event(link):
    """
    Download the page of one event and record every fight row it contains.

    The page is looked up through the ``href`` of ``link``, relative to
    ``base_url``. Rows are taken from the first table with the class
    ``toccolours`` and handed to ``extract_row``.

    Nothing is recorded when the link has no site-relative target, when the
    page cannot be retrieved, or when it has no such table.

    Args:
        link: The anchor pointing to the event page.
    """
    # extract_row and append_fighter_names read cell indices 0 to 6
    min_cells = 7

    href = link.get('href')
    # base_url + href is only a valid address for site-relative paths;
    # this also skips anchors without target and in-page "#..." references
    if not href or not href.startswith('/'):
        return

    url = base_url + href
    individual_event = perform_http_get(url)
    if individual_event is None:
        print('Skipping {}: page could not be retrieved'.format(url))
        return

    table = individual_event.find('table', {'class': 'toccolours'})
    if table is None:
        return

    for row_event in table.findAll('tr'):
        cells_event = row_event.findAll('td')
        # Rows with too few cells would raise IndexError in extract_row
        # and abort the whole scrape
        if len(cells_event) >= min_cells:
            extract_row(cells_event, link)
                        

In [None]:
# Start from an empty list so that re-running this cell does not duplicate rows
Events = []
soup = perform_http_get(main_url)
if soup is None:
    raise RuntimeError('Could not retrieve the list of events: ' + main_url)

table_past_events = soup.find('table', {'id': 'Past_events'})
if table_past_events is None:
    raise RuntimeError("No table with id 'Past_events' found on " + main_url)

rows = table_past_events.findAll('tr')

for row in rows:
    cells = row.findAll('td')
    # The event links are read from the second cell, so at least two cells are needed
    if len(cells) > 1:
        links = cells[1].findAll('a')
        for link in links:
            # Wait 10 sec before every page request (recommendations explained below);
            # rows that trigger no request cause no wait
            sleep(10)
            extract_info_individual_event(link)

In [None]:
print(len(Events))

# Display order of the columns; passing it to the constructor also yields a
# well-formed empty frame when nothing was scraped, instead of a KeyError
event_columns = ['event_name', 'weight_class', 'fighter_1', 'action', 'fighter_2', 'round', 'time', 'method']
df = pd.DataFrame(Events, columns=event_columns)

df.tail()
