In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Season calendar page on zerozero.pt where scraping starts (the URL requests page=1).
url_zerozero = 'https://www.zerozero.pt/edition_matches.php?fase_in=164113&equipa=0&id_edicao=165864&filtro=&page=1&op=calendario'

# ChromeDriverManager().install() returns the path of a chromedriver binary.
# Selenium 4 expects that path wrapped in a Service object; passing it as the
# first positional argument is deprecated and rejected by current releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [3]:
# Point the browser at the calendar page; every later cell reads from whatever
# page this driver currently has loaded.
driver.get(url=url_zerozero)

In [4]:
# The "Calendário" page on ZeroZero lists the games of the season.
# The goal is to collect that list into a DataFrame: one row per match and one
# column per property (date, home, result, away, phase).

# Page limit used by the scraping loop.
# NOTE: the loop iterates range(0, number_of_pages-1), so a value of 6 means
# that 5 pages are visited.
number_of_pages = 6

# Accumulator for the scraped matches: one dict per match.
matches = list()

In [5]:
number_extracted_correctly = 0
number_bad_records = 0

# Each output column is read from the cell whose CSS class has the same name.
match_fields = ('date', 'home', 'result', 'away', 'phase')

# Loop over the result pages.
# NOTE(review): this visits number_of_pages-1 pages (5 when number_of_pages is 6),
# which matches the recorded output; confirm that this is the intended page count.
for i in range(0, number_of_pages-1):

    # Every <tr> in the body of the "zztable stats" table is a candidate match row.
    selections = driver.find_elements(By.XPATH, ".//table[@class='zztable stats']/tbody/tr")

    for s in selections:
        try:
            # Build the whole record before storing it, so a row missing any
            # one of the fields is skipped entirely rather than half-saved.
            o = {field: s.find_element(By.CLASS_NAME, field).text for field in match_fields}
        except NoSuchElementException:
            # The row lacks one of the expected cells (presumably a header or
            # separator row): count it and move on. Only this exception is
            # caught, so genuine failures (API errors, Ctrl-C) are not hidden
            # as "bad records".
            number_bad_records += 1
        else:
            matches.append(o)
            number_extracted_correctly += 1

    # Read the current page number from the pagination widget and, unless this
    # is the last page to visit, click the link labelled with the next number.
    selection = int(driver.find_element(By.XPATH, ".//div[@class='zz-pagination']/div/span").text)
    if selection != (number_of_pages-1):
        driver.find_element(By.LINK_TEXT, str(selection+1)).click()

        # 2 second delay required, else the new page will not be loaded when
        # its rows are selected on the next iteration.
        time.sleep(2)

#driver.close()

print(f"Number of correctly extracted records: {number_extracted_correctly}")
print(f"Number of bad records that were ignored: {number_bad_records}")

Number of correctly extracted records: 250
Number of bad records that were ignored: 5


In [6]:
# Turn the list of per-match dictionaries into a table: each dictionary becomes
# a row and the dictionary keys become the column headers.
df_matches = pd.DataFrame(matches)
df_matches

Unnamed: 0,date,home,result,away,phase
0,2022-08-05,Benfica,4-0,FC Arouca,J1
1,2022-08-06,Rio Ave,0-1,FC Vizela,J1
2,2022-08-06,Estoril Praia,2-0,FC Famalicão,J1
3,2022-08-06,FC Porto,5-1,Marítimo,J1
4,2022-08-07,Santa Clara,0-0,Casa Pia,J1
...,...,...,...,...,...
245,2023-04-16,FC Famalicão,vs,Vitória SC,J28
246,2023-04-16,Rio Ave,vs,Casa Pia,J28
247,2023-04-16,Marítimo,vs,Paços de Ferreira,J28
248,2023-04-16,FC Porto,vs,Santa Clara,J28
