### Let us learn Selenium with Python
- Selenium is a powerful tool for controlling web browsers through programs and performing browser automation.
- It is functional for all browsers, works on all major OS and its scripts are written in various languages i.e Python, Java, C#, etc

### 1. Data extraction
#### 1.1 Import libraries

In [18]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import chromedriver_autoinstaller
from bs4 import BeautifulSoup
import pandas as pd
import re

# install the libraries for webpages interactions
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

We will try to replicate this tutorial by David Zerpa on his project [here](https://www.linkedin.com/feed/update/urn:li:activity:7196180482084986880?utm_source=share&utm_medium=member_desktop)

#### 1.2 Creating services in Google Chrome to run session called 'Driver'

In [2]:
def setup_selenium():
    chromedriver_autoinstaller.install()
    options = Options()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    service = Service(executable_path=chromedriver_autoinstaller.install()) 
    driver = webdriver.Chrome(service=service, options=options)
    return driver


#### 1.3 Creating a function to extract data from the website

In [3]:
url = "https://www.flashscore.com"

In [4]:
def accept_cookies(driver):
    try:
        driver.get(url)

        cookie_button = WebDriverWait(driver, 10).until (
            EC.element_to_be_clickable((BY.ID, "onetrust-accept-btn-handler"))
        )
        cookie_button.click()
        print("Cookies accepted successfully")
    except Exception as e:
            print("Cookies button not found")
            print(e)

![imageParse](images/laliga_league.png)

In [16]:
css_1 = "#my-leagues-list"
# css_2 is the laliga link
css_2 = "#my-leagues-list > div:nth-child(1) > div:nth-child(6) > a > span.leftMenu__text"
url_2 = "https://www.flashscore.com/football/spain/laliga/"

def navigate_to_page(driver):

    accept_cookies(driver)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_1))
        )
        # use javascript to click on the element
        link_1 = driver.find_element(By.CSS_SELECTOR, css_2)
        driver.execute_script("arguments[0].click();", link_1)

        print(f"Successfully navigated to {url_2}")
    except Exception as e:
        print(f"Failed to navigate, error {e}")

![imageParse](images/laliga_league2.png)

In [6]:
def click_results_tab(driver):
    try:
        results_tab = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "li2"))
        )
        results_tab.click()
        print("Results tab clicked successfully")
    except Exception as e:
        print(f"Failed to click results tab, error {e}")

In [7]:
def click_show_more_matches(driver):
    try:
        while True:
            try:
                WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "a.event__more.event__more--static"))
                )
                show_more_button = driver.find_element(By.CSS_SELECTOR, "a.event__more.event__more--static")
                show_more_button.click()

                print("Show more matches clicked successfully")
            except ElementClickInterceptedException:
                show_more_button = driver.find_element(By.CSS_SELECTOR, "a.event__more.event__more--static")
                driver.execute_script("arguments[0].click();", show_more_button)
            except StaleElementReferenceException:
                print("Encountered a stale element, retrying...")
                continue
    except TimeoutException:
        pass
    except Exception as e:
        print(f"Failed to click show more matches, error {e}")

In [19]:
def extract_match_data (driver, season_text):
    matches = []
    current_round = None

    elements = driver.find_elements(By.CSS_SELECTOR, "div.leagues--static > div, div.leagues--static > div > div")
    match = re.search(r'(\d{4})/(\d{4})', season_text) # extract the season from the text
    if match:
        first_year = match.group(1)
        second_year = match.group(2)
    else:
        first_year = None
        second_year = None
    
    for element in elements:
        class_attr = element.get_attribute("class")
        if "event__round--static" in class_attr:
            current_round = element.text.replace("ROUND", "").strip()
        elif " event__match--static" in class_attr and current_round:
            date_time = element.find_element(By.CSS_SELECTOR, "div.event__time").text
            date_parts = date_time.split(' ')[0].split('.')
            if len(date_parts) >= 2:
                day, month = date_parts[0], date_parts[1]
                year = second_year if int(month) >= 7 else first_year
                full_date = f"{day}/{month}/{year}"
            else:
                continue

            home_team = element.find_element(By.CSS_SELECTOR, "div.event__participant--home").text
            away_team = element.find_element(By.CSS_SELECTOR, "div.event__participant--away").text
            home_score = element.find_element(By.CSS_SELECTOR, "div.event__score--home").text
            away_score = element.find_element(By.CSS_SELECTOR, "div.event__score--away").text

            match_data = {
                "Season": f"{first_year}/{second_year}",
                "Round": current_round,
                "Date": full_date,
                "Home Team":home_team,
                "Away Team": away_team,
                "Home Score": int(home_score),
                "Away Score": int(away_score),
                "Total Goals": int(home_score) + int(away_score),
                "Result": determine_result(int(home_score), int(away_score))
            }
            matches.append(match_data)

In [9]:
def determine_result(home_score, away_score):
    if home_score > away_score:
        return "Home Win"
    elif home_score < away_score:
        return "Away Win"
    else:
        return "Draw"

In [11]:
def navigate_to_archive(driver):

    navigate_to_page(driver)
  
    try:
        archive_tab = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#li5"))
        )
        archive_tab.click()
        print("Archive link clicked successfully")
    except Exception as e:
        print(f"Failed to click archive link, error {e}")

In [12]:
def get_season_links(driver):
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.archive__season a"))
        )
        season_links = driver.find_elements(By.CSS_SELECTOR, "div.archive__season a")
        
        return [(season.text, season.get_attribute("href")) for season in season_links]
    except Exception as e:
        print(f"Failed to get season links, error {e}")
        return []

In [13]:
def extract_data_for_all_seasons(driver):
    navigate_to_archive(driver)
    seasons = get_season_links(driver)
    all_matches = []

    for season_text, season_url in seasons:
        driver.get(season_url)
        click_results_tab(driver)
        click_show_more_matches(driver)
        season_matches = extract_match_data(driver, season_text)
        for match in season_matches:
            match["Season"] = season_text
        all_matches.append(season_matches)
        print(f"Extracted {len(season_matches)} matches for {season_text}")
    return all_matches

In [14]:
def main():
    driver = setup_selenium()
    try:
        all_matches_data = extract_data_for_all_seasons(driver)
        df = pd.DataFrame(all_matches_data)
        df.to_csv("laliga_matches.csv", index=False)
        print("Data extracted successfully")
        return df
    finally:
        driver.quit()

In [20]:
matches = main()

Cookies button not found
name 'BY' is not defined
Successfully navigated to https://www.flashscore.com/football/spain/laliga/
Archive link clicked successfully
Results tab clicked successfully
Failed to click show more matches, error name 'StaleElementReferenceException' is not defined


TypeError: 'NoneType' object is not iterable