# Web Scraping for League Results and Odds

## Import packages

In [15]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
from datetime import datetime
import requests

## Get Working Directories

In [16]:
BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
DATA_DIR = os.path.join(BASE_DIR, 'data_out')
# Replace with the path to your chromedriver
CHROMEDRIVER_DIR = os.path.join(BASE_DIR, 'chromedriver-mac-x64','chromedriver')

## Create Season URLs

In [17]:
def getURLs():
    curr_year = datetime.now().year

    seasons = list(reversed(range(curr_year - 20, curr_year)))

    root_url = 'https://www.oddsportal.com/soccer/england/premier-league'
    results_path = '/results/'
    results_url = root_url + results_path

    #Get URLs for results pages for every season
    seasons_url = [root_url + '-' + str(season) + '-' + str(season + 1) + results_path for season in seasons]

    #complete url list to be scraped
    return [results_url] + seasons_url

## Initialize scroll function to click page links

In [18]:

def scroll_to_element_and_click(driver, element):
    # Scroll the element into view
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    # Add a small delay if needed for any animation or page shift
    time.sleep(3)
    try:
        button = driver.find_element(By.ID, "onetrust-accept-btn-handler")  # Replace with the actual button ID
        button.click()
        time.sleep(1)
    except NoSuchElementException:
        print("No cookies!")
    # Click the element after scrolling
    element.click()

## WebScrape OddsPortal for results

In [19]:
service = Service(executable_path=CHROMEDRIVER_DIR)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)
driver.implicitly_wait(2)

df = pd.DataFrame()
unique_matches = set()  # To track unique matches

all_urls = getURLs()
# Iterate over urls for seasons
for url in all_urls:
    driver.get(url)

    #Get every page for current URL
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'pagination')))
    pagination_container = driver.find_element(By.CLASS_NAME, 'pagination')
    pagination_links = pagination_container.find_elements(By.CLASS_NAME, 'pagination-link')

    if len(pagination_links) > 1:
        pagination_links = pagination_links[:-1]

    previous_page = None

    for link in pagination_links:
        current_page = link.get_attribute("data-number")
        
        # Click the pagination link and wait for the page to load
        if previous_page:
            #print(f"Navigating from page {previous_page} to page {current_page}")
            link.click()
        else:
            #print(f"Starting from page {current_page}")
            scroll_to_element_and_click(driver, link)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'eventRow')))

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        for row in soup.find_all('div', class_='eventRow'):
            if row.find('div', class_='text-black-main font-main w-full truncate text-xs font-normal leading-5'):
                curr_date = row.find('div', class_='text-black-main font-main w-full truncate text-xs font-normal leading-5').text.strip()

            if len(row.find_all('p', attrs={'data-v-18e31eaa': True})) >= 2 and len(row.find_all('p', class_='participant-name truncate')) > 1:
                home_team = row.find_all('p', class_='participant-name truncate')[0].text.strip()
                away_team = row.find_all('p', class_='participant-name truncate')[1].text.strip()

                # Create a unique identifier for each match
                match_id = (curr_date, home_team, away_team)

                if match_id not in unique_matches:
                    unique_matches.add(match_id)  # Add the match to the set
                    new_row = pd.DataFrame([{
                        'season': soup.find('a', 'active-item-calendar').text.strip(),
                        'date': curr_date,
                        'home_team': home_team,
                        'away_team': away_team,
                        'h_goals': row.find_all('div', class_='min-mt:!flex')[0].text.strip(),
                        'a_goals': row.find_all('div', class_='min-mt:!flex')[1].text.strip(),
                        'h_odds': row.find_all('p', attrs={'data-v-18e31eaa': True})[0].text.strip(),
                        'd_odds': row.find_all('p', attrs={'data-v-18e31eaa': True})[1].text.strip(),
                        'a_odds': row.find_all('p', attrs={'data-v-18e31eaa': True})[2].text.strip()
                    }])
                    df = pd.concat([df, new_row], ignore_index=True)

        previous_page = current_page

driver.quit()

No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!
No cookies!


TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x0000000109a77f68 chromedriver + 7110504
1   chromedriver                        0x0000000109a6ff6a chromedriver + 7077738
2   chromedriver                        0x00000001094110f0 chromedriver + 397552
3   chromedriver                        0x000000010945d383 chromedriver + 709507
4   chromedriver                        0x000000010945d681 chromedriver + 710273
5   chromedriver                        0x00000001094a2e14 chromedriver + 994836
6   chromedriver                        0x000000010948193d chromedriver + 858429
7   chromedriver                        0x00000001094a0234 chromedriver + 983604
8   chromedriver                        0x00000001094816b3 chromedriver + 857779
9   chromedriver                        0x0000000109450182 chromedriver + 655746
10  chromedriver                        0x000000010945115e chromedriver + 659806
11  chromedriver                        0x0000000109a3d3b0 chromedriver + 6869936
12  chromedriver                        0x0000000109a412e4 chromedriver + 6886116
13  chromedriver                        0x0000000109a1f9b7 chromedriver + 6748599
14  chromedriver                        0x0000000109a41d6e chromedriver + 6888814
15  chromedriver                        0x0000000109a0ec84 chromedriver + 6679684
16  chromedriver                        0x0000000109a5e838 chromedriver + 7006264
17  chromedriver                        0x0000000109a5e9f6 chromedriver + 7006710
18  chromedriver                        0x0000000109a6fb78 chromedriver + 7076728
19  libsystem_pthread.dylib             0x00007ff81537018b _pthread_start + 99
20  libsystem_pthread.dylib             0x00007ff81536bae3 thread_start + 15


In [21]:
df.to_csv(os.path.join(DATA_DIR,  'matches.csv'), index = False)