# Web Scraping for League Results and Odds

## Import packages

In [123]:
import os
import time
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager

## Get Working Directories

In [124]:
BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
DATA_DIR = os.path.join(BASE_DIR, 'data_out')
# Replace with the path to your chromedriver
CHROMEDRIVER_DIR = os.path.join(BASE_DIR, 'chromedriver-mac-x64','chromedriver')

## Create Postgres Connection

In [125]:
db_user = 'db_user'    # Replace with your PostgreSQL username
db_password = 'db_password'  # Replace with your PostgreSQL password
db_host = 'localhost'      # Replace with your PostgreSQL host (e.g., localhost or IP)
db_port = '5432'           # PostgreSQL port (default is 5432)
db_name = 'db_name'  # Replace with your PostgreSQL db name  

connection_string = f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"

# Create the SQLAlchemy engine
engine = create_engine(connection_string)

In [126]:
#Create query
query = "SELECT * FROM raw_match_data"
# Read the dfFrame from a PostgreSQL table
try:
    df = pd.read_sql(query, engine)
    print("Table queried successfully.")
    
finally:
    # Ensure connection is closed
    engine.dispose()
    print("Connection closed.")

Table queried successfully.
Connection closed.


In [127]:
if len(df) > 0:
    recent_date = datetime.strptime(df['date'][0], "%d %b %Y")
else:
    recent_date = datetime.strptime("10 Jan 2000", "%d %b %Y")

In [128]:
# Show the resume cutoff: the scraping loop stops at the first match dated
# before this value.
print(recent_date)

2025-01-25 00:00:00


## Create Season URLs

In [129]:
def getURLs(today=None, season_start_month=8, n_past_seasons=20):
    """Build the OddsPortal results URLs to scrape, newest season first.

    The first URL is the un-suffixed ``/results/`` page; it is followed by one
    archive URL per past season (``premier-league-YYYY-YYYY+1/results/``).

    The archive list is season-aware: from ``season_start_month`` onwards the
    season that started in the previous calendar year is already an archive
    season and must be included. Always starting the archive at ``year - 2``
    would skip that season for the second half of every calendar year.

    NOTE(review): assumes the un-suffixed results page shows the season in
    progress and that it rolls over around August -- confirm against the site.

    Args:
        today: Date used to decide which season is in progress. Defaults to
            ``datetime.now()``.
        season_start_month: Calendar month (1-12) in which a new season starts.
        n_past_seasons: Number of archived seasons to include.

    Returns:
        list[str]: The current-season URL followed by ``n_past_seasons``
        archive URLs in descending season order.
    """
    if today is None:
        today = datetime.now()

    # Calendar year in which the season currently in progress started.
    if today.month >= season_start_month:
        current_season_start = today.year
    else:
        current_season_start = today.year - 1

    root_url = 'https://www.oddsportal.com/soccer/england/premier-league'
    results_path = '/results/'
    results_url = root_url + results_path

    # Start years of the archived seasons, newest first.
    newest_archived = current_season_start - 1
    past_starts = range(newest_archived, newest_archived - n_past_seasons, -1)

    # Get URLs for results pages for every archived season
    seasons_url = [
        f"{root_url}-{start}-{start + 1}{results_path}" for start in past_starts
    ]

    # complete url list to be scraped
    return [results_url] + seasons_url

## Initialize scroll function to click page links

In [130]:

def scroll_to_element_and_click(driver, element):
    """Bring ``element`` into view, dismiss the cookie banner if present, then click it.

    Args:
        driver: Selenium WebDriver controlling the page.
        element: The WebElement to scroll to and click.
    """
    # Ask the browser to scroll so that the target element is visible.
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    # Fixed pause to let any animation or layout shift settle.
    time.sleep(3)

    try:
        # Look up the consent button by its element id and accept it.
        consent_button = driver.find_element(By.ID, "onetrust-accept-btn-handler")
        consent_button.click()
        time.sleep(1)
    except NoSuchElementException:
        # No consent button on the page -- nothing to dismiss.
        print("No cookies!")

    # Finally click the element that was scrolled into view.
    element.click()

## WebScrape OddsPortal for results

In [131]:
# Scrape match results and odds page by page, newest first, stopping at the
# first match dated before `recent_date`.
service = Service(executable_path=CHROMEDRIVER_DIR)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)
driver.implicitly_wait(2)

# CSS class string that identifies the date header inside an event row.
DATE_HEADER_CLASS = 'text-black-main font-main w-full truncate text-xs font-normal leading-5'

# new_df stays empty unless the whole scrape completes: persisting a partial,
# newest-first scrape would move the resume date past matches never collected.
new_df = pd.DataFrame()
records = []            # One dict per scraped match; turned into a frame once at the end
unique_matches = set()  # To track unique matches
breakAll = False
# Date header most recently seen. It carries over between rows and pages,
# because only the first match of a day has a header.
curr_date = None

all_urls = getURLs()
try:
    # Iterate over urls for seasons
    for url in all_urls:
        driver.get(url)

        # Get every page for current URL
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'pagination')))
        pagination_container = driver.find_element(By.CLASS_NAME, 'pagination')
        pagination_links = pagination_container.find_elements(By.CLASS_NAME, 'pagination-link')
        if len(pagination_links) > 1:
            # Drop the last link (presumably the "next" control -- TODO confirm).
            pagination_links = pagination_links[:-1]

        previous_page = None

        for link in pagination_links:
            current_page = link.get_attribute("data-number")

            # Click the pagination link and wait for the page to load. The first
            # click of each URL goes through the helper, which also scrolls and
            # dismisses the cookie banner.
            if previous_page:
                link.click()
            else:
                scroll_to_element_and_click(driver, link)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'eventRow')))

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for row in soup.find_all('div', class_='eventRow'):
                date_header = row.find('div', class_=DATE_HEADER_CLASS)
                if date_header is not None:
                    curr_date = date_header.text.strip()

                # Only rows that contain two participants describe a match.
                participants = row.find_all('p', class_='participant-name truncate')
                if len(row.find_all('p', attrs={'data-v-a4e7076e': True})) < 2 or len(participants) <= 1:
                    continue
                # No date header seen yet: the match cannot be dated, skip it.
                if curr_date is None:
                    continue

                home_team = participants[0].text.strip()
                away_team = participants[1].text.strip()

                # Relative date labels cannot be parsed as "%d %b %Y"; skip them.
                if 'Yesterday' in curr_date or 'Today' in curr_date:
                    continue
                curr_date_datetime = datetime.strptime(curr_date, "%d %b %Y")
                if curr_date_datetime < recent_date:
                    # Reached matches older than the cutoff: stop everything.
                    breakAll = True
                    break

                # Create a unique identifier for each match
                match_id = (curr_date, home_team, away_team)
                if match_id in unique_matches:
                    continue
                unique_matches.add(match_id)  # Add the match to the set

                goal_cells = row.find_all('div', class_='min-mt:!flex')
                odds_cells = row.find_all('p', attrs={'data-v-34474325': True})
                if len(odds_cells) == 3:
                    h_odds, d_odds, a_odds = (cell.text.strip() for cell in odds_cells)
                else:
                    # No complete home/draw/away odds triple: store the placeholder.
                    h_odds = d_odds = a_odds = '100'

                records.append({
                    'season': soup.find('a', 'active-item-calendar').text.strip(),
                    'date': curr_date,
                    'home_team': home_team,
                    'away_team': away_team,
                    'h_goals': goal_cells[0].text.strip(),
                    'a_goals': goal_cells[1].text.strip(),
                    'h_odds': h_odds,
                    'd_odds': d_odds,
                    'a_odds': a_odds
                })

            if breakAll:
                break

            previous_page = current_page

        if breakAll:
            break
finally:
    # Always close the browser, even when a wait times out or a click fails.
    driver.quit()

# Build the frame once instead of concatenating inside the loop (quadratic).
new_df = pd.DataFrame(records)

In [133]:
# Put the freshly scraped matches in front of the stored ones.
merged = pd.concat([new_df, df], ignore_index=True)

# The scraper re-collects matches dated exactly `recent_date` (its cutoff is a
# strict "<"), and re-running this cell would prepend new_df again. Drop
# repeated matches so the table never accumulates duplicates. keep='first'
# keeps the newly scraped version of a match.
match_key = ['date', 'home_team', 'away_team']
if set(match_key).issubset(merged.columns):
    merged = merged.drop_duplicates(subset=match_key, keep='first').reset_index(drop=True)

df = merged

In [None]:
# Create the SQLAlchemy engine used for the write
engine = create_engine(connection_string)

# Write the DataFrame to a PostgreSQL table
try:
    # if_exists='replace' replaces any existing raw_match_data table with the
    # contents of df; index=False keeps the DataFrame index out of the table.
    df.to_sql('raw_match_data', engine, index=False, if_exists='replace')

    print("Table created successfully.")
except Exception as e:
    # Report the failure without raising.
    print(f"Error: {e}")
finally:
    # Release the engine's connections, as the read cell does.
    engine.dispose()

Table created successfully.
