In [4]:
import json
import os
import re
import time

import pandas as pd
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


### TABLES

In [5]:
# League name -> OddsPortal league landing page.
# (The original literal listed "Ligue 1" twice with the same URL; a dict keeps
# only one entry per key, so the duplicate was dead text.)
leagues = {
    "Premier League": "https://www.oddsportal.com/football/england/premier-league/",
    "Ligue 1": "https://www.oddsportal.com/football/france/ligue-1/",
    "Bundesliga": "https://www.oddsportal.com/football/germany/bundesliga/",
    "Serie A": "https://www.oddsportal.com/football/italy/serie-a/",
    "LaLiga": "https://www.oddsportal.com/football/spain/laliga/",
    "Eredivisie": "https://www.oddsportal.com/football/netherlands/eredivisie/",
}

# Single-league subset of `leagues` (presumably for quick test runs).
leagues2 = {
    "Premier League": "https://www.oddsportal.com/football/england/premier-league/",
}

# Market code -> URL fragment appended to a game URL to open that market tab.
# The digit after ';' is 2, 3 or 4 for the FT, H1 and H2 codes respectively.
odds_pages = {
    "1X2FT": "#1X2;2",
    "1X2H1": "#1X2;3",  # was "##1X2;3": stray extra '#'
    "1X2H2": "#1X2;4",  # was "##1X2;4": stray extra '#'
    "DCFT": "#double;2",
    "DCH1": "#double;3",
    "DCH2": "#double;4",
    "DNBFT": "#dnb;2",
    "DNBH1": "#dnb;3",
    "DNBH2": "#dnb;4",
    "HTFT": "#ht-ft;2",
    "BTSFT": "#bts;2",
    "BTSH1": "#bts;3",
    "BTSH2": "#bts;4",
    "OUFT": "#over-under;2",
    "OUH1": "#over-under;3",
    "OUH2": "#over-under;4",
    "CSFT": "#cs;2",
    "CSH1": "#cs;3",
    "CSH2": "#cs;4",
    "AHFT": "#ah;2",
    "AHH1": "#ah;3",
    "AHH2": "#ah;4",
}

### GETTING THE UPCOMING GAMES

In [20]:
def get_upcoming_games(leagues):
    """Scrape upcoming fixtures from each league page's JSON-LD metadata.

    Every ``<script type="application/ld+json">`` block whose ``@type`` is
    ``SportsEvent`` becomes one output row.

    Args:
        leagues: Mapping of league name -> league page URL. (The previous
            version ignored this argument and always iterated the global
            ``leagues2``.)

    Returns:
        pandas.DataFrame with columns ``Match``, ``Date``, ``Venue``, ``URL``.
        The columns are present even when no fixture was found.
    """
    columns = ["Match", "Date", "Venue", "URL"]
    if not leagues:
        # Nothing to scrape: don't start a browser at all.
        return pd.DataFrame(columns=columns)

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")

    driver = uc.Chrome(options=chrome_options)
    game_data = []

    try:
        for url in leagues.values():
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            for script in soup.find_all("script", type="application/ld+json"):
                # An empty <script> has .string == None, which json.loads rejects
                # with TypeError (not JSONDecodeError).
                if not script.string:
                    continue
                try:
                    data = json.loads(script.string)
                except json.JSONDecodeError:
                    continue

                # JSON-LD may also be a list or another schema type; skip those.
                if not isinstance(data, dict) or data.get("@type") != "SportsEvent":
                    continue

                event_url = data.get("url")
                if not event_url:
                    # A row without a URL cannot be used to fetch odds later.
                    continue

                location = data.get("location")
                venue = location.get("name") if isinstance(location, dict) else None

                game_data.append({
                    "Match": data.get("name"),
                    "Date": data.get("startDate"),
                    "Venue": venue,
                    "URL": event_url,
                })
    finally:
        # Always release the browser, even if a page load or parse step raised.
        driver.quit()

    return pd.DataFrame(game_data, columns=columns)

### Get Historical Games

In [7]:
def get_historic_games(leagues, no_previous_yrs=12):
    """Collect game links from the results archive of each league.

    For every league the ``results/`` page is opened, the first
    ``no_previous_yrs`` ``<option value=...>`` entries are taken as season
    links, and every pagination page of each season is scraped for game rows.

    Args:
        leagues: Mapping of league name -> league page URL (with trailing slash).
        no_previous_yrs: Maximum number of season links to follow per league.

    Returns:
        pandas.DataFrame with columns ``League``, ``Game``, ``URL``. The columns
        are present even when nothing was scraped.
    """
    columns = ['League', 'Game', 'URL']
    if not leagues:
        # Nothing to scrape: don't start a browser at all.
        return pd.DataFrame(columns=columns)

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")

    driver = uc.Chrome(options=chrome_options)
    game_data = []

    try:
        for league, url in leagues.items():
            driver.get(url + "results/")
            soup = BeautifulSoup(driver.page_source, "html.parser")
            season_links = [
                option["value"]
                for option in soup.find_all("option")
                if "value" in option.attrs
            ][:no_previous_yrs]

            print(season_links)

            for season_link in season_links:
                print(f'Navigating to season: {season_link}')
                driver.get(season_link)

                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "pagination-link"))
                    )
                except TimeoutException:
                    # One slow or empty season must not abort the whole run.
                    print(f"Timed out waiting for pagination on {season_link}; skipping season.")
                    continue

                driver.execute_script("window.scrollBy(1, window.innerHeight);")
                soup = BeautifulSoup(driver.page_source, "html.parser")

                # dict.fromkeys de-duplicates while keeping order, so a page number
                # that appears on several pagination links is scraped only once.
                pages = list(dict.fromkeys(
                    anchor["data-number"]
                    for anchor in soup.find_all("a", class_="pagination-link")
                    if "data-number" in anchor.attrs
                ))

                print(f'Pages found: {pages}')

                for page in pages:
                    print(f"Scraping page {page}")

                    # data-number is a string attribute; the old `page == 1`
                    # comparison against an int could never be true.
                    is_first_page = str(page).strip() == "1"
                    page_url = season_link if is_first_page else f"{season_link}#/page/{page}/"
                    driver.get(page_url)

                    try:
                        # NOTE(review): this only checks that pagination links exist,
                        # not that the rows for `page` have finished rendering - confirm.
                        WebDriverWait(driver, 4).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "pagination-link"))
                        )
                    except TimeoutException:
                        print(f"Timed out waiting for page {page} of {season_link}; skipping page.")
                        continue

                    driver.execute_script("window.scrollBy(1, window.innerHeight);")
                    soup = BeautifulSoup(driver.page_source, "html.parser")

                    games = soup.find_all(
                        'div',
                        class_="border-black-borders border-b border-l border-r hover:bg-[#f9e9cc]",
                    )

                    for game_row in games:
                        # Extract game URL (relative href of the first link in the row)
                        anchor = game_row.find('a', href=True)
                        game_url = anchor['href'] if anchor else "No URL"

                        # Extract teams (e.g., Arsenal - Manchester City)
                        teams = game_row.find_all('p', class_='participant-name')
                        if len(teams) == 2:
                            game = f"{teams[0].get_text(strip=True)} - {teams[1].get_text(strip=True)}"
                        else:
                            game = "No Teams Found"

                        game_data.append({
                            'League': league,
                            'Game': game,
                            'URL': f"https://www.oddsportal.com{game_url}"
                        })
    finally:
        # The original never closed the browser; always release it.
        driver.quit()

    return pd.DataFrame(game_data, columns=columns)

### GET 1X2 TYPE ODDS

In [8]:
def get_1x2_odds(url, title, ext, max_retries=100):
    """Scrape the 'Average' row of a three-way (1 / X / 2) market tab.

    Args:
        url: Game page URL.
        title: Column prefix; results are keyed ``{title}_1``, ``{title}_X``,
            ``{title}_2``.
        ext: URL fragment selecting the market tab (e.g. ``"#1X2;2"``).
        max_retries: Maximum number of attempts; a fresh browser is started
            for each attempt.

    Returns:
        A one-element list holding a dict with the three odds as strings.
        When the odds cannot be read, every value is the placeholder ``'-'``.
    """
    # Placeholder values are plain strings, matching the type of scraped values
    # (the previous default wrapped each '-' in a list).
    average_odds = [{f'{title}_1': '-',
                     f'{title}_X': '-',
                     f'{title}_2': '-'}]

    for attempt in range(1, max_retries + 1):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # NOTE(review): not a '--' style switch; confirm this argument has any effect.
        chrome_options.add_argument("chrome-version=132")

        driver = uc.Chrome(options=chrome_options)
        try:
            driver.get(url + ext)
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.XPATH, "//p[text()='Average']"))
            )

            soup = BeautifulSoup(driver.page_source, "html.parser")
            average_section = soup.find("p", class_="height-content", string="Average")

            if average_section:
                parent_div = average_section.find_parent("div", class_="border-black-borders")
                cells = parent_div.find_all("div", class_="border-black-borders")
                # Keep the scraped values in their own name so that a short row
                # (IndexError below) cannot clobber the default result.
                values = [cell.find("p", class_="height-content").text.strip() for cell in cells]
                average_odds = [{f'{title}_1': values[0],
                                 f'{title}_X': values[1],
                                 f'{title}_2': values[2]}]
            break

        except Exception as e:
            print(f"Error on attempt {attempt}: {title} ({e!r})")
            if attempt < max_retries:
                print(f"Retrying... ({attempt}/{max_retries})")
            else:
                print("Max retries reached. Returning default values.")

        finally:
            # Runs on success (break), on retry and on the final failure, so the
            # browser is closed exactly once per attempt.
            driver.quit()

    return average_odds



### GET TWO-WAY (1/2) ODDS

In [9]:
def get_1_2_odds(url, title, ext, max_retries=100):
    """Scrape the 'Average' row of a two-way (1 / 2) market tab.

    Args:
        url: Game page URL.
        title: Column prefix; results are keyed ``{title}_1`` and ``{title}_2``.
        ext: URL fragment selecting the market tab (e.g. ``"#dnb;2"``).
        max_retries: Maximum number of attempts; a fresh browser is started
            for each attempt.

    Returns:
        A one-element list holding a dict with the two odds as strings.
        When the odds cannot be read, both values are the placeholder ``'-'``.
    """
    # Same shape as the success result. The previous default (['-', '-']) was a
    # list of strings, which breaks callers that unpack ``result[0]`` as a dict.
    average_odds = [{f'{title}_1': '-',
                     f'{title}_2': '-'}]

    for attempt in range(1, max_retries + 1):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # NOTE(review): not a '--' style switch; confirm this argument has any effect.
        chrome_options.add_argument("chrome-version=132")

        driver = uc.Chrome(options=chrome_options)
        try:
            driver.get(url + ext)
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.XPATH, "//p[text()='Average']"))
            )

            soup = BeautifulSoup(driver.page_source, "html.parser")
            average_section = soup.find("p", class_="height-content", string="Average")

            if average_section:
                parent_div = average_section.find_parent("div", class_="border-black-borders")
                cells = parent_div.find_all("div", class_="border-black-borders")
                # Separate name so a short row (IndexError) leaves the default intact.
                values = [cell.find("p", class_="height-content").text.strip() for cell in cells]
                average_odds = [{f'{title}_1': values[0],
                                 f'{title}_2': values[1]}]
            break

        except Exception as e:
            print(f"Error on attempt {attempt}: {title} ({e!r})")
            if attempt < max_retries:
                print(f"Retrying... ({attempt}/{max_retries})")
            else:
                print("Max retries reached. Returning default values.")

        finally:
            # Close the browser exactly once per attempt, whatever the outcome.
            driver.quit()

    return average_odds

### GET HALFTIME/FULLTIME ODDS

In [10]:
def get_HTFT(url, ext='#ht-ft;2', max_tries=100):
    """Scrape the nine half-time/full-time odds of a game.

    The first nine odds rows found on the tab are mapped, in order, to the keys
    ``H_H, H_D, H_A, D_H, D_D, D_A, A_H, A_D, A_A``.

    Args:
        url: Game page URL.
        ext: URL fragment selecting the HT/FT tab.
        max_tries: Maximum number of attempts; a fresh browser is started for
            each attempt.

    Returns:
        A one-element list holding a dict with the nine odds as strings. When
        the odds cannot be read, every value is the placeholder ``'-'`` (the
        previous version raised UnboundLocalError in that case).
    """
    keys = ['H_H', 'H_D', 'H_A',
            'D_H', 'D_D', 'D_A',
            'A_H', 'A_D', 'A_A']
    average_odds = [dict.fromkeys(keys, '-')]

    for attempt in range(1, max_tries + 1):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # NOTE(review): not a '--' style switch; confirm this argument has any effect.
        chrome_options.add_argument("chrome-version=132")

        driver = uc.Chrome(options=chrome_options)
        try:
            driver.get(url + ext)
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.XPATH, "//p[contains(@class, 'height-content')]"))
            )

            soup = BeautifulSoup(driver.page_source, "html.parser")

            odds_elements = soup.find_all("div", class_='flex h-9 text-xs max-sm:h-auto')
            odds_list = [element.find("p", class_="height-content").text.strip() for element in odds_elements]

            if len(odds_list) < len(keys):
                # Incomplete table: treat like any other scrape failure and retry.
                raise ValueError(f"expected {len(keys)} HT/FT odds, found {len(odds_list)}")

            # zip stops after the ninth value, mirroring the fixed indexing 0..8.
            average_odds = [dict(zip(keys, odds_list))]
            break

        except Exception as e:
            print(f"Error on attempt {attempt}: {ext} ({e!r})")
            if attempt < max_tries:
                print(f"Retrying... ({attempt}/{max_tries})")
            else:
                print("Max retries reached. Returning default values.")

        finally:
            # Close the browser exactly once per attempt, whatever the outcome.
            driver.quit()

    return average_odds



### GET OVER/UNDER ODDS

In [11]:
def get_OU(url, ext, title, max_retries=100):
    """Scrape the Over/Under odds of a game for every line shown on the tab.

    For each line block the label text and its first two odds are stored as
    ``{label}_{title}_Over`` and ``{label}_{title}_Under``.

    Args:
        url: Game page URL.
        ext: URL fragment selecting the Over/Under tab (e.g. ``"#over-under;2"``).
        title: Period tag inserted into the column names (e.g. ``'FT'``).
        max_retries: Maximum number of attempts; a fresh browser is started
            for each attempt.

    Returns:
        A one-element list holding the dict of odds, or ``[]`` when no odds
        could be read within ``max_retries`` attempts.
    """
    for attempt in range(1, max_retries + 1):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # NOTE(review): not a '--' style switch; confirm this argument has any effect.
        chrome_options.add_argument("chrome-version=132")

        driver = uc.Chrome(options=chrome_options)
        try:
            driver.get(url + ext)
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'relative flex flex-col')]"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            row_data = {}

            for element in soup.find_all("div", class_="relative flex flex-col"):
                header = element.find("div", class_="flex w-full items-center justify-start pl-3 font-bold text-[#2F2F2F]")
                line_label = header.find_all("p")[0].text.strip()
                odds_containers = element.find_all("div", class_="flex-center border-black-main min-w-[60px] max-w-[60px] flex-col gap-1 border-l border-opacity-10")

                row_data[f"{line_label}_{title}_Over"] = odds_containers[0].find("p").text.strip()
                row_data[f"{line_label}_{title}_Under"] = odds_containers[1].find("p").text.strip()

            if row_data:
                # Return only this attempt's data. The old code accumulated empty
                # dicts from failed attempts in front of the real result.
                return [row_data]

            print(f"Warning: No OU or odds found on attempt {attempt}. Retrying...")

        except Exception as e:
            print(f"Error on attempt {attempt}: {ext} ({e!r})")
            if attempt < max_retries:
                print(f"Retrying... ({attempt}/{max_retries})")

        finally:
            # Also runs on the success `return`, which previously leaked a browser.
            driver.quit()

    print("Max retries reached. Moving to the next URL.")
    return []

### GET CORRECT SCORE

In [12]:
def get_CS(url, ext, title, max_retries=100):
    """Scrape the correct-score odds of a game.

    Score labels and odds rows are collected separately from the page and
    paired positionally; keys are ``CS{title}_{score label}``.

    Args:
        url: Game page URL.
        ext: URL fragment selecting the correct-score tab (e.g. ``'#cs;2'``).
        title: Period tag inserted into the column names (e.g. ``'FT'``).
        max_retries: Maximum number of attempts; a fresh browser is started
            for each attempt.

    Returns:
        Dict mapping column name -> odds text. An empty dict is returned when
        nothing could be read (previously an empty list, which could not be
        ``**``-unpacked like the success value).
    """
    for attempt in range(1, max_retries + 1):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # NOTE(review): not a '--' style switch; confirm this argument has any effect.
        chrome_options.add_argument("chrome-version=132")

        driver = uc.Chrome(options=chrome_options)
        try:
            driver.get(url + ext)
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'relative flex flex-col')]"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            score_divs = soup.find_all('div', class_='flex w-full items-center justify-start pl-3 font-bold text-[#2F2F2F]')
            cs = [f"CS{title}_" + score.find("p").text for score in score_divs]
            odds = [odd.text for odd in soup.find_all('div', class_='flex h-9 text-xs max-sm:h-auto')]

            if cs and odds:
                return dict(zip(cs, odds))

            print(f"Warning: No CS or odds found on attempt {attempt}. Retrying...")

        except Exception as e:
            print(f"Error on attempt {attempt}: {ext} ({e!r})")
            if attempt < max_retries:
                print(f"Retrying... ({attempt}/{max_retries})")

        finally:
            # Also runs on the success `return`, which previously leaked a browser.
            driver.quit()

    print("Max retries reached. Moving to the next URL.")
    return {}

### GET ASIAN HANDICAP

In [13]:
def get_AH(url, ext, title, max_retries=100):
    """Scrape the Asian-handicap odds of a game for every line shown on the tab.

    For each line block the label text and its first two odds are stored as
    ``{label}_{title}_1`` and ``{label}_{title}_2``.

    Args:
        url: Game page URL.
        ext: URL fragment selecting the handicap tab (e.g. ``"#ah;2"``).
        title: Period tag inserted into the column names (e.g. ``'FT'``).
        max_retries: Maximum number of attempts; a fresh browser is started
            for each attempt.

    Returns:
        A one-element list holding the dict of odds, or ``[]`` when no odds
        could be read within ``max_retries`` attempts.
    """
    for attempt in range(1, max_retries + 1):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # NOTE(review): not a '--' style switch; confirm this argument has any effect.
        chrome_options.add_argument("chrome-version=132")

        driver = uc.Chrome(options=chrome_options)
        try:
            driver.get(url + ext)
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'relative flex flex-col')]"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            row_data = {}

            for element in soup.find_all("div", class_="relative flex flex-col"):
                header = element.find("div", class_="flex w-full items-center justify-start pl-3 font-bold text-[#2F2F2F]")
                ah = header.find_all("p")[0].text.strip()
                odds_containers = element.find_all("div", class_="flex-center border-black-main min-w-[60px] max-w-[60px] flex-col gap-1 border-l border-opacity-10")

                row_data[f"{ah}_{title}_1"] = odds_containers[0].find("p").text.strip()
                row_data[f"{ah}_{title}_2"] = odds_containers[1].find("p").text.strip()

            if row_data:
                return [row_data]

            print(f"Warning: No AH{title} or odds found on attempt {attempt}. Retrying...")

        except Exception as e:
            print(f"Error on attempt {attempt}: AH{title} ({e!r})")
            if attempt < max_retries:
                print(f"Retrying... ({attempt}/{max_retries})")

        finally:
            # Also runs on the success `return`, which previously leaked a browser.
            driver.quit()

    print("Max retries reached. Moving to the next URL.")
    return []

### Get All Odds for a Game Link

In [14]:
def _first_mapping(result):
    """Normalise a scraper result to a dict: the dict itself, its first dict element, or {}."""
    if isinstance(result, dict):
        return result
    if isinstance(result, (list, tuple)) and result and isinstance(result[0], dict):
        return result[0]
    # Empty list, None, or a list of placeholder strings: contribute no columns.
    return {}


def get_game_odds(game_link):
    """Fetch every supported odds market for one game.

    The markets are scraped one after another and merged, in that order, into
    a single row that starts with the ``URL`` column.

    Args:
        game_link: Game page URL.

    Returns:
        A one-row pandas.DataFrame. If any scraper raises, the error is printed
        and an empty DataFrame is returned (previously ``None``). A market that
        yields no data simply contributes no columns instead of failing the
        whole game.
    """
    # (scraper, positional arguments after game_link), in output column order.
    markets = [
        (get_1x2_odds, ('1X2FT', "#1X2;2")),
        (get_1x2_odds, ('1X2H1', "#1X2;3")),
        (get_1x2_odds, ('1X2H2', "#1X2;4")),
        (get_1x2_odds, ('DCFT', "#double;2")),
        (get_1x2_odds, ('DCH1', "#double;3")),
        (get_1x2_odds, ('DCH2', "#double;4")),
        (get_HTFT, ()),
        (get_1_2_odds, ('DNBFT', "#dnb;2")),
        (get_1_2_odds, ('DNBH1', "#dnb;3")),
        (get_1_2_odds, ('DNBH2', "#dnb;4")),
        (get_1_2_odds, ('BTSFT', "#bts;2")),
        (get_1_2_odds, ('BTSH1', "#bts;3")),
        (get_1_2_odds, ('BTSH2', "#bts;4")),
        (get_OU, ("#over-under;2", 'FT')),
        (get_OU, ("#over-under;3", 'H1')),
        (get_OU, ("#over-under;4", 'H2')),
        (get_CS, ('#cs;2', 'FT')),
        (get_CS, ('#cs;3', 'H1')),
        (get_CS, ('#cs;4', 'H2')),
        (get_AH, ("#ah;2", 'FT')),
        (get_AH, ("#ah;3", 'H1')),
        (get_AH, ("#ah;4", 'H2')),
    ]

    try:
        row = {'URL': game_link}
        for scrape, args in markets:
            # The scrapers signal "no data" with [] / {} / placeholder lists;
            # _first_mapping turns all of those into "no extra columns".
            row.update(_first_mapping(scrape(game_link, *args)))

        return pd.DataFrame([row])

    except Exception as e:
        print(f"Error fetching odds for {game_link}: {e} get_game_odds0")
        return pd.DataFrame()



### Build Odds DF

In [None]:
def _odds_period(col):
    """Return the first 'FT' / 'H1' / 'H2' token in a column name ('' if none)."""
    match = re.search(r'(FT|H1|H2)', col)
    return match.group() if match else ''


def _ou_sort_key(col):
    """Sort key for Over/Under columns: period, line value, Over before Under."""
    match = re.search(r'(\+?\d+\.\d+|\d+)', col)
    line = float(match.group()) if match else float('inf')
    return (_odds_period(col), line, '_Under' in col, col)


def _cs_sort_key(col):
    """Sort key for correct-score columns: period, first score, second score."""
    match = re.search(r'(\d+):(\d+)', col)
    first, second = map(int, match.groups()) if match else (float('inf'), float('inf'))
    return (_odds_period(col), first, second, col)


def _ah_sort_key(col):
    """Sort key for handicap columns: period, signed handicap, then name as tie-break."""
    match = re.search(r'[+-]?\d+(?:\.\d+)?', col)
    handicap = float(match.group()) if match else float('inf')
    return (_odds_period(col), handicap, col)


def build_odds_df(game_data, save_interval=100, output_filename='odds_data.csv', resume=False):
    """Scrape the odds for every game URL and return them as one aligned DataFrame.

    Progress is written to ``output_filename`` every ``save_interval`` games
    and once more at the end. The returned frame has a fixed set of base
    columns first, followed by the Over/Under, correct-score and handicap
    columns that were actually seen, then any remaining columns.

    Args:
        game_data: DataFrame with a ``URL`` column of game links.
        save_interval: Save progress whenever the 1-based game number is a
            multiple of this value (0 or None disables periodic saves).
        output_filename: CSV file used for progress and final output.
        resume: When True and ``output_filename`` exists, its rows are kept and
            games whose URL is already in the file are skipped. If the file
            has no ``URL`` column, the first ``len(file)`` games are skipped.

    Returns:
        pandas.DataFrame with one row per successfully scraped game; an empty
        frame with the base columns when nothing was scraped (previously None).
    """
    game_links = game_data['URL'].to_list()
    all_odds = []
    all_columns = set()

    base_column_order = [
        'URL',
        '1X2FT_1', '1X2FT_X', '1X2FT_2',
        '1X2H1_1', '1X2H1_X', '1X2H1_2',
        '1X2H2_1', '1X2H2_X', '1X2H2_2',
        'DCFT_1', 'DCFT_X', 'DCFT_2',
        'DCH1_1', 'DCH1_X', 'DCH1_2',
        'DCH2_1', 'DCH2_X', 'DCH2_2',
        'H_H', 'H_D', 'H_A',
        'D_H', 'D_D', 'D_A',
        'A_H', 'A_D', 'A_A',
        'DNBFT_1', 'DNBFT_2',
        'DNBH1_1', 'DNBH1_2',
        'DNBH2_1', 'DNBH2_2',
        'BTSFT_1', 'BTSFT_2',
        'BTSH1_1', 'BTSH1_2',
        'BTSH2_1', 'BTSH2_2',
    ]

    done_urls = set()
    skip_count = 0
    if resume and os.path.exists(output_filename):
        print(f"Loading existing data from {output_filename}...")
        existing_data = pd.read_csv(output_filename)
        if not existing_data.empty:
            all_odds.append(existing_data)
            # Without this, dynamic columns of resumed rows would be dropped
            # by the final reindex.
            all_columns.update(existing_data.columns)
            if 'URL' in existing_data.columns:
                # Skip by URL rather than by row count: games that failed earlier
                # have no row, so a positional offset would skip the wrong games.
                done_urls = set(existing_data['URL'].dropna())
            else:
                skip_count = len(existing_data)

    total = len(game_links)
    for i, game in enumerate(game_links, start=1):
        if i <= skip_count or game in done_urls:
            continue
        try:
            game_odds = get_game_odds(game)
            # get_game_odds may signal failure with None or an empty frame.
            if game_odds is not None and not game_odds.empty:
                all_columns.update(game_odds.columns)
                all_odds.append(game_odds)
            print(f"Completed game #{i} of {total}")

            # Save periodically
            if save_interval and i % save_interval == 0 and all_odds:
                print(f"Saving progress at game #{i}...")
                pd.concat(all_odds, ignore_index=True).to_csv(output_filename, index=False)

        except Exception as e:
            print(f"Error occurred while processing game #{i}: {e!r}")

    if not all_odds:
        return pd.DataFrame(columns=base_column_order)

    # Persist the raw data first so it survives any problem while ordering columns.
    df_all_odds = pd.concat(all_odds, ignore_index=True)
    df_all_odds.to_csv(output_filename, index=False)
    print(f"Final data saved to {output_filename}")

    over_under_columns_sorted = sorted(
        (col for col in all_columns if 'Over/Under' in col), key=_ou_sort_key)
    CS_columns_sorted = sorted(
        (col for col in all_columns if 'CS' in col), key=_cs_sort_key)
    # The handicap label comes from the page text; accept both an 'AH' tag and
    # a spelled-out 'handicap' (presumably e.g. "Asian Handicap -1.5" - verify).
    AH_columns_sorted = sorted(
        (col for col in all_columns if 'AH' in col or 'handicap' in col.lower()),
        key=_ah_sort_key)

    # dict.fromkeys removes duplicates while preserving order; duplicate labels
    # would otherwise duplicate columns in the reindexed frame.
    dynamic_column_order = list(dict.fromkeys(
        base_column_order + over_under_columns_sorted + CS_columns_sorted + AH_columns_sorted))

    # Anything not recognised above is appended rather than silently dropped.
    known = set(dynamic_column_order)
    dynamic_column_order += sorted(col for col in all_columns if col not in known)

    return df_all_odds.reindex(columns=dynamic_column_order)

### Example Code run

In [None]:
# Scrape the upcoming fixtures, then collect the odds for every fixture URL
# into a single DataFrame (build_odds_df also writes its progress to a CSV file).
game_data = get_upcoming_games(leagues)
odds_df = build_odds_df(game_data)

In [None]:
# Preview the first rows of the collected odds.
print(odds_df.head())