In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import time
import hashlib

In [2]:
options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-infobars')  
options.add_argument('--disable-extensions')
options.add_argument('--disable-notifications')

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

In [3]:
def dismiss_cookies(driver, timeout=8):
    wait = WebDriverWait(driver, timeout)
    tried = []

    def _visible(el):
        try:
            return el.is_displayed() and el.is_enabled()
        except Exception:
            return False

    # 0) Give the banner a second to mount
    driver.execute_script("window._probe = Date.now();")
    try:
        wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "body")))
    except TimeoutException:
        pass

    # 1) Direct hit: common OneTrust IDs/classes
    candidates = [
        (By.ID, "onetrust-accept-btn-handler"),
        (By.CSS_SELECTOR, "button#onetrust-accept-btn-handler"),
        (By.CSS_SELECTOR,
         "#onetrust-banner-sdk button#onetrust-accept-btn-handler"),
        (By.CSS_SELECTOR, "button#onetrust-reject-all-handler"
         ),  # sometimes only Reject is visible first
        (By.CSS_SELECTOR, "[data-testid='onetrust-accept-btn-handler']"),
        (By.XPATH,
         "//button[contains(@id,'accept') and contains(translate(., 'ACEPT','acept'),'accept')]"
         ),
        (By.XPATH,
         "//button[contains(@aria-label,'Accept') or contains(normalize-space(.),'Accept')]"
         ),
    ]
    for how, what in candidates:
        tried.append(f"{how}={what}")
        try:
            btn = driver.find_element(how, what)
            if _visible(btn):
                btn.click()
                return True
        except NoSuchElementException:
            continue
        except WebDriverException:
            # Try JS click if the element exists but normal click fails
            try:
                driver.execute_script("arguments[0].click();", btn)
                return True
            except Exception:
                continue

    # 2) If not found, check for a banner container (present but hidden/animating)
    try:
        banner = driver.find_element(By.ID, "onetrust-banner-sdk")
        if _visible(banner):
            try:
                btn = banner.find_element(By.CSS_SELECTOR,
                                          "button[id*='accept']")
                driver.execute_script("arguments[0].click();", btn)
                return True
            except Exception:
                pass
    except NoSuchElementException:
        pass

    # 3)Scan iframes and try inside.
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    for i, frame in enumerate(iframes):
        # quick filter to avoid costly switches
        src = (frame.get_attribute("src") or "").lower()
        name = (frame.get_attribute("name") or "").lower()
        if any(k in src + name
               for k in ("consent", "onetrust", "privacy", "cookie")):
            tried.append(f"iframe[{i}] src={src or name}")
            try:
                driver.switch_to.frame(frame)
                # try common selectors again inside this frame
                for how, what in candidates:
                    try:
                        btn = WebDriverWait(driver, 2).until(
                            EC.presence_of_element_located((how, what)))
                        if _visible(btn):
                            driver.execute_script(
                                "arguments[0].scrollIntoView({block:'center'});",
                                btn)
                            try:
                                btn.click()
                            except Exception:
                                driver.execute_script("arguments[0].click();",
                                                      btn)
                            driver.switch_to.default_content()
                            return True
                    except Exception:
                        continue
                driver.switch_to.default_content()
            except Exception:
                # ensure weâ€™re back
                try:
                    driver.switch_to.default_content()
                except:
                    pass

    # 4) Last resort: call OneTrust API if it exists, or remove the banner to unblock clicks
    try:
        ok = driver.execute_script("""
            if (window.OneTrust && OneTrust.AcceptAll) { OneTrust.AcceptAll(); return true; }
            const b = document.getElementById('onetrust-banner-sdk');
            if (b) { b.remove(); return 'removed'; }
            return false;
        """)
        if ok:
            return True
    except Exception:
        pass

    print("[cookies] Could not find/close cookie banner. Tried:",
          *tried,
          sep="\n - ")
    return False


In [4]:
url = 'https://www.mlssoccer.com/schedule/scores#competition=MLS-COM-000001&club=all'

driver.get(url)
wait = WebDriverWait(driver, 10)

try:
    dismiss_cookies(driver, timeout=8)
except:
    print("Cookie button not found or already clicked.")

rounds = 1000000
stop_date = "Tuesday Mar 26, 2011"
matches = []

for i in range(rounds):
    previous_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Previous results']")))

    try:
        time.sleep(5)

        matches_table = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="mls-c-schedule__matches"]')))
        if not matches_table:
            print("No matches table found on this page.")
            previous_button.click()
            continue

        for date in matches_table.find_elements(By.CSS_SELECTOR, '.sc-GKYbw.bMOKWx'):
            date_element = date.find_element(By.CSS_SELECTOR, '.sc-hLBbgP.gIKMo').text.strip()

            if date_element == stop_date:
                break

            for match in date.find_elements(By.TAG_NAME, 'a'):

                home_abbr_el = match.find_element(
                    By.XPATH,
                    ".//div[contains(@class,'mls-c-club') and contains(@class,'--home')]"
                    "//span[contains(@class,'mls-c-club__abbreviation')]"
                )

                away_abbr_el = match.find_element(
                    By.XPATH,
                    ".//div[contains(@class,'mls-c-club') and contains(@class,'--away')]"
                    "//span[contains(@class,'mls-c-club__abbreviation')]"
                )
                
                home_team = home_abbr_el.get_attribute("textContent").strip()
                away_team = away_abbr_el.get_attribute("textContent").strip()

                home_score = match.find_elements(By.CSS_SELECTOR, '.mls-c-scorebug__score')[0].text.strip()
                away_score = match.find_elements(By.CSS_SELECTOR, '.mls-c-scorebug__score')[1].text.strip()
                
                match_id = home_team + "_" + away_team + "_" + date_element.replace(" ", "_").replace(",", "")
                match_id = match_id.lower()
                match_id = hashlib.md5(match_id.encode()).hexdigest()[:8]

                print(f"Match found: {date_element} - {home_team} vs {away_team} ({home_score}:{away_score}) and match_id: {match_id} ")
                matches.append({
                    "date": date_element,
                    "home_team": home_team,
                    "away_team": away_team,
                    "home_score": home_score,
                    "away_score": away_score,
                    'match_id': match_id
                })


        previous_button.click()

    except Exception as e:
        print(f"Error occurred: {e}")
        previous_button.click()
print(f"Total unique matches collected: {len(matches)}")

driver.quit()

Match found: Saturday Oct 18 - ATL vs DC (1:1) and match_id: 6c93df59 
Match found: Saturday Oct 18 - CLT vs PHI (2:0) and match_id: f86ec7a3 
Match found: Saturday Oct 18 - CIN vs MTL (3:0) and match_id: d3e085cd 
Match found: Saturday Oct 18 - CLB vs RBNY (3:1) and match_id: e411ac49 
Match found: Saturday Oct 18 - NSH vs MIA (2:5) and match_id: 7cf27f31 
Match found: Saturday Oct 18 - NE vs CHI (2:2) and match_id: 9f7537a0 
Match found: Saturday Oct 18 - NYC vs SEA (1:2) and match_id: bf454f67 
Match found: Saturday Oct 18 - TOR vs ORL (4:2) and match_id: 62c27490 
Match found: Saturday Oct 18 - COL vs LAFC (2:2) and match_id: 42089098 
Match found: Saturday Oct 18 - SKC vs HOU (0:0) and match_id: a7bbedf9 
Match found: Saturday Oct 18 - LA vs MIN (2:1) and match_id: 94412db9 
Match found: Saturday Oct 18 - POR vs SD (0:4) and match_id: 348a835c 
Match found: Saturday Oct 18 - SJ vs ATX (2:1) and match_id: f51f706e 
Match found: Saturday Oct 18 - STL vs RSL (2:2) and match_id: f43f8

TimeoutException: Message: 


In [5]:
matches_df = pd.DataFrame(matches)


In [6]:
matches_df

Unnamed: 0,date,home_team,away_team,home_score,away_score,match_id
0,Saturday Oct 18,ATL,DC,1,1,6c93df59
1,Saturday Oct 18,CLT,PHI,2,0,f86ec7a3
2,Saturday Oct 18,CIN,MTL,3,0,d3e085cd
3,Saturday Oct 18,CLB,RBNY,3,1,e411ac49
4,Saturday Oct 18,NSH,MIA,2,5,7cf27f31
...,...,...,...,...,...,...
6663,"Saturday Apr 1, 2006",KCW,CLB,3,1,df04f188
6664,"Saturday Apr 1, 2006",LA,NE,0,1,e2833d8f
6665,"Sunday Apr 2, 2006",DC,RBNY,2,2,93295fc7
6666,"Sunday Apr 2, 2006",CHV,RSL,3,0,c0bf35f1


In [7]:

matches_df.to_csv('matches_past.csv', index=False)

In [2]:
matches = pd.read_csv('G:/My Drive/GitHubProjects/MLS/data/db_save/matches_past.csv')

In [4]:
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,Tuesday Sep 30,MIA,CHI,3,5
1,Saturday Oct 4,DC,CLT,0,1
2,Saturday Oct 4,MTL,NSH,1,1
3,Saturday Oct 4,DAL,LA,2,1
4,Saturday Oct 4,MIA,NE,4,1


In [5]:
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

## remove 'days' from date column

for day in days:
    matches['date'] = matches['date'].str.replace(f'{day} ', '', regex=False)
    
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,Sep 30,MIA,CHI,3,5
1,Oct 4,DC,CLT,0,1
2,Oct 4,MTL,NSH,1,1
3,Oct 4,DAL,LA,2,1
4,Oct 4,MIA,NE,4,1


In [6]:
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

### convert month abbreviations to numbers

for i, month in enumerate(months, start=1):
    matches['date'] = matches['date'].str.replace(month, str(i).zfill(2), regex=False)
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,09 30,MIA,CHI,3,5
1,10 4,DC,CLT,0,1
2,10 4,MTL,NSH,1,1
3,10 4,DAL,LA,2,1
4,10 4,MIA,NE,4,1


In [7]:
### add / for date formatting

matches['date'] = matches['date'].str.replace(' ', '/', regex=False)

matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,09/30,MIA,CHI,3,5
1,10/4,DC,CLT,0,1
2,10/4,MTL,NSH,1,1
3,10/4,DAL,LA,2,1
4,10/4,MIA,NE,4,1


In [8]:
### if there is no third part of the date, add the year 2025

matches['date'] = matches['date'].apply(lambda x: x if x.count('/') == 2 else f"{x}/2025")

In [9]:
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,09/30/2025,MIA,CHI,3,5
1,10/4/2025,DC,CLT,0,1
2,10/4/2025,MTL,NSH,1,1
3,10/4/2025,DAL,LA,2,1
4,10/4/2025,MIA,NE,4,1


In [10]:
### if ',' present, remove it

matches['date'] = matches['date'].str.replace(',', '', regex=False)

In [11]:
### convert to datetime

matches['date'] = pd.to_datetime(matches['date'], format='%m/%d/%Y')

In [12]:
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,2025-09-30,MIA,CHI,3,5
1,2025-10-04,DC,CLT,0,1
2,2025-10-04,MTL,NSH,1,1
3,2025-10-04,DAL,LA,2,1
4,2025-10-04,MIA,NE,4,1


In [13]:
matches

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,2025-09-30,MIA,CHI,3,5
1,2025-10-04,DC,CLT,0,1
2,2025-10-04,MTL,NSH,1,1
3,2025-10-04,DAL,LA,2,1
4,2025-10-04,MIA,NE,4,1
...,...,...,...,...,...
5823,2010-03-27,DAL,HOU,1,1
5824,2010-03-27,RBNY,CHI,1,0
5825,2010-03-27,KCW,DC,4,0
5826,2010-03-27,SJ,RSL,0,3


In [14]:
from sqlalchemy import create_engine

db_string = 'mysql+pymysql://root:root@127.0.0.1:2022/MLS'

engine = create_engine(db_string)

In [15]:
old_df = pd.read_sql("SELECT match_id AS old_hash, date, home_team_abbr, away_team_abbr FROM matches", engine)
new_df = matches.rename(columns={
    'home_team': 'home_team_abbr',
    'away_team': 'away_team_abbr'
})

## add 'new hash' column to new_df
new_df = new_df.assign(new_hash=lambda x: x.index + 1)

# make sure dates are datetime.date
old_df["date"] = pd.to_datetime(old_df["date"]).dt.date
new_df["date"] = pd.to_datetime(new_df["date"]).dt.date

In [16]:
merged = (
    old_df
    .merge(new_df, on=["date", "home_team_abbr", "away_team_abbr"], how="right")
    [["old_hash", "new_hash", "date", "home_team_abbr", "away_team_abbr", 'home_score', 'away_score']]
)

In [17]:
merged

Unnamed: 0,old_hash,new_hash,date,home_team_abbr,away_team_abbr,home_score,away_score
0,b58463fd,1,2025-09-30,MIA,CHI,3,5
1,523f201f,2,2025-10-04,DC,CLT,0,1
2,98a71c11,3,2025-10-04,MTL,NSH,1,1
3,fdc39f90,4,2025-10-04,DAL,LA,2,1
4,62b997f4,5,2025-10-04,MIA,NE,4,1
...,...,...,...,...,...,...,...
5823,3fe1cb8e,5824,2010-03-27,DAL,HOU,1,1
5824,2a6593a9,5825,2010-03-27,RBNY,CHI,1,0
5825,8cddbc7d,5826,2010-03-27,KCW,DC,4,0
5826,4c33b0f5,5827,2010-03-27,SJ,RSL,0,3


In [18]:
## percentage of nan in 'old_hash' column

nan_percentage = merged['old_hash'].isna().mean() * 100
nan_percentage

0.0

In [19]:
## rename old hash to match_id and delete new_hash

merged = merged.rename(columns={'old_hash': 'match_id'}).drop(columns=['new_hash'])


In [20]:
merged

Unnamed: 0,match_id,date,home_team_abbr,away_team_abbr,home_score,away_score
0,b58463fd,2025-09-30,MIA,CHI,3,5
1,523f201f,2025-10-04,DC,CLT,0,1
2,98a71c11,2025-10-04,MTL,NSH,1,1
3,fdc39f90,2025-10-04,DAL,LA,2,1
4,62b997f4,2025-10-04,MIA,NE,4,1
...,...,...,...,...,...,...
5823,3fe1cb8e,2010-03-27,DAL,HOU,1,1
5824,2a6593a9,2010-03-27,RBNY,CHI,1,0
5825,8cddbc7d,2010-03-27,KCW,DC,4,0
5826,4c33b0f5,2010-03-27,SJ,RSL,0,3


In [21]:
clean = lambda s: s.fillna("").str.lower().str.replace(r"\s+", "", regex=True)

merged["date"] = pd.to_datetime(merged["date"])

merged["slug"] = (
    clean(merged["home_team_abbr"])
    + "vs"
    + clean(merged["away_team_abbr"])
    + "-"
    + merged["date"].dt.strftime("%m-%d-%Y")
)


In [22]:
import hashlib
def hash_match_ids(df: pd.DataFrame, col="slug", out_col="match_id_hash", length=8):
    if col not in df.columns:
        raise KeyError(f"Column '{col}' not found.")
    
    df = df.copy()
    df[out_col] = (
        df[col]
        .astype(str)
        .str.lower()
        .map(lambda x: hashlib.md5(x.encode()).hexdigest()[:length])
    )
    
    
    return df

In [23]:
merged2 = hash_match_ids(merged, col="slug", out_col="match_id_hash", length=8)

In [24]:
## remove 'slug' column

merged2 = merged2.drop(columns=['slug'], errors='ignore')

In [25]:
merged2

Unnamed: 0,match_id,date,home_team_abbr,away_team_abbr,home_score,away_score,match_id_hash
0,b58463fd,2025-09-30,MIA,CHI,3,5,b58463fd
1,523f201f,2025-10-04,DC,CLT,0,1,523f201f
2,98a71c11,2025-10-04,MTL,NSH,1,1,98a71c11
3,fdc39f90,2025-10-04,DAL,LA,2,1,fdc39f90
4,62b997f4,2025-10-04,MIA,NE,4,1,62b997f4
...,...,...,...,...,...,...,...
5823,3fe1cb8e,2010-03-27,DAL,HOU,1,1,3fe1cb8e
5824,2a6593a9,2010-03-27,RBNY,CHI,1,0,2a6593a9
5825,8cddbc7d,2010-03-27,KCW,DC,4,0,8cddbc7d
5826,4c33b0f5,2010-03-27,SJ,RSL,0,3,4c33b0f5


In [26]:
### if match_id nan, replace with match_id_hash

merged2['match_id'] = merged2.apply(
    lambda row: row['match_id_hash'] if pd.isna(row['match_id']) else row['match_id'],
    axis=1
)

merged2.drop(columns=['match_id_hash'], errors='ignore', inplace=True)

merged2

Unnamed: 0,match_id,date,home_team_abbr,away_team_abbr,home_score,away_score
0,b58463fd,2025-09-30,MIA,CHI,3,5
1,523f201f,2025-10-04,DC,CLT,0,1
2,98a71c11,2025-10-04,MTL,NSH,1,1
3,fdc39f90,2025-10-04,DAL,LA,2,1
4,62b997f4,2025-10-04,MIA,NE,4,1
...,...,...,...,...,...,...
5823,3fe1cb8e,2010-03-27,DAL,HOU,1,1
5824,2a6593a9,2010-03-27,RBNY,CHI,1,0
5825,8cddbc7d,2010-03-27,KCW,DC,4,0
5826,4c33b0f5,2010-03-27,SJ,RSL,0,3


In [27]:
merged2

Unnamed: 0,match_id,date,home_team_abbr,away_team_abbr,home_score,away_score
0,b58463fd,2025-09-30,MIA,CHI,3,5
1,523f201f,2025-10-04,DC,CLT,0,1
2,98a71c11,2025-10-04,MTL,NSH,1,1
3,fdc39f90,2025-10-04,DAL,LA,2,1
4,62b997f4,2025-10-04,MIA,NE,4,1
...,...,...,...,...,...,...
5823,3fe1cb8e,2010-03-27,DAL,HOU,1,1
5824,2a6593a9,2010-03-27,RBNY,CHI,1,0
5825,8cddbc7d,2010-03-27,KCW,DC,4,0
5826,4c33b0f5,2010-03-27,SJ,RSL,0,3


In [None]:
##check duplicates in merged2

merged2['match_id'].duplicated().sum()

## view duplicates

merged2[merged2['match_id'].duplicated(keep=False)].sort_values('match_id')



In [None]:
## drop first occurrence of duplicates

merged2 = merged2[~merged2['match_id'].duplicated(keep='first')]

### rename to home_team_score and away_team_score

merged2 = merged2.rename(columns={'home_score': 'home_team_score', 'away_score': 'away_team_score'})

In [30]:
merged2.to_sql('matches', engine, if_exists='append', index=False)

5824