In [45]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time

In [46]:
# Command-line switches passed to Chrome, applied in the order listed.
chrome_flags = (
    '--disable-gpu',
    '--no-sandbox',
    '--start-maximized',
    '--disable-infobars',
    '--disable-extensions',
)

options = Options()
for flag in chrome_flags:
    options.add_argument(flag)

# ChromeDriverManager().install() supplies the driver executable handed to
# Service; the resulting `driver` is the browser session shared by later cells.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

In [None]:
# Collect match-page links from the MLS schedule page, paging backwards with
# the "Previous results" button until the stop date is shown, the round limit
# is reached, or too many pages fail in a row.
url = 'https://www.mlssoccer.com/schedule/scores#competition=MLS-COM-000001&club=all&date=2025-08-11'

driver.get(url)
time.sleep(5)  # fixed pause to let the schedule render after navigation
wait = WebDriverWait(driver, 10)

all_links = set()

rounds = 2000  # hard upper bound on the number of pages visited
# NOTE(review): the header printed for the current season looks like
# "Saturday Aug 9" (no year); confirm that the 2023 header really renders with
# ", 2023", otherwise this exact-match check never fires.
stop_date = "Wednesday Oct 18, 2023"

# Abort instead of burning through every remaining round (each one costs at
# least a 10 s wait timeout) when the page is persistently broken.
max_consecutive_errors = 5
consecutive_errors = 0

for i in range(rounds):
    try:
        previous_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Previous results"]')))
        time.sleep(5)  # give the freshly loaded page time to populate

        # wait.until() returns the located element or raises TimeoutException,
        # so a falsy check on the result would be dead code; a missing table is
        # handled by the except clause below.
        matches_table = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'mls-c-schedule__matches')))

        date_headers = matches_table.find_elements(By.CSS_SELECTOR, 'h2.sc-hLBbgP.gIKMo')
        if date_headers:
            first_date_text = date_headers[0].text.strip()
            print("First date header:", first_date_text)
            if first_date_text == stop_date:
                print("Reached stop date.")
                break

        hrefs = matches_table.find_elements(By.TAG_NAME, 'a')
        if not hrefs:
            print("No match links found on this page.")
            previous_button.click()
            consecutive_errors = 0
            continue

        for anchor in hrefs:
            # Read the attribute once; anchors without an href yield None and
            # must not end up in the link set.
            link = anchor.get_attribute('href')
            if not link:
                continue
            print(link)
            all_links.add(link)

        previous_button.click()
        consecutive_errors = 0

    except Exception as e:
        print(f"Error occurred: {e}")
        consecutive_errors += 1
        if consecutive_errors >= max_consecutive_errors:
            print(f"Stopping after {consecutive_errors} consecutive errors.")
            break
        continue
print(f"Total unique match links collected: {len(all_links)}")

# NOTE(review): the browser session ends here; any later cell that calls
# driver.get(...) needs a newly created driver first.
driver.quit()

First date header: Saturday Aug 9
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/mtlvsatl-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/nevsdc-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/phivstor-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/atxvshou-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/chivslafc-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/dalvspor-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/stlvsnsh-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/skcvssd-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/sjvsvan-08-09-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2025/matches/cinvsclt-08-10-2025
https://www.mlssoccer.com/competitions/mls-regular-season/2

In [48]:
# Persist the collected links. `all_links` is a set, whose iteration order is
# arbitrary, so sort it to make the CSV identical from run to run; empty/None
# entries are dropped because they are not usable URLs.
match_links = sorted(link for link in all_links if link)
all_links_df = pd.DataFrame(match_links, columns=['Match Links'])

# NOTE(review): the column is named 'Match Links'; code that looks up
# all_links_df['links'] will raise KeyError until the two names are aligned.
all_links_df.to_csv('../data/mls_match_links.csv', index=False)

In [None]:
for link in all_links_df['links']:
    feed = []
    try:
        driver.get(link)
        time.sleep(5)

        feed_bttn = driver.find_element(By.LINK_TEXT, 'Feed')
        
        try:
            feed_bttn.click()
            time.sleep(5)
            
            cont = driver.find_element(By.CSS_SELECTOR, 'div[class="mls-o-match-feed"]')

            for event in cont.find_elements(By.XPATH, './/div[contains(@class,"mls-o-match-feed__commentary mls-o-match-feed__commentary--")]'):
                title = event.find_element(By.XPATH, './/div[contains(@class,"mls-o-match-feed__title")]').text.strip()
                comment = event.find_element(By.XPATH, './/div[contains(@class,"mls-o-match-feed__comment")]').text.strip()

                feed.append({
                    'title': title,
                    'comment': comment
                })
        except Exception as e:
            print(f"Error processing feed for link {link}: {e}")

        stats = []
        stats_bttn = driver.find_element(By.CSS_SELECTOR, 'span[class="mls-c-sub-nav__item-text"]')
        
        try:
            stats_bttn.click()
            time.sleep(5)
            
                    
        players_bttn = driver.find_element(By.CSS_SELECTOR, 'button[value="players"]')
        players_bttn.click()
        
        try:
            score_wrapper = driver.find_element(By.CSS_SELECTOR, 'div[class="sc-jSUZER ipGjDO mls-c-matchhub md --post"]')
            
            date = score_wrapper.find_element(By.CSS_SELECTOR, 'div[class="sc-hLBbgP icXwUM mls-c-blockheader__subtitle"]').text.strip()
            home_team = score_wrapper.find_element(By.CSS_SELECTOR, 'div[class="sc-fLcnxK izXAE mls-c-club --home "]')
            home_team_name = home_team.find_element(By.CSS_SELECTOR, 'span[class="mls-c-club__shortname"]').text.strip()

            away_team = score_wrapper.find_element(By.CSS_SELECTOR, 'div[class="sc-fLcnxK izXAE mls-c-club --away "]')
            away_team_name = away_team.find_element(By.CSS_SELECTOR, 'span[class="mls-c-club__shortname"]').text.strip()
            
            scores = score_wrapper.find_elements(By.CSS_SELECTOR, 'div[class="sc-bYMpWt ipQTcP mls-c-scorebug__post"]')
            home_score = scores[0].text.strip()
            away_score = scores[2].text.strip()