In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import *
from selenium.webdriver.support.ui import Select
from datetime import datetime
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def set_up_driver():
    """Create a headless Chrome WebDriver.

    The chromedriver path is obtained from ``ChromeDriverManager().install()``
    and Chrome is started with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` flags.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for it.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    chrome_service = ChromeService(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

In [4]:
def go_to_page(url, driver):
    """Navigate ``driver`` to ``url`` and hand the same driver back.

    Args:
        url: Address passed to ``driver.get``.
        driver: Object exposing a ``get(url)`` method (a Selenium WebDriver).

    Returns:
        The ``driver`` that was passed in, so calls can be chained.
    """
    driver.get(url)  # navigation happens in place on the driver

    return driver

In [5]:
def extact_teams_and_href(driver):
    """Scrape the first ``<table>`` on the current page plus the selected date.

    Header cells (``th``) become the DataFrame columns and every ``tbody``
    row becomes one record of stripped cell texts. For each cell that
    contains an ``<a>`` element, the ``href`` of its first link is collected.

    Both scraping steps are best-effort: a failure is printed and whatever
    was collected so far is returned.

    Args:
        driver: Selenium WebDriver already positioned on the page to scrape.

    Returns:
        tuple: ``(teams_df, hrefs, date)`` where

        * ``teams_df`` is a ``pandas.DataFrame`` of the table rows, with an
          extra ``'Date'`` column when the date could be read;
        * ``hrefs`` is the list of link URLs found in the table cells;
        * ``date`` is the stripped text of the selected option of the date
          dropdown, or ``None`` when it could not be read.

    Raises:
        ValueError: Propagated from ``pandas.DataFrame`` if the number of
            header cells does not match the number of cells in the rows.
    """
    teams = []
    team_headers = []
    hrefs = []
    try:
        table = driver.find_element(By.TAG_NAME, 'table')
        team_headers = [
            header.text.strip()
            for header in table.find_elements(By.TAG_NAME, "th")
        ]
        for row in table.find_elements(By.XPATH, ".//tbody/tr"):
            row_data = []
            row_hrefs = []
            for cell in row.find_elements(By.TAG_NAME, 'td'):
                row_data.append(cell.text.strip())
                # find_elements returns an empty list when the cell has no
                # link, so no exception has to be swallowed for that case.
                links = cell.find_elements(By.TAG_NAME, 'a')
                if links:
                    row_hrefs.append(links[0].get_attribute('href'))
            # Commit a row and its links together so a failure in the middle
            # of a row cannot leave hrefs out of step with teams.
            teams.append(row_data)
            hrefs.extend(row_hrefs)
    except Exception as e:
        print('could not find table', e)

    teams_df = pd.DataFrame(teams, columns=team_headers)

    # Pre-set so the return below never hits an unbound local when the
    # dropdown lookup fails.
    date = None
    try:
        date_dropdown = driver.find_element(By.XPATH, '/html/body/header/section/p/select[2]')
        date = Select(date_dropdown).first_selected_option.text.strip()
        teams_df['Date'] = date
    except Exception as e:
        print('couldnt save date', e)

    return teams_df, hrefs, date

In [6]:
def extract_teams(url='https://sofifa.com/teams?type=all&lg%5B0%5D=39&showCol%5B%5D=ti&showCol%5B%5D=fm&showCol%5B%5D=oa&showCol%5B%5D=at&showCol%5B%5D=md&showCol%5B%5D=df&showCol%5B%5D=cw&showCol%5B%5D=ps'):
    """Open a browser, scrape the teams listing and close the browser again.

    Args:
        url: Listing page to scrape. Defaults to the sofifa teams page
            filtered by ``lg[0]=39`` with the extra ``showCol`` columns.

    Returns:
        tuple: ``(df, hrefs, date)`` exactly as returned by
        ``extact_teams_and_href``.
    """
    driver = set_up_driver()
    try:
        go_to_page(url, driver)
        df, hrefs, date = extact_teams_and_href(driver)
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # headless Chrome process running.
        driver.quit()

    return df, hrefs, date
    

In [7]:
# Run the scrape: `df` is the table as a DataFrame, `hrefs` the link URLs found
# in its cells, `date` the text of the option selected in the page's date dropdown.
df, hrefs, date = extract_teams()

In [8]:
# Preview the first rows of the scraped table as a sanity check.
df.head()

Unnamed: 0,Unnamed: 1,Name,ID,Formation,Overall,Attack,Midfield,Defence,Club worth,Players,Unnamed: 11,Date
0,,Inter Miami\nMajor League Soccer,112893,4-4-2 Holding,73,84,72,70,€420M,26,,"Jul 17, 2025"
1,,Seattle Sounders FC\nMajor League Soccer,111144,3-4-2-1,71,73,68,71,€314.5M,28,,"Jul 17, 2025"
2,,Los Angeles FC\nMajor League Soccer,112996,4-3-3 Holding,71,74,69,70,€427.5M,26,,"Jul 17, 2025"
3,,LA Galaxy\nMajor League Soccer,697,4-2-3-1 Wide,71,70,71,69,€412M,26,,"Jul 17, 2025"
4,,St. Louis CITY SC\nMajor League Soccer,113018,3-4-2-1,70,73,69,68,€175M,29,,"Jul 17, 2025"


In [9]:
# Show the link URLs collected from the table cells.
hrefs

['https://sofifa.com/team/112893/inter-miami/',
 'https://sofifa.com/team/111144/seattle-sounders-fc/',
 'https://sofifa.com/team/112996/los-angeles-fc/',
 'https://sofifa.com/team/697/la-galaxy/',
 'https://sofifa.com/team/113018/st-louis-city-sc/',
 'https://sofifa.com/team/114640/charlotte-fc/',
 'https://sofifa.com/team/112885/atlanta-united/',
 'https://sofifa.com/team/113149/fc-cincinnati/',
 'https://sofifa.com/team/111140/portland-timbers/',
 'https://sofifa.com/team/111928/san-jose-earthquakes/',
 'https://sofifa.com/team/687/columbus-crew/',
 'https://sofifa.com/team/689/new-york-red-bulls/',
 'https://sofifa.com/team/691/new-england-revolution/',
 'https://sofifa.com/team/698/houston-dynamo/',
 'https://sofifa.com/team/112606/orlando-city-sc/',
 'https://sofifa.com/team/114161/austin-fc/',
 'https://sofifa.com/team/114162/nashville-sc/',
 'https://sofifa.com/team/101112/vancouver-whitecaps-fc/',
 'https://sofifa.com/team/112134/philadelphia-union/',
 'https://sofifa.com/team

In [11]:
# Persist the scrape: the table goes to a CSV named after the scraped date,
# the links go to hrefs.txt, one URL per line.
# NOTE(review): the CSV name has no ".csv" extension and embeds the raw date
# text; confirm this is what downstream readers expect.
df.to_csv(f'teams_{date}', index=False)
with open('hrefs.txt', 'w') as hrefs_file:
    hrefs_file.writelines(href + '\n' for href in hrefs)