In [1]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time

In [6]:
# Launch the Chrome session that the scraping and save cells use.
options = Options()
options.add_argument("--start-maximized")  # Optional: open the browser window maximized
# ChromeDriverManager().install() provides the chromedriver location for the Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)
# Shared explicit-wait helper bound to this driver, with a 10-second timeout.
wait = WebDriverWait(driver, 10)

In [10]:
def _inclusive_ids(first, last):
    """Return the list of consecutive team IDs from `first` through `last`, inclusive."""
    return list(range(first, last + 1))


# Contiguous team-ID ranges used to build roster URLs; presumably one season per
# numeric suffix (TODO confirm against the CSV names used when saving).
team_ids_21 = _inclusive_ids(510453, 511448)  # No hometowns - labelled "2021-2021" (presumably 2020-2021; TODO confirm)
team_ids_22 = _inclusive_ids(532474, 533466)  # No hometowns
team_ids_23 = _inclusive_ids(549168, 550153)
# Jumble of softball and women's lacrosse teams; split into two sets for faster run time.
team_ids_24_1 = _inclusive_ids(571907, 572665)
team_ids_24_2 = _inclusive_ids(572666, 573423)
team_ids_25 = _inclusive_ids(595035, 596026)
team_ids_26 = _inclusive_ids(613550, 614536)

# Accumulator for scraped rows; re-running this cell resets it to empty.
all_players = []

In [8]:
# Visit the roster page of every team ID in the selected list and append one row
# per roster entry to `all_players`. Only pages whose selected sport is
# "Softball" are kept; every other sport in the ID range is skipped.
for team_id in team_ids_24_2:  # change just the loop condition team_ids
    roster_url = f"https://stats.ncaa.org/teams/{team_id}/roster"
    print(f"📄 Scraping: {roster_url}")

    try:
        driver.get(roster_url)
        # Wait until at least one table header cell exists; a timeout raises and
        # is reported by the outer `except Exception` below.
        wait.until(EC.presence_of_element_located((By.XPATH, "//table//thead//tr//th")))

        # Extract team name, falling back to a placeholder when the link is absent.
        # `except Exception` rather than a bare `except`, so a kernel interrupt
        # (KeyboardInterrupt) stops the run instead of being swallowed here.
        try:
            team_name_el = driver.find_element(By.XPATH, "//a[@target='ATHLETICS_URL']")
            team_name = team_name_el.text.strip()
        except Exception:
            team_name = f"Team {team_id}"

        # ✅ Extract sport name (top left dropdown text or label)
        try:
            sport_el = driver.find_element(By.XPATH, "//select[@id='sport_list']/option[@selected]")
            sport_name = sport_el.text.strip()
        except Exception:
            # Fallback if structure differs
            sport_name = "Unknown"

        # Skip if not Softball
        if sport_name != "Softball":
            print(f"⏩ Skipping {team_id} ({team_name}) because sport is {sport_name}")
            continue

        # Get column headers
        header_elements = driver.find_elements(By.XPATH, "//table//thead//tr//th")
        headers = [h.text.strip() for h in header_elements]

        # Add ID, team name, and sport as first headers.
        # NOTE: overwritten on every iteration and not attached to the rows below.
        csv_headers = ["Team ID", "Team Name", "Sport"] + headers

        # Extract player rows
        rows = driver.find_elements(By.XPATH, "//table//tbody//tr")
        scraped = 0
        for row in rows:
            cells = [c.text.strip() for c in row.find_elements(By.TAG_NAME, "td")]
            if not cells:
                # A <tr> without any <td> carries no player data; do not record
                # it as a row that holds only the team ID / name / sport.
                continue
            all_players.append([team_id, team_name, sport_name] + cells)
            scraped += 1

        print(f"✅ Scraped {scraped} players for: {team_name} ({sport_name})")

    except Exception as e:
        # Best-effort: report the failing team and move on to the next ID.
        print(f"❌ Failed for {team_id}: {e}")
    finally:
        # Pause between requests; runs on success, failure and the skip path alike.
        time.sleep(1)

📄 Scraping: https://stats.ncaa.org/teams/572666/roster
⏩ Skipping 572666 (Lynchburg Hornets) because sport is Women's Lacrosse
📄 Scraping: https://stats.ncaa.org/teams/572667/roster
✅ Scraped 21 players for: Emporia St. Lady Hornets (Softball)
📄 Scraping: https://stats.ncaa.org/teams/572668/roster
✅ Scraped 27 players for: Erskine Flying Fleet (Softball)
📄 Scraping: https://stats.ncaa.org/teams/572669/roster
⏩ Skipping 572669 (Maine Maritime Mariners) because sport is Women's Lacrosse
📄 Scraping: https://stats.ncaa.org/teams/572670/roster
⏩ Skipping 572670 (Me.-Farmington Beavers) because sport is Women's Lacrosse
📄 Scraping: https://stats.ncaa.org/teams/572671/roster
✅ Scraped 25 players for: Fairmont St. Falcons (Softball)
📄 Scraping: https://stats.ncaa.org/teams/572672/roster
⏩ Skipping 572672 (Manhattanville Valiants) because sport is Women's Lacrosse
📄 Scraping: https://stats.ncaa.org/teams/572673/roster
✅ Scraped 34 players for: Findlay Oilers (Softball)
📄 Scraping: https://stats

In [9]:
# Write the rows accumulated in `all_players` to a CSV file, then close the browser.
#
# Exactly one season block below should be active, and it must correspond to the
# team-ID list the scraping loop iterated. The loop iterates `team_ids_24_2`, so
# the matching "years_23_24_2.csv" block is the active one; saving under another
# season's file name would overwrite that season's data with these rows.
#
# The CSV is written without an index and without a header row.
try:
    # years_20_21 = pd.DataFrame(all_players)
    # years_20_21.to_csv("years_20_21.csv", index=False, header=False)

    # years_21_22 = pd.DataFrame(all_players)
    # years_21_22.to_csv("years_21_22.csv", index=False, header=False)

    # years_22_23 = pd.DataFrame(all_players)
    # years_22_23.to_csv("years_22_23.csv", index=False, header=False)

    years_23_24 = pd.DataFrame(all_players)
    years_23_24.to_csv("years_23_24_2.csv", index=False, header=False)

    # years_24_25 = pd.DataFrame(all_players)
    # years_24_25.to_csv("years_24_25.csv", index=False, header=False)

    # years_25_26 = pd.DataFrame(all_players)
    # years_25_26.to_csv("years_25_26.csv", index=False, header=False)
finally:
    # Always release the browser, even if building or writing the CSV fails.
    driver.quit()