In [50]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
from tqdm import tqdm

# Schools, Sports, Divisions


In [None]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
}

url = "https://web1.ncaa.org/stats/StatsSrv/careersearch"

with requests.Session() as session:

    response = session.get(url, headers=headers)

    if response.status_code != 200:
        print("Failed to load page")
        exit(1)

    soup = BeautifulSoup(response.content, "lxml")

    with open("schools.csv", "w") as file:
        writer = csv.writer(file, lineterminator="\n")

        # schoolIds
        for option in soup.select("select[name='searchOrg'] option"):
            writer.writeschool([option["value"], option.get_text(strip=True)])

    with open("sports.csv", "w") as file:
        writer = csv.writer(file, lineterminator="\n")

        # sportIds
        for option in soup.select("select[name='searchSport'] option"):
            writer.writeschool([option["value"], option.get_text(strip=True)])

    with open("divisions.csv", "w") as file:
        writer = csv.writer(file, lineterminator="\n")

        # divisionIds
        for option in soup.select("select[name='searchDiv'] option"):
            writer.writerow([option["value"], option.get_text(strip=True)])

# School Lacrosse Histories


In [51]:
schools = pd.read_csv("schools.csv", names=["id", "schoolName"], skiprows=1)
schools

Unnamed: 0,id,schoolName
0,26172,A&M-Corpus Christi
1,2,Abilene Christian
2,30123,Academy of Art
3,929,Adams St.
4,3,Adelphi
...,...,...
1340,814,Yeshiva
1341,2730,York (NY)
1342,815,York (PA)
1343,30154,Young Harris


In [60]:
schools_file = "schools.csv"
base_url = "https://stats.ncaa.org/teams/history/MLA/"
website_url = "https://stats.ncaa.org"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
}

with requests.Session() as session:
    for i, school in tqdm(schools.iterrows(), total=schools.shape[0]):
        response = session.get(base_url + str(school.id), headers=headers)

        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.content, "lxml")
        table = soup.find("table", {"id": "team_history_data_table"})
        table_body = table.find("tbody")

        # Check if school has any lacrosse history data. Continue if not.
        first_row = table_body.find("tr")
        if not first_row:
            continue

        first_row_data = [
            data.get_text(strip=True) for data in first_row.find_all("td")
        ]
        if first_row_data[2] == "" and first_row_data[3] == "-":
            continue

        with open(f"histories/history_{school.id}.csv", "w") as file:
            writer = csv.writer(file, lineterminator="\n")

            # write headers
            table_header = table.find("thead")
            ths = [th.get_text(strip=True) for th in table_header.find_all("th")] + [
                "team_url",
                "coach_url",
            ]
            writer.writerow(ths)

            # write data
            for row in table_body.find_all("tr"):
                team_url, coach_url = "", ""
                tds = row.find_all("td")
                if tds[0].find("a"):
                    team_url = website_url + tds[0].a["href"]
                if tds[1].find("a"):
                    coach_url = website_url + tds[1].a["href"]

                writer.writerow(
                    [data.get_text(strip=True) for data in row.find_all("td")]
                    + [team_url, coach_url]
                )

100%|██████████| 1345/1345 [45:25<00:00,  2.03s/it]


In [59]:
with requests.Session() as session:
    response = session.get(base_url + str(11062), headers=headers)

    if response.status_code != 200:
        exit()

    soup = BeautifulSoup(response.content, "lxml")
    table = soup.find("table", {"id": "team_history_data_table"})
    table_body = table.find("tbody")
    table_header = table.find("thead")

    # Check if school has any lacrosse history data. Continue if not.
    first_row = table_body.find("tr")
    if not first_row:
        exit()

    first_row_data = [data.get_text(strip=True) for data in first_row.find_all("td")]
    if first_row_data[2] == "" and first_row_data[3] == "-":
        exit()

    ths = [th.get_text(strip=True) for th in table_header.find_all("th")]

    for row in table_body.find_all("tr"):
        tds = row.find_all("td")
        if tds[0].find("a"):
            print(tds[0].a["href"])
        if tds[1].find("a"):
            print(tds[1].a["href"])

/teams/423694
/people/2002684
/teams/110842
/people/46013
/teams/24257
/people/45011
/teams/106243
/people/39933
/teams/103805
/people/39933
/teams/35452
/people/41058
/teams/56978
/people/29784
/teams/29653
/people/29784
/teams/53481
/people/29784
/teams/95601
/people/29784
/teams/408099
/people/29784
/teams/179028
/people/29784
/teams/222421
/people/29784
/teams/222201
/people/9316
/teams/179663
/people/9316
/teams/563141
/teams/563153


In [35]:
import csv
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

schools_file = "schools.csv"
base_url = "https://stats.ncaa.org/teams/history/MLA/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
}


def process_school(row):
    school_code = row[0]
    school_name = row[1]
    # print(school_code, school_name)

    with requests.Session() as session:
        response = session.get(base_url + school_code, headers=headers)

        print(response.status_code)
        if response.status_code != 200:
            print("Status Code Error")
            return

        soup = BeautifulSoup(response.content, "lxml")
        table = soup.find("table", {"id": "team_history_data_table"})
        table_body = table.find("tbody")
        table_header = table.find("thead")

        # Check if school has any lacrosse history data. Continue if not.
        first_row = table_body.find("tr")
        if not first_row:
            print("No first row", row[0], row[1])
            return

        first_row_data = [
            data.get_text(strip=True) for data in first_row.find_all("td")
        ]
        if first_row_data[2] == "" and first_row_data[3] == "-":
            print("No valid data", row[0], row[1])
            return

        with open(f"histories/history_{school_code}.csv", "w") as file:
            writer = csv.writer(file, lineterminator="\n")

            # write headers
            headers = [
                header.get_text(strip=True) for header in table_header.find_all("th")
            ]
            writer.writerow(headers)

            # write data
            for row in table_body.find_all("tr"):
                writer.writerow(
                    [data.get_text(strip=True) for data in row.find_all("td")]
                )


# Load schools data into memory
with open(schools_file, "r") as file:
    reader = csv.reader(file)
    next(reader)  # skip "All" row
    schools_data = list(reader)  # Read all rows into memory

    # Parallelize processing of schools using ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        executor.map(process_school, schools_data)

In [38]:
import csv
import asyncio
import aiohttp
from bs4 import BeautifulSoup

schools_file = "schools.csv"
base_url = "https://stats.ncaa.org/teams/history/MLA/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
}
from aiohttp import ClientSession


async def fetch(session: ClientSession, url: str, attempt=1) -> str:
    try:
        async with session.get(url) as response:
            response.raise_for_status()  # Will raise an error for 4xx and 5xx codes
            return await response.text()
    except aiohttp.ClientResponseError as e:
        if e.status == 403 and attempt < 3:  # Retry logic for 403 status code
            await asyncio.sleep(2**attempt)  # Exponential backoff
            return await fetch(session, url, attempt + 1)
        else:
            raise


# This function will now retry twice with delays if a 403 error occurs


async def process_school(session, row):
    school_code = row[0]
    school_name = row[1]
    response_text, status_code = await fetch(session, base_url + school_code)
    print(status_code)  # Check the status code

    if status_code != 200:
        print("Status Code Error", status_code)
        return

    soup = BeautifulSoup(response_text, "lxml")
    table = soup.find("table", {"id": "team_history_data_table"})
    if not table:
        print("No table found for", school_name)
        return

    table_body = table.find("tbody")
    table_header = table.find("thead")

    first_row = table_body.find("tr")
    if not first_row:
        print("No data for", school_name)
        return

    with open(f"histories/history_{school_code}.csv", "w", newline="") as file:
        writer = csv.writer(file)
        headers = [th.get_text(strip=True) for th in table_header.find_all("th")]
        writer.writerow(headers)

        for row in table_body.find_all("tr"):
            writer.writerow([td.get_text(strip=True) for td in row.find_all("td")])


async def main():
    tasks = []
    async with aiohttp.ClientSession(headers=headers) as session:
        with open(schools_file, mode="r") as file:
            reader = csv.reader(file)
            next(reader)  # Skip the header or the first row
            for row in reader:
                task = asyncio.ensure_future(process_school(session, row))
                tasks.append(task)
        await asyncio.gather(*tasks)


# Ensure the directory exists
import os

if not os.path.exists("histories"):
    os.makedirs("histories")


def run_async_main():
    loop = asyncio.get_event_loop()
    if loop.is_running():
        asyncio.create_task(main())  # Add main to the existing event loop
    else:
        asyncio.run(main())  # This is safe to use if no event loop is running


run_async_main()

Exception in callback _ProactorBasePipeTransport._call_connection_lost(None)
handle: <Handle _ProactorBasePipeTransport._call_connection_lost(None)>
Traceback (most recent call last):
  File "c:\Users\19083\anaconda3\Lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "c:\Users\19083\anaconda3\Lib\asyncio\proactor_events.py", line 165, in _call_connection_lost
    self._sock.shutdown(socket.SHUT_RDWR)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
Task exception was never retrieved
future: <Task finished name='Task-2895' coro=<main() done, defined at C:\Users\19083\AppData\Local\Temp\ipykernel_13596\423040502.py:63> exception=ClientResponseError(RequestInfo(url=URL('https://stats.ncaa.org/teams/history/MLA/17'), method='GET', headers=<CIMultiDictProxy('Host': 'stats.ncaa.org', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.466