In [12]:
import pandas as pd
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By # used to import different ways to access data in the XML or HTML file
from selenium.webdriver.chrome.service import Service # no longer need to download a driver file, use service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

from webdriver_manager.chrome import ChromeDriverManager # used to manage the Chrome driver to emulate a Chrome web browser

import time
import random

In [24]:

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from urllib.parse import urljoin

def scrape_metric_links(page_url, titles,
                        headless: bool = True,
                        timeout: int = 10) -> pd.DataFrame:
    """
    For each title in `titles`, find the link on the page by exact
    link text and return a DataFrame with columns ['Title','URL'].
    """
    # --- setup driver ---
    opts = Options()
    if headless:
        opts.add_argument("--headless")
    driver = webdriver.Chrome(options=opts)
    driver.get(page_url)

    wait = WebDriverWait(driver, timeout)
    # wait until the main table (or nav) is present
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

    records = []
    for title in titles:
        try:
            link = wait.until(
                EC.presence_of_element_located((By.LINK_TEXT, title))
            )
            raw_href = link.get_attribute("href")
            full_href = urljoin(page_url, raw_href)
        except TimeoutException:
            full_href = None
        records.append({"Title": title, "URL": full_href})

    driver.quit()
    return pd.DataFrame(records)


if __name__ == "__main__":
    PAGE = "https://stats.ncaa.org/rankings?sport_code=MFB&division=11"
    TITLES = [
        "Total Offense",
        "Rushing Offense",
        "Passing Offense",
        "Team Passing Efficiency",
        "Scoring Offense",
        "Total Defense",
        "Rushing Defense",
        "Passing Yards Allowed",
        "Team Passing Efficiency Defense",
        "Scoring Defense",
        "Turnover Margin",
        "3rd Down Conversion Pct",
        "4th Down Conversion Pct",
        "3rd Down Conversion Pct Defense",
        "4th Down Conversion Pct Defense",
        "Red Zone Offense",
        "Red Zone Defense",
        "Net Punting",
        "Punt Returns",
        "Kickoff Returns",
        "First Downs Offense",
        "First Downs Defense",
        "Fewest Penalties Per Game",
        "Fewest Penalty Yards Per Game",
        "Time of Possession",
    ]

    df = scrape_metric_links(PAGE, TITLES, headless=False)

                              Title  \
0                     Total Offense   
1                   Rushing Offense   
2                   Passing Offense   
3           Team Passing Efficiency   
4                   Scoring Offense   
5                     Total Defense   
6                   Rushing Defense   
7             Passing Yards Allowed   
8   Team Passing Efficiency Defense   
9                   Scoring Defense   
10                  Turnover Margin   
11          3rd Down Conversion Pct   
12          4th Down Conversion Pct   
13  3rd Down Conversion Pct Defense   
14  4th Down Conversion Pct Defense   
15                 Red Zone Offense   
16                 Red Zone Defense   
17                      Net Punting   
18                     Punt Returns   
19                  Kickoff Returns   
20              First Downs Offense   
21              First Downs Defense   
22        Fewest Penalties Per Game   
23    Fewest Penalty Yards Per Game   
24               Time of 

In [32]:
df.to_csv("ncaa_metric_links.csv", index=False)

In [50]:
import time
import random
import requests
import pandas as pd

INPUT_CSV  = "ncaa_metric_links.csv"
OUTPUT_CSV = "ncaa_all_metrics.csv"

# Load your list of metric titles and URLs
links_df = pd.read_csv(INPUT_CSV)

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

all_dfs = []

for _, row in links_df.iterrows():
    title = row["Title"]
    url   = row["URL"]

    if pd.isna(url):
        print(f"Skipping {title!r}: no URL")
        continue

    print(f"Scraping {title!r} → {url}")
    resp = session.get(url, timeout=10)
    resp.raise_for_status()

    # pandas will find all <table> tags—take the first one
    tables = pd.read_html(resp.text)
    if not tables:
        print(f"No tables found for {title!r}, skipping.")
        continue

    df = tables[0]
    df.insert(0, "Metric", title)
    all_dfs.append(df)
    print(f"  Retrieved {len(df)} rows for {title!r}")

    # optional pause to be polite
    time.sleep(random.uniform(1, 2))

if all_dfs:
    combined = pd.concat(all_dfs, ignore_index=True)
    combined.to_csv(OUTPUT_CSV, index=False)
    print(f"\nDone! {len(combined)} rows saved to {OUTPUT_CSV}")
else:
    print("No data collected.")


Scraping 'Total Offense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=21.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Total Offense'
Scraping 'Rushing Offense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=23.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Rushing Offense'
Scraping 'Passing Offense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=25.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Passing Offense'
Scraping 'Team Passing Efficiency' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=465.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Team Passing Efficiency'
Scraping 'Scoring Offense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=27.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Scoring Offense'
Scraping 'Total Defense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=22.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Total Defense'
Scraping 'Rushing Defense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=24.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Rushing Defense'
Scraping 'Passing Yards Allowed' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=695.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Passing Yards Allowed'
Scraping 'Team Passing Efficiency Defense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=40.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Team Passing Efficiency Defense'
Scraping 'Scoring Defense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=28.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Scoring Defense'
Scraping 'Turnover Margin' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=29.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Turnover Margin'
Scraping '3rd Down Conversion Pct' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=699.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for '3rd Down Conversion Pct'
Scraping '4th Down Conversion Pct' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=700.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for '4th Down Conversion Pct'
Scraping '3rd Down Conversion Pct Defense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=701.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for '3rd Down Conversion Pct Defense'
Scraping '4th Down Conversion Pct Defense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=702.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for '4th Down Conversion Pct Defense'
Scraping 'Red Zone Offense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=703.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Red Zone Offense'
Scraping 'Red Zone Defense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=704.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Red Zone Defense'
Scraping 'Net Punting' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=98.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Net Punting'
Scraping 'Punt Returns' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=97.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Punt Returns'
Scraping 'Kickoff Returns' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=96.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Kickoff Returns'
Scraping 'First Downs Offense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=693.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'First Downs Offense'
Scraping 'First Downs Defense' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=694.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'First Downs Defense'
Scraping 'Fewest Penalties Per Game' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=697.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Fewest Penalties Per Game'
Scraping 'Fewest Penalty Yards Per Game' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=698.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Fewest Penalty Yards Per Game'
Scraping 'Time of Possession' → https://stats.ncaa.org/rankings/national_ranking?academic_year=2025.0&division=11.0&ranking_period=84.0&sport_code=MFB&stat_seq=705.0


  tables = pd.read_html(resp.text)


  Retrieved 135 rows for 'Time of Possession'

Done! 3375 rows saved to ncaa_all_metrics.csv


In [52]:
ncaa = pd.read_csv('ncaa_all_metrics.csv')
ncaa

Unnamed: 0,Metric,Rank,Team,G,W-L,Plays,YDS,Yds/Play,Off TDs,YPG,...,FD,Opp Rush 1st,Opp Pass 1st,Opp Pen 1st,Opp FD,Penalties,PenPerGame,PYards,TOP,AvgTOP
0,Total Offense,1,Miami (FL) (ACC),13,10-3,922,6983,7.57,71,537.2,...,,,,,,,,,,
1,Total Offense,2,Ole Miss (SEC),13,10-3,934,6845,7.33,60,526.5,...,,,,,,,,,,
2,Total Offense,3,North Texas (The American),13,6-7,946,6355,6.72,57,488.8,...,,,,,,,,,,
3,Total Offense,4,New Mexico (Mountain West),12,5-7,846,5811,6.87,49,484.3,...,,,,,,,,,,
4,Total Offense,5,Texas St. (Sun Belt),13,8-5,957,6200,6.48,58,476.9,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3370,Time of Possession,131,Mississippi St. (SEC),12,2-10,,,,,,...,,,,,,,,,308:16,25:41
3371,Time of Possession,132,East Carolina (The American),13,8-5,,,,,,...,,,,,,,,,329:23,25:20
3372,Time of Possession,133,South Fla. (The American),13,7-6,,,,,,...,,,,,,,,,326:39,25:08
3373,Time of Possession,Reclassifying,Reclassifying,Reclassifying,Reclassifying,,,,,,...,,,,,,,,,Reclassifying,Reclassifying


In [64]:
all_dfs[1]

Unnamed: 0,Metric,Rank,Team,G,W-L,Rush,Rush Yds,Yds/Rush,Rush TD,YPG
0,Rushing Offense,1,Army West Point (The American),14,12-2,764,4207,5.51,48,300.5
1,Rushing Offense,2,New Mexico (Mountain West),12,5-7,456,3043,6.67,37,253.6
2,Rushing Offense,3,Jacksonville St. (CUSA),14,9-5,646,3517,5.44,51,251.2
3,Rushing Offense,4,Liberty (CUSA),12,8-4,542,3008,5.55,28,250.7
4,Rushing Offense,5,UCF (Big 12),12,4-8,516,2977,5.77,33,248.1
...,...,...,...,...,...,...,...,...,...,...
130,Rushing Offense,131,UCLA (Big Ten),12,5-7,338,1039,3.07,4,86.6
131,Rushing Offense,132,Kent St. (MAC),12,0-12,391,923,2.36,3,76.9
132,Rushing Offense,133,Colorado (Big 12),13,9-4,341,847,2.48,15,65.2
133,Rushing Offense,Reclassifying,Reclassifying,Reclassifying,Reclassifying,Reclassifying,Reclassifying,Reclassifying,Reclassifying,Reclassifying
