In [1]:
import random, time
import pandas as pd
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, Comment
from io import StringIO

# --- Configuration ------------------------------------------------------------
BASE = "https://www.pro-football-reference.com"
# Team abbreviations, as they appear in the /teams/<abbr>/<season>.htm URL,
# from which one is sampled.
teams = [
    'phi'
]

# No random seed is set, so every run samples a different season (2000-2024
# inclusive) and team.
season = random.randint(2000, 2024)
team = random.choice(teams)
print(f"Season: {season}, Team: {team}")

# --- Fetch the team's season page and collect its boxscore links --------------
req = Request(f"{BASE}/teams/{team}/{season}.htm", headers={"User-Agent": "Mozilla/5.0"})
# The context manager closes the HTTP response (and its socket) even when
# read()/decode() raises; the timeout keeps a stalled connection from hanging
# the cell indefinitely.
with urlopen(req, timeout=30) as resp:
    html = resp.read().decode("utf-8")

soup = BeautifulSoup(html, "lxml")
# Every boxscore anchor inside the schedule table (id="games"), made absolute.
boxscore_urls = [BASE + a["href"] for a in soup.select("table#games a[href*='/boxscores/']")]
if not boxscore_urls:
    raise SystemExit("No boxscores found.")

# Visit the season's boxscores in random order and stop at the first page that
# carries the "home_starters" table. The table markup is looked for inside HTML
# comment nodes, so the raw comment text is kept for parsing afterwards.
random.shuffle(boxscore_urls)
html_table = None
box_url = None
for attempt, url in enumerate(boxscore_urls):
    if attempt:
        # Space out retries with the same 4-5 s pause the notebook uses for
        # player pages, instead of firing requests back-to-back.
        time.sleep(random.uniform(4, 5))
    try:
        req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
        # Close the response deterministically; time out instead of hanging.
        with urlopen(req, timeout=30) as resp:
            h = resp.read().decode("utf-8")
        s = BeautifulSoup(h, "lxml")
        comments = s.find_all(string=lambda t: isinstance(t, Comment))
        hit = next((c for c in comments if "home_starters" in c), None)
        if hit:
            html_table = hit
            box_url = url
            break
    except Exception as e:
        # Best-effort: move on to the next boxscore, but report why this one
        # failed so that a wholesale block/outage is not mistaken for
        # "table missing".
        print(f"Error fetching {url}: {e}")
        continue

if html_table is None:
    raise SystemExit("No home_starters table found in any boxscore.")

print(f"Boxscore: {box_url}")

# The same markup is parsed twice: pandas supplies the tabular data, while
# BeautifulSoup supplies each player's anchor (display name and relative href).
starters_df = pd.read_html(StringIO(html_table), header=0, attrs={"id": "home_starters"})[0]
soup_table = BeautifulSoup(html_table, "lxml")

# One anchor per player cell; both derived lists come from this single query.
player_anchors = soup_table.select("table#home_starters th[data-stat='player'] a")
names = [anchor.get_text(strip=True) for anchor in player_anchors]
hrefs = [anchor["href"] for anchor in player_anchors]

# Overwrite "Player" with the anchor text, then append the relative and
# absolute player-page URLs.
starters_df = starters_df.assign(
    Player=names,
    player_href=hrefs,
    link=lambda frame: BASE + frame["player_href"],
)

def get_college(url, player_name):
    """Scrape a player's page and return the last college listed in its bio box.

    Sleeps 4-5 seconds before the request, downloads ``url``, finds the element
    with ``id="meta"``, locates the ``<strong>`` whose text starts with
    "College", and collects the text of the sibling ``<a>`` tags that follow it
    (up to the next ``<br>``) whose href starts with ``/schools/`` and does not
    contain ``high_schools``.

    Args:
        url: Absolute URL of the player's page.
        player_name: Name used only in the progress/error messages.

    Returns:
        The text of the last matching college link, or ``None`` when the meta
        box, the "College" label or any college link is missing, or when any
        exception occurs while fetching/parsing.

    Every outcome is reported with ``print``. Exceptions are caught and
    reported, never raised.
    """
    try:
        # Throttle: wait before every request.
        time.sleep(random.uniform(4, 5))
        req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
        # Close the response deterministically; the timeout stops one stalled
        # connection from hanging the whole lookup loop (a timeout is reported
        # through the generic handler below).
        with urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
        soup = BeautifulSoup(html, "lxml")
        meta = soup.find(id="meta")
        if not meta:
            print(f"{player_name} college not found (no meta box)")
            return None
        label = meta.find("strong", string=lambda s: s and s.strip().startswith("College"))
        if not label:
            print(f"{player_name} college not found (no College label)")
            return None
        colleges = []
        for node in label.next_siblings:
            tag_name = getattr(node, "name", None)
            if tag_name == "br":
                # The line break ends the "College" line of the bio box.
                break
            if tag_name == "a":
                href = node.get("href", "")
                if href.startswith("/schools/") and "high_schools" not in href:
                    colleges.append(node.get_text(strip=True))
        if colleges:
            # When several colleges are listed, keep the last one.
            college = colleges[-1]
            print(f"{player_name} college has been collected: {college}")
            return college
        print(f"{player_name} college not found (no college links)")
        return None
    except Exception as e:
        print(f"Error fetching {url} for {player_name}: {e}")
        return None

def _college_or_none(link, player):
    """Return the scraped college for one starter, or None when the link is null."""
    if pd.isnull(link):
        return None
    return get_college(link, player)


# Look up every starter in row order (one throttled request per player).
starters_df["college"] = list(
    map(_college_or_none, starters_df["link"], starters_df["Player"])
)

# Show full cell contents instead of truncating long strings.
pd.set_option("display.max_colwidth", None)
starters_df[["Player", "Pos", "college"]]


Season: 2016, Team: phi
Boxscore: https://www.pro-football-reference.com/boxscores/201611060nyg.htm
Eli Manning college has been collected: Mississippi
Rashad Jennings college has been collected: Liberty
Odell Beckham Jr. college has been collected: LSU
Sterling Shepard college has been collected: Oklahoma
Dwayne Harris college has been collected: East Carolina
Will Tye college has been collected: Stony Brook
Ereck Flowers college has been collected: Miami (FL)
Justin Pugh college has been collected: Syracuse
Weston Richburg college has been collected: Colorado St.
John Jerry college has been collected: Mississippi
Bobby Hart college has been collected: Florida St.
Jason Pierre-Paul college has been collected: South Florida
Damon Harrison college has been collected: William Penn
Johnathan Hankins college has been collected: Ohio St.
Olivier Vernon college has been collected: Miami (FL)
Keenan Robinson college has been collected: Texas
Jonathan Casillas college has been collected: Wisco

Unnamed: 0,Player,Pos,college
0,Eli Manning,QB,Mississippi
1,Rashad Jennings,RB,Liberty
2,Odell Beckham Jr.,WR,LSU
3,Sterling Shepard,WR,Oklahoma
4,Dwayne Harris,WR,East Carolina
5,Will Tye,TE,Stony Brook
6,Ereck Flowers,LT,Miami (FL)
7,Justin Pugh,LG,Syracuse
8,Weston Richburg,C,Colorado St.
9,John Jerry,RG,Mississippi


In [2]:
# Expand abbreviated school names (e.g. "Colorado St." -> "Colorado State").
#
# `regex` is passed explicitly because the default of Series.str.replace
# changed from True to False in pandas 2.0. As a regex, "St." would also match
# "Sto" in "Stony Brook" and "Sta" in an already-expanded "State", corrupting
# names and making a re-run of this cell non-idempotent.
#
# "St." is left alone at the very start of a name, where it abbreviates
# "Saint" (e.g. "St. John's"), not "State".
starters_df["college"] = (
    starters_df["college"]
    .str.replace(r"(?<!^)\bSt\.", "State", regex=True)
    .str.replace("Col.", "College", regex=False)
)

In [3]:
# Display the whole starters table (every column) to check the college names
# after the abbreviation clean-up in the previous cell.
starters_df

Unnamed: 0,Player,Pos,player_href,link,college
0,Eli Manning,QB,/players/M/MannEl00.htm,https://www.pro-football-reference.com/players/M/MannEl00.htm,Mississippi
1,Rashad Jennings,RB,/players/J/JennRa00.htm,https://www.pro-football-reference.com/players/J/JennRa00.htm,Liberty
2,Odell Beckham Jr.,WR,/players/B/BeckOd00.htm,https://www.pro-football-reference.com/players/B/BeckOd00.htm,LSU
3,Sterling Shepard,WR,/players/S/ShepSt00.htm,https://www.pro-football-reference.com/players/S/ShepSt00.htm,Oklahoma
4,Dwayne Harris,WR,/players/H/HarrDw00.htm,https://www.pro-football-reference.com/players/H/HarrDw00.htm,East Carolina
5,Will Tye,TE,/players/T/TyexWi00.htm,https://www.pro-football-reference.com/players/T/TyexWi00.htm,Stony Brook
6,Ereck Flowers,LT,/players/F/FlowEr00.htm,https://www.pro-football-reference.com/players/F/FlowEr00.htm,Miami (FL)
7,Justin Pugh,LG,/players/P/PughJu00.htm,https://www.pro-football-reference.com/players/P/PughJu00.htm,Syracuse
8,Weston Richburg,C,/players/R/RichWe00.htm,https://www.pro-football-reference.com/players/R/RichWe00.htm,Colorado State
9,John Jerry,RG,/players/J/JerrJo20.htm,https://www.pro-football-reference.com/players/J/JerrJo20.htm,Mississippi


In [4]:
# Load the CSV with college names + conferences.
# NOTE(review): absolute, machine-specific path; consider making it relative
# to the project or configurable so the notebook runs on other machines.
CBB_CSV_PATH = "/Users/noah/Desktop/starting5/app/static/json/cbb25.csv"
cbb_df = pd.read_csv(CBB_CSV_PATH)

# School -> conference lookup: at most one row per 'Common name', so the left
# merge below can never multiply starter rows; 'Primary' is renamed up front.
conference_lookup = (
    cbb_df[['Common name', 'Primary']]
    .drop_duplicates(subset='Common name')
    .rename(columns={'Primary': 'conference'})
)

# Left-merge on college name ('college' in starters_df, 'Common name' in the
# lookup). Any 'conference' column left over from a previous run of this cell
# is dropped first, so re-running the cell yields the same frame instead of
# duplicate 'conference' columns. Colleges with no exact name match get NaN.
starters_df = (
    starters_df
    .drop(columns=['conference'], errors='ignore')
    .merge(
        conference_lookup,
        left_on='college',
        right_on='Common name',
        how='left'
    )
    .drop(columns=['Common name'])
)

pd.set_option('display.max_colwidth', None)
starters_df


Unnamed: 0,Player,Pos,player_href,link,college,conference
0,Eli Manning,QB,/players/M/MannEl00.htm,https://www.pro-football-reference.com/players/M/MannEl00.htm,Mississippi,
1,Rashad Jennings,RB,/players/J/JennRa00.htm,https://www.pro-football-reference.com/players/J/JennRa00.htm,Liberty,Conference USA
2,Odell Beckham Jr.,WR,/players/B/BeckOd00.htm,https://www.pro-football-reference.com/players/B/BeckOd00.htm,LSU,Southeastern Conference
3,Sterling Shepard,WR,/players/S/ShepSt00.htm,https://www.pro-football-reference.com/players/S/ShepSt00.htm,Oklahoma,Southeastern Conference
4,Dwayne Harris,WR,/players/H/HarrDw00.htm,https://www.pro-football-reference.com/players/H/HarrDw00.htm,East Carolina,American Athletic Conference
5,Will Tye,TE,/players/T/TyexWi00.htm,https://www.pro-football-reference.com/players/T/TyexWi00.htm,Stony Brook,Coastal Athletic Association
6,Ereck Flowers,LT,/players/F/FlowEr00.htm,https://www.pro-football-reference.com/players/F/FlowEr00.htm,Miami (FL),
7,Justin Pugh,LG,/players/P/PughJu00.htm,https://www.pro-football-reference.com/players/P/PughJu00.htm,Syracuse,Atlantic Coast Conference
8,Weston Richburg,C,/players/R/RichWe00.htm,https://www.pro-football-reference.com/players/R/RichWe00.htm,Colorado State,Mountain West Conference
9,John Jerry,RG,/players/J/JerrJo20.htm,https://www.pro-football-reference.com/players/J/JerrJo20.htm,Mississippi,


NameError: Expected a DataFrame named `starters_df` in the notebook environment.