In [48]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from io import StringIO
import string
import time

In [3]:
## Find players in "A" page that played in 2024 ##
# Target a single player list page: Last names starting with 'a'
url = "https://www.basketball-reference.com/wnba/players/a/"
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

# Grab all <p> tags that list players
player_paragraphs = soup.find_all("p")

# Print players who played in 2024
for p in player_paragraphs:
    if "2024" in p.text:
        print(p.text.strip())

Lindsay Allen
2017 to 2024
Rebecca Allen
2015 to 2024
Laeticia Amihere
2023 to 2024
At  Ariel Atkins
2018 to 2024
Amy Atwell
2022 to 2024
Shakira Austin
2022 to 2024


In [5]:
url = "https://www.basketball-reference.com/wnba/players/a/"
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

player_paragraphs = soup.find_all("p")

# Expand this to show player URLs
for p in player_paragraphs:
    if "2024" in p.text:
        a_tag = p.find("a")
        name = a_tag.text.strip()
        link = a_tag["href"]
        print(name, link)

Lindsay Allen /wnba/players/a/allenli01w.html
Rebecca Allen /wnba/players/a/allenre01w.html
Laeticia Amihere /wnba/players/a/amihela01w.html
Ariel Atkins /wnba/players/a/atkinar01w.html
Amy Atwell /wnba/players/a/atwelam01w.html
Shakira Austin /wnba/players/a/austish01w.html


In [7]:
## Try to see tables for Lindsay Allen in game logs page ##
# Target 2024 gamelog page for Lindsay Allen
url = "https://www.basketball-reference.com/wnba/players/a/allenli01w/gamelog/2024/"
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

# Find the table we care about
table = soup.find("table", id="wnba_pgl_basic")

# Parse the table if it exists
if table is not None:
    df = pd.read_html(StringIO(str(table)))[0]
    print(df.head())
else:
    print("Table not found")

  Rk        Date     Age   Tm Unnamed: 4  Opp Unnamed: 6 GS     MP FG  ...  \
0  1  2024-05-15  29-056  CHI          @  DAL     L (-8)  0  13:31  2  ...   
1  2  2024-05-18  29-059  CHI          @  DAL     W (+9)  0   9:06  1  ...   
2  3  2024-05-23  29-064  CHI          @  NYL     W (+9)  0  18:44  4  ...   
3  4  2024-05-25  29-066  CHI        NaN  CON     L (-4)  0  15:38  3  ...   
4  5  2024-05-28  29-069  CHI        NaN  SEA     L (-9)  0  23:03  0  ...   

  ORB DRB TRB AST STL BLK TOV PF PTS GmSc  
0   0   1   1   0   1   0   0  0   5  3.2  
1   0   1   1   1   0   0   1  0   2  1.7  
2   0   1   1   2   0   0   1  3   8  4.2  
3   1   1   2   2   2   0   2  1   6  7.1  
4   0   2   2   4   0   0   3  2   3  0.1  

[5 rows x 28 columns]


In [9]:
for col in df.columns:
    print(col)

Rk
Date
Age
Tm
Unnamed: 4
Opp
Unnamed: 6
GS
MP
FG
FGA
FG%
3P
3PA
3P%
FT
FTA
FT%
ORB
DRB
TRB
AST
STL
BLK
TOV
PF
PTS
GmSc


In [11]:
# Rename columns
df = df.rename(columns={
    "Unnamed: 4": "home_away",
    "Unnamed: 6": "win_margin"
})

# Drop header rows accidentally parsed as data
df = df[df["Rk"] != "Rk"]

# Convert Age from 'YY-DDD' to decimal years
age_parts = df["Age"].str.extract(r"(\d+)-(\d+)")
age_parts = age_parts.astype(float)
df["Age"] = round(age_parts[0] + age_parts[1] / 365, 1)

# Convert home_away: '@' → 'away', else 'home'
df["home_away"] = df["home_away"].apply(lambda x: "away" if x == "@" else "home")

# Convert win_margin: extract number inside parentheses
df["win_margin"] = df["win_margin"].str.extract(r"\(([-+]?\d+)\)").astype(float)

# Convert MP (minutes played) from MM:SS to float minutes
def convert_mp(val):
    if pd.isna(val):
        return np.nan
    mins, secs = map(int, val.split(":"))
    return round(mins + secs / 60, 1)

df["MP"] = df["MP"].apply(convert_mp)

In [13]:
df.head()

Unnamed: 0,Rk,Date,Age,Tm,home_away,Opp,win_margin,GS,MP,FG,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc
0,1,2024-05-15,29.2,CHI,away,DAL,-8.0,0,13.5,2,...,0,1,1,0,1,0,0,0,5,3.2
1,2,2024-05-18,29.2,CHI,away,DAL,9.0,0,9.1,1,...,0,1,1,1,0,0,1,0,2,1.7
2,3,2024-05-23,29.2,CHI,away,NYL,9.0,0,18.7,4,...,0,1,1,2,0,0,1,3,8,4.2
3,4,2024-05-25,29.2,CHI,home,CON,-4.0,0,15.6,3,...,1,1,2,2,2,0,2,1,6,7.1
4,5,2024-05-28,29.2,CHI,home,SEA,-9.0,0,23.1,0,...,0,2,2,4,0,0,3,2,3,0.1


In [15]:
# Start fresh to avoid duplication from earlier cells
frames = []
df = pd.DataFrame()

# Collect all 2024 players from the 'a' page
url = "https://www.basketball-reference.com/wnba/players/a/"
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

player_tags = soup.find_all("p")
players = []

for tag in player_tags:
    if "2024" in tag.text:
        a_tag = tag.find("a")
        name = a_tag.text.strip()
        link = a_tag["href"]
        players.append((name, link))

# Loop through those players and extract + clean their gamelog tables
frames = []

for name, rel_link in players:
    gamelog_url = f"https://www.basketball-reference.com{rel_link.replace('.html', '/gamelog/2024/')}"
    response = requests.get(gamelog_url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", id="wnba_pgl_basic")

    if table is None:
        print(f"No table found for {name}")
        continue

    df = pd.read_html(StringIO(str(table)))[0]
    df = df[df["Rk"] != "Rk"]

    df = df.rename(columns={
        "Unnamed: 4": "home_away",
        "Unnamed: 6": "win_margin"
    })

    # Clean age
    age_parts = df["Age"].str.extract(r"(\d+)-(\d+)").astype(float)
    df["Age"] = round(age_parts[0] + age_parts[1] / 365, 1)

    # Clean other fields
    df["home_away"] = df["home_away"].apply(lambda x: "away" if x == "@" else "home")
    df["win_margin"] = df["win_margin"].str.extract(r"\(([-+]?\d+)\)").astype(float)

    def convert_mp(val):
        if pd.isna(val):
            return np.nan
        mins, secs = map(int, val.split(":"))
        return round(mins + secs / 60, 1)

    df["MP"] = df["MP"].apply(convert_mp)

    df.insert(0, "Player", name)
    frames.append(df)

# Combine all players into one DataFrame
df = pd.concat(frames, ignore_index=True)

In [17]:
df.head()

Unnamed: 0,Player,Rk,Date,Age,Tm,home_away,Opp,win_margin,GS,MP,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc
0,Lindsay Allen,1,2024-05-15,29.2,CHI,away,DAL,-8.0,0,13.5,...,0,1,1,0,1,0,0,0,5,3.2
1,Lindsay Allen,2,2024-05-18,29.2,CHI,away,DAL,9.0,0,9.1,...,0,1,1,1,0,0,1,0,2,1.7
2,Lindsay Allen,3,2024-05-23,29.2,CHI,away,NYL,9.0,0,18.7,...,0,1,1,2,0,0,1,3,8,4.2
3,Lindsay Allen,4,2024-05-25,29.2,CHI,home,CON,-4.0,0,15.6,...,1,1,2,2,2,0,2,1,6,7.1
4,Lindsay Allen,5,2024-05-28,29.2,CHI,home,SEA,-9.0,0,23.1,...,0,2,2,4,0,0,3,2,3,0.1


In [19]:
for player in df["Player"].unique():
    print(player)

Lindsay Allen
Rebecca Allen
Laeticia Amihere
Ariel Atkins
Amy Atwell
Shakira Austin


In [21]:
# Null check
print(df.isnull().sum())

Player         0
Rk             0
Date           0
Age            0
Tm             0
home_away      0
Opp            0
win_margin     0
GS             0
MP             0
FG             0
FGA            0
FG%            8
3P             0
3PA            0
3P%           36
FT             0
FTA            0
FT%           65
ORB            0
DRB            0
TRB            0
AST            0
STL            0
BLK            0
TOV            0
PF             0
PTS            0
GmSc           0
dtype: int64


In [33]:
import string

# Base URL pattern
base_url = "https://www.basketball-reference.com/wnba/players/{}/"
headers = {"User-Agent": "Mozilla/5.0"}

# Store (name, link) for all 2024 players
players = []

# Loop through a–z player index pages
for letter in string.ascii_lowercase:
    url = base_url.format(letter)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    player_paragraphs = soup.find_all("p")

    # Filter for players with '2024' in their years active
    for p in player_paragraphs:
        if "2024" in p.text:
            a_tag = p.find("a")
            name = a_tag.text.strip()
            link = a_tag["href"]
            players.append((name, link))

# Print result
print(f"Found {len(players)} players with 2024:")
for name, link in players:
    print(name, link)

Found 157 players with 2024:
Lindsay Allen /wnba/players/a/allenli01w.html
Rebecca Allen /wnba/players/a/allenre01w.html
Laeticia Amihere /wnba/players/a/amihela01w.html
Ariel Atkins /wnba/players/a/atkinar01w.html
Amy Atwell /wnba/players/a/atwelam01w.html
Shakira Austin /wnba/players/a/austish01w.html
Rachel Banham /wnba/players/b/banhara01w.html
Kierstan Bell /wnba/players/b/bellki01w.html
Grace Berger /wnba/players/b/bergegr01w.html
Morgan Bertsch /wnba/players/b/bertsmo01w.html
Caitlin Bickle /wnba/players/b/bicklca01w.html
Monique Billings /wnba/players/b/billimo01w.html
DeWanna Bonner /wnba/players/b/bonnede01w.html
Aliyah Boston /wnba/players/b/bostoal01w.html
Cameron Brink /wnba/players/b/brinkca01w.html
Jaelyn Brown /wnba/players/b/brownja06w.html
Kalani Brown /wnba/players/b/brownka01w.html
Lexie Brown /wnba/players/b/brownle02w.html
Jakia Brown-Turner /wnba/players/b/brownja07w.html
Kennedy Burke /wnba/players/b/burkeke01w.html
Rae Burrell /wnba/players/b/burrera01w.html
Ve

In [35]:
# Count players with last names starting with 'B'
count_b = sum(1 for name, _ in players if name.split()[-1].startswith("B"))
print(f"Players with last name starting with B: {count_b}")

Players with last name starting with B: 16


In [37]:
# Count players with last names starting with 'I'
count_i = sum(1 for name, _ in players if name.split()[-1].startswith("I"))
print(f"Players with last name starting with I: {count_i}")

Players with last name starting with I: 1


In [43]:
# Start fresh to avoid duplication from earlier cells
frames = []
df = pd.DataFrame()

# Loop through all letter index pages a–z
players = []

for letter in string.ascii_lowercase:
    url = f"https://www.basketball-reference.com/wnba/players/{letter}/"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    player_tags = soup.find_all("p")

    for tag in player_tags:
        if "2024" in tag.text:
            a_tag = tag.find("a")
            name = a_tag.text.strip()
            link = a_tag["href"]
            players.append((name, link))

# Loop through those players and extract + clean their gamelog tables
frames = []

# Loop through players and extract gamelog data
for name, rel_link in players:
    gamelog_url = f"https://www.basketball-reference.com{rel_link.replace('.html', '/gamelog/2024/')}"
    response = requests.get(gamelog_url, headers=headers)

    # Wait to avoid rate limits
    time.sleep(3)  # ~20 requests per minute max

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", id="wnba_pgl_basic")

    if table is None:
        print(f"No table found for {name}")
        continue

    df = pd.read_html(StringIO(str(table)))[0]
    df = df[df["Rk"] != "Rk"]

    df = df.rename(columns={
        "Unnamed: 4": "home_away",
        "Unnamed: 6": "win_margin"
    })

    # Clean age
    age_parts = df["Age"].str.extract(r"(\d+)-(\d+)").astype(float)
    df["Age"] = round(age_parts[0] + age_parts[1] / 365, 1)

    # Clean other fields
    df["home_away"] = df["home_away"].apply(lambda x: "away" if x == "@" else "home")
    df["win_margin"] = df["win_margin"].str.extract(r"\(([-+]?\d+)\)").astype(float)

    def convert_mp(val):
        if pd.isna(val):
            return np.nan
        mins, secs = map(int, val.split(":"))
        return round(mins + secs / 60, 1)

    df["MP"] = df["MP"].apply(convert_mp)

    df.insert(0, "Player", name)
    frames.append(df)

# Combine all players into one DataFrame
df = pd.concat(frames, ignore_index=True)

No table found for Napheesa Collier
No table found for Sydney Colson
No table found for Kahleah Copper
No table found for Sophie Cunningham
No table found for Crystal Dangerfield
No table found for Kaela Davis
No table found for Marquesha Davis
No table found for Diamond DeShields
No table found for Liz Dixon
No table found for Ivana Dojkić
No table found for Emily Engstler
No table found for Olivia Époupa
No table found for Dana Evans
No table found for Temi Fagbenle
No table found for Dyaisha Fair
No table found for Dulcy Fankam Mendjiadeu
No table found for Leonie Fiebich
No table found for Kysre Gondrezick
No table found for Allisha Gray
No table found for Chelsea Gray
No table found for Brittney Griner
No table found for Megan Gustafson
No table found for Dearica Hamby
No table found for Mikiah Herbert Harrigan
No table found for Tyasha Harris
No table found for Isabelle Harrison
No table found for Tiffany Hayes
No table found for Destanni Henderson
No table found for Natisha Hied

In [46]:
for player in df["Player"].unique():
    print(player)

Lindsay Allen
Rebecca Allen
Laeticia Amihere
Ariel Atkins
Amy Atwell
Shakira Austin
Rachel Banham
Kierstan Bell
Grace Berger
Morgan Bertsch
Caitlin Bickle
Monique Billings
DeWanna Bonner
Aliyah Boston
Cameron Brink
Jaelyn Brown
Kalani Brown
Lexie Brown
Jakia Brown-Turner
Kennedy Burke
Rae Burrell
Veronica Burton
Maya Caldwell
Jordin Canada
Emma Cannon
Kamilla Cardoso
Bridget Carleton
DiJonai Carrington
Chennedy Carter
Jessika Carter
Tina Charles
Layshia Clarendon
Alysha Clark
Caitlin Clark
Natasha Cloud
Nia Coffey
Zia Cooke
Lorela Cubaj
Damiris Dantas
Skylar Diggins-Smith
Stefanie Dolson
Aaliyah Edwards
Queen Egbo
