In [21]:
# Clone your GitHub repo (you’ll be prompted to authorize if it's private)
!git clone https://github.com/colterwood/LHL-final-final-project.git

fatal: destination path 'LHL-final-final-project' already exists and is not an empty directory.


In [22]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, Comment
import requests
from io import StringIO
import string
import time
import re
from functools import reduce

In [23]:
# Base URL pattern
base_url = "https://www.basketball-reference.com/wnba/players/{}/"
headers = {"User-Agent": "Mozilla/5.0"}

# Store (name, link) for all 2024 players
players = []

# Loop through a–z player index pages
for letter in string.ascii_lowercase:
    url = base_url.format(letter)
    # A timeout keeps one stalled connection from hanging the whole cell
    response = requests.get(url, headers=headers, timeout=30)

    # Pause after every request (same delay as the per-player scrape below)
    # so 26 back-to-back hits do not trip the site's rate limiting
    time.sleep(5)

    # A letter without an index page simply has no players to collect
    if response.status_code == 404:
        continue
    # Any other HTTP error (e.g. 429 rate limit) would otherwise be parsed as
    # an ordinary page containing no players, silently shrinking the list
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    player_paragraphs = soup.find_all("p")

    # Filter for players with '2024' in their years active
    for p in player_paragraphs:
        if "2024" not in p.text:
            continue
        a_tag = p.find("a")
        # Guard against a matching paragraph that has no player link:
        # p.find returns None there and .text / ["href"] would raise
        if a_tag is None or not a_tag.get("href"):
            continue
        players.append((a_tag.text.strip(), a_tag["href"]))

# Print result
print(f"Found {len(players)} players with 2024:")
for name, link in players:
    print(name, link)

Found 157 players with 2024:
Lindsay Allen /wnba/players/a/allenli01w.html
Rebecca Allen /wnba/players/a/allenre01w.html
Laeticia Amihere /wnba/players/a/amihela01w.html
Ariel Atkins /wnba/players/a/atkinar01w.html
Amy Atwell /wnba/players/a/atwelam01w.html
Shakira Austin /wnba/players/a/austish01w.html
Rachel Banham /wnba/players/b/banhara01w.html
Kierstan Bell /wnba/players/b/bellki01w.html
Grace Berger /wnba/players/b/bergegr01w.html
Morgan Bertsch /wnba/players/b/bertsmo01w.html
Caitlin Bickle /wnba/players/b/bicklca01w.html
Monique Billings /wnba/players/b/billimo01w.html
DeWanna Bonner /wnba/players/b/bonnede01w.html
Aliyah Boston /wnba/players/b/bostoal01w.html
Cameron Brink /wnba/players/b/brinkca01w.html
Jaelyn Brown /wnba/players/b/brownja06w.html
Kalani Brown /wnba/players/b/brownka01w.html
Lexie Brown /wnba/players/b/brownle02w.html
Jakia Brown-Turner /wnba/players/b/brownja07w.html
Kennedy Burke /wnba/players/b/burkeke01w.html
Rae Burrell /wnba/players/b/burrera01w.html
Ve

In [24]:
# name, link = players[0]  # or any other index
table_ids = ["per_game", "per_minute", "per_poss", "advanced", "shooting", "pbp"]
no_prefix_cols = {"Player", "Year", "Tm", "Age", "G", "GS"}
headers = {"User-Agent": "Mozilla/5.0"}


def load_table(table_id, soup, comments, player_name):
    """Return one stats table from a player page as a DataFrame, or None if absent.

    The table is looked up by the element id ``table_id + "0"``, first in the
    parsed page and then inside the page's HTML comments.

    Args:
        table_id: Base table id, one of the entries of ``table_ids``.
        soup: BeautifulSoup document of the player page.
        comments: HTML comment nodes of that page, searched as a fallback.
        player_name: Value written into the inserted leading "Player" column.

    Returns:
        The table as a DataFrame whose columns, except those listed in
        ``no_prefix_cols``, are prefixed with ``table_id``; None (after
        printing a message) when the table is not found anywhere.
    """
    full_id = table_id + "0"
    tag = soup.find("table", {"id": full_id})
    if tag is None:
        # Fall back to tables that are embedded in HTML comments
        for c in comments:
            if f'id="{full_id}"' in c:
                tag = BeautifulSoup(c, "html.parser").find("table", {"id": full_id})
                break
    if tag is None:
        print(f"Table not found: {table_id}")
        return None

    if table_id in ["shooting", "pbp"]:
        # These tables have two header rows; flatten them to "group_stat",
        # keeping only the second level where the first is an "Unnamed" filler
        df = pd.read_html(StringIO(str(tag)), header=[0, 1])[0]
        df.columns = [f"{a}_{b}" if not a.startswith("Unnamed") else b for a, b in df.columns]
    else:
        df = pd.read_html(StringIO(str(tag)), header=0)[0]

    df.columns = [col if col in no_prefix_cols else f"{table_id}_{col}" for col in df.columns]
    df.insert(0, "Player", player_name)
    return df


# Accumulators
per_game_frames = []
per_minute_frames = []
per_poss_frames = []
advanced_frames = []
shooting_frames = []
pbp_frames = []

# Route every table id to its accumulator so one loop replaces six copies
frames_by_table = {
    "per_game": per_game_frames,
    "per_minute": per_minute_frames,
    "per_poss": per_poss_frames,
    "advanced": advanced_frames,
    "shooting": shooting_frames,
    "pbp": pbp_frames,
}

for name, link in players:
    print(f"Processing: {name}")

    url = "https://www.basketball-reference.com" + link
    response = requests.get(url, headers=headers, timeout=30)
    # Stop on an HTTP error (e.g. 429 rate limit): parsing the error page would
    # only print "Table not found" six times and silently drop the player
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    # Load all 6 tables and append
    for table_id in table_ids:
        t = load_table(table_id, soup, comments, name)
        if t is not None:
            frames_by_table[table_id].append(t)

    time.sleep(5)

# Combine each full table
per_game = pd.concat(per_game_frames, ignore_index=True)
per_minute = pd.concat(per_minute_frames, ignore_index=True)
per_poss = pd.concat(per_poss_frames, ignore_index=True)
advanced = pd.concat(advanced_frames, ignore_index=True)
shooting = pd.concat(shooting_frames, ignore_index=True)
pbp = pd.concat(pbp_frames, ignore_index=True)

Processing: Lindsay Allen
Processing: Rebecca Allen
Processing: Laeticia Amihere
Processing: Ariel Atkins
Processing: Amy Atwell
Processing: Shakira Austin
Processing: Rachel Banham
Processing: Kierstan Bell
Processing: Grace Berger
Processing: Morgan Bertsch
Processing: Caitlin Bickle
Processing: Monique Billings
Processing: DeWanna Bonner
Processing: Aliyah Boston
Processing: Cameron Brink
Processing: Jaelyn Brown
Processing: Kalani Brown
Processing: Lexie Brown
Processing: Jakia Brown-Turner
Processing: Kennedy Burke
Processing: Rae Burrell
Processing: Veronica Burton
Processing: Maya Caldwell
Processing: Jordin Canada
Processing: Emma Cannon
Processing: Kamilla Cardoso
Processing: Bridget Carleton
Processing: DiJonai Carrington
Processing: Chennedy Carter
Processing: Jessika Carter
Processing: Tina Charles
Processing: Layshia Clarendon
Processing: Alysha Clark
Processing: Caitlin Clark
Processing: Natasha Cloud
Processing: Nia Coffey
Processing: Napheesa Collier
Processing: Sydney 

In [25]:
# Display the combined per-game table.
per_game

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,per_game_DRB,per_game_TRB,per_game_AST,per_game_STL,per_game_BLK,per_game_TOV,per_game_PF,per_game_PTS,per_game_Awards,per_game_Unnamed: 29
0,Lindsay Allen,2017,NYL,22.0,28,0,13.4,0.8,2.2,.371,...,1.1,1.5,2.2,0.6,0.0,0.7,0.9,1.9,,
1,Lindsay Allen,2018,LVA,23.0,24,6,14.9,1.2,3.0,.384,...,1.1,1.3,2.9,0.6,0.0,0.9,0.7,3.1,,
2,Lindsay Allen,2019,,23.0,Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),...,Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury)
3,Lindsay Allen,2020,LVA,25.0,21,21,13.5,1.3,3.1,.424,...,0.8,1.1,2.4,0.3,0.0,0.8,1.0,3.3,,
4,Lindsay Allen,2021,IND,26.0,32,8,17.8,2.0,4.8,.428,...,1.2,1.5,3.0,0.5,0.1,1.0,1.1,5.4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1550,Cecilia Zandalasini,2021,,24.0,Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),...,Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA)
1551,Cecilia Zandalasini,2022,,25.0,Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),...,Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA)
1552,Cecilia Zandalasini,2023,,26.0,Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),...,Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA),Did Not Play (did not appear in WNBA)
1553,Cecilia Zandalasini,2024,MIN,28.0,40,0,12.2,1.7,3.7,.453,...,1.0,1.2,1.1,0.4,0.2,0.8,1.4,4.6,,


In [26]:
# Display the combined per-minute table.
per_minute

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_minute_MP,per_minute_FG,per_minute_FGA,per_minute_FG%,...,per_minute_FT%,per_minute_ORB,per_minute_DRB,per_minute_TRB,per_minute_AST,per_minute_STL,per_minute_BLK,per_minute_TOV,per_minute_PF,per_minute_PTS
0,Lindsay Allen,2017,NYL,22.0,28.0,0.0,376.0,2.2,5.9,0.371,...,0.700,1.0,3.1,4.0,5.9,1.5,0.1,1.8,2.3,5.1
1,Lindsay Allen,2018,LVA,23.0,24.0,6.0,358.0,2.8,7.3,0.384,...,0.708,0.3,2.7,3.0,6.9,1.5,0.0,2.1,1.7,7.4
2,Lindsay Allen,2020,LVA,25.0,21.0,21.0,284.0,3.5,8.4,0.424,...,0.800,0.8,2.2,2.9,6.5,0.9,0.1,2.0,2.5,8.9
3,Lindsay Allen,2021,IND,26.0,32.0,8.0,571.0,4.1,9.6,0.428,...,0.811,0.6,2.5,3.0,6.1,1.1,0.2,2.0,2.1,11.0
4,Lindsay Allen,2022,MIN,27.0,9.0,0.0,134.0,5.4,10.2,0.526,...,0.923,0.0,3.8,3.8,8.3,0.5,0.0,1.9,3.0,16.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,Li Yueru,1 season,CHI,,16.0,0.0,82.0,3.5,7.9,0.444,...,1.000,2.2,8.3,10.5,0.9,0.9,0.9,5.7,4.0,12.3
1453,Cecilia Zandalasini,2017,MIN,21.0,3.0,0.0,19.0,1.9,7.6,0.250,...,,0.0,1.9,1.9,0.0,0.0,0.0,3.8,3.8,3.8
1454,Cecilia Zandalasini,2018,MIN,22.0,29.0,6.0,478.0,4.6,11.2,0.409,...,0.840,0.5,3.6,4.1,2.4,0.6,0.1,2.3,3.8,12.5
1455,Cecilia Zandalasini,2024,MIN,28.0,40.0,0.0,487.0,5.0,10.9,0.453,...,0.621,0.4,3.0,3.4,3.2,1.1,0.7,2.3,4.1,13.5


In [27]:
# Display the combined per-possession table.
per_poss

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_poss_MP,per_poss_FG,per_poss_FGA,per_poss_FG%,...,per_poss_TRB,per_poss_AST,per_poss_STL,per_poss_BLK,per_poss_TOV,per_poss_PF,per_poss_PTS,per_poss_Unnamed: 27,per_poss_ORtg,per_poss_DRtg
0,Lindsay Allen,2017,NYL,22.0,28.0,0.0,376.0,3.2,8.5,0.371,...,5.8,8.5,2.2,0.1,2.6,3.3,7.3,,95.0,101.0
1,Lindsay Allen,2018,LVA,23.0,24.0,6.0,358.0,3.9,10.1,0.384,...,4.1,9.5,2.1,0.0,2.9,2.4,10.2,,96.0,110.0
2,Lindsay Allen,2020,LVA,25.0,21.0,21.0,284.0,4.9,11.5,0.424,...,4.0,8.9,1.2,0.2,2.8,3.5,12.2,,104.0,104.0
3,Lindsay Allen,2021,IND,26.0,32.0,8.0,571.0,5.9,13.8,0.428,...,4.3,8.7,1.5,0.3,2.8,3.1,15.8,,107.0,113.0
4,Lindsay Allen,2022,MIN,27.0,9.0,0.0,134.0,7.6,14.5,0.526,...,5.3,11.8,0.8,0.0,2.7,4.2,22.8,,136.0,110.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,Li Yueru,1 season,CHI,,16.0,0.0,82.0,4.9,11.1,0.444,...,14.8,1.2,1.2,1.2,8.0,5.5,17.2,,85.0,100.0
1453,Cecilia Zandalasini,2017,MIN,21.0,3.0,0.0,19.0,2.7,10.8,0.250,...,2.7,0.0,0.0,0.0,5.4,5.4,5.4,,35.0,104.0
1454,Cecilia Zandalasini,2018,MIN,22.0,29.0,6.0,478.0,6.6,16.2,0.409,...,5.9,3.5,0.9,0.1,3.4,5.6,18.1,,95.0,106.0
1455,Cecilia Zandalasini,2024,MIN,28.0,40.0,0.0,487.0,7.1,15.7,0.453,...,4.9,4.5,1.6,1.0,3.3,5.8,19.4,,100.0,100.0


In [28]:
# Display the combined advanced-stats table.
advanced

Unnamed: 0,Player,Year,Tm,Age,G,advanced_MP,advanced_PER,advanced_TS%,advanced_3PAr,advanced_FTr,...,advanced_AST%,advanced_STL%,advanced_BLK%,advanced_TOV%,advanced_USG%,advanced_Unnamed: 17,advanced_OWS,advanced_DWS,advanced_WS,advanced_WS/48
0,Lindsay Allen,2017,NYL,22.0,28.0,376.0,9.0,0.399,0.210,0.161,...,24.6,2.2,0.2,22.2,10.0,,0.0,0.6,0.6,0.080
1,Lindsay Allen,2018,LVA,23.0,24.0,358.0,9.8,0.443,0.219,0.329,...,27.5,2.1,0.0,20.1,12.5,,0.0,0.2,0.2,0.024
2,Lindsay Allen,2020,LVA,25.0,21.0,284.0,10.5,0.497,0.258,0.152,...,25.0,1.2,0.3,18.5,13.4,,0.3,0.4,0.6,0.106
3,Lindsay Allen,2021,IND,26.0,32.0,571.0,13.6,0.517,0.309,0.243,...,28.2,1.5,0.4,15.6,15.6,,0.9,-0.2,0.7,0.059
4,Lindsay Allen,2022,MIN,27.0,9.0,134.0,21.4,0.686,0.368,0.342,...,38.5,0.8,0.0,13.8,16.9,,0.7,0.0,0.8,0.280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,Li Yueru,1 season,CHI,,16.0,82.0,8.3,0.601,0.000,0.667,...,3.4,1.2,1.9,35.8,20.1,,-0.1,0.2,0.0,0.010
1453,Cecilia Zandalasini,2017,MIN,21.0,3.0,19.0,-8.9,0.250,0.000,0.000,...,0.0,0.0,0.0,33.3,14.0,,-0.1,0.0,-0.1,-0.228
1454,Cecilia Zandalasini,2018,MIN,22.0,29.0,478.0,8.0,0.519,0.403,0.168,...,10.6,0.9,0.2,16.2,18.1,,-0.1,0.6,0.4,0.045
1455,Cecilia Zandalasini,2024,MIN,28.0,40.0,487.0,11.2,0.569,0.473,0.196,...,14.5,1.6,1.6,16.2,18.2,,0.3,0.9,1.2,0.115


In [29]:
# Display the combined shooting table.
shooting

Unnamed: 0,Player,Year,Tm,Age,G,shooting_MP,shooting_FG%,shooting_Dist.,shooting_Unnamed: 7_level_1,shooting_% of FGA by Distance_2P,...,shooting_% of FG Ast'd_3P,shooting_Unnamed: 24_level_1,shooting_Dunks_%FGA,shooting_Dunks_#,shooting_Unnamed: 27_level_1,shooting_Corner 3s_%3PA,shooting_Corner 3s_3P%,shooting_Unnamed: 30_level_1,shooting_Heaves_Att.,shooting_Heaves_#
0,Lindsay Allen,2017,NYL,22.0,28.0,376.0,0.371,13.5,,0.790,...,,,0.0,0.0,,0.000,,,1.0,0.0
1,Lindsay Allen,2018,LVA,23.0,24.0,358.0,0.384,14.5,,0.781,...,1.000,,0.0,0.0,,0.000,,,0.0,0.0
2,Lindsay Allen,2020,LVA,25.0,21.0,284.0,0.424,13.6,,0.742,...,0.667,,0.0,0.0,,0.118,0.500,,0.0,0.0
3,Lindsay Allen,2021,IND,26.0,32.0,571.0,0.428,14.0,,0.691,...,0.857,,0.0,0.0,,0.149,0.429,,0.0,0.0
4,Lindsay Allen,2022,MIN,27.0,9.0,134.0,0.526,14.9,,0.632,...,0.250,,0.0,0.0,,0.143,0.500,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,Li Yueru,1 season,CHI,,16.0,82.0,0.444,4.3,,1.000,...,,,0.0,0.0,,,,,0.0,0.0
1453,Cecilia Zandalasini,2017,MIN,21.0,3.0,19.0,0.250,13.1,,1.000,...,,,0.0,0.0,,,,,0.0,0.0
1454,Cecilia Zandalasini,2018,MIN,22.0,29.0,478.0,0.409,17.8,,0.597,...,1.000,,0.0,0.0,,0.200,0.333,,0.0,0.0
1455,Cecilia Zandalasini,2024,MIN,28.0,40.0,486.0,0.453,17.7,,0.527,...,1.000,,0.0,0.0,,0.200,0.429,,0.0,0.0


In [30]:
# Display the combined play-by-play table.
pbp

Unnamed: 0,Player,Year,Tm,Age,G,pbp_MP,pbp_+/- Per 100 Poss._OnCourt,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
0,Lindsay Allen,2017,NYL,22.0,28.0,376.0,10.3,9.2,13.0,5.0,8.0,1.0,5.0,5.0,137.0,2.0,1.0
1,Lindsay Allen,2018,LVA,23.0,24.0,358.0,1.3,2.2,16.0,3.0,5.0,0.0,6.0,6.0,153.0,3.0,3.0
2,Lindsay Allen,2020,LVA,25.0,21.0,284.0,3.7,-11.1,12.0,1.0,11.0,1.0,5.0,3.0,111.0,2.0,7.0
3,Lindsay Allen,2021,IND,26.0,32.0,571.0,-17.2,-7.6,21.0,4.0,12.0,2.0,16.0,7.0,219.0,3.0,8.0
4,Lindsay Allen,2022,MIN,27.0,9.0,134.0,-5.2,-3.5,6.0,0.0,5.0,1.0,4.0,1.0,70.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,Li Yueru,1 season,CHI,,16.0,82.0,-11.1,-18.7,4.0,4.0,2.0,2.0,6.0,0.0,5.0,0.0,1.0
1453,Cecilia Zandalasini,2017,MIN,21.0,3.0,19.0,-48.1,-63.3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1454,Cecilia Zandalasini,2018,MIN,22.0,29.0,478.0,-2.4,-5.9,13.0,7.0,23.0,2.0,11.0,8.0,72.0,2.0,10.0
1455,Cecilia Zandalasini,2024,MIN,28.0,40.0,486.0,2.5,-8.8,13.0,11.0,25.0,1.0,13.0,5.0,102.0,3.0,12.0


In [87]:
def clean_year_rows(df):
    """Keep only the rows whose Year is a four-digit season or the word "Career".

    The Year column is compared as text, so integer and string years are
    treated alike. Returns an independent copy; the input frame is untouched.
    """
    year_text = df["Year"].astype(str)
    is_season_or_career = year_text.str.match(r"^\d{4}$|^Career$")
    return df.loc[is_season_or_career].copy()

# Apply the Year filter to all six scraped tables
per_game, per_minute, per_poss, advanced, shooting, pbp = [
    clean_year_rows(table)
    for table in (per_game, per_minute, per_poss, advanced, shooting, pbp)
]

In [88]:
# Columns that identify one row across all six tables
merge_keys = ["Player", "Year", "Tm"]

# The remaining un-prefixed columns are shared by several tables; keep them only once
shared_cols = list(no_prefix_cols - set(merge_keys))

# per_game keeps every column; each other table loses whichever shared columns it has
dfs = [per_game, per_minute, per_poss, advanced, shooting, pbp]
cleaned_dfs = [dfs[0]] + [
    table.drop(columns=[col for col in shared_cols if col in table.columns])
    for table in dfs[1:]
]

# Outer-merge the tables left to right on Player, Year and Tm
df = cleaned_dfs[0]
for right in cleaned_dfs[1:]:
    df = pd.merge(df, right, on=merge_keys, how="outer")

# Order by Year, then Player, and renumber the rows
df = df.sort_values(by=["Year", "Player"]).reset_index(drop=True)

print("Final merged shape:", df.shape)
df.head(10)

Final merged shape: (1167, 137)


Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
0,Diana Taurasi,2004,PHO,22.0,34,34,33.2,6.1,14.8,.416,...,1.6,46.0,13.0,38.0,27.0,,,317.0,18.0,18.0
1,Diana Taurasi,2005,PHO,23.0,33,33,33.0,5.3,12.9,.410,...,4.8,61.0,20.0,42.0,22.0,,,342.0,8.0,12.0
2,Diana Taurasi,2006,PHO,24.0,34,34,33.9,8.8,19.4,.452,...,14.0,33.0,16.0,53.0,18.0,71.0,2.0,299.0,17.0,19.0
3,Diana Taurasi,2007,PHO,25.0,32,32,32.0,6.4,14.6,.440,...,2.7,50.0,13.0,49.0,15.0,52.0,6.0,312.0,23.0,10.0
4,Diana Taurasi,2008,PHO,26.0,34,34,31.9,7.6,17.0,.446,...,10.4,44.0,17.0,46.0,20.0,102.0,14.0,276.0,25.0,27.0
5,DeWanna Bonner,2009,PHO,21.0,34,0,21.3,3.7,8.2,.457,...,-5.3,5.0,14.0,34.0,6.0,65.0,3.0,30.0,6.0,27.0
6,Diana Taurasi,2009,PHO,27.0,31,31,31.5,6.5,14.0,.461,...,7.2,41.0,18.0,42.0,15.0,77.0,12.0,247.0,14.0,13.0
7,Alysha Clark,2010,,22.0,Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),...,,,,,,,,,,
8,DeWanna Bonner,2010,PHO,22.0,32,4,25.4,4.1,8.9,.465,...,-3.5,20.0,13.0,34.0,4.0,55.0,2.0,87.0,15.0,28.0
9,Diana Taurasi,2010,PHO,28.0,31,31,32.2,6.8,16.0,.427,...,3.8,57.0,22.0,47.0,23.0,83.0,9.0,330.0,19.0,11.0


In [89]:
# Confirm the (rows, columns) shape of the merged frame.
df.shape

(1167, 137)

In [90]:
# Preview the first 20 rows of the merged frame.
df.head(20)

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
0,Diana Taurasi,2004,PHO,22.0,34,34,33.2,6.1,14.8,.416,...,1.6,46.0,13.0,38.0,27.0,,,317.0,18.0,18.0
1,Diana Taurasi,2005,PHO,23.0,33,33,33.0,5.3,12.9,.410,...,4.8,61.0,20.0,42.0,22.0,,,342.0,8.0,12.0
2,Diana Taurasi,2006,PHO,24.0,34,34,33.9,8.8,19.4,.452,...,14.0,33.0,16.0,53.0,18.0,71.0,2.0,299.0,17.0,19.0
3,Diana Taurasi,2007,PHO,25.0,32,32,32.0,6.4,14.6,.440,...,2.7,50.0,13.0,49.0,15.0,52.0,6.0,312.0,23.0,10.0
4,Diana Taurasi,2008,PHO,26.0,34,34,31.9,7.6,17.0,.446,...,10.4,44.0,17.0,46.0,20.0,102.0,14.0,276.0,25.0,27.0
5,DeWanna Bonner,2009,PHO,21.0,34,0,21.3,3.7,8.2,.457,...,-5.3,5.0,14.0,34.0,6.0,65.0,3.0,30.0,6.0,27.0
6,Diana Taurasi,2009,PHO,27.0,31,31,31.5,6.5,14.0,.461,...,7.2,41.0,18.0,42.0,15.0,77.0,12.0,247.0,14.0,13.0
7,Alysha Clark,2010,,22.0,Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),...,,,,,,,,,,
8,DeWanna Bonner,2010,PHO,22.0,32,4,25.4,4.1,8.9,.465,...,-3.5,20.0,13.0,34.0,4.0,55.0,2.0,87.0,15.0,28.0
9,Diana Taurasi,2010,PHO,28.0,31,31,32.2,6.8,16.0,.427,...,3.8,57.0,22.0,47.0,23.0,83.0,9.0,330.0,19.0,11.0


In [91]:
# Save player_data.csv and, when running in Colab, download it to the local machine
df.to_csv("player_data.csv", index=False)

try:
    from google.colab import files
except ImportError:
    # Outside Colab there is nothing to download: the CSV written above
    # is already in the current working directory.
    print("Saved player_data.csv to the current working directory.")
else:
    files.download("player_data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the merged player table from the CSV in the cloned repo's data folder.
# This replaces whatever `df` is currently in memory.
import pandas as pd

csv_path = 'LHL-final-final-project/data/player_data.csv'
df = pd.read_csv(csv_path)

# Show the first five rows as a quick sanity check
df.head()

In [92]:
# Count missing values per column
print(df.isna().sum())

Player                     0
Year                       0
Tm                       255
Age                      157
G                          0
                        ... 
pbp_Fouls Drawn_Shoot    100
pbp_Fouls Drawn_Off.     100
pbp_Misc._PGA             98
pbp_Misc._And1            98
pbp_Misc._Blkd            98
Length: 137, dtype: int64


In [93]:
# Label the team of career-total rows as "Career" wherever Tm is missing
is_career_without_team = df["Year"].eq("Career") & df["Tm"].isna()
df.loc[is_career_without_team, "Tm"] = "Career"

In [94]:
# Re-count missing values per column after filling Tm
print(df.isna().sum())

Player                     0
Year                       0
Tm                        98
Age                      157
G                          0
                        ... 
pbp_Fouls Drawn_Shoot    100
pbp_Fouls Drawn_Off.     100
pbp_Misc._PGA             98
pbp_Misc._And1            98
pbp_Misc._Blkd            98
Length: 137, dtype: int64


In [95]:
# Change Career to 0 in Year column
# NOTE(review): 0 acts as a sentinel for the career-total rows, presumably so the
# Year column can later be converted to a numeric dtype — confirm that downstream
# code filters on Year == 0 (or Tm == "Career") before treating Year as a season.
df["Year"] = df["Year"].replace("Career", 0)

In [96]:
# List every distinct player name, one per line, in order of first appearance
unique_players = df["Player"].unique()
for player_name in unique_players:
    print(player_name)

Diana Taurasi
DeWanna Bonner
Alysha Clark
Tina Charles
Courtney Vandersloot
Sydney Colson
Damiris Dantas
Nneka Ogwumike
Tiffany Hayes
Brittney Griner
Layshia Clarendon
Skylar Diggins-Smith
Alyssa Thomas
Astou Ndour-Fall
Chelsea Gray
Kayla McBride
Natasha Howard
Odyssey Sims
Stefanie Dolson
Stephanie Talbot
Betnijah Laney-Hamilton
Cheyenne Parker-Tyus
Dearica Hamby
Elizabeth Williams
Erica Wheeler
Isabelle Harrison
Jewell Loyd
Kayla Thornton
Kiah Stokes
Natasha Cloud
Rebecca Allen
Aerial Powers
Breanna Stewart
Courtney Williams
Jonquel Jones
Kahleah Copper
Moriah Jefferson
Rachel Banham
Temi Fagbenle
Tiffany Mitchell
Allisha Gray
Brionna Jones
Brittney Sykes
Cecilia Zandalasini
Emma Cannon
Kaela Davis
Kelsey Plum
Lindsay Allen
Nia Coffey
Sami Whitcomb
Shatori Walker-Kimbrough
A'ja Wilson
Ariel Atkins
Azura Stevens
Diamond DeShields
Gabby Williams
Jordin Canada
Karlie Samuelson
Kelsey Mitchell
Kia Nurse
Kristy Wallace
Lexie Brown
Mercedes Russell
Monique Billings
Myisha Hines-Allen
Victo

In [97]:
# Count the distinct player names in the merged frame.
print(df["Player"].nunique())

157


In [99]:
# Count rows that are exact duplicates of an earlier row.
print(df.duplicated().sum())

0


In [100]:
# Check not nulls to confirm they are useless
# The "Unnamed: N" names carry the column's position in the scraped table, so a
# hard-coded list raises KeyError as soon as a table gains or loses a column.
# Collect them by name from the frame instead.
unnamed_cols = [col for col in df.columns if "Unnamed" in col]

# Show non-null count for each
df[unnamed_cols].notnull().sum()

Unnamed: 0,0
per_game_Unnamed: 29,98
per_poss_Unnamed: 27,0
advanced_Unnamed: 17,0
shooting_Unnamed: 7_level_1,0
shooting_Unnamed: 14_level_1,0
shooting_Unnamed: 21_level_1,0
shooting_Unnamed: 24_level_1,0
shooting_Unnamed: 27_level_1,0
shooting_Unnamed: 30_level_1,0


In [101]:
# Show the first rows in which the per-game filler column actually holds a value
has_value = df['per_game_Unnamed: 29'].notnull()
df.loc[has_value, ['Player', 'Year', 'Tm', 'per_game_Unnamed: 29']].head(10)


Unnamed: 0,Player,Year,Tm,per_game_Unnamed: 29
7,Alysha Clark,2010,,Did Not Play (waived)
11,Alysha Clark,2011,,Did Not Play (waived)
19,Damiris Dantas,2012,,Did Not Play (Olympics—Brazil)
23,Sydney Colson,2012,,Did Not Play (waived)
29,Damiris Dantas,2013,,Did Not Play (did not appear in WNBA)
35,Sydney Colson,2013,,Did Not Play (did not appear in WNBA)
42,Chelsea Gray,2014,,Did Not Play (injury—right knee)
54,Stephanie Talbot,2014,,Did Not Play (did not appear in WNBA)
55,Sydney Colson,2014,,Did Not Play (did not appear in WNBA)
60,Astou Ndour-Fall,2015,,Did Not Play (did not appear in WNBA)


In [102]:
# Drop rows where 'G' contains the phrase 'Did Not Play'.
# .copy() makes the result an independent frame: later cells assign columns
# into `df`, which on a filtered slice triggers SettingWithCopyWarning.
df = df[~df['G'].astype(str).str.contains('Did Not Play', na=False)].copy()

In [103]:
# Re-check: list any remaining rows where the per-game filler column holds a value
has_value = df['per_game_Unnamed: 29'].notnull()
df.loc[has_value, ['Player', 'Year', 'Tm', 'per_game_Unnamed: 29']].head(10)

Unnamed: 0,Player,Year,Tm,per_game_Unnamed: 29


In [104]:
# Check not nulls to confirm they are useless
# The "Unnamed: N" names carry the column's position in the scraped table, so a
# hard-coded list raises KeyError as soon as a table gains or loses a column.
# Collect them by name from the frame instead.
unnamed_cols = [col for col in df.columns if "Unnamed" in col]

# Show non-null count for each
df[unnamed_cols].notnull().sum()

Unnamed: 0,0
per_game_Unnamed: 29,0
per_poss_Unnamed: 27,0
advanced_Unnamed: 17,0
shooting_Unnamed: 7_level_1,0
shooting_Unnamed: 14_level_1,0
shooting_Unnamed: 21_level_1,0
shooting_Unnamed: 24_level_1,0
shooting_Unnamed: 27_level_1,0
shooting_Unnamed: 30_level_1,0


In [105]:
# Drop all Unnamed columns now that they're confirmed empty.
# The columns are found by name rather than by a hard-coded list (their
# "Unnamed: N" suffix is positional), and only those that are entirely null
# are removed, so a filler column that still holds data is never discarded.
empty_unnamed_cols = [
    col for col in df.columns
    if "Unnamed" in col and df[col].isna().all()
]
df = df.drop(columns=empty_unnamed_cols)

In [106]:
# Print every column name with its dtype, one per line
for column_name, column_dtype in df.dtypes.items():
    print(f"{column_name}: {column_dtype}")

Player: object
Year: object
Tm: object
Age: float64
G: object
GS: object
per_game_MP: object
per_game_FG: object
per_game_FGA: object
per_game_FG%: object
per_game_3P: object
per_game_3PA: object
per_game_3P%: object
per_game_2P: object
per_game_2PA: object
per_game_2P%: object
per_game_eFG%: object
per_game_FT: object
per_game_FTA: object
per_game_FT%: object
per_game_ORB: object
per_game_DRB: object
per_game_TRB: object
per_game_AST: object
per_game_STL: object
per_game_BLK: object
per_game_TOV: object
per_game_PF: object
per_game_PTS: object
per_game_Awards: object
per_minute_MP: float64
per_minute_FG: float64
per_minute_FGA: float64
per_minute_FG%: float64
per_minute_3P: float64
per_minute_3PA: float64
per_minute_3P%: float64
per_minute_2P: float64
per_minute_2PA: float64
per_minute_2P%: float64
per_minute_FT: float64
per_minute_FTA: float64
per_minute_FT%: float64
per_minute_ORB: float64
per_minute_DRB: float64
per_minute_TRB: float64
per_minute_AST: float64
per_minute_STL: float6

In [107]:
# Tally each distinct awards string; dropna=False also counts the missing entries.
df["per_game_Awards"].value_counts(dropna=False)

Unnamed: 0_level_0,count
per_game_Awards,Unnamed: 1_level_1
,793
AS,43
DEF2,9
6POY-4,8
ROY-2,7
...,...
"MVP-10,DEF2,AS,WNBA2",1
"MIP-5,6POY-5",1
"MVP-2,WNBA1,DEF1,AS,DPOY-1",1
"MVP-7,DEF2,AS,WNBA2",1


In [108]:
# Text columns that must stay as strings
exclude = {"Player", "Tm", "per_game_Awards"}

# Every other object-typed column is parsed as numbers; values that cannot
# be parsed become NaN (errors="coerce")
text_cols = [
    col for col in df.columns
    if df[col].dtype == "object" and col not in exclude
]
for col in text_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Report the columns that did not end up as float64
for col, dtype in df.dtypes.items():
    if dtype == "float64":
        continue
    print(f"{col}: {dtype}")

Player: object
Year: int64
Tm: object
per_game_Awards: object


In [109]:
# Count missing values per column after the numeric conversion
print(df.isna().sum())

Player                     0
Year                       0
Tm                         0
Age                      157
G                          0
                        ... 
pbp_Fouls Drawn_Shoot      2
pbp_Fouls Drawn_Off.       2
pbp_Misc._PGA              0
pbp_Misc._And1             0
pbp_Misc._Blkd             0
Length: 128, dtype: int64


In [110]:
# Find (Player, Year) pairs that occupy more than one row
season_counts = df.groupby(["Player", "Year"]).size().reset_index(name="count")
season_counts[season_counts["count"] > 1]

Unnamed: 0,Player,Year,count
18,Aerial Powers,2018,3
134,Bridget Carleton,2019,3
181,Celeste Taylor,2024,4
227,Courtney Williams,2016,3
239,Crystal Dangerfield,2022,3
241,Crystal Dangerfield,2024,3
244,Damiris Dantas,2015,3
253,Dana Evans,2021,3
286,Destanni Henderson,2023,3
354,Emma Cannon,2021,4


In [111]:
# Spot check one player-season: every row for Queen Egbo in 2024
df.loc[df["Player"].eq("Queen Egbo") & df["Year"].eq(2024)]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
970,Queen Egbo,2024,CON,24.0,3.0,0.0,2.3,0.0,1.7,0.0,...,-9.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
971,Queen Egbo,2024,LAS,24.0,2.0,0.0,2.0,0.5,0.5,1.0,...,10.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
972,Queen Egbo,2024,LVA,24.0,3.0,0.0,6.0,1.3,2.0,0.667,...,-13.3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
973,Queen Egbo,2024,TOT,24.0,8.0,0.0,3.6,0.6,1.5,0.417,...,-5.3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [112]:
# Spot check one player-season: every row for Moriah Jefferson in 2024
df.loc[df["Player"].eq("Moriah Jefferson") & df["Year"].eq(2024)]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
948,Moriah Jefferson,2024,CHI,30.0,14.0,0.0,8.9,0.2,1.9,0.115,...,-22.5,5.0,1.0,4.0,0.0,2.0,5.0,56.0,0.0,1.0
949,Moriah Jefferson,2024,CON,30.0,9.0,0.0,6.8,0.7,1.8,0.375,...,-19.2,1.0,0.0,3.0,1.0,1.0,0.0,12.0,0.0,0.0
950,Moriah Jefferson,2024,TOT,30.0,23.0,0.0,8.0,0.4,1.8,0.214,...,-23.8,6.0,1.0,7.0,1.0,3.0,5.0,68.0,0.0,1.0


In [113]:
# Spot check one player-season: every row for Moriah Jefferson in 2022
df.loc[df["Player"].eq("Moriah Jefferson") & df["Year"].eq(2022)]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
643,Moriah Jefferson,2022,DAL,28.0,1.0,0.0,4.0,0.0,0.0,,...,-122.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
644,Moriah Jefferson,2022,MIN,28.0,30.0,30.0,26.8,3.9,8.6,0.452,...,-1.6,36.0,18.0,38.0,3.0,25.0,9.0,340.0,2.0,13.0
645,Moriah Jefferson,2022,TOT,28.0,31.0,30.0,26.1,3.8,8.4,0.452,...,-3.3,36.0,18.0,38.0,3.0,25.0,9.0,340.0,2.0,13.0


In [114]:
# Spot check one player-season: every row for Celeste Taylor in 2024
df.loc[df["Player"].eq("Celeste Taylor") & df["Year"].eq(2024)]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
854,Celeste Taylor,2024,CON,23.0,2.0,0.0,5.5,0.5,0.5,1.0,...,0.1,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
855,Celeste Taylor,2024,IND,23.0,5.0,0.0,3.2,0.2,0.2,1.0,...,11.2,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0
856,Celeste Taylor,2024,PHO,23.0,15.0,4.0,20.1,1.3,4.0,0.317,...,6.5,17.0,2.0,8.0,1.0,7.0,6.0,62.0,2.0,5.0
857,Celeste Taylor,2024,TOT,23.0,22.0,4.0,14.9,1.0,2.8,0.339,...,1.2,17.0,2.0,10.0,1.0,8.0,7.0,64.0,2.0,5.0


In [115]:
# Spot check one player-season: every row for Natasha Mack in 2021
df.loc[df["Player"].eq("Natasha Mack") & df["Year"].eq(2021)]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
527,Natasha Mack,2021,CHI,23.0,3.0,0.0,5.3,0.7,1.0,0.667,...,-17.8,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
528,Natasha Mack,2021,MIN,23.0,1.0,0.0,2.0,0.0,0.0,,...,44.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
529,Natasha Mack,2021,TOT,23.0,4.0,0.0,4.5,0.5,0.8,0.667,...,-15.1,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0


In [116]:
# Find (Player, Year, Tm) combinations that occupy more than one row
team_season_counts = df.groupby(["Player", "Year", "Tm"]).size().reset_index(name="count")
team_season_counts[team_season_counts["count"] > 1]

Unnamed: 0,Player,Year,Tm,count


In [117]:
# Count missing values per column
print(df.isna().sum())

Player                     0
Year                       0
Tm                         0
Age                      157
G                          0
                        ... 
pbp_Fouls Drawn_Shoot      2
pbp_Fouls Drawn_Off.       2
pbp_Misc._PGA              0
pbp_Misc._And1             0
pbp_Misc._Blkd             0
Length: 128, dtype: int64


In [118]:
# Inspect the rows that have no Age recorded
df.loc[df["Age"].isna()]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
1010,A'ja Wilson,0,Career,,227.0,226.0,31.2,7.7,15.6,0.497,...,5.5,142.0,109.0,288.0,61.0,720.0,18.0,1130.0,170.0,285.0
1011,Aaliyah Edwards,0,Career,,34.0,17.0,21.8,3.0,6.2,0.490,...,-7.0,18.0,13.0,42.0,15.0,33.0,10.0,122.0,4.0,23.0
1012,Aari McDonald,0,Career,,116.0,29.0,21.6,2.8,7.3,0.389,...,2.4,119.0,51.0,80.0,11.0,88.0,63.0,758.0,21.0,32.0
1013,Aerial Powers,0,Career,,190.0,56.0,19.6,3.4,8.6,0.397,...,-2.2,133.0,106.0,123.0,39.0,274.0,42.0,683.0,62.0,87.0
1014,Alanna Smith,0,Career,,141.0,75.0,19.2,2.5,5.6,0.442,...,7.8,75.0,41.0,145.0,39.0,85.0,27.0,564.0,27.0,41.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162,Tyasha Harris,0,Career,,167.0,49.0,19.6,2.4,5.9,0.410,...,-1.8,131.0,30.0,117.0,8.0,84.0,20.0,995.0,24.0,49.0
1163,Veronica Burton,0,Career,,107.0,20.0,14.0,0.7,2.2,0.326,...,2.0,49.0,10.0,68.0,4.0,32.0,41.0,493.0,6.0,18.0
1164,Victaria Saxton,0,Career,,24.0,0.0,3.2,0.4,1.1,0.333,...,-10.9,1.0,3.0,5.0,3.0,5.0,1.0,0.0,1.0,1.0
1165,Victoria Vivians,0,Career,,179.0,93.0,20.8,2.4,6.8,0.357,...,-0.3,104.0,33.0,155.0,18.0,76.0,18.0,571.0,15.0,57.0


In [119]:
# Which Tm values do the null-Age rows belong to?
df.loc[df["Age"].isna(), "Tm"].unique()

array(['Career'], dtype=object)

In [120]:
# Fill a null Age (the "Career" rows) with the player's maximum recorded age.
# The per-player maximum is computed with the built-in "max" aggregation and
# broadcast back to every row, then used only where Age is missing.
max_age_per_player = df.groupby("Player")["Age"].transform("max")
df["Age"] = df["Age"].fillna(max_age_per_player)

In [121]:
# Null check again: Age should now report 0 missing values
null_counts_per_column = df.isna().sum()
print(null_counts_per_column)

Player                   0
Year                     0
Tm                       0
Age                      0
G                        0
                        ..
pbp_Fouls Drawn_Shoot    2
pbp_Fouls Drawn_Off.     2
pbp_Misc._PGA            0
pbp_Misc._And1           0
pbp_Misc._Blkd           0
Length: 128, dtype: int64


In [122]:
# Confirm the Age fill on one player: the Career row should carry the
# highest Age seen across that player's season rows.
df.loc[df["Player"].eq("A'ja Wilson")]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
186,A'ja Wilson,2018,LVA,21.0,33.0,33.0,30.6,7.4,16.1,0.462,...,3.5,15.0,16.0,48.0,8.0,118.0,6.0,160.0,26.0,50.0
258,A'ja Wilson,2019,LVA,22.0,26.0,25.0,28.5,6.1,12.7,0.479,...,-0.5,23.0,18.0,32.0,8.0,66.0,0.0,102.0,13.0,35.0
345,A'ja Wilson,2020,LVA,23.0,22.0,22.0,31.7,7.5,15.7,0.48,...,6.5,12.0,12.0,37.0,6.0,78.0,2.0,97.0,19.0,22.0
437,A'ja Wilson,2021,LVA,24.0,32.0,32.0,31.9,6.5,14.6,0.444,...,-11.2,26.0,8.0,33.0,5.0,97.0,1.0,213.0,21.0,45.0
555,A'ja Wilson,2022,LVA,25.0,36.0,36.0,30.0,7.2,14.4,0.501,...,24.4,22.0,19.0,43.0,13.0,97.0,6.0,178.0,27.0,39.0
688,A'ja Wilson,2023,LVA,26.0,40.0,40.0,30.7,8.4,15.0,0.557,...,16.3,24.0,19.0,53.0,12.0,133.0,1.0,157.0,34.0,46.0
827,A'ja Wilson,2024,LVA,27.0,38.0,38.0,34.4,10.1,19.6,0.518,...,-2.7,20.0,17.0,42.0,9.0,131.0,2.0,223.0,30.0,48.0
1010,A'ja Wilson,0,Career,27.0,227.0,226.0,31.2,7.7,15.6,0.497,...,5.5,142.0,109.0,288.0,61.0,720.0,18.0,1130.0,170.0,285.0


In [123]:
# List the columns that still contain nulls, largest count first
nulls = df.isna().sum()
nulls.loc[nulls.gt(0)].sort_values(ascending=False)

Unnamed: 0,0
per_game_Awards,793
shooting_Corner 3s_3P%,227
shooting_% of FG Ast'd_3P,184
per_minute_3P%,101
per_game_3P%,101
...,...
advanced_ORB%,1
advanced_TRB%,1
advanced_AST%,1
pbp_+/- Per 100 Poss._On-Off,1


In [124]:
# Change per_game_Awards nulls to the string 'None' (i.e. no award that season).
# NOTE(review): in recent pandas versions the literal text "None" is one of
# read_csv's default NA markers, so after the to_csv export these values may be
# read back as NaN. Confirm, and if so either reload with keep_default_na=False
# or use a different sentinel.
df["per_game_Awards"] = df["per_game_Awards"].fillna("None")

In [125]:
# Show the row(s) where "pbp_+/- Per 100 Poss._OnCourt" is null
df.loc[df["pbp_+/- Per 100 Poss._OnCourt"].isna()]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
375,Emma Cannon,2020,LVA,31.0,1.0,0.0,0.0,0.0,0.0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
# Rows where the on-court plus/minus is missing (a single 0-minute appearance
# when this was written). They are selected by condition instead of by a
# hard-coded index label, so the cell keeps targeting the right rows if the
# upstream data gains, loses, or reorders rows.
missing_on_court = df["pbp_+/- Per 100 Poss._OnCourt"].isna()
idx = df.index[missing_on_court].tolist()

# Fill every null in just those rows with 0. The guard makes the cell a no-op
# when nothing matches (for example when it is re-run after the fill).
if idx:
    df.loc[idx] = df.loc[idx].fillna(0)

In [127]:
# List the columns that still contain nulls, largest count first
nulls = df.isna().sum()
nulls.loc[nulls.gt(0)].sort_values(ascending=False)

Unnamed: 0,0
shooting_Corner 3s_3P%,226
shooting_% of FG Ast'd_3P,183
per_poss_3P%,100
per_minute_3P%,100
per_game_3P%,100
shooting_Corner 3s_%3PA,100
shooting_FG% by Distance_3P,100
shooting_FG% by Distance_16-3P,76
shooting_FG% by Distance_10-16,64
per_poss_FT%,47


In [128]:
# Write player_data.csv and trigger a browser download of it (Colab only).
# NOTE(review): this is an intermediate export. Nulls remain at this point and
# the same file name is written again further down the notebook.
from google.colab import files

export_name = "player_data.csv"
df.to_csv(export_name, index=False)
files.download(export_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [129]:
# Show the rows where 'pbp_Fouls Drawn_Off.' is null
df.loc[df["pbp_Fouls Drawn_Off."].isna()]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
0,Diana Taurasi,2004,PHO,22.0,34.0,34.0,33.2,6.1,14.8,0.416,...,1.6,46.0,13.0,38.0,27.0,,,317.0,18.0,18.0
1,Diana Taurasi,2005,PHO,23.0,33.0,33.0,33.0,5.3,12.9,0.41,...,4.8,61.0,20.0,42.0,22.0,,,342.0,8.0,12.0


In [130]:
# Replace nulls with 0 in both "Fouls Drawn" play-by-play columns
for fouls_drawn_col in ("pbp_Fouls Drawn_Shoot", "pbp_Fouls Drawn_Off."):
    df[fouls_drawn_col] = df[fouls_drawn_col].fillna(0)

In [131]:
# Show the rows where 'per_poss_ORtg' is null
df.loc[df["per_poss_ORtg"].isna()]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
385,Kaela Davis,2020,ATL,25.0,2.0,0.0,1.0,0.0,0.0,,...,-61.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
519,Mikiah Herbert Harrigan,2021,SEA,22.0,1.0,0.0,1.0,0.0,0.0,,...,-5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
528,Natasha Mack,2021,MIN,23.0,1.0,0.0,2.0,0.0,0.0,,...,44.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
622,Kiana Williams,2022,CON,23.0,1.0,0.0,3.0,0.0,0.0,,...,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643,Moriah Jefferson,2022,DAL,28.0,1.0,0.0,4.0,0.0,0.0,,...,-122.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
# Set 'per_poss_ORtg' to 0 wherever it is null
df.loc[df["per_poss_ORtg"].isna(), "per_poss_ORtg"] = 0

In [133]:
# Show the rows where 'advanced_TOV%' is null
df.loc[df["advanced_TOV%"].isna()]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
385,Kaela Davis,2020,ATL,25.0,2.0,0.0,1.0,0.0,0.0,,...,-61.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
519,Mikiah Herbert Harrigan,2021,SEA,22.0,1.0,0.0,1.0,0.0,0.0,,...,-5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
528,Natasha Mack,2021,MIN,23.0,1.0,0.0,2.0,0.0,0.0,,...,44.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
622,Kiana Williams,2022,CON,23.0,1.0,0.0,3.0,0.0,0.0,,...,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643,Moriah Jefferson,2022,DAL,28.0,1.0,0.0,4.0,0.0,0.0,,...,-122.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
# Set 'advanced_TOV%' to 0 wherever it is null
df.loc[df["advanced_TOV%"].isna(), "advanced_TOV%"] = 0

In [135]:
# Show the rows where 'shooting_% of FGA by Distance_10-16' is null
df.loc[df["shooting_% of FGA by Distance_10-16"].isna()]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
385,Kaela Davis,2020,ATL,25.0,2.0,0.0,1.0,0.0,0.0,,...,-61.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
486,Joyner Holmes,2021,NYL,23.0,1.0,0.0,5.0,0.0,0.0,,...,-24.7,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
508,Layshia Clarendon,2021,NYL,30.0,1.0,0.0,3.0,0.0,0.0,,...,-81.9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
519,Mikiah Herbert Harrigan,2021,SEA,22.0,1.0,0.0,1.0,0.0,0.0,,...,-5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
528,Natasha Mack,2021,MIN,23.0,1.0,0.0,2.0,0.0,0.0,,...,44.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
539,Shatori Walker-Kimbrough,2021,CON,26.0,1.0,0.0,4.0,0.0,0.0,,...,-46.9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
622,Kiana Williams,2022,CON,23.0,1.0,0.0,3.0,0.0,0.0,,...,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643,Moriah Jefferson,2022,DAL,28.0,1.0,0.0,4.0,0.0,0.0,,...,-122.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
999,Taylor Soule,2024,MIN,24.0,2.0,0.0,1.5,0.0,0.0,,...,-175.6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
# Rows with no shot-distance breakdown: 'shooting_% of FGA by Distance_10-16'
# is null. The labels are derived from the data instead of being pasted in by
# hand, so they stay correct if the upstream rows change.
target_indices = df.index[df["shooting_% of FGA by Distance_10-16"].isna()].tolist()

# Subset the DataFrame to those rows
subset = df.loc[target_indices]

# Columns that are null in at least one of those rows
null_columns = subset.columns[subset.isna().any()]

# Identifying columns are always shown first
id_columns = ["Player", "Year", "Tm", "G"]

# Identifying columns followed by the null columns, without repeats
columns_to_show = id_columns + [col for col in null_columns if col not in id_columns]

# Show the relevant data
subset[columns_to_show]

Unnamed: 0,Player,Year,Tm,G,per_game_FG%,per_game_3P%,per_game_2P%,per_game_eFG%,per_game_FT%,per_minute_FG%,...,shooting_FG% by Distance_0-3,shooting_FG% by Distance_3-10,shooting_FG% by Distance_10-16,shooting_FG% by Distance_16-3P,shooting_FG% by Distance_3P,shooting_% of FG Ast'd_2P,shooting_% of FG Ast'd_3P,shooting_Dunks_%FGA,shooting_Corner 3s_%3PA,shooting_Corner 3s_3P%
385,Kaela Davis,2020,ATL,2.0,,,,,,,...,,,,,,,,,,
486,Joyner Holmes,2021,NYL,1.0,,,,,,,...,,,,,,,,,,
508,Layshia Clarendon,2021,NYL,1.0,,,,,,,...,,,,,,,,,,
519,Mikiah Herbert Harrigan,2021,SEA,1.0,,,,,,,...,,,,,,,,,,
528,Natasha Mack,2021,MIN,1.0,,,,,,,...,,,,,,,,,,
539,Shatori Walker-Kimbrough,2021,CON,1.0,,,,,,,...,,,,,,,,,,
622,Kiana Williams,2022,CON,1.0,,,,,,,...,,,,,,,,,,
643,Moriah Jefferson,2022,DAL,1.0,,,,,,,...,,,,,,,,,,
999,Taylor Soule,2024,MIN,2.0,,,,,,,...,,,,,,,,,,


In [139]:
# Rows to clean: those with no shot-distance breakdown, i.e. where
# 'shooting_% of FGA by Distance_10-16' is null. Selecting by condition
# replaces a hand-copied list of index labels that would go stale if the
# upstream rows changed.
target_indices = df.index[df["shooting_% of FGA by Distance_10-16"].isna()].tolist()

# Fill every null in just those rows with 0. Skipped when nothing matches,
# so re-running the cell is harmless.
if target_indices:
    df.loc[target_indices] = df.loc[target_indices].fillna(0)

In [140]:
# List the columns that still contain nulls, largest count first
nulls = df.isna().sum()
nulls.loc[nulls.gt(0)].sort_values(ascending=False)

Unnamed: 0,0
shooting_Corner 3s_3P%,217
shooting_% of FG Ast'd_3P,174
per_game_3P%,91
shooting_Corner 3s_%3PA,91
shooting_FG% by Distance_3P,91
per_minute_3P%,91
per_poss_3P%,91
shooting_FG% by Distance_16-3P,67
shooting_FG% by Distance_10-16,55
per_poss_FT%,38


In [141]:
# Show the rows where 'per_poss_2P%' is null
df.loc[df["per_poss_2P%"].isna()]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
272,Bridget Carleton,2019,MIN,22.0,4.0,0.0,2.8,0.3,0.5,0.5,...,-62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
613,Karlie Samuelson,2022,PHO,27.0,1.0,0.0,10.0,1.0,3.0,0.333,...,-5.7,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
850,Caitlin Bickle,2024,CON,24.0,8.0,0.0,1.9,0.0,0.4,0.0,...,-57.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1033,Caitlin Bickle,0,Career,24.0,8.0,0.0,1.9,0.0,0.4,0.0,...,-57.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
# Rows to clean: those where 'per_poss_2P%' is null. Selecting by condition
# replaces a hand-copied list of index labels that would go stale if the
# upstream rows changed.
target_indices = df.index[df["per_poss_2P%"].isna()].tolist()

# Fill every null in just those rows with 0. Skipped when nothing matches,
# so re-running the cell is harmless.
if target_indices:
    df.loc[target_indices] = df.loc[target_indices].fillna(0)

In [143]:
# Show the rows where "shooting_% of FG Ast'd_2P" is null
df.loc[df["shooting_% of FG Ast'd_2P"].isna()]

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
271,Bridget Carleton,2019,CON,22.0,4.0,0.0,7.3,0.0,1.5,0.0,...,-18.6,0.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0
273,Bridget Carleton,2019,TOT,22.0,8.0,0.0,5.0,0.1,1.0,0.125,...,-29.0,0.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0
300,Karlie Samuelson,2019,DAL,24.0,4.0,0.0,12.0,0.5,1.8,0.286,...,5.4,0.0,0.0,2.0,0.0,0.0,2.0,5.0,0.0,0.0
301,Karlie Samuelson,2019,LAS,24.0,3.0,0.0,12.0,0.3,2.3,0.143,...,5.7,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0
302,Karlie Samuelson,2019,TOT,24.0,7.0,0.0,12.0,0.4,2.0,0.214,...,4.3,0.0,0.0,2.0,0.0,0.0,2.0,11.0,0.0,1.0
464,Dana Evans,2021,DAL,22.0,6.0,0.0,4.0,0.2,1.0,0.167,...,13.0,2.0,1.0,1.0,1.0,1.0,0.0,7.0,0.0,1.0
504,Kiana Williams,2021,SEA,22.0,10.0,0.0,3.5,0.1,0.7,0.143,...,-48.3,2.0,0.0,2.0,0.0,1.0,0.0,4.0,1.0,0.0
562,Amy Atwell,2022,LAS,24.0,4.0,1.0,8.0,0.3,2.3,0.111,...,-14.4,1.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0
607,Kaela Davis,2022,CHI,27.0,1.0,0.0,10.0,0.0,1.0,0.0,...,-11.3,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,1.0
663,Rae Burrell,2022,LAS,22.0,3.0,1.0,14.7,0.3,3.0,0.111,...,5.5,1.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0


In [145]:
# Identifying columns are always shown first
id_columns = ["Player", "Year", "Tm", "G"]

# Hand-picked index labels of the rows to inspect.
# NOTE(review): these look like the rows where "shooting_% of FG Ast'd_2P" is
# null; confirm, and consider deriving them from that mask instead.
target_indices = [
    271, 273, 300, 301, 302, 464, 504, 562, 607, 663,
    771, 879, 895, 897, 945, 970, 1057, 1073, 1075
]

# Pull out those rows
subset = df.loc[target_indices]

# Columns that are null in at least one of those rows
null_columns = subset.columns[subset.isna().any(axis=0)]

# Identifying columns followed by the null columns, without repeats
columns_to_show = id_columns + [col for col in null_columns if col not in id_columns]

# Show the relevant data
subset.loc[:, columns_to_show]

Unnamed: 0,Player,Year,Tm,G,per_game_3P%,per_game_FT%,per_minute_3P%,per_minute_FT%,per_poss_3P%,per_poss_FT%,shooting_FG% by Distance_0-3,shooting_FG% by Distance_3-10,shooting_FG% by Distance_10-16,shooting_FG% by Distance_16-3P,shooting_FG% by Distance_3P,shooting_% of FG Ast'd_2P,shooting_% of FG Ast'd_3P,shooting_Corner 3s_%3PA,shooting_Corner 3s_3P%
271,Bridget Carleton,2019,CON,4.0,0.0,,0.0,,0.0,,,0.0,0.0,0.0,0.0,,,1.0,0.0
273,Bridget Carleton,2019,TOT,8.0,0.25,,0.25,,0.25,,,0.0,0.0,0.0,0.25,,1.0,0.75,0.333
300,Karlie Samuelson,2019,DAL,4.0,0.333,,0.333,,0.333,,,0.0,,,0.333,,1.0,0.333,0.5
301,Karlie Samuelson,2019,LAS,3.0,0.167,,0.167,,0.167,,,,0.0,,0.167,,1.0,0.167,0.0
302,Karlie Samuelson,2019,TOT,7.0,0.25,,0.25,,0.25,,,0.0,0.0,,0.25,,1.0,0.25,0.333
464,Dana Evans,2021,DAL,6.0,0.333,1.0,0.333,1.0,0.333,1.0,,,0.0,0.0,0.333,,1.0,0.333,1.0
504,Kiana Williams,2021,SEA,10.0,0.167,1.0,0.167,1.0,0.167,1.0,,0.0,,,0.167,,1.0,0.0,
562,Amy Atwell,2022,LAS,4.0,0.167,,0.167,,0.167,,,0.0,0.0,,0.167,,1.0,0.0,
607,Kaela Davis,2022,CHI,1.0,,,,,,,0.0,,,,,,,,
663,Rae Burrell,2022,LAS,3.0,0.167,1.0,0.167,1.0,0.167,1.0,0.0,0.0,,0.0,0.167,,1.0,0.167,0.0


In [146]:
# Hand-picked index labels of the rows to clean.
# NOTE(review): these look like the rows where "shooting_% of FG Ast'd_2P" is
# null; confirm, and consider deriving them from that mask instead.
target_indices = [
    271, 273, 300, 301, 302, 464, 504, 562, 607, 663,
    771, 879, 895, 897, 945, 970, 1057, 1073, 1075
]

# Replace every null in just those rows with 0
rows_filled = df.loc[target_indices].fillna(0)
df.loc[target_indices] = rows_filled

In [147]:
# List the columns that still contain nulls, largest count first
nulls = df.isna().sum()
nulls.loc[nulls.gt(0)].sort_values(ascending=False)

Unnamed: 0,0
shooting_Corner 3s_3P%,202
shooting_% of FG Ast'd_3P,161
per_poss_3P%,84
shooting_Corner 3s_%3PA,84
per_game_3P%,84
per_minute_3P%,84
shooting_FG% by Distance_3P,84
shooting_FG% by Distance_16-3P,50
shooting_FG% by Distance_10-16,39
per_game_FT%,24


In [148]:
# Percentage / shot-split columns whose remaining nulls are set to 0
columns_to_fill = [
    "shooting_Corner 3s_3P%",
    "shooting_% of FG Ast'd_3P",
    "per_poss_3P%",
    "shooting_Corner 3s_%3PA",
    "per_game_3P%",
    "per_minute_3P%",
    "shooting_FG% by Distance_3P",
    "shooting_FG% by Distance_16-3P",
    "shooting_FG% by Distance_10-16",
    "per_game_FT%",
    "per_poss_FT%",
    "per_minute_FT%",
    "shooting_FG% by Distance_0-3",
    "shooting_FG% by Distance_3-10",
]

# Fill NaNs with 0, one listed column at a time
for column_name in columns_to_fill:
    df[column_name] = df[column_name].fillna(0)

In [149]:
# Final check: no column should report any nulls now (an empty result is expected)
nulls = df.isna().sum()
nulls.loc[nulls.gt(0)].sort_values(ascending=False)

Unnamed: 0,0


In [150]:
# Write the cleaned table to player_data.csv and trigger a browser download
# of it (Colab only).
from google.colab import files

export_name = "player_data.csv"
df.to_csv(export_name, index=False)
files.download(export_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [151]:
# Distinct values in the Year column (0 is the placeholder used on "Career" rows)
pd.unique(df["Year"])

array([2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024,    0])

In [152]:
# Distinct values in the Tm column (team codes plus the "TOT" and "Career" markers)
pd.unique(df["Tm"])

array(['PHO', 'CON', 'CHI', 'NYL', 'SEA', 'LAS', 'ATL', 'IND', 'TUL',
       'SAS', 'MIN', 'WAS', 'TOT', 'DAL', 'LVA', 'Career'], dtype=object)

Unnamed: 0,0
Player,object
Year,int64
Tm,object
Age,float64
G,float64
...,...
pbp_Fouls Drawn_Shoot,float64
pbp_Fouls Drawn_Off.,float64
pbp_Misc._PGA,float64
pbp_Misc._And1,float64
