In [1]:
# Clone your GitHub repo (you’ll be prompted to authorize if it's private)
!git clone https://github.com/colterwood/LHL-final-final-project.git

Cloning into 'LHL-final-final-project'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 41 (delta 14), reused 20 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (41/41), 172.58 KiB | 3.52 MiB/s, done.
Resolving deltas: 100% (14/14), done.


In [99]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, Comment
import requests
from io import StringIO
import string
import time
import re
from functools import reduce

In [108]:
# Base URL pattern
base_url = "https://www.basketball-reference.com/wnba/players/{}/"
headers = {"User-Agent": "Mozilla/5.0"}

# Store (name, link) for all 2024 players
players = []

# Loop through a–z player index pages
for letter in string.ascii_lowercase:
    url = base_url.format(letter)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    player_paragraphs = soup.find_all("p")

    # Filter for players with '2024' in their years active
    for p in player_paragraphs:
        if "2024" in p.text:
            a_tag = p.find("a")
            name = a_tag.text.strip()
            link = a_tag["href"]
            players.append((name, link))

# Print result
print(f"Found {len(players)} players with 2024:")
for name, link in players:
    print(name, link)

Found 157 players with 2024:
Lindsay Allen /wnba/players/a/allenli01w.html
Rebecca Allen /wnba/players/a/allenre01w.html
Laeticia Amihere /wnba/players/a/amihela01w.html
Ariel Atkins /wnba/players/a/atkinar01w.html
Amy Atwell /wnba/players/a/atwelam01w.html
Shakira Austin /wnba/players/a/austish01w.html
Rachel Banham /wnba/players/b/banhara01w.html
Kierstan Bell /wnba/players/b/bellki01w.html
Grace Berger /wnba/players/b/bergegr01w.html
Morgan Bertsch /wnba/players/b/bertsmo01w.html
Caitlin Bickle /wnba/players/b/bicklca01w.html
Monique Billings /wnba/players/b/billimo01w.html
DeWanna Bonner /wnba/players/b/bonnede01w.html
Aliyah Boston /wnba/players/b/bostoal01w.html
Cameron Brink /wnba/players/b/brinkca01w.html
Jaelyn Brown /wnba/players/b/brownja06w.html
Kalani Brown /wnba/players/b/brownka01w.html
Lexie Brown /wnba/players/b/brownle02w.html
Jakia Brown-Turner /wnba/players/b/brownja07w.html
Kennedy Burke /wnba/players/b/burkeke01w.html
Rae Burrell /wnba/players/b/burrera01w.html
Ve

In [92]:
name, link = players[0]  # or any other index
table_ids = ["per_game", "per_minute", "per_poss", "advanced", "shooting", "pbp"]
no_prefix_cols = {"Player", "Year", "Tm", "Age", "G", "GS"}
url = "https://www.basketball-reference.com" + link
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

def load_table(table_id, soup, comments, player_name):
    tag = soup.find("table", {"id": table_id + "0"})
    if tag is None:
        for c in comments:
            if f'id="{table_id}0"' in c:
                tag = BeautifulSoup(c, "html.parser").find("table", {"id": table_id + "0"})
                break
    if tag is None:
        print(f"Table not found: {table_id}")
        return None

    if table_id in ["shooting", "pbp"]:
        df = pd.read_html(StringIO(str(tag)), header=[0, 1])[0]
        df.columns = [f"{a}_{b}" if not a.startswith("Unnamed") else b for a, b in df.columns]
    else:
        df = pd.read_html(StringIO(str(tag)), header=0)[0]

    df.columns = [col if col in no_prefix_cols else f"{table_id}_{col}" for col in df.columns]
    df.insert(0, "Player", player_name)
    return df

# Load each table into its own independent DataFrame
per_game = load_table("per_game", soup, comments, name)
per_minute = load_table("per_minute", soup, comments, name)
per_poss = load_table("per_poss", soup, comments, name)
advanced = load_table("advanced", soup, comments, name)
shooting = load_table("shooting", soup, comments, name)
pbp = load_table("pbp", soup, comments, name)

In [93]:
per_game

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,per_game_DRB,per_game_TRB,per_game_AST,per_game_STL,per_game_BLK,per_game_TOV,per_game_PF,per_game_PTS,per_game_Awards,per_game_Unnamed: 29
0,Lindsay Allen,2017,NYL,22.0,28,0,13.4,0.8,2.2,.371,...,1.1,1.5,2.2,0.6,0.0,0.7,0.9,1.9,,
1,Lindsay Allen,2018,LVA,23.0,24,6,14.9,1.2,3.0,.384,...,1.1,1.3,2.9,0.6,0.0,0.9,0.7,3.1,,
2,Lindsay Allen,2019,,23.0,Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),...,Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury)
3,Lindsay Allen,2020,LVA,25.0,21,21,13.5,1.3,3.1,.424,...,0.8,1.1,2.4,0.3,0.0,0.8,1.0,3.3,,
4,Lindsay Allen,2021,IND,26.0,32,8,17.8,2.0,4.8,.428,...,1.2,1.5,3.0,0.5,0.1,1.0,1.1,5.4,,
5,Lindsay Allen,2022,MIN,27.0,9,0,14.9,2.2,4.2,.526,...,1.6,1.6,3.4,0.2,0.0,0.8,1.2,6.7,,
6,Lindsay Allen,2023,MIN,28.0,29,20,24.1,2.2,5.6,.399,...,1.9,2.4,4.5,0.6,0.1,1.3,2.1,6.2,,
7,Lindsay Allen,2024,CHI,29.0,40,28,23.8,2.6,5.6,.466,...,1.7,2.0,3.9,0.8,0.2,1.6,1.7,6.6,,
8,Lindsay Allen,Career,,,183,83,18.4,1.8,4.2,.429,...,1.4,1.7,3.3,0.6,0.1,1.1,1.3,4.8,,
9,Lindsay Allen,,,,,,,,,,...,,,,,,,,,,


In [94]:
per_minute

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_minute_MP,per_minute_FG,per_minute_FGA,per_minute_FG%,...,per_minute_FT%,per_minute_ORB,per_minute_DRB,per_minute_TRB,per_minute_AST,per_minute_STL,per_minute_BLK,per_minute_TOV,per_minute_PF,per_minute_PTS
0,Lindsay Allen,2017,NYL,22.0,28.0,0.0,376.0,2.2,5.9,0.371,...,0.7,1.0,3.1,4.0,5.9,1.5,0.1,1.8,2.3,5.1
1,Lindsay Allen,2018,LVA,23.0,24.0,6.0,358.0,2.8,7.3,0.384,...,0.708,0.3,2.7,3.0,6.9,1.5,0.0,2.1,1.7,7.4
2,Lindsay Allen,2020,LVA,25.0,21.0,21.0,284.0,3.5,8.4,0.424,...,0.8,0.8,2.2,2.9,6.5,0.9,0.1,2.0,2.5,8.9
3,Lindsay Allen,2021,IND,26.0,32.0,8.0,571.0,4.1,9.6,0.428,...,0.811,0.6,2.5,3.0,6.1,1.1,0.2,2.0,2.1,11.0
4,Lindsay Allen,2022,MIN,27.0,9.0,0.0,134.0,5.4,10.2,0.526,...,0.923,0.0,3.8,3.8,8.3,0.5,0.0,1.9,3.0,16.1
5,Lindsay Allen,2023,MIN,28.0,29.0,20.0,698.0,3.4,8.4,0.399,...,0.792,0.9,2.8,3.7,6.7,0.9,0.2,1.9,3.1,9.2
6,Lindsay Allen,2024,CHI,29.0,40.0,28.0,950.0,3.9,8.5,0.466,...,0.808,0.5,2.5,3.0,5.9,1.2,0.3,2.4,2.5,10.0
7,Lindsay Allen,Career,,,183.0,83.0,3371.0,3.6,8.3,0.429,...,0.794,0.6,2.7,3.3,6.4,1.1,0.2,2.1,2.5,9.3
8,Lindsay Allen,,,,,,,,,,...,,,,,,,,,,
9,Lindsay Allen,2 seasons,LVA,,45.0,27.0,642.0,3.1,7.8,0.403,...,0.735,0.5,2.5,3.0,6.7,1.2,0.1,2.1,2.1,8.1


In [95]:
per_poss

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_poss_MP,per_poss_FG,per_poss_FGA,per_poss_FG%,...,per_poss_TRB,per_poss_AST,per_poss_STL,per_poss_BLK,per_poss_TOV,per_poss_PF,per_poss_PTS,per_poss_Unnamed: 27,per_poss_ORtg,per_poss_DRtg
0,Lindsay Allen,2017,NYL,22.0,28.0,0.0,376.0,3.2,8.5,0.371,...,5.8,8.5,2.2,0.1,2.6,3.3,7.3,,95.0,101.0
1,Lindsay Allen,2018,LVA,23.0,24.0,6.0,358.0,3.9,10.1,0.384,...,4.1,9.5,2.1,0.0,2.9,2.4,10.2,,96.0,110.0
2,Lindsay Allen,2020,LVA,25.0,21.0,21.0,284.0,4.9,11.5,0.424,...,4.0,8.9,1.2,0.2,2.8,3.5,12.2,,104.0,104.0
3,Lindsay Allen,2021,IND,26.0,32.0,8.0,571.0,5.9,13.8,0.428,...,4.3,8.7,1.5,0.3,2.8,3.1,15.8,,107.0,113.0
4,Lindsay Allen,2022,MIN,27.0,9.0,0.0,134.0,7.6,14.5,0.526,...,5.3,11.8,0.8,0.0,2.7,4.2,22.8,,136.0,110.0
5,Lindsay Allen,2023,MIN,28.0,29.0,20.0,698.0,4.7,11.9,0.399,...,5.2,9.5,1.2,0.2,2.7,4.4,13.1,,105.0,111.0
6,Lindsay Allen,2024,CHI,29.0,40.0,28.0,950.0,5.6,12.0,0.466,...,4.3,8.5,1.7,0.4,3.4,3.6,14.2,,104.0,109.0
7,Lindsay Allen,Career,,,183.0,83.0,3371.0,5.0,11.7,0.429,...,4.7,9.0,1.6,0.2,2.9,3.5,13.2,,105.0,109.0
8,Lindsay Allen,,,,,,,,,,...,,,,,,,,,,
9,Lindsay Allen,2 seasons,LVA,,45.0,27.0,642.0,4.3,10.7,0.403,...,4.1,9.2,1.7,0.1,2.9,2.9,11.1,,100.0,107.0


In [96]:
advanced

Unnamed: 0,Player,Year,Tm,Age,G,advanced_MP,advanced_PER,advanced_TS%,advanced_3PAr,advanced_FTr,...,advanced_AST%,advanced_STL%,advanced_BLK%,advanced_TOV%,advanced_USG%,advanced_Unnamed: 17,advanced_OWS,advanced_DWS,advanced_WS,advanced_WS/48
0,Lindsay Allen,2017,NYL,22.0,28.0,376.0,9.0,0.399,0.21,0.161,...,24.6,2.2,0.2,22.2,10.0,,0.0,0.6,0.6,0.08
1,Lindsay Allen,2018,LVA,23.0,24.0,358.0,9.8,0.443,0.219,0.329,...,27.5,2.1,0.0,20.1,12.5,,0.0,0.2,0.2,0.024
2,Lindsay Allen,2020,LVA,25.0,21.0,284.0,10.5,0.497,0.258,0.152,...,25.0,1.2,0.3,18.5,13.4,,0.3,0.4,0.6,0.106
3,Lindsay Allen,2021,IND,26.0,32.0,571.0,13.6,0.517,0.309,0.243,...,28.2,1.5,0.4,15.6,15.6,,0.9,-0.2,0.7,0.059
4,Lindsay Allen,2022,MIN,27.0,9.0,134.0,21.4,0.686,0.368,0.342,...,38.5,0.8,0.0,13.8,16.9,,0.7,0.0,0.8,0.28
5,Lindsay Allen,2023,MIN,28.0,29.0,698.0,11.3,0.48,0.209,0.325,...,29.3,1.2,0.4,16.6,14.5,,0.8,0.2,0.9,0.064
6,Lindsay Allen,2024,CHI,29.0,40.0,950.0,12.0,0.537,0.215,0.233,...,26.1,1.7,0.6,20.7,14.3,,0.9,0.5,1.4,0.071
7,Lindsay Allen,Career,,,183.0,3371.0,11.8,0.505,0.243,0.256,...,27.5,1.6,0.4,18.4,13.9,,3.6,1.6,5.2,0.075
8,Lindsay Allen,,,,,,,,,,...,,,,,,,,,,
9,Lindsay Allen,2 seasons,LVA,,45.0,642.0,10.1,0.468,0.237,0.245,...,26.4,1.7,0.1,19.4,12.9,,0.2,0.6,0.8,0.06


In [97]:
shooting

Unnamed: 0,Player,Year,Tm,Age,G,shooting_MP,shooting_FG%,shooting_Dist.,shooting_Unnamed: 7_level_1,shooting_% of FGA by Distance_2P,...,shooting_% of FG Ast'd_3P,shooting_Unnamed: 24_level_1,shooting_Dunks_%FGA,shooting_Dunks_#,shooting_Unnamed: 27_level_1,shooting_Corner 3s_%3PA,shooting_Corner 3s_3P%,shooting_Unnamed: 30_level_1,shooting_Heaves_Att.,shooting_Heaves_#
0,Lindsay Allen,2017,NYL,22.0,28.0,376.0,0.371,13.5,,0.79,...,,,0.0,0.0,,0.0,,,1.0,0.0
1,Lindsay Allen,2018,LVA,23.0,24.0,358.0,0.384,14.5,,0.781,...,1.0,,0.0,0.0,,0.0,,,0.0,0.0
2,Lindsay Allen,2020,LVA,25.0,21.0,284.0,0.424,13.6,,0.742,...,0.667,,0.0,0.0,,0.118,0.5,,0.0,0.0
3,Lindsay Allen,2021,IND,26.0,32.0,571.0,0.428,14.0,,0.691,...,0.857,,0.0,0.0,,0.149,0.429,,0.0,0.0
4,Lindsay Allen,2022,MIN,27.0,9.0,134.0,0.526,14.9,,0.632,...,0.25,,0.0,0.0,,0.143,0.5,,0.0,0.0
5,Lindsay Allen,2023,MIN,28.0,29.0,698.0,0.399,11.9,,0.791,...,1.0,,0.0,0.0,,0.176,0.167,,0.0,0.0
6,Lindsay Allen,2024,CHI,29.0,40.0,950.0,0.466,11.3,,0.785,...,0.857,,0.0,0.0,,0.188,0.222,,0.0,0.0
7,Lindsay Allen,Career,,,183.0,3371.0,0.429,13.4,,0.757,...,0.76,,0.0,0.0,,0.138,0.308,,1.0,0.0
8,Lindsay Allen,,,,,,,,,,...,,,,,,,,,,
9,Lindsay Allen,2 seasons,LVA,,45.0,642.0,0.403,14.1,,0.763,...,0.714,,0.0,0.0,,0.061,0.5,,0.0,0.0


In [98]:
pbp

Unnamed: 0,Player,Year,Tm,Age,G,pbp_MP,pbp_+/- Per 100 Poss._OnCourt,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
0,Lindsay Allen,2017,NYL,22.0,28.0,376.0,10.3,9.2,13.0,5.0,8.0,1.0,5.0,5.0,137.0,2.0,1.0
1,Lindsay Allen,2018,LVA,23.0,24.0,358.0,1.3,2.2,16.0,3.0,5.0,0.0,6.0,6.0,153.0,3.0,3.0
2,Lindsay Allen,2020,LVA,25.0,21.0,284.0,3.7,-11.1,12.0,1.0,11.0,1.0,5.0,3.0,111.0,2.0,7.0
3,Lindsay Allen,2021,IND,26.0,32.0,571.0,-17.2,-7.6,21.0,4.0,12.0,2.0,16.0,7.0,219.0,3.0,8.0
4,Lindsay Allen,2022,MIN,27.0,9.0,134.0,-5.2,-3.5,6.0,0.0,5.0,1.0,4.0,1.0,70.0,1.0,2.0
5,Lindsay Allen,2023,MIN,28.0,29.0,698.0,-2.5,6.1,25.0,5.0,21.0,5.0,25.0,14.0,298.0,9.0,10.0
6,Lindsay Allen,2024,CHI,29.0,40.0,950.0,0.1,15.8,43.0,16.0,18.0,3.0,23.0,25.0,346.0,8.0,12.0
7,Lindsay Allen,Career,,,183.0,3371.0,-2.0,4.6,136.0,34.0,80.0,13.0,84.0,61.0,1334.0,28.0,43.0
8,Lindsay Allen,,,,,,,,,,,,,,,,
9,Lindsay Allen,2 seasons,LVA,,45.0,642.0,2.4,-3.7,28.0,4.0,16.0,1.0,11.0,9.0,264.0,5.0,10.0


In [102]:
def clean_year_rows(df):
    return df[df["Year"].astype(str).str.match(r"^\d{4}$|^Career$")].copy()

# Clean each table
per_game = clean_year_rows(per_game)
per_minute = clean_year_rows(per_minute)
per_poss = clean_year_rows(per_poss)
advanced = clean_year_rows(advanced)
shooting = clean_year_rows(shooting)
pbp = clean_year_rows(pbp)

In [104]:
# Keys to merge on
merge_keys = ["Player", "Year"]

# All the other shared columns we want to keep only once
shared_cols = list(no_prefix_cols - set(merge_keys))

# Prepare list: keep shared_cols only from the first df
dfs = [per_game, per_minute, per_poss, advanced, shooting, pbp]
cleaned_dfs = [dfs[0]]  # keep everything in first one

# For all others: drop shared_cols if they exist
for df in dfs[1:]:
    drop = [col for col in shared_cols if col in df.columns]
    cleaned_dfs.append(df.drop(columns=drop))

# Merge on Player and Year
df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), cleaned_dfs)

# Optional: sort and reset index
df = df.sort_values(by=["Year", "Player"]).reset_index(drop=True)

print("Final merged shape:", df.shape)
df.head(10)

Final merged shape: (9, 137)


Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
0,Lindsay Allen,2017,NYL,22.0,28,0,13.4,0.8,2.2,.371,...,9.2,13.0,5.0,8.0,1.0,5.0,5.0,137.0,2.0,1.0
1,Lindsay Allen,2018,LVA,23.0,24,6,14.9,1.2,3.0,.384,...,2.2,16.0,3.0,5.0,0.0,6.0,6.0,153.0,3.0,3.0
2,Lindsay Allen,2019,,23.0,Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),...,,,,,,,,,,
3,Lindsay Allen,2020,LVA,25.0,21,21,13.5,1.3,3.1,.424,...,-11.1,12.0,1.0,11.0,1.0,5.0,3.0,111.0,2.0,7.0
4,Lindsay Allen,2021,IND,26.0,32,8,17.8,2.0,4.8,.428,...,-7.6,21.0,4.0,12.0,2.0,16.0,7.0,219.0,3.0,8.0
5,Lindsay Allen,2022,MIN,27.0,9,0,14.9,2.2,4.2,.526,...,-3.5,6.0,0.0,5.0,1.0,4.0,1.0,70.0,1.0,2.0
6,Lindsay Allen,2023,MIN,28.0,29,20,24.1,2.2,5.6,.399,...,6.1,25.0,5.0,21.0,5.0,25.0,14.0,298.0,9.0,10.0
7,Lindsay Allen,2024,CHI,29.0,40,28,23.8,2.6,5.6,.466,...,15.8,43.0,16.0,18.0,3.0,23.0,25.0,346.0,8.0,12.0
8,Lindsay Allen,Career,,,183,83,18.4,1.8,4.2,.429,...,4.6,136.0,34.0,80.0,13.0,84.0,61.0,1334.0,28.0,43.0


In [114]:
# Accumulators
per_game_frames = []
per_minute_frames = []
per_poss_frames = []
advanced_frames = []
shooting_frames = []
pbp_frames = []

for name, link in players:
    print(f"Processing: {name}")

    url = "https://www.basketball-reference.com" + link
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    def load_table(table_id, soup, comments, player_name):
        tag = soup.find("table", {"id": table_id + "0"})
        if tag is None:
            for c in comments:
                if f'id="{table_id}0"' in c:
                    tag = BeautifulSoup(c, "html.parser").find("table", {"id": table_id + "0"})
                    break
        if tag is None:
            print(f"Table not found: {table_id}")
            return None

        if table_id in ["shooting", "pbp"]:
            df = pd.read_html(StringIO(str(tag)), header=[0, 1])[0]
            df.columns = [f"{a}_{b}" if not a.startswith("Unnamed") else b for a, b in df.columns]
        else:
            df = pd.read_html(StringIO(str(tag)), header=0)[0]

        df.columns = [col if col in no_prefix_cols else f"{table_id}_{col}" for col in df.columns]
        df.insert(0, "Player", player_name)
        return df

    # Load all 6 tables and append
    t = load_table("per_game", soup, comments, name)
    if t is not None: per_game_frames.append(t)

    t = load_table("per_minute", soup, comments, name)
    if t is not None: per_minute_frames.append(t)

    t = load_table("per_poss", soup, comments, name)
    if t is not None: per_poss_frames.append(t)

    t = load_table("advanced", soup, comments, name)
    if t is not None: advanced_frames.append(t)

    t = load_table("shooting", soup, comments, name)
    if t is not None: shooting_frames.append(t)

    t = load_table("pbp", soup, comments, name)
    if t is not None: pbp_frames.append(t)

    time.sleep(5)

# Combine each full table
per_game = pd.concat(per_game_frames, ignore_index=True)
per_minute = pd.concat(per_minute_frames, ignore_index=True)
per_poss = pd.concat(per_poss_frames, ignore_index=True)
advanced = pd.concat(advanced_frames, ignore_index=True)
shooting = pd.concat(shooting_frames, ignore_index=True)
pbp = pd.concat(pbp_frames, ignore_index=True)

Processing: Lindsay Allen
Processing: Rebecca Allen
Processing: Laeticia Amihere
Processing: Ariel Atkins
Processing: Amy Atwell
Processing: Shakira Austin
Processing: Rachel Banham
Processing: Kierstan Bell
Processing: Grace Berger
Processing: Morgan Bertsch
Processing: Caitlin Bickle
Processing: Monique Billings
Processing: DeWanna Bonner
Processing: Aliyah Boston
Processing: Cameron Brink
Processing: Jaelyn Brown
Processing: Kalani Brown
Processing: Lexie Brown
Processing: Jakia Brown-Turner
Processing: Kennedy Burke
Processing: Rae Burrell
Processing: Veronica Burton
Processing: Maya Caldwell
Processing: Jordin Canada
Processing: Emma Cannon
Processing: Kamilla Cardoso
Processing: Bridget Carleton
Processing: DiJonai Carrington
Processing: Chennedy Carter
Processing: Jessika Carter
Processing: Tina Charles
Processing: Layshia Clarendon
Processing: Alysha Clark
Processing: Caitlin Clark
Processing: Natasha Cloud
Processing: Nia Coffey
Processing: Napheesa Collier
Processing: Sydney 

In [115]:
def clean_year_rows(df):
    return df[df["Year"].astype(str).str.match(r"^\d{4}$|^Career$")].copy()

# Clean each table
per_game = clean_year_rows(per_game)
per_minute = clean_year_rows(per_minute)
per_poss = clean_year_rows(per_poss)
advanced = clean_year_rows(advanced)
shooting = clean_year_rows(shooting)
pbp = clean_year_rows(pbp)

In [116]:
# Keys to merge on
merge_keys = ["Player", "Year"]

# All the other shared columns we want to keep only once
shared_cols = list(no_prefix_cols - set(merge_keys))

# Prepare list: keep shared_cols only from the first df
dfs = [per_game, per_minute, per_poss, advanced, shooting, pbp]
cleaned_dfs = [dfs[0]]  # keep everything in first one

# For all others: drop shared_cols if they exist
for df in dfs[1:]:
    drop = [col for col in shared_cols if col in df.columns]
    cleaned_dfs.append(df.drop(columns=drop))

# Merge on Player and Year
df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), cleaned_dfs)

# Optional: sort and reset index
df = df.sort_values(by=["Year", "Player"]).reset_index(drop=True)

print("Final merged shape:", df.shape)
df.head(10)

Final merged shape: (42945, 137)


Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
0,Diana Taurasi,2004,PHO,22.0,34,34,33.2,6.1,14.8,.416,...,1.6,46.0,13.0,38.0,27.0,,,317.0,18.0,18.0
1,Diana Taurasi,2005,PHO,23.0,33,33,33.0,5.3,12.9,.410,...,4.8,61.0,20.0,42.0,22.0,,,342.0,8.0,12.0
2,Diana Taurasi,2006,PHO,24.0,34,34,33.9,8.8,19.4,.452,...,14.0,33.0,16.0,53.0,18.0,71.0,2.0,299.0,17.0,19.0
3,Diana Taurasi,2007,PHO,25.0,32,32,32.0,6.4,14.6,.440,...,2.7,50.0,13.0,49.0,15.0,52.0,6.0,312.0,23.0,10.0
4,Diana Taurasi,2008,PHO,26.0,34,34,31.9,7.6,17.0,.446,...,10.4,44.0,17.0,46.0,20.0,102.0,14.0,276.0,25.0,27.0
5,DeWanna Bonner,2009,PHO,21.0,34,0,21.3,3.7,8.2,.457,...,-5.3,5.0,14.0,34.0,6.0,65.0,3.0,30.0,6.0,27.0
6,Diana Taurasi,2009,PHO,27.0,31,31,31.5,6.5,14.0,.461,...,7.2,41.0,18.0,42.0,15.0,77.0,12.0,247.0,14.0,13.0
7,Alysha Clark,2010,,22.0,Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),...,,,,,,,,,,
8,DeWanna Bonner,2010,PHO,22.0,32,4,25.4,4.1,8.9,.465,...,-3.5,20.0,13.0,34.0,4.0,55.0,2.0,87.0,15.0,28.0
9,Diana Taurasi,2010,PHO,28.0,31,31,32.2,6.8,16.0,.427,...,3.8,57.0,22.0,47.0,23.0,83.0,9.0,330.0,19.0,11.0


In [117]:
df.shape

(42945, 137)

In [118]:
df.head(20)

Unnamed: 0,Player,Year,Tm,Age,G,GS,per_game_MP,per_game_FG,per_game_FGA,per_game_FG%,...,pbp_+/- Per 100 Poss._On-Off,pbp_Turnovers_BadPass,pbp_Turnovers_LostBall,pbp_Fouls Committed_Shoot,pbp_Fouls Committed_Off.,pbp_Fouls Drawn_Shoot,pbp_Fouls Drawn_Off.,pbp_Misc._PGA,pbp_Misc._And1,pbp_Misc._Blkd
0,Diana Taurasi,2004,PHO,22.0,34,34,33.2,6.1,14.8,.416,...,1.6,46.0,13.0,38.0,27.0,,,317.0,18.0,18.0
1,Diana Taurasi,2005,PHO,23.0,33,33,33.0,5.3,12.9,.410,...,4.8,61.0,20.0,42.0,22.0,,,342.0,8.0,12.0
2,Diana Taurasi,2006,PHO,24.0,34,34,33.9,8.8,19.4,.452,...,14.0,33.0,16.0,53.0,18.0,71.0,2.0,299.0,17.0,19.0
3,Diana Taurasi,2007,PHO,25.0,32,32,32.0,6.4,14.6,.440,...,2.7,50.0,13.0,49.0,15.0,52.0,6.0,312.0,23.0,10.0
4,Diana Taurasi,2008,PHO,26.0,34,34,31.9,7.6,17.0,.446,...,10.4,44.0,17.0,46.0,20.0,102.0,14.0,276.0,25.0,27.0
5,DeWanna Bonner,2009,PHO,21.0,34,0,21.3,3.7,8.2,.457,...,-5.3,5.0,14.0,34.0,6.0,65.0,3.0,30.0,6.0,27.0
6,Diana Taurasi,2009,PHO,27.0,31,31,31.5,6.5,14.0,.461,...,7.2,41.0,18.0,42.0,15.0,77.0,12.0,247.0,14.0,13.0
7,Alysha Clark,2010,,22.0,Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),Did Not Play (waived),...,,,,,,,,,,
8,DeWanna Bonner,2010,PHO,22.0,32,4,25.4,4.1,8.9,.465,...,-3.5,20.0,13.0,34.0,4.0,55.0,2.0,87.0,15.0,28.0
9,Diana Taurasi,2010,PHO,28.0,31,31,32.2,6.8,16.0,.427,...,3.8,57.0,22.0,47.0,23.0,83.0,9.0,330.0,19.0,11.0


In [119]:
for col in df.columns:
    print(col)

Player
Year
Tm
Age
G
GS
per_game_MP
per_game_FG
per_game_FGA
per_game_FG%
per_game_3P
per_game_3PA
per_game_3P%
per_game_2P
per_game_2PA
per_game_2P%
per_game_eFG%
per_game_FT
per_game_FTA
per_game_FT%
per_game_ORB
per_game_DRB
per_game_TRB
per_game_AST
per_game_STL
per_game_BLK
per_game_TOV
per_game_PF
per_game_PTS
per_game_Awards
per_game_Unnamed: 29
per_minute_MP
per_minute_FG
per_minute_FGA
per_minute_FG%
per_minute_3P
per_minute_3PA
per_minute_3P%
per_minute_2P
per_minute_2PA
per_minute_2P%
per_minute_FT
per_minute_FTA
per_minute_FT%
per_minute_ORB
per_minute_DRB
per_minute_TRB
per_minute_AST
per_minute_STL
per_minute_BLK
per_minute_TOV
per_minute_PF
per_minute_PTS
per_poss_MP
per_poss_FG
per_poss_FGA
per_poss_FG%
per_poss_3P
per_poss_3PA
per_poss_3P%
per_poss_2P
per_poss_2PA
per_poss_2P%
per_poss_FT
per_poss_FTA
per_poss_FT%
per_poss_ORB
per_poss_DRB
per_poss_TRB
per_poss_AST
per_poss_STL
per_poss_BLK
per_poss_TOV
per_poss_PF
per_poss_PTS
per_poss_Unnamed: 27
per_poss_ORtg
per_po

In [120]:
for player in df["Player"].unique():
    print(player)

Diana Taurasi
DeWanna Bonner
Alysha Clark
Tina Charles
Courtney Vandersloot
Sydney Colson
Damiris Dantas
Nneka Ogwumike
Tiffany Hayes
Brittney Griner
Layshia Clarendon
Skylar Diggins-Smith
Alyssa Thomas
Astou Ndour-Fall
Chelsea Gray
Kayla McBride
Natasha Howard
Odyssey Sims
Stefanie Dolson
Stephanie Talbot
Betnijah Laney-Hamilton
Cheyenne Parker-Tyus
Dearica Hamby
Elizabeth Williams
Erica Wheeler
Isabelle Harrison
Jewell Loyd
Kayla Thornton
Kiah Stokes
Natasha Cloud
Rebecca Allen
Aerial Powers
Breanna Stewart
Courtney Williams
Jonquel Jones
Kahleah Copper
Moriah Jefferson
Rachel Banham
Temi Fagbenle
Tiffany Mitchell
Allisha Gray
Brionna Jones
Brittney Sykes
Cecilia Zandalasini
Emma Cannon
Kaela Davis
Kelsey Plum
Lindsay Allen
Nia Coffey
Sami Whitcomb
Shatori Walker-Kimbrough
A'ja Wilson
Ariel Atkins
Azura Stevens
Diamond DeShields
Gabby Williams
Jordin Canada
Karlie Samuelson
Kelsey Mitchell
Kia Nurse
Kristy Wallace
Lexie Brown
Mercedes Russell
Monique Billings
Myisha Hines-Allen
Victo

In [121]:
print(df["Player"].nunique())

157


In [122]:
# Null check
print(df.isnull().sum())

Player                     0
Year                       0
Tm                       255
Age                      157
G                          0
                        ... 
pbp_Fouls Drawn_Shoot    100
pbp_Fouls Drawn_Off.     100
pbp_Misc._PGA             98
pbp_Misc._And1            98
pbp_Misc._Blkd            98
Length: 137, dtype: int64


In [123]:
print(df.duplicated().sum())

0


In [124]:
from google.colab import files
df.to_csv("player_data.csv", index=False)
files.download("player_data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>