In [2]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape each team's 2024 advanced game log from Basketball Reference and
# stack the per-team tables into one DataFrame (`df_adv`), tagging every row
# with the team code it was downloaded for.

# List of team codes for 2024 season
teams = ['ATL', 'CHI', 'CON', 'DAL', 'IND', 'LAS', 'MIN', 'NYL', 'PHO', 'SEA', 'WAS', 'LVA']

# Base URL pattern for advanced game logs
base_url = "https://www.basketball-reference.com/wnba/teams/{team}/2024/gamelog-advanced/"

# Request settings never change between teams, so build them once.
headers = {"User-Agent": "Mozilla/5.0"}
REQUEST_TIMEOUT_SECONDS = 30  # requests has no default timeout; without one a stalled connection hangs the cell
REQUEST_DELAY_SECONDS = 3     # polite pause between page fetches so the site is not hit back-to-back

frames = []

for team_index, team in enumerate(teams):
    # Pause between requests (not before the first one).
    if team_index:
        time.sleep(REQUEST_DELAY_SECONDS)

    url = base_url.format(team=team)
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        # Surface 4xx/5xx responses explicitly instead of parsing an error page
        # and misreporting it as a missing table.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort scrape: report the failure and move on to the next team.
        print(f"Request failed for {team}: {exc}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", id="wnba_tgl_advanced")

    if table is None:
        print(f"No advanced table found for {team}")
        continue

    df_team = pd.read_html(StringIO(str(table)))[0]

    # Combine multi-level headers into single "<top>_<bottom>" names.
    # This unpacking relies on read_html returning two-level column labels.
    df_team.columns = [f"{a}_{b}" for a, b in df_team.columns]

    # Add team column as first column
    df_team.insert(0, "Team", team)

    frames.append(df_team)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that points at the real cause instead.
if not frames:
    raise ValueError(
        "No advanced game logs were downloaded for any team; "
        "check network access and the messages printed above."
    )

# Combine into single DataFrame
df_adv = pd.concat(frames, ignore_index=True)

In [4]:
# Preview the first five rows of the combined game log.
df_adv.head(n=5)

Unnamed: 0,Team,Unnamed: 0_level_0_Rk,Unnamed: 1_level_0_Date,Unnamed: 2_level_0_Unnamed: 2_level_1,Unnamed: 3_level_0_Opp,Unnamed: 4_level_0_W/L,Unnamed: 5_level_0_Tm,Unnamed: 6_level_0_Opp,Unnamed: 7_level_0_Unnamed: 7_level_1,Advanced_ORtg,...,Unnamed: 18_level_0_Unnamed: 18_level_1,Offensive Four Factors_eFG%,Offensive Four Factors_TOV%,Offensive Four Factors_ORB%,Offensive Four Factors_FT/FGA,Unnamed: 23_level_0_Unnamed: 23_level_1,Defensive Four Factors_eFG%,Defensive Four Factors_TOV%,Defensive Four Factors_DRB%,Defensive Four Factors_FT/FGA
0,ATL,1,2024-05-15,@,LAS,W,92,81,,114.0,...,,0.567,14.7,20.6,0.239,,0.472,11.9,77.1,0.181
1,ATL,2,2024-05-18,@,PHO,L,85,88,,103.6,...,,0.485,14.3,18.4,0.318,,0.455,13.0,78.8,0.424
2,ATL,3,2024-05-21,,DAL,W,83,78,,107.0,...,,0.465,12.8,35.1,0.222,,0.443,15.1,67.6,0.229
3,ATL,4,2024-05-26,,MIN,L,79,92,,107.0,...,,0.5,11.9,27.8,0.162,,0.605,12.2,81.5,0.274
4,ATL,5,2024-05-29,@,WAS,W,73,67,,92.0,...,,0.468,19.3,18.5,0.242,,0.405,14.4,77.5,0.095


In [18]:
# Columns whose flattened header is "Unnamed" at both levels; these are removed.
# NOTE(review): in the preview above, the "Unnamed: 2..." column holds the "@"
# marker on some rows (presumably away games) — confirm that discarding it is intended.
drop_cols = [
    'Unnamed: 2_level_0_Unnamed: 2_level_1',
    'Unnamed: 7_level_0_Unnamed: 7_level_1',
    'Unnamed: 18_level_0_Unnamed: 18_level_1',
    'Unnamed: 23_level_0_Unnamed: 23_level_1',
]

# Short, readable names for the columns whose top header level was blank.
column_renames = {
    'Unnamed: 0_level_0_Rk': 'Rk',
    'Unnamed: 1_level_0_Date': 'Date',
    'Unnamed: 3_level_0_Opp': 'Opp',
    'Unnamed: 4_level_0_W/L': 'W/L',
    'Unnamed: 5_level_0_Tm': 'Team_Score',
    'Unnamed: 6_level_0_Opp': 'Opp_Score',
}

# Drop first, then rename, in one chain. errors='ignore' lets the drop succeed
# even when some of the listed columns are absent.
df_adv = (
    df_adv
    .drop(columns=drop_cols, errors='ignore')
    .rename(columns=column_renames)
)

In [22]:
# Preview the first five rows to check the current column names.
df_adv.head(n=5)

Unnamed: 0,Team,Rk,Date,Opp,W/L,Team_Score,Opp_Score,Advanced_ORtg,Advanced_DRtg,Advanced_Pace,...,Advanced_STL%,Advanced_BLK%,Offensive Four Factors_eFG%,Offensive Four Factors_TOV%,Offensive Four Factors_ORB%,Offensive Four Factors_FT/FGA,Defensive Four Factors_eFG%,Defensive Four Factors_TOV%,Defensive Four Factors_DRB%,Defensive Four Factors_FT/FGA
0,ATL,1,2024-05-15,LAS,W,92,81,114.0,100.4,80.7,...,9.9,22.0,0.567,14.7,20.6,0.239,0.472,11.9,77.1,0.181
1,ATL,2,2024-05-18,PHO,L,85,88,103.6,107.3,82.0,...,9.8,11.6,0.485,14.3,18.4,0.318,0.455,13.0,78.8,0.424
2,ATL,3,2024-05-21,DAL,W,83,78,107.0,100.5,77.6,...,11.6,5.6,0.465,12.8,35.1,0.222,0.443,15.1,67.6,0.229
3,ATL,4,2024-05-26,MIN,L,79,92,107.0,124.6,73.8,...,12.2,3.0,0.5,11.9,27.8,0.162,0.605,12.2,81.5,0.274
4,ATL,5,2024-05-29,WAS,W,73,67,92.0,84.4,79.4,...,11.3,4.3,0.468,19.3,18.5,0.242,0.405,14.4,77.5,0.095


In [27]:
# List every column label of df_adv, one per line, for a full view of the schema
# (the head() preview elides the middle columns).
for column_name in df_adv.columns.tolist():
    print(column_name)

Team
Rk
Date
Opp
W/L
Team_Score
Opp_Score
Advanced_ORtg
Advanced_DRtg
Advanced_Pace
Advanced_FTr
Advanced_3PAr
Advanced_TS%
Advanced_TRB%
Advanced_AST%
Advanced_STL%
Advanced_BLK%
Offensive Four Factors_eFG%
Offensive Four Factors_TOV%
Offensive Four Factors_ORB%
Offensive Four Factors_FT/FGA
Defensive Four Factors_eFG%
Defensive Four Factors_TOV%
Defensive Four Factors_DRB%
Defensive Four Factors_FT/FGA


In [30]:
# Write the game log to the project's data folder as CSV, without the
# DataFrame index column.
# NOTE(review): this is an absolute, machine-specific Windows path; consider a
# project-relative data directory so the notebook runs on other machines.
output_csv_path = r"C:\Users\colte\LHL-final-project\LHL-final-final-project\data\2024_basketball_reference_gamelog-advanced.csv"
df_adv.to_csv(output_csv_path, index=False)