In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [3]:
# Timestamps (seconds since the epoch, from time.time()) of recent requests,
# used by rate_limit() as a sliding window
request_times = []

def rate_limit(max_requests=19, window=60.0):
    """
    Blocks until another request may be sent without exceeding the rate limit.

    Keeps a sliding window of request timestamps in the module-level
    ``request_times`` list. If ``max_requests`` timestamps already fall inside
    the last ``window`` seconds, sleeps until the oldest one leaves the window.
    The timestamp recorded for the new request is taken *after* any sleep, so
    it reflects when the request is really sent.

    Args:
        max_requests (int): Maximum number of requests allowed per window.
        window (float): Length of the sliding window in seconds.

    Raises:
        ValueError: If ``max_requests`` is smaller than 1.
    """
    global request_times

    if max_requests < 1:
        raise ValueError("max_requests must be at least 1")

    while True:
        now = time.time()

        # Drop timestamps that have aged out of the sliding window
        request_times = [t for t in request_times if now - t < window]

        if len(request_times) < max_requests:
            break

        # Sleep until the oldest request leaves the window, then re-check
        # against a fresh clock reading instead of trusting the stale one
        wait_time = window - (now - request_times[0])
        print(f"Rate limit reached. Waiting for {wait_time:.2f} seconds...")
        time.sleep(wait_time)

    # Record the moment the request is actually allowed to go out
    request_times.append(now)

In [4]:
def fetch_players_for_season(season):
    """
    Fetches player names, links, and IDs from the WNBA season totals page.

    The request is throttled through rate_limit(). Any failure (network error,
    non-200 status, missing totals table) is reported with a printed message
    and results in an empty list, so a caller looping over many seasons is
    never interrupted by a single bad page.

    Args:
        season (int): The season year to fetch data for.

    Returns:
        list: A list of dictionaries with 'Season', 'Player', 'Player Link', and 'Player ID'.
              Empty if the page could not be fetched or parsed.
    """
    url = f"https://www.basketball-reference.com/wnba/years/{season}_totals.html"
    print(f"Fetching player list for {season} from: {url}")

    # Ensure we respect the rate limit
    rate_limit()

    try:
        # Without a timeout a stalled connection would hang the notebook forever
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch player list for {season}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch player list for {season}. Status code: {response.status_code}")
        return []

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the player totals table
    table = soup.find('table', id='totals')
    if table is None:
        print(f"No player totals table found for {season}.")
        return []

    print(f"Player totals table found for {season}. Extracting data...")

    # A table without a <tbody> simply yields no player rows
    body = table.find('tbody')
    rows = body.find_all('tr', class_='full_table') if body is not None else []

    # Extract player data
    player_data = []
    for row in rows:
        player_cell = row.find('th', {'data-stat': 'player'})
        if player_cell is None:
            continue

        player_link = player_cell.find('a')
        if player_link is None or not player_link.has_attr('href'):
            continue

        href = player_link['href']
        player_data.append({
            'Season': season,
            'Player': player_link.text,
            'Player Link': "https://www.basketball-reference.com" + href,
            # Player ID is the last path component of the link without '.html'
            'Player ID': href.split('/')[-1].replace('.html', '')
        })

    print(f"Extracted {len(player_data)} players for {season}.")
    return player_data

In [5]:
# Gather the player listings of every season, 1997 through 2024 inclusive
all_players = []

for year in range(1997, 2024 + 1):
    season_data = fetch_players_for_season(year)
    all_players += season_data  # append this season's records to the combined list

# Build one DataFrame out of the accumulated records
df_players = pd.DataFrame(all_players)

# Preview the top of the table
print(df_players.head())

Fetching player list for 1997 from: https://www.basketball-reference.com/wnba/years/1997_totals.html
Player totals table found for 1997. Extracting data...
Extracted 97 players for 1997.
Fetching player list for 1998 from: https://www.basketball-reference.com/wnba/years/1998_totals.html
Player totals table found for 1998. Extracting data...
Extracted 127 players for 1998.
Fetching player list for 1999 from: https://www.basketball-reference.com/wnba/years/1999_totals.html
Player totals table found for 1999. Extracting data...
Extracted 150 players for 1999.
Fetching player list for 2000 from: https://www.basketball-reference.com/wnba/years/2000_totals.html
Player totals table found for 2000. Extracting data...
Extracted 204 players for 2000.
Fetching player list for 2001 from: https://www.basketball-reference.com/wnba/years/2001_totals.html
Player totals table found for 2001. Extracting data...
Extracted 207 players for 2001.
Fetching player list for 2002 from: https://www.basketball-re

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to extract game log data for a given player and year
def fetch_game_log(player_link, year):
    """
    Fetches one player's game log for one season as a DataFrame.

    The game log URL is derived from the player's profile link by replacing
    '.html' with '/gamelog/<year>/'. Every failure (network error, non-200
    status, missing container/table/header/body) prints a message and
    returns None.

    Args:
        player_link (str): URL of the player's profile page (ending in '.html').
        year (int): Season year of the game log to fetch.

    Returns:
        pandas.DataFrame or None: One row per table row with the table's header
        texts as column names; all cell values are the stripped cell text
        (strings). None if the game log could not be fetched or located.
    """
    # Convert player profile link into game log link
    game_log_url = player_link.replace(".html", f"/gamelog/{year}/")
    print(f"Fetching game log from: {game_log_url}")

    # Respect rate limit (no more than 19 requests per minute)
    time.sleep(3.2)  # 60 s / 3.2 s = 18.75 requests per minute at most

    # Request the page; a timeout prevents a stalled connection from hanging forever
    try:
        response = requests.get(game_log_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch game log for {player_link}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch game log for {player_link}. Status code: {response.status_code}")
        return None

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the div containing the game log table
    div_container = soup.find('div', id='div_wnba_pgl_basic')
    if div_container is None:
        print("Game log container not found.")
        return None

    # Now find the table within this div
    table = div_container.find('table')
    if table is None:
        print("No game log table found inside the container.")
        return None

    thead = table.find('thead')
    tbody = table.find('tbody')
    if thead is None or tbody is None:
        print("Game log table is missing its header or body.")
        return None

    # Extract table headers from the LAST header row only: if the header has
    # an extra grouping row above the column names, counting its cells too
    # would give more headers than data columns
    header_rows = thead.find_all('tr')
    header_source = header_rows[-1] if header_rows else thead
    headers = [th.get_text(strip=True) for th in header_source.find_all('th')]

    # Extract rows of data
    rows = []
    for row in tbody.find_all('tr'):
        # Skip repeated sub-header rows; membership test also catches rows
        # carrying 'thead' alongside other classes
        if 'thead' in (row.get('class') or []):
            continue
        cols = row.find_all(['th', 'td'])  # 'th' sometimes contains row headers (e.g., game number)
        rows.append([col.get_text(strip=True) for col in cols])

    # Convert to DataFrame
    df_game_log = pd.DataFrame(rows, columns=headers)
    return df_game_log

# Smoke test: pull a single player's game log for one season
test_year = 1997  # Season to fetch; adjust as needed
test_player_link = "https://www.basketball-reference.com/wnba/players/m/maxwean01w.html"

df_test_game_log = fetch_game_log(player_link=test_player_link, year=test_year)

# fetch_game_log returns None on failure, so only preview an actual result
if df_test_game_log is not None:
    print(df_test_game_log.head())  # First few rows

Fetching game log from: https://www.basketball-reference.com/wnba/players/m/maxwean01w/gamelog/1997/
  Rk        Date     Age   Tm     Opp          GS     MP FG  ... ORB DRB TRB  \
0  1  1997-07-03  23-087  CLE     LAS  L (-12)  0   0:09  0  ...   0   1   1   
1  2  1997-07-12  23-096  CLE  @  CHA  L (-29)  0   7:40  1  ...   1   1   2   
2  3  1997-07-14  23-098  CLE     NYL  L (-11)  0   2:27  0  ...   0   1   1   
3  4  1997-07-15  23-099  CLE  @  NYL  L (-17)  0  10:50  2  ...   0   2   2   
4  5  1997-07-17  23-101  CLE     CHA  W (+18)  0   0:42  1  ...   0   1   1   

  AST STL BLK TOV PF PTS  GmSc  
0   0   0   0   0  0   0   0.3  
1   1   0   0   0  0   3   3.3  
2   0   0   0   1  0   0  -1.4  
3   2   4   0   1  0   5   6.2  
4   0   0   0   0  0   2   2.0  

[5 rows x 28 columns]
