In [1]:
import re
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import urllib.request
import time
import random

In [2]:
import time, random
import requests
from bs4 import BeautifulSoup
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# one global session you reuse for all requests
def make_session():
    """Build a ``requests.Session`` with browser-like headers and automatic retries.

    Returns:
        requests.Session: a session whose ``http://`` and ``https://`` adapters
        retry GET/HEAD requests on connection errors, read errors and the
        status codes 429, 500, 502, 503 and 504.

    Once the status retries are used up, the last response is handed back to
    the caller instead of raising inside the adapter (``raise_on_status=False``).
    Callers are therefore expected to inspect ``status_code`` or call
    ``raise_for_status()`` themselves.
    """
    s = requests.Session()
    # Browser-like request headers sent with every request made through the session.
    s.headers.update({
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Cache-Control": "no-cache",
    })

    # Automatic retries with backoff, including 429.
    retry = Retry(
        total=5,                # overall retry budget
        connect=3,
        read=3,
        # The delay grows exponentially (it doubles on each consecutive retry),
        # it is not a linear 1.5 s step; see the urllib3 Retry docs for the
        # exact formula used by the installed version.
        backoff_factor=1.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods={"GET", "HEAD"},
        respect_retry_after_header=True,   # honors Retry-After on 429/503
        # Without this the adapter raises once status retries are exhausted,
        # so a caller-side "if resp.status_code == 429" check would never run.
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry, pool_connections=10, pool_maxsize=10)
    s.mount("http://", adapter)
    s.mount("https://", adapter)
    return s

# Shared session used by get_soup; built once so the retry adapter and
# connection pool configured in make_session are reused across requests.
SESSION = make_session()
# Monotonic-clock reading taken at the end of the most recent rate_limit() call.
_LAST_REQUEST_TS = 0.0

def rate_limit(min_interval=1.1):
    """Block until at least ``min_interval`` seconds separate consecutive calls.

    When the previous call was more recent than ``min_interval`` seconds ago,
    sleep for the remaining time plus up to 0.25 s of random jitter. The
    module-level timestamp is refreshed on every call, whether or not a sleep
    was needed.
    """
    global _LAST_REQUEST_TS
    elapsed = time.monotonic() - _LAST_REQUEST_TS
    remaining = min_interval - elapsed
    if remaining > 0:
        # small jitter so requests are not perfectly periodic
        time.sleep(remaining + random.uniform(0, 0.25))
    _LAST_REQUEST_TS = time.monotonic()

def get_soup(url, timeout=20):
    """Fetch ``url`` through the shared session and return it parsed as HTML.

    Every request is preceded by ``rate_limit()``. If the response status is
    429, the function waits (the ``Retry-After`` value when it parses as an
    integer, otherwise about 5 seconds) and requests the page one more time.

    Args:
        url: page to download.
        timeout: timeout in seconds handed to ``SESSION.get``.

    Returns:
        BeautifulSoup: the response text parsed with ``html.parser``.

    Raises:
        requests.HTTPError: from ``raise_for_status()`` when the final
            response carries an error status.
    """
    def _throttled_get():
        rate_limit()  # be nice
        return SESSION.get(url, timeout=timeout)

    response = _throttled_get()

    if response.status_code == 429:
        retry_after = response.headers.get("Retry-After")
        if not retry_after:
            # no hint from the server: fixed pause plus jitter
            time.sleep(5 + random.uniform(0, 2))
        else:
            try:
                time.sleep(int(retry_after))
            except ValueError:
                # header present but not usable as a number of seconds
                time.sleep(5)
        # one more attempt after cooling down
        response = _throttled_get()

    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


Exploratory Data Analysis

In [2]:
# nba_teams = [
#     "ATL", "BOS", "BRK", "CHI", "CLE", "DAL", "DEN", "DET", 
#     "GSW", "HOU", "IND", "LAC", "LAL", "MEM", "MIA", "MIL", 
#     "MIN", "NOP", "NYK", "OKC", "ORL", "PHI", "PHO", "POR", 
#     "SAC", "SAS", "TOR", "UTA", "WAS", "CHO"
# ]

In [3]:
# import matplotlib.pyplot as plt

# plt.plot(reg_seas_played['Date'], reg_seas_played['MP'])
# plt.show()

---

In [7]:
### get html from urllib
def get_soup(url, timeout=20):
    """Download ``url`` with urllib and return it parsed as HTML.

    NOTE(review): this notebook also defines a session-based ``get_soup`` in an
    earlier cell. Whichever definition is executed last is the one the other
    functions call, so consider keeping only one of them.

    Args:
        url: page to download.
        timeout: seconds to wait before giving up on the request. The
            original call had no timeout and could block indefinitely.

    Returns:
        BeautifulSoup: the response body parsed with ``html.parser``.

    Raises:
        urllib.error.URLError: when the request fails. The reason is printed
            first and the error is then re-raised, instead of falling through
            to an undefined ``html`` variable.
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            html = response.read()
    except urllib.error.URLError as e:
        print(f"Error fetching URL: {e.reason}")
        # Re-raise: there is no page to parse, and continuing would only
        # replace the real error with an UnboundLocalError on `html`.
        raise

    return BeautifulSoup(html, 'html.parser')

In [3]:
def get_roster(team_name, season):
    '''
    Build a roster table for one team and season from its team page.

    Args:
        team_name - String, team abbreviation used in the URL (ex. CHI, BOS, PHI)
        season - String, season used in the URL (ex. 2025, 2024)

    Returns:
        roster - Dataframe with columns Player_NAME, Player_POSITION and
                 Player_TEAM (team_name repeated on every row)
    '''

    ### download and parse the team page
    page = get_soup(f'https://www.basketball-reference.com/teams/{team_name}/{season}.html')

    def cell_texts(selector):
        ### stripped text of every cell matching the CSS selector
        return [cell.get_text(strip=True) for cell in page.select(selector)]

    ### position cells are only picked up when their csk attribute is 1-5
    position_selector = ('td[data-stat="pos"][csk="1"], '
                         'td[data-stat="pos"][csk="2"], '
                         'td[data-stat="pos"][csk="3"], '
                         'td[data-stat="pos"][csk="4"], '
                         'td[data-stat="pos"][csk="5"]')

    ### the two lists are paired by position, so they must have equal length
    return pd.DataFrame({
        'Player_NAME': cell_texts('td[data-stat="player"]'),
        'Player_POSITION': cell_texts(position_selector)
    }).assign(Player_TEAM=team_name)

In [6]:
# string = 'hello world'
# string.replace('ello', 'fart')

In [4]:
def _parse_box_table(table):
    '''Turn one box-score table into a list of {data-stat: text} dicts, one per player row.'''
    rows = []
    for tr in table.select("tbody tr"):
        row_data = {
            cell["data-stat"]: cell.get_text(strip=True)
            for cell in tr.select("th[data-stat], td[data-stat]")
        }
        # Skip the "Reserves" divider row and any row without a player cell
        # (the original indexed row_data['player'] directly and raised
        # KeyError on such rows).
        if 'player' not in row_data or row_data['player'] == 'Reserves':
            continue
        rows.append(row_data)
    return rows


def get_game_data(team, opp, url):
    '''
    Download one box-score page and return the basic box score of both teams.

    The page is requested from `url` first. If that request fails, the same
    URL with `team` replaced by `opp` is tried once after a short backoff.
    The original code wrapped this in a `for attempt` loop that always broke
    on its first pass; the two candidates are now iterated explicitly and the
    first failure is reported instead of being silently discarded.

    Args:
        team - String (ex. SAC, LAC)
        opp - String (ex. SAC, OKC)
        url - String, box-score page URL

    Returns:
        team_df - Dataframe built from table "box-{team}-game-basic"
        opp_df - Dataframe built from table "box-{opp}-game-basic"

    Raises:
        Exception - the error from the last attempted request when neither
                    URL can be fetched
        IndexError - when the page does not contain one of the two tables
                     (same type as before, now with a descriptive message)
    '''
    other_url = url.replace(team, opp)

    print(f'Original URL: {url}')
    print(f'Alternate URL: {other_url}')

    soup = None
    last_error = None
    for attempt, candidate in enumerate((url, other_url), start=1):
        if attempt == 1:
            # small random pause before the first request
            time.sleep(0.5 + random.random() * 0.5)
        else:
            # back off before trying the alternate URL
            time.sleep(0.8 * (2 ** (attempt - 2)) + random.random())

        try:
            soup = get_soup(candidate)
        except Exception as ex:
            # Broad on purpose: the notebook has both a requests-based and a
            # urllib-based get_soup, which raise different exception families.
            last_error = ex
            print(f'Request failed for {candidate}: {ex}')
            continue

        print('Success')
        break

    if soup is None:
        raise last_error

    def first_table(abbr):
        # .select returns a list; take the first match or fail with context.
        matches = soup.select(f'table[id="box-{abbr}-game-basic"]')
        if not matches:
            raise IndexError(f'no table with id "box-{abbr}-game-basic" found on the page')
        return matches[0]

    ### Convert to dataframe
    team_df = pd.DataFrame(_parse_box_table(first_table(team)))
    opp_df = pd.DataFrame(_parse_box_table(first_table(opp)))

    return team_df, opp_df

In [55]:
### column labels applied to each 33-cell row of the scraped game log
_GAMELOG_COLUMNS = [
    'Gcar','Gtm','Date','Team','at','Opp','Result','GS','MP','FG','FGA',
    'FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB',
    'DRB','TRB','AST','STL','BLK','TOV','PF','PTS','GmSc','+/-'
]

### columns that are all blank on the summary row that ends the per-game rows
_GAMELOG_ID_COLUMNS = ['Gcar', 'Gtm', 'Date', 'Team', 'at', 'Opp']

### cell texts that stand in for the stat cells of a game the player missed,
### and the number of blanks needed to pad such a row back to 33 cells
_NOT_PLAYED_LABELS = ('inactive', 'did not dress', 'did not play')
_NOT_PLAYED_PADDING = 25

### columns converted to float once cleaning is done
_FLOAT_COLUMNS = [
    'FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA',
    'FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS','GmSc','+/-',
    'Team Score', 'Opp Score'
]

### team codes that become 0/1 opponent indicator columns
_OPPONENT_CODES = [
    "ATL", "BOS", "BRK", "CHI", "CLE", "DAL", "DEN", "DET",
    "GSW", "HOU", "IND", "LAC", "LAL", "MEM", "MIA", "MIL",
    "MIN", "NOP", "NYK", "OKC", "ORL", "PHI", "PHO", "POR",
    "SAC", "SAS", "TOR", "UTA", "WAS", "CHO"
]

_TEAM_SCORE_RE = re.compile(r'(?<= )\d+(?=\s*-\s*)')
_OPP_SCORE_RE = re.compile(r'(?<=-)\s*\d+')


def _gamelog_table(soup):
    '''Flatten the page's td cells into a string DataFrame with the 33 game-log columns.'''
    ### the first 7 matching cells are skipped, as in the original code
    ### NOTE(review): presumably these belong to something above the game log - confirm
    cells = soup.find_all('td', class_=['center', 'left', 'right'])[7:]

    flat = []
    for cell in cells:
        text = cell.get_text(strip=True)
        flat.append(text)
        ### a missed game has one label cell instead of the stat cells; pad it
        if text.lower() in _NOT_PLAYED_LABELS:
            flat.extend([''] * _NOT_PLAYED_PADDING)

    width = len(_GAMELOG_COLUMNS)
    rows = [flat[start:start + width] for start in range(0, len(flat), width)]
    return pd.DataFrame(rows, columns=_GAMELOG_COLUMNS)


def _clock_to_seconds(clock):
    '''Convert an "MM:SS" string to a whole number of seconds.'''
    minutes, seconds = map(int, clock.split(":"))
    return minutes * 60 + seconds


def _add_fantasy_score(games):
    '''Add double_double, triple_double and fantasy_score columns to games and return it.'''
    ### count how many of the five categories reached 10 or more
    cats = ['PTS', 'TRB', 'AST', 'STL', 'BLK']
    counts = (games[cats] >= 10).sum(axis=1)

    games['double_double'] = (counts >= 2).astype(int)        # 1 point bonus
    games['triple_double'] = (counts >= 3).astype(int) * 2    # 2 point bonus

    ### rebounds, assists, steals and blocks count 1, points 0.5, turnovers -1
    games['fantasy_score'] = (
        games['TRB']
        + games['AST']
        + games['PTS'] * 0.5
        + games['STL']
        + games['BLK']
        + games['TOV'] * -1
        + games['double_double']
        + games['triple_double']
    ).round(2)
    return games


def get_player_data(name, season):
    '''
    Scrape one player's game log for a season and build a cleaned per-game table.

    Args:
        name - String, player id used in the URL (ex. murrake02)
        season - String (ex. 2025, 2024)

    Returns:
        missed_reg_seas_played - Dataframe of the raw (string) game-log rows
                                 whose GS cell is not '*'
        reg_seas_played - Dataframe of the rows whose GS cell is '*', with
                          numeric stats, MP in seconds, Result as 1 (win) / 0,
                          Team Score, Opp Score, Percent Score, URL,
                          Player_NAME, one 0/1 column per opponent code and
                          the fantasy-score columns

    Raises:
        IndexError - when no summary row or no played game is found (the
                     original failed with the same type but no explanation)
    '''
    url = f'https://www.basketball-reference.com/players/{name[0]}/{name}/gamelog/{season}/'

    soup = get_soup(url)

    data = _gamelog_table(soup)

    ### per-game rows end at the first row whose identifying columns are all blank
    is_summary = (data[_GAMELOG_ID_COLUMNS] == '').all(axis=1)
    if not is_summary.any():
        raise IndexError(f'no summary row found in the {season} game log of {name!r}')
    reg_season = data.iloc[:int(is_summary.idxmax())]

    ### split rows on the GS marker
    ### NOTE(review): a row counts as "played" only when GS == '*'; if '*' means
    ### "started", games played off the bench land in the missed frame - confirm
    has_marker = reg_season['GS'] == '*'
    missed_reg_seas_played = reg_season[~has_marker]
    ### .copy() so the edits below act on an independent frame, not on a
    ### filtered slice of reg_season (avoids SettingWithCopyWarning)
    reg_seas_played = reg_season[has_marker].copy()
    if reg_seas_played.empty:
        raise IndexError(f'no played games found in the {season} game log of {name!r}')

    ### a percentage is blank when there were no attempts; treat it as 0
    for col in [c for c in reg_seas_played.columns if '%' in c]:
        reg_seas_played.loc[reg_seas_played[col] == '', col] = 0

    reg_seas_played = reg_seas_played.drop(columns=['at', 'GS'])

    reg_seas_played['Date'] = (
        pd.to_datetime(reg_seas_played['Date']).dt.strftime('%Y%m%d')
    )

    ### split the result text into a win flag and the two scores
    raw_results = list(reg_seas_played['Result'])
    reg_seas_played['Result'] = [1 if r.startswith('W') else 0 for r in raw_results]
    reg_seas_played['Team Score'] = [_TEAM_SCORE_RE.findall(r)[0] for r in raw_results]
    reg_seas_played['Opp Score'] = [_OPP_SCORE_RE.findall(r)[0] for r in raw_results]

    ### convert total mins played to sec
    reg_seas_played['MP'] = reg_seas_played['MP'].apply(_clock_to_seconds)

    ### calculate what percent of the score was attributed by player
    reg_seas_played['Percent Score'] = (
        reg_seas_played['PTS'].astype(int) / reg_seas_played['Team Score'].astype(int)
    ).round(2)

    ### convert the data to float values
    reg_seas_played[_FLOAT_COLUMNS] = reg_seas_played[_FLOAT_COLUMNS].astype(float)

    ### box-score URL built from the game date and the player's own team code
    ### NOTE(review): get_game_data retries with the opponent's code when this
    ### URL fails, so it is presumably not valid for every game - confirm
    reg_seas_played['URL'] = [
        f'https://www.basketball-reference.com/boxscores/{game_date}0{team_code}.html'
        for game_date, team_code in zip(reg_seas_played['Date'], reg_seas_played['Team'])
    ]

    ### display name: the original reads the 4th span[itemprop="name"] match
    ### NOTE(review): confirm that position is stable; fall back to the id if absent
    names = [tag.get_text(strip=True) for tag in soup.select('span[itemprop="name"]')]
    reg_seas_played['Player_NAME'] = names[3] if len(names) > 3 else name

    ### An earlier experiment looked up starting fives through get_game_data;
    ### it was left disabled with the note "WILL TIMEOUT REQUESTS".

    ### Add opp as dummy variable: one 0/1 column per listed team, except the
    ### team on the first played row. Built in one step so the column set is
    ### always the same and no NaN-filled column can be created by a
    ### row-by-row write for an opponent that has no column yet.
    own_team = reg_seas_played['Team'].iloc[0]
    opp_flags = pd.DataFrame(
        {
            code: (reg_seas_played['Opp'] == code).astype(int)
            for code in _OPPONENT_CODES
            if code != own_team
        },
        index=reg_seas_played.index,
    )
    reg_seas_played = (
        pd.concat([reg_seas_played, opp_flags], axis=1)
        .drop(columns=['Team', 'Opp', 'Gtm', 'Gcar'])
        .reset_index(drop=True)
    )

    reg_seas_played = _add_fantasy_score(reg_seas_played)

    return missed_reg_seas_played, reg_seas_played

In [56]:
# Game log for player id 'murrake02', season 2025.
# get_player_data returns (missed games, played games).
keegan_data = get_player_data('murrake02', '2025')
missed_data, player_data = keegan_data

# Sacramento roster for the same season; merged with the box score further down.
kings_roster = get_roster('SAC', '2025')

In [58]:
# Display the cleaned per-game frame (second element returned by get_player_data).
player_data

Unnamed: 0,Date,Result,MP,FG,FGA,FG%,3P,3PA,3P%,2P,...,PHO,POR,SAS,TOR,UTA,WAS,CHO,double_double,triple_double,fantasy_score
0,20241024,0,2580,8.0,17.0,0.471,5.0,10.0,0.500,3.0,...,0,0,0,0,0,0,0,1,0,25.5
1,20241026,0,2385,5.0,11.0,0.455,2.0,7.0,0.286,3.0,...,0,0,0,0,0,0,0,0,0,15.0
2,20241028,1,1992,2.0,9.0,0.222,0.0,3.0,0.000,2.0,...,0,1,0,0,0,0,0,0,0,12.0
3,20241029,1,1941,6.0,14.0,0.429,2.0,7.0,0.286,4.0,...,0,0,0,0,1,0,0,0,0,13.0
4,20241101,1,2332,6.0,12.0,0.500,1.0,6.0,0.167,5.0,...,0,0,0,0,0,0,0,1,0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,20250331,0,2141,5.0,11.0,0.455,2.0,5.0,0.400,3.0,...,0,0,0,0,0,0,0,0,0,16.0
72,20250402,0,2113,7.0,14.0,0.500,4.0,10.0,0.400,3.0,...,0,0,0,0,0,1,0,0,0,20.0
73,20250404,1,1520,2.0,8.0,0.250,1.0,6.0,0.167,1.0,...,0,0,0,0,0,0,1,0,0,9.5
74,20250411,0,1914,3.0,9.0,0.333,1.0,6.0,0.167,2.0,...,0,0,0,0,0,0,0,0,0,14.5


In [16]:
# Box score for one hard-coded game (SAC / MIN, 2024-10-24).
team, opp = 'SAC', 'MIN'
url = 'https://www.basketball-reference.com/boxscores/202410240SAC.html'

# get_game_data returns (team box score, opponent box score).
game_data = get_game_data(team, opp, url)
team_data, opp_data = game_data

Original URL: https://www.basketball-reference.com/boxscores/202410240SAC.html
Alternate URL: https://www.basketball-reference.com/boxscores/202410240MIN.html
Success


In [None]:
# Attach each player's roster position to the team box score; players that
# do not appear in both frames are dropped by the inner join.
(
    team_data
    .merge(kings_roster, left_on='player', right_on='Player_NAME', how='inner')
    .drop(columns=['Player_NAME', 'Player_TEAM'])
)

Unnamed: 0,player,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,...,ast,stl,blk,tov,pf,pts,game_score,plus_minus,reason,Player_POSITION
0,Keegan Murray,43:00,8.0,17.0,0.471,5.0,10.0,0.5,2.0,2.0,...,2.0,0.0,0.0,0.0,3.0,23.0,18.6,9.0,,PF
1,DeMar DeRozan,42:32,7.0,18.0,0.389,0.0,2.0,0.0,12.0,14.0,...,2.0,1.0,0.0,2.0,0.0,26.0,18.6,8.0,,SF
2,Domantas Sabonis,38:11,8.0,13.0,0.615,3.0,4.0,0.75,5.0,8.0,...,1.0,2.0,1.0,4.0,4.0,24.0,17.5,-2.0,,C
3,De'Aaron Fox,37:09,6.0,14.0,0.429,1.0,4.0,0.25,2.0,2.0,...,11.0,1.0,0.0,2.0,3.0,15.0,13.7,2.0,,PG
4,Kevin Huerter,19:36,1.0,4.0,0.25,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,9.0,,SG
5,Malik Monk,26:24,6.0,9.0,0.667,1.0,2.0,0.5,4.0,5.0,...,4.0,0.0,0.0,0.0,1.0,17.0,15.4,-7.0,,SG
6,Keon Ellis,11:11,0.0,1.0,0.0,0.0,0.0,,2.0,2.0,...,0.0,2.0,0.0,0.0,1.0,2.0,3.2,-12.0,,SG
7,Trey Lyles,10:45,1.0,4.0,0.25,1.0,3.0,0.333,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,3.0,2.2,-14.0,,PF
8,Alex Len,5:40,1.0,1.0,1.0,0.0,0.0,,1.0,2.0,...,0.0,0.0,1.0,1.0,1.0,3.0,2.2,0.0,,C
9,Doug McDermott,5:32,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,-1.5,-3.0,,SF


In [26]:
# Scratch check: an empty frame with one column per team abbreviation.
nba_teams = [
    "ATL", "BOS", "BRK", "CHI", "CLE", "DAL", "DEN", "DET", 
    "GSW", "HOU", "IND", "LAC", "LAL", "MEM", "MIA", "MIL", 
    "MIN", "NOP", "NYK", "OKC", "ORL", "PHI", "PHO", "POR", 
    "SAC", "SAS", "TOR", "UTA", "WAS", "CHO"
]

teams = pd.DataFrame()

# The loop variable is deliberately not called `team`: the box-score cell sets
# a notebook-level `team = 'SAC'`, and looping with that name would leave it
# rebound to the last abbreviation in the list.
for abbr in nba_teams:
    teams[abbr] = 0

teams

Unnamed: 0,ATL,BOS,BRK,CHI,CLE,DAL,DEN,DET,GSW,HOU,...,ORL,PHI,PHO,POR,SAC,SAS,TOR,UTA,WAS,CHO
