<a href="https://colab.research.google.com/github/ezesalvatore/205Lab/blob/main/BasketballScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basketball Reference Uniform Scraper

## Project overview




### Objectives
- Extract each player's name, team, and uniform number
- Support wireframe development
- Showcase my web scraping expertise

### Methods used
- The web scraper used is Beautiful Soup, since Basketball Reference is primarily built with static HTML
- This project also makes sure that it respects Basketball Reference's rate limits and crawl-delay rules

## Imports

In [None]:
#Makes a request to Basketball Reference
import requests

#How we are going to webscrape, able to parse through HTML to get the name, team, and uniform
from bs4 import BeautifulSoup

#Helps me with exporting a csv file
import pandas as pd

#Allows me to add delays
import time

#Data processing and cleaning
import re

#Creates url for web scraping
from urllib.parse import urljoin

#Suppress warning messages
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported")

## Session Setup

In [None]:
def setup_session():
    """
    Create the HTTP session used for web scraping.

    Builds a fresh requests.Session, installs browser-like request headers
    (User-Agent, Accept, Connection) on it, prints a confirmation message and
    returns the session. The session is returned open; this function does not
    close it.
    """
    # Headers sent with every request made through this session.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml',
        'Connection': 'keep-alive',
    }

    http_session = requests.Session()
    http_session.headers.update(browser_headers)

    print("Session is created!")
    return http_session

## Fetching Basketball Reference Data

In [None]:
def fetch_basketball_data(
    url="https://www.basketball-reference.com/leagues/NBA_2025_numbers.html",
    crawl_delay=3.0,
):
    """
    Make a single delayed request to Basketball Reference and return the HTML.

    Args:
        url: Page to download. Defaults to the 2025 NBA uniform-numbers page.
        crawl_delay: Seconds to sleep before sending the request. Defaults to
            3.0, matching the "Crawl-delay: 3" noted for the site's robots.txt.

    Returns:
        The response body as text on success, or None if the request failed
        (connection error, timeout, HTTP error status, or a 429 rate limit).
        Failures are reported with a printed message rather than raised.
    """
    session = setup_session()

    try:
        # Robots.txt compliance: wait before hitting the server.
        time.sleep(crawl_delay)

        response = session.get(url, timeout=30)

        print(f"The request to: {url} has been sent out")

        # 429 is singled out so the printed message explains the consequence
        # instead of showing a generic HTTP error.
        if response.status_code == 429:
            raise requests.HTTPError(
                "Rate limited - blocked for 24 hours", response=response
            )

        # Any other 4xx/5xx status becomes an HTTPError handled below.
        response.raise_for_status()

        print(f"Server Status: {response.status_code}")

        # HTML page returned
        return response.text

    except requests.RequestException as e:
        # Covers connection errors, timeouts, invalid URLs and HTTP errors;
        # unrelated programming errors are no longer silently swallowed.
        print(f" Request failed: {e}")
        return None
    finally:
        # Always release the connection pool, on success or failure.
        session.close()

## Web Scraping

### Web Scraping Strategy

I will use the following CSS selectors to get the values from the HTML page:

**Uniform Number** : `div.data_grid_box table caption`
<br>

**Player Name**: `div.data_grid_box a[href*='/players/']`
<br>

**Team**: `span.desc a[href*='/teams/'] `



In [None]:
def extract_uniform_data(html_content):
    """
    Parse the uniform-numbers HTML into a per-player mapping.

    Every `div.data_grid_box` section is treated as one uniform number (taken
    from its `caption.poptip`). Each table row in the section that links to a
    `/players/` page contributes that player's name, and the `/teams/` links
    inside the row's `span.desc` contribute the teams.

    Args:
        html_content: HTML text of the page, or None/"" when the fetch failed.

    Returns:
        dict mapping player name -> {team: uniform number}. A player whose row
        has no team links is still present, with an empty inner dict. An empty
        dict is returned when html_content is empty or None.

    Notes:
        Players are keyed by display name only, and each (player, team) pair
        keeps the last uniform number encountered for it.
    """
    if not html_content:
        # Same type as the normal return value, so callers can always use
        # dict operations such as .items() on the result.
        return {}

    soup = BeautifulSoup(html_content, 'html.parser')

    # Collect all player-team-number combinations
    player_teams = {}
    uniform_sections = soup.find_all('div', class_='data_grid_box')

    print(f"🔍 Found {len(uniform_sections)} uniform number sections")

    for section in uniform_sections:
        try:
            caption = section.find('caption', class_='poptip')
            if caption is None:
                # Section without a number caption: nothing to attribute.
                continue

            uniform_number = caption.get_text().strip()

            for row in section.find_all('tr'):
                player_link = row.find('a', href=lambda x: x and '/players/' in x)
                if player_link is None:
                    continue

                player_name = player_link.get_text().strip()

                # Extract teams
                team_span = row.find('span', class_='desc')
                teams = []

                if team_span is not None:
                    team_links = team_span.find_all('a', href=lambda x: x and '/teams/' in x)
                    teams = [link.get_text().strip() for link in team_links]

                # Register the player even when no team was found, then store
                # each team-number combination.
                team_numbers = player_teams.setdefault(player_name, {})
                for team in teams:
                    team_numbers[team] = uniform_number

        except Exception as e:
            # Deliberate best-effort: one malformed section must not abort
            # the whole parse, so report it and move on.
            print(f"⚠️ Error processing section: {e}")
            continue

    print(f"✅ Collected data for {len(player_teams)} unique players")
    return player_teams

## Integrate with other CSV file

In [None]:
def create_uniform_csv(player_teams):
    """
    Convert the per-player uniform mapping into a one-row-per-player DataFrame.

    Args:
        player_teams: Mapping of player name -> {team: uniform number}, with
            uniform numbers given as strings.

    Returns:
        pandas.DataFrame with the columns, in this order:
            player_name     - the mapping key
            uniform_numbers - distinct numbers joined with '-' (e.g. "17-71"),
                              in the order the teams appear in the mapping
            is_multi_team   - True only when the player has more than one team
            all_teams       - team names joined with '-'
            team_count      - number of teams recorded for the player
        The columns are present even when player_teams is empty.
    """
    columns = [
        'player_name',
        'uniform_numbers',
        'is_multi_team',
        'all_teams',
        'team_count',
    ]

    players_data = []

    for player_name, teams_numbers in player_teams.items():
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # joined string is the same on every run (a set would not be).
        unique_numbers = list(dict.fromkeys(teams_numbers.values()))
        team_count = len(teams_numbers)

        players_data.append({
            'player_name': player_name,
            'uniform_numbers': '-'.join(unique_numbers),
            # A player with no recorded team is not a multi-team player.
            'is_multi_team': team_count > 1,
            'all_teams': '-'.join(teams_numbers),
            'team_count': team_count,
        })

    # Passing columns keeps the schema stable for an empty result.
    return pd.DataFrame(players_data, columns=columns)