<a href="https://colab.research.google.com/github/ezesalvatore/BasketballScraper/blob/main/BasketballScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basketball Reference Uniform Scraper

## Project overview




### Objectives
- Extract each player's name, team, and uniform number
- Support wireframe development
- Showcase my web scraping expertise

### Methods used
- The web scraper used is Beautiful Soup, since Basketball Reference is primarily built with HTML
- This project also makes sure that it respects Basketball Reference's rate limits and crawl-delay rules

# WebScraping Workflow

Session `->` Basketball Reference HTML `->` Beautiful Soup Parser `->` Uniform Data `->` Stats CSV Integration `->` Final CSV

## Imports

In [None]:
#Makes a request to Basketball Reference
import requests

#How we are going to webscrape, able to parse through HTML to get the name, team, and uniform
from bs4 import BeautifulSoup

#Helps me with exporting a csv file
import pandas as pd

#Allows me to add delays
import time

#Data processing and cleaning
import re

#Creates url for web scraping
from urllib.parse import urljoin

# Handle Unicode normalization for special characters/accents
import unicodedata

#Suppress warning messages (this hides warnings, not errors)
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported")

Libraries imported


## Session Setup

In [None]:
def setup_session():
    """
    Build the HTTP session used for web scraping.

    Creates a new ``requests.Session``, installs the User-Agent, Accept and
    Connection headers on it, prints a confirmation message, and hands the
    session back to the caller (who is responsible for closing it).

    Returns:
        The configured ``requests.Session``.
    """
    # Headers sent with every request made through this session.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml',
        'Connection': 'keep-alive',
    }

    http = requests.Session()
    http.headers.update(request_headers)

    print("Session is created!")
    return http

## Fetching Basketball Reference Data

In [None]:
def fetch_basketball_data():
    """
    Download the NBA 2025 uniform-numbers page from Basketball Reference.

    Issues exactly one GET request through a fresh session after a
    3-second pause, and forces the response to be decoded as UTF-8.

    Returns:
        The page HTML as a string, or None if the request failed for any
        reason (the failure is printed, not raised).
    """
    target = "https://www.basketball-reference.com/leagues/NBA_2025_numbers.html"
    http = setup_session()
    page_html = None

    try:
        # Wait before hitting the site (crawl delay of 3 seconds).
        time.sleep(3.0)

        reply = http.get(target, timeout=30)

        # A 429 is reported with its own message; it is caught below like
        # every other failure.
        if reply.status_code == 429:
            raise Exception("Rate limited - blocked for 24 hours")

        # Any other 4xx/5xx status becomes an exception here.
        reply.raise_for_status()

        # Make sure the body is decoded as UTF-8.
        if reply.encoding != 'utf-8':
            reply.encoding = 'utf-8'

        page_html = reply.text
    except Exception as err:
        # Best-effort boundary: report the problem and fall through to
        # return None.
        print(f"Request failed: {err}")
    finally:
        # Always release the session, success or failure.
        http.close()

    return page_html

## Web Scraping

### Web Scraping Strategy

I will use the following CSS selectors to get the values from the HTML page:

**Uniform Number** : `div.data_grid_box table caption`
<br>

**Player Name**: `div.data_grid_box a[href*='/players/']`
<br>

**Team**: `span.desc a[href*='/teams/'] `



In [None]:
def extract_uniform_data(html_content):
    """
    Collect (uniform number, team) pairs for every player found in the HTML.

    Each ``div.data_grid_box`` is treated as one uniform number (taken from
    its ``<caption>``); every table row inside it that links to a player
    page contributes that player's team links.

    Args:
        html_content: HTML text of the uniform-numbers page.

    Returns:
        dict mapping player name -> list of (uniform_number, team) tuples,
        in document order. Players without any team link are omitted.
    """
    def _href_has(fragment):
        # Build an href filter; tags without an href pass None, so guard it.
        return lambda href: href and fragment in href

    is_player_href = _href_has('/players/')
    is_team_href = _href_has('/teams/')

    document = BeautifulSoup(html_content, 'html.parser')
    player_uniform_data = {}

    print("Parsing HTML content...")

    # One box per uniform number.
    for box in document.find_all('div', {'class': 'data_grid_box'}):
        caption = box.find('caption')
        if not caption:
            # Boxes without a caption carry no uniform number; skip them.
            continue

        uniform_number = caption.get_text().strip()

        print(f"Processing uniform #{uniform_number}...")

        for table_row in box.find_all('tr'):
            # Rows without a player link are not player rows.
            player_anchor = table_row.find('a', href=is_player_href)
            if not player_anchor:
                continue

            player_name = player_anchor.get_text().strip()

            # A row may list several teams, each as a link inside a
            # span.desc element.
            teams = []
            for desc_span in table_row.find_all('span', {'class': 'desc'}):
                teams.extend(
                    anchor.get_text().strip()
                    for anchor in desc_span.find_all('a', href=is_team_href)
                )

            if not (player_name and teams):
                continue

            # Append one (number, team) pair per team for this player.
            player_uniform_data.setdefault(player_name, []).extend(
                (uniform_number, team) for team in teams
            )

    print(f"✅ Extracted data for {len(player_uniform_data)} players")
    return player_uniform_data

## Creating final_csv

In [None]:
def normalize_name(name):
    """
    Return *name* with surrounding whitespace and accent marks removed.

    The value is converted to a string, stripped, decomposed with Unicode
    NFD normalization, and every combining character is dropped so that
    accented and unaccented spellings compare equal.

    Args:
        name: The player name; may be NaN, None, empty, or a non-string.

    Returns:
        The accent-free name, or None when the input is missing, falsy, or
        blank. If normalization raises, the stripped string is returned
        unchanged.
    """
    # Missing (NaN/None) or falsy values have no usable name.
    if pd.isna(name) or not name:
        return None

    cleaned = str(name).strip()
    if cleaned == '':
        return None

    try:
        # NFD splits each accented letter into base letter + combining
        # mark; keeping only non-combining characters removes the accents.
        decomposed = unicodedata.normalize('NFD', cleaned)
        kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
        return ''.join(kept)
    except Exception:
        # Fall back to the stripped original if normalization fails.
        return cleaned

In [None]:
def create_final_csv(player_uniform_data, stats_csv_path):
    """
    Merge scraped uniform numbers into the stats CSV.

    Players are matched on their accent-normalized names. Two columns are
    added to the stats table: ``Uniform`` (default ``''``) and
    ``mult_team`` (default ``False``).

    Args:
        player_uniform_data: dict mapping player name -> list of
            (uniform_number, team) tuples.
        stats_csv_path: Path of the stats CSV; it is read with columns
            ``Player`` and ``Team``.

    Returns:
        The stats DataFrame with the two extra columns, or None when the
        CSV file does not exist.
    """
    try:
        stats_df = pd.read_csv(stats_csv_path)
    except FileNotFoundError:
        print(f"Could not find {stats_csv_path}")
        return None

    # Drop rows whose player name is missing or blank.
    stats_df = stats_df.dropna(subset=['Player'])
    has_name = stats_df['Player'].str.strip() != ''
    stats_df = stats_df[has_name]

    # New columns start out empty / False and are filled in per row below.
    stats_df['Uniform'] = ''
    stats_df['mult_team'] = False

    # Index the scraped data by normalized name; a later player with the
    # same normalized name replaces an earlier one.
    uniforms_by_name = {}
    for scraped_name, pairs in player_uniform_data.items():
        key = normalize_name(scraped_name)
        if key:
            uniforms_by_name[key] = pairs

    for row_idx, record in stats_df.iterrows():
        stats_name = record['Player']
        team_code = record['Team']

        lookup_key = normalize_name(stats_name)
        if not lookup_key or lookup_key not in uniforms_by_name:
            # No scraped data for this player; leave the defaults.
            continue

        assignments = uniforms_by_name[lookup_key]
        played_for_many = len({team for _, team in assignments}) > 1

        if played_for_many and str(team_code).endswith('TM'):
            # Team codes ending in 'TM' are treated as the season-total
            # row: list every number-team pair, e.g. "3-LAL|7-BOS".
            stats_df.at[row_idx, 'Uniform'] = "|".join(
                f"{number}-{team}" for number, team in assignments
            )
            stats_df.at[row_idx, 'mult_team'] = True
            continue

        # Otherwise use the first number recorded for this row's team.
        matched_number = next(
            (number for number, team in assignments if team == team_code),
            None,
        )
        if matched_number:
            stats_df.at[row_idx, 'Uniform'] = matched_number
            stats_df.at[row_idx, 'mult_team'] = played_for_many

    return stats_df

## Main: Complete the workflow

In [None]:
def main():
    """
    Run the whole pipeline: scrape → extract → combine → export.

    Returns:
        The final DataFrame that was written to disk, or None if any step
        failed (fetching, extraction, or loading the stats CSV).
    """
    # Input and output locations.
    stats_csv_path = '/content/drive/MyDrive/Basketball-Data/baskeball-stats.csv'
    output_filename = '/content/drive/MyDrive/Basketball-Data/nba_2025_final.csv'

    # Step 1: download the page.
    page = fetch_basketball_data()
    if not page:
        print("❌ Failed to fetch HTML content")
        return None

    # Step 2: pull uniform numbers out of the HTML.
    uniforms = extract_uniform_data(page)
    if not uniforms:
        print("❌ No uniform data extracted")
        return None

    # Step 3: merge with the stats CSV; None means the CSV was not found.
    merged = create_final_csv(uniforms, stats_csv_path)
    if merged is None:
        return None

    # Step 4: write the result without the DataFrame index.
    merged.to_csv(output_filename, index=False)
    print(f"✅ Successfully exported {len(merged)} records to: {output_filename}")

    return merged

# Execute the workflow when this code runs as the main module, and keep the
# result (the final DataFrame, or None if a step failed) in basketball_data.
if __name__ == "__main__":
    basketball_data = main()

Session is created!
Parsing HTML content...
Processing uniform #00...
Processing uniform #0...
Processing uniform #1...
Processing uniform #2...
Processing uniform #3...
Processing uniform #4...
Processing uniform #5...
Processing uniform #6...
Processing uniform #7...
Processing uniform #8...
Processing uniform #9...
Processing uniform #10...
Processing uniform #11...
Processing uniform #12...
Processing uniform #13...
Processing uniform #14...
Processing uniform #15...
Processing uniform #16...
Processing uniform #17...
Processing uniform #18...
Processing uniform #19...
Processing uniform #20...
Processing uniform #21...
Processing uniform #22...
Processing uniform #23...
Processing uniform #24...
Processing uniform #25...
Processing uniform #26...
Processing uniform #27...
Processing uniform #28...
Processing uniform #29...
Processing uniform #30...
Processing uniform #31...
Processing uniform #32...
Processing uniform #33...
Processing uniform #34...
Processing uniform #35...
Proc