# Pull NBA Playoffs Data 
##### Web scraper that pulls NBA playoffs data from Basketball Reference.

### Interface Options

In [25]:
#First playoff year to scrape; main() processes every year from this one up to the current playoff year
start_year = 1960

#CSV file that existing results are loaded from and that the updated results are written back to
file_path = 'nba_playoffs.csv'

---

### Import Libraries

In [27]:
import os
import pandas as pd
import numpy as np
import string
from datetime import datetime
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment

---

### Create Functions to Scrape and Clean the Data

In [28]:
def get_current_playoff_year(today=None):
    """
    Determine the most recent NBA playoff year that is safe to process.

    NBA playoffs run from April through June.
    - If the date is in June or before, the current year's playoffs may not be
      completed yet, so the previous year is returned.
    - From July onward, the current year is returned.

    Returning the current year while the playoffs are still in progress would
    store a partial table that is never refreshed, because already-saved years
    are skipped on later runs.

    Args:
        today (datetime, optional): Date to evaluate. Defaults to the current
                                    local date and time.

    Returns:
        int: The latest NBA playoff year to process (e.g., 2025).
    """

    #Identify Date
    if today is None:
        today = datetime.today()

    #Identify Year: playoffs are only treated as complete once June has passed
    if today.month <= 6:
        return today.year - 1

    #Return Year
    return today.year


def load_existing_data(file_path, min_size_kb=1):
    """
    Load existing NBA playoff data from CSV if it exists and is large enough.

    Args:
        file_path (str): Path to the CSV file containing previously saved data.
        min_size_kb (int, optional): File size in kilobytes that the file must
                                     exceed to be considered valid. Defaults to 1 KB.

    Returns:
        tuple:
            - DataFrame: Existing data, or an empty DataFrame if the file is
              missing or not larger than the minimum size.
            - set: Unique values of the "Year" column already present in the data
              (empty when no data was loaded).
    """

    #A missing or too-small file is treated as "no data yet"
    min_size_bytes = min_size_kb * 1024
    if not os.path.exists(file_path) or os.path.getsize(file_path) <= min_size_bytes:
        return pd.DataFrame(), set()

    #Load the file and collect the years it already covers
    df = pd.read_csv(file_path)
    return df, set(df["Year"].unique())


def fetch_playoff_html(year):
    """
    Fetch the NBA playoff HTML page for a given season year.

    Args:
        year (int): The NBA season year (e.g., 2025 for the 2024–25 season).

    Returns:
        BeautifulSoup: Parsed HTML content of the playoff page.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved (HTTPError, a
                               subclass, is raised for HTTP error statuses).
    """

    #Scrape Basketball Reference
    url = f"https://www.basketball-reference.com/playoffs/NBA_{year}_standings.html"

    #Parse inside the with-block so the HTTP response is always closed afterwards
    with urlopen(url) as response:
        return BeautifulSoup(response, features="html.parser")


def _extract_commented_table(soup):
    """Return (headers, rows) for the first <table> embedded in an HTML comment."""

    #Find all HTML comments
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    for comment in comments:
        if '<table' not in comment:
            continue

        table = BeautifulSoup(comment, 'html.parser').find('table')
        if table is None:
            continue

        table_rows = table.find_all('tr')
        if len(table_rows) < 2:
            raise ValueError("Playoff table has no header row to read column names from.")

        #Column names live in the second <tr>
        headers = [th.get_text(strip=True) for th in table_rows[1].find_all('th')]

        #Data rows start after the header row; rows without any cells are skipped
        rows = []
        for tr in table_rows[2:]:
            cells = [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
            if cells:
                rows.append(cells)

        return headers, rows

    raise ValueError("No playoff table found in the HTML comments of the page.")


def _record_to_number(text_column):
    """Strip punctuation (e.g. the '-' of a 'W-L' record) and convert to numbers."""
    return pd.to_numeric(text_column.str.strip(string.punctuation))


def parse_playoff(soup, year):
    """
    Parse a playoff HTML page into a cleaned pandas DataFrame.

    The table is read from the first HTML comment that contains a <table>.

    Args:
        soup (BeautifulSoup): Parsed HTML of the playoff page.
        year (int): The NBA season year.

    Returns:
        DataFrame: The scraped table columns plus 'Year', 'Wins', 'Champion' and
                   'Conference_Champion'. For years up to 1983 it also contains
                   'E_Wins', 'E_Losses', 'W_Wins', 'W_Losses', 'E_Games' and
                   'W_Games'.

    Raises:
        ValueError: If no table (or no header row) is found in the HTML comments.
    """

    #Create Dataframe from the scraped table
    headers, rows = _extract_commented_table(soup)
    df = pd.DataFrame(rows, columns=headers)

    #Create Year
    df['Year'] = year

    #Create Wins from the first two characters of the 'Overall' record
    df['Wins'] = _record_to_number(df['Overall'].str[:2])

    #Identify Champion and Conference Champion by observing Win totals
    #The playoffs were restructured in 1984 to include 16 teams and no bye teams
    if year <= 1983:
        #Wins are the first two characters and losses the last two of each record
        for column in ('E', 'W'):
            df[f'{column}_Wins'] = _record_to_number(df[column].str[:2])
            df[f'{column}_Losses'] = _record_to_number(df[column].str[-2:])

        df['E_Games'] = df['E_Wins'] + df['E_Losses']
        df['W_Games'] = df['W_Wins'] + df['W_Losses']

        #Teams with games in both the E and W columns are flagged as conference champions
        df['Conference_Champion'] = np.where((df['E_Games'] >= 1) & (df['W_Games'] >= 1), 1, 0)

        #A champion needs at least 4 wins in both the E and W columns
        df['Champion'] = np.where((df['E_Wins'] >= 4) & (df['W_Wins'] >= 4), 1, 0)
    else:
        #Most wins marks the champion; the top two win totals mark the conference champions
        max_wins = df['Wins'].max()
        second_max_wins = df['Wins'].nlargest(2).iloc[1]
        df['Champion'] = np.where(df['Wins'] >= max_wins, 1, 0)
        df['Conference_Champion'] = np.where(df['Wins'] >= second_max_wins, 1, 0)

    #Return Dataframe
    return df


def main():
    """
    Main script entry point for scraping and updating NBA playoff data.

    Workflow:
        1. Determine the latest playoff year to process.
        2. Load existing data (if available).
        3. Loop through season years from start_year to that latest year.
        4. Skip years already present in the existing data.
        5. Fetch and parse the playoff HTML for each remaining year.
        6. Append the new data, sort it, and save the updated CSV file.
    """

    #Run Function to Identify Years to Process
    current_season_year = get_current_playoff_year()

    #Run Function to Load Existing Data
    existing_df, existing_years = load_existing_data(file_path, min_size_kb=1)

    #Begin Scraping each Year, collecting the new seasons before combining them
    new_frames = []
    for year in range(start_year, current_season_year + 1):
        if year in existing_years:
            print(f"Skipping {year} — already processed or not in season.")
            continue

        print(f"Processing {year}...")
        soup = fetch_playoff_html(year)
        new_frames.append(parse_playoff(soup, year))

    #Combine and sort once instead of re-copying and re-sorting on every iteration
    if new_frames:
        existing_df = pd.concat([existing_df, *new_frames], ignore_index=True)
        existing_df = existing_df.sort_values(
            by=['Year', 'Champion', 'Conference_Champion', 'Wins'],
            ascending=[False, False, False, False],
        )

    #Update the Standings File
    existing_df.to_csv(file_path, index=False)
    print(f"Updated {file_path}")

In [29]:
#Run the scraper only when this code is executed directly (not when imported as a module)
if __name__ == "__main__":
    main()

Processing 1960...
Processing 1961...
Processing 1962...
Processing 1963...
Processing 1964...
Processing 1965...
Processing 1966...
Processing 1967...
Processing 1968...
Processing 1969...
Processing 1970...
Processing 1971...
Processing 1972...
Processing 1973...
Processing 1974...
Processing 1975...
Processing 1976...
Processing 1977...
Processing 1978...
Processing 1979...
Processing 1980...
Processing 1981...
Processing 1982...
Skipping 1983 — already processed or not in season.
Skipping 1984 — already processed or not in season.
Skipping 1985 — already processed or not in season.
Skipping 1986 — already processed or not in season.
Skipping 1987 — already processed or not in season.
Skipping 1988 — already processed or not in season.
Skipping 1989 — already processed or not in season.
Skipping 1990 — already processed or not in season.
Skipping 1991 — already processed or not in season.
Skipping 1992 — already processed or not in season.
Skipping 1993 — already processed or not in

In [None]:
#The End