# Pull NBA Standings Data 
##### Web scraper that pulls NBA regular-season standings.

### Interface Options

In [134]:
#CSV file that is read for previously scraped seasons and overwritten with the updated standings
file_path = "nba_standings.csv"

#First season year (inclusive) that the scraper will consider
start_year = 1960

---

### Import Libraries

In [135]:
import os
import pandas as pd
import numpy as np
import string
from datetime import datetime
from urllib.request import urlopen
from bs4 import BeautifulSoup

---

### Create Functions to Scrape and Clean the Data

In [143]:
def get_current_season_year(today=None):
    """
    Determine the current NBA season year and whether it should be processed.

    NBA seasons run from October through April.
    - If the date is in May or later, the season year is the *next* calendar year.
    - Processing is skipped when the month is May through October.

    Args:
        today (datetime, optional): Date to evaluate. Any object exposing
            ``year`` and ``month`` works. Defaults to ``datetime.today()``.

    Returns:
        tuple:
            - year (int): The NBA season year (e.g., 2025).
            - process_season (bool): True if the current season should be processed
              (November–April), False otherwise (May–October).
    """

    #Identify Date (fall back to the system clock when none is supplied)
    if today is None:
        today = datetime.today()

    #Identify Year: from May onward the upcoming season is labelled with the next calendar year
    year = today.year + 1 if today.month >= 5 else today.year

    #Identify Season: May–October is treated as out of season, so it is not processed
    process_season = not 5 <= today.month <= 10

    #Return Objects
    return year, process_season


def load_existing_data(file_path, min_size_kb=1):
    """
    Load existing NBA standings data from CSV if it exists and is large enough.

    Args:
        file_path (str): Path to the CSV file containing standings.
        min_size_kb (int, optional): File size in kilobytes that the file must
                                     exceed to be considered valid. Defaults to 1 KB.

    Returns:
        tuple:
            - DataFrame: Existing standings data, or an empty DataFrame if not found/invalid.
            - set[int]: Unique season years already present in the data.

    Raises:
        KeyError: If the file is loaded but has no "Year" column.
    """

    #Identify File: a missing file, or one no larger than the threshold, counts as "no data"
    min_size_bytes = min_size_kb * 1024
    if not os.path.exists(file_path) or os.path.getsize(file_path) <= min_size_bytes:
        return pd.DataFrame(), set()

    #Load Dataframe and collect the seasons it already contains (as plain Python values)
    df = pd.read_csv(file_path)
    return df, set(df["Year"].unique().tolist())


def fetch_standings_html(year):
    """
    Fetch the NBA standings HTML page for a given season year.

    Args:
        year (int): The NBA season year (e.g., 2025 for the 2024–25 season).

    Returns:
        BeautifulSoup: Parsed HTML content of the standings page.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved (raised by ``urlopen``).
    """

    #Scrape Basketball Reference
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"

    #Read the whole body inside a context manager so the HTTP response is always closed
    with urlopen(url) as response:
        html = response.read()

    #Return Results
    return BeautifulSoup(html, features="html.parser")


def parse_standings(soup, year):
    """
    Parse a standings HTML page into a cleaned pandas DataFrame.

    Args:
        soup (BeautifulSoup): Parsed HTML of the standings page.
        year (int): The NBA season year.

    Returns:
        DataFrame: Cleaned standings data with columns:
            ['Team', 'W', 'L', 'WL_pct', 'GB', 'PPG', 'OPPG', 'SRS', 'Year', 'Conference'].
            The seven stat columns ('W' through 'SRS') are numeric; values that
            cannot be parsed as numbers become NaN. An empty DataFrame with the
            same columns is returned when the page yields no rows.

    Notes:
        - For 1971 and later, only keeps rows with exactly 8 cells; earlier
          seasons keep every row so the single-cell header rows can label
          the teams below them.
        - Treats a dash in 'GB' as 0 and strips punctuation from team names.
        - Infers and forward-fills conference names.
        - Removes conference and division header rows.
    """

    #Fixed output names for the eight scraped cells of a team row
    columns = ['Team', 'W', 'L', 'WL_pct', 'GB', 'PPG', 'OPPG', 'SRS']
    stat_columns = columns[1:]

    #Identify Rows (the first <tr> on the page is skipped)
    #NOTE(review): if that first <tr> is the first conference/division header, the teams
    #below it get no 'Conference' value from the forward fill — confirm against a live page
    rows = soup.find_all('tr')[1:]

    #Collect the cell text of every row
    standings = [
        [cell.get_text() for cell in row.find_all(['th', 'td'])]
        for row in rows
    ]

    #Keep only valid rows
    if year >= 1971:
        standings = [cells for cells in standings if len(cells) == len(columns)]

    #Nothing to parse: return an empty frame that still carries the full schema
    if not standings:
        return pd.DataFrame(columns=columns + ['Year', 'Conference'])

    #Create Dataframe (rows shorter than 8 cells are padded with missing values)
    df = pd.DataFrame(standings, columns=columns)

    #Create Year
    df['Year'] = year

    #Clean GB: the leader's dash means zero games behind (kept as text until the numeric conversion)
    df['GB'] = df['GB'].replace({'—': '0', '-': '0'})

    #Clean Team Names
    df['Team'] = df['Team'].str.strip(string.punctuation)

    #Set Conference from the header rows and Forward Fill it onto the team rows below
    group_names = ['Eastern Conference', 'Western Conference', 'Eastern Division', 'Western Division']
    df['Conference'] = df['Team'].where(df['Team'].isin(group_names)).ffill()

    #Remove Rows not related to a team
    df = df[~df['Team'].str.contains(r'Conference|Division', na=False, case=False)]

    #Remove observations where 'W' is null or is the repeated column header.
    #These are observations that are not related to a team's performance
    df = df[df['W'].notna()]
    df = df[df['W'] != 'W']

    #Drop Team Duplicates (the first occurrence of each team is kept)
    df = df.drop_duplicates(subset=['Team'])

    #Convert the stat columns to numbers so they sort numerically and match data reloaded from CSV
    df = df.assign(**{name: pd.to_numeric(df[name], errors='coerce') for name in stat_columns})

    #Return Dataframe
    return df


def main():
    """
    Main script entry point for scraping and updating NBA standings data.

    Workflow:
        1. Determine the current season year and processing eligibility.
        2. Load existing data (if available).
        3. Loop through season years from start_year to current_season_year.
        4. Skip already-processed years (except eligible current season).
        5. Fetch and parse standings HTML.
        6. Replace any stored rows for a re-scraped season, append the new
           data, sort once, and save the updated CSV file.

    Reads the module-level ``start_year`` and ``file_path`` settings.
    """

    #Run Function to Identify Years to Process
    current_season_year, process_current_season = get_current_season_year()

    #Run Function to Load Existing Data
    existing_df, existing_years = load_existing_data(file_path, min_size_kb=1)

    #Begin Scraping each Year, collecting the new seasons so they can be merged in one pass
    new_frames = []
    for year in range(start_year, current_season_year + 1):
        is_current_season = year == current_season_year

        if year in existing_years:
            if not is_current_season or not process_current_season:
                print(f"Skipping {year} — already processed or not in season.")
                continue

        if is_current_season and not process_current_season:
            print(f"Skipping {year} — season hasn't started yet.")
            continue

        print(f"Processing {year}...")
        soup = fetch_standings_html(year)
        standings_df = parse_standings(soup, year)

        #Keep whatever is already stored when the page produced no usable rows
        if standings_df.empty:
            print(f"No standings rows found for {year} — keeping existing data.")
            continue

        #A re-scraped season replaces its stored rows instead of being appended as duplicates
        if year in existing_years:
            existing_df = existing_df[existing_df['Year'] != year]

        new_frames.append(standings_df)

    #Merge and sort once, after all requested seasons have been scraped
    if new_frames:
        frames = [frame for frame in [existing_df, *new_frames] if not frame.empty]
        existing_df = pd.concat(frames, ignore_index=True)

        #Compare 'W' numerically: scraped values may be text while values reloaded from CSV are numbers
        existing_df = existing_df.sort_values(
            by=['Year', 'Conference', 'W'],
            ascending=[False, False, False],
            key=lambda col: pd.to_numeric(col, errors='coerce') if col.name == 'W' else col,
        )

    #Update the Standings File
    existing_df.to_csv(file_path, index=False)
    print(f"Updated {file_path}")
    print("Updated nba_standings.csv")

In [144]:
#Run the main function to scrape standings, but only when this code is executed directly
#(the guard keeps main() from running if the code is imported as a module)
if __name__ == "__main__":
    main()

Skipping 1960 — already processed or not in season.
Skipping 1961 — already processed or not in season.
Skipping 1962 — already processed or not in season.
Skipping 1963 — already processed or not in season.
Skipping 1964 — already processed or not in season.
Skipping 1965 — already processed or not in season.
Skipping 1966 — already processed or not in season.
Skipping 1967 — already processed or not in season.
Skipping 1968 — already processed or not in season.
Skipping 1969 — already processed or not in season.
Processing 1970...
Skipping 1971 — already processed or not in season.
Skipping 1972 — already processed or not in season.
Skipping 1973 — already processed or not in season.
Skipping 1974 — already processed or not in season.
Skipping 1975 — already processed or not in season.
Skipping 1976 — already processed or not in season.
Skipping 1977 — already processed or not in season.
Skipping 1978 — already processed or not in season.
Skipping 1979 — already processed or not in s

In [138]:
#The End