In [1]:
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

# Load environment variables from a .env file before get_nba_db() reads the
# NBA_DB_* settings (presumably defined there - confirm against your .env).
load_dotenv()

def get_nba_db():
    """
    Build a SQLAlchemy engine for the NBA PostgreSQL database.

    Connection settings are read from the environment variables NBA_DB_HOST,
    NBA_DB_PORT, NBA_DB_NAME, NBA_DB_USER and NBA_DB_PASSWORD.

    Returns:
        The object returned by ``create_engine`` for the assembled
        ``postgresql://`` URL.

    Raises:
        ValueError: If NBA_DB_HOST, NBA_DB_PORT, NBA_DB_NAME or NBA_DB_USER
            is not set. NBA_DB_PASSWORD may be unset, in which case the URL
            carries no password part.
    """
    # Local import keeps this definition self-contained within its cell.
    from urllib.parse import quote

    required = ("NBA_DB_HOST", "NBA_DB_PORT", "NBA_DB_NAME", "NBA_DB_USER")
    settings = {name: os.getenv(name) for name in required}

    # Fail early with a readable message instead of silently building a URL
    # that contains the literal text "None".
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise ValueError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters.
    credentials = quote(settings["NBA_DB_USER"], safe="")
    password = os.getenv("NBA_DB_PASSWORD")
    if password is not None:
        credentials += ":" + quote(password, safe="")

    host = settings["NBA_DB_HOST"]
    port = settings["NBA_DB_PORT"]
    database = settings["NBA_DB_NAME"]

    connection_string = f"postgresql://{credentials}@{host}:{port}/{database}"
    return create_engine(connection_string)

def query(sql):
    """
    Executes SQL query against the NBA database and returns results as a pandas DataFrame.

    Args:
        sql: SQL text (or any other query object accepted by ``pd.read_sql``).

    Returns:
        pandas.DataFrame: The result of ``pd.read_sql`` for ``sql``.
    """
    engine = get_nba_db()
    try:
        return pd.read_sql(sql, engine)
    finally:
        # A fresh engine is created on every call; dispose of it so its
        # pooled connections are closed instead of accumulating per query.
        engine.dispose()

# Function to display available tables
def list_tables():
    """Return the names of all tables in the database's ``public`` schema as a list."""
    public_tables = query(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"
    )
    name_column = public_tables['table_name']
    return name_column.tolist()

In [2]:
# Display the names of the tables found in the database's public schema.
list_tables()

['common_player_info',
 'draft_combine_stats',
 'draft_history',
 'game',
 'game_info',
 'game_summary',
 'games',
 'inactive_players',
 'leagueschedule24_25',
 'leagueschedule25_26',
 'line_score',
 'officials',
 'other_stats',
 'play_by_play',
 'player',
 'playeroftheweek',
 'playeroftheweek2',
 'players',
 'playerstatistics',
 'team',
 'team_details',
 'team_history',
 'team_info_common',
 'teamhistories',
 'teamstatistics']

In [3]:
# Weekly per-player box-score totals for ISO year 2025 (first five rows).
#
# * EXTRACT('week') yields the ISO-8601 week number, so it is paired with
#   EXTRACT('isoyear') rather than the calendar year. With the calendar year,
#   games played on the last days of December (which belong to ISO week 1 of
#   the following year) would be merged into the same "week 1" bucket as the
#   games played in early January.
# * personid is part of the grouping key so that two different players who
#   share a first and last name are not summed together.
# * ORDER BY makes the rows returned by LIMIT deterministic.
query("""
WITH BaseData AS (

SELECT
firstname
,lastname
,CAST(personid AS INT) AS personid
,CAST(gameid AS INT) AS gameid
,CAST(gamedate AS DATE) AS gamedate
,CAST(EXTRACT('week' FROM CAST(gamedate AS DATE)) AS INT) AS week
,CAST(EXTRACT('month' FROM CAST(gamedate AS DATE)) AS INT) AS month
,CAST(EXTRACT('year' FROM CAST(gamedate AS DATE)) AS INT) AS year
,CAST(EXTRACT('isoyear' FROM CAST(gamedate AS DATE)) AS INT) AS isoyear
,CONCAT(CAST(EXTRACT('week' FROM CAST(gamedate AS DATE)) AS INT) , '-' ,CAST(EXTRACT('isoyear' FROM CAST(gamedate AS DATE)) AS INT)) AS weekyear
,playerteamcity
,playerteamname
,opponentteamcity
,opponentteamname
,gametype
,gamelabel
,gamesublabel
,seriesgamenumber
,CAST(win AS INT) AS win
,CAST(home AS INT) AS home
,numminutes
,CAST(points AS INT) AS points
,CAST(assists AS INT) AS assists
,CAST(blocks AS INT) AS blocks
,CAST(steals AS INT) AS steals
,CAST(fieldgoalsmade AS INT) AS fieldgoalsmade
,CAST(fieldgoalsattempted AS INT) AS fieldgoalsattempted
,fieldgoalspercentage
,CAST(threepointersmade AS INT) AS threepointersmade
,CAST(threepointersattempted AS INT) AS threepointersattempted
,threepointerspercentage
,CAST(freethrowsmade AS INT) AS freethrowsmade
,CAST(freethrowsattempted AS INT) AS freethrowsattempted
,freethrowspercentage
,CAST(reboundsoffensive AS INT) AS reboundsoffensive
,CAST(reboundsdefensive AS INT) AS reboundsdefensive
,CAST(reboundstotal AS INT) AS reboundstotal
,CAST(foulspersonal AS INT) AS foulspersonal
,CAST(turnovers AS INT) AS turnovers
,CAST(plusminuspoints AS INT) AS plusminuspoints

FROM playerstatistics

)

SELECT

firstname
,lastname
,week
,playerteamname
,SUM(numminutes) AS numminutes
,SUM(points) AS points
,SUM(assists) AS assists
,SUM(blocks) AS blocks
,SUM(steals) AS steals
,SUM(fieldgoalsmade) AS fieldgoalsmade
,SUM(fieldgoalsattempted) AS fieldgoalsattempted
,SUM(threepointersmade) AS threepointersmade
,SUM(threepointersattempted) AS threepointersattempted
,SUM(freethrowsmade) AS freethrowsmade
,SUM(freethrowsattempted) AS freethrowsattempted
,SUM(reboundsoffensive) AS reboundsoffensive
,SUM(reboundsdefensive) AS reboundsdefensive
,SUM(reboundstotal) AS reboundstotal
,SUM(foulspersonal) AS foulspersonal
,SUM(turnovers) AS turnovers
,SUM(plusminuspoints) AS plusminuspoints

FROM BaseData
WHERE isoyear = 2025
GROUP BY personid, firstname, lastname, week, playerteamname
ORDER BY firstname, lastname, personid, week, playerteamname
LIMIT 5

""")

Unnamed: 0,firstname,lastname,week,playerteamname,numminutes,points,assists,blocks,steals,fieldgoalsmade,...,threepointersmade,threepointersattempted,freethrowsmade,freethrowsattempted,reboundsoffensive,reboundsdefensive,reboundstotal,foulspersonal,turnovers,plusminuspoints
0,Aaron,Gordon,2,Nuggets,18.22,13,2,1,0,6,...,1,2,0,0,1,5,6,0,1,23
1,Aaron,Gordon,3,Nuggets,60.86,33,3,1,1,10,...,1,7,12,13,2,10,12,4,6,17
2,Aaron,Gordon,4,Nuggets,63.11,35,6,0,2,12,...,6,11,5,6,3,2,5,3,4,6
3,Aaron,Gordon,5,Nuggets,113.82,38,10,1,2,16,...,2,8,4,4,6,8,14,6,5,24
4,Aaron,Gordon,6,Nuggets,77.06,27,23,2,1,7,...,3,8,10,13,4,13,17,4,1,40


In [4]:
#!/usr/bin/env python3
"""
NBA Player of the Week Scraper - Improved Version

This script scrapes the NBA Player of the Week data from basketball.realgm.com
with improved table detection methods.
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
import time
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

# Load environment variables from a .env file before get_nba_db() reads the
# NBA_DB_* settings (presumably defined there - confirm against your .env).
load_dotenv()

def get_nba_db():
    """
    Build a SQLAlchemy engine for the NBA PostgreSQL database.

    Connection settings are read from the environment variables NBA_DB_HOST,
    NBA_DB_PORT, NBA_DB_NAME, NBA_DB_USER and NBA_DB_PASSWORD.

    Returns:
        The object returned by ``create_engine`` for the assembled
        ``postgresql://`` URL.

    Raises:
        ValueError: If NBA_DB_HOST, NBA_DB_PORT, NBA_DB_NAME or NBA_DB_USER
            is not set. NBA_DB_PASSWORD may be unset, in which case the URL
            carries no password part.
    """
    # Local import keeps this definition self-contained within its cell.
    from urllib.parse import quote

    required = ("NBA_DB_HOST", "NBA_DB_PORT", "NBA_DB_NAME", "NBA_DB_USER")
    settings = {name: os.getenv(name) for name in required}

    # Fail early with a readable message instead of silently building a URL
    # that contains the literal text "None".
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise ValueError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters.
    credentials = quote(settings["NBA_DB_USER"], safe="")
    password = os.getenv("NBA_DB_PASSWORD")
    if password is not None:
        credentials += ":" + quote(password, safe="")

    host = settings["NBA_DB_HOST"]
    port = settings["NBA_DB_PORT"]
    database = settings["NBA_DB_NAME"]

    connection_string = f"postgresql://{credentials}@{host}:{port}/{database}"
    return create_engine(connection_string)

# Create the module-level SQL engine; main() hands it to DataFrame.to_sql
# when writing the scraped table.
engine = get_nba_db()

def scrape_nba_potw():
    """
    Scrape the NBA Player of the Week data from basketball.realgm.com

    The response body is written to ``webpage.html`` in the current working
    directory whenever a response is received (including HTTP error
    responses), so the page can be inspected when scraping fails.

    Returns:
        pandas.DataFrame: DataFrame containing the scraped data, or None when
        the page could not be fetched or no usable table / data rows were
        found.
    """
    # URL of the page to scrape
    url = "https://basketball.realgm.com/nba/awards/by-type/player-of-the-week/30"

    # Browser-like User-Agent sent with the request
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Without a timeout requests may wait indefinitely on an unresponsive host.
    timeout_seconds = 30

    print(f"Fetching data from {url}...")
    try:
        response = requests.get(url, headers=request_headers, timeout=timeout_seconds)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

    # Print status code for debugging
    print(f"Response status code: {response.status_code}")

    # Save the body before checking the status so that error pages (for
    # example a 403 block page) are also available for inspection.
    with open("webpage.html", "w", encoding="utf-8") as f:
        f.write(response.text)
    print("Saved HTML to webpage.html for inspection")

    try:
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    table = _find_potw_table(soup)
    if table is None:
        print("Could not find a suitable table. Please inspect the HTML manually.")
        return None

    parsed = _extract_table_rows(table)
    if parsed is None:
        return None
    column_names, data = parsed

    print(f"Extracted {len(data)} rows of data")

    # Create a pandas DataFrame from the extracted data
    df = pd.DataFrame(data, columns=column_names)
    return _coerce_column_types(df)


def _find_potw_table(soup):
    """Return the table element holding the award data, or None if no candidate is found."""
    # Method 1: By class
    table = soup.find('table', class_='tablesaw')
    if table is not None:
        return table

    # Method 2: Look for any table that contains the expected headers
    print("Trying to find table by headers...")
    all_tables = soup.find_all('table')
    for candidate in all_tables:
        header_texts = [th.text.strip() for th in candidate.find_all('th')]
        if 'Player' in header_texts and 'Season' in header_texts and 'Date' in header_texts:
            print("Found table by headers!")
            return candidate

    # Method 3: list every table for debugging and take the first one that is
    # large enough (more than 5 rows and more than 5 header cells).
    print(f"Could not find the Player of the Week table. Found {len(all_tables)} tables on the page.")
    if all_tables:
        print("Tables found on the page:")
        for i, candidate in enumerate(all_tables):
            header_texts = [th.text.strip() for th in candidate.find_all('th')]
            row_count = len(candidate.find_all('tr'))
            print(f"Table {i+1}: {row_count} rows, Headers: {header_texts[:5]}{'...' if len(header_texts) > 5 else ''}")

            if row_count > 5 and len(header_texts) > 5:
                print(f"Table {i+1} looks promising. Attempting to use it.")
                return candidate

    return None


def _extract_table_rows(table):
    """Return ``(column_names, data_rows)`` read from ``table``, or None if nothing usable is found."""

    def cell_texts(tr):
        # Some tables use td for headers or mix th and td in data rows.
        return [cell.text.strip() for cell in tr.find_all(['th', 'td'])]

    rows = table.find_all('tr')
    header_index = 0
    column_names = cell_texts(rows[0]) if rows else []

    # If the first row does not look like the header row, search the
    # remaining rows for one that contains the expected 'Player' column.
    if not column_names or 'Player' not in column_names:
        print("Headers don't look right. Trying another approach...")
        for idx in range(1, len(rows)):
            candidate_names = cell_texts(rows[idx])
            if 'Player' in candidate_names:
                header_index = idx
                column_names = candidate_names
                break
        else:
            # No better header row exists; keep the first row's cells as
            # column names, provided there is at least one row after it.
            if len(rows) <= 1 or not column_names:
                print("Could not extract headers")
                return None

    print(f"Found headers: {column_names}")

    # Extract the table data from the rows that follow the header row
    width = len(column_names)
    data = []
    for tr in rows[header_index + 1:]:
        row = cell_texts(tr)
        # Only keep rows with enough columns, trimmed to match the header count
        if len(row) >= width:
            data.append(row[:width])

    if not data:
        print("Could not extract any data rows")
        return None

    return column_names, data


def _coerce_column_types(df):
    """Convert likely numeric and date columns of ``df`` in place (best effort) and return ``df``."""
    # List of likely column names for each type
    numeric_columns = [
        'Weight', 'Age', 'Draft Yr', 'YOS', 'Year', 'Years',
        'Points', 'Rebounds', 'Assists', 'Steals', 'Blocks'
    ]
    date_columns = ['Date', 'Award Date', 'Week Of']

    # A column is converted when its name contains one of the keywords.
    for col in df.columns:
        if any(nc in col for nc in numeric_columns):
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            except Exception:
                # Best effort: leave the column as text if conversion fails.
                print(f"Could not convert {col} to numeric")

    for col in df.columns:
        if any(dc in col for dc in date_columns):
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
            except Exception:
                # Best effort, but unlike a bare except this does not swallow
                # KeyboardInterrupt / SystemExit.
                print(f"Could not convert {col} to datetime")

    return df

def save_to_csv(df, filename=None):
    """
    Write ``df`` to a CSV file without its index.

    Args:
        df (pandas.DataFrame): Data to write. When None, a message is printed
            and nothing is written.
        filename (str, optional): Output path. When None, a name of the form
            ``nba_player_of_the_week_<YYYYmmdd_HHMMSS>.csv`` is generated from
            the current local time.
    """
    if df is None:
        print("No data to save")
        return

    # Fall back to a timestamped name only when the caller gave none.
    if filename is not None:
        target = filename
    else:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        target = f"nba_player_of_the_week_{stamp}.csv"

    df.to_csv(target, index=False)
    print(f"Data saved to {target}")

def main():
    """
    Run the scraper end to end.

    Pauses for two seconds, calls ``scrape_nba_potw()`` and, when it returns
    a DataFrame, prints a short preview and writes the data to the
    ``playeroftheweek`` table through the module-level ``engine``, replacing
    any existing table. When scraping fails, troubleshooting tips are printed
    instead.
    """
    print("=== NBA Player of the Week Scraper - Improved Version ===")

    # Short pause before issuing the request
    print("Waiting a few seconds before scraping...")
    time.sleep(2)

    scraped = scrape_nba_potw()

    # Failure path first: report and stop.
    if scraped is None:
        failure_messages = (
            "Failed to scrape the data.",
            "\nTroubleshooting tips:",
            "1. Open the saved webpage.html file in a browser",
            "2. Use browser developer tools (F12) to inspect the table",
            "3. Look for the table's class, id, or other identifying attributes",
            "4. Update the script with the correct selectors",
        )
        for message in failure_messages:
            print(message)
        return

    # Preview of the scraped rows
    print("\nFirst few rows of the data:")
    print(scraped.head())

    # Basic facts about the DataFrame
    print("\nDataFrame information:")
    print(f"Shape: {scraped.shape}")
    print(f"Columns: {scraped.columns.tolist()}")

    # CSV export via save_to_csv(...) is currently disabled; the data goes to
    # the Postgres table instead.
    scraped.to_sql(
        name='playeroftheweek',   # target table in the database
        con=engine,               # the SQLAlchemy engine
        if_exists='replace',      # replace the table if it already exists
        index=False,              # do not write the DataFrame index as a column
    )

# Run the scraper when this code is executed directly; the output recorded
# below this cell shows that the guard also passes when the cell is run in
# the notebook.
if __name__ == "__main__":
    main()

=== NBA Player of the Week Scraper - Improved Version ===
Waiting a few seconds before scraping...
Fetching data from https://basketball.realgm.com/nba/awards/by-type/player-of-the-week/30...
Error fetching the webpage: 403 Client Error: Forbidden for url: https://basketball.realgm.com/nba/awards/by-type/player-of-the-week/30
Failed to scrape the data.

Troubleshooting tips:
1. Open the saved webpage.html file in a browser
2. Use browser developer tools (F12) to inspect the table
3. Look for the table's class, id, or other identifying attributes
4. Update the script with the correct selectors


In [None]:
#pd.read_csv('nba_player_of_the_week_20251101_192256.csv')