In [1]:
"""
Simplified NBA Player Lookup Table GCS Export Script

This standalone script exports NBA player lookup tables to Google Cloud Storage.
It's a simplified version that can be run independently from the main pipeline.
"""

import os
import sys
import logging
import pandas as pd
import tempfile
from datetime import datetime
from google.cloud import storage
from nba_api.stats.static import players
import duckdb

# --- Google Cloud Storage settings (same values as the provided pipeline script) ---
GCS_BUCKET_NAME = "nba_award_predictor"
GCS_PREFIX = "nba_data/"
# Service-account key file, given relative to the working directory.
# Alternative absolute location that was previously configured here:
#   /root/nba-pipeline/cis-5450-final-project-485661e2f371.json
CREDENTIALS_FILE = "cis-5450-final-project-485661e2f371.json"

# --- Logging: INFO and above, timestamped, sent to stdout ---
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def get_player_data(active_only=False):
    """Fetch the NBA player roster and return it as a lookup DataFrame.

    Pulls either the active roster or the full roster from the nba_api static
    ``players`` module, then runs a DuckDB query that substitutes hand-written
    display names for a fixed list of player ids (the Sr./Jr. and
    middle-initial variants listed in the SQL below).

    Args:
        active_only: When True, fetch only active players; otherwise fetch
            every player.

    Returns:
        DataFrame with the columns player_id, player_name, first_name,
        last_name and is_active.
    """
    print(f"Fetching {'active' if active_only else 'all'} NBA players...")

    # Choose the API call first, then invoke it once.
    fetch_players = players.get_active_players if active_only else players.get_players

    # The variable name ``df_raw`` is load-bearing: the SQL below refers to it
    # by name and DuckDB resolves it to this local DataFrame.
    df_raw = pd.DataFrame(fetch_players()).rename(
        columns={'id': 'player_id', 'full_name': 'player_name'}
    )

    # NOTE(review): the ids on the two Dunleavy rows (2399 / 76616) look
    # swapped between Sr. and Jr. -- confirm against the NBA API before
    # relying on them.
    query = """
    
    WITH SpecificNames AS (
    SELECT 2399 AS player_id, 'Mike Dunleavy Sr.' AS player_full_name
    UNION ALL
    SELECT 76616, 'Mike Dunleavy Jr.'
    UNION ALL
    SELECT 121, 'Patrick Ewing Sr.'
    UNION ALL
    SELECT 201607, 'Patrick Ewing Jr.'
    UNION ALL
    SELECT 779, 'Glen Rice Sr.'
    UNION ALL
    SELECT 203318, 'Glen Rice Jr.'
    UNION ALL
    SELECT 77144, 'Eddie L. Johnson'
    UNION ALL
    SELECT 698, 'Eddie A. Johnson'
    UNION ALL
    SELECT 77156, 'Larry O. Johnson'
    UNION ALL
    SELECT 913, 'Larry D. Johnson'
    UNION ALL
    SELECT 200848, 'Steven A. Smith'
    UNION ALL
    SELECT 120, 'Steven D. Smith'
    UNION ALL
    SELECT 2229, 'Mike L. James'
    UNION ALL
    SELECT 1628455, 'Mike P. James'
    UNION ALL
    SELECT 77193, 'Bobby C. Jones'
    UNION ALL
    SELECT 77819, 'Jim Paxson Jr.'

    )

    SELECT df_raw.player_id
    ,CASE
        WHEN df_raw.player_id = SpecificNames.player_id THEN SpecificNames.player_full_name
        ELSE df_raw.player_name
    END AS player_name
    ,df_raw.first_name
    ,df_raw.last_name
    ,df_raw.is_active
     
    FROM df_raw
    LEFT JOIN SpecificNames
    ON df_raw.player_id = SpecificNames.player_id
    
    """

    return duckdb.query(query).df()

def export_to_gcs(df, filename):
    """Export a DataFrame to Google Cloud Storage as a CSV object.

    The frame is written to a local temporary CSV file, uploaded to
    ``gs://{GCS_BUCKET_NAME}/{GCS_PREFIX}{filename}``, and the temporary file
    is removed afterwards -- including when the write or the upload fails.

    Args:
        df: DataFrame to export (written without its index).
        filename: Object name to create under GCS_PREFIX.

    Raises:
        Whatever the CSV write or the google-cloud-storage client raises; the
        error propagates after the temporary file has been cleaned up.
    """
    # Point the client library at the key file only when it is present;
    # otherwise the environment is left as it is.
    if os.path.exists(CREDENTIALS_FILE):
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CREDENTIALS_FILE

    # Reserve a unique path, then close the handle before anything re-opens the
    # file by name: re-opening a still-open NamedTemporaryFile is not portable.
    with tempfile.NamedTemporaryFile(suffix='.csv', delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        df.to_csv(temp_path, index=False)

        # Upload to GCS
        print(f"Uploading to gs://{GCS_BUCKET_NAME}/{GCS_PREFIX}{filename}...")
        storage_client = storage.Client()
        bucket = storage_client.bucket(GCS_BUCKET_NAME)
        blob = bucket.blob(f"{GCS_PREFIX}{filename}")
        blob.upload_from_filename(temp_path)
    finally:
        # delete=False means nothing else removes the file, so always clean up
        # here, even when the write or upload raised.
        os.remove(temp_path)

    print(f"✓ Uploaded {len(df)} records")

def main():
    """Fetch the player tables and upload the full lookup table to GCS.

    Fetches all players and the active players (the latter only to report how
    many there are), uploads the full table as ``nba_player_lookup.csv`` and
    prints a summary. Any exception is reported on stdout and the process
    exits with status 1.
    """
    print("\n=== NBA Player Lookup Table GCS Export ===\n")

    try:
        # Get all players
        all_players = get_player_data(active_only=False)
        print(f"Found {len(all_players)} total NBA players")

        # Get active players (used for the count only; not exported)
        active_players = get_player_data(active_only=True)
        print(f"Found {len(active_players)} active NBA players")

        # Export to GCS
        export_to_gcs(all_players, "nba_player_lookup.csv")

        print("\n✓ Export completed successfully!\n")
        print(f"Files exported to gs://{GCS_BUCKET_NAME}/{GCS_PREFIX}:")
        print(f"  - nba_player_lookup.csv ({len(all_players)} records)")

    except Exception as e:
        print(f"\n❌ ERROR: {str(e)}")
        sys.exit(1)

if __name__ == "__main__":
    main()


=== NBA Player Lookup Table GCS Export ===

Fetching all NBA players...
Found 5135 total NBA players
Fetching active NBA players...
Found 571 active NBA players
Uploading to gs://nba_award_predictor/nba_data/nba_player_lookup.csv...
✓ Uploaded 5135 records

✓ Export completed successfully!

Files exported to gs://nba_award_predictor/nba_data/:
  - nba_player_lookup.csv (5135 records)


In [19]:
import wget

# Download the Player of the Week CSV from the bucket's public URL.
filename = 'playeroftheweek.csv'
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
# wget.download does not overwrite an existing file: when `filename` is already
# present (e.g. this cell is re-run) the new copy is saved under a different
# name. Read the path it returns so the frame always comes from the file that
# was just downloaded rather than a stale earlier copy.
playeroftheweek_path = wget.download(url)
# Read in the playeroftheweek csv
playeroftheweek_df = pd.read_csv(playeroftheweek_path)

In [20]:
# Preview the first rows of the Player of the Week table.
playeroftheweek_df.head()


Unnamed: 0,season,player,conference,date,team,pos,height,weight,age,Pre-Draft Team,Draft Yr,yos
0,2025-2026,Cade Cunningham,East,2025-11-10,Detroit Pistons,SF,6-6,220,24,Oklahoma State,2021,4
1,2025-2026,Nikola Jokic,West,2025-11-10,Denver Nuggets,C,6-11,284,31,KK Mega Bemax (Serbia),2014,10
2,2025-2026,Tyrese Maxey,East,2025-11-03,Philadelphia Sixers,PG,6-2,200,25,Kentucky,2020,5
3,2025-2026,Shai Gilgeous-Alexander,West,2025-11-03,Oklahoma City Thunder,PG,6-6,195,27,Kentucky,2018,7
4,2025-2026,Victor Wembanyama,West,2025-10-27,San Antonio Spurs,F,7-4,235,22,Boulogne-Levallois (France),2023,2


In [93]:
# Load the name-mapping table, which pairs a name as it appears in another
# table (in_table_name) with the spelling used by the NBA API lookup
# (nba_lookup_name).
# The object name carries no query string: the former "?authuser=4" suffix was
# a browser-session parameter and made `filename` differ from the file on disk.
filename = 'name_mappings.csv'
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
# wget.download does not overwrite an existing file, so on a re-run the fresh
# copy is saved under a different name; read the path it returns instead of
# assuming the name.
name_mapping_path = wget.download(url)
# Read in the name_mappings csv
name_mapping_df = pd.read_csv(name_mapping_path)

In [94]:
import pandas as pd
import unicodedata
import re
import duckdb
from io import StringIO
from google.cloud import storage
import wget
import os

# Letters that Unicode normalization does NOT split into "base letter +
# combining mark", so they need an explicit replacement.
_SPECIAL_CHAR_TABLE = str.maketrans({
    'Đ': 'D', 'đ': 'd',    # Serbian/Croatian D with stroke
    'Ł': 'L', 'ł': 'l',    # Polish L with stroke
    'Ø': 'O', 'ø': 'o',    # Danish/Norwegian O with stroke
    'Ŧ': 'T', 'ŧ': 't',    # Sami T with stroke
    'Æ': 'AE', 'æ': 'ae',  # Æ/æ ligature
    'Œ': 'OE', 'œ': 'oe',  # Œ/œ ligature
    'ß': 'ss',             # German eszett
    'ı': 'i',              # Turkish dotless i (has no decomposition)
})


def remove_accents(text):
    """
    Remove accent marks from input text while preserving the base characters.
    Also handles special characters like Đ/đ that have no accent-free
    decomposition.

    Args:
        text: The string to clean. Non-string values (e.g. None or the NaN
            pandas uses for a missing name) are returned unchanged so the
            function can be applied to a column that contains gaps.

    Returns:
        The text with special letters replaced and combining marks removed,
        or the original value when it is not a string.

    Example:
    "Nikola Đurišić" -> "Nikola Durisic"
    """
    if not isinstance(text, str):
        return text

    # First, swap the letters that normalization cannot decompose.
    replaced = text.translate(_SPECIAL_CHAR_TABLE)

    # NFKD splits each accented character into base character + combining
    # mark(s); it also folds compatibility forms to their plain equivalents.
    decomposed = unicodedata.normalize('NFKD', replaced)

    # Drop the non-spacing marks (Unicode category "Mn"), keeping the bases.
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


# Bring in nba player lookup table to map the cleaned names to player IDs. Same player IDs from the NBA API.
filename = 'nba_player_lookup.csv'
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
# wget.download does not overwrite an existing file, so on a re-run the fresh
# copy is saved under a different name; read the path it returns so a stale
# earlier download is never picked up.
nba_player_lookup_path = wget.download(url)
# Read in the nba_player_lookup csv
nba_player_lookup_df = pd.read_csv(nba_player_lookup_path)

# Clean each player's full name
nba_player_lookup_df["player_name"] = nba_player_lookup_df["player_name"].apply(remove_accents)

In [96]:
# Attach NBA API player ids to the Player of the Week rows.
#
# Step 1 (potw_named): translate each award row's player name to the lookup
#   spelling when name_mapping_df has an entry for it (in_table_name ->
#   nba_lookup_name); otherwise keep the name as-is.
# Step 2: left-join the lookup table on the (possibly translated) name so rows
#   without a match are kept with a NULL player_id.
#
# The mapping table is joined once only. A second self-join on nba_lookup_name
# supplied no column used downstream, could only multiply award rows when
# several mapping entries share a lookup name, and made the unqualified
# nba_lookup_name reference ambiguous.
query = """
WITH potw_named AS (
    SELECT
        potw.season
        ,COALESCE(mapping.nba_lookup_name, potw.player) AS player
        ,potw.conference
        ,potw.date
        ,potw.team
        ,potw.pos
        ,potw.height
        ,potw.weight
        ,potw.age
        ,potw."Pre-Draft Team"
        ,potw."Draft Yr"
        ,potw.yos
    FROM playeroftheweek_df AS potw
    LEFT JOIN name_mapping_df AS mapping
        ON potw.player = mapping.in_table_name
)

SELECT lookup.player_id, potw_named.*
FROM potw_named
LEFT JOIN nba_player_lookup_df AS lookup
    ON potw_named.player = lookup.player_name

"""

duckdb.query(query).df()

Unnamed: 0,player_id,season,player,conference,date,team,pos,height,weight,age,Pre-Draft Team,Draft Yr,yos
0,76003,1979-1980,Kareem Abdul-Jabbar,,1979-12-09,Los Angeles Lakers,C,7-2,225,32,UCLA,1969,10
1,949,2001-2002,Shareef Abdur-Rahim,East,2001-11-25,Atlanta Hawks,F,6-9,225,25,California,1996,5
2,149,1988-1989,Michael Adams,,1988-12-11,Denver Nuggets,PG,5-10,163,26,Boston College,1985,3
3,1628389,2019-2020,Bam Adebayo,East,2019-12-16,Miami Heat,C,6-9,255,22,Kentucky,2017,2
4,76016,1983-1984,Mark Aguirre,,1983-11-27,Dallas Mavericks,SF,6-6,232,24,DePaul,1981,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1561,2548,2014-2015,Dwyane Wade,East,2015-03-23,Miami Heat,SG,6-4,220,33,Marquette,2003,11
1562,201566,2018-2019,Russell Westbrook,West,2018-11-05,Oklahoma City Thunder,G,6-4,200,30,UCLA,2008,10
1563,2548,2015-2016,Dwyane Wade,East,2016-02-01,Miami Heat,SG,6-4,220,34,Marquette,2003,12
1564,201566,2018-2019,Russell Westbrook,West,2019-04-08,Oklahoma City Thunder,G,6-4,200,30,UCLA,2008,10
