In [2]:
# Imports come first: the cleanup step below needs `os`. With the imports placed
# after it, a fresh kernel raised NameError inside a bare `except`, the stale CSVs
# were never removed, and wget then saved the download as "player-statistics (1).csv".
import gc
import os
import re
import unicodedata

import duckdb
import pandas as pd
import wget

# First, clear any existing large DataFrames from memory
try:
    del player_statistics_df
except NameError:
    print("No existing DataFrame found in memory")
else:
    gc.collect()
    print("Memory cleared from previous DataFrame")

# Remove old csv files if there are any. Each file is handled on its own so that a
# missing first file does not prevent the second one from being removed.
for stale_csv in ("player-statistics-new.csv", "player-statistics.csv"):
    try:
        os.remove(stale_csv)
    except FileNotFoundError:
        pass

# Download files
print("Downloading files...")
filename = 'player-statistics.csv'
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
wget.download(url)
print(f"\nDownloaded {filename}")

# Define the output file
output_file = 'player-statistics-new.csv'

# Process and save in chunks
chunk_size = 100000
# Only the first chunk is written (sample run). Set to None to process the whole file.
max_chunks = 1
first_chunk = True
processed_rows = 0

# Pass-through projection of the registered chunk; defined once because it does
# not change between iterations.
query = """
    SELECT *
    FROM player_statistics_chunk
    """

print(f"Processing data in chunks of {chunk_size} rows...")

for chunk_num, chunk in enumerate(pd.read_csv(filename, chunksize=chunk_size, low_memory=False)):

    # Stop once the requested number of chunks has been written
    if max_chunks is not None and chunk_num >= max_chunks:
        break

    # Register current chunk with DuckDB
    duckdb.register('player_statistics_chunk', chunk)

    # Execute query for this chunk
    result_chunk = duckdb.query(query).df()

    # Write to CSV (first chunk with header, subsequent chunks without)
    if first_chunk:
        result_chunk.to_csv(output_file, index=False, mode='w')
        first_chunk = False
    else:
        result_chunk.to_csv(output_file, index=False, mode='a', header=False)

    # Update progress
    processed_rows += len(result_chunk)
    print(f"Processed chunk {chunk_num+1} - Total rows: {processed_rows}")

    # Clean up to free memory
    duckdb.unregister('player_statistics_chunk')
    del chunk
    del result_chunk

print(f"All chunks processed. Total rows: {processed_rows}")
print(f"Results saved to {output_file}")


# Read in the player-statistics-new csv and keep a 20-row preview.
# low_memory=False matches the chunked read above and avoids the mixed-dtype warning.
player_statistics_df = pd.read_csv(output_file, low_memory=False).head(20)
player_statistics_df


Memory cleared from previous DataFrame
Downloading files...

Downloaded player-statistics.csv
Processing data in chunks of 100000 rows...
Processed chunk 1 - Total rows: 100000
All chunks processed. Total rows: 100000
Results saved to player-statistics-new.csv


  player_statistics_df = pd.read_csv(output_file).head(20)


Unnamed: 0,firstName,lastName,full_name,player_id,gameId,gameDate,playerteamCity,playerteamName,opponentteamCity,opponentteamName,...,threePointersPercentage,freeThrowsAttempted,freeThrowsMade,freeThrowsPercentage,reboundsDefensive,reboundsOffensive,reboundsTotal,foulsPersonal,turnovers,plusMinusPoints
0,Buddy,Hield,Buddy Hield,1627741,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,3.0,1.0,4.0,1.0,1.0,-1.0
1,Gary,Payton II,Gary Payton II,1627780,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.5,0.0,0.0,0.0,1.0,2.0,3.0,1.0,2.0,3.0
2,De'Aaron,Fox,De'Aaron Fox,1628368,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.5,0.0,0.0,0.0,1.0,3.0,4.0,3.0,4.0,1.0
3,Luke,Kornet,Luke Kornet,1628436,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.0,2.0,2.0,1.0,3.0,2.0,5.0,1.0,1.0,-10.0
4,Jordan,McLaughlin,Jordan McLaughlin,1629162,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Keldon,Johnson,Keldon Johnson,1629640,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,5.0,0.0,5.0,2.0,1.0,2.0
6,Devin,Vassell,Devin Vassell,1630170,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.25,2.0,2.0,1.0,3.0,0.0,3.0,3.0,2.0,-16.0
7,Jonathan,Kuminga,Jonathan Kuminga,1630228,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Pat,Spencer,Pat Spencer,1630311,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Lindy,Waters III,Lindy Waters III,1630322,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:

# Project an explicit column list from the 20-row preview frame and cast the
# gameDate strings to DATE (the time-of-day part is dropped by the cast).
query = """
SELECT
firstName
,lastName
,full_name
,player_id
,gameId
,CAST(gameDate AS DATE) as gameDate
,playerteamCity
,playerteamName
,opponentteamCity
,opponentteamName
,gameType
,gameLabel
,gameSubLabel
,seriesGameNumber
,win
,home
,numMinutes
,points
,assists
,blocks
,steals
,fieldGoalsAttempted
,fieldGoalsMade
,fieldGoalsPercentage
,threePointersAttempted
,threePointersMade
,threePointersPercentage
,freeThrowsAttempted
,freeThrowsMade
,freeThrowsPercentage
,reboundsDefensive
,reboundsOffensive
,reboundsTotal
,foulsPersonal
,turnovers
,plusMinusPoints
FROM 
player_statistics_df
"""

# The DataFrame is resolved by DuckDB from the Python variable of the same name
duckdb.query(query).df()

Unnamed: 0,firstName,lastName,full_name,player_id,gameId,gameDate,playerteamCity,playerteamName,opponentteamCity,opponentteamName,...,threePointersPercentage,freeThrowsAttempted,freeThrowsMade,freeThrowsPercentage,reboundsDefensive,reboundsOffensive,reboundsTotal,foulsPersonal,turnovers,plusMinusPoints
0,Buddy,Hield,Buddy Hield,1627741,22500047,2025-11-14,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,3.0,1.0,4.0,1.0,1.0,-1.0
1,Gary,Payton II,Gary Payton II,1627780,22500047,2025-11-14,Golden State,Warriors,San Antonio,Spurs,...,0.5,0.0,0.0,0.0,1.0,2.0,3.0,1.0,2.0,3.0
2,De'Aaron,Fox,De'Aaron Fox,1628368,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.5,0.0,0.0,0.0,1.0,3.0,4.0,3.0,4.0,1.0
3,Luke,Kornet,Luke Kornet,1628436,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.0,2.0,2.0,1.0,3.0,2.0,5.0,1.0,1.0,-10.0
4,Jordan,McLaughlin,Jordan McLaughlin,1629162,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Keldon,Johnson,Keldon Johnson,1629640,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,5.0,0.0,5.0,2.0,1.0,2.0
6,Devin,Vassell,Devin Vassell,1630170,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.25,2.0,2.0,1.0,3.0,0.0,3.0,3.0,2.0,-16.0
7,Jonathan,Kuminga,Jonathan Kuminga,1630228,22500047,2025-11-14,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Pat,Spencer,Pat Spencer,1630311,22500047,2025-11-14,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Lindy,Waters III,Lindy Waters III,1630322,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
import wget
import pandas as pd

In [3]:
# Download files
import os

print("Downloading files...")
filename = 'nba-all-stars.csv'
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
# wget never overwrites: when the target exists it saves the new copy under a
# " (1)"-style name, so later reads would silently use the stale file.
if os.path.exists(filename):
    os.remove(filename)
wget.download(url)
print(f"\nDownloaded {filename}")

Downloading files...

Downloaded nba-all-stars.csv


In [264]:
# Load the All-Star table from the working directory; the bare last expression
# renders the frame as the cell output.
nba_all_stars_df = pd.read_csv('nba-all-stars.csv')
nba_all_stars_df

Unnamed: 0,Player,Total Selections,Selections
0,LeBron James,21,2005–2025
1,Kareem Abdul-Jabbar,19,1970–1977; 1979–1989
2,Kobe Bryant,18,1998; 2000–2016
3,Tim Duncan,15,1998; 2000–2011; 2013; 2015
4,Kevin Durant,15,2010–2019; 2021–2025
...,...,...,...
455,Jayson Williams,1,1998
456,Mo Williams,1,2009
457,Kevin Willis,1,1992
458,Metta World Peace,1,2004


In [16]:
import pandas as pd
import duckdb
import unicodedata

def remove_accents(text):
    """
    Return ``text`` with diacritics stripped, keeping the underlying letters.

    Stroked letters and ligatures (Đ/đ, Ł/ł, Ø/ø, Ŧ/ŧ, Æ/æ, Œ/œ, ß) are
    replaced with explicit ASCII stand-ins first; the remaining text is then
    NFKD-normalized and every non-spacing mark is discarded.

    Example:
    "Nikola Đurišić" -> "Nikola Durisic"
    """
    # Explicit replacements applied before normalization
    ascii_stand_ins = str.maketrans({
        'Đ': 'D', 'đ': 'd',    # Serbian/Croatian D with stroke
        'Ł': 'L', 'ł': 'l',    # Polish L with stroke
        'Ø': 'O', 'ø': 'o',    # Danish/Norwegian O with stroke
        'Ŧ': 'T', 'ŧ': 't',    # Sami T with stroke
        'Æ': 'AE', 'æ': 'ae',  # Æ/æ ligature
        'Œ': 'OE', 'œ': 'oe',  # Œ/œ ligature
        'ß': 'ss',             # German eszett
    })
    replaced = text.translate(ascii_stand_ins)

    # Split each accented character into base character + combining mark(s)
    decomposed = unicodedata.normalize('NFKD', replaced)

    # Keep everything except the non-spacing marks (Unicode category "Mn")
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def expand_selection_years(selection_years_string):
    '''
    Expand a summarized selection-years string into the individual years.

    The input is a ";"-separated list whose items are either a single year
    ("1998") or an inclusive range joined by a hyphen-minus, en dash or em dash
    ("2000–2016"). Non-breaking spaces and whitespace around items are ignored.

    Parameters
    ----------
    selection_years_string : str
        Summary such as "1998; 2000–2003".

    Returns
    -------
    list of int
        Every distinct year covered by the summary, in ascending order.
        An empty or whitespace-only string yields an empty list.

    Raises
    ------
    ValueError
        If an item is neither an integer year nor a "start-end" pair of integers.
    '''
    # Normalize non-breaking spaces to regular spaces
    normalized = selection_years_string.replace('\xa0', ' ')
    # Fold en dash / em dash into hyphen-minus so ranges share one parse path
    for dash in ("–", "—"):
        normalized = normalized.replace(dash, "-")

    years = set()
    for item in normalized.split(";"):
        item = item.strip()
        if not item:
            # Tolerate empty input and stray or trailing separators
            continue
        start_text, dash, end_text = item.partition("-")
        start_year = int(start_text)
        # A single year is treated as a one-year range
        end_year = int(end_text) if dash else start_year
        years.update(range(start_year, end_year + 1))

    # Sorted so the output is deterministic
    return sorted(years)

def process_nba_all_stars(csv_file):
    """
    Turn the summarized All-Star table into one row per player and selection year.

    Reads ``csv_file``, expands each "Selections" summary into individual years,
    explodes them into separate rows, strips accents from "Player", writes the
    result to 'nba-all-stars.csv' and returns it.

    Returns a DataFrame with columns 'Player', 'Total Selections', 'Selection Year'.
    """
    raw_df = pd.read_csv(csv_file)
    raw_df["Selection Years"] = raw_df["Selections"].apply(expand_selection_years)

    # Keep only the columns needed downstream, then go to one row per year
    per_year_df = raw_df[['Player', 'Total Selections', 'Selection Years']]
    per_year_df = per_year_df.explode('Selection Years')
    per_year_df = per_year_df.reset_index(drop=True)
    per_year_df = per_year_df.rename(columns={'Selection Years':'Selection Year'})

    per_year_df["Player"] = per_year_df["Player"].apply(remove_accents)
    per_year_df.to_csv('nba-all-stars.csv', index=False)

    return per_year_df

# Expand the raw All-Star summary into one row per (player, selection year).
# process_nba_all_stars() also writes the result to 'nba-all-stars.csv'.
filename = 'nba-all-stars-raw.csv'
nba_all_stars_df = process_nba_all_stars(filename)
nba_all_stars_df

Unnamed: 0,Player,Total Selections,Selection Year
0,LeBron James,21,2005
1,LeBron James,21,2006
2,LeBron James,21,2007
3,LeBron James,21,2008
4,LeBron James,21,2009
...,...,...,...
1816,Jayson Williams,1,1998
1817,Mo Williams,1,2009
1818,Kevin Willis,1,1992
1819,Metta World Peace,1,2004


Create a function that takes a DataFrame of NBA player names and returns a DataFrame with the matching player_ids attached.

1. Remove accents from the player names
2. Left join the original DataFrame to the name_mappings table
3. Left join the result from Step 2 to the nba_player_lookup table

In [69]:
import pandas as pd
import duckdb
import wget
import unicodedata
import os

def remove_accents(text):
    """
    Return ``text`` with diacritics stripped, keeping the underlying letters.

    Stroked letters and ligatures (Đ/đ, Ł/ł, Ø/ø, Ŧ/ŧ, Æ/æ, Œ/œ, ß) are
    replaced with explicit ASCII stand-ins first; the remaining text is then
    NFKD-normalized and every non-spacing mark is discarded.

    Example:
    "Nikola Đurišić" -> "Nikola Durisic"
    """
    # Explicit replacements applied before normalization
    ascii_stand_ins = str.maketrans({
        'Đ': 'D', 'đ': 'd',    # Serbian/Croatian D with stroke
        'Ł': 'L', 'ł': 'l',    # Polish L with stroke
        'Ø': 'O', 'ø': 'o',    # Danish/Norwegian O with stroke
        'Ŧ': 'T', 'ŧ': 't',    # Sami T with stroke
        'Æ': 'AE', 'æ': 'ae',  # Æ/æ ligature
        'Œ': 'OE', 'œ': 'oe',  # Œ/œ ligature
        'ß': 'ss',             # German eszett
    })
    replaced = text.translate(ascii_stand_ins)

    # Split each accented character into base character + combining mark(s)
    decomposed = unicodedata.normalize('NFKD', replaced)

    # Keep everything except the non-spacing marks (Unicode category "Mn")
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)


def attach_player_ids(input_df, column):
    '''
    Attach NBA player_ids to a DataFrame that has player names but no player_id column.

    Steps:
      1. Strip accents from the names in ``column``.
      2. Left join to the name_mappings table (once on in_table_name, once on
         nba_lookup_name) to pick the name spelling to report.
      3. Left join to the nba_player_lookup table on the player name and take
         the first non-null player_id among the lookup and the two mapping joins.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame holding the player names. It is copied, not modified.
    column : str
        Name of the column in ``input_df`` that holds the player names.

    Returns
    -------
    pandas.DataFrame
        ``player_id`` and ``player_name`` first, followed by the remaining
        input columns (``column`` itself is dropped).

    Side effects
    ------------
    Downloads name_mappings.csv and nba_player_lookup.csv into the working
    directory and deletes the downloaded copies once they have been read.
    '''
    column = str(column)

    ##### Step 1: Remove all accents from the player's names #####
    # Work on a copy so the caller's DataFrame is left untouched
    input_df = input_df.copy()
    input_df[column] = input_df[column].apply(remove_accents)

    ##### Step 2: Left join the df to the nba_mappings lookup table #####
    # Bring in name mapping table for names to help match all names to the format seen in the NBA API
    filename = 'name_mappings.csv'
    url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
    # wget saves under a " (1)"-style name when the target already exists, so
    # read (and later delete) the path it reports rather than a hard-coded one.
    name_mappings_path = wget.download(url)
    name_mapping_df = pd.read_csv(name_mappings_path)
    os.remove(name_mappings_path)

    #Perform left join
    query = f"""
    WITH CTE AS (
    SELECT * FROM input_df
    LEFT JOIN name_mapping_df a
    ON input_df.{column} = a.in_table_name
    LEFT JOIN name_mapping_df b
    ON input_df.{column} = b.nba_lookup_name
    )
    SELECT *
    ,CASE WHEN nba_lookup_name IS NOT NULL THEN nba_lookup_name ELSE {column} END AS player_name_to_use
    FROM CTE
    """
    input_df = duckdb.query(query).df()

    ##### Step 3: Left join the result from Step 2 to nba_player_lookup table #####
    # Bring in nba player lookup table to map the cleaned names to player IDs. Same player IDs from the NBA API.
    filename = 'nba_player_lookup.csv'
    url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
    player_lookup_path = wget.download(url)
    nba_player_lookup_df = pd.read_csv(player_lookup_path)
    os.remove(player_lookup_path)
    # Clean each player's full name
    nba_player_lookup_df["player_name"] = nba_player_lookup_df["player_name"].apply(remove_accents)

    #Perform left join
    # NOTE(review): player_id_1 / player_id_3 are the de-duplicated names DuckDB
    # gives the player_id columns from the two name_mapping joins - confirm they
    # are stable across DuckDB versions.
    # COALESCE replaces a CASE whose final ELSE branch could never be reached, so
    # the third candidate is now actually used as a fallback.
    query = f"""
    WITH CTE AS (
    SELECT
    nba_player_lookup_df.player_id
    ,input_df.*
    FROM input_df
    LEFT JOIN nba_player_lookup_df
    ON input_df.{column} = nba_player_lookup_df.player_name
    )
    SELECT *
    ,COALESCE(player_id, player_id_1, player_id_3) AS player_id_to_use
    FROM CTE

    """
    output_df = duckdb.query(query).df()

    #Drop unneeded columns and rearrange
    output_df = output_df.drop(columns=[f'{column}','player_id','in_table_name','nba_lookup_name','player_id_1','in_table_name_1','nba_lookup_name_2','player_id_3'])
    output_df = output_df.rename(columns={'player_name_to_use':'player_name', 'player_id_to_use':'player_id'})
    leading_cols = ['player_id', 'player_name']
    remaining_cols = [col for col in output_df.columns if col not in leading_cols]
    output_df = output_df[leading_cols + remaining_cols]

    return output_df

def expand_selection_years(selection_years_string):
    '''
    Expand a summarized selection-years string into the individual years.

    The input is a ";"-separated list whose items are either a single year
    ("1998") or an inclusive range joined by a hyphen-minus, en dash or em dash
    ("2000–2016"). Non-breaking spaces and whitespace around items are ignored.

    Parameters
    ----------
    selection_years_string : str
        Summary such as "1998; 2000–2003".

    Returns
    -------
    list of int
        Every distinct year covered by the summary, in ascending order.
        An empty or whitespace-only string yields an empty list.

    Raises
    ------
    ValueError
        If an item is neither an integer year nor a "start-end" pair of integers.
    '''
    # Normalize non-breaking spaces to regular spaces
    normalized = selection_years_string.replace('\xa0', ' ')
    # Fold en dash / em dash into hyphen-minus so ranges share one parse path
    for dash in ("–", "—"):
        normalized = normalized.replace(dash, "-")

    years = set()
    for item in normalized.split(";"):
        item = item.strip()
        if not item:
            # Tolerate empty input and stray or trailing separators
            continue
        start_text, dash, end_text = item.partition("-")
        start_year = int(start_text)
        # A single year is treated as a one-year range
        end_year = int(end_text) if dash else start_year
        years.update(range(start_year, end_year + 1))

    # Sorted so the output is deterministic
    return sorted(years)

def process_nba_all_stars(csv_file):
    """
    Turn the summarized All-Star table into one row per player and selection year.

    Reads ``csv_file``, expands each "Selections" summary into individual years,
    explodes them into separate rows, strips accents from "Player", writes the
    result to 'nba-all-stars.csv' and returns it.

    Returns a DataFrame with columns 'Player', 'Total Selections', 'Selection Year'.
    """
    raw_df = pd.read_csv(csv_file)
    raw_df["Selection Years"] = raw_df["Selections"].apply(expand_selection_years)

    # Keep only the columns needed downstream, then go to one row per year
    per_year_df = raw_df[['Player', 'Total Selections', 'Selection Years']]
    per_year_df = per_year_df.explode('Selection Years')
    per_year_df = per_year_df.reset_index(drop=True)
    per_year_df = per_year_df.rename(columns={'Selection Years':'Selection Year'})

    per_year_df["Player"] = per_year_df["Player"].apply(remove_accents)
    per_year_df.to_csv('nba-all-stars.csv', index=False)

    return per_year_df

# Rebuild the per-year All-Star table, attach player_ids, and save the result.
# process_nba_all_stars() already writes 'nba-all-stars.csv'; the to_csv call
# below overwrites that file with the version that includes player_id.
filename = 'nba-all-stars-raw.csv'
nba_all_stars_df = process_nba_all_stars(filename)
output_df = attach_player_ids(nba_all_stars_df,'Player')
output_df.to_csv('nba-all-stars.csv', index=False)
output_df

Unnamed: 0,player_id,player_name,Total Selections,Selection Year
0,2544,LeBron James,21,2005
1,2544,LeBron James,21,2006
2,2544,LeBron James,21,2007
3,2544,LeBron James,21,2008
4,2544,LeBron James,21,2009
...,...,...,...,...
1816,913,Larry D. Johnson,2,1995
1817,77376,Lafayette Lever,2,1988
1818,77376,Lafayette Lever,2,1990
1819,76753,World Free,1,1980


In [70]:
# Download files
import os

filename = 'nba-mvp.csv'
print(f"Downloading {filename}...")
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
# wget never overwrites: when the target exists it saves the new copy under a
# " (1)"-style name, so later reads would silently use the stale file.
if os.path.exists(filename):
    os.remove(filename)
wget.download(url)
print(f"\nDownloaded {filename}")

Downloading nba-mvp.csv...

Downloaded nba-mvp.csv


In [71]:
# Load the MVP table; the bare last expression renders it as the cell output.
# NOTE(review): the Height values in the rendered output look date-mangled
# (e.g. "6-Jun"); this cell does not repair them.
nba_mvp_df = pd.read_csv('nba-mvp.csv')
nba_mvp_df

Unnamed: 0,Season,Player,Team,Pos,Height,Weight,Age,Pre-Draft Team,Draft Yr,YOS
0,2024-2025,Shai Gilgeous-Alexander,Oklahoma City Thunder,PG,6-Jun,195,26,Kentucky,2018,6
1,2023-2024,Nikola Jokic,Denver Nuggets,C,11-Jun,284,29,KK Mega Bemax (Serbia),2014,8
2,2022-2023,Joel Embiid,Philadelphia Sixers,C,Jul-00,280,28,Kansas,2014,8
3,2021-2022,Nikola Jokic,Denver Nuggets,C,11-Jun,284,27,KK Mega Bemax (Serbia),2014,6
4,2020-2021,Nikola Jokic,Denver Nuggets,C,11-Jun,284,26,KK Mega Bemax (Serbia),2014,5
...,...,...,...,...,...,...,...,...,...,...
65,1959-1960,Wilt Chamberlain,San Francisco Warriors,C,1-Jul,265,23,Kansas,1959,0
66,1958-1959,Bob Pettit,St. Louis Hawks,FC,9-Jun,205,26,LSU,1954,4
67,1957-1958,Bill Russell,Boston Celtics,C,10-Jun,215,24,San Francisco,1956,1
68,1956-1957,Bob Cousy,Boston Celtics,PG,1-Jun,175,28,Holy Cross,1950,6


In [72]:
# Download files
import os

filename = 'nba_player_lookup.csv'
print(f"Downloading {filename}...")
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
# wget never overwrites: when the target exists it saves the new copy under a
# " (1)"-style name, so later reads would silently use the stale file.
if os.path.exists(filename):
    os.remove(filename)
wget.download(url)
print(f"\nDownloaded {filename}")

Downloading nba_player_lookup.csv...

Downloaded nba_player_lookup.csv


In [131]:
# Download files
import os

# One loop instead of three copy-pasted stanzas; output text is unchanged.
for filename in ('all-nba-first-team.csv', 'all-nba-second-team.csv', 'all-nba-third-team.csv'):
    print(f"Downloading {filename}...")
    url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
    # wget never overwrites: when the target exists it saves the new copy under a
    # " (1)"-style name, so later reads would silently use the stale file.
    if os.path.exists(filename):
        os.remove(filename)
    wget.download(url)
    print(f"\nDownloaded {filename}")

Downloading all-nba-first-team.csv...

Downloaded all-nba-first-team.csv
Downloading all-nba-second-team.csv...

Downloaded all-nba-second-team.csv
Downloading all-nba-third-team.csv...

Downloaded all-nba-third-team.csv


In [122]:
# Load the All-NBA first-team table and render it as the cell output
all_nba_first_team_df = pd.read_csv('all-nba-first-team.csv')
all_nba_first_team_df

Unnamed: 0,Season,Player,Team,Pos,Height,Weight,Age,Pre-Draft Team,Draft Yr,YOS
0,2024-2025,Giannis Antetokounmpo,Milwaukee Bucks,F,11-Jun,243,30,Filathlitikos Div II Greece (Greece),2013,11
1,2024-2025,Shai Gilgeous-Alexander,Oklahoma City Thunder,PG,6-Jun,195,26,Kentucky,2018,6
2,2024-2025,Nikola Jokic,Denver Nuggets,C,11-Jun,284,30,KK Mega Bemax (Serbia),2014,9
3,2024-2025,Donovan Mitchell,Cleveland Cavaliers,SG,2-Jun,215,28,Louisville,2017,7
4,2024-2025,Jayson Tatum,Boston Celtics,SF,8-Jun,210,26,Duke,2017,7
...,...,...,...,...,...,...,...,...,...,...
391,1946-1947,Bob Feerick,Washington Capitols,GF,3-Jun,190,27,,1946,0
392,1946-1947,Joe Fulks,San Francisco Warriors,SF,5-Jun,190,25,,1946,0
393,1946-1947,Bones McKinney,Washington Capitols,FC,6-Jun,185,28,,1946,0
394,1946-1947,Stan Miasek,Detroit Falcons,F,5-Jun,210,22,,1946,0


In [123]:
# List the column names so the SELECT in the Height-repair query can name them explicitly
all_nba_first_team_df.columns

Index(['Season', 'Player', 'Team', 'Pos', 'Height', 'Weight', 'Age',
       'Pre-Draft Team', 'Draft Yr', 'YOS'],
      dtype='object')

In [132]:
# Repair Height values that were turned into dates ("11-Jun" means 6-11,
# "Jun-00" means 6-0): the month encodes the feet (May=5, Jun=6, Jul=7) and the
# number encodes the inches.
# ELSE Height keeps any value the list does not recognise, including heights that
# are already in feet-inches form, so re-running this cell does not null them out.
query = """
SELECT
Season
,Player
,Team
,Pos
,CASE
    WHEN Height = '1-May' THEN '5-1'
    WHEN Height = '2-May' THEN '5-2'
    WHEN Height = '3-May' THEN '5-3'
    WHEN Height = '4-May' THEN '5-4'
    WHEN Height = '5-May' THEN '5-5'
    WHEN Height = '6-May' THEN '5-6'
    WHEN Height = '7-May' THEN '5-7'
    WHEN Height = '8-May' THEN '5-8'
    WHEN Height = '9-May' THEN '5-9'
    WHEN Height = '10-May' THEN '5-10'
    WHEN Height = '11-May' THEN '5-11'
    WHEN Height = 'Jun-00' THEN '6-0'
    WHEN Height = '1-Jun' THEN '6-1'
    WHEN Height = '2-Jun' THEN '6-2'
    WHEN Height = '3-Jun' THEN '6-3'
    WHEN Height = '4-Jun' THEN '6-4'
    WHEN Height = '5-Jun' THEN '6-5'
    WHEN Height = '6-Jun' THEN '6-6'
    WHEN Height = '7-Jun' THEN '6-7'
    WHEN Height = '8-Jun' THEN '6-8'
    WHEN Height = '9-Jun' THEN '6-9'
    WHEN Height = '10-Jun' THEN '6-10'
    WHEN Height = '11-Jun' THEN '6-11'
    WHEN Height = 'Jul-00' THEN '7-0'
    WHEN Height = '1-Jul' THEN '7-1'
    WHEN Height = '2-Jul' THEN '7-2'
    WHEN Height = '3-Jul' THEN '7-3'
    WHEN Height = '4-Jul' THEN '7-4'
    WHEN Height = '5-Jul' THEN '7-5'
    WHEN Height = '6-Jul' THEN '7-6'
    WHEN Height = '7-Jul' THEN '7-7'
    WHEN Height = '8-Jul' THEN '7-8'
    WHEN Height = '9-Jul' THEN '7-9'
    ELSE Height
END AS Height
,Weight
,Age
,"Pre-Draft Team"
,"Draft Yr"
,YOS

FROM all_nba_first_team_df

"""

all_nba_first_team_df = duckdb.query(query).df()
all_nba_first_team_df

Unnamed: 0,Season,Player,Team,Pos,Height,Weight,Age,Pre-Draft Team,Draft Yr,YOS
0,2024-2025,Giannis Antetokounmpo,Milwaukee Bucks,F,6-11,243,30,Filathlitikos Div II Greece (Greece),2013,11
1,2024-2025,Shai Gilgeous-Alexander,Oklahoma City Thunder,PG,6-6,195,26,Kentucky,2018,6
2,2024-2025,Nikola Jokic,Denver Nuggets,C,6-11,284,30,KK Mega Bemax (Serbia),2014,9
3,2024-2025,Donovan Mitchell,Cleveland Cavaliers,SG,6-2,215,28,Louisville,2017,7
4,2024-2025,Jayson Tatum,Boston Celtics,SF,6-8,210,26,Duke,2017,7
...,...,...,...,...,...,...,...,...,...,...
391,1946-1947,Bob Feerick,Washington Capitols,GF,6-3,190,27,,1946,0
392,1946-1947,Joe Fulks,San Francisco Warriors,SF,6-5,190,25,,1946,0
393,1946-1947,Bones McKinney,Washington Capitols,FC,6-6,185,28,,1946,0
394,1946-1947,Stan Miasek,Detroit Falcons,F,6-5,210,22,,1946,0


In [134]:
# Load the All-NBA second- and third-team tables from the working directory
all_nba_second_team_df = pd.read_csv('all-nba-second-team.csv')
all_nba_third_team_df = pd.read_csv('all-nba-third-team.csv')

In [135]:
# Repair Height values that were turned into dates ("11-Jun" means 6-11,
# "Jun-00" means 6-0): the month encodes the feet (May=5, Jun=6, Jul=7) and the
# number encodes the inches.
# ELSE Height keeps any value the list does not recognise, including heights that
# are already in feet-inches form, so re-running this cell does not null them out.
query = """
SELECT
Season
,Player
,Team
,Pos
,CASE
    WHEN Height = '1-May' THEN '5-1'
    WHEN Height = '2-May' THEN '5-2'
    WHEN Height = '3-May' THEN '5-3'
    WHEN Height = '4-May' THEN '5-4'
    WHEN Height = '5-May' THEN '5-5'
    WHEN Height = '6-May' THEN '5-6'
    WHEN Height = '7-May' THEN '5-7'
    WHEN Height = '8-May' THEN '5-8'
    WHEN Height = '9-May' THEN '5-9'
    WHEN Height = '10-May' THEN '5-10'
    WHEN Height = '11-May' THEN '5-11'
    WHEN Height = 'Jun-00' THEN '6-0'
    WHEN Height = '1-Jun' THEN '6-1'
    WHEN Height = '2-Jun' THEN '6-2'
    WHEN Height = '3-Jun' THEN '6-3'
    WHEN Height = '4-Jun' THEN '6-4'
    WHEN Height = '5-Jun' THEN '6-5'
    WHEN Height = '6-Jun' THEN '6-6'
    WHEN Height = '7-Jun' THEN '6-7'
    WHEN Height = '8-Jun' THEN '6-8'
    WHEN Height = '9-Jun' THEN '6-9'
    WHEN Height = '10-Jun' THEN '6-10'
    WHEN Height = '11-Jun' THEN '6-11'
    WHEN Height = 'Jul-00' THEN '7-0'
    WHEN Height = '1-Jul' THEN '7-1'
    WHEN Height = '2-Jul' THEN '7-2'
    WHEN Height = '3-Jul' THEN '7-3'
    WHEN Height = '4-Jul' THEN '7-4'
    WHEN Height = '5-Jul' THEN '7-5'
    WHEN Height = '6-Jul' THEN '7-6'
    WHEN Height = '7-Jul' THEN '7-7'
    WHEN Height = '8-Jul' THEN '7-8'
    WHEN Height = '9-Jul' THEN '7-9'
    ELSE Height
END AS Height
,Weight
,Age
,"Pre-Draft Team"
,"Draft Yr"
,YOS

FROM all_nba_second_team_df

"""

all_nba_second_team_df = duckdb.query(query).df()
all_nba_second_team_df

Unnamed: 0,Season,Player,Team,Pos,Height,Weight,Age,Pre-Draft Team,Draft Yr,YOS
0,2024-2025,Jalen Brunson,New York Knicks,PG,6-2,190,28,Villanova,2018,6
1,2024-2025,Stephen Curry,Golden State Warriors,G,6-2,185,36,Davidson,2009,15
2,2024-2025,Anthony Edwards,Minnesota Timberwolves,SF,6-4,225,23,Georgia,2020,4
3,2024-2025,LeBron James,Los Angeles Lakers,F,6-9,250,40,St. Vincent St. Mary High School (Ohio),2003,21
4,2024-2025,Evan Mobley,Cleveland Cavaliers,PF,6-11,215,23,USC,2021,3
...,...,...,...,...,...,...,...,...,...,...
389,1946-1947,Frankie Baumholtz,Cleveland Rebels,PG,5-10,170,28,,1946,0
390,1946-1947,Ernie Calverley,Providence Steamrollers,PG,5-10,145,23,,1946,0
391,1946-1947,Chick Halbert,Chicago Stags,PF,6-9,225,28,,1946,0
392,1946-1947,Johnny Logan,St. Louis Bombers,G,6-2,175,26,,1946,0


In [136]:
# Repair Height values that were turned into dates ("11-Jun" means 6-11,
# "Jun-00" means 6-0): the month encodes the feet (May=5, Jun=6, Jul=7) and the
# number encodes the inches.
# ELSE Height keeps any value the list does not recognise, including heights that
# are already in feet-inches form, so re-running this cell does not null them out.
query = """
SELECT
Season
,Player
,Team
,Pos
,CASE
    WHEN Height = '1-May' THEN '5-1'
    WHEN Height = '2-May' THEN '5-2'
    WHEN Height = '3-May' THEN '5-3'
    WHEN Height = '4-May' THEN '5-4'
    WHEN Height = '5-May' THEN '5-5'
    WHEN Height = '6-May' THEN '5-6'
    WHEN Height = '7-May' THEN '5-7'
    WHEN Height = '8-May' THEN '5-8'
    WHEN Height = '9-May' THEN '5-9'
    WHEN Height = '10-May' THEN '5-10'
    WHEN Height = '11-May' THEN '5-11'
    WHEN Height = 'Jun-00' THEN '6-0'
    WHEN Height = '1-Jun' THEN '6-1'
    WHEN Height = '2-Jun' THEN '6-2'
    WHEN Height = '3-Jun' THEN '6-3'
    WHEN Height = '4-Jun' THEN '6-4'
    WHEN Height = '5-Jun' THEN '6-5'
    WHEN Height = '6-Jun' THEN '6-6'
    WHEN Height = '7-Jun' THEN '6-7'
    WHEN Height = '8-Jun' THEN '6-8'
    WHEN Height = '9-Jun' THEN '6-9'
    WHEN Height = '10-Jun' THEN '6-10'
    WHEN Height = '11-Jun' THEN '6-11'
    WHEN Height = 'Jul-00' THEN '7-0'
    WHEN Height = '1-Jul' THEN '7-1'
    WHEN Height = '2-Jul' THEN '7-2'
    WHEN Height = '3-Jul' THEN '7-3'
    WHEN Height = '4-Jul' THEN '7-4'
    WHEN Height = '5-Jul' THEN '7-5'
    WHEN Height = '6-Jul' THEN '7-6'
    WHEN Height = '7-Jul' THEN '7-7'
    WHEN Height = '8-Jul' THEN '7-8'
    WHEN Height = '9-Jul' THEN '7-9'
    ELSE Height
END AS Height
,Weight
,Age
,"Pre-Draft Team"
,"Draft Yr"
,YOS

FROM all_nba_third_team_df

"""

all_nba_third_team_df = duckdb.query(query).df()
all_nba_third_team_df

Unnamed: 0,Season,Player,Team,Pos,Height,Weight,Age,Pre-Draft Team,Draft Yr,YOS
0,2024-2025,Cade Cunningham,Detroit Pistons,SF,6-6,220,23,Oklahoma State,2021,3
1,2024-2025,Tyrese Haliburton,Indiana Pacers,PG,6-5,185,25,Iowa State,2020,4
2,2024-2025,James Harden,Los Angeles Clippers,SG,6-5,220,35,Arizona State,2009,15
3,2024-2025,Karl-Anthony Towns,New York Knicks,C,7-0,248,29,Kentucky,2015,9
4,2024-2025,Jalen Williams,Oklahoma City Thunder,G,6-5,211,23,Santa Clara,2022,2
...,...,...,...,...,...,...,...,...,...,...
180,1988-1989,Terry Cummings,Milwaukee Bucks,PF,6-9,220,27,DePaul,1982,6
181,1988-1989,Dale Ellis,Seattle SuperSonics,GF,6-7,205,28,Tennessee,1983,5
182,1988-1989,Robert Parish,Boston Celtics,C,7-0,230,35,Centenary (LA),1976,11
183,1988-1989,Mark Price,Cleveland Cavaliers,PG,6-0,170,25,Georgia Tech,1986,2


In [144]:
# Number of non-null Height values (value_counts() leaves out NaN by default).
# NOTE(review): presumably a check that no height was lost in the repair - compare with len(all_nba_third_team_df).
all_nba_third_team_df.Height.value_counts().sum()

np.int64(185)

In [None]:
# Disabled CSV exports of the three All-NBA frames (none of these lines execute).
# NOTE(review): the same three file names are written further down by the
# "ORDER BY Season DESC" cells, and 'all-nba-second-team.csv' is read back in a
# later cell — confirm these lines are no longer needed before removing them.
# all_nba_first_team_df.to_csv('all-nba-first-team.csv', index=False)
# all_nba_second_team_df.to_csv('all-nba-second-team.csv', index=False)
# all_nba_third_team_df.to_csv('all-nba-third-team.csv', index=False)

In [187]:
# Reload the second-team frame from its CSV export.
# NOTE(review): this needs 'all-nba-second-team.csv' to already exist on disk, and a
# later cell overwrites that same file with the sorted output of attach_player_ids.
# Confirm that a run from a fresh kernel still loads a frame containing the 'Player'
# column that attach_player_ids is asked for below.
all_nba_second_team_df = pd.read_csv('all-nba-second-team.csv')

In [188]:
# Run attach_player_ids over the first-, second- and third-team frames in that order,
# passing 'Player' as the second argument (presumably the name column to match on —
# confirm against the helper), and bind the results to testing_df_1..3.
testing_df_1, testing_df_2, testing_df_3 = [
    attach_player_ids(team_df, 'Player')
    for team_df in (
        all_nba_first_team_df,
        all_nba_second_team_df,
        all_nba_third_team_df,
    )
]

In [189]:
# Validation query: stack the rows from all three ID-annotated frames whose
# player_id is NULL. UNION ALL keeps every such row (no de-duplication), so an
# empty result means no row in any of the three frames has a NULL player_id.
query = """
SELECT * FROM testing_df_1
WHERE player_id IS NULL

UNION ALL
SELECT * FROM testing_df_2
WHERE player_id IS NULL

UNION ALL
SELECT * FROM testing_df_3
WHERE player_id IS NULL

"""

# DuckDB resolves testing_df_1..3 from the in-scope pandas DataFrames; the result
# is converted back to a DataFrame and displayed as the cell output.
duckdb.query(query).df()

Unnamed: 0,player_id,player_name,Season,Team,Pos,Height,Weight,Age,Pre-Draft Team,Draft Yr,YOS


In [199]:
# Order the ID-annotated first-team rows by Season, newest first.
query = """
SELECT * FROM testing_df_1
ORDER BY Season DESC
"""

# Rebind the first-team name to the sorted result and persist it without the index.
# NOTE(review): all_nba_first_team_df was the input to attach_player_ids above; after
# this rebind, re-running that cell would receive the annotated frame instead of the
# original — confirm that is intended.
all_nba_first_team_df = duckdb.query(query).df()
all_nba_first_team_df.to_csv("all-nba-first-team.csv", index=False)

In [200]:
# Order the ID-annotated second-team rows by Season, newest first.
query = """
SELECT * FROM testing_df_2
ORDER BY Season DESC
"""

# Rebind the second-team name to the sorted result and persist it without the index.
# NOTE(review): this overwrites 'all-nba-second-team.csv', the same file an earlier
# cell reads to build all_nba_second_team_df — confirm a re-run still behaves as expected.
all_nba_second_team_df = duckdb.query(query).df()
all_nba_second_team_df.to_csv("all-nba-second-team.csv", index=False)

In [201]:
# Order the ID-annotated third-team rows by Season, newest first.
query = """
SELECT * FROM testing_df_3
ORDER BY Season DESC
"""

# Rebind the third-team name to the sorted result and persist it without the index.
# NOTE(review): all_nba_third_team_df was the input to attach_player_ids above; after
# this rebind, re-running that cell would receive the annotated frame instead of the
# original — confirm that is intended.
all_nba_third_team_df = duckdb.query(query).df()
all_nba_third_team_df.to_csv("all-nba-third-team.csv", index=False)

In [203]:
# Reload the processed player statistics written by the chunked export at the top of
# the notebook. low_memory=False makes pandas infer each column's dtype from the whole
# file rather than chunk by chunk, matching how the raw file is read above. The default
# read emitted a warning here (presumably a mixed-type DtypeWarning — confirm).
player_stats_test_df = pd.read_csv('player-statistics-new.csv', low_memory=False)

  player_stats_test_df = pd.read_csv('player-statistics-new.csv')


In [221]:
# Select every row and column of player_stats_test_df through DuckDB.
# The LEFT JOINs to the three All-NBA frames are SQL-commented out inside the query
# text, so no award data is joined at this point.
# NOTE(review): the disabled joins match on player_id only. The All-NBA frames appear
# to hold one row per player per Season, so enabling them as written would presumably
# repeat each game row once per selected season — confirm the join keys before
# re-enabling.
query = """ 


SELECT *
FROM player_stats_test_df
--LEFT JOIN all_nba_first_team_df
--ON player_stats_test_df.player_id = all_nba_first_team_df.player_id
--LEFT JOIN all_nba_second_team_df
--ON player_stats_test_df.player_id = all_nba_second_team_df.player_id
--LEFT JOIN all_nba_third_team_df
--ON player_stats_test_df.player_id = all_nba_third_team_df.player_id
"""

# Run the query and display the resulting DataFrame as the cell output.
duckdb.query(query).df()

Unnamed: 0,firstName,lastName,full_name,player_id,gameId,gameDate,playerteamCity,playerteamName,opponentteamCity,opponentteamName,...,threePointersPercentage,freeThrowsAttempted,freeThrowsMade,freeThrowsPercentage,reboundsDefensive,reboundsOffensive,reboundsTotal,foulsPersonal,turnovers,plusMinusPoints
0,Buddy,Hield,Buddy Hield,1627741,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.000,0.0,0.0,0.0,3.0,1.0,4.0,1.0,1.0,-1.0
1,Gary,Payton II,Gary Payton II,1627780,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.500,0.0,0.0,0.0,1.0,2.0,3.0,1.0,2.0,3.0
2,De'Aaron,Fox,De'Aaron Fox,1628368,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.500,0.0,0.0,0.0,1.0,3.0,4.0,3.0,4.0,1.0
3,Luke,Kornet,Luke Kornet,1628436,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.000,2.0,2.0,1.0,3.0,2.0,5.0,1.0,1.0,-10.0
4,Jordan,McLaughlin,Jordan McLaughlin,1629162,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Udoka,Azubuike,Udoka Azubuike,1628962,22200646,2023-01-14 21:00:00,Utah,Jazz,Philadelphia,76ers,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,De'Anthony,Melton,De'Anthony Melton,1629001,22200646,2023-01-14 21:00:00,Philadelphia,76ers,Utah,Jazz,...,0.667,0.0,0.0,0.0,4.0,0.0,4.0,2.0,1.0,1.0
99997,Shake,Milton,Shake Milton,1629003,22200646,2023-01-14 21:00:00,Philadelphia,76ers,Utah,Jazz,...,0.600,0.0,0.0,0.0,4.0,0.0,4.0,3.0,1.0,3.0
99998,Jarred,Vanderbilt,Jarred Vanderbilt,1629020,22200646,2023-01-14 21:00:00,Utah,Jazz,Philadelphia,76ers,...,0.000,0.0,0.0,0.0,6.0,3.0,9.0,4.0,0.0,-5.0


In [220]:
# Display the third-team frame for inspection; at this point it is the sorted,
# ID-annotated version (player_id / player_name columns) written to CSV above.
all_nba_third_team_df

Unnamed: 0,player_id,player_name,Season,Team,Pos,Height,Weight,Age,Pre-Draft Team,Draft Yr,YOS
0,1630595,Cade Cunningham,2024-2025,Detroit Pistons,SF,6-6,220,23,Oklahoma State,2021,3
1,1630169,Tyrese Haliburton,2024-2025,Indiana Pacers,PG,6-5,185,25,Iowa State,2020,4
2,1631114,Jalen Williams,2024-2025,Oklahoma City Thunder,G,6-5,211,23,Santa Clara,2022,2
3,201935,James Harden,2024-2025,Los Angeles Clippers,SG,6-5,220,35,Arizona State,2009,15
4,1626157,Karl-Anthony Towns,2024-2025,New York Knicks,C,7-0,248,29,Kentucky,2015,9
...,...,...,...,...,...,...,...,...,...,...,...
180,305,Robert Parish,1988-1989,Boston Celtics,C,7-0,230,35,Centenary (LA),1976,11
181,1122,Dominique Wilkins,1988-1989,Atlanta Hawks,SF,6-7,200,29,Georgia,1982,6
182,107,Dale Ellis,1988-1989,Seattle SuperSonics,GF,6-7,205,28,Tennessee,1983,5
183,899,Mark Price,1988-1989,Cleveland Cavaliers,PG,6-0,170,25,Georgia Tech,1986,2
