In [2]:
# First, clear any existing large DataFrames from memory
try:
    del player_statistics_df
    import gc
    gc.collect()
    print("Memory cleared from previous DataFrame")
except NameError:
    print("No existing DataFrame found in memory")

# Remove old csv files if there are any
try:
    os.remove("player-statistics-new.csv")
    os.remove("player-statistics.csv")
except:
    pass


import pandas as pd
import unicodedata
import re
import duckdb
import wget
import os

# Download files
print("Downloading files...")
filename = 'player-statistics.csv'
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
wget.download(url)
print(f"\nDownloaded {filename}")

# Define the output file
output_file = 'player-statistics-new.csv'

# Process and save in chunks
chunk_size = 100000
first_chunk = True
processed_rows = 0

print(f"Processing data in chunks of {chunk_size} rows...")

for chunk_num, chunk in enumerate(pd.read_csv('player-statistics.csv', chunksize=chunk_size, low_memory=False)):
    
    if chunk_num > 0:
        break

    # Register current chunk with DuckDB
    duckdb.register('player_statistics_chunk', chunk)
    
    query = """
    SELECT *

/*    firstName
    ,lastName
    ,full_name
    ,player_id
    ,gameId
    ,gameDate AS DATE
    ,playerteamCity
    ,playerteamName
    ,opponentteamCity
    ,opponentteamName
    ,gameType
    ,gameLabel
    ,gameSubLabel
    ,seriesGameNumber
    ,win
    ,home
    ,numMinutes
    ,points
    ,assists
    ,blocks
    ,steals
    ,fieldGoalsAttempted
    ,fieldGoalsMade
    ,fieldGoalsPercentage
    ,threePointersAttempted
    ,threePointersMade
    ,threePointersPercentage
    ,freeThrowsAttempted
    ,freeThrowsMade
    ,freeThrowsPercentage
    ,reboundsDefensive
    ,reboundsOffensive
    ,reboundsTotal
    ,foulsPersonal
    ,turnovers
    ,plusMinusPoints
*/

    FROM player_statistics_chunk
    """
    
    # Execute query for this chunk
    result_chunk = duckdb.query(query).df()

    # Write to CSV (first chunk with header, subsequent chunks without)
    if first_chunk:
        result_chunk.to_csv(output_file, index=False, mode='w')
        first_chunk = False
    else:
        result_chunk.to_csv(output_file, index=False, mode='a', header=False)
    
    # Update progress
    processed_rows += len(result_chunk)
    print(f"Processed chunk {chunk_num+1} - Total rows: {processed_rows}")
    
    # Clean up to free memory
    duckdb.unregister('player_statistics_chunk')
    del chunk
    del result_chunk

print(f"All chunks processed. Total rows: {processed_rows}")
print(f"Results saved to {output_file}")


# Read in the player-statistics-new csv
player_statistics_df = pd.read_csv(output_file).head(20)
player_statistics_df


Memory cleared from previous DataFrame
Downloading files...

Downloaded player-statistics.csv
Processing data in chunks of 100000 rows...
Processed chunk 1 - Total rows: 100000
All chunks processed. Total rows: 100000
Results saved to player-statistics-new.csv


  player_statistics_df = pd.read_csv(output_file).head(20)


Unnamed: 0,firstName,lastName,full_name,player_id,gameId,gameDate,playerteamCity,playerteamName,opponentteamCity,opponentteamName,...,threePointersPercentage,freeThrowsAttempted,freeThrowsMade,freeThrowsPercentage,reboundsDefensive,reboundsOffensive,reboundsTotal,foulsPersonal,turnovers,plusMinusPoints
0,Buddy,Hield,Buddy Hield,1627741,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,3.0,1.0,4.0,1.0,1.0,-1.0
1,Gary,Payton II,Gary Payton II,1627780,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.5,0.0,0.0,0.0,1.0,2.0,3.0,1.0,2.0,3.0
2,De'Aaron,Fox,De'Aaron Fox,1628368,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.5,0.0,0.0,0.0,1.0,3.0,4.0,3.0,4.0,1.0
3,Luke,Kornet,Luke Kornet,1628436,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.0,2.0,2.0,1.0,3.0,2.0,5.0,1.0,1.0,-10.0
4,Jordan,McLaughlin,Jordan McLaughlin,1629162,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Keldon,Johnson,Keldon Johnson,1629640,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,5.0,0.0,5.0,2.0,1.0,2.0
6,Devin,Vassell,Devin Vassell,1630170,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.25,2.0,2.0,1.0,3.0,0.0,3.0,3.0,2.0,-16.0
7,Jonathan,Kuminga,Jonathan Kuminga,1630228,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Pat,Spencer,Pat Spencer,1630311,22500047,2025-11-14T21:30:00Z,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Lindy,Waters III,Lindy Waters III,1630322,22500047,2025-11-14T21:30:00Z,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:

query = """
SELECT
firstName
,lastName
,full_name
,player_id
,gameId
,CAST(gameDate AS DATE) as gameDate
,playerteamCity
,playerteamName
,opponentteamCity
,opponentteamName
,gameType
,gameLabel
,gameSubLabel
,seriesGameNumber
,win
,home
,numMinutes
,points
,assists
,blocks
,steals
,fieldGoalsAttempted
,fieldGoalsMade
,fieldGoalsPercentage
,threePointersAttempted
,threePointersMade
,threePointersPercentage
,freeThrowsAttempted
,freeThrowsMade
,freeThrowsPercentage
,reboundsDefensive
,reboundsOffensive
,reboundsTotal
,foulsPersonal
,turnovers
,plusMinusPoints
FROM 
player_statistics_df
"""

duckdb.query(query).df()

Unnamed: 0,firstName,lastName,full_name,player_id,gameId,gameDate,playerteamCity,playerteamName,opponentteamCity,opponentteamName,...,threePointersPercentage,freeThrowsAttempted,freeThrowsMade,freeThrowsPercentage,reboundsDefensive,reboundsOffensive,reboundsTotal,foulsPersonal,turnovers,plusMinusPoints
0,Buddy,Hield,Buddy Hield,1627741,22500047,2025-11-14,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,3.0,1.0,4.0,1.0,1.0,-1.0
1,Gary,Payton II,Gary Payton II,1627780,22500047,2025-11-14,Golden State,Warriors,San Antonio,Spurs,...,0.5,0.0,0.0,0.0,1.0,2.0,3.0,1.0,2.0,3.0
2,De'Aaron,Fox,De'Aaron Fox,1628368,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.5,0.0,0.0,0.0,1.0,3.0,4.0,3.0,4.0,1.0
3,Luke,Kornet,Luke Kornet,1628436,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.0,2.0,2.0,1.0,3.0,2.0,5.0,1.0,1.0,-10.0
4,Jordan,McLaughlin,Jordan McLaughlin,1629162,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Keldon,Johnson,Keldon Johnson,1629640,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,5.0,0.0,5.0,2.0,1.0,2.0
6,Devin,Vassell,Devin Vassell,1630170,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.25,2.0,2.0,1.0,3.0,0.0,3.0,3.0,2.0,-16.0
7,Jonathan,Kuminga,Jonathan Kuminga,1630228,22500047,2025-11-14,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Pat,Spencer,Pat Spencer,1630311,22500047,2025-11-14,Golden State,Warriors,San Antonio,Spurs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Lindy,Waters III,Lindy Waters III,1630322,22500047,2025-11-14,San Antonio,Spurs,Golden State,Warriors,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
import wget
import pandas as pd

In [3]:
# Download files
print("Downloading files...")
filename = 'nba-all-stars.csv'
url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
wget.download(url)
print(f"\nDownloaded {filename}")

Downloading files...

Downloaded nba-all-stars.csv


In [264]:
nba_all_stars_df = pd.read_csv('nba-all-stars.csv')
nba_all_stars_df

Unnamed: 0,Player,Total Selections,Selections
0,LeBron James,21,2005–2025
1,Kareem Abdul-Jabbar,19,1970–1977; 1979–1989
2,Kobe Bryant,18,1998; 2000–2016
3,Tim Duncan,15,1998; 2000–2011; 2013; 2015
4,Kevin Durant,15,2010–2019; 2021–2025
...,...,...,...
455,Jayson Williams,1,1998
456,Mo Williams,1,2009
457,Kevin Willis,1,1992
458,Metta World Peace,1,2004


In [277]:
import pandas as pd
import duckdb
import unicodedata

def remove_accents(text):
    """
    Remove accent marks from input text while preserving the base characters.
    Also handles special characters like Đ/đ.
    
    Example:
    "Nikola Đurišić" -> "Nikola Durisic"
    """
    # First, handle special characters that need specific replacements
    special_chars = {
        'Đ': 'D', 'đ': 'd',  # Serbian/Croatian D with stroke
        'Ł': 'L', 'ł': 'l',  # Polish L with stroke
        'Ø': 'O', 'ø': 'o',  # Danish/Norwegian O with stroke
        'Ŧ': 'T', 'ŧ': 't',  # Sami T with stroke
        'Æ': 'AE', 'æ': 'ae',  # Æ/æ ligature
        'Œ': 'OE', 'œ': 'oe',  # Œ/œ ligature
        'ß': 'ss',  # German eszett
    }
    
    for char, replacement in special_chars.items():
        text = text.replace(char, replacement)
    
    # Normalize the text to decompose characters into base character and accent mark
    normalized_text = unicodedata.normalize('NFKD', text)
    
    # Filter out the non-spacing marks (accent marks)
    result = ''.join(c for c in normalized_text if not unicodedata.category(c).startswith('Mn'))
    
    return result

def expand_selection_years(selection_years_string):
    '''
    Takes in a Dataframe column of years summarized and processes it to create a new dataframe column with
    all years spelled out 
    '''
    # Normalize non-breaking spaces to regular spaces
    selection_years_string = selection_years_string.replace('\xa0', ' ')
    dash_characters = ["-", "–", "—"] # Hyphen-minus, En Dash, Em Dash
    selections_list = selection_years_string.split("; ")

    consecutive_selection_years = []
    consecutive_selection_years_delimiters = []
    non_consecutive_selection_years = []
    non_consecutive_selection_years_delimiters = []

    # Generate a df for consecutive selection years
    for i in selections_list:
        for dash in dash_characters:
            if dash in i:
                consecutive_selection_years.append(i)
                consecutive_selection_years_delimiters.append(dash)
            else:
                non_consecutive_selection_years.append(i)
    non_consecutive_selection_years = list(set(non_consecutive_selection_years) - set(consecutive_selection_years))
    consecutive_selection_years_df = pd.DataFrame(zip(consecutive_selection_years,consecutive_selection_years_delimiters), columns=['selection_years', 'delimiter'])

    # Generate a df for non-consecutive selection years
    for i in non_consecutive_selection_years:
        non_consecutive_selection_years_delimiters.append("not_applicable")
    non_consecutive_selection_years_df = pd.DataFrame(zip(non_consecutive_selection_years,non_consecutive_selection_years_delimiters), columns=['selection_years', 'delimiter'])

    # Vertically concatenate both dataframes
    selection_years_df = pd.concat([consecutive_selection_years_df,non_consecutive_selection_years_df])

    query = """
    SELECT *
    ,CASE WHEN delimiter != 'not_applicable' THEN LEFT(selection_years, POSITION(delimiter IN selection_years) - 1) END AS min_year
    ,CASE WHEN delimiter != 'not_applicable' THEN RIGHT(selection_years, POSITION(delimiter IN selection_years) - 1) END AS max_year
    FROM selection_years_df
    """

    output = duckdb.query(query).df()

    # Extract all years that aren't within a range of numbers
    single_years = output[output.delimiter == "not_applicable"].selection_years.to_list()
    single_years = [int(year) for year in single_years]

    # Extract all years that are within a range of numbers
    selection_range_years = []
    selection_range = output[output.delimiter != "not_applicable"][['min_year','max_year']].values.tolist()
    for row in selection_range:
        for i in range(int(row[0]),int(row[1]) + 1):
            selection_range_years.append(i)

    # Combine lists
    all_years = list(set(single_years + selection_range_years))

    return all_years

def process_nba_all_stars(csv_file):
    nba_all_stars_df = pd.read_csv(csv_file)
    nba_all_stars_df["Selection Years"] = nba_all_stars_df["Selections"].apply(lambda x: expand_selection_years(x))
    nba_all_stars_df = nba_all_stars_df[['Player', 'Total Selections', 'Selection Years']]
    nba_all_stars_df = nba_all_stars_df.explode('Selection Years').reset_index(drop=True).rename(columns={'Selection Years':'Selection Year'})
    nba_all_stars_df ["Player"] = nba_all_stars_df["Player"].apply(remove_accents)
    
    return nba_all_stars_df

filename = 'nba-all-stars.csv'
nba_all_stars_df = process_nba_all_stars(filename)
nba_all_stars_df

Unnamed: 0,Player,Total Selections,Selection Year
0,LeBron James,21,2005
1,LeBron James,21,2006
2,LeBron James,21,2007
3,LeBron James,21,2008
4,LeBron James,21,2009
...,...,...,...
1816,Jayson Williams,1,1998
1817,Mo Williams,1,2009
1818,Kevin Willis,1,1992
1819,Metta World Peace,1,2004


In [278]:
query = """
SELECT * FROM nba_all_stars_df
WHERE Player LIKE '%Kristaps%'

"""

duckdb.query(query).df()

Unnamed: 0,Player,Total Selections,Selection Year
0,Kristaps Porzingis,1,2018
