In [9]:
# Drop any large DataFrame left over from an earlier run of this cell so its
# memory can be reclaimed before the new data is loaded.
try:
    del player_statistics_df
except NameError:
    # The name was never bound in this session, so there is nothing to free.
    print("No existing DataFrame found in memory")
else:
    import gc
    gc.collect()
    print("Memory cleared from previous DataFrame")

import pandas as pd
import unicodedata
import re
import duckdb
import wget
from google.cloud import storage
import os

# Letters that Unicode normalization does not split into "base letter +
# combining mark", so they have to be mapped to ASCII explicitly.
_SPECIAL_CHAR_TABLE = str.maketrans({
    'Đ': 'D', 'đ': 'd', 'Ł': 'L', 'ł': 'l', 'Ø': 'O', 'ø': 'o',
    'Ŧ': 'T', 'ŧ': 't', 'Æ': 'AE', 'æ': 'ae', 'Œ': 'OE', 'œ': 'oe',
    'ß': 'ss',
})


def remove_accents(text):
    """
    Strip accent marks from *text*, keeping the underlying base letters.

    Characters listed in the special-character table (Đ/đ, Ł/ł, Ø/ø, Ŧ/ŧ,
    Æ/æ, Œ/œ, ß) are first replaced by their ASCII spelling. The string is
    then NFKD-normalized and every nonspacing mark (Unicode category ``Mn``)
    is removed.

    Non-string input (e.g. ``None`` or a number) is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    decomposed = unicodedata.normalize('NFKD', text.translate(_SPECIAL_CHAR_TABLE))
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

# Download the three input CSVs from the public bucket into the working
# directory.
print("Downloading files...")
for filename in ('playerstatistics.csv', 'name_mappings.csv', 'nba_player_lookup.csv'):
    url = f'https://storage.googleapis.com/nba_award_predictor/nba_data/{filename}'
    # wget.download() never overwrites: when `filename` already exists (e.g.
    # the cell is re-run) it saves the new copy as "name (1).csv" and returns
    # that path. Move the fresh download over the canonical name so the
    # pd.read_csv() calls further down do not silently read a stale file.
    saved_path = wget.download(url)
    if saved_path != filename:
        os.replace(saved_path, filename)
    print(f"\nDownloaded {filename}")

# The two reference tables are loaded into memory whole; only the player
# statistics file is streamed in chunks.
name_mapping_df = pd.read_csv('name_mappings.csv')
nba_player_lookup_df = pd.read_csv('nba_player_lookup.csv')

# Strip accents from the lookup names, which are the join key used to attach
# player ids to the statistics rows.
nba_player_lookup_df["player_name"] = nba_player_lookup_df["player_name"].map(remove_accents)

# Expose both tables to DuckDB under the names the SQL query refers to.
for table_name, table in (
    ('name_mapping_df', name_mapping_df),
    ('nba_player_lookup_df', nba_player_lookup_df),
):
    duckdb.register(table_name, table)

# Destination for the enriched statistics, written incrementally.
output_file = 'player-statistics.csv'

# Stream the statistics file in fixed-size pieces so it never has to fit in
# memory all at once.
chunk_size = 100000
first_chunk = True
processed_rows = 0

print(f"Processing data in chunks of {chunk_size} rows...")

# The query and the drop list are the same for every chunk, so build them once.
# The query maps each row's full name through name_mapping_df (falling back to
# the raw full name when there is no mapping) and then looks up the player id.
# The looked-up id is exposed as player_id_1 so it cannot be confused with any
# other player_id column coming through the SELECT *.
# NOTE(review): both joins are plain LEFT JOINs, so a name that matches more
# than one row in either reference table duplicates the statistics row. The
# logged output shows this happening (the first 100000-row chunk produced
# 100376 rows) -- confirm whether the duplicates are acceptable downstream.
query = """
    WITH CTE AS (
        SELECT * FROM player_statistics_chunk
        LEFT JOIN name_mapping_df
        ON player_statistics_chunk.full_name = name_mapping_df.in_table_name
    )
    ,CTE2 AS (
        SELECT *,
        CASE WHEN nba_lookup_name IS NULL THEN full_name
        ELSE nba_lookup_name
        END AS player_full_name
        FROM CTE
    )
    
    SELECT 
        CTE2.*,
        nba_player_lookup_df.player_id AS player_id_1
    FROM CTE2
    LEFT JOIN nba_player_lookup_df
    ON CTE2.player_full_name = nba_player_lookup_df.player_name
    """

# Helper/join columns that must not appear in the output file.
columns_to_drop = ['full_name', 'in_table_name', 'nba_lookup_name', 'player_id',
                   'Unnamed: 3', 'player_full_name']

chunk_reader = pd.read_csv('playerstatistics.csv', chunksize=chunk_size, low_memory=False)
for chunk_num, chunk in enumerate(chunk_reader):
    # Join key: "<firstName> <lastName>"
    chunk['full_name'] = chunk['firstName'] + ' ' + chunk['lastName']

    # Make the current chunk visible to the query, then run it
    duckdb.register('player_statistics_chunk', chunk)
    result_chunk = duckdb.query(query).df()

    # Remove the helper columns; ones that are absent are simply skipped
    result_chunk = result_chunk.drop(columns=columns_to_drop, errors='ignore')

    # Give the looked-up id its final name
    if 'player_id_1' in result_chunk.columns:
        result_chunk = result_chunk.rename(columns={'player_id_1': 'player_id'})

    # Move player_id to the front, keeping the other columns in order
    if 'player_id' in result_chunk.columns:
        remaining_columns = list(result_chunk.columns)
        remaining_columns.remove('player_id')
        result_chunk = result_chunk[['player_id'] + remaining_columns]

    # The first chunk creates the file and writes the header; later chunks
    # are appended without one.
    result_chunk.to_csv(
        output_file,
        index=False,
        mode='w' if first_chunk else 'a',
        header=first_chunk,
    )
    first_chunk = False

    # Report progress
    processed_rows += len(result_chunk)
    print(f"Processed chunk {chunk_num+1} - Total rows: {processed_rows}")

    # Release this chunk before the next one is read
    duckdb.unregister('player_statistics_chunk')
    del chunk
    del result_chunk

print(f"All chunks processed. Total rows: {processed_rows}")
print(f"Results saved to {output_file}")

# Upload the processed file to Google Cloud Storage (best effort: a failure is
# reported but does not abort the cell).
print("Uploading file to Google Cloud Storage...")

# Path to your credentials file
credentials_path = 'cis-5450-final-project-485661e2f371.json'

# Target bucket and object name. The object name is derived from the local
# output file so the processed player statistics land under their own name.
# The previous hard-coded 'nba_data/common-player-info.csv' belonged to a
# different dataset and would have been overwritten with player statistics.
bucket_name = 'nba_award_predictor'
blob_name = f'nba_data/{output_file}'

try:
    # Set up the client with your credentials
    storage_client = storage.Client.from_service_account_json(credentials_path)
    bucket = storage_client.bucket(bucket_name)

    # Define blob (file in GCS) and upload from the local file
    blob = bucket.blob(blob_name)
    blob.cache_control = "max-age=0"
    blob.upload_from_filename(output_file)

    print(f"File successfully uploaded to gs://{bucket_name}/{blob_name}")

    # Get file size for confirmation
    file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"Uploaded file size: {file_size_mb:.2f} MB")

except Exception as e:
    # Deliberately broad: credentials, network, and permission errors are all
    # reported the same way and the cell is allowed to finish.
    print(f"Error uploading to GCS: {e}")
    print("You may need to update the credentials file.")

print("Process complete!")

No existing DataFrame found in memory
Downloading files...

Downloaded playerstatistics.csv

Downloaded name_mappings.csv

Downloaded nba_player_lookup.csv
Processing data in chunks of 100000 rows...
Processed chunk 1 - Total rows: 100376
Processed chunk 2 - Total rows: 200496
Processed chunk 3 - Total rows: 300601
Processed chunk 4 - Total rows: 401344
Processed chunk 5 - Total rows: 502681
Processed chunk 6 - Total rows: 603419
Processed chunk 7 - Total rows: 704469
Processed chunk 8 - Total rows: 805494
Processed chunk 9 - Total rows: 907237
Processed chunk 10 - Total rows: 1010221
Processed chunk 11 - Total rows: 1113653
Processed chunk 12 - Total rows: 1216620
Processed chunk 13 - Total rows: 1320887
Processed chunk 14 - Total rows: 1424425
Processed chunk 15 - Total rows: 1526924
Processed chunk 16 - Total rows: 1627956
Processed chunk 17 - Total rows: 1661388
All chunks processed. Total rows: 1661388
Results saved to player-statistics.csv
Uploading file to Google Cloud Storage..