In [171]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import spacy

In [172]:
# Input files: historical (training) data, the pre-processed 2025 class, and
# the raw 2025 prospect sheet.
# NOTE(review): the raw prospect path is absolute and user-specific — confirm
# it exists on the machine running this notebook.
train_csv_path = 'df_processed.csv'
prospects_processed_csv_path = 'df_2025_processed.csv'
prospects_raw_csv_path = "/Users/ethantsao/NBADraft1/103NBAProspects.csv"

df_processed = pd.read_csv(train_csv_path)
df_2025_processed = pd.read_csv(prospects_processed_csv_path)
df_2025 = pd.read_csv(prospects_raw_csv_path)

In [173]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# === Preprocessing Function ===
def preprocess(df, is_train=True, label_encoders=None):
    """Encode 'Team', coerce every column to numeric, and add 'Star_Score'.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Player statistics. Must contain 'PPG', 'MPG', 'RPG', 'APG', 'FG%',
        '3P%' and 'FT%'. A 'Team' column is optional.
    is_train : bool, default True
        When True, a new LabelEncoder is fitted on 'Team' and stored under
        ``label_encoders['Team']``. When False, the stored encoder is reused
        and team values it has not seen are encoded as -1.
    label_encoders : dict or None, default None
        Mapping of column name to fitted encoder. It is updated in place
        when ``is_train`` is True. ``None`` means "start with an empty dict".

    Returns
    -------
    (pandas.DataFrame, dict)
        The processed copy of ``df`` and the encoder mapping.

    Raises
    ------
    ValueError
        If ``is_train`` is False, 'Team' is present, and no encoder for
        'Team' is available.
    KeyError
        If any column required for 'Star_Score' is missing.

    Notes
    -----
    Every column is passed through ``pd.to_numeric(errors='coerce')`` and
    NaNs are then replaced by 0, so non-numeric text columns end up as
    all-zero columns in the result.
    """
    # A mutable default ({}) is created once and shared by every call, which
    # would leak a fitted encoder from one call into the next. Build a fresh
    # dict per call instead.
    if label_encoders is None:
        label_encoders = {}

    df = df.copy()  # Avoid modifying the original DataFrame

    # === Encode categorical features ===
    if 'Team' in df.columns:
        # Treat every team value as a string so fit and transform see the
        # same representation.
        df['Team'] = df['Team'].astype(str)

        if is_train:
            le = LabelEncoder()
            df['Team'] = le.fit_transform(df['Team'])
            label_encoders['Team'] = le
        else:
            if 'Team' not in label_encoders:
                raise ValueError("LabelEncoder for 'Team' is missing!")
            le = label_encoders['Team']

            # Transform all known teams in one call; unseen teams stay at -1.
            known = df['Team'].isin(le.classes_)
            encoded = pd.Series(-1, index=df.index, dtype='int64')
            if known.any():
                encoded.loc[known] = le.transform(df.loc[known, 'Team'].to_numpy())
            df['Team'] = encoded

    # Convert every column to numeric; values that cannot be parsed become NaN
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Handle missing values
    df = df.fillna(0)  # Replace NaNs with 0 (or use df.mean() for numerical imputation)

    # === Add Star Score Feature ===
    star_inputs = ['PPG', 'MPG', 'RPG', 'APG', 'FG%', '3P%', 'FT%']
    missing = [name for name in star_inputs if name not in df.columns]
    if missing:
        raise KeyError(f"Cannot compute 'Star_Score'; missing columns: {missing}")

    df['Star_Score'] = (
        df['PPG'] * 2 +   # Weight scoring higher
        df['MPG'] * 1.5 +
        df['RPG'] +
        df['APG'] +
        df['FG%'] * 10 +  # Shooting efficiency
        df['3P%'] * 8 +
        df['FT%'] * 5
    )

    return df, label_encoders

# === Check if 'overall_pick' Exists ===
print("Columns in df_processed:", df_processed.columns)  # Debugging step

# === Preprocess Training & Test Data ===
# NOTE: this cell rebinds df_processed / df_2025_processed to their
# preprocessed versions. Re-running it without reloading the CSVs feeds
# already-encoded data back through preprocess(), which refits the 'Team'
# encoder on the stringified codes.
label_encoders = {}  # Store encoders for consistency

df_processed, label_encoders = preprocess(df_processed, is_train=True, label_encoders=label_encoders)
df_2025_processed, _ = preprocess(df_2025_processed, is_train=False, label_encoders=label_encoders)

# === Identify Correct Target Column ===
target_col = 'overall_pick'
if target_col not in df_processed.columns:
    # Fail here with a clear message instead of silently ignoring the drop
    # and hitting an opaque KeyError on the next line.
    raise KeyError(f"Target column '{target_col}' not found in df_processed")

# === Define Features (X) and Target (y) ===
X_train = df_processed.drop(columns=[target_col])
y_train = df_processed[target_col]

# Use exactly the training feature columns, in the same order, for the test
# matrix. Any extra column in df_2025_processed (for example
# 'Predicted Draft Order', which this cell adds below) would otherwise make
# the scaler and model see a different feature layout than they were fitted on.
feature_cols = list(X_train.columns)
missing_features = [col for col in feature_cols if col not in df_2025_processed.columns]
if missing_features:
    raise KeyError(f"df_2025_processed is missing feature columns: {missing_features}")

X_test = df_2025_processed[feature_cols]  # Test data does not have a draft order yet

# Scale Features (fit on training data only, then apply the same scaling to test)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# === Train Random Forest Model ===
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# === Evaluate on Training Data ===
# These metrics are computed on the same rows the model was fitted on, so
# they measure fit to the training set rather than held-out performance.
y_train_pred = rf_model.predict(X_train)
mae = mean_absolute_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)

print(f"Training MAE: {mae:.4f}")
print(f"Training R² Score: {r2:.4f}")

# === Predict Draft Order for 2025 Class ===
y_test_pred = rf_model.predict(X_test)

# Add Predictions to Test Data
df_2025_processed['Predicted Draft Order'] = y_test_pred

# Sort Players by Predicted Draft Order
df_2025_sorted = df_2025_processed.sort_values(by='Predicted Draft Order')

# Display Top 10 Predicted Picks
print(df_2025_sorted[['Predicted Draft Order']].head(10))

Columns in df_processed: Index(['Player', 'Team', 'GP', 'MPG', 'PPG', 'FG%', '3P%', 'FT%', 'RPG', 'APG',
       'overall_pick'],
      dtype='object')
Training MAE: 4.3235
Training R² Score: 0.8972
    Predicted Draft Order
11                  3.840
16                  4.435
0                   4.720
2                   5.420
5                   6.200
1                   6.860
6                   7.070
41                  9.280
49                  9.340
13                 10.070


In [174]:
# Column assignment aligns the two frames by index label. If the indexes
# differ (rows dropped or reordered in one of them), names would be attached
# to the wrong rows or come out as NaN without any error, so verify first.
if not df_2025_processed.index.equals(df_2025.index):
    raise ValueError(
        "df_2025_processed and df_2025 do not share the same index; "
        "player names cannot be attached by row label."
    )

# Replace the 'Player' column in df_2025_processed with the actual player names
df_2025_processed['Player'] = df_2025['Player']

# Now sort by 'Predicted Draft Order' (ascending) and get the player names
sorted_df = df_2025_processed.sort_values(by='Predicted Draft Order')

# Extract the player names based on the sorted order
sorted_player_names = sorted_df['Player'].tolist()

In [175]:
# Show the 2025 player names in ascending order of 'Predicted Draft Order'.
sorted_player_names

['Nique Clifford',
 'Hansen Yang',
 'Ace Bailey',
 'Kam Jones',
 'Ryan Kalkbrenner',
 'Cooper Flagg',
 'Hunter Sallis',
 'Ian Schieffelin',
 'Coleman Hawkins',
 'Xaivian Lee',
 'Dylan Harper',
 'Collin Murray-Boyles',
 'Kadary Richmond',
 'Saint Thomas',
 'Grant Nelson',
 'Johni Broome',
 'Jaden Bradley',
 'Kasparas Jakucionis',
 'Walter Clayton, Jr.',
 'Mark Sears',
 'Caleb Love',
 'Alex Karaban',
 'Tyon Grant-Foster',
 'Baye Ndongo',
 'Tucker DeVries',
 'Boogie Fland',
 'Kylan Boswell',
 'Rasheer Fleming',
 'Matthew Cleveland',
 'Adou Thiero',
 'Darrion Williams',
 'Tre Johnson',
 'Derik Queen',
 'V.J. Edgecombe',
 'Danny Wolf',
 'Jamir Watkins',
 'Augustas Marciulionis',
 'Liam McNeeley',
 'Baba Miller',
 'Egor Demin',
 'Payton Sandfort',
 'Matthew Murrell',
 'Tomislav Ivisic',
 'Elliot Cadeau',
 'Nolan Traore',
 'Noa Essengue',
 'Bogoljub Markovic',
 'Noah Penda',
 'Donnie Freeman',
 'Alex Condon',
 'J.T. Toppin',
 'Kon Knueppel',
 'Asa Newell',
 'K.J. Lewis',
 'Chaz Lanier',
 'Aar