In [1]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install -U gus-nba-tools

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [33]:
pip install nba_api




In [4]:
import pandas as pd
import numpy as np
from gus_nba_tools import NBASeasonDataCollector
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
import optuna
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
collector = NBASeasonDataCollector(2015, 2024)

In [6]:
seasons_data = collector.collect_all_seasons(save_csv=True)


Building dataset for 2015-16...
  - Fetching NBA advanced Per100 stats for 2015-16...
  - Fetching BBRef positions from https://www.basketball-reference.com/leagues/NBA_2016_per_game.html...
  - Merged 538/570 players with BBRef positions.
  - After minutes filter: 459 players remain.
  - Saved: nba_2015_2016_per100_adv_gwb.csv

Building dataset for 2016-17...
  - Fetching NBA advanced Per100 stats for 2016-17...
  - Fetching BBRef positions from https://www.basketball-reference.com/leagues/NBA_2017_per_game.html...
  - Merged 545/584 players with BBRef positions.
  - After minutes filter: 457 players remain.
  - Saved: nba_2016_2017_per100_adv_gwb.csv

Building dataset for 2017-18...
  - Fetching NBA advanced Per100 stats for 2017-18...
  - Fetching BBRef positions from https://www.basketball-reference.com/leagues/NBA_2018_per_game.html...
  - Merged 620/654 players with BBRef positions.
  - After minutes filter: 465 players remain.
  - Saved: nba_2017_2018_per100_adv_gwb.csv

Buildi

In [7]:
df = collector.get_combined_data()
df.to_csv('2015-2025_player_data.csv', index=True)

In [9]:
df = df.drop_duplicates(subset=['PLAYER_NAME', 'Season'])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3993 entries, 0 to 5113
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PLAYER_ID    3993 non-null   int64  
 1   PLAYER_NAME  3993 non-null   object 
 2   TEAM_ID      3993 non-null   int64  
 3   GP           3993 non-null   int64  
 4   MIN          3993 non-null   float64
 5   PTS          3993 non-null   float64
 6   REB          3993 non-null   float64
 7   AST          3993 non-null   float64
 8   STL          3993 non-null   float64
 9   BLK          3993 non-null   float64
 10  TOV          3993 non-null   float64
 11  FG_PCT       3993 non-null   float64
 12  FG3_PCT      3993 non-null   float64
 13  FT_PCT       3993 non-null   float64
 14  FGA          3993 non-null   float64
 15  FG3A         3993 non-null   float64
 16  USG_PCT      3993 non-null   float64
 17  AST_PCT      3993 non-null   float64
 18  REB_PCT      3993 non-null   float64
 19  OREB_PCT   

In [12]:
df['position_label'] = df['Position3'].map({'Guard': 0, 'Wing': 1, 'Big': 2})

In [13]:
feature_cols = [
    col for col in df.columns 
    if col not in ["PLAYER_ID", "PLAYER_NAME", "TEAM_ID", "Position3", "Position", "position_label", "Pos", "SEASON", "START_YEAR"]
]

In [14]:
X_train = df.loc[df.Season <= 2023][feature_cols]
y_train = df.loc[df.Season <= 2023]["position_label"]

In [15]:
X_test = df.loc[df.Season == 2024][feature_cols]
y_test = df.loc[df.Season == 2024]["position_label"]

In [16]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index= X_test.index)

In [17]:
def objective_svm(trial):
    base_params = {
        "C": trial.suggest_float("C", 0.01, 10.0, log=True),
        "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly"]),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
        "probability": True}
    
    model = SVC(**base_params)
    
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="accuracy").mean()
    return score
    
study_svm = optuna.create_study(direction="maximize")
study_svm.optimize(objective_svm, n_trials=50)
best_params_svm = study_svm.best_params


[I 2026-01-13 18:43:33,999] A new study created in memory with name: no-name-0d2eea16-fda0-4b3a-8fb2-b2a2e4f2236c
[I 2026-01-13 18:43:41,179] Trial 0 finished with value: 0.7986873000614589 and parameters: {'C': 2.780472334152691, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 0 with value: 0.7986873000614589.
[I 2026-01-13 18:43:44,752] Trial 1 finished with value: 0.767816789321903 and parameters: {'C': 0.6301398162006213, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 0 with value: 0.7986873000614589.
[I 2026-01-13 18:43:48,664] Trial 2 finished with value: 0.798970562743275 and parameters: {'C': 0.9326070896231853, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 2 with value: 0.798970562743275.
[I 2026-01-13 18:43:52,074] Trial 3 finished with value: 0.7672565674393684 and parameters: {'C': 0.4119335068650199, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 2 with value: 0.798970562743275.
[I 2026-01-13 18:43:55,487] Trial 4 finished with value: 0.7759530075484186 and

In [18]:
print(best_params_svm)

{'C': 0.5999833379543412, 'kernel': 'linear', 'gamma': 'auto'}


In [19]:
def objective_lgbm(trial):
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
    }
    model = LGBMClassifier(**params, verbose = -1)
    
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="accuracy").mean()
    return score

study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(objective_lgbm, n_trials=50)
best_params_lgbm = study_lgbm.best_params

[I 2026-01-13 18:47:12,726] A new study created in memory with name: no-name-269b2787-d35e-42dd-8430-95e25688a49f
found 0 physical cores < 1
  File "C:\Users\OllyL-S\AppData\Local\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
[I 2026-01-13 18:47:14,827] Trial 0 finished with value: 0.7860539420394913 and parameters: {'num_leaves': 62, 'max_depth': 6, 'learning_rate': 0.19302090164813276, 'n_estimators': 125}. Best is trial 0 with value: 0.7860539420394913.
[I 2026-01-13 18:47:23,623] Trial 1 finished with value: 0.7821276612509258 and parameters: {'num_leaves': 103, 'max_depth': 14, 'learning_rate': 0.14994145061591924, 'n_estimators': 411}. Best is trial 0 with value: 0.7860539420394913.
[I 2026-01-13 18:47:30,632] Trial 2 finished with value: 0.7826867012307547 and parameters: {'num_leaves': 149, 'max_depth': 7, 'learning_rate': 0.21725942309679902, 'n_es

In [20]:
print(best_params_lgbm)

{'num_leaves': 115, 'max_depth': 11, 'learning_rate': 0.23642005529963217, 'n_estimators': 236}


In [21]:
def objective_logreg(trial):
    params = {
        "C": trial.suggest_float("C", 0.01, 10.0, log=True), 
        "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear"]),
        "max_iter": 10000
    }
    model = LogisticRegression(**params)
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="accuracy").mean()
    return score

study_logreg = optuna.create_study(direction="maximize")
study_logreg.optimize(objective_logreg, n_trials=50)
best_params_logreg = study_logreg.best_params

[I 2026-01-13 18:50:48,439] A new study created in memory with name: no-name-e63efadb-9ec0-49cd-a541-973ee8697133
[I 2026-01-13 18:50:48,855] Trial 0 finished with value: 0.79167507130813 and parameters: {'C': 2.289923594955042, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.79167507130813.
[I 2026-01-13 18:50:49,065] Trial 1 finished with value: 0.7810131269993853 and parameters: {'C': 1.006721940313077, 'solver': 'liblinear'}. Best is trial 0 with value: 0.79167507130813.
[I 2026-01-13 18:50:49,192] Trial 2 finished with value: 0.7939214743842287 and parameters: {'C': 0.25618707136136604, 'solver': 'lbfgs'}. Best is trial 2 with value: 0.7939214743842287.
[I 2026-01-13 18:50:49,337] Trial 3 finished with value: 0.7812928439730842 and parameters: {'C': 0.09196154766319352, 'solver': 'liblinear'}. Best is trial 2 with value: 0.7939214743842287.
[I 2026-01-13 18:50:49,579] Trial 4 finished with value: 0.7944824842019005 and parameters: {'C': 7.1257583856074875, 'solver': 'lbfgs'}. Be

In [22]:
print(best_params_logreg)

{'C': 0.5051828297976904, 'solver': 'lbfgs'}


In [23]:
lgbm_best = LGBMClassifier(**best_params_lgbm)
logreg_best = LogisticRegression(**best_params_logreg)
svm_best = SVC(**best_params_svm, probability=True)

lgbm_best.fit(X_train_scaled, y_train)
logreg_best.fit(X_train_scaled, y_train)
svm_best.fit(X_train_scaled, y_train)

In [24]:
oof_lgbm = cross_val_predict(lgbm_best, X_train_scaled, y_train, cv=5, method="predict_proba")
oof_logreg = cross_val_predict(logreg_best, X_train_scaled, y_train, cv=5, method="predict_proba")
oof_svm = cross_val_predict(svm_best, X_train_scaled, y_train, cv=5, method="predict_proba")

Z_train = np.hstack([oof_lgbm, oof_logreg, oof_svm])

meta = LogisticRegression(max_iter=1000)
meta.fit(Z_train, y_train)

In [25]:
P_test = np.hstack([
    lgbm_best.predict_proba(X_test_scaled),
    logreg_best.predict_proba(X_test_scaled),
    svm_best.predict_proba(X_test_scaled)
])


In [26]:
y_pred = meta.predict(P_test)

In [27]:
print(y_test.value_counts())


position_label
0    197
2    165
1     69
Name: count, dtype: int64


In [28]:
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)


Accuracy: 0.7888631090487239

Confusion Matrix:
 [[169  19   9]
 [ 24  27  18]
 [  5  16 144]]


In [29]:
probs = meta.predict_proba(P_test)

result = X_test.copy()
result["PLAYER_NAME"] = df.loc[X_test.index, "PLAYER_NAME"]
result["True_Label"] = y_test
result["Pred_Label"] = y_pred
result["P_Guard"] = probs[:, 1]
result["P_Wing"] = probs[:, 2]
result["P_Big"] = probs[:, 0]

result["entropy"] = -(
    result["P_Guard"] * np.log(result["P_Guard"] + 1e-9) +
    result["P_Wing"] * np.log(result["P_Wing"] + 1e-9) +
    result["P_Big"] * np.log(result["P_Big"] + 1e-9)
)

hybrids = pd.DataFrame(result.sort_values("entropy", ascending=False))

In [30]:
hybrids.head(10)

Unnamed: 0,GP,MIN,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FG3_PCT,...,DEF_RATING,NET_RATING,Season,PLAYER_NAME,True_Label,Pred_Label,P_Guard,P_Wing,P_Big,entropy
4719,68,47.4,14.6,9.9,9.1,2.4,1.7,4.2,0.424,0.325,...,108.8,6.6,2024,Draymond Green,2,2,0.318541,0.393036,0.288424,1.090055
4773,66,46.1,15.7,8.9,5.4,2.1,0.8,2.9,0.475,0.423,...,111.2,3.8,2024,Jake LaRavia,2,2,0.37572,0.393314,0.230966,1.073293
4750,82,47.8,21.7,6.6,2.9,0.8,0.3,1.1,0.508,0.433,...,115.6,-1.2,2024,Harrison Barnes,2,1,0.441596,0.293334,0.265071,1.07265
5099,36,47.7,14.4,9.1,4.0,1.9,0.5,2.6,0.468,0.341,...,112.0,-12.4,2024,Wendell Moore Jr.,0,2,0.385613,0.387839,0.226548,1.071184
4709,54,47.4,11.6,10.4,5.0,1.4,0.3,2.4,0.383,0.254,...,102.9,6.0,2024,Dillon Jones,1,1,0.417663,0.353201,0.229136,1.069854
4751,74,49.3,13.0,6.8,3.0,1.8,1.0,1.5,0.458,0.382,...,110.4,1.4,2024,Haywood Highsmith,1,1,0.442612,0.329723,0.227665,1.063501
4715,63,49.0,14.8,6.7,2.4,1.6,0.6,1.5,0.448,0.411,...,110.3,8.8,2024,Dorian Finney-Smith,2,1,0.440281,0.334321,0.225398,1.063299
4574,60,48.1,28.1,7.0,4.1,1.6,1.3,2.6,0.448,0.374,...,111.7,-2.3,2024,Andrew Wiggins,1,1,0.458391,0.223404,0.318205,1.056756
4772,66,48.3,20.1,10.2,5.9,2.2,0.5,3.5,0.461,0.311,...,113.9,-6.0,2024,Jaime Jaquez Jr.,1,2,0.396587,0.403869,0.199544,1.05457
5092,60,48.5,19.3,8.1,4.5,1.3,0.3,2.7,0.406,0.351,...,113.4,-10.2,2024,Tyrese Martin,0,1,0.452962,0.210734,0.336304,1.053352


In [31]:
decode = {0: "Guard", 1: "Wing", 2: "Big"}

hybrids["True_Pos"] = hybrids["True_Label"].map(decode)
hybrids["Pred_Pos"] = hybrids["Pred_Label"].map(decode)

In [32]:
hybrids[['PLAYER_NAME', 'True_Pos', 'Pred_Pos', 'P_Guard', 'P_Wing', 'P_Big', 'entropy']].head(10).to_csv('final_positionless.csv', index = False)