# NBA Position Modelling

### Install Irregular Packages

In [None]:
%pip install optuna

In [None]:
pip install -U gus-nba-tools

In [None]:
pip install lightgbm

In [None]:
# Dependency of gus_nba_tools
pip install nba_api

### Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
from gus_nba_tools import NBASeasonDataCollector                                         # Data collector
import optuna                                                                            # Bayesian optimisation of hyperparameter searches
from lightgbm import LGBMClassifier                                                      # Light Gradient Boosting Machine
from sklearn.preprocessing import StandardScaler                                         # Scaler
from sklearn.model_selection import cross_val_score, cross_val_predict                   # Cross validation score for optimisation and predict for probabilities
from sklearn.linear_model import LogisticRegression                                      # Logistic regression model
from sklearn.svm import SVC                                                              # Support Vector Machine Model
from sklearn.metrics import accuracy_score, confusion_matrix                             # Model metrics

## Data Collection and Validity Check

In [None]:
# Beginning season 2015-16, end season 2024-25
# The two integers are the start years of the first and last season to collect
collector = NBASeasonDataCollector(2015, 2024)

In [None]:
# Collect every season in the configured range.
# NOTE(review): save_csv=True presumably also writes the collected data to CSV - confirm in gus_nba_tools.
# seasons_data is not referenced again in this notebook; the combined frame is read from the collector instead.
seasons_data = collector.collect_all_seasons(save_csv=True)

In [None]:
# Pull all collected seasons into a single DataFrame and snapshot it to disk (row index included)
df = collector.get_combined_data()
df.to_csv(path_or_buf='2015-2025_player_data.csv', index=True)

In [None]:
# A player who changes team mid-season is logged more than once for that season;
# keep only the first row per (player name, season) pair
player_season_key = ['PLAYER_NAME', 'Season']
df = df.drop_duplicates(subset=player_season_key, keep='first')

In [None]:
# Checking for nulls and data types
# (df.info() prints the non-null count and dtype of every column)
df.info()

In [None]:
# Range checking: all stats must be non-negative.
# Only numeric columns are compared - the frame also holds text columns (player name,
# position), and comparing those with 0 raises a TypeError.
# A missing value (NaN) also makes the check evaluate to False.
(df.select_dtypes(include="number") >= 0).all().all()

## Feature Engineering

In [None]:
# Encode the three-class position target as integers: Guard=0, Wing=1, Big=2
position_codes = {'Guard': 0, 'Wing': 1, 'Big': 2}
df['position_label'] = df['Position3'].map(position_codes)

In [None]:
# Separating the feature columns from identifiers, position/target columns and the season.
# The season column is named "Season" everywhere else in this notebook (de-duplication and the
# train/test split both use it), so it must be excluded under that name; otherwise the split
# variable itself becomes a model feature. "SEASON" is kept in case that spelling also exists.
non_feature_cols = {
    "PLAYER_ID", "PLAYER_NAME", "TEAM_ID",
    "Position3", "Position", "position_label", "Pos",
    "SEASON", "Season",
}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

In [None]:
# Training set: seasons 2015-16 through 2023-24 (Season <= 2023)
train_mask = df["Season"] <= 2023
X_train = df.loc[train_mask, feature_cols]
y_train = df.loc[train_mask, "position_label"]

In [None]:
# Hold-out set: the 2024-25 season (Season == 2024), the most recent full season
test_mask = df["Season"] == 2024
X_test = df.loc[test_mask, feature_cols]
y_test = df.loc[test_mask, "position_label"]

In [None]:
# Standardise features for logistic regression and SVM.
# The scaler is fitted on the training rows only and then applied to both splits,
# so no test-season statistics leak into the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

## Modelling the data

### Support Vector Machine

In [None]:
def objective_svm(trial):
    """Optuna objective: mean 5-fold CV accuracy of an SVC on the scaled training data.

    Args:
        trial: Optuna trial used to sample ``C`` (log scale, 0.01 to 10),
            ``kernel`` (linear / rbf / poly) and ``gamma`` (scale / auto).

    Returns:
        Mean accuracy over the 5 cross-validation folds.
    """
    base_params = {
        "C": trial.suggest_float("C", 0.01, 10.0, log=True),
        "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly"]),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
    }

    # probability=True is intentionally not set during tuning: accuracy scoring only calls
    # predict(), which does not use the probability calibration, while enabling it makes
    # every fit run an extra internal cross-validation. The final model enables it explicitly.
    model = SVC(**base_params)

    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="accuracy").mean()
    return score
    
# Search the SVM hyperparameters for 50 trials, maximising mean CV accuracy,
# then keep the best sampled values
study_svm = optuna.create_study(direction="maximize")
study_svm.optimize(func=objective_svm, n_trials=50)
best_params_svm = study_svm.best_params


In [None]:
# Show the best SVM hyperparameters found by the study
print(best_params_svm)

### Light Gradient Boosting Machine

In [None]:
def objective_lgbm(trial):
    """Optuna objective: mean 5-fold CV accuracy of a LightGBM classifier.

    Samples ``num_leaves``, ``max_depth``, ``learning_rate`` and ``n_estimators``
    from the trial, builds the classifier with logging silenced (``verbose=-1``)
    and scores it on the scaled training data.

    Returns:
        Mean accuracy over the 5 cross-validation folds.
    """
    model = LGBMClassifier(
        num_leaves=trial.suggest_int("num_leaves", 20, 150),
        max_depth=trial.suggest_int("max_depth", 3, 15),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        verbose=-1,
    )
    fold_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="accuracy")
    return fold_scores.mean()

# Search the LightGBM hyperparameters for 50 trials, maximising mean CV accuracy,
# then keep the best sampled values
study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(func=objective_lgbm, n_trials=50)
best_params_lgbm = study_lgbm.best_params

In [None]:
# Show the best LightGBM hyperparameters found by the study
print(best_params_lgbm)

### Logistic Regression

In [None]:
def objective_logreg(trial):
    """Optuna objective: mean 5-fold CV accuracy of a logistic regression.

    Samples the inverse regularisation strength ``C`` (log scale, 0.01 to 10) and
    the ``solver`` (lbfgs / liblinear) from the trial; ``max_iter`` is fixed at 10000.

    Returns:
        Mean accuracy over the 5 cross-validation folds.
    """
    inverse_reg = trial.suggest_float("C", 0.01, 10.0, log=True)
    solver_name = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    model = LogisticRegression(C=inverse_reg, solver=solver_name, max_iter=10000)
    fold_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="accuracy")
    return fold_scores.mean()

# Search the logistic-regression hyperparameters for 50 trials, maximising mean CV accuracy,
# then keep the best sampled values
study_logreg = optuna.create_study(direction="maximize")
study_logreg.optimize(func=objective_logreg, n_trials=50)
best_params_logreg = study_logreg.best_params

In [None]:
# Show the best logistic-regression hyperparameters found by the study
print(best_params_logreg)

### Fitting the models

In [None]:
# Rebuild each base learner with its tuned hyperparameters.
# study.best_params only contains values sampled through trial.suggest_*, so the fixed
# settings used inside the objectives have to be passed again here:
#   - max_iter=10000 for logistic regression (otherwise the default of 100 is used and the
#     final model differs from the one that was tuned),
#   - verbose=-1 for LightGBM (keeps training output silent, as during tuning),
#   - probability=True for the SVM (needed for predict_proba in the stacking step).
lgbm_best = LGBMClassifier(**best_params_lgbm, verbose=-1)
logreg_best = LogisticRegression(**best_params_logreg, max_iter=10000)
svm_best = SVC(**best_params_svm, probability=True)

# Fit every base learner on the full scaled training set
lgbm_best.fit(X_train_scaled, y_train)
logreg_best.fit(X_train_scaled, y_train)
svm_best.fit(X_train_scaled, y_train)

## Stacked Generalisation Model

In [None]:
# Out-of-fold (OOF) class probabilities: each training row is predicted by a model that
# did not see it during fitting (5-fold), one probability matrix per base learner
oof_lgbm, oof_logreg, oof_svm = (
    cross_val_predict(base_model, X_train_scaled, y_train, cv=5, method="predict_proba")
    for base_model in (lgbm_best, logreg_best, svm_best)
)

In [None]:
# Place the three models' OOF probability matrices side by side, so every training row
# carries the class probabilities from all base learners as meta-features
Z_train = np.concatenate((oof_lgbm, oof_logreg, oof_svm), axis=1)

# Meta learner: a logistic regression that learns how to weigh the base models' probabilities
meta = LogisticRegression(max_iter=1000).fit(Z_train, y_train)

In [None]:
# Build the meta-features for the test season: each fitted base model's class probabilities,
# concatenated column-wise in the same order used for training the meta learner
P_test = np.hstack(
    [base_model.predict_proba(X_test_scaled) for base_model in (lgbm_best, logreg_best, svm_best)]
)

In [None]:
# Predict the position label for each test-season player: the meta model takes the stacked
# base-model probabilities (P_test) and outputs the final class
y_pred = meta.predict(P_test)

## Model Validation

In [None]:
# Evaluate the stacked model on the held-out season: overall accuracy and confusion matrix
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", acc)
print("\nConfusion Matrix:\n", cm)

## Finding Positionless Players Through Model Uncertainty

In [None]:
# Predict player position probability
# (one row per test player, one column per class, in the order of meta.classes_)
probs = meta.predict_proba(P_test)

In [None]:
# Building dataset for final analysis: test features plus player name, true/predicted label
# and the meta model's probability for each position
result = X_test.copy()
result["PLAYER_NAME"] = df.loc[X_test.index, "PLAYER_NAME"]
result["True_Label"] = y_test
result["Pred_Label"] = y_pred

# predict_proba columns follow meta.classes_, and the target encoding is
# Guard=0, Wing=1, Big=2 - look each class up explicitly so the probability
# columns cannot be mislabeled.
class_order = list(meta.classes_)
result["P_Guard"] = probs[:, class_order.index(0)]
result["P_Wing"] = probs[:, class_order.index(1)]
result["P_Big"] = probs[:, class_order.index(2)]

In [None]:
# Shannon entropy of each player's predicted position distribution: the higher the value,
# the less the model commits to a single position. The 1e-9 offset keeps log() finite
# when a probability is exactly zero.
prob_cols = ("P_Guard", "P_Wing", "P_Big")
guard_term, wing_term, big_term = (
    result[col] * np.log(result[col] + 1e-9) for col in prob_cols
)
result["entropy"] = -(guard_term + wing_term + big_term)

In [None]:
# Rank players from most to least uncertain (highest entropy first)
hybrids = result.sort_values(by="entropy", ascending=False)

In [None]:
# Translate the integer labels back into position names for easier reading
decode = {0: "Guard", 1: "Wing", 2: "Big"}

for encoded_col, readable_col in (("True_Label", "True_Pos"), ("Pred_Label", "Pred_Pos")):
    hybrids[readable_col] = hybrids[encoded_col].map(decode)

In [None]:
# Export the ten highest-entropy players with their labels and probabilities for presentation
export_cols = ['PLAYER_NAME', 'True_Pos', 'Pred_Pos', 'P_Guard', 'P_Wing', 'P_Big', 'entropy']
top_hybrids = hybrids[export_cols].head(10)
top_hybrids.to_csv('final_positionless.csv', index=False)