In [10]:
import random

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV

from scipy.spatial.distance import canberra

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

import warnings

warnings.filterwarnings("ignore")


def set_seed(seed):
    """
    Set seed for reproducibility

    Parameters
    ----------
    seed : int
    """
    random.seed(seed)
    np.random.seed(seed)


def load_data(path) -> tuple:
    """
    Load data from the given path and return the features and labels

    Parameters
    ----------
    path : str

    Returns
    -------
    X : pd.DataFrame - features of the data to be used for training
    y_encoded : np.ndarray - encoded labels of the data to be used for training
    label_encoder : LabelEncoder - label encoder used to encode and decode the labels
    """
    data = pd.read_csv(path)  # 27 features

    X = data.drop(
        columns=["Pos", "Tm", "G", "GS", "FG%", "3P%", "FT%", "PTS"]
    )  # 19 features
    y = data["Pos"]

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    X[["AST", "BLK"]] *= 10

    return X, y_encoded, label_encoder


def correlation_distance(x, y):
    return 1 - np.corrcoef(x, y)[0, 1]


def canberra_distance(x, y):
    return canberra(x, y)


def chi_square_distance(x, y):
    return 0.5 * np.sum((x - y) ** 2 / (x + y + 1e-10))


RANDOM_STATE = 0
set_seed(RANDOM_STATE)

X, y_encoded, label_encoder = load_data("./nba_stats.csv")

X_train, X_validate, y_train, y_validate = train_test_split(
    X, y_encoded, train_size=0.8, random_state=RANDOM_STATE
)

models = train_model(X_train, y_train, label_encoder)
evaluate_each_model(models, X_validate, y_validate, label_encoder, "Validate")

Training set accuracy: 0.643
Confusion Matrix
Predicted    C   PF   PG   SF   SG  All
Actual                                 
C          110   15    0    2    2  129
PF          30   69    4   27    8  138
PG           1    3  101    4   25  134
SF           5   30    9   60   27  131
SG           0   11   21   20  100  152
All        146  128  135  113  162  684
Model: adaboost
Validate set accuracy: 0.579
Confusion Matrix
Predicted   C  PF  PG  SF  SG  All
Actual                            
C          23   5   0   1   0   29
PF         11  18   0   7   5   41
PG          0   0  18   4  10   32
SF          1   4   1  18   5   29
SG          0   4   3  11  22   40
All        35  31  22  41  42  171
Model: adaboost2
Validate set accuracy: 0.579
Confusion Matrix
Predicted   C  PF  PG  SF  SG  All
Actual                            
C          27   2   0   0   0   29
PF         10  17   2  10   2   41
PG          0   0  19   3  10   32
SF          1   2   3  18   5   29
SG          0   5  

### Data Features

In [None]:
selected_features = ['ORB', 'BLK', '3PA', 'TRB', '3P', 'FG%', '3P%', 'DRB', '2P%', 'STL', 'PF', 'eFG%', '2P', 'AST', 'FGA', 'FT%', 'MP', '2PA', 'PTS', 'Age']
X = data[selected_features]

### Grid Search

In [None]:
def find_hyperparameters(X_train, y_train):
    """Gradient Boosting Grid Search
    ada_params = {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1.0],
        "estimator": [
            DecisionTreeClassifier(max_depth=1),
            DecisionTreeClassifier(max_depth=2),
        ],
        "algorithm": ["SAMME", "SAMME.R"],
        "random_state": [RANDOM_STATE],
    }
    ada_grid = GridSearchCV(
        AdaBoostClassifier(random_state=RANDOM_STATE), ada_params, scoring="accuracy"
    )
    ada_grid.fit(X_train, y_train)
    best_ada = ada_grid.best_estimator_
    print("Best AdaBoost Parameters:", ada_grid.best_params_)
    """

    """ Random Forest Grid Search
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20, 30],
        "max_features": ["sqrt", "log2"],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "criterion": ["gini", "entropy"],
    }
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=RANDOM_STATE), param_grid
    )
    grid_search.fit(X_train, y_train)
    best_rf = grid_search.best_estimator_
    print(best_rf)
    """

    """ SVM Grid Search
    svm_params = {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale', 'auto', 0.01, 0.001]
    }
    svm_grid = GridSearchCV(SVC(probability=True, random_state=RANDOM_STATE), svm_params, scoring='accuracy')
    svm_grid.fit(X_train, y_train)
    best_svm = svm_grid.best_estimator_
    print("Best SVM Parameters:", svm_grid.best_params_)
    """

    """ Gradient Boosting Grid Search
    gb_params = {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.5],
        "max_depth": [3, 5, 7],
        "subsample": [0.7, 0.8, 1.0],
    }
    gb_grid = GridSearchCV(
        GradientBoostingClassifier(random_state=RANDOM_STATE),
        gb_params,
        scoring="accuracy",
    )
    gb_grid.fit(X_train, y_train)
    best_gb = gb_grid.best_estimator_
    print("Best Gradient Boosting Parameters:", gb_grid.best_params_)
    """

    """ KNN Grid Search
    knn_params = {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski'],
        'p': [1, 2, 3, 10, 19]
    }
    knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, scoring='accuracy')
    knn_grid.fit(X_train, y_train)
    best_knn = knn_grid.best_estimator_
    print("Best KNN Parameters:", knn_grid.best_params_)
    """

    """ XGBoost Grid Search
    xgb_params = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.3],
            'max_depth': [3, 5, 7],
            'subsample': [0.7, 0.8, 1.0]
    }
    xgb_grid = GridSearchCV(XGBClassifier(random_state=RANDOM_STATE, use_label_encoder=False, eval_metric='mlogloss'),
                                                    xgb_params, scoring='accuracy')
    xgb_grid.fit(X_train, y_train)
    best_xgb = xgb_grid.best_estimator_
    print("Best XGBoost Parameters:", xgb_grid.best_params_)
    """

    """ Logistic Regression Grid Search
    lr_params = {
            'C': [0.1, 1, 10, 100],
            'penalty': ['l1', 'l2', 'elasticnet'],
            'solver': ['liblinear', 'saga']
    }
    lr_grid = GridSearchCV(LogisticRegression(random_state=RANDOM_STATE), lr_params, scoring='accuracy')
    lr_grid.fit(X_train, y_train)
    best_lr = lr_grid.best_estimator_
    print("Best Logistic Regression Parameters:", lr_grid.best_params_)
    """

### Individual Model Performance

In [9]:
def train_model(X_train, y_train, label_encoder) -> dict:
    """
    Train multiple models and return them in a dictionary

    Parameters
    ----------
    X_train : pd.DataFrame - features of the data to be used for training
    y_train : np.ndarray - encoded labels of the data to be used for training

    Returns
    -------
    models : dict - dictionary containing the trained models
    """
    adaboost = AdaBoostClassifier(
        random_state=RANDOM_STATE,
        n_estimators=50,
        learning_rate=1,
    )
    adaboost2 = AdaBoostClassifier(
        random_state=RANDOM_STATE,
        n_estimators=100,
        learning_rate=1,
    )
    adaboost3 = AdaBoostClassifier(
        random_state=RANDOM_STATE,
        n_estimators=75,
        learning_rate=1,
    )
    randomforest = RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=50,
        max_depth=10,
        criterion="entropy",
        min_samples_leaf=2,
        min_samples_split=10,
    )
    svm = SVC(random_state=RANDOM_STATE)
    logistic = LogisticRegression(random_state=RANDOM_STATE)
    knn2 = KNeighborsClassifier(
        n_neighbors=10,
        metric=correlation_distance,
        weights="distance",
    )
    knn3 = KNeighborsClassifier(
        n_neighbors=10,
        metric=canberra_distance,
        weights="distance",
    )
    knn6 = KNeighborsClassifier(
        n_neighbors=10,
        metric=chi_square_distance,
        weights="distance",
    )
    gradient_boosting = GradientBoostingClassifier(random_state=RANDOM_STATE)
    naive_bayes = GaussianNB()
    extra_trees = ExtraTreesClassifier(random_state=RANDOM_STATE)
    xgboost = XGBClassifier(
        random_state=RANDOM_STATE,
        use_label_encoder=False,
        eval_metric="mlogloss",
        learning_rate=0.01,
        max_depth=5,
        n_estimators=200,
        subsample=0.7,
    )
    mlpc1 = MLPClassifier(
        random_state=RANDOM_STATE,
        hidden_layer_sizes=(200),
        activation="relu",
        solver="adam",
        max_iter=400,
        alpha=0.0001,
        learning_rate="constant",
    )
    decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
    mlpc2 = MLPClassifier(
        random_state=RANDOM_STATE,
        learning_rate_init=0.001,
        hidden_layer_sizes=(150),
        max_iter=400,
        activation="tanh",
        solver="adam",
    )
    mlpc3 = MLPClassifier(
        random_state=RANDOM_STATE,
        hidden_layer_sizes=(50, 50),
        learning_rate_init=0.001,
        activation="tanh",
        solver="sgd",
        max_iter=400,
    )

    models = {
        "adaboost": adaboost,
        "adaboost2": adaboost2,
        "adaboost3": adaboost3,
        "randomforest": randomforest,
        "svm": svm,
        "logistic": logistic,
        "knn2": knn2,
        "knn3": knn3,
        "knn6": knn6,
        "gradient_boosting": gradient_boosting,
        "naive_bayes": naive_bayes,
        "extra_trees": extra_trees,
        "xgboost": xgboost,
        "mlpc1": mlpc1,
        "decision_tree": decision_tree,
        "mlpc2": mlpc2,
        "mlpc3": mlpc3,
    }

    for model in models.values():
        model.fit(X_train, y_train)

    # find_hyperparameters(X_train, y_train)

    y_pred = model.predict(X_train)
    
    accuracy = accuracy_score(y_train, y_pred)
    print(f"Training set accuracy: %.3f" % accuracy)

    print("Confusion Matrix")
    print(
        pd.crosstab(
            label_encoder.inverse_transform(y_train),
            label_encoder.inverse_transform(y_pred),
            rownames=["Actual"],
            colnames=["Predicted"],
            margins=True,
        )
    )

    return models

In [8]:
def evaluate_each_model(models, X_validate, y_validate, label_encoder, data_type):
    """
    Evaluate each model with the validation set and print the accuracy and classification report

    Parameters
    ----------
    models : dict - dictionary containing the trained models
    X_validate : pd.DataFrame - features of the data to be used for validation
    y_validate : np.ndarray - encoded labels of the data to be used for validation
    label_encoder : LabelEncoder - label encoder used to encode and decode the labels

    Returns
    -------
    None
    """
    for name, model in models.items():
        y_pred = model.predict(X_validate)

        accuracy = accuracy_score(y_validate, y_pred)
        report = classification_report(
            y_validate, y_pred, target_names=label_encoder.classes_
        )

        print(f"Model: {name}")
        print(f"{data_type} set accuracy: %.3f" % accuracy)

        print("Confusion Matrix")
        print(
            pd.crosstab(
                label_encoder.inverse_transform(y_validate),
                label_encoder.inverse_transform(y_pred),
                rownames=["Actual"],
                colnames=["Predicted"],
                margins=True,
            )
        )