In [151]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [152]:
# Read the CSV file (path is relative to the current working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="./nba_logreg.csv")

In [153]:
# Show the column labels so the available fields can be inspected
column_labels = df.columns
print(column_labels)

Index(['Name', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3P Made', '3PA',
       '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
       'TOV', 'TARGET_5Yrs'],
      dtype='object')


In [154]:
# Count the null entries of each column and report them under a header line
missing_counts = df.isnull().sum()
print("\nMissing values in each column:", missing_counts, sep="\n")


Missing values in each column:
Name            0
GP              0
MIN             0
PTS             0
FGM             0
FGA             0
FG%             0
3P Made         0
3PA             0
3P%            11
FTM             0
FTA             0
FT%             0
OREB            0
DREB            0
REB             0
AST             0
STL             0
BLK             0
TOV             0
TARGET_5Yrs     0
dtype: int64


In [155]:
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how="any")

In [156]:
def score_classifier(dataset, classifier, labels, n_splits=3):
    """
    Cross-validates a classifier with a shuffled K-fold split, prints the
    accumulated confusion matrix and the mean recall, and returns that recall.

    The classifier is fitted in place on every fold, so on return it is left
    fitted on the training part of the LAST fold only (not on the full dataset).

    :param dataset: feature matrix, one row per sample (array-like, converted to a numpy array)
    :param classifier: estimator exposing ``fit`` and ``predict``
    :param labels: target labels aligned with the rows of ``dataset`` (array-like)
    :param n_splits: number of folds (defaults to 3, the value previously hard-coded)
    :return: recall score averaged over the folds
    """
    # Convert to plain arrays so fold indices are always positional; a pandas
    # Series with a non-contiguous index would otherwise be indexed by label.
    dataset = np.asarray(dataset)
    labels = np.asarray(labels)

    # Fix the class order up front so every per-fold confusion matrix has the
    # same shape, even when a fold happens to miss one of the classes.
    class_values = np.unique(labels)

    kf = KFold(n_splits=n_splits, random_state=50, shuffle=True)
    confusion_mat = np.zeros((class_values.size, class_values.size))
    recall = 0.0
    for training_ids, test_ids in kf.split(dataset):
        training_set = dataset[training_ids]
        training_labels = labels[training_ids]
        test_set = dataset[test_ids]
        test_labels = labels[test_ids]
        classifier.fit(training_set, training_labels)
        predicted_labels = classifier.predict(test_set)
        confusion_mat += confusion_matrix(test_labels, predicted_labels, labels=class_values)
        recall += recall_score(test_labels, predicted_labels)
    # Average over the actual number of folds instead of a separate literal
    recall /= n_splits
    print(confusion_mat)
    print(f'Recall score: {recall}')
    return recall

In [157]:
# Split the frame into player names, target labels, feature names and feature values
feature_frame = df.drop(columns=['TARGET_5Yrs', 'Name'])  # everything except target and name
names = df['Name'].values.tolist()  # player names
labels = df['TARGET_5Yrs'].values  # target labels
paramset = feature_frame.columns.values  # feature names
df_vals = feature_frame.values  # feature values

In [158]:
# Rescale the feature values with a min-max scaler fitted on those same values
feature_scaler = MinMaxScaler()
X = feature_scaler.fit_transform(df_vals)

In [159]:
# Candidate classifiers to compare, keyed by a display name.
# The random forest is seeded (same seed as the K-fold shuffle) so that the
# comparison gives the same scores on every Restart & Run All.
classifiers = {
    'Support Vector Classifier': SVC(),
    'Random Forest': RandomForestClassifier(random_state=50),
    'Logistic Regression': LogisticRegression(),
    'K-Neighbors': KNeighborsClassifier()
}

In [160]:
# Nothing selected yet: no classifier, and a recall of zero to beat
best_classifier, best_recall = None, 0

In [161]:
# Score every candidate and remember the one with the strictly highest recall
for name in classifiers:
    clf = classifiers[name]
    print(f"Evaluating {name}")
    recall = score_classifier(X, clf, labels)
    if not recall > best_recall:
        continue  # not a strict improvement, keep the current best
    best_classifier, best_recall = clf, recall

print(f'Best classifier: {best_classifier}')
print(f'Best recall score: {best_recall}')

Evaluating Support Vector Classifier
[[257. 246.]
 [155. 671.]]
Recall score: 0.8137119368871194
Evaluating Random Forest
[[258. 245.]
 [177. 649.]]
Recall score: 0.7865201590110349
Evaluating Logistic Regression
[[257. 246.]
 [146. 680.]]
Recall score: 0.8241616284499497
Evaluating K-Neighbors
[[244. 259.]
 [204. 622.]]
Recall score: 0.753738725454054
Best classifier: LogisticRegression()
Best recall score: 0.8241616284499497


In [162]:
# Save the best model together with the scaler it expects its inputs to go through
if best_classifier is None:
    # No candidate beat the initial recall; pickling None would only fail later at load time
    raise RuntimeError("No classifier was selected; nothing to save")

# Fit the scaler once and keep the instance, so the persisted scaler is exactly
# the one used to prepare the model's training data.
scaler = MinMaxScaler().fit(df_vals)

# score_classifier leaves the estimator fitted on the training part of its last
# fold only; refit on every available row before persisting it.
best_classifier.fit(scaler.transform(df_vals), labels)

joblib.dump(best_classifier, 'best_nba_classifier.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']