<a href="https://colab.research.google.com/github/frankwillard/StatGPT/blob/main/NBA_Champion_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

In [None]:
# Read the per-team advanced stats for past seasons and keep only the rows
# whose "Playoffs" column is "Y" (teams that made the playoffs).
NBA_TEAM_DATA_URL = "https://raw.githubusercontent.com/milesfking/NBA-Champion-Model/main/data/nba_team_advanced_data.csv"
historic_dataset = (
    pd.read_csv(NBA_TEAM_DATA_URL)
    .loc[lambda frame: frame["Playoffs"] == "Y"]
)

In [None]:
# Columns removed before modelling, grouped roughly by what they describe.
# NOTE: this cell reassigns `historic_dataset`, so re-running it without
# reloading the data raises a KeyError (the columns are already gone).
unused_columns = [
    # bookkeeping / record columns
    'Playoffs', 'Losing_season', 'Arena', 'L', 'W', 'PW', 'PL', 'Attend.',
    # "O"-prefixed four-factor columns
    'OeFG%', 'OTOV%', 'ORB%', 'OFT/FGA',
    # "D"-prefixed four-factor columns
    'DeFG%', 'DTOV%', 'DRB%', 'DFT/FGA',
    # pace and shot-profile rates
    'Pace', 'FTr', '3PAr',
]
historic_dataset = historic_dataset.drop(columns=unused_columns)

In [None]:
# Show which columns are left after the drop in the previous cell.
historic_dataset.columns

Index(['Year', 'Team', 'Age', 'MOV', 'SOS', 'SRS', 'ORtg', 'DRtg', 'NRtg',
       'TS%', 'W/L%', 'Champion', 'won_last', 'won_last_3'],
      dtype='object')

In [None]:
# Keep only the identifier columns (Year, Team), the target (Champion) and the
# features used by the baseline models; everything else is discarded here.
model_columns = ['Year', 'Team', 'Age', 'MOV', 'NRtg', 'TS%', 'W/L%', 'Champion', 'won_last']
historic_dataset = historic_dataset.loc[:, model_columns]

In [None]:
# X_historic = historic_dataset.iloc[:, ~historic_dataset.columns.isin(['Champion', 'Team'])]
# y_historic = historic_dataset.loc[:, historic_dataset.columns == 'Champion'].values
# y_historic = np.array([0 if val == "N" else 1 for val in y_historic])

In [None]:
# Chronological split: seasons before 2013 are used for training and seasons
# after 2014 for testing. Rows with Year 2013 or 2014 fall in neither split.
is_train_season = historic_dataset['Year'] < 2013
is_test_season = historic_dataset['Year'] > 2014

# The "*2" frames keep every column (including Year/Team/Champion) so that
# later cells can recover team names and seasons for the test rows.
X_train_with_team2 = historic_dataset[is_train_season]
X_test_with_team2 = historic_dataset[is_test_season]

# Raw 'Champion' values as 2-D (n_rows, 1) arrays; they are encoded to 0/1 in a later cell.
y_train_with_team = X_train_with_team2[['Champion']].values
y_test_with_team = X_test_with_team2[['Champion']].values

In [None]:
# Strip the identifier and target columns so only model features remain.
# errors='ignore' mirrors a membership mask: a listed column that is absent is simply skipped.
non_feature_columns = ['Year', 'Champion', 'Team']
X_train_with_team = X_train_with_team2.drop(columns=non_feature_columns, errors='ignore')
X_test_with_team = X_test_with_team2.drop(columns=non_feature_columns, errors='ignore')

In [None]:
# Encode the target as integers: 0 where 'Champion' is "N", 1 for any other value.
#
# The labels are derived from the untouched split frames (X_*_with_team2) instead of
# from y_*_with_team itself. Re-encoding the y arrays in place is not idempotent:
# on a second run the values are already 0/1, none of them equals "N", and every
# label would silently become 1. Reading from the frames makes this cell safe to re-run
# and always yields a 1-D integer array.
y_train_with_team = X_train_with_team2['Champion'].ne("N").astype(int).to_numpy()
y_test_with_team = X_test_with_team2['Champion'].ne("N").astype(int).to_numpy()

In [None]:
# One-hot encode the 'won_last' column; all remaining columns are passed
# through unchanged. The encoder is fitted on the training split only and
# then reused, unchanged, on the test split.
# NOTE: this cell overwrites its own inputs, so it must be run exactly once
# after the feature-selection cell (the transformer selects 'won_last' by name).
won_last_step = ('encoder', OneHotEncoder(), ['won_last'])
ct = ColumnTransformer(transformers=[won_last_step], remainder='passthrough')

train_encoded = ct.fit_transform(X_train_with_team)
X_train_with_team = np.array(train_encoded)

test_encoded = ct.transform(X_test_with_team)
X_test_with_team = np.array(test_encoded)

In [None]:
# Standardise every feature column. The scaler's statistics are estimated
# from the training split only, then applied to both splits.
sc = StandardScaler().fit(X_train_with_team)
X_train_with_team, X_test_with_team = (
    sc.transform(split) for split in (X_train_with_team, X_test_with_team)
)

In [None]:
# Create the four baseline classifiers compared in this notebook:
# logistic regression, a single decision tree, a random forest and XGBoost.
#
# Every model gets the same explicit seed. Without it the tree and forest
# models are fitted with fresh randomness on each run, so the reported
# metrics change between "Restart & Run All" executions.
RANDOM_STATE = 0

classifier = LogisticRegression(solver='lbfgs', random_state=RANDOM_STATE)
decision_tree_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
random_forest_model = RandomForestClassifier(random_state=RANDOM_STATE)
xgboost_model = XGBClassifier(random_state=RANDOM_STATE)

In [None]:
# Fit every baseline on the same encoded + scaled training split.
for _estimator in (classifier, decision_tree_model, random_forest_model):
    _estimator.fit(X_train_with_team, y_train_with_team)

# XGBoost is fitted last, as a bare expression, so the fitted estimator
# remains the value displayed by this cell.
xgboost_model.fit(X_train_with_team, y_train_with_team)

In [None]:
# Hard class predictions on the test split, one array per fitted model
# (same order on both sides of the assignment).
(
    logreg_predictions,
    decision_tree_predictions,
    random_forest_predictions,
    xgboost_predictions,
) = (
    fitted_model.predict(X_test_with_team)
    for fitted_model in (
        classifier,
        decision_tree_model,
        random_forest_model,
        xgboost_model,
    )
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [None]:
# Display the labels the logistic regression predicted for the test rows.
logreg_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# Display the encoded ground-truth test labels (0 for "N", 1 otherwise) for comparison.
y_test_with_team

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
def evaluate(predictions, test_labels):
  """Print classification metrics for one model's predictions.

  Prints accuracy, precision, recall, F1 and the confusion matrix, in that
  order. Precision, recall and F1 use scikit-learn's default settings, so the
  labels are expected to be binary.

  Args:
    predictions: Predicted class labels.
    test_labels: Ground-truth class labels, aligned with `predictions`.

  Returns:
    None. The metrics are only printed.
  """
  accuracy = accuracy_score(test_labels, predictions)
  conf_matrix = confusion_matrix(test_labels, predictions)

  # zero_division=0 reports 0.0 when a metric is undefined (e.g. the model
  # never predicts the positive class). That is the value scikit-learn
  # falls back to anyway, but without emitting UndefinedMetricWarning.
  precision = precision_score(test_labels, predictions, zero_division=0)
  recall = recall_score(test_labels, predictions, zero_division=0)
  f1 = f1_score(test_labels, predictions, zero_division=0)

  print(f'Accuracy: {accuracy}')
  print(f'Precision: {precision}')
  print(f'Recall: {recall}')
  print(f'F1: {f1}')
  print(f'Confusion Matrix:\n{conf_matrix}')

In [None]:
# Test-set metrics for the logistic regression baseline.
evaluate(predictions=logreg_predictions, test_labels=y_test_with_team)

Accuracy: 0.921875
Precision: 0.0
Recall: 0.0
F1: 0.0
Confusion Matrix:
[[118   2]
 [  8   0]]


In [None]:
# Test-set metrics for the decision tree baseline.
evaluate(predictions=decision_tree_predictions, test_labels=y_test_with_team)

Accuracy: 0.875
Precision: 0.16666666666666666
Recall: 0.25
F1: 0.2
Confusion Matrix:
[[110  10]
 [  6   2]]


In [None]:
# Test-set metrics for the random forest baseline.
evaluate(predictions=random_forest_predictions, test_labels=y_test_with_team)

Accuracy: 0.9296875
Precision: 0.42857142857142855
Recall: 0.375
F1: 0.39999999999999997
Confusion Matrix:
[[116   4]
 [  5   3]]


In [None]:
# Test-set metrics for the XGBoost baseline.
evaluate(predictions=xgboost_predictions, test_labels=y_test_with_team)

Accuracy: 0.90625
Precision: 0.16666666666666666
Recall: 0.125
F1: 0.14285714285714288
Confusion Matrix:
[[115   5]
 [  7   1]]


In [None]:
# Class probabilities from the logistic regression for every test row;
# column 1 is the probability of the positive (label 1) class.
y_current_proba = classifier.predict_proba(X_test_with_team)

# Team and season identifiers come from the unencoded test frame, whose rows
# are in the same order as the rows of X_test_with_team.
team_names = X_test_with_team2['Team'].values
year = X_test_with_team2['Year'].values

# One row per test team-season: raw probability plus its identifiers.
current_predictions = pd.DataFrame({
    'pred_proba': y_current_proba[:, 1],
    'Team': team_names,
    'Year': year,
})

# Sum of raw probabilities within each season, broadcast back onto every row.
normalizing_consts = (
    current_predictions
    .groupby('Year')['pred_proba']
    .transform('sum')
)

# Rescale so the probabilities of each season's teams add up to 1.
y_current_proba_norm = current_predictions['pred_proba'] / normalizing_consts
current_predictions['norm_pred'] = y_current_proba_norm

# Report the teams ordered from most to least likely (normalised).
print("Model output:")
print(current_predictions.sort_values(by='norm_pred', ascending=False))

# Pair each fitted logistic-regression coefficient with the name of the
# column it belongs to, as reported by the column transformer.
feature_names = ct.get_feature_names_out()
coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': classifier.coef_.reshape(-1),
})

print()
print("Model coefficients:")
print(coefficients_df)

Model output:
     pred_proba                    Team    Year  norm_pred
36     0.433319   Golden State Warriors  2017.0   0.578865
22     0.894677   Golden State Warriors  2016.0   0.531093
68     0.255370   Golden State Warriors  2019.0   0.525714
110    0.125963               Utah Jazz  2021.0   0.418819
13     0.394867       San Antonio Spurs  2015.0   0.406105
..          ...                     ...     ...        ...
29     0.001056  Portland Trail Blazers  2016.0   0.000627
43     0.000439  Portland Trail Blazers  2017.0   0.000587
93     0.000410  Portland Trail Blazers  2020.0   0.000544
91     0.000386           Orlando Magic  2020.0   0.000513
2      0.000462           Brooklyn Nets  2015.0   0.000475

[128 rows x 4 columns]

Model coefficients:
               Feature  Coefficient
0  encoder__won_last_N    -0.253362
1  encoder__won_last_Y     0.253362
2       remainder__Age     0.336597
3       remainder__MOV     0.564976
4      remainder__NRtg     0.434495
5       remainder

In [None]:
# Attach the true 0/1 champion label to each test row so predictions can be
# compared with outcomes (relies on both being in the same row order).
# NOTE(review): the saved output under this cell shows a 4-column frame
# without 'Champion', so it looks stale relative to this code. Re-run to refresh.
current_predictions['Champion'] = y_test_with_team

     pred_proba                    Team    Year  norm_pred
22     0.894677   Golden State Warriors  2016.0   0.531093
30     0.603812       San Antonio Spurs  2016.0   0.358431
36     0.433319   Golden State Warriors  2017.0   0.578865
13     0.394867       San Antonio Spurs  2015.0   0.406105
6      0.322903   Golden State Warriors  2015.0   0.332093
..          ...                     ...     ...        ...
43     0.000439  Portland Trail Blazers  2017.0   0.000587
93     0.000410  Portland Trail Blazers  2020.0   0.000544
91     0.000386           Orlando Magic  2020.0   0.000513
111    0.000267      Washington Wizards  2021.0   0.000889
123    0.000254    New Orleans Pelicans  2022.0   0.000820

[128 rows x 4 columns]


In [None]:
# 2017 season: teams ranked by normalised championship probability.
current_predictions.loc[current_predictions['Year'].eq(2017)].sort_values(by='norm_pred', ascending=False)

Unnamed: 0,pred_proba,Team,Year,norm_pred
36,0.433319,Golden State Warriors,2017.0,0.578865
44,0.157269,San Antonio Spurs,2017.0,0.210093
35,0.076431,Cleveland Cavaliers,2017.0,0.102103
37,0.024092,Houston Rockets,2017.0,0.032185
39,0.018674,Los Angeles Clippers,2017.0,0.024947
46,0.010187,Utah Jazz,2017.0,0.013609
45,0.009863,Toronto Raptors,2017.0,0.013176
33,0.006122,Boston Celtics,2017.0,0.008179
40,0.00311,Memphis Grizzlies,2017.0,0.004154
47,0.003046,Washington Wizards,2017.0,0.00407


In [None]:
# 2018 season: teams ranked by normalised championship probability.
current_predictions.loc[current_predictions['Year'].eq(2018)].sort_values(by='norm_pred', ascending=False)

Unnamed: 0,pred_proba,Team,Year,norm_pred
50,0.243442,Golden State Warriors,2018.0,0.396707
51,0.240124,Houston Rockets,2018.0,0.391298
61,0.062051,Toronto Raptors,2018.0,0.101116
48,0.010405,Boston Celtics,2018.0,0.016956
58,0.00993,Philadelphia 76ers,2018.0,0.016181
60,0.00992,San Antonio Spurs,2018.0,0.016166
57,0.008248,Oklahoma City Thunder,2018.0,0.01344
62,0.007729,Utah Jazz,2018.0,0.012595
49,0.004903,Cleveland Cavaliers,2018.0,0.00799
59,0.004653,Portland Trail Blazers,2018.0,0.007582


In [None]:
# 2019 season: teams ranked by normalised championship probability.
current_predictions.loc[current_predictions['Year'].eq(2019)].sort_values(by='norm_pred', ascending=False)

Unnamed: 0,pred_proba,Team,Year,norm_pred
68,0.25537,Golden State Warriors,2019.0,0.525714
72,0.103591,Milwaukee Bucks,2019.0,0.213256
78,0.038973,Toronto Raptors,2019.0,0.080232
69,0.020601,Houston Rockets,2019.0,0.042411
79,0.013196,Utah Jazz,2019.0,0.027165
76,0.010895,Portland Trail Blazers,2019.0,0.022428
66,0.010105,Denver Nuggets,2019.0,0.020802
64,0.006812,Boston Celtics,2019.0,0.014023
73,0.006499,Oklahoma City Thunder,2019.0,0.013379
70,0.005999,Indiana Pacers,2019.0,0.012349


In [None]:
# 2020 season: teams ranked by normalised championship probability.
current_predictions.loc[current_predictions['Year'].eq(2020)].sort_values(by='norm_pred', ascending=False)

Unnamed: 0,pred_proba,Team,Year,norm_pred
94,0.305302,Toronto Raptors,2020.0,0.40545
89,0.278669,Milwaukee Bucks,2020.0,0.370081
87,0.074625,Los Angeles Lakers,2020.0,0.099105
86,0.0359,Los Angeles Clippers,2020.0,0.047677
80,0.022126,Boston Celtics,2020.0,0.029385
84,0.007991,Houston Rockets,2020.0,0.010613
82,0.005805,Dallas Mavericks,2020.0,0.007709
95,0.004111,Utah Jazz,2020.0,0.005459
83,0.003997,Denver Nuggets,2020.0,0.005309
92,0.003475,Philadelphia 76ers,2020.0,0.004615


In [None]:
# 2021 season: teams ranked by normalised championship probability.
current_predictions.loc[current_predictions['Year'].eq(2021)].sort_values(by='norm_pred', ascending=False)

Unnamed: 0,pred_proba,Team,Year,norm_pred
110,0.125963,Utah Jazz,2021.0,0.418819
102,0.044833,Los Angeles Lakers,2021.0,0.149067
101,0.025903,Los Angeles Clippers,2021.0,0.086127
107,0.024941,Philadelphia 76ers,2021.0,0.082928
108,0.024473,Phoenix Suns,2021.0,0.081371
105,0.018603,Milwaukee Bucks,2021.0,0.061854
98,0.01192,Brooklyn Nets,2021.0,0.039633
100,0.010987,Denver Nuggets,2021.0,0.036532
109,0.002896,Portland Trail Blazers,2021.0,0.00963
106,0.002768,New York Knicks,2021.0,0.009203


In [None]:
# 2022 season: teams ranked by normalised championship probability.
current_predictions.loc[current_predictions['Year'].eq(2022)].sort_values(by='norm_pred', ascending=False)

Unnamed: 0,pred_proba,Team,Year,norm_pred
125,0.12075,Phoenix Suns,2022.0,0.390588
121,0.068823,Milwaukee Bucks,2022.0,0.222619
113,0.022215,Boston Celtics,2022.0,0.071859
119,0.020068,Memphis Grizzlies,2022.0,0.064912
118,0.019122,Golden State Warriors,2022.0,0.061852
127,0.018844,Utah Jazz,2022.0,0.060956
120,0.014564,Miami Heat,2022.0,0.04711
116,0.007515,Dallas Mavericks,2022.0,0.024309
124,0.004896,Philadelphia 76ers,2022.0,0.015836
126,0.003439,Toronto Raptors,2022.0,0.011125


In [None]:
# 2023 season: teams ranked by normalised championship probability.
# NOTE(review): the saved output has a header but no rows, so the test frame
# presumably contains no 2023 season. Confirm against the dataset.
current_predictions.loc[current_predictions['Year'].eq(2023)].sort_values(by='norm_pred', ascending=False)

Unnamed: 0,pred_proba,Team,Year,norm_pred
