# Collecting Data

In [1]:
%pip install nba_api

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Query the nba_api LeagueGameFinder endpoint for games from the given start date onward.
# NOTE(review): league_id_nullable='00' presumably selects the NBA, and the date string
# looks like MM/DD/YYYY -- confirm both against the nba_api documentation.
from nba_api.stats.endpoints import leaguegamefinder
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='03/15/2020', league_id_nullable='00')
# get_data_frames() returns a list of DataFrames; the first one is used as the game log
# (the output below shows one row per team per game, so each GAME_ID appears twice).
games = gamefinder.get_data_frames()[0]
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42022,1610612748,MIA,Miami Heat,42200405,2023-06-12,MIA @ DEN,L,240,89,...,0.875,11,33,44,18,9,7,8,21,-5.0
1,42022,1610612743,DEN,Denver Nuggets,42200405,2023-06-12,DEN vs. MIA,W,240,94,...,0.565,11,46,57,21,6,7,14,13,5.0
2,42022,1610612743,DEN,Denver Nuggets,42200404,2023-06-09,DEN @ MIA,W,242,108,...,0.762,5,29,34,26,11,7,6,18,13.0
3,42022,1610612748,MIA,Miami Heat,42200404,2023-06-09,MIA vs. DEN,L,240,95,...,0.85,8,29,37,23,2,3,14,19,-13.0
4,42022,1610612743,DEN,Denver Nuggets,42200403,2023-06-07,DEN @ MIA,W,238,109,...,0.815,13,45,58,28,3,5,13,18,15.0


# Cleaning and Exploring Data

In [3]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [4]:
# Keep only the columns needed downstream. .copy() makes the result an independent
# frame, so later column assignments (e.g. converting GAME_DATE to datetime) write to
# this frame directly instead of triggering pandas' SettingWithCopyWarning on a slice.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

In [5]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
0,Miami Heat,0042200405,2023-06-12,MIA @ DEN,L,-5.0
1,Denver Nuggets,0042200405,2023-06-12,DEN vs. MIA,W,5.0
2,Denver Nuggets,0042200404,2023-06-09,DEN @ MIA,W,13.0
3,Miami Heat,0042200404,2023-06-09,MIA vs. DEN,L,-13.0
4,Denver Nuggets,0042200403,2023-06-07,DEN @ MIA,W,15.0
...,...,...,...,...,...,...
8425,Miami Heat,0011900104,2020-07-22,MIA vs. SAC,W,6.0
8426,Sacramento Kings,0011900104,2020-07-22,SAC @ MIA,L,-6.0
8427,Orlando Magic,0011900101,2020-07-22,ORL @ LAC,L,-9.0
8428,LA Clippers,0011900101,2020-07-22,LAC vs. ORL,W,9.0


Final dataframe: one row per game, consisting of two columns:
1. result of the game (home win or loss) -> target
2. score statistic comparing the two teams -> feature

In [6]:
import pandas as pd

In [7]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8430 entries, 0 to 8429
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM_NAME   8430 non-null   object 
 1   GAME_ID     8430 non-null   object 
 2   GAME_DATE   8430 non-null   object 
 3   MATCHUP     8430 non-null   object 
 4   WL          8430 non-null   object 
 5   PLUS_MINUS  8430 non-null   float64
dtypes: float64(1), object(5)
memory usage: 395.3+ KB


Convert `GAME_DATE` from a string (object) column to datetime.

In [8]:
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

In [9]:
games['GAME_DATE']

0      2023-06-12
1      2023-06-12
2      2023-06-09
3      2023-06-09
4      2023-06-07
          ...    
8425   2020-07-22
8426   2020-07-22
8427   2020-07-22
8428   2020-07-22
8429   2020-07-22
Name: GAME_DATE, Length: 8430, dtype: datetime64[ns]

Sort the games by date in ascending order, from oldest to latest.

In [10]:
games = games.sort_values('GAME_DATE')
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
8429,Denver Nuggets,0011900102,2020-07-22,DEN vs. WAS,W,7.0
8422,New Orleans Pelicans,0011900103,2020-07-22,NOP @ BKN,W,31.0
8423,Brooklyn Nets,0011900103,2020-07-22,BKN vs. NOP,L,-31.0
8424,Washington Wizards,0011900102,2020-07-22,WAS @ DEN,L,-7.0
8428,LA Clippers,0011900101,2020-07-22,LAC vs. ORL,W,9.0
...,...,...,...,...,...,...
4,Denver Nuggets,0042200403,2023-06-07,DEN @ MIA,W,15.0
3,Miami Heat,0042200404,2023-06-09,MIA vs. DEN,L,-13.0
2,Denver Nuggets,0042200404,2023-06-09,DEN @ MIA,W,13.0
1,Denver Nuggets,0042200405,2023-06-12,DEN vs. MIA,W,5.0


Add a feature: each team's average plus-minus over its previous 30 games.

In [11]:
# Per-team rolling mean of PLUS_MINUS over a 30-game window.
# closed='left' excludes the current game from its own window, so the feature for a
# game is built only from games that come before it in row order -- this relies on
# the frame already being sorted by GAME_DATE.
# With the default min_periods (equal to the window size) the value is NaN until a
# team has 30 earlier games.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

In [12]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
8429,Denver Nuggets,0011900102,2020-07-22,DEN vs. WAS,W,7.0,
8422,New Orleans Pelicans,0011900103,2020-07-22,NOP @ BKN,W,31.0,
8423,Brooklyn Nets,0011900103,2020-07-22,BKN vs. NOP,L,-31.0,
8424,Washington Wizards,0011900102,2020-07-22,WAS @ DEN,L,-7.0,
8428,LA Clippers,0011900101,2020-07-22,LAC vs. ORL,W,9.0,
...,...,...,...,...,...,...,...
4,Denver Nuggets,0042200403,2023-06-07,DEN @ MIA,W,15.0,5.166667
3,Miami Heat,0042200404,2023-06-09,MIA vs. DEN,L,-13.0,2.833333
2,Denver Nuggets,0042200404,2023-06-09,DEN @ MIA,W,13.0,5.033333
1,Denver Nuggets,0042200405,2023-06-12,DEN vs. MIA,W,5.0,5.666667


Try an example: take the Toronto Raptors games.

In [13]:

games[games['TEAM_NAME']=='Toronto Raptors'].head(35)

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
8408,Toronto Raptors,11900111,2020-07-24,TOR vs. HOU,W,11.0,
8387,Toronto Raptors,11900121,2020-07-26,TOR vs. POR,W,6.0,
8369,Toronto Raptors,11900129,2020-07-28,TOR @ PHX,L,-11.0,
8346,Toronto Raptors,21901243,2020-08-01,TOR vs. LAL,W,15.0,
8325,Toronto Raptors,21901250,2020-08-03,TOR @ MIA,W,4.0,
8290,Toronto Raptors,21901266,2020-08-05,TOR @ ORL,W,10.0,
8271,Toronto Raptors,21901279,2020-08-07,TOR vs. BOS,L,-22.0,
8244,Toronto Raptors,21901286,2020-08-09,TOR vs. MEM,W,9.0,
8233,Toronto Raptors,21901294,2020-08-10,TOR @ MIL,W,8.0,
8216,Toronto Raptors,21901305,2020-08-12,TOR @ PHI,W,4.0,


We need to factor in home advantage. In `MATCHUP`, `@` marks an away game.
Use a Boolean series, built from a string match, to separate home and away games.

In [14]:
msk = games['MATCHUP'].str.contains('@')

In [15]:
games_away = games[msk]
games_home = games[~msk]

In [16]:
games_home.shape

(4215, 7)

In [17]:
games_away.shape

(4215, 7)

In [18]:
games_home

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
8429,Denver Nuggets,0011900102,2020-07-22,DEN vs. WAS,W,7.0,
8423,Brooklyn Nets,0011900103,2020-07-22,BKN vs. NOP,L,-31.0,
8428,LA Clippers,0011900101,2020-07-22,LAC vs. ORL,W,9.0,
8425,Miami Heat,0011900104,2020-07-22,MIA vs. SAC,W,6.0,
8421,Los Angeles Lakers,0011900107,2020-07-23,LAL vs. DAL,L,-4.0,
...,...,...,...,...,...,...,...
8,Denver Nuggets,0042200401,2023-06-01,DEN vs. MIA,W,11.0,4.333333
7,Denver Nuggets,0042200402,2023-06-04,DEN vs. MIA,L,-3.0,4.766667
5,Miami Heat,0042200403,2023-06-07,MIA vs. DEN,L,-15.0,2.366667
3,Miami Heat,0042200404,2023-06-09,MIA vs. DEN,L,-13.0,2.833333


In [19]:
games_away

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
8422,New Orleans Pelicans,0011900103,2020-07-22,NOP @ BKN,W,31.0,
8424,Washington Wizards,0011900102,2020-07-22,WAS @ DEN,L,-7.0,
8426,Sacramento Kings,0011900104,2020-07-22,SAC @ MIA,L,-6.0,
8427,Orlando Magic,0011900101,2020-07-22,ORL @ LAC,L,-9.0,
8420,Phoenix Suns,0011900108,2020-07-23,PHX @ UTA,W,13.0,
...,...,...,...,...,...,...,...
9,Miami Heat,0042200401,2023-06-01,MIA @ DEN,L,-11.0,3.266667
6,Miami Heat,0042200402,2023-06-04,MIA @ DEN,W,3.0,2.500000
4,Denver Nuggets,0042200403,2023-06-07,DEN @ MIA,W,15.0,5.166667
2,Denver Nuggets,0042200404,2023-06-09,DEN @ MIA,W,13.0,5.033333


Combine the info for a single game into one row using merge.

In [20]:
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))


In [21]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away
0,Denver Nuggets,0011900102,2020-07-22,DEN vs. WAS,W,7.0,,Washington Wizards,2020-07-22,WAS @ DEN,L,-7.0,
1,Brooklyn Nets,0011900103,2020-07-22,BKN vs. NOP,L,-31.0,,New Orleans Pelicans,2020-07-22,NOP @ BKN,W,31.0,
2,LA Clippers,0011900101,2020-07-22,LAC vs. ORL,W,9.0,,Orlando Magic,2020-07-22,ORL @ LAC,L,-9.0,
3,Miami Heat,0011900104,2020-07-22,MIA vs. SAC,W,6.0,,Sacramento Kings,2020-07-22,SAC @ MIA,L,-6.0,
4,Los Angeles Lakers,0011900107,2020-07-23,LAL vs. DAL,L,-4.0,,Dallas Mavericks,2020-07-23,DAL @ LAL,W,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4210,Denver Nuggets,0042200401,2023-06-01,DEN vs. MIA,W,11.0,4.333333,Miami Heat,2023-06-01,MIA @ DEN,L,-11.0,3.266667
4211,Denver Nuggets,0042200402,2023-06-04,DEN vs. MIA,L,-3.0,4.766667,Miami Heat,2023-06-04,MIA @ DEN,W,3.0,2.500000
4212,Miami Heat,0042200403,2023-06-07,MIA vs. DEN,L,-15.0,2.366667,Denver Nuggets,2023-06-07,DEN @ MIA,W,15.0,5.166667
4213,Miami Heat,0042200404,2023-06-09,MIA vs. DEN,L,-13.0,2.833333,Denver Nuggets,2023-06-09,DEN @ MIA,W,13.0,5.033333


In [22]:
games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [23]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away,avg_30_plus_minus_diff
0,Denver Nuggets,0011900102,2020-07-22,DEN vs. WAS,W,7.0,,Washington Wizards,2020-07-22,WAS @ DEN,L,-7.0,,
1,Brooklyn Nets,0011900103,2020-07-22,BKN vs. NOP,L,-31.0,,New Orleans Pelicans,2020-07-22,NOP @ BKN,W,31.0,,
2,LA Clippers,0011900101,2020-07-22,LAC vs. ORL,W,9.0,,Orlando Magic,2020-07-22,ORL @ LAC,L,-9.0,,
3,Miami Heat,0011900104,2020-07-22,MIA vs. SAC,W,6.0,,Sacramento Kings,2020-07-22,SAC @ MIA,L,-6.0,,
4,Los Angeles Lakers,0011900107,2020-07-23,LAL vs. DAL,L,-4.0,,Dallas Mavericks,2020-07-23,DAL @ LAL,W,4.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4210,Denver Nuggets,0042200401,2023-06-01,DEN vs. MIA,W,11.0,4.333333,Miami Heat,2023-06-01,MIA @ DEN,L,-11.0,3.266667,1.066667
4211,Denver Nuggets,0042200402,2023-06-04,DEN vs. MIA,L,-3.0,4.766667,Miami Heat,2023-06-04,MIA @ DEN,W,3.0,2.500000,2.266667
4212,Miami Heat,0042200403,2023-06-07,MIA vs. DEN,L,-15.0,2.366667,Denver Nuggets,2023-06-07,DEN @ MIA,W,15.0,5.166667,-2.800000
4213,Miami Heat,0042200404,2023-06-09,MIA vs. DEN,L,-13.0,2.833333,Denver Nuggets,2023-06-09,DEN @ MIA,W,13.0,5.033333,-2.200000


In [24]:
games_merged[['WL_home', 'avg_30_plus_minus_diff']]

Unnamed: 0,WL_home,avg_30_plus_minus_diff
0,W,
1,L,
2,W,
3,W,
4,L,
...,...,...
4210,W,1.066667
4211,L,2.266667
4212,L,-2.800000
4213,L,-2.200000


In [25]:
games_model = games_merged[['WL_home', 'avg_30_plus_minus_diff']].dropna()

In [26]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
203,L,1.500000
204,L,-1.633333
347,L,-0.433333
363,L,-2.226667
396,L,-5.373333
...,...,...
4210,W,1.066667
4211,L,2.266667
4212,L,-2.800000
4213,L,-2.200000


Change the win/loss label to a 1 or 0 value.

In [27]:
# Encode the target as 1 (home win) / 0 (home loss).
# assign() returns a new frame instead of writing a column into the dropna() result,
# which avoids pandas' SettingWithCopyWarning. The identity entries (1 -> 1, 0 -> 0)
# keep the cell idempotent: re-running it on the already-encoded column no longer
# turns every value into NaN.
games_model = games_model.assign(WL_home=games_model['WL_home'].map({'W': 1, 'L': 0, 1: 1, 0: 0}))

In [28]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
203,0,1.500000
204,0,-1.633333
347,0,-0.433333
363,0,-2.226667
396,0,-5.373333
...,...,...
4210,1,1.066667
4211,0,2.266667
4212,0,-2.800000
4213,0,-2.200000


## Building the predictive model, tuning hyperparameter and evaluation

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
df_train, df_test = train_test_split(games_model, stratify=games_model['WL_home'], test_size=0.2, random_state=8)

In [31]:
df_train.shape

(2928, 2)

In [32]:
df_test.shape

(732, 2)

In [33]:
target = 'WL_home'
X_train = df_train.drop(columns=target) # avg_plys_minus_diff
y_train = df_train[target] # WL_home

X_test = df_test.drop(columns=target) # avg_plys_minus_diff
y_test= df_test[target] # WL_home

In [34]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


Since our project is a labeled (supervised) classification problem, use XGBoost.
Alternatives: random forest, logistic regression.

In [35]:
import xgboost as xgb

XGBoost provides a wrapper class that lets its models be treated like classifiers in the scikit-learn framework, which means we can use the scikit-learn library with XGBoost models -> `XGBClassifier`.

In [36]:
# Baseline classifier with default hyperparameters; random_state fixes the seed.
# NOTE(review): use_label_encoder is believed to be deprecated/ignored in recent
# xgboost releases -- confirm against the installed version and drop it if unused.
clf = xgb.XGBClassifier(use_label_encoder=False, random_state=8)



In [37]:
clf.fit(X_train, y_train)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [38]:
from sklearn.metrics import accuracy_score

In [39]:
y_pred = clf.predict(X_test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [40]:
accuracy_score(y_test, y_pred)

0.5724043715846995

In [41]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from scipy.stats import loguniform

In [42]:
hyp_params = {'learning_rate': loguniform(0.0001, 1),
              'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
              'subsample': [0.7, 0.8, 0.9, 1.0],
              'n_estimators': [50, 100, 150, 200]}

In [43]:
random_hyp = RandomizedSearchCV(estimator=clf, 
                                param_distributions=hyp_params, 
                                n_iter=20, 
                                cv=7,
                                scoring='accuracy',
                                random_state=8)

In [44]:
random_hyp.fit(X_train, y_train)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_s

In [45]:
random_hyp.best_params_

{'learning_rate': 0.00018826107481990595,
 'max_depth': 4,
 'n_estimators': 150,
 'subsample': 0.9}

In [46]:
model_hyp = random_hyp.best_estimator_

In [47]:
y_pred_hyp = model_hyp.predict(X_test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [48]:
accuracy_score(y_test, y_pred_hyp)

0.6174863387978142

# Deploying model with FastAPI

In [49]:
from joblib import dump, load
dump(model_hyp, 'model_nba.joblib') 
model_saved = load('model_nba.joblib') 

In [50]:
accuracy_score(y_test, model_saved.predict(X_test))

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


0.6174863387978142

In [51]:
# Rebuild the per-game feature table in a single cell, using an earlier start date
# than the exploratory cells above. The steps mirror the earlier pipeline.
from nba_api.stats.endpoints import leaguegamefinder
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/31/2020', league_id_nullable='00')
games = gamefinder.get_data_frames()[0]

# .copy() gives an independent frame so the column assignments below do not write
# into a slice of the raw API frame (avoids pandas' SettingWithCopyWarning).
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

import pandas as pd

games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# The rolling feature below depends on chronological row order.
games = games.sort_values('GAME_DATE')

# Mean PLUS_MINUS over each team's previous 30 games; closed='left' excludes the
# current game from its own window.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

# '@' in MATCHUP marks the away team's row; the remaining rows are home-team rows.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

# One row per game: home-team columns get the _home suffix, away-team columns _away.
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))

games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [52]:
# Ad-hoc prediction for one matchup, using the model loaded from disk.
team_home='Toronto Raptors'
team_away='Boston Celtics'

import numpy as np
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/01/2021',
                                           league_id_nullable='00')
games = gamefinder.get_data_frames()[0]
# .copy() gives an independent frame so the GAME_DATE assignment below does not
# write into a slice of the raw API frame (avoids pandas' SettingWithCopyWarning).
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Mean PLUS_MINUS over each team's 30 most recent games.
msk_home = (games['TEAM_NAME'] == team_home)
games_30_home = games[msk_home].sort_values('GAME_DATE').tail(30)
home_plus_minus = games_30_home['PLUS_MINUS'].mean()

msk_away = (games['TEAM_NAME'] == team_away)
games_30_away = games[msk_away].sort_values('GAME_DATE').tail(30)
away_plus_minus = games_30_away['PLUS_MINUS'].mean()

games_diff=home_plus_minus - away_plus_minus

# scikit-learn style estimators expect a 2-D input of shape (n_samples, n_features):
# here one sample with one feature.
features = np.array([[games_diff]])
predict_home_win=model_saved.predict(features)[0]
# Column 1 of predict_proba is the probability of class 1 (home win).
predict_winning_probability=model_saved.predict_proba(features)[0][1]

In [53]:
def _mean_recent_plus_minus(games, team_name, n_games=30):
    """Return a team's mean PLUS_MINUS over its ``n_games`` most recent games.

    Raises:
        ValueError: if ``games`` contains no rows for ``team_name``.
    """
    team_games = games[games['TEAM_NAME'] == team_name]
    if team_games.empty:
        # Without this check a misspelled team name yields a NaN feature and a
        # meaningless prediction instead of an error.
        raise ValueError(f"No games found for team {team_name!r}")
    return team_games.sort_values('GAME_DATE').tail(n_games)['PLUS_MINUS'].mean()


def predict_games(team_home, team_away):
    """Predict whether the home team wins a matchup.

    Downloads the game log via ``leaguegamefinder``, computes each team's mean
    PLUS_MINUS over its 30 most recent games, and feeds the home-minus-away
    difference to the notebook-level ``model_saved`` classifier.

    Args:
        team_home: Full team name of the home team, as it appears in TEAM_NAME
            (e.g. 'Toronto Raptors').
        team_away: Full team name of the away team.

    Returns:
        Tuple ``(predict_home_win, predict_winning_probability)``: the predicted
        class (1 = home win, 0 = home loss) and the predicted probability of a
        home win.

    Raises:
        ValueError: if either team name has no games in the downloaded log.
    """
    gamefinder = leaguegamefinder.LeagueGameFinder(
        date_from_nullable='01/01/2021',
        league_id_nullable='00')
    games = gamefinder.get_data_frames()[0]
    # .copy() gives an independent frame so the GAME_DATE assignment below does
    # not write into a slice of the raw frame (avoids SettingWithCopyWarning).
    games = games[
        ['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    home_plus_minus = _mean_recent_plus_minus(games, team_home)
    away_plus_minus = _mean_recent_plus_minus(games, team_away)
    games_diff = home_plus_minus - away_plus_minus

    # scikit-learn style estimators expect shape (n_samples, n_features):
    # one sample with one feature.
    features = np.array([[games_diff]])
    predict_home_win = model_saved.predict(features)[0]
    # Column 1 of predict_proba is the probability of class 1 (home win).
    predict_winning_probability = model_saved.predict_proba(features)[0][1]
    return predict_home_win, predict_winning_probability

In [54]:
predict_games('Boston Celtics','Toronto Raptors')

(1, 0.50526863)