# NBA Game Outcome

In [1]:
# Import Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
games_df = pd.concat(map
                     (pd.read_csv, 
                          ['data/game_details/nba_game_detailsSeason=2021-22.csv', 
                           'data/game_details/nba_game_detailsSeason=2020-21.csv', 
                           'data/game_details/nba_game_detailsSeason=2019-20.csv',
                           'data/game_details/nba_game_detailsSeason=2018-19.csv',
                           'data/game_details/nba_game_detailsSeason=2017-18.csv',
                           'data/game_details/nba_game_detailsSeason=2016-17.csv',
                           'data/game_details/nba_game_detailsSeason=2015-16.csv',
                           'data/game_details/nba_game_detailsSeason=2014-15.csv',
                           'data/game_details/nba_game_detailsSeason=2013-14.csv',
                           'data/game_details/nba_game_detailsSeason=2012-13.csv']), ignore_index=True)

In [30]:
# Creating rows for dataframe
trans_df = pd.DataFrame()
y = []
for i in range(0,3000):#len(games_df)):
    # Check Matchup to see which team is Home
    if "@" in games_df["MATCHUP"][i]:
        away_team = games_df["TEAM_ABBREVIATION"]
        home_team = games_df["MATCHUP"][i][-3:]
        
        #Adding win/loss to home team
        if games_df["PLUS_MINUS"][i] > 0:
            y.append(0)
        else:
            y.append(1)
         
    else:
        home_team = games_df["TEAM_ABBREVIATION"]
        away_team = games_df["MATCHUP"][i][-3:]
        
        #Adding win/loss to home team
        if games_df["PLUS_MINUS"][i] > 0:
            y.append(1)
        else:
            y.append(0)
    # row list for df
    row = []
    
    #Last 10 games df for both teams
    last_10h = games_df[games_df["TEAM_ABBREVIATION"] == home_team][games_df["GAME_DATE"]< games_df["GAME_DATE"][i]].head(20)
    last_10a = games_df[games_df["TEAM_ABBREVIATION"] == away_team][games_df["GAME_DATE"]< games_df["GAME_DATE"][i]].head(20)

    needed_features = ["FGA", "FG_PCT", "FG3_PCT", "DREB", "REB", "AST"]
    
    for feature in needed_features:
        row.append(round((sum(last_10h[feature])/20),2))
        row.append(round((sum(last_10a[feature])/20),2))
    
    trans_df = trans_df.append(pd.DataFrame([row]), ignore_index=True)

trans_df=trans_df.set_axis(["FGA_HOME","FGA_AWAY", "FG_PCT_HOME", "FG_PCT_AWAY", "FG3_PCT_HOME", "FG3_PCT_AWAY", 
                               "DREB_HOME", "DREB_AWAY", "REB_HOME", "REB_AWAY", "AST_HOME", "AST_AWAY"], axis=1)
        
trans_df["WIN"] = y

trans_df.head()



Unnamed: 0,FGA_HOME,FGA_AWAY,FG_PCT_HOME,FG_PCT_AWAY,FG3_PCT_HOME,FG3_PCT_AWAY,DREB_HOME,DREB_AWAY,REB_HOME,REB_AWAY,AST_HOME,AST_AWAY,WIN
0,86.4,87.0,0.49,0.47,0.34,0.35,37.2,33.95,46.9,43.6,28.7,25.75,1
1,87.0,88.0,0.47,0.46,0.35,0.39,33.95,34.75,43.6,44.3,25.75,24.9,1
2,87.0,87.45,0.47,0.49,0.35,0.36,33.95,33.7,43.6,41.9,25.75,23.4,1
3,89.85,87.0,0.49,0.47,0.39,0.35,36.05,33.95,46.1,43.6,25.25,25.75,1
4,88.7,87.0,0.43,0.47,0.35,0.35,35.7,33.95,48.55,43.6,22.15,25.75,0


# Read the Data and Perform Basic Data Cleaning

In [4]:
columns = ["HOME_TEAM_ID", "VISITOR_TEAM_ID", "SEASON", "PTS_home", "FG_PCT_home", "FT_PCT_home", "FG3_PCT_home", "AST_home", "REB_home", "PTS_away", "FG_PCT_away", "FT_PCT_away", "FG3_PCT_away", "AST_away", "REB_away"]

target = ["HOME_TEAM_WINS"]

In [31]:
"""# Load the data from AWS - REVIEW WITH TEAM
file_path ="https://YOUR-BUCKET-NAME.s3.amazonaws.com/INSERTLINK"
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null comumns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Convert the target column values to win/loss based on their values
x = {'1': 'Win'}   
df = df.replace(x)

x = {'0': 'Loss'}   
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

#create column with FGA difference between teams
trans_df["FGA_DIFF"]=trans_df["FGA_HOME"]-trans_df["FGA_AWAY"]
trans_df.drop(columns=["FGA_HOME","FGA_AWAY"], inplace=True)
trans_df.head()

#Create column with FG percent ratio between teams
trans_df["FG_PCT_RATIO"]=trans_df["FG_PCT_HOME"]/trans_df["FG_PCT_AWAY"]
trans_df.drop(columns=["FG_PCT_HOME", "FG_PCT_AWAY"], inplace=True)

#Create column with 3-point FG percent ratio between teams
trans_df["FG3_PCT_RATIO"] = trans_df["FG3_PCT_HOME"]/trans_df["FG3_PCT_AWAY"]
trans_df.drop(columns=["FG3_PCT_HOME", "FG3_PCT_AWAY"], inplace = True)

#Create column with Defensive rebound difference between teams
trans_df["DREB_DIFF"]=trans_df["DREB_HOME"]-trans_df["DREB_AWAY"]
trans_df.drop(columns=["DREB_HOME", "DREB_AWAY"], inplace =True)

#Create column with rebound difference between teams
trans_df["REB_DIFF"]=trans_df["REB_HOME"]-trans_df["REB_AWAY"]
trans_df.drop(columns=["REB_HOME","REB_AWAY"], inplace=True)

#Create column with assist difference between teams
trans_df["AST_DIFF"]=trans_df["AST_HOME"]-trans_df["AST_AWAY"]
trans_df.drop(columns=["AST_HOME","AST_AWAY"], inplace=True)

win=trans_df["WIN"]
trans_df.head()

win=trans_df["WIN"]
trans_df.drop(columns=["WIN"], inplace=True)
trans_df["WIN"]=win
trans_df.head()
"""

Unnamed: 0,FGA_DIFF,FG_PCT_RATIO,FG3_PCT_RATIO,DREB_DIFF,REB_DIFF,AST_DIFF,WIN
0,-0.6,1.042553,0.971429,3.25,3.3,2.95,1
1,-1.0,1.021739,0.897436,-0.8,-0.7,0.85,1
2,-0.45,0.959184,0.972222,0.25,1.7,2.35,1
3,2.85,1.042553,1.114286,2.1,2.5,-0.5,1
4,1.7,0.914894,1.0,1.75,4.95,-3.6,0


# Split the Data into Training and Testing

In [32]:
#Creating input and output data
trans_df["WIN"] = y
y = trans_df["WIN"]
X_df = trans_df.drop(["WIN"], axis=1)

In [33]:
# Check the balance of our target values
y.value_counts()

1    1646
0    1354
Name: WIN, dtype: int64

In [34]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_df, y, random_state=1, stratify=y)

# Balanced Random Forest Classifier

In [35]:
# Resample the training data with the BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators = 100)
random_forest = random_forest.fit(X_train, y_train)

In [36]:
# Calculated the balanced accuracy score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5408887229275579

In [27]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[195, 143],
       [200, 212]], dtype=int64)

In [28]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.58      0.51      0.53      0.54      0.30       338
          1       0.60      0.51      0.58      0.55      0.54      0.30       412

avg / total       0.55      0.54      0.55      0.54      0.54      0.30       750



In [29]:
# List the features sorted in descending order by feature importance
feature_names = X_df.columns
sorted(zip(random_forest.feature_importances_, feature_names), reverse=True)

[(0.17654064925843507, 'FGA_DIFF'),
 (0.17509186410846142, 'AST_DIFF'),
 (0.16873113003707288, 'REB_DIFF'),
 (0.16571714683416364, 'FG3_PCT_RATIO'),
 (0.16247191076164605, 'DREB_DIFF'),
 (0.1514472990002211, 'FG_PCT_RATIO')]

In [37]:
X_train

Unnamed: 0,FGA_DIFF,FG_PCT_RATIO,FG3_PCT_RATIO,DREB_DIFF,REB_DIFF,AST_DIFF
398,-1.35,1.000000,1.028571,2.30,1.00,-0.45
2721,-1.35,0.979592,1.000000,-0.50,0.65,0.25
2465,0.85,1.021277,1.028571,2.75,2.05,-0.35
2593,7.85,1.021739,0.971429,3.65,4.90,-1.65
1355,1.20,1.022222,1.000000,-1.70,-2.75,1.20
...,...,...,...,...,...,...
2428,-2.75,1.022222,1.058824,-2.15,-4.05,0.15
1015,3.20,0.934783,0.969697,2.85,4.50,-2.25
2578,8.65,1.000000,0.918919,2.95,4.00,0.85
556,-0.85,1.000000,0.888889,0.40,0.50,-2.15


In [39]:
X_train.head(1)

Unnamed: 0,FGA_DIFF,FG_PCT_RATIO,FG3_PCT_RATIO,DREB_DIFF,REB_DIFF,AST_DIFF
398,-1.35,1.0,1.028571,2.3,1.0,-0.45


In [43]:
random_forest.predict_proba(X_train.head(5))

array([[0.81, 0.19],
       [0.17, 0.83],
       [0.21, 0.79],
       [0.74, 0.26],
       [0.23, 0.77]])