In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from math import exp

In [2]:
games_df = pd.concat(map
                     (pd.read_csv, 
                          ['data/game_details/nba_game_detailsSeason=2021-22.csv', 
                           'data/game_details/nba_game_detailsSeason=2020-21.csv', 
                    ]), ignore_index=True)

games_df.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22021,1610612746,LAC,LA Clippers,22101083,2022-03-22,LAC @ DEN,L,240,42,...,29,35,27,3,7,6,19,115,-12,1
1,22021,1610612743,DEN,Denver Nuggets,22101083,2022-03-22,DEN vs. LAC,W,240,45,...,34,40,31,4,5,11,19,127,12,1
2,22021,1610612749,MIL,Milwaukee Bucks,22101082,2022-03-22,MIL vs. CHI,W,240,50,...,48,53,26,4,6,15,12,126,28,1
3,22021,1610612741,CHI,Chicago Bulls,22101082,2022-03-22,CHI @ MIL,L,240,39,...,28,33,21,9,4,7,12,98,-28,1
4,22021,1610612737,ATL,Atlanta Hawks,22101081,2022-03-22,ATL @ NYK,W,240,40,...,33,43,26,7,5,14,18,117,6,1


In [3]:
# Creating rows for dataframe
trans_df = pd.DataFrame()
y = []
for i in range(0,3000):#len(games_df)):
    # Check Matchup to see which team is Home
    if "@" in games_df["MATCHUP"][i]:
        away_team = games_df["TEAM_ABBREVIATION"]
        home_team = games_df["MATCHUP"][i][-3:]
        
        #Adding win/loss to home team
        if games_df["PLUS_MINUS"][i] > 0:
            y.append(0)
        else:
            y.append(1)
         
    else:
        home_team = games_df["TEAM_ABBREVIATION"]
        away_team = games_df["MATCHUP"][i][-3:]
        
        #Adding win/loss to home team
        if games_df["PLUS_MINUS"][i] > 0:
            y.append(1)
        else:
            y.append(0)
    # row list for df
    row = []
    
    #Last 10 games df for both teams
    last_10h = games_df[games_df["TEAM_ABBREVIATION"] == home_team][games_df["GAME_DATE"]< games_df["GAME_DATE"][i]].head(20)
    last_10a = games_df[games_df["TEAM_ABBREVIATION"] == away_team][games_df["GAME_DATE"]< games_df["GAME_DATE"][i]].head(20)

    needed_features = ["FGA", "FG_PCT", "FG3_PCT", "DREB", "REB", "AST"]
    
    for feature in needed_features:
        row.append(round((sum(last_10h[feature])/20),2))
        row.append(round((sum(last_10a[feature])/20),2))
    
    trans_df = trans_df.append(pd.DataFrame([row]), ignore_index=True)

trans_df=trans_df.set_axis(["FGA_HOME","FGA_AWAY", "FG_PCT_HOME", "FG_PCT_AWAY", "FG3_PCT_HOME", "FG3_PCT_AWAY", 
                               "DREB_HOME", "DREB_AWAY", "REB_HOME", "REB_AWAY", "AST_HOME", "AST_AWAY"], axis=1)
        
trans_df["WIN"] = y

trans_df.head()



Unnamed: 0,FGA_HOME,FGA_AWAY,FG_PCT_HOME,FG_PCT_AWAY,FG3_PCT_HOME,FG3_PCT_AWAY,DREB_HOME,DREB_AWAY,REB_HOME,REB_AWAY,AST_HOME,AST_AWAY,WIN
0,86.4,87.0,0.49,0.47,0.34,0.35,37.2,33.95,46.9,43.6,28.7,25.75,1
1,87.0,88.0,0.47,0.46,0.35,0.39,33.95,34.75,43.6,44.3,25.75,24.9,1
2,87.0,87.45,0.47,0.49,0.35,0.36,33.95,33.7,43.6,41.9,25.75,23.4,1
3,89.85,87.0,0.49,0.47,0.39,0.35,36.05,33.95,46.1,43.6,25.25,25.75,1
4,88.7,87.0,0.43,0.47,0.35,0.35,35.7,33.95,48.55,43.6,22.15,25.75,0


In [4]:
columns = ["HOME_TEAM_ID", "VISITOR_TEAM_ID", "SEASON", "PTS_home", "FG_PCT_home", "FT_PCT_home", "FG3_PCT_home", "AST_home", "REB_home", "PTS_away", "FG_PCT_away", "FT_PCT_away", "FG3_PCT_away", "AST_away", "REB_away"]

target = ["HOME_TEAM_WINS"]

In [5]:
"""# Load the data from AWS - REVIEW WITH TEAM
file_path ="https://YOUR-BUCKET-NAME.s3.amazonaws.com/INSERTLINK"
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null comumns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Convert the target column values to win/loss based on their values
x = {'1': 'Win'}   
df = df.replace(x)

x = {'0': 'Loss'}   
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head() """

#create column with FGA difference between teams
trans_df["FGA_DIFF"]=trans_df["FGA_HOME"]-trans_df["FGA_AWAY"]
trans_df.drop(columns=["FGA_HOME","FGA_AWAY"], inplace=True)
trans_df.head()

#Create column with FG percent ratio between teams
trans_df["FG_PCT_RATIO"]=trans_df["FG_PCT_HOME"]/trans_df["FG_PCT_AWAY"]
trans_df.drop(columns=["FG_PCT_HOME", "FG_PCT_AWAY"], inplace=True)

#Create column with 3-point FG percent ratio between teams
trans_df["FG3_PCT_RATIO"] = trans_df["FG3_PCT_HOME"]/trans_df["FG3_PCT_AWAY"]
trans_df.drop(columns=["FG3_PCT_HOME", "FG3_PCT_AWAY"], inplace = True)

#Create column with Defensive rebound difference between teams
trans_df["DREB_DIFF"]=trans_df["DREB_HOME"]-trans_df["DREB_AWAY"]
trans_df.drop(columns=["DREB_HOME", "DREB_AWAY"], inplace =True)

#Create column with rebound difference between teams
trans_df["REB_DIFF"]=trans_df["REB_HOME"]-trans_df["REB_AWAY"]
trans_df.drop(columns=["REB_HOME","REB_AWAY"], inplace=True)

#Create column with assist difference between teams
trans_df["AST_DIFF"]=trans_df["AST_HOME"]-trans_df["AST_AWAY"]
trans_df.drop(columns=["AST_HOME","AST_AWAY"], inplace=True)

win=trans_df["WIN"]
trans_df.head()

win=trans_df["WIN"]
trans_df.drop(columns=["WIN"], inplace=True)
trans_df["WIN"]=win
trans_df.head()

Unnamed: 0,FGA_DIFF,FG_PCT_RATIO,FG3_PCT_RATIO,DREB_DIFF,REB_DIFF,AST_DIFF,WIN
0,-0.6,1.042553,0.971429,3.25,3.3,2.95,1
1,-1.0,1.021739,0.897436,-0.8,-0.7,0.85,1
2,-0.45,0.959184,0.972222,0.25,1.7,2.35,1
3,2.85,1.042553,1.114286,2.1,2.5,-0.5,1
4,1.7,0.914894,1.0,1.75,4.95,-3.6,0


In [6]:
#Creating input and output data
trans_df["WIN"] = y
y = trans_df["WIN"]
X_df = trans_df.drop(["WIN"], axis=1)

In [7]:
# Check the balance of our target values
y.value_counts()

1    1646
0    1354
Name: WIN, dtype: int64

In [8]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_df, y, random_state=42, stratify=y)

In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()

In [10]:
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

In [11]:
# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Oversampling

from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [13]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


array([[102, 237],
       [103, 308]], dtype=int64)

In [14]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5251383416230648

In [15]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))


                   pre       rec       spe        f1       geo       iba       sup

          0       0.50      0.30      0.75      0.38      0.47      0.22       339
          1       0.57      0.75      0.30      0.64      0.47      0.24       411

avg / total       0.53      0.55      0.50      0.52      0.47      0.23       750



In [16]:
# SMOTEEN

from imblearn.over_sampling import SMOTE
X_resampled1, y_resampled1 = SMOTE(random_state=42,
sampling_strategy='auto').fit_resample(
   X_train_scaled, y_train)

Counter(y_resampled1)

model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled1, y_resampled1)

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


0.5131559115474883

In [17]:
confusion_matrix(y_test, y_pred)

array([[ 98, 241],
       [108, 303]], dtype=int64)

In [18]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.48      0.29      0.74      0.36      0.46      0.20       339
          1       0.56      0.74      0.29      0.63      0.46      0.22       411

avg / total       0.52      0.53      0.49      0.51      0.46      0.21       750



In [19]:
"""
# Undersampling

from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled2, y_resampled2 = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=250,n_jobs=250)
model.fit(X_resampled2, y_resampled2)

from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)


SyntaxError: EOF while scanning triple-quoted string literal (Temp/ipykernel_35504/3170895798.py, line 14)

In [None]:
"""
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

In [None]:
"""
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [21]:
# Decision Tree Model
## thought to give it a try because it is a binary outcome, win or loss = 1 or 0. 

# Import Dependencies
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,142,197
Actual 1,192,219


In [22]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [23]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,142,197
Actual 1,192,219


Accuracy Score : 0.48133333333333334
Classification Report
              precision    recall  f1-score   support

           0       0.43      0.42      0.42       339
           1       0.53      0.53      0.53       411

    accuracy                           0.48       750
   macro avg       0.48      0.48      0.48       750
weighted avg       0.48      0.48      0.48       750

