<a href="https://colab.research.google.com/github/dani0621/ADV_CSIII_Project_1/blob/main/ADV_CSIII_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NBA Win Prediction Model**

### **Imports**

In [1]:
#importing files and libraries, connecting data file from Google Drive
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from google.colab import files
from google.colab import drive

drive.mount("/content/drive")
data=pd.read_csv('/content/drive/My Drive/MLA/team.csv')

Mounted at /content/drive


### **Rating and Percentage Calculations for Missing Data**

In [2]:
# Winning percentage equation from ChatGPT
data['W_PCT'] = data.groupby('team')['win'].transform(lambda x: x.cumsum() / (x.index + 1))

# True Shooting Percentage
# to_percentage= (points[PTS] * 100)/(2*(field goal attempts[FGA] + 0.44 * Free throw attempts [fta])) from Google Search
data['TO%'] = (data['PTS'] * 100) / (2 * (data['FGA'] + 0.44 * data['FTA']))

### **Clean Data/ Basic Analysis**

In [3]:
# Filter all the games played before 10/19/2018 as only the data from the past few years would be accurate in the our current win prediction model
data_filtered = data[data['date'] >= '10/19/2018']

# filters out rows that don't't have values
data_clean = data_filtered.dropna().drop_duplicates()

data_cleaned = data[['home','away', 'date', 'season', 'win', 'PTS', 'REB', 'TOV', '+/-', 'FTM', 'FTA', 'AST', 'BLK', 'TO%', 'W_PCT', 'OREB', 'DREB', 'FG%', 'FT%', '3P%', 'FGA', 'FGM', '3PM', 'STL']]

### **Logistic Regressions**

#### **Points**

In [19]:
def performLinRegPt(dataframe):
  # factors
  featureColumns = ['W_PCT', 'REB', 'TOV', '+/-', 'TO%', 'FTM', 'FTA', 'AST', 'BLK', 'FT%', 'FG%', 'OREB', '3PM']

  X = dataframe[featureColumns]
  Y = dataframe['PTS']

  # training and testing sets
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  # model performance, accuracy, from Kandell
  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", metrics.root_mean_squared_error(Y_test, Y_pred))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

pt_model = performLinRegPt(data_cleaned)

Mean Absolute Error: 1.9900546961106098
Mean Squared Error: 7.320605299002524
Root Mean Squared Error: 2.705661711855812
----------------------------------
Coefficient Information:
W_PCT: -24.177310139884035
REB: 0.11568425744869301
TOV: -0.181922253852623
+/-: -0.027919781142483974
TO%: -6.732549047235092
FTM: 4.495718204653165
FTA: -1.7026585104257468
AST: 0.12072113714766768
BLK: -0.006651898675963055
FT%: 0.039552936383902185
FG%: 7.565412297895933
OREB: 0.17872120600528943
3PM: 4.733365966059371


#### **Rebounds**

In [18]:
def performLinRegRB(dataframe):
  # factors
  featureColumns = ['OREB', 'DREB','FG%', 'PTS', 'FT%', '+/-', 'BLK', '3P%']

  X = dataframe[featureColumns]
  Y = dataframe['REB']

  # training and testing sets
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", metrics.root_mean_squared_error(Y_test, Y_pred))
  print('----------------------------------')

  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

rb_model = performLinRegRB(data_cleaned)

Mean Absolute Error: 7.22917754379742e-15
Mean Squared Error: 9.052476103813857e-29
Root Mean Squared Error: 9.514450117486485e-15
----------------------------------
Coefficient Information:
OREB: 0.9999999999999996
DREB: 1.0000000000000004
FG%: 4.440892098500626e-16
PTS: 5.551115123125783e-17
FT%: -2.7755575615628914e-16
+/-: -2.3592239273284576e-16
BLK: 2.7755575615628914e-17
3P%: 5.551115123125783e-16


#### **Turnovers**

In [17]:
def performLinRegTO(dataframe):
  featureColumns = ['AST', 'REB', '+/-', "TO%", "FGA", "3P%", "STL","OREB","DREB", "FG%", "FT%"]

  X = dataframe[featureColumns]
  Y = dataframe['TOV']

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", metrics.root_mean_squared_error(Y_test, Y_pred))
  print('----------------------------------')

  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

to_model = performLinRegTO(data_cleaned)

Mean Absolute Error: 2.4576581914351516
Mean Squared Error: 9.498559865528138
Root Mean Squared Error: 3.0819733719693523
----------------------------------
Coefficient Information:
AST: 0.06404194556258672
REB: 0.29172129289506493
+/-: -0.19397321961787947
TO%: 0.05495787713811429
FGA: -0.3094252607710219
3P%: 0.02466322291329305
STL: 0.4730107529579296
OREB: 0.1831736130066181
DREB: 0.1085476798884471
FG%: 0.18867028775125572
FT%: 0.02692404906639759


#### **Assists**

In [16]:
def performLinRegAS(dataframe):
  featureColumns = ['PTS', 'FGM', 'TOV', '+/-']

  X = dataframe[featureColumns]
  Y = dataframe['AST']

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", metrics.root_mean_squared_error(Y_test, Y_pred))
  print('----------------------------------')

  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

as_model = performLinRegAS(data_cleaned)

Mean Absolute Error: 3.136043395538476
Mean Squared Error: 15.381665595871262
Root Mean Squared Error: 3.921946658978327
----------------------------------
Coefficient Information:
PTS: 0.030679431368210587
FGM: 0.5297011749172122
TOV: 0.06837399919164579
+/-: 0.046988769283430765


#### **Free Throws**

In [14]:
def performLinRegFT(dataframe):
  featureColumns = ['PTS', 'FGA', 'REB']

  X = dataframe[featureColumns]
  Y = dataframe['FTM']

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", metrics.root_mean_squared_error(Y_test, Y_pred))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

ft_model = performLinRegFT(data_cleaned)

Mean Absolute Error: 4.260292827986912
Mean Squared Error: 28.606322231006853
Root Mean Squared Error: 5.3484878452705535
----------------------------------
Coefficient Information:
PTS: 0.20568255207320194
FGA: -0.41395505464200727
REB: 0.19673452552091222


#### **Blocks**

In [20]:
def performLinRegBL(dataframe):
  # what factors we want to later use for our independent variables
  featureColumns = ['REB', '+/-', 'STL']

  # following yt videos
  X = dataframe[featureColumns]
  Y = dataframe['BLK']

  # training and testing sets, maybe change test_size (look into more later on Kandell used 0.25)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  # model performance, from Kandell
  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", metrics.root_mean_squared_error(Y_test, Y_pred))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

bl_model = performLinRegBL(data_cleaned)

Mean Absolute Error: 2.015882782896304
Mean Squared Error: 6.464026648980332
Root Mean Squared Error: 2.542445013954153
----------------------------------
Coefficient Information:
REB: 0.05148981169814914
+/-: 0.026725706114708474
STL: -0.0009488074122541686


#### **Field Goals Made**

In [21]:
def performLinRegFGM(dataframe):
  # what factors we want to later use for our independent variables
  featureColumns = ['FGA', 'FG%', 'PTS','3PM', 'AST', 'TOV', '3P%', 'TO%']

  # following yt videos
  X = dataframe[featureColumns]
  Y = dataframe['FGM']

  # training and testing sets, maybe change test_size (look into more later on Kandell used 0.25)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  # model performance, from Kandell
  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", metrics.root_mean_squared_error(Y_test, Y_pred))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

fgm_model = performLinRegFGM(data_cleaned)

Mean Absolute Error: 0.30105190115158537
Mean Squared Error: 0.20052118523595122
Root Mean Squared Error: 0.44779591918188716
----------------------------------
Coefficient Information:
FGA: 0.40680380845845854
FG%: 0.8588739258606423
PTS: 0.03738769253228191
3PM: 0.03935723777477218
AST: 0.007433722666306292
TOV: -0.001905380835459075
3P%: -0.003113437785528486
TO%: -0.11462939903492278


### **Converting Abreviations to Full Names**

In [11]:
team_names = {
  'ATL': 'Atlanta Hawks',
  'BKN': 'Brooklyn Nets',
  'BOS': 'Boston Celtics',
  'CHA': 'Charlotte Hornets',
  'CHI': 'Chicago Bulls',
  'CLE': 'Cleveland Cavaliers',
  'DAL': 'Dallas Mavericks',
  'DEN': 'Denver Nuggets',
  'DET': 'Detroit Pistons',
  'GSW': 'Golden State Warriors',
  'HOU': 'Houston Rockets',
  'IND': 'Indiana Pacers',
  'LAC': 'Los Angeles Clippers',
  'LAL': 'Los Angeles Lakers',
  'MEM': 'Memphis Grizzlies',
  'MIA': 'Miami Heat',
  'MIL': 'Milwaukee Bucks',
  'MIN': 'Minnesota Timberwolves',
  'NOP': 'New Orleans Pelicans',
  'OKC': 'Oklahoma City Thunder',
  'ORL': 'Orlando Magic',
  'PHI': 'Philadelphia 76ers',
  'PHX': 'Phoenix Suns',
  'POR': 'Portland Trail Blazers',
  'SAC': 'Sacramento Kings',
  'SAS': 'San Antonio Spurs',
  'TOR': 'Toronto Raptors',
  'UTA': 'Utah Jazz',
  'WAS': 'Washington Wizards'
  }

def get_full_team_name(team_abbreviation):
  # abreviation -> full name
  return team_names.get(team_abbreviation, team_abbreviation)

### **Prediction for One Game**

In [22]:
def predict_game_performance(model, home_team, away_team, game_date, season, dataframe):
  # date formatting is consistent with file
  game_date = pd.to_datetime(game_date)
  data_cleaned.loc[:, 'date'] = pd.to_datetime(data_cleaned['date'])

  # gets stats for home team
  game_data = dataframe[(dataframe['home'] == home_team) & (dataframe['date'] == game_date) & (dataframe['season'] == season)]

  if game_data.empty:
      print("No data available for the home game.")
      return

  # Extract the feature values for the home team
  pt_featureColumns = ['W_PCT', 'REB', 'TOV', '+/-', 'TO%', 'FTM', 'FTA', 'AST', 'BLK', 'FT%', 'FG%', 'OREB', '3PM']
  rb_featureColumns = ['OREB', 'DREB','FG%', 'PTS', 'FT%', '+/-', 'BLK', '3P%']
  to_featureColumns = ['AST', 'REB', '+/-', "TO%", "FGA", "3P%", "STL","OREB","DREB", "FG%", "FT%"]
  as_featureColumns = ['PTS', 'FGM', 'TOV', '+/-']
  ft_featureColumns = ['PTS', 'FGA', 'REB']
  bl_featureColumns = ['REB', '+/-', 'STL']
  fgm_featureColumns = ['FGA', 'FG%', 'PTS','3PM', 'AST', 'TOV', '3P%', 'TO%']


  # predict the stats for the home team
  pt_stats_pred = pt_model.predict(game_data[pt_featureColumns])
  rb_stats_pred = rb_model.predict(game_data[rb_featureColumns])
  to_stats_pred = to_model.predict(game_data[to_featureColumns])
  as_stats_pred = as_model.predict(game_data[as_featureColumns])
  ft_stats_pred = ft_model.predict(game_data[ft_featureColumns])
  bl_stats_pred = bl_model.predict(game_data[bl_featureColumns])
  fgm_stats_pred = fgm_model.predict(game_data[fgm_featureColumns])


  # predictions for all variables
  points_pred = pt_stats_pred[0]
  rebounds_pred = rb_stats_pred[0]
  turnovers_pred = to_stats_pred[0]
  assists_pred = as_stats_pred[0]
  free_throws_pred = ft_stats_pred[0]
  blocks_pred = bl_stats_pred[0]
  fgm_pred = fgm_stats_pred[0]

  # Print prediction results
  win_prob = model.coef_[0]  # Example win probability calculation; customize as needed
  print(f'There is a {win_prob:.2f}% chance that the {get_full_team_name(home_team)} (home team) will defeat the {get_full_team_name(away_team)} (away team) by scoring {round(points_pred)} points with:')
  print(f'- {round(rebounds_pred)} rebounds')
  print(f'- {round(turnovers_pred)} turnovers')
  print(f'- {round(assists_pred)} assists')
  print(f'- {round(free_throws_pred)} free throws')
  print(f'- {round(blocks_pred)} blocks')
  print(f'- {round(fgm_pred)} field goals made')


# predicting for one game
predict_game_performance(pt_model, 'NOP', 'PHX', '2022-04-22', 2022, data_cleaned)

There is a -24.18% chance that the New Orleans Pelicans (home team) will defeat the Phoenix Suns (away team) by scoring 112 points with:
- 45 rebounds
- 16 turnovers
- 23 assists
- 22 free throws
- 5 blocks
- 38 field goals made
