<a href="https://colab.research.google.com/github/dani0621/ADV_CSIII_Project_1/blob/main/ADV_CSIII_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NBA Win Prediction Model**

### **Imports**

In [None]:
#importing files and libraries, connecting data file from Google Drive
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from google.colab import files
from google.colab import drive

drive.mount("/content/drive")
data=pd.read_csv('/content/drive/My Drive/MLA/team.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Rating and Percentage Calculations for Missing Data**

In [None]:
# Winning percentage equation from ChatGPT
data['W_PCT'] = data.groupby('team')['win'].transform(lambda x: x.cumsum() / (x.index + 1))

# True Shooting Percentage
# to_percentage= (points[PTS] * 100)/(2*(field goal attempts[FGA] + 0.44 * Free throw attempts [fta])) from Google Search
data['TO%'] = (data['PTS'] * 100) / (2 * (data['FGA'] + 0.44 * data['FTA']))

### **Clean Data/ Basic Analysis**

In [None]:
# Filter all the games played before 10/19/2018 as only the data from the past few years would be accurate in the our current win prediction model
data_filtered = data[data['date'] >= '10/19/2018']

# filters out rows that don't't have values
data_clean = data_filtered.dropna().drop_duplicates()

data_cleaned = data[['home','away', 'date', 'season', 'win', 'PTS', 'REB', 'TOV', '+/-', 'FTM', 'FTA', 'AST', 'BLK', 'TO%', 'W_PCT', 'OREB', 'DREB', 'FG%', 'FT%', '3P%', 'FGA', 'FGM', '3PM', 'STL']]

### **Logistic Regression**

#### **Points**

In [None]:
def performLinRegPt(dataframe):
  # factors
  featureColumns = ['W_PCT', 'REB', 'TOV', '+/-', 'TO%', 'FTM', 'FTA', 'AST', 'BLK', 'FT%', 'FG%', 'OREB', '3PM']

  X = dataframe[featureColumns]
  Y = dataframe['PTS']

  # training and testing sets
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  # model performance, accuracy, from Kandell
  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", math.sqrt(metrics.mean_squared_error(Y_test, Y_pred, squared=False)))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

pt_model = performLinRegPt(data_cleaned)

Mean Absolute Error: 1.9694866064656145
Mean Squared Error: 7.226182749792908
Root Mean Squared Error: 1.639559700931644
----------------------------------
Coefficient Information:
W_PCT: -26.56581999609999
REB: 0.11447430247621412
TOV: -0.18162115085260372
+/-: -0.028277590444782397
TO%: -6.734582675716202
FTM: 4.487266433704214
FTA: -1.6949758467803626
AST: 0.12114191681108055
BLK: -0.001988501271381438
FT%: 0.039651755502799
FG%: 7.568073358756645
OREB: 0.18023448402621542
3PM: 4.733241559525466




#### **Rebounds**

In [None]:
def performLinRegRB(dataframe):
  # factors
  featureColumns = ['OREB', 'DREB','FG%', 'PTS', 'FT%', '+/-', 'BLK', '3P%']

  X = dataframe[featureColumns]
  Y = dataframe['REB']

  # training and testing sets
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", math.sqrt(metrics.mean_squared_error(Y_test, Y_pred, squared=False)))
  print('----------------------------------')

  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

rb_model = performLinRegRB(data_cleaned)

Mean Absolute Error: 8.132928903595494e-15
Mean Squared Error: 1.1086851835089736e-28
Root Mean Squared Error: 1.0261292346009266e-07
----------------------------------
Coefficient Information:
OREB: 0.9999999999999998
DREB: 1.0
FG%: 4.440892098500626e-16
PTS: 4.440892098500626e-16
FT%: 0.0
+/-: 2.498001805406602e-16
BLK: 2.740863092043355e-16
3P%: -3.3306690738754696e-16




#### **Turnovers**

In [None]:
def performLinRegTO(dataframe):
  featureColumns = ['AST', 'REB', '+/-', "TO%", "FGA", "3P%", "STL","OREB","DREB", "FG%", "FT%"]

  X = dataframe[featureColumns]
  Y = dataframe['TOV']

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", math.sqrt(metrics.mean_squared_error(Y_test, Y_pred, squared=False)))
  print('----------------------------------')

  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

to_model = performLinRegTO(data_cleaned)

Mean Absolute Error: 2.4568638351909593
Mean Squared Error: 9.549301324388455
Root Mean Squared Error: 1.7578948725522772
----------------------------------
Coefficient Information:
AST: 0.0639563501029985
REB: 0.29002042095940206
+/-: -0.19344590235524883
TO%: 0.04742978859725357
FGA: -0.3091348557711949
3P%: 0.0251431156745522
STL: 0.4724445523616996
OREB: 0.1787379280889
DREB: 0.111282492870502
FG%: 0.19326233784235783
FT%: 0.027296918043249196




#### **Assists**

In [None]:
def performLinRegAS(dataframe):
  featureColumns = ['PTS', 'FGM', 'TOV', '+/-']

  X = dataframe[featureColumns]
  Y = dataframe['AST']

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", math.sqrt(metrics.mean_squared_error(Y_test, Y_pred, squared=False)))
  print('----------------------------------')

  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

as_model = performLinRegAS(data_cleaned)

Mean Absolute Error: 3.1217209055580817
Mean Squared Error: 15.266196798596722
Root Mean Squared Error: 1.9766633692078237
----------------------------------
Coefficient Information:
PTS: 0.032763760483376925
FGM: 0.528489924959131
TOV: 0.07485074707948804
+/-: 0.04500341384662948




#### **Free Throws**

In [None]:
def performLinRegFT(dataframe):
  featureColumns = ['PTS', 'FGA', 'REB']

  X = dataframe[featureColumns]
  Y = dataframe['FTM']

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", math.sqrt(metrics.mean_squared_error(Y_test, Y_pred, squared=False)))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

ft_model = performLinRegFT(data_cleaned)

Mean Absolute Error: 4.267834342243429
Mean Squared Error: 28.65818302009757
Root Mean Squared Error: 2.3137272576871877
----------------------------------
Coefficient Information:
PTS: 0.20486670513056385
FGA: -0.4088233617984033
REB: 0.19749896637549946




#### **Blocks**

In [None]:
def performLinRegBL(dataframe):
  # what factors we want to later use for our independent variables
  featureColumns = ['REB', '+/-', 'STL']

  # following yt videos
  X = dataframe[featureColumns]
  Y = dataframe['BLK']

  # training and testing sets, maybe change test_size (look into more later on Kandell used 0.25)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  # model performance, from Kandell
  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", math.sqrt(metrics.mean_squared_error(Y_test, Y_pred, squared=False)))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

bl_model = performLinRegBL(data_cleaned)

Mean Absolute Error: 1.9902134802171443
Mean Squared Error:



 6.295964301232655
Root Mean Squared Error: 1.58403788476682
----------------------------------
Coefficient Information:
REB: 0.05048240896019663
+/-: 0.026235907444745236
STL: 0.001180427792089559


#### **Field Goals Made**

In [None]:
def performLinRegFGM(dataframe):
  # what factors we want to later use for our independent variables
  featureColumns = ['FGA', 'FG%', 'PTS','3PM', 'AST', 'TOV', '3P%', 'TO%']

  # following yt videos
  X = dataframe[featureColumns]
  Y = dataframe['FGM']

  # training and testing sets, maybe change test_size (look into more later on Kandell used 0.25)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  # model performance, from Kandell
  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", math.sqrt(metrics.mean_squared_error(Y_test, Y_pred, squared=False)))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

fgm_model = performLinRegFGM(data_cleaned)

Mean Absolute Error: 0.2991662873696996
Mean Squared Error: 0.1994160888607083
Root Mean Squared Error: 0.6682516635278658
----------------------------------
Coefficient Information:
FGA: 0.4078153224448956
FG%: 0.8573171096377035
PTS: 0.036553610777032675
3PM: 0.03846238592692139
AST: 0.0075380392795171
TOV: -0.0014299489253606357
3P%: -0.0032999150100185837
TO%: -0.11158685892225895




### **Converting Abreviations to Full Names**

In [None]:
team_names = {
  'ATL': 'Atlanta Hawks',
  'BKN': 'Brooklyn Nets',
  'BOS': 'Boston Celtics',
  'CHA': 'Charlotte Hornets',
  'CHI': 'Chicago Bulls',
  'CLE': 'Cleveland Cavaliers',
  'DAL': 'Dallas Mavericks',
  'DEN': 'Denver Nuggets',
  'DET': 'Detroit Pistons',
  'GSW': 'Golden State Warriors',
  'HOU': 'Houston Rockets',
  'IND': 'Indiana Pacers',
  'LAC': 'Los Angeles Clippers',
  'LAL': 'Los Angeles Lakers',
  'MEM': 'Memphis Grizzlies',
  'MIA': 'Miami Heat',
  'MIL': 'Milwaukee Bucks',
  'MIN': 'Minnesota Timberwolves',
  'NOP': 'New Orleans Pelicans',
  'OKC': 'Oklahoma City Thunder',
  'ORL': 'Orlando Magic',
  'PHI': 'Philadelphia 76ers',
  'PHX': 'Phoenix Suns',
  'POR': 'Portland Trail Blazers',
  'SAC': 'Sacramento Kings',
  'SAS': 'San Antonio Spurs',
  'TOR': 'Toronto Raptors',
  'UTA': 'Utah Jazz',
  'WAS': 'Washington Wizards'
  }

def get_full_team_name(team_abbreviation):
  # abreviation -> full name
  return team_names.get(team_abbreviation, team_abbreviation)

### **Prediction for One Game**

In [139]:
def predict_game_performance(model, home_team, away_team, game_date, season, dataframe):
  # date formatting is consistent with file
  game_date = pd.to_datetime(game_date)
  data_cleaned.loc[:, 'date'] = pd.to_datetime(data_cleaned['date'])

  # gets stats for home team
  game_data = dataframe[(dataframe['home'] == home_team) & (dataframe['date'] == game_date) & (dataframe['season'] == season)]

  if game_data.empty:
      print("No data available for the home game.")
      return

  # Extract the feature values for the home team
  pt_featureColumns = ['W_PCT', 'REB', 'TOV', '+/-', 'TO%', 'FTM', 'FTA', 'AST', 'BLK', 'FT%', 'FG%', 'OREB', '3PM']
  rb_featureColumns = ['OREB', 'DREB','FG%', 'PTS', 'FT%', '+/-', 'BLK', '3P%']
  to_featureColumns = ['AST', 'REB', '+/-', "TO%", "FGA", "3P%", "STL","OREB","DREB", "FG%", "FT%"]
  as_featureColumns = ['PTS', 'FGM', 'TOV', '+/-']
  ft_featureColumns = ['PTS', 'FGA', 'REB']
  bl_featureColumns = ['REB', '+/-', 'STL']
  fgm_featureColumns = ['FGA', 'FG%', 'PTS','3PM', 'AST', 'TOV', '3P%', 'TO%']


  # predict the stats for the home team
  pt_stats_pred = pt_model.predict(game_data[pt_featureColumns])
  rb_stats_pred = rb_model.predict(game_data[rb_featureColumns])
  to_stats_pred = to_model.predict(game_data[to_featureColumns])
  as_stats_pred = as_model.predict(game_data[as_featureColumns])
  ft_stats_pred = ft_model.predict(game_data[ft_featureColumns])
  bl_stats_pred = bl_model.predict(game_data[bl_featureColumns])
  fgm_stats_pred = fgm_model.predict(game_data[fgm_featureColumns])


  # predictions for all variables
  points_pred = pt_stats_pred[0]
  rebounds_pred = rb_stats_pred[0]
  turnovers_pred = to_stats_pred[0]
  assists_pred = as_stats_pred[0]
  free_throws_pred = ft_stats_pred[0]
  blocks_pred = bl_stats_pred[0]
  fgm_pred = fgm_stats_pred[0]

  # Print prediction results
  win_prob = model.coef_[0]  # Example win probability calculation; customize as needed
  print(f'There is a {win_prob:.2f}% chance that the {get_full_team_name(home_team)} (home team) will defeat the {get_full_team_name(away_team)} (away team) by scoring {round(points_pred)} points with:')
  print(f'- {round(rebounds_pred)} rebounds')
  print(f'- {round(turnovers_pred)} turnovers')
  print(f'- {round(assists_pred)} assists')
  print(f'- {round(free_throws_pred)} free throws')
  print(f'- {round(blocks_pred)} blocks')
  print(f'- {round(fgm_pred)} field goals made')


# predicting for one game
predict_game_performance(pt_model, 'NOP', 'PHX', '2022-04-22', 2022, data_cleaned)

There is a -26.57% chance that the New Orleans Pelicans (home team) will defeat the Phoenix Suns (away team) by scoring 112 points with:
- 45 rebounds
- 16 turnovers
- 23 assists
- 22 free throws
- 5 blocks
- 38 field goals made
