<a href="https://colab.research.google.com/github/dani0621/ADV_CSIII_Project_1/blob/main/ADV_CSIII_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NBA Win Prediction Model**

### **Imports**

In [47]:
#importing files and libraries, connecting data file from Google Drive
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from google.colab import files
from google.colab import drive

drive.mount("/content/drive")
data=pd.read_csv('/content/drive/My Drive/MLA/team.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Rating and Percentage Calculations for Missing Data**

In [48]:
# Winning percentage equation from ChatGPT
data['W_PCT'] = data.groupby('team')['win'].transform(lambda x: x.cumsum() / (x.index + 1))

# True Shooting Percentage
# ts_percentage= (points[PTS] * 100)/(2*(field goal attempts[FGA] + 0.44 * Free throw attempts [fta])) from Google Search
data['TS_PCT'] = (data['PTS'] * 100) / (2 * (data['FGA'] + 0.44 * data['FTA']))

### **Clean Data/ Basic Analysis**

In [49]:
# Filter all the games played before 10/19/2018 as only the data from the past few years would be accurate in the our current win prediction model
data_filtered = data[data['date'] >= '10/19/2018']

# filters out rows that don't't have values
data_clean = data_filtered.dropna().drop_duplicates()

data_cleaned = data[['home','away', 'date', 'season', 'win', 'PTS', 'REB', 'TOV', '+/-', 'FTM', 'FTA', 'AST', 'BLK', 'TS_PCT', 'W_PCT']]

### **Logistic Regressions**

In [61]:
def performLinReg(dataframe):
  # what factors we want to later use for our independent variables
  featureColumns = ['W_PCT', 'REB', 'TOV', '+/-', 'TS_PCT', 'FTM', 'FTA', 'AST', 'BLK']

  # following yt videos
  X = dataframe[featureColumns]
  Y = dataframe['PTS']

  # training and testing sets, maybe change test_size (look into more later on Kandell used 0.25)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

  linreg = LinearRegression()
  linreg.fit(X_train, Y_train)
  Y_pred = linreg.predict(X_test)

  # model performance, from Kandell
  print("Mean Absolute Error:", metrics.mean_absolute_error(Y_test, Y_pred))
  print("Mean Squared Error:", metrics.mean_squared_error(Y_test, Y_pred))
  print("Root Mean Squared Error:", math.sqrt(metrics.mean_squared_error(Y_test, Y_pred, squared=False)))
  print('----------------------------------')

  # displaying coefficients for model
  print('Coefficient Information:')
  for i, feature in enumerate(featureColumns):
      print(f"{feature}: {linreg.coef_[i]}")

  return linreg

model = performLinReg(data_cleaned)

Mean Absolute Error: 4.702170899882689
Mean Squared Error: 35.7462699369314
Root Mean Squared Error: 2.44516225225856
----------------------------------
Coefficient Information:
W_PCT: -92.59449675025297
REB: 0.7649161727915788
TOV: -0.8000191865457162
+/-: -0.20376714856987643
TS_PCT: 1.753801887426401
FTM: 0.2681271814692355
FTA: 0.05617358029874269
AST: 0.604495756149373
BLK: -0.062411343744697964




### **Converting Abreviations to Full Names**

In [51]:
team_names = {
  'ATL': 'Atlanta Hawks',
  'BKN': 'Brooklyn Nets',
  'BOS': 'Boston Celtics',
  'CHA': 'Charlotte Hornets',
  'CHI': 'Chicago Bulls',
  'CLE': 'Cleveland Cavaliers',
  'DAL': 'Dallas Mavericks',
  'DEN': 'Denver Nuggets',
  'DET': 'Detroit Pistons',
  'GSW': 'Golden State Warriors',
  'HOU': 'Houston Rockets',
  'IND': 'Indiana Pacers',
  'LAC': 'Los Angeles Clippers',
  'LAL': 'Los Angeles Lakers',
  'MEM': 'Memphis Grizzlies',
  'MIA': 'Miami Heat',
  'MIL': 'Milwaukee Bucks',
  'MIN': 'Minnesota Timberwolves',
  'NOP': 'New Orleans Pelicans',
  'OKC': 'Oklahoma City Thunder',
  'ORL': 'Orlando Magic',
  'PHI': 'Philadelphia 76ers',
  'PHX': 'Phoenix Suns',
  'POR': 'Portland Trail Blazers',
  'SAC': 'Sacramento Kings',
  'SAS': 'San Antonio Spurs',
  'TOR': 'Toronto Raptors',
  'UTA': 'Utah Jazz',
  'WAS': 'Washington Wizards'
  }

def get_full_team_name(team_abbreviation):
  # abreviation -> full name
  return team_names.get(team_abbreviation, team_abbreviation)

### **Prediction for One Game**

In [62]:
def predict_game_performance(model, home_team, away_team, game_date, season, dataframe):
  # date formatting is consistent with file
  game_date = pd.to_datetime(game_date)
  data_cleaned.loc[:, 'date'] = pd.to_datetime(data_cleaned['date'])

  # gets stats for home team
  game_data_home = dataframe[(dataframe['home'] == home_team) & (dataframe['date'] == game_date) & (dataframe['season'] == season)]

  if game_data_home.empty:
      print("No data available for the home game.")
      return

  # Extract the feature values for the home team
  featureColumns = ['W_PCT', 'REB', 'TOV', '+/-', 'TS_PCT', 'FTM', 'FTA', 'AST', 'BLK']


  X_new_home = game_data_home[featureColumns]

  # Predict stats for the home team
  home_stats_pred = model.predict(X_new_home)

  # For demonstration, let's assume the model is trained only for points
  points_pred_home = home_stats_pred[0]  # Assuming first output is points
  # Create dummy predictions for other stats if only points are predicted
  rebounds_pred_home = game_data_home['REB'].mean()  # Replace with appropriate logic
  turnovers_pred_home = game_data_home['TOV'].mean()  # Replace with appropriate logic
  assists_pred_home = game_data_home['AST'].mean()  # Replace with appropriate logic
  free_throws_pred_home = game_data_home['FTM'].mean()  # Replace with appropriate logic
  blocks_pred_home = game_data_home['BLK'].mean()  # Replace with appropriate logic


  # Predict away team points using relevant features for the away team
  game_data_away = dataframe[(dataframe['away'] == away_team) &
                              (dataframe['date'] == game_date) &
                              (dataframe['season'] == season)]

  if game_data_away.empty:
      print("No data available for the away game.")
      return

  X_new_away = game_data_away[featureColumns]

  # Print prediction results
  win_prob = model.coef_[0]  # Example win probability calculation; customize as needed
  print(f'There is a {win_prob:.2f}% chance that the {home_team} will defeat the {away_team} by:')
  print(f'- Home Team Scoring: {points_pred_home:.2f} points')
  print(f'- Home Team Rebounds: {rebounds_pred_home:.2f}')
  print(f'- Home Team Turnovers: {turnovers_pred_home:.2f}')
  print(f'- Home Team Assists: {assists_pred_home:.2f}')
  print(f'- Home Team Free Throws: {free_throws_pred_home:.2f}')
  print(f'- Home Team Blocks: {blocks_pred_home:.2f}')

# predicting for one game
predict_game_performance(model, 'NOP', 'PHX', '2022-04-22', 2022, data_cleaned)

There is a -92.59% chance that the NOP will defeat the PHX by:
- Home Team Scoring: 113.02 points
- Home Team Rebounds: 40.00
- Home Team Turnovers: 10.50
- Home Team Assists: 21.50
- Home Team Free Throws: 23.00
- Home Team Blocks: 4.00
