In [None]:
#@title This cell processes possession data

import numpy as np
import pandas as pd

# Define the class that computes rolling attack/defense efficiency and pace ratings.
# The `df` argument is expected to be the combined game_data DataFrame built below.

class CalcTeamRatings:
  """Rolling efficiency and pace ratings for one matchup, built from earlier games only.

  Only rows of `df` whose 'Date' is strictly before `date` are considered, and
  any row containing a missing value in any column is dropped. From what is
  left, the home side is rated on its most recent home games and the away side
  on its most recent away games (at most WINDOW games each).

  Every rating uses the same three-tier blend of a single column:
    * fewer than RECENT games: the plain mean of all available games;
    * fewer than OLDER games: the average of the mean of the last RECENT games
      and the mean of all available games;
    * otherwise: the average of the mean of the first OLDER games in the
      window and the mean of the last RECENT games (the two slices overlap
      whenever the window holds fewer than OLDER + RECENT games).
  With no prior games the result is NaN (the mean of an empty column).

  Attributes:
    home, away: team identifiers matched against 'Home Team' / 'Away Team'.
    date: cutoff; only games before this date are used.
    df: the filtered, date-sorted, NaN-free history.
    home_df: last WINDOW rows of `df` where `home` was the home team.
    away_df: last WINDOW rows of `df` where `away` was the away team.
  """

  # Window sizes, in games, used by the blended averages.
  WINDOW = 30
  RECENT = 10
  OLDER = 20

  def __init__(self, home, away, date, df):
    """Filter `df` down to the prior home games of `home` and away games of `away`.

    Args:
      home: home team identifier for the game being rated.
      away: away team identifier for the game being rated.
      date: date of the game being rated; must be comparable with df['Date'].
      df: game log with at least 'Date', 'Home Team', 'Away Team', 'Home Eff',
        'Away Eff', 'Home Possessions' and 'Away Possessions' columns.
    """
    self.home = home
    self.date = date
    self.away = away
    self.df = df[df['Date'] < self.date].sort_values(['Date'])
    # Drops every row with a missing value in ANY column, not just the rated ones.
    self.df = self.df.dropna()
    self.home_df = self.df[self.df['Home Team'] == self.home].tail(self.WINDOW)
    self.away_df = self.df[self.df['Away Team'] == self.away].tail(self.WINDOW)

  def _blended_mean(self, games, column):
    """Return the three-tier blended mean of `games[column]` (see class docstring)."""
    values = games[column]
    if len(games) < self.RECENT:
      return values.mean()
    recent = values.tail(self.RECENT).mean()
    if len(games) < self.OLDER:
      return (recent + values.mean()) / 2
    return (values.head(self.OLDER).mean() + recent) / 2

  def rollingAvgEff(self):
    """Blended 'Home Eff' over the home team's recent home games."""
    return self._blended_mean(self.home_df, 'Home Eff')

  def rollingAvgEffA(self):
    """Blended 'Away Eff' (the visitors' efficiency) over the home team's recent home games."""
    return self._blended_mean(self.home_df, 'Away Eff')

  def away_rollingAvgEff(self):
    """Blended 'Away Eff' over the away team's recent away games."""
    return self._blended_mean(self.away_df, 'Away Eff')

  def away_rollingAvgEffA(self):
    """Blended 'Home Eff' (the hosts' efficiency) over the away team's recent away games."""
    return self._blended_mean(self.away_df, 'Home Eff')

  def home_pace(self):
    """Blended 'Home Possessions' over the home team's recent home games."""
    return self._blended_mean(self.home_df, 'Home Possessions')

  def away_pace(self):
    """Blended 'Away Possessions' over the away team's recent away games."""
    return self._blended_mean(self.away_df, 'Away Possessions')

# Season labels; each one names a '<start>-<end>.xlsx' possession workbook.
seasons = ['2007-2008', '2008-2009', '2009-2010', '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015',
           '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# Collect one frame per season and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every iteration.
season_frames = []

for season in seasons:
  df = pd.read_excel(f'/content/drive/MyDrive/Sports Data Analysis/NBA/Possession Data/Excel Files/{season}.xlsx')
  s1, s2 = season.split('-')
  # A season is labelled by the year in which it ends.
  df['Season'] = int(s2)

  # Efficiency = points per possession.
  df['Home Eff'] = df['Home Points'] / df['Home Possessions']
  df['Away Eff'] = df['Away Points'] / df['Away Possessions']

  # Map older team codes onto the codes used for the same slot elsewhere in this notebook.
  # The replacement matches whole cell values in every column.
  df = df.replace({'NOH': 'NOP', 'SEA': 'OKC', 'NJN': 'BKN'})

  # Game ID = first 10 characters of the date string followed by the home team code.
  df['Game ID'] = [str(game_date)[:10] + team for game_date, team in zip(df['Date'], df['Home Team'])]
  df.to_csv(f'/content/drive/MyDrive/Sports Data Analysis/NBA/Possession Data/CSV Files/{season}.csv')
  season_frames.append(df)

game_data = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()

r = range(len(game_data))
home_atk = []
home_def = []
away_atk = []
away_def = []
home_pace = []
away_pace = []

# Each game is rated from the games played strictly before it (see CalcTeamRatings).
# The ratings are accumulated in lists and attached only after the loop: the class calls
# dropna() on the frame it receives, so half-filled rating columns must not exist yet.
home_teams = game_data['Home Team']
away_teams = game_data['Away Team']
game_dates = game_data['Date']
for i in r:
  obj = CalcTeamRatings(home_teams.iloc[i], away_teams.iloc[i], game_dates.iloc[i], game_data)
  home_atk.append(obj.rollingAvgEff())
  home_def.append(obj.rollingAvgEffA())
  away_atk.append(obj.away_rollingAvgEff())
  away_def.append(obj.away_rollingAvgEffA())
  home_pace.append(obj.home_pace())
  away_pace.append(obj.away_pace())

game_data['Home Atk'] = np.array(home_atk)
game_data['Home Def'] = np.array(home_def)
game_data['Away Atk'] = np.array(away_atk)
game_data['Away Def'] = np.array(away_def)
game_data['Home Pace'] = np.array(home_pace)
game_data['Away Pace'] = np.array(away_pace)

game_data.to_csv('/content/drive/MyDrive/Sports Data Analysis/NBA/Possession Data/CSV Files/2007-2022.csv')


In [None]:
#@title This cell processes historic betting data

import numpy as np
import pandas as pd

# All scores, spreads and totals are listed with the home team first (soccer style), e.g. '<home>-<away>'

def _format_odds_date(raw, start_year, end_year):
  """Turn a raw MDD / MMDD date value into a 'YYYY-MM-DD' string.

  Four-character values (MMDD) are placed in `start_year`; anything else is
  read as MDD and placed in `end_year`.
  """
  digits = str(raw)
  if len(digits) == 4:
    return start_year + '-' + digits[0] + digits[1] + '-' + digits[2] + digits[3]
  return end_year + '-' + '0' + digits[0] + '-' + digits[1] + digits[2]


def _strip_spaces(names):
  """Return `names` as an array with every space removed ('New York' -> 'NewYork')."""
  return np.array([''.join(name.split(' ')) for name in names])


def odds_compiler(season):
  """Read one season's betting-odds workbook and return one row per game.

  The workbook holds two rows per game, told apart by the 'VH' column ('H' and
  'V'). Rows are paired by position after filtering on 'VH', so any other 'VH'
  value is ignored and the two sides must appear in the same game order.

  Args:
    season: label such as '2007-2008'; selects the workbook and supplies the
      years used to rebuild full dates. 'Season' in the result is the end year.

  Returns:
    DataFrame with columns Date, Home Team, Spread, Home ML, Away Team,
    Away ML, O_U, 1st, 2nd, 3rd, 4th, Final, MOV, Total, Season. Period and
    final scores are '<home>-<away>' strings; MOV is home minus away and Total
    is home plus away.

  Raises:
    ValueError: if the numbers of 'H' and 'V' rows differ, since rows could
      then not be paired into games.

  NOTE(review): four-digit dates are always assigned to the season's start
  year. If a workbook contains games played in Oct-Dec of the END year, they
  would be labelled one year early - confirm against the data.
  """
  # 'pk' (presumably a pick'em line) and 'NL' (presumably "no line") are replaced with 0.
  odds = pd.read_excel(f'/content/drive/MyDrive/Sports Data Analysis/NBA/Betting Odds/Excel Files/nba odds {season}.xlsx').replace(to_replace='pk', value=0)
  home_odds = odds[odds['VH'] == 'H'].reset_index(drop=True).replace(to_replace='NL', value=0)
  away_odds = odds[odds['VH'] == 'V'].reset_index(drop=True).replace(to_replace='NL', value=0)

  if len(home_odds) != len(away_odds):
    raise ValueError(
        f'season {season}: {len(home_odds)} home rows but {len(away_odds)} away rows; '
        'cannot pair rows into games')

  s1, s2 = season.split('-')
  s = np.full(len(home_odds), int(s2))

  home_ml = np.array(home_odds['ML'])
  away_ml = np.array(away_odds['ML'])

  # The date is taken from the home row of each game.
  date = [_format_odds_date(raw, s1, s2) for raw in home_odds['Date']]

  # Team names are normalised into new arrays rather than written back through
  # chained indexing (`frame['Team'].iloc[i] = ...`), which pandas warns about
  # and which has no effect under Copy-on-Write.
  home_teams = _strip_spaces(home_odds['Team'])
  away_teams = _strip_spaces(away_odds['Team'])

  # The original data always puts the spread on the same line as the favorite
  # and the over/under on the same line as the underdog. The side with the
  # lower moneyline is treated as the favorite; the spread is stored from the
  # home team's point of view (negative when the home team is favored).
  spread = []
  over_under = []
  for h_ml, a_ml, h_close, a_close in zip(home_ml, away_ml, home_odds['Close'], away_odds['Close']):
    if int(h_ml) < int(a_ml):
      spread.append(-float(h_close))
      over_under.append(a_close)
    else:
      spread.append(a_close)
      over_under.append(h_close)

  def paired(column):
    """'<home>-<away>' strings for one score column."""
    return np.array([str(h) + '-' + str(a) for h, a in zip(home_odds[column], away_odds[column])])

  home_final = list(home_odds['Final'])
  away_final = list(away_odds['Final'])

  # TODO: convert dates to a more usable format
  odds = pd.DataFrame({
      'Date': np.array(date),
      'Home Team': home_teams,
      'Spread': np.array(spread),
      'Home ML': home_ml,
      'Away Team': away_teams,
      'Away ML': away_ml,
      'O_U': np.array(over_under),
      '1st': paired('1st'),
      '2nd': paired('2nd'),
      '3rd': paired('3rd'),
      '4th': paired('4th'),
      'Final': paired('Final'),
      'MOV': np.array([h - a for h, a in zip(home_final, away_final)]),
      'Total': np.array([h + a for h, a in zip(home_final, away_final)]),
      'Season': s
  })
  return odds

# Season labels; each one names an 'nba odds <start>-<end>.xlsx' workbook.
seasons = ['2007-2008', '2008-2009', '2009-2010', '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015',
           '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']
# Space-stripped team names as produced by odds_compiler, and the code each one maps to
# (the two lists are matched by position).
current_team_names = ['Atlanta', 'Boston', 'Brooklyn', 'Charlotte', 'Chicago', 'Cleveland', 'Dallas',
                      'Denver', 'Detroit', 'GoldenState', 'Houston', 'Indiana', 'LAClippers',
                      'LALakers', 'Memphis', 'Miami', 'Milwaukee', 'Minnesota', 'NewOrleans',
                      'NewYork', 'OklahomaCity', 'Orlando', 'Philadelphia', 'Phoenix', 'Portland',
                      'Sacramento', 'SanAntonio', 'Toronto', 'Utah', 'Washington']
new_team_names = ['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL',
                  'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC',
                  'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP',
                  'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR',
                  'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# Collect one frame per season and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every iteration.
season_odds = []
for season in seasons:
  # Fold the two older team names into their current ones first, then map names to codes.
  # The replacements return new frames rather than mutating in place.
  temp = (
      odds_compiler(season)
      .replace(to_replace='NewJersey', value='Brooklyn')
      .replace(to_replace='Seattle', value='OklahomaCity')
      .replace(to_replace=current_team_names, value=new_team_names)
  )
  # Game ID = 'YYYY-MM-DD' date string followed by the home team code.
  temp['Game ID'] = temp['Date'] + temp['Home Team']
  temp.to_csv(f'/content/drive/MyDrive/Sports Data Analysis/NBA/Betting Odds/CSV Files/{season}.csv')
  season_odds.append(temp)

odds_data = pd.concat(season_odds, ignore_index=True) if season_odds else pd.DataFrame()

# The combined file is named '<first start year>-<last end year>.csv'.
first_season = seasons[0]
last_season = seasons[-1]
s1, s2 = first_season.split('-')
s3, s4 = last_season.split('-')
odds_data.to_csv(f'/content/drive/MyDrive/Sports Data Analysis/NBA/Betting Odds/CSV Files/{s1}-{s4}.csv')

In [7]:
# This cell is to combine the possession data and historic betting data into one dataset

import numpy as np
import pandas as pd

# Load the all-seasons CSVs written by the two cells above.
# NOTE(review): both files were saved with to_csv's default index column, so read_csv
# will presumably surface it as an extra unnamed column unless index_col is given - confirm.
betting_odds = pd.read_csv('/content/drive/MyDrive/Sports Data Analysis/NBA/Betting Odds/CSV Files/2007-2022.csv')
possession_data = pd.read_csv('/content/drive/MyDrive/Sports Data Analysis/NBA/Possession Data/CSV Files/2007-2022.csv')

# The betting odds dataset is larger because it includes playoff games, which the possession dataset lacks.
# The combined dataset is therefore sized by the possession dataset.

# Start from an empty frame. NOTE(review): how it gets populated is not visible in this part of the notebook.
combined_data = pd.DataFrame()
