Read CSV data with Pandas.

In [95]:
import pandas as pd

# team stats
team_summaries = pd.read_csv('nba_data/stats_archive/Team Summaries.csv')
# team_totals = pd.read_csv('nba_data/stats_archive/Team Totals.csv')
team_per100 = pd.read_csv('nba_data/stats_archive/Team Stats Per 100 Poss.csv')

# player stats
# player_totals = pd.read_csv('nba_data/stats_archive/Player Totals.csv')
player_shooting = pd.read_csv('nba_data/stats_archive/Player Shooting.csv')
player_play_by_play = pd.read_csv('nba_data/stats_archive/Player Play By Play.csv')
player_per100 = pd.read_csv('nba_data/stats_archive/Per 100 Poss.csv')

# games
games = pd.read_csv('nba_data/game_archive/games.csv')
teamids = pd.read_csv('nba_data/game_archive/teams.csv')


Remove unnecessary features.

In [96]:
# games
teamids['TEAM_NAME'] = teamids['CITY'] + ' ' + teamids['NICKNAME']
teamids = teamids[['TEAM_ID','TEAM_NAME','ABBREVIATION']]

games = games[['GAME_DATE_EST','GAME_ID','HOME_TEAM_ID','VISITOR_TEAM_ID','SEASON','HOME_TEAM_WINS','PTS_home','PTS_away']]

In [97]:
# team stats
team_stats = team_summaries.merge(team_per100)
team_stats.drop(columns=['playoffs','arena','attend','lg'], inplace=True)

# player stats
player_stats = player_shooting.merge(player_play_by_play).merge(player_per100)
player_stats.drop(columns=['birth_year','pos','lg'], inplace=True)

Create player efficiency indices.

𝐼NBA=𝑁pts+𝑁rbs+𝑁asts+𝑁stls+𝑁blcks−(𝑁msFld+𝑁msFT+𝑁tos)

In [98]:
player_stats['player_eff_ind'] = (player_stats['pts_per_100_poss'] +
                           player_stats['trb_per_100_poss'] +
                           player_stats['ast_per_100_poss'] +
                           player_stats['stl_per_100_poss'] +
                           player_stats['blk_per_100_poss'] -
                           (player_stats['tov_per_100_poss'] +
                            player_stats['fga_per_100_poss'] - player_stats['fg_per_100_poss'] +
                            player_stats['fta_per_100_poss'] - player_stats['ft_per_100_poss']))
# player_stats

Create team efficiency indices.

In [99]:
#create the team efficeny index by summing the players by season and team 
team_eff_ind = player_stats.groupby(['tm', 'season'])['player_eff_ind'].sum().reset_index(name='team_eff_ind')
team_eff_ind.rename(columns = {'tm':'abbreviation'}, inplace = True)

In [100]:
#merging the team efficeny index to team_stats
team_stats = team_stats.merge(team_eff_ind)
# team_stats

To each game, append home and away team stats in the respective season.

In [101]:
# add team_id to team stats
teamids.rename(columns = {'ABBREVIATION':'abbreviation','TEAM_NAME':'team'}, inplace = True)
team_stats = team_stats.merge(teamids)
team_stats.drop(columns=['team','abbreviation'], inplace = True)

In [102]:
min(team_stats.season)

1997

In [122]:
# add home team stats to games
h_team_stats = team_stats.add_prefix('h_')
final_df = games.merge(h_team_stats, left_on=['HOME_TEAM_ID','SEASON'], right_on=['h_TEAM_ID','h_season'])
final_df.drop(columns=['h_season','h_TEAM_ID'], inplace = True)

# add away team stats to games
a_team_stats = team_stats.add_prefix('a_')
final_df = final_df.merge(a_team_stats, left_on=['VISITOR_TEAM_ID','SEASON'], right_on=['a_TEAM_ID','a_season'])
final_df.drop(columns=['a_season','a_TEAM_ID'], inplace = True)

In [123]:
# sort games by date, ascending
final_df['GAME_DATE_EST'] = pd.to_datetime(final_df['GAME_DATE_EST'])
final_df = final_df.sort_values(by='GAME_DATE_EST', ascending=True).reset_index(drop=True)

Add novel features from the parent paper.

In [111]:
import warnings
warnings.filterwarnings("ignore")

final_df = final_df[final_df['HOME_TEAM_WINS'].notna()]

def get_avg_win_pct_last_n_games(team, game_date, df, n):
    # Extract the last 10 games 
    prev_game_df = df[df['GAME_DATE_EST'] < game_date][(df['HOME_TEAM_ID'] == team) | (df['VISITOR_TEAM_ID'] == team)].tail(n)
    
    wins = 0 
    # Filter columns to include only 'Home' team and result, then divide into home and away games        
    result_df = prev_game_df
    h_df = result_df.loc[result_df['HOME_TEAM_ID'] == team] 
    
    h_wins = h_df.loc[h_df['HOME_TEAM_WINS'] == 1]
    
    wins += len(h_wins)
    
    a_df = result_df.loc[result_df['HOME_TEAM_ID'] != team]
    a_wins = a_df.loc[a_df['HOME_TEAM_WINS'] == 0]
    
    wins += len(a_wins)

    return wins/n

for index, row in final_df.iterrows() : 
    game_id = row['GAME_ID']
    game_date = row['GAME_DATE_EST']
    h_team = row['HOME_TEAM_ID']
    a_team = row['VISITOR_TEAM_ID']
    final_df.loc[index,'H_w_pct_10'] = get_avg_win_pct_last_n_games(h_team, game_date, final_df, 10)
    final_df.loc[index,'A_w_pct_10'] = get_avg_win_pct_last_n_games(a_team, game_date, final_df, 10)

final_df.head(20)

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,HOME_TEAM_WINS,PTS_home,PTS_away,h_age,h_w,...,a_trb_per_100_poss,a_ast_per_100_poss,a_stl_per_100_poss,a_blk_per_100_poss,a_tov_per_100_poss,a_pf_per_100_poss,a_pts_per_100_poss,a_team_eff_ind,H_w_pct_10,A_w_pct_10
0,2003-10-05,10300001,1610612762,1610612742,2003,1,90.0,85.0,30.9,47.0,...,45.3,24.1,8.7,5.9,12.4,22.7,110.7,351.9,0.0,0.0
1,2003-10-06,10300002,1610612763,1610612749,2003,1,105.0,94.0,24.9,28.0,...,43.3,24.3,8.3,4.6,13.9,24.3,108.8,346.1,0.0,0.0
2,2003-10-07,10300003,1610612765,1610612739,2003,0,96.0,100.0,29.0,50.0,...,47.1,22.0,8.2,6.7,19.3,24.0,96.5,283.3,0.0,0.0
3,2003-10-07,10300009,1610612758,1610612746,2003,1,101.0,82.0,28.3,59.0,...,46.2,21.3,7.6,6.1,17.2,23.7,102.3,314.8,0.0,0.0
4,2003-10-07,10300005,1610612757,1610612745,2003,1,104.0,80.0,29.2,50.0,...,48.7,20.4,8.1,6.7,17.3,21.7,104.3,318.4,0.0,0.0
5,2003-10-07,10300004,1610612742,1610612753,2003,1,99.0,89.0,28.1,60.0,...,43.6,21.8,9.1,3.9,15.3,24.6,105.2,355.0,0.0,0.0
6,2003-10-07,10300006,1610612747,1610612744,2003,0,,,28.1,50.0,...,49.4,22.1,7.7,6.5,16.7,23.0,108.3,320.0,0.0,0.0
7,2003-10-07,10300007,1610612748,1610612755,2003,1,86.0,79.0,26.5,25.0,...,45.7,23.4,11.2,3.8,16.0,23.8,105.0,328.3,0.0,0.0
8,2003-10-07,10300010,1610612764,1610612752,2003,1,104.0,86.0,28.4,37.0,...,42.9,24.0,7.7,3.4,15.3,25.2,104.7,270.2,0.0,0.0
9,2003-10-08,10300014,1610612738,1610612765,2003,0,89.0,104.0,26.9,44.0,...,46.2,22.5,7.7,6.5,15.3,24.3,104.1,290.8,0.0,0.0


In [118]:
# Home and road team win probabilities implied by Elo ratings and home court adjustment 
import math
# import time

final_df = final_df[final_df['PTS_away'].notna() & final_df['PTS_home'].notna()]

# Calculate win probabilities using Elo ratings and home court advantage
def win_probs(home_elo, away_elo, home_court_advantage) :
    h = math.pow(10, home_elo/400)
    r = math.pow(10, away_elo/400)
    a = math.pow(10, home_court_advantage/400) 

    denom = r + a*h
    home_prob = a*h / denom
    away_prob = r / denom 
  
    return home_prob, away_prob

# Calculate odds of the home team winning based on Elo ratings and home court advantage
def home_odds_on(home_elo, away_elo, home_court_advantage) :
    h = math.pow(10, home_elo/400)
    r = math.pow(10, away_elo/400)
    a = math.pow(10, home_court_advantage/400)
    return a*h/r

# Determines the K factor in Elo rating updates, considering the margin of victory and Elo rating difference
def elo_k(MOV, elo_diff):
    k = 20
    if MOV>0:
        multiplier=(MOV+3)**(0.8)/(7.5+0.006*(elo_diff))
    else:
        multiplier=(-MOV+3)**(0.8)/(7.5+0.006*(-elo_diff))
    return k*multiplier

# Update Elo ratings for both teams following a game, considering the outcome and game specifics
def update_elo(home_score, away_score, home_elo, away_elo, home_court_advantage) :
    home_prob, away_prob = win_probs(home_elo, away_elo, home_court_advantage) 

    if (home_score - away_score > 0) :
        home_win = 1 
        away_win = 0 
    else :
        home_win = 0 
        away_win = 1 
  
    k = elo_k(home_score - away_score, home_elo - away_elo)

    updated_home_elo = home_elo + k * (home_win - home_prob) 
    updated_away_elo = away_elo + k * (away_win - away_prob)
    
    return updated_home_elo, updated_away_elo

# Define a function to retrieve a team's Elo rating before a given game, adjusting for season transitions
def get_prev_elo(team, date, season, team_stats, elo_df) :
    prev_game = team_stats[team_stats['GAME_DATE_EST'] < date][(team_stats['HOME_TEAM_ID'] == team) | (team_stats['VISITOR_TEAM_ID'] == team)].tail(1).iloc[0] 

    if team == prev_game['HOME_TEAM_ID'] :
        elo_rating = elo_df[elo_df['GAME_ID'] == prev_game['GAME_ID']]['H_Team_Elo_After'].values[0]
    else :
        elo_rating = elo_df[elo_df['GAME_ID'] == prev_game['GAME_ID']]['A_Team_Elo_After'].values[0]
  
    if prev_game['SEASON'] != season :
        return (0.75 * elo_rating) + (0.25 * 1505)
    else :
        return elo_rating

# create elo feature
elo_df = pd.DataFrame(columns=['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'H_Team_Elo_Before', 'A_Team_Elo_Before', 'H_Team_Elo_After', 'A_Team_Elo_After'])
teams_elo_df = pd.DataFrame(columns=['GAME_ID','Team', 'Elo', 'GAME_DATE_EST', 'Where_Played', 'SEASON'])

# Loop through each game to update ELO ratings
for index, row in final_df.iterrows(): 
    game_id = row['GAME_ID']
    game_date = row['GAME_DATE_EST']
    season = row['SEASON']
    h_team, a_team = row['HOME_TEAM_ID'], row['VISITOR_TEAM_ID']
    h_score, a_score = row['PTS_home'], row['PTS_away'] 
    # Set or get previous Elo ratings
    if (h_team not in elo_df['HOME_TEAM_ID'].values and h_team not in elo_df['VISITOR_TEAM_ID'].values) :
        h_team_elo_before = 1500
    else :
        h_team_elo_before = get_prev_elo(h_team, game_date, season, final_df, elo_df)

    if (a_team not in elo_df['HOME_TEAM_ID'].values and a_team not in elo_df['VISITOR_TEAM_ID'].values) :
        a_team_elo_before = 1500
    else :
        a_team_elo_before = get_prev_elo(a_team, game_date, season, final_df, elo_df)

    h_team_elo_after, a_team_elo_after = update_elo(h_score, a_score, h_team_elo_before, a_team_elo_before, 69)

    new_row = {'GAME_ID': game_id, 'HOME_TEAM_ID': h_team, 'VISITOR_TEAM_ID': a_team, 'H_Team_Elo_Before': h_team_elo_before, 'A_Team_Elo_Before': a_team_elo_before, \
                                                                        'H_Team_Elo_After' : h_team_elo_after, 'A_Team_Elo_After': a_team_elo_after}
    teams_row_one = {'GAME_ID': game_id,'Team': h_team, 'Elo': h_team_elo_before, 'GAME_DATE_EST': game_date, 'Where_Played': 'HOME_TEAM_ID', 'SEASON': season}
    teams_row_two = {'GAME_ID': game_id,'Team': a_team, 'Elo': a_team_elo_before, 'GAME_DATE_EST': game_date, 'Where_Played': 'VISITOR_TEAM_ID', 'SEASON': season}
    
    # Record Elo ratings in elo_df and teams_elo_df
    elo_df.loc[len(elo_df)] = new_row
    teams_elo_df.loc[len(teams_elo_df)] = teams_row_one
    teams_elo_df.loc[len(teams_elo_df)] = teams_row_two

# dates = list(set([d.strftime("%m-%d-%Y") for d in teams_elo_df["GAME_DATE_EST"]]))
# dates = sorted(dates, key=lambda x: time.strptime(x, '%m-%d-%Y'))
# teams = final_df["VISITOR_TEAM_ID"]
# dataset = pd.DataFrame(columns=dates)
# dataset["Team"] = teams.drop_duplicates()
# dataset = dataset.set_index("Team")

# for index, row in teams_elo_df.iterrows():
#     # Create a dataset for tracking Elo ratings over time
#     date = row["GAME_DATE_EST"].strftime("%m-%d-%Y")
#     team = row["Team"]
#     elo = row["Elo"]
#     dataset[date][team] = elo

teams_elo_df['Elo'] = teams_elo_df['Elo'].astype(float)
# Merge updated Elo ratings back into the main DataFrame
final_df = final_df.merge(elo_df.drop(columns=['HOME_TEAM_ID', 'VISITOR_TEAM_ID']), on ='GAME_ID')

In [121]:
final_df.tail(30)

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,HOME_TEAM_WINS,PTS_home,PTS_away,h_age,h_w,...,a_stl_per_100_poss,a_blk_per_100_poss,a_tov_per_100_poss,a_pf_per_100_poss,a_pts_per_100_poss,a_team_eff_ind,H_Team_Elo_Before,A_Team_Elo_Before,H_Team_Elo_After,A_Team_Elo_After
20294,2022-12-17,22200443,1610612760,1610612763,2022,1,115.0,109.0,22.4,24.0,...,9.7,6.4,13.1,19.6,114.6,492.6,,,,
20295,2022-12-17,22200439,1610612759,1610612748,2022,0,101.0,111.0,24.5,34.0,...,7.6,3.3,15.0,21.2,113.7,522.7,,,,
20296,2022-12-17,22200440,1610612739,1610612742,2022,1,100.0,99.0,24.7,44.0,...,7.0,4.1,13.1,20.5,112.8,586.0,,,,
20297,2022-12-18,22200445,1610612738,1610612753,2022,0,92.0,95.0,26.1,51.0,...,6.8,4.5,14.5,19.7,104.5,439.5,,,,
20298,2022-12-18,22200449,1610612750,1610612741,2022,1,150.0,126.0,24.2,46.0,...,7.2,4.2,13.0,19.1,113.2,519.9,,,,
20299,2022-12-18,22200451,1610612747,1610612764,2022,1,119.0,117.0,30.2,33.0,...,6.5,5.1,13.4,19.3,111.1,623.7,,,,
20300,2022-12-18,22200446,1610612754,1610612752,2022,0,106.0,109.0,26.0,25.0,...,7.2,5.0,13.7,21.1,110.4,361.9,,,,
20301,2022-12-18,22200448,1610612761,1610612744,2022,0,110.0,126.0,24.8,48.0,...,8.9,4.6,15.1,21.3,112.5,427.3,,,,
20302,2022-12-19,22200455,1610612745,1610612759,2022,0,105.0,124.0,24.1,20.0,...,7.6,4.9,12.6,18.0,112.4,514.7,,,,
20303,2022-12-19,22200457,1610612740,1610612749,2022,0,119.0,128.0,25.6,36.0,...,7.6,4.0,13.4,18.2,115.1,564.3,,,,


In [7]:
final_df = team_stats[(team_stats['season'] >= 2018) & (team_stats['season'] <= 2021)]
final_df

Unnamed: 0,season,team,abbreviation,age,w,l,pw,pl,mov,sos,...,orb_per_100_poss,drb_per_100_poss,trb_per_100_poss,ast_per_100_poss,stl_per_100_poss,blk_per_100_poss,tov_per_100_poss,pf_per_100_poss,pts_per_100_poss,team_efficiency_index
90,2021,Atlanta Hawks,ATL,25.4,41.0,31.0,41.0,31.0,2.32,-0.18,...,10.7,35.7,46.4,24.5,7.1,4.8,13.5,19.7,115.7,420.7
91,2021,Boston Celtics,BOS,25.1,36.0,36.0,39.0,33.0,1.46,-0.14,...,10.8,34.0,44.8,23.7,7.8,5.4,14.2,20.7,114.0,501.2
92,2021,Brooklyn Nets,BRK,28.2,48.0,24.0,46.0,26.0,4.50,-0.27,...,8.9,35.5,44.3,26.7,6.7,5.3,13.5,19.0,118.3,631.4
93,2021,Chicago Bulls,CHI,25.6,31.0,41.0,34.0,38.0,-0.89,-0.05,...,9.7,35.5,45.1,26.9,6.7,4.2,15.2,19.0,111.1,525.1
94,2021,Charlotte Hornets,CHO,24.6,33.0,39.0,32.0,40.0,-1.93,-0.01,...,10.7,33.6,44.3,27.2,7.9,4.8,15.0,18.3,110.9,400.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,2018,Sacramento Kings,SAC,26.4,27.0,55.0,23.0,59.0,-6.99,0.39,...,9.9,33.0,42.9,22.6,8.2,4.4,14.4,21.0,103.7,393.4
206,2018,San Antonio Spurs,SAS,29.3,47.0,35.0,49.0,33.0,2.89,0.00,...,10.9,35.6,46.4,23.9,8.0,5.9,13.8,18.0,107.9,409.3
207,2018,Toronto Raptors,TOR,25.8,59.0,23.0,60.0,22.0,7.78,-0.49,...,9.9,34.9,44.8,24.8,7.8,6.2,13.6,22.2,113.8,427.4
208,2018,Utah Jazz,UTA,26.7,48.0,34.0,53.0,29.0,4.30,0.17,...,9.4,35.6,45.0,23.3,9.0,5.3,15.3,20.4,108.4,586.8


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = final_df.drop(['team', 'abbreviation', 'season', 'w'], axis=1)  # Features
y = final_df['w']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X train shape: {X_train.shape}')
print(f'X test shape: {X_test.shape}')

X train shape: (92, 47)
X test shape: (24, 47)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# Initialize models for regression
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "K-Neighbors": KNeighborsClassifier(),
    "Support Vector": SVC(),
    "XGBoost": XGBClassifier(objective='reg:squarederror'),
    "Gaussian Naive Bayes": GaussianNB(),
    "Multilayer Perceptron": MLPClassifier(max_iter=1000)
}

results = {}

for name, model in models.items():
    pipeline = make_pipeline(StandardScaler(), model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse
    print(f"{name} MSE: {mse:.4f}")

# Displaying all results
print("\nModel Mean Squared Errors:")
for name, mse in results.items():
    print(f"{name}: {mse:.4f}")


Linear Regression MSE: 0.0000
Random Forest Regressor MSE: 8.6299
KNN Regressor MSE: 21.7900
SVR MSE: 80.8252
XGBoost Regressor MSE: 9.7590
Neural Network Regressor MSE: 104.6257

Model Mean Squared Errors:
Linear Regression: 0.0000
Random Forest Regressor: 8.6299
KNN Regressor: 21.7900
SVR: 80.8252
XGBoost Regressor: 9.7590
Neural Network Regressor: 104.6257


