To show that home advantage exists in the NBA, we'll check the home win percentage, point average at home and away and it's variance since 2004.

In [1]:
from utils import *
win_percentage = (len(games_df[games_df['HOME_TEAM_WINS'] == 1]) * 100 / len(games_df)).__round__(3)
home_points_avg = games_df['PTS_home'].mean()
home_points_var = games_df['PTS_home'].var()
away_points_avg = games_df['PTS_away'].mean()
away_points_var = games_df['PTS_away'].var()
print(f"Home win percentage since 2004: {win_percentage}%")
print(f"Home points average: {home_points_avg.__round__(3)}, var: {home_points_var.__round__(3)}")
print(f"Away points average: {away_points_avg.__round__(3)}, var: {away_points_var.__round__(3)}")

Home win percentage since 2004: 58.703%
Home points average: 103.456, var: 176.448
Away points average: 100.64, var: 180.523


During the 2020-2021 season, most games were capped at 10% capacity at most due to covid. Let's see the home wins percentage for this period.

In [2]:
games_df['GAME_DATE_EST'] = pd.to_datetime(games_df['GAME_DATE_EST'], format='mixed', dayfirst=True)
start_date = '2020-07-22'
end_date = '2021-02-27'

mask = (games_df['GAME_DATE_EST'] >= start_date) & (games_df['GAME_DATE_EST'] <= end_date)
covid_df = games_df[mask]
win_percentage_covid = (len(covid_df[covid_df['HOME_TEAM_WINS'] == 1]) * 100 / len(covid_df)).__round__(3)
home_points_avg_covid = covid_df['PTS_home'].mean()
home_points_var_covid = covid_df['PTS_home'].var()
away_points_avg_covid = covid_df['PTS_away'].mean()
away_points_var_covid = covid_df['PTS_away'].var()
print(f"Home win percentage during Covid: {win_percentage_covid}%")
print(f"Home points average: {home_points_avg_covid.__round__(3)}, var: {home_points_var_covid.__round__(3)}")
print(f"Away points average: {away_points_avg_covid.__round__(3)}, var: {away_points_var_covid.__round__(3)}")

Home win percentage during Covid: 53.222%
Home points average: 112.232, var: 159.923
Away points average: 111.233, var: 153.487


We see a significant decrease, about 5%. 
Let's check players' home and away FT%.

In [3]:
years_dict = {2015: pbp_2015, 2016: pbp_2016, 2017: pbp_2017, 2018: pbp_2018, 2019: pbp_2019, 2020: pbp_2020}
for year, pbp_year in years_dict.items():
    mask_home = pbp_year['HomePlay'].notna()
    home_plays = pbp_year[mask_home]

    mask_ft_home = home_plays['FreeThrowOutcome'].notna()
    home_fts = home_plays[mask_ft_home]

    home_ft_percentage = len(home_fts[home_fts['FreeThrowOutcome'] == 'make']) * 100 / len(home_fts)
    print(f"Home FT% in {year}: {home_ft_percentage.__round__(3)}%")

    mask_away = pbp_year['AwayPlay'].notna()
    away_plays = pbp_year[mask_away]

    mask_ft_away = away_plays['FreeThrowOutcome'].notna()
    away_fts = away_plays[mask_ft_away]

    away_ft_percentage = len(away_fts[away_fts['FreeThrowOutcome'] == 'make']) * 100 / len(away_fts)
    print(f"Away FT% in {year}: {away_ft_percentage.__round__(3)}%")


Home FT% in 2015: 75.875%
Away FT% in 2015: 75.335%
Home FT% in 2016: 77.118%
Away FT% in 2016: 77.311%
Home FT% in 2017: 76.713%
Away FT% in 2017: 76.646%
Home FT% in 2018: 76.81%
Away FT% in 2018: 76.687%
Home FT% in 2019: 77.458%
Away FT% in 2019: 77.353%
Home FT% in 2020: 75.695%
Away FT% in 2020: 76.704%


We see no significant difference in FT% between home and away teams. This could imply that home advantage may be influenced by referees/coaches more than players.
Let's check the difference in FG%.

In [4]:
for year, pbp_year in years_dict.items():
    mask_home = pbp_year['HomePlay'].notna()
    home_plays = pbp_year[mask_home]

    mask_fg_home = home_plays['ShotOutcome'].notna()
    home_fgs = home_plays[mask_fg_home]

    home_fg_percentage = len(home_fgs[home_fgs['ShotOutcome'] == 'make']) * 100 / len(home_fgs)
    print(f"Home FG% in {year}: {home_fg_percentage.__round__(3)}%")

    mask_away = pbp_year['AwayPlay'].notna()
    away_plays = pbp_year[mask_away]

    mask_fg_away = away_plays['ShotOutcome'].notna()
    away_fgs = away_plays[mask_fg_away]

    away_fg_percentage = len(away_fgs[away_fgs['ShotOutcome'] == 'make']) * 100 / len(away_fgs)
    print(f"Away FG% in {year}: {away_fg_percentage.__round__(3)}%")

Home FG% in 2015: 45.7%
Away FG% in 2015: 44.585%
Home FG% in 2016: 46.362%
Away FG% in 2016: 45.157%
Home FG% in 2017: 46.505%
Away FG% in 2017: 45.503%
Home FG% in 2018: 46.454%
Away FG% in 2018: 45.425%
Home FG% in 2019: 46.395%
Away FG% in 2019: 45.581%
Home FG% in 2020: 45.99%
Away FG% in 2020: 46.026%


Next, I'd like to check the number of FGA and FTA for home and away teams. More FTAs for the home team could imply crowd influence over the referee.

In [5]:
for year, pbp_year in years_dict.items():
    if year == 2020:
        continue
    mask_home = pbp_year['HomePlay'].notna()
    home_plays = pbp_year[mask_home]

    mask_ft_home = home_plays['FreeThrowOutcome'].notna()
    home_fts = home_plays[mask_ft_home]

    print(f"Home FTs average in a game in {year}: {(len(home_fts) / 1230).__round__(3)}")
    mask_away = pbp_year['AwayPlay'].notna()
    away_plays = pbp_year[mask_away]

    mask_ft_away = away_plays['FreeThrowOutcome'].notna()
    away_fts = away_plays[mask_ft_away]

    print(f"Away FTs average in a game in {year}: {(len(away_fts) / 1230).__round__(3)}")

Home FTs average in a game in 2015: 25.585
Away FTs average in a game in 2015: 24.432
Home FTs average in a game in 2016: 25.194
Away FTs average in a game in 2016: 24.09
Home FTs average in a game in 2017: 23.468
Away FTs average in a game in 2017: 22.885
Home FTs average in a game in 2018: 25.063
Away FTs average in a game in 2018: 24.328
Home FTs average in a game in 2019: 21.813
Away FTs average in a game in 2019: 21.252


In [6]:
for year, pbp_year in years_dict.items():
    if year == 2020:
        continue
    mask_home = pbp_year['HomePlay'].notna()
    home_plays = pbp_year[mask_home]

    mask_fg_home = home_plays['ShotOutcome'].notna()
    home_fgs = home_plays[mask_fg_home]

    print(f"Home FGs average in a game in {year}: {(len(home_fgs) / 1230).__round__(3)}")
    mask_away = pbp_year['AwayPlay'].notna()
    away_plays = pbp_year[mask_away]

    mask_fg_away = away_plays['ShotOutcome'].notna()
    away_fgs = away_plays[mask_fg_away]

    print(f"Away FGs average in a game in {year}: {(len(away_fgs) / 1230).__round__(3)}")

Home FGs average in a game in 2015: 90.402
Away FGs average in a game in 2015: 90.32
Home FGs average in a game in 2016: 90.628
Away FGs average in a game in 2016: 90.849
Home FGs average in a game in 2017: 91.712
Away FGs average in a game in 2017: 91.595
Home FGs average in a game in 2018: 94.999
Away FGs average in a game in 2018: 94.885
Home FGs average in a game in 2019: 82.266
Away FGs average in a game in 2019: 82.285


In [7]:
games_df['PTS_home_fixed'] = games_df['PTS_home'] - (home_points_avg.__round__(3) - away_points_avg.__round__(3))
games_df['home_win_larger_than_avg'] = games_df['PTS_home_fixed'] > games_df['PTS_away']
print(f"Win percentage of home teams after deducting the average loss from home team points: {(len(games_df[games_df['home_win_larger_than_avg'] == True]) / len(games_df) * 100).__round__(3)}%")

Win percentage of home teams after deducting the average loss from home team points: 53.885%


From this we can deduce that home teams tend to win at a higher margin than the average difference.
Next, I'd like to check what is the average margin in a home win vs. away win.

In [8]:
home_wins_df = games_df[games_df['HOME_TEAM_WINS'] == 1].copy()
away_wins_df = games_df[games_df['HOME_TEAM_WINS'] == 0].copy()

home_wins_df['point_diff'] = home_wins_df['PTS_home'] - home_wins_df['PTS_away']
home_win_avg_margin = home_wins_df['point_diff'].mean()

away_wins_df['point_diff'] = away_wins_df['PTS_away'] - away_wins_df['PTS_home']
away_win_avg_margin = away_wins_df['point_diff'].mean()

print(f"Average margin when home team wins: {home_win_avg_margin.__round__(3)}")
print(f"Average margin when away team wins: {away_win_avg_margin.__round__(3)}")

Average margin when home team wins: 11.855
Average margin when away team wins: 10.15


Next, we'll check if players' averages are better at home. If they are, we can start analyzing and understanding which players are most likely to be affected bt it.
We'll start with players who played a minimum of 82 game, which is equivalent to a single NBA season.

In [56]:
players_box_score_df = players_box_score_df[players_box_score_df['Season'] >= 2004]
min_games_played = 200
player_games_counts = players_box_score_df['PLAYER_NAME'].value_counts()
players_with_min_games = player_games_counts[player_games_counts >= min_games_played].index.tolist()
players_box_score_min_one_season = players_box_score_df[players_box_score_df['PLAYER_NAME'].isin(players_with_min_games)]
players_box_score_min_one_season['home_game'] = players_box_score_min_one_season['MATCHUP'].str.contains('vs.').astype(int)
players_box_score_home = players_box_score_min_one_season[players_box_score_min_one_season['home_game'] == 1]
players_box_score_away = players_box_score_min_one_season[players_box_score_min_one_season['home_game'] == 0]
player_fta_home = players_box_score_home.groupby('PLAYER_NAME')[['FTA', 'FTM']].sum().astype(int).reset_index()
player_fta_away = players_box_score_away.groupby('PLAYER_NAME')[['FTA', 'FTM']].sum().astype(int).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_box_score_min_one_season['home_game'] = players_box_score_min_one_season['MATCHUP'].str.contains('vs.').astype(int)


In [57]:
player_fta_home['FTP'] = player_fta_home['FTM'] / player_fta_home['FTA']
player_fta_away['FTP'] = player_fta_away['FTM'] / player_fta_away['FTA']
ftp_diff = player_fta_home['FTP'] - player_fta_away['FTP']
print(f"The difference between players' ft% when compared to themselves: {ftp_diff.mean().__round__(4) * 100}%")

The difference between players' ft% when compared to themselves: 0.21%


Let's check the same for fg%.

In [60]:
player_fga_home = players_box_score_home.groupby('PLAYER_NAME')[['FGA', 'FGM']].sum().astype(int).reset_index()
player_fga_away = players_box_score_away.groupby('PLAYER_NAME')[['FGA', 'FGM']].sum().astype(int).reset_index()
player_fga_home['FGP'] = player_fga_home['FGM'] / player_fga_home['FGA']
player_fga_away['FGP'] = player_fga_away['FGM'] / player_fga_away['FGA']
fgp_diff = player_fga_home['FGP'] - player_fga_away['FGP']
print(f"The difference between players' fg% when compared to themselves: {fgp_diff.mean().__round__(4) * 100}%")

            PLAYER_NAME   FTA   FTM       FTP
0              AJ Price   127    91  0.716535
1          Aaron Brooks   541   457  0.844732
2          Aaron Gordon   875   594  0.678857
3            Aaron Gray   180    93  0.516667
4         Aaron Holiday   163   136  0.834356
..                  ...   ...   ...       ...
819        Zach Collins   176   135  0.767045
820         Zach LaVine  1248  1047  0.838942
821       Zach Randolph  2086  1570  0.752637
822       Zaza Pachulia  1407  1021  0.725657
823  Zydrunas Ilgauskas   905   702  0.775691

[824 rows x 4 columns]
The difference between players' fg% when compared to themselves: 1.03%


Next, I'd like to look at the 50 players with the largest difference in home points vs away points and understand what might be the cause of the difference.

In [64]:
player_scores_home = players_box_score_home.groupby('PLAYER_NAME')['PTS'].mean().reset_index()
player_scores_away = players_box_score_away.groupby('PLAYER_NAME')['PTS'].mean().reset_index()
scores_diff = player_scores_home['PTS'] - player_scores_away['PTS']
scores_diff_df = pd.DataFrame({'PLAYER_NAME': player_scores_home['PLAYER_NAME'],'SCORES_DIFF': scores_diff})
sorted_scores_diff_df = scores_diff_df.sort_values(by='SCORES_DIFF', ascending=False)
top_50_diff_leaders_df = sorted_scores_diff_df.head(50)
print(top_50_diff_leaders_df)

               PLAYER_NAME  SCORES_DIFF
289         Gerald Wallace     3.164724
816               Yao Ming     2.748303
247           Earl Boykins     2.725794
751          Terence Davis     2.570751
519            Luka Doncic     2.517143
753         Terrence Jones     2.452991
169          Danny Granger     2.378938
778             Trae Young     2.361886
594            Monta Ellis     2.137897
733        Stephon Marbury     2.059012
134         Christian Wood     2.025862
67           Blake Griffin     1.965893
685        Ruben Patterson     1.874988
478            Kirk Snyder     1.871470
596       Montrezl Harrell     1.853740
627        P.J. Washington     1.807163
533         MarShon Brooks     1.779112
470           Kevin Martin     1.777949
291         Gilbert Arenas     1.764222
104        Cameron Johnson     1.754568
803              Von Wafer     1.751378
31         Andrea Bargnani     1.729516
51         Antonio Daniels     1.695356
429            Josh Howard     1.677215


I'd like to check if the difference stems from these players' tendency to shoot more often at home than away.

In [72]:
top_50_home_fg = pd.merge(top_50_diff_leaders_df['PLAYER_NAME'], player_fga_home, on='PLAYER_NAME', how='inner')
top_50_away_fg = pd.merge(top_50_diff_leaders_df['PLAYER_NAME'], player_fga_away, on='PLAYER_NAME', how='inner')
top_50_fg_diff = ((top_50_away_fg['FGA'] / top_50_home_fg['FGA']) * 100)
print(f"The top 50 leaders in home points to away points difference shot approximately {(100 - top_50_fg_diff.mean()).__round__(3)}% more fgs at home than away.")

The top 50 leaders in home points to away points difference shot approximately 8.151% more fgs at home than away.
