In [1]:
# Importing Libraries & Functions

#Normal Stuff
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import re
import copy
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Loading + Cleaning Data 
games = pd.read_csv('FP1_DATA/games.csv')
timestamp = pd.to_datetime(games['GAME_DATE_EST'])
games.insert(0, 'TIMESTAMP', timestamp)
games = games[games['SEASON'].isin([2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018, 2019])].sort_values('TIMESTAMP')

teams = pd.read_csv('FP1_DATA/teams.csv')
teams = teams[['TEAM_ID','NICKNAME']]
team_dict = teams.set_index('TEAM_ID').T.to_dict('list')
games['HOME_TEAM_NAME'] = games['HOME_TEAM_ID'].map(team_dict)
games['AWAY_TEAM_NAME'] = games['VISITOR_TEAM_ID'].map(team_dict)


drop_cols = ['GAME_DATE_EST', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID', 
             'VISITOR_TEAM_ID', 'TEAM_ID_home', 'TEAM_ID_away', 'SEASON']
games = games.drop(drop_cols, axis = 1).reset_index(drop = True)
front_cols = ['TIMESTAMP', 'GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME']
temp_games = games[front_cols]
games = pd.concat([temp_games, games.drop(front_cols, axis = 1)], axis = 1)
games['HOME_TEAM_NAME'] = games['HOME_TEAM_NAME'].apply(lambda x: x[0])
games['AWAY_TEAM_NAME'] = games['AWAY_TEAM_NAME'].apply(lambda x: x[0])

game_szn_dict = pd.read_csv('FP1_DATA/games.csv')[['GAME_ID','SEASON']].set_index('GAME_ID').iloc[:, 0].T.to_dict()
games['SEASON'] = games['GAME_ID'].map(game_szn_dict)
games.head(5)

Unnamed: 0,TIMESTAMP,GAME_ID,HOME_TEAM_NAME,AWAY_TEAM_NAME,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,SEASON
0,2005-10-10,10500120,Heat,Spurs,103.0,0.522,0.692,0.286,19.0,38.0,101.0,0.506,0.652,0.4,23.0,37.0,1,2005
1,2005-10-10,10500001,Wizards,Cavaliers,94.0,0.432,0.61,0.333,18.0,41.0,116.0,0.507,0.841,0.357,19.0,38.0,0,2005
2,2005-10-11,10500006,Lakers,Warriors,101.0,0.4,0.707,0.083,14.0,47.0,93.0,0.357,0.729,0.296,16.0,44.0,1,2005
3,2005-10-11,10500004,Pistons,Bulls,87.0,0.447,0.652,0.444,27.0,35.0,76.0,0.338,0.719,0.1,13.0,49.0,1,2005
4,2005-10-11,10500008,Cavaliers,Celtics,96.0,0.435,0.725,0.318,17.0,34.0,86.0,0.4,0.783,0.286,20.0,45.0,1,2005


In [13]:
# Creating Roster Data Frame
details = pd.read_csv('FP1_DATA/games_details.csv')
details = details[~details['START_POSITION'].isna()]
details['TEAM_NAME'] = details['TEAM_ID'].map(team_dict).apply(lambda x: x[0])

details = details[['GAME_ID', 'TEAM_NAME', 'PLAYER_NAME']]
details_grouped = details.groupby(['GAME_ID', 'TEAM_NAME'], as_index = False).agg(list)

roster = pd.merge(details_grouped, games[['TIMESTAMP','GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME']], on = 'GAME_ID')
home_condition = (roster['TEAM_NAME'] == roster['HOME_TEAM_NAME'])
away_condition = (roster['TEAM_NAME'] != roster['HOME_TEAM_NAME'])

roster_home = roster[home_condition]
roster_home['ROSTER_home'] = roster_home['PLAYER_NAME']
roster_home = roster_home[['TIMESTAMP','GAME_ID', 'ROSTER_home']]

roster_away = roster[away_condition]
roster_away['ROSTER_away'] = roster_away['PLAYER_NAME']
roster_away = roster_away[['TIMESTAMP','GAME_ID', 'ROSTER_away']]

roster = pd.merge(roster_home, roster_away, on = 'GAME_ID')
roster['SEASON'] = roster['GAME_ID'].map(game_szn_dict)
roster = roster[roster['SEASON']<=2018]
roster = roster[roster['SEASON']>=2007]
# roster['SEASON'] = roster['TIMESTAMP_x'].dt.year
roster = roster.drop(['TIMESTAMP_x', 'TIMESTAMP_y'], axis = 1)
roster = roster.sort_values(['SEASON','GAME_ID'])
roster

Unnamed: 0,GAME_ID,ROSTER_home,ROSTER_away,SEASON
2997,20700001,"[Bruce Bowen, Tim Duncan, Fabricio Oberto, Mic...","[Martell Webster, LaMarcus Aldridge, Joel Przy...",2007
2998,20700002,"[Luke Walton, Ronny Turiaf, Kwame Brown, Kobe ...","[Shane Battier, Chuck Hayes, Yao Ming, Tracy M...",2007
2999,20700003,"[Kelenna Azubuike, Mickael Pietrus, Andris Bie...","[Andrei Kirilenko, Carlos Boozer, Mehmet Okur,...",2007
3000,20700004,"[Jason Kapono, Chris Bosh, Andrea Bargnani, An...","[Andre Iguodala, Reggie Evans, Samuel Dalember...",2007
3001,20700005,"[Hedo Turkoglu, Rashard Lewis, Dwight Howard, ...","[Desmond Mason, Yi Jianlian, Andrew Bogut, Mic...",2007
...,...,...,...,...
19740,41800402,"[Kawhi Leonard, Pascal Siakam, Marc Gasol, Dan...","[Andre Iguodala, Draymond Green, DeMarcus Cous...",2018
19741,41800403,"[Andre Iguodala, Draymond Green, DeMarcus Cous...","[Kawhi Leonard, Pascal Siakam, Marc Gasol, Dan...",2018
19742,41800404,"[Andre Iguodala, Draymond Green, DeMarcus Cous...","[Kawhi Leonard, Pascal Siakam, Marc Gasol, Dan...",2018
19743,41800405,"[Kawhi Leonard, Pascal Siakam, Marc Gasol, Dan...","[Andre Iguodala, Kevin Durant, Draymond Green,...",2018


In [8]:
# Games that we will use, from the 2007 season thru the 2018 season
games1 = games[games['GAME_ID'].isin(roster['GAME_ID'].to_list())]
games1 = games1.sort_values(['TIMESTAMP'])
games1

Unnamed: 0,TIMESTAMP,GAME_ID,HOME_TEAM_NAME,AWAY_TEAM_NAME,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,SEASON
2946,2007-10-30,20700003,Warriors,Jazz,96.0,0.416,0.684,0.261,19.0,37.0,117.0,0.456,0.833,0.455,24.0,56.0,0,2007
2947,2007-10-30,20700002,Lakers,Rockets,93.0,0.421,0.600,0.250,18.0,37.0,95.0,0.459,0.677,0.273,23.0,49.0,0,2007
2948,2007-10-30,20700001,Spurs,Trail Blazers,106.0,0.471,0.692,0.250,21.0,40.0,97.0,0.500,0.765,0.462,15.0,40.0,1,2007
2956,2007-10-31,20700007,Nets,Bulls,112.0,0.402,0.902,0.375,24.0,48.0,103.0,0.396,0.731,0.348,23.0,45.0,1,2007
2954,2007-10-31,20700010,Pelicans,Kings,104.0,0.506,0.762,0.476,23.0,44.0,90.0,0.444,0.833,0.300,21.0,34.0,1,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19478,2019-06-02,41800402,Raptors,Warriors,104.0,0.372,0.885,0.289,17.0,49.0,109.0,0.463,0.870,0.382,34.0,42.0,0,2018
19479,2019-06-05,41800403,Warriors,Raptors,109.0,0.396,0.833,0.333,25.0,41.0,123.0,0.524,0.952,0.447,30.0,40.0,0,2018
19480,2019-06-07,41800404,Warriors,Raptors,92.0,0.449,0.667,0.296,26.0,42.0,105.0,0.419,0.958,0.313,22.0,39.0,0,2018
19481,2019-06-10,41800405,Raptors,Warriors,105.0,0.447,0.778,0.250,19.0,43.0,106.0,0.463,0.714,0.476,27.0,37.0,0,2018


## Feature \#1 : NBA Awards

The cell below loads in the AWARDS dataframe, and calculates the number of players on the HOME/AWAY team's rosters who received awards for Most-Valuable Player (MVP), Defensive Player of the Year (DPOY), Rookie of the Year (ROY), and Sixth Man of the Year (6MOY) in the last four years.

In [16]:
pd.read_csv('FP2_DATA/awards.csv')

Unnamed: 0,SEASON,MVP,DPOY,ROY,6MOY
0,2019,Giannis Antetokounmpo,Giannis Antetokounmpo,Ja Morant,Montrezl Harrell
1,2018,Giannis Antetokounmpo,Rudy Gobert,Luka Dončić,Lou Williams
2,2017,James Harden,Rudy Gobert,Ben Simmons,Lou Williams
3,2016,Russell Westbrook,Draymond Green,Malcolm Brogdon,Eric Gordon
4,2015,Stephen Curry,Kawhi Leonard,Karl-Anthony Towns,Jamal Crawford
5,2014,Stephen Curry,Kawhi Leonard,Andrew Wiggins,Lou Williams
6,2013,Kevin Durant,Joakim Noah,Michael Carter-Williams,Jamal Crawford
7,2012,LeBron James,Marc Gasol,Damian Lillard,J.R. Smith
8,2011,LeBron James,Tyson Chandler,Kyrie Irving,James Harden
9,2010,Derrick Rose,Dwight Howard,Blake Griffin,Lamar Odom


In [17]:
pd.read_csv('FP2_DATA/awards.csv').iloc[:15, :5]

Unnamed: 0,SEASON,MVP,DPOY,ROY,6MOY
0,2019,Giannis Antetokounmpo,Giannis Antetokounmpo,Ja Morant,Montrezl Harrell
1,2018,Giannis Antetokounmpo,Rudy Gobert,Luka Dončić,Lou Williams
2,2017,James Harden,Rudy Gobert,Ben Simmons,Lou Williams
3,2016,Russell Westbrook,Draymond Green,Malcolm Brogdon,Eric Gordon
4,2015,Stephen Curry,Kawhi Leonard,Karl-Anthony Towns,Jamal Crawford
5,2014,Stephen Curry,Kawhi Leonard,Andrew Wiggins,Lou Williams
6,2013,Kevin Durant,Joakim Noah,Michael Carter-Williams,Jamal Crawford
7,2012,LeBron James,Marc Gasol,Damian Lillard,J.R. Smith
8,2011,LeBron James,Tyson Chandler,Kyrie Irving,James Harden
9,2010,Derrick Rose,Dwight Howard,Blake Griffin,Lamar Odom


In [22]:
# Feature #1: NBA Awards
def prev_4(award, awards): 
    award_list = np.array(list(awards[award]))
    prev_4_list = []
    for i in np.arange(13): 
        prev_4_list.append(award_list[np.arange(i+1, i+5)])
    return prev_4_list

awards = pd.read_csv('FP2_DATA/awards.csv')
awards['SEASON'] = awards['SEASON'].astype(int)
full_list = []
award_names = ['MVP', 'DPOY', 'ROY', '6MOY']
for award in award_names: 
    temp_4_list = prev_4(award, awards)
    full_list.append(temp_4_list)
awards = awards.iloc[:13]
for i in np.arange(len(full_list)): 
    awards['PREV_4_' + str(award_names[i])] = full_list[i]
    awards = awards.drop(award_names[i], axis = 1)

df_X1 = pd.merge(roster, awards, on = 'SEASON')
def count_award_home(row, award_name): 
    return sum(item in row['ROSTER_home'] for item in row['PREV_4_' + str(award_name)])
def count_award_away(row, award_name): 
    return sum(item in row['ROSTER_away'] for item in row['PREV_4_' + str(award_name)])


award_name = '6MOY'
for award_name in award_names: 
    df_X1['COUNT_PREV_4_' + award_name + '_home'] = df_X1.apply(lambda row: count_award_home(row, award_name), axis = 1)
    df_X1['COUNT_PREV_4_' + award_name + '_away'] = df_X1.apply(lambda row: count_award_away(row, award_name), axis = 1)
    df_X1 = df_X1.drop('PREV_4_' + str(award_name), axis = 1)
award_df = df_X1.drop(['ROSTER_home', 'ROSTER_away', 'SEASON'], axis = 1)
award_df

Unnamed: 0,GAME_ID,COUNT_PREV_4_MVP_home,COUNT_PREV_4_MVP_away,COUNT_PREV_4_DPOY_home,COUNT_PREV_4_DPOY_away,COUNT_PREV_4_ROY_home,COUNT_PREV_4_ROY_away,COUNT_PREV_4_6MOY_home,COUNT_PREV_4_6MOY_away
0,20700001,0,0,0,0,0,1,0,0
1,20700002,0,0,0,0,0,0,0,0
2,20700003,0,0,0,0,0,0,0,0
3,20700004,0,0,0,0,0,0,0,0
4,20700005,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
15955,41800402,0,2,2,1,0,0,0,0
15956,41800403,2,0,1,2,0,0,0,0
15957,41800404,2,0,1,2,0,0,0,0
15958,41800405,0,2,2,1,0,0,0,0


## Feature \#2 : The Lebron Effect

The cell below outputs a binary column for whether Lebron James is on the roster for the HOME or AWAY team.

In [24]:
# Feature #2: The Lebron Effect
df_X2 = copy.copy(roster)
def lebron_home(row):
    home_roster = row['ROSTER_home']
    if 'LeBron James' in home_roster:
        return 1
    return 0

def lebron_away(row):
    away_roster = row['ROSTER_away']
    if 'LeBron James' in away_roster:
        return 1
    return 0

df_X2['LEBRON_home'] = df_X2.apply(lambda row: lebron_home(row), axis = 1)
df_X2['LEBRON_away'] = df_X2.apply(lambda row: lebron_away(row), axis = 1)

lebron_df = df_X2.drop(['ROSTER_home', 'ROSTER_away', 'SEASON'], axis = 1)
lebron_df

Unnamed: 0,GAME_ID,LEBRON_home,LEBRON_away
2997,20700001,0,0
2998,20700002,0,0
2999,20700003,0,0
3000,20700004,0,0
3001,20700005,0,0
...,...,...,...
19740,41800402,0,0
19741,41800403,0,0
19742,41800404,0,0
19743,41800405,0,0


## Feature \#3 : All-NBA Appearances

The cell below calculate the number of players on the HOME/AWAY roster who were voted on the NBA First-Team, Second-Team, and Third-Team in the last four years. 

In [30]:
# Feature #3 : All-NBA Appearances
first_team = pd.read_csv('FP2_DATA/first_team.csv')
second_team = pd.read_csv('FP2_DATA/second_team.csv')
third_team = pd.read_csv('FP2_DATA/third_team.csv')

first_team['SEASON'] = first_team['SEASON'].astype(int)
second_team['SEASON'] = second_team['SEASON'].astype(int)
third_team['SEASON'] = third_team['SEASON'].astype(int)


full_ls1, full_ls2, full_ls3 = [], [], []
for i in range(len(first_team)-4):
    temp_ls1, temp_ls2, temp_ls3 = [], [], []
    for row in first_team.values[i+1:i+5]:
        temp_ls1 = np.append(temp_ls1, row[1:])
    for row in second_team.values[i+1:i+5]:
        temp_ls2 = np.append(temp_ls2, row[1:])
    for row in third_team.values[i+1:i+5]:
        temp_ls3 = np.append(temp_ls3, row[1:])
    full_ls1.append(temp_ls1)
    full_ls2.append(temp_ls2)
    full_ls3.append(temp_ls3)
    
seasons = [2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020]
all_nba = pd.DataFrame({'SEASON':seasons[:], 
                       'PREV_first_team': full_ls1, 
                       'PREV_second_team': full_ls2, 
                       'PREV_third_team': full_ls3})
df_X3 = pd.merge(roster, all_nba, on='SEASON')

def count_all_nba_home(row, team_name):
    return sum(item in row['ROSTER_home'] for item in row['PREV_' + str(team_name)])
def count_all_nba_away(row, team_name):
    return sum(item in row['ROSTER_away'] for item in row['PREV_' + str(team_name)])

team_names = ['first_team', 'second_team', 'third_team']
for team_name in team_names:
    df_X3['COUNT_PREV_4_' + team_name] = df_X3.apply(lambda row: count_all_nba_home(row, team_name), axis = 1)
    df_X3['COUNT_PREV_4_' + team_name] = df_X3.apply(lambda row: count_all_nba_away(row, team_name), axis = 1)
    df_X3 = df_X3.drop('PREV_' + str(team_name), axis = 1)

all_nba_df = df_X3.drop(['ROSTER_home', 'ROSTER_away', 'SEASON'], axis = 1)
all_nba_df

Unnamed: 0,GAME_ID,COUNT_PREV_4_first_team,COUNT_PREV_4_second_team,COUNT_PREV_4_third_team
0,20700001,0,1,1
1,20700002,0,0,0
2,20700003,0,0,0
3,20700004,0,0,0
4,20700005,0,0,0
...,...,...,...,...
15955,41800402,0,0,0
15956,41800403,0,0,0
15957,41800404,0,0,0
15958,41800405,0,0,0


## Feature \#4 : Home-Court Advantage

The cell below contains the winning percentage at home for each HOME team over the last five years to approximate the home-court advantage. 

In [32]:
# Feature #4 : Home-Court Advantage
games2 = copy.copy(games)
games2 = games2[['TIMESTAMP', 'GAME_ID', 'SEASON', 'HOME_TEAM_NAME', 'HOME_TEAM_WINS']]

df_X4 = copy.copy(games1[['TIMESTAMP', 'GAME_ID', 'SEASON', 'HOME_TEAM_NAME']])

def home_win_pct_past4(row):
    curr_date = row['TIMESTAMP']
    curr_season = row['SEASON']
    home_team = row['HOME_TEAM_NAME']
    temp_games = games2[games2['TIMESTAMP']<curr_date]
    temp_games = temp_games[curr_season-temp_games['SEASON'] < 4]
    temp_games = temp_games[temp_games['HOME_TEAM_NAME']==home_team]
    
    home_wins_past4 = sum(temp_games['HOME_TEAM_WINS'])
    home_games_past4 = len(temp_games)
    return np.round(home_wins_past4/home_games_past4,3)

df_X4['HOME_TEAM_home_win_pct_past4yrs'] = df_X4.apply(lambda row: home_win_pct_past4(row), axis = 1)
home_advantage_df = df_X4#.drop(['TIMESTAMP', 'SEASON', 'HOME_TEAM_NAME'], axis=1)
home_advantage_df

Unnamed: 0,TIMESTAMP,GAME_ID,SEASON,HOME_TEAM_NAME,HOME_TEAM_home_win_pct_past4yrs
2946,2007-10-30,20700003,2007,Warriors,0.624
2947,2007-10-30,20700002,2007,Lakers,0.615
2948,2007-10-30,20700001,2007,Spurs,0.755
2956,2007-10-31,20700007,2007,Nets,0.650
2954,2007-10-31,20700010,2007,Pelicans,0.560
...,...,...,...,...,...
19478,2019-06-02,41800402,2018,Raptors,0.762
19479,2019-06-05,41800403,2018,Warriors,0.808
19480,2019-06-07,41800404,2018,Warriors,0.805
19481,2019-06-10,41800405,2018,Raptors,0.758


## Feature \#5 / \#6 : Altitude and Timezone Difference

The cell below contains the difference in altitude between the HOME and AWAY team.

In [34]:
# Feature #5 / #6 : Altitude and Timezone Difference

# Game Season Dictionary
game_szn_dict = pd.read_csv('FP1_DATA/games.csv')[['GAME_ID','SEASON']].set_index('GAME_ID').iloc[:, 0].T.to_dict()
games['SEASON'] = games['GAME_ID'].map(game_szn_dict)


altitude_timezone = pd.read_csv('FP2_DATA/altitude_timezone.csv')
alt_team_dict = {}
for row in altitude_timezone.values:
    alt_team_dict[row[0],row[1],row[2]] = row[3]
timezone_team_dict = {}
for row in altitude_timezone.values:
    timezone_team_dict[row[0],row[1],row[2]] = row[4]
    
df_X5 = copy.copy(games1[['GAME_ID', 'SEASON', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME']])

def home_team_alt(row):
    for key in alt_team_dict:
        if row['HOME_TEAM_NAME']==key[0] and np.all([row['SEASON']+1>=key[1], row['SEASON']+1<=key[2]]):
            return alt_team_dict[key]
def away_team_alt(row):
    for key in alt_team_dict:
        if row['AWAY_TEAM_NAME']==key[0] and np.all([row['SEASON']+1>=key[1], row['SEASON']+1<=key[2]]):
            return alt_team_dict[key]
        
def home_team_timezone(row):
    for key in timezone_team_dict:
        if row['HOME_TEAM_NAME']==key[0] and np.all([row['SEASON']+1>=key[1], row['SEASON']+1<=key[2]]):
            return timezone_team_dict[key]
def away_team_timezone(row):
    for key in timezone_team_dict:
        if row['AWAY_TEAM_NAME']==key[0] and np.all([row['SEASON']+1>=key[1], row['SEASON']+1<=key[2]]):
            return timezone_team_dict[key]
        
df_X5['HOME_TEAM_altitude'] = df_X5.apply(lambda row: home_team_alt(row), axis = 1)    
df_X5['AWAY_TEAM_altitude'] = df_X5.apply(lambda row: away_team_alt(row), axis = 1)
df_X5['AWAY_TEAM_delta_altitude'] = df_X5['HOME_TEAM_altitude']-df_X5['AWAY_TEAM_altitude']

df_X5['HOME_TEAM_timezone'] = df_X5.apply(lambda row: home_team_timezone(row), axis = 1)    
df_X5['AWAY_TEAM_timezone'] = df_X5.apply(lambda row: away_team_timezone(row), axis = 1)
df_X5['AWAY_TEAM_delta_timezone'] = df_X5['HOME_TEAM_timezone']-df_X5['AWAY_TEAM_timezone']

altitude_timezone_df = copy.copy(df_X5[['GAME_ID','AWAY_TEAM_delta_altitude','AWAY_TEAM_delta_timezone']])

Finally, we merge all of these dataframes together using 'GAME_ID' as the key, and output this dataframe as a CSV to be loaded into our FP3_MODELS notebook.

In [37]:
XFactor_df = pd.merge(award_df, lebron_df, on='GAME_ID').merge(all_nba_df, on='GAME_ID').merge(home_advantage_df, on='GAME_ID').merge(altitude_timezone_df, on='GAME_ID')
XFactor_df.to_csv('FP2.csv', index=False)