# EDA

The purpose of this notebook is to clean the `games` dataset and prepare it for modelling.  We start by importing the standard libraries and reading in the data.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# load the raw games table from disk and preview its first five rows
games = pd.read_csv(filepath_or_buffer='Data/games.csv')
games.head(n=5)

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2021-05-26,42000102,Final,1610612755,1610612764,2020,1610612755,120.0,0.557,0.684,...,26.0,45.0,1610612764,95.0,0.402,0.633,0.091,22.0,40.0,1
1,2021-05-26,42000132,Final,1610612752,1610612737,2020,1610612752,101.0,0.383,0.739,...,15.0,54.0,1610612737,92.0,0.369,0.818,0.273,17.0,41.0,1
2,2021-05-26,42000142,Final,1610612762,1610612763,2020,1610612762,141.0,0.544,0.774,...,28.0,42.0,1610612763,129.0,0.541,0.763,0.348,20.0,33.0,1
3,2021-05-25,42000112,Final,1610612751,1610612738,2020,1610612751,130.0,0.523,0.955,...,31.0,46.0,1610612738,108.0,0.424,0.783,0.353,23.0,43.0,1
4,2021-05-25,42000152,Final,1610612756,1610612747,2020,1610612756,102.0,0.465,0.933,...,21.0,31.0,1610612747,109.0,0.45,0.871,0.303,24.0,39.0,0


In [3]:
# dimensions of the raw games table as (rows, columns)
games.shape

(24677, 21)

Next, let's pull out only the regular-season games from the 2018-19 season:

In [4]:
# generate a datetime column
# (pd.to_datetime rather than .astype('datetime64'): the unit-less 'datetime64'
#  dtype string is rejected by pandas >= 2.0)
games['date'] = pd.to_datetime(games['GAME_DATE_EST'])

# pull out some useful features
games['year'] = games['date'].dt.year
games['month'] = games['date'].dt.month
games['day'] = games['date'].dt.day

# filter for the 2018-19 NBA season: 2018-10-16 through 2019-04-10 (inclusive)
# 2018 part: October 16th onwards, plus all of November and December
in_2018_part = (games['year'] == 2018) & (
    ((games['month'] == 10) & (games['day'] >= 16)) | (games['month'] > 10)
)
# 2019 part: January through March, plus April up to the 10th
in_2019_part = (games['year'] == 2019) & (
    ((games['month'] == 4) & (games['day'] <= 10)) | (games['month'] < 4)
)

# keep the matching games and sort them by date
season1 = games.loc[in_2018_part | in_2019_part, :].sort_values('date')

season1.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,date,year,month,day
18989,2018-10-16,21800002,Final,1610612744,1610612760,2018,1610612744,108.0,0.442,0.944,...,0.363,0.649,0.27,21.0,45.0,1,2018-10-16,2018,10,16
18988,2018-10-16,21800001,Final,1610612738,1610612755,2018,1610612738,105.0,0.433,0.714,...,0.391,0.609,0.192,18.0,47.0,1,2018-10-16,2018,10,16
18986,2018-10-17,21800012,Final,1610612746,1610612743,2018,1610612746,98.0,0.398,0.833,...,0.379,0.786,0.333,20.0,56.0,0,2018-10-17,2018,10,17
18977,2018-10-17,21800003,Final,1610612766,1610612749,2018,1610612766,112.0,0.446,0.636,...,0.494,0.75,0.412,26.0,57.0,0,2018-10-17,2018,10,17
18978,2018-10-17,21800004,Final,1610612765,1610612751,2018,1610612765,103.0,0.424,0.864,...,0.488,0.682,0.185,28.0,39.0,1,2018-10-17,2018,10,17


In [5]:
# (rows, columns) of the season-filtered frame; each row is one game
season1.shape

(1230, 25)

Looks like we have 1,230 games, which matches the length of the regular season: 30 teams × 82 games each, divided by 2 because every game involves two teams.  Let's save this dataset.

In [6]:
# make sure `date` is a proper datetime before writing the file.
# pd.to_datetime is idempotent on an existing datetime column, whereas the
# unit-less .astype('datetime64') raises on pandas >= 2.0.
season1 = season1.assign(date=pd.to_datetime(season1['date']))

# persist the season's games; the row index is not written to the file
season1.to_csv('Data/games_2018-19.csv', index=False)

Let's read in the `teams` dataset, so we can generate a dictionary that maps each team ID to its team abbreviation.

In [7]:
# build the TEAM_ID -> ABBREVIATION lookup in one pass:
# read the teams file, index it by team id, and turn the abbreviation
# column into a plain dictionary
teams = (
    pd.read_csv('Data/teams.csv')
    .set_index('TEAM_ID')['ABBREVIATION']
    .to_dict()
)

teams

{1610612737: 'ATL',
 1610612738: 'BOS',
 1610612740: 'NOP',
 1610612741: 'CHI',
 1610612742: 'DAL',
 1610612743: 'DEN',
 1610612745: 'HOU',
 1610612746: 'LAC',
 1610612747: 'LAL',
 1610612748: 'MIA',
 1610612749: 'MIL',
 1610612750: 'MIN',
 1610612751: 'BKN',
 1610612752: 'NYK',
 1610612753: 'ORL',
 1610612754: 'IND',
 1610612755: 'PHI',
 1610612756: 'PHX',
 1610612757: 'POR',
 1610612758: 'SAC',
 1610612759: 'SAS',
 1610612760: 'OKC',
 1610612761: 'TOR',
 1610612762: 'UTA',
 1610612763: 'MEM',
 1610612764: 'WAS',
 1610612765: 'DET',
 1610612766: 'CHA',
 1610612739: 'CLE',
 1610612744: 'GSW'}

In [8]:
# preview the first rows of the season frame
season1.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,date,year,month,day
18989,2018-10-16,21800002,Final,1610612744,1610612760,2018,1610612744,108.0,0.442,0.944,...,0.363,0.649,0.27,21.0,45.0,1,2018-10-16,2018,10,16
18988,2018-10-16,21800001,Final,1610612738,1610612755,2018,1610612738,105.0,0.433,0.714,...,0.391,0.609,0.192,18.0,47.0,1,2018-10-16,2018,10,16
18986,2018-10-17,21800012,Final,1610612746,1610612743,2018,1610612746,98.0,0.398,0.833,...,0.379,0.786,0.333,20.0,56.0,0,2018-10-17,2018,10,17
18977,2018-10-17,21800003,Final,1610612766,1610612749,2018,1610612766,112.0,0.446,0.636,...,0.494,0.75,0.412,26.0,57.0,0,2018-10-17,2018,10,17
18978,2018-10-17,21800004,Final,1610612765,1610612751,2018,1610612765,103.0,0.424,0.864,...,0.488,0.682,0.185,28.0,39.0,1,2018-10-17,2018,10,17


In [10]:
# keep only the date, the two team ids and the outcome flag, and give them
# shorter names. The rename is chained (no inplace=True) so it never mutates a
# column slice of the wider frame.
season1 = (
    season1[['date', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'HOME_TEAM_WINS']]
    .rename(columns={'HOME_TEAM_ID': 'home_id',
                     'VISITOR_TEAM_ID': 'away_id',
                     'HOME_TEAM_WINS': 'is_home_win'})
)
season1.head()

Unnamed: 0,date,home_id,away_id,is_home_win
18989,2018-10-16,1610612744,1610612760,1
18988,2018-10-16,1610612738,1610612755,1
18986,2018-10-17,1610612746,1610612743,0
18977,2018-10-17,1610612766,1610612749,0
18978,2018-10-17,1610612765,1610612751,1


In [7]:
# restrict to the 2018-19 regular season (2018-10-16 through 2019-04-10) so that
# this cell gives the same result on a fresh "Restart & Run All": at this point
# `games` still holds every season, not just the one being analysed.
# If `games` is already season-only, the filter is a no-op.
season_games = games.loc[games['date'].between('2018-10-16', '2019-04-10'), :]

# get the id of the first team in the dataset
# NOTE(review): if all teams have the same number of home games, which id comes
# first depends on value_counts' tie ordering — consider picking the team
# explicitly.
team1 = season_games['HOME_TEAM_ID'].value_counts().index[0]

# select out the games in which this team played (as home or visitor),
# in chronological order with a fresh 0..n-1 index
played_by_team1 = (season_games['HOME_TEAM_ID'] == team1) | (season_games['VISITOR_TEAM_ID'] == team1)
team1_games = (
    season_games.loc[played_by_team1, :]
    .sort_values('date')
    .reset_index(drop=True)
)

# get rid of some columns for now
team1_games = team1_games[['date', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away', 'HOME_TEAM_WINS']]
team1_games.head()

Unnamed: 0,date,HOME_TEAM_ID,VISITOR_TEAM_ID,PTS_home,PTS_away,HOME_TEAM_WINS
0,2018-10-17,1610612766,1610612749,112.0,113.0,0
1,2018-10-19,1610612753,1610612766,88.0,120.0,0
2,2018-10-20,1610612748,1610612766,112.0,113.0,0
3,2018-10-22,1610612761,1610612766,127.0,106.0,1
4,2018-10-24,1610612741,1610612766,112.0,110.0,1


Let's re-order and rename the columns to make the table easier to read.

In [8]:
# add the team names to the team1 dataframe and re-name the columns to easier
# names (chained, so no inplace mutation)
team1_games = (
    team1_games
    .assign(HOME_TEAM=team1_games['HOME_TEAM_ID'].map(teams),
            AWAY_TEAM=team1_games['VISITOR_TEAM_ID'].map(teams))
    .rename(columns={'HOME_TEAM_ID': 'home_id', 'VISITOR_TEAM_ID': 'away_id',
                     'PTS_home': 'home_pts', 'PTS_away': 'away_pts',
                     'HOME_TEAM_WINS': 'is_home_win',
                     'HOME_TEAM': 'home_name', 'AWAY_TEAM': 'away_name'})
)

# re-order the columns to make it look nicer
team1_games = team1_games[['date', 'home_id', 'home_name', 'away_id', 'away_name',
                           'home_pts', 'away_pts', 'is_home_win']]

# change the is_home_win column to whether team1 won or not.
# The comparison uses the `team1` id rather than a hard-coded abbreviation, so
# the flag stays correct whichever team `team1` happens to be.
won_at_home = (team1_games['home_id'] == team1) & (team1_games['is_home_win'] == 1)
won_away = (team1_games['away_id'] == team1) & (team1_games['is_home_win'] == 0)
team1_games = (
    team1_games
    .assign(is_win=np.where(won_at_home | won_away, 1, 0))
    .drop(columns=['is_home_win'])
)

team1_games.head()

Unnamed: 0,date,home_id,home_name,away_id,away_name,home_pts,away_pts,is_win
0,2018-10-17,1610612766,CHA,1610612749,MIL,112.0,113.0,0
1,2018-10-19,1610612753,ORL,1610612766,CHA,88.0,120.0,1
2,2018-10-20,1610612748,MIA,1610612766,CHA,112.0,113.0,1
3,2018-10-22,1610612761,TOR,1610612766,CHA,127.0,106.0,0
4,2018-10-24,1610612741,CHI,1610612766,CHA,112.0,110.0,0


In [11]:
# flag the games team1 hosted; compare against the `team1` id instead of a
# hard-coded abbreviation so this works for whichever team was selected
team1_games['is_home'] = np.where(team1_games['home_id'] == team1, 1, 0)

# the opponent is the visitor when team1 is at home, and the host otherwise
team1_games['opp_id'] = np.where(team1_games['is_home'] == 1, team1_games['away_id'], team1_games['home_id'])

# compact per-game view: date, home/away flag, opponent id, and outcome
test = team1_games[['date', 'is_home', 'opp_id', 'is_win']]
test.head()

Unnamed: 0,date,is_home,opp_id,is_win
0,2018-10-17,1,1610612749,0
1,2018-10-19,0,1610612753,1
2,2018-10-20,0,1610612748,1
3,2018-10-22,0,1610612761,0
4,2018-10-24,0,1610612741,0


In [14]:
# tally of the is_win flag (1 = win, 0 = otherwise) across team1's games
test['is_win'].value_counts()

0    43
1    39
Name: is_win, dtype: int64

Now, we need some way to look up a team's win percentage at any date in the season.  That way, we could add the opposing team's win percentage as a feature for our model.

The approach we will take is as follows:
1. Each team will get a look-up dictionary, in which the keys are dates, and the values are winning percentage.
2. With the dictionary, we could then just add a column where we map the opposing team id and date to their winning percentage at that time.
3. In order to generate these dictionaries, we will need to separate the original games DataFrame into 30 individual ones for each team.
4. We will then loop through all the days in the regular season, and all the teams, and fill in the winning percentages on each day.

In [82]:
# renumber the rows 0..n-1 and discard the old index.
# drop=True throws the old index away directly, instead of inserting it as a
# column and then deleting whatever column is called 'index'.
games = games.reset_index(drop=True)
games.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,date,year,month,day
0,2018-10-16,21800002,Final,1610612744,1610612760,2018,1610612744,108.0,0.442,0.944,...,0.363,0.649,0.27,21.0,45.0,1,2018-10-16,2018,10,16
1,2018-10-16,21800001,Final,1610612738,1610612755,2018,1610612738,105.0,0.433,0.714,...,0.391,0.609,0.192,18.0,47.0,1,2018-10-16,2018,10,16
2,2018-10-17,21800007,Final,1610612752,1610612737,2018,1610612752,126.0,0.455,0.774,...,0.456,0.833,0.278,19.0,45.0,1,2018-10-17,2018,10,17
3,2018-10-17,21800011,Final,1610612758,1610612762,2018,1610612758,117.0,0.516,0.667,...,0.519,0.737,0.481,21.0,44.0,0,2018-10-17,2018,10,17
4,2018-10-17,21800010,Final,1610612759,1610612750,2018,1610612759,112.0,0.43,0.724,...,0.429,0.889,0.316,20.0,46.0,1,2018-10-17,2018,10,17
