# EDA

The purpose of this notebook is to clean the `games` dataset and prepare it for modelling. We start by importing the standard libraries and reading in the data.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw game-level table from disk and preview its first rows.
games = pd.read_csv(filepath_or_buffer='Data/games.csv')
games.head(n=5)

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2021-05-26,42000102,Final,1610612755,1610612764,2020,1610612755,120.0,0.557,0.684,...,26.0,45.0,1610612764,95.0,0.402,0.633,0.091,22.0,40.0,1
1,2021-05-26,42000132,Final,1610612752,1610612737,2020,1610612752,101.0,0.383,0.739,...,15.0,54.0,1610612737,92.0,0.369,0.818,0.273,17.0,41.0,1
2,2021-05-26,42000142,Final,1610612762,1610612763,2020,1610612762,141.0,0.544,0.774,...,28.0,42.0,1610612763,129.0,0.541,0.763,0.348,20.0,33.0,1
3,2021-05-25,42000112,Final,1610612751,1610612738,2020,1610612751,130.0,0.523,0.955,...,31.0,46.0,1610612738,108.0,0.424,0.783,0.353,23.0,43.0,1
4,2021-05-25,42000152,Final,1610612756,1610612747,2020,1610612756,102.0,0.465,0.933,...,21.0,31.0,1610612747,109.0,0.45,0.871,0.303,24.0,39.0,0


In [3]:
# (rows, columns) of the full raw games table
games.shape

(24677, 21)

Next, let's pull out only the games from the 2018-19 regular season (16 October 2018 through 10 April 2019, inclusive):

In [4]:
# Parse the game date strings into a proper datetime column.
# pd.to_datetime is used instead of .astype('datetime64') because casting to a
# unit-less 'datetime64' dtype is rejected by pandas 2.x.
games['date'] = pd.to_datetime(games['GAME_DATE_EST'])

# pull out some useful features
games['year'] = games['date'].dt.year
games['month'] = games['date'].dt.month
games['day'] = games['date'].dt.day

# filter for the 2018-19 NBA season: 16 Oct 2018 through 10 Apr 2019.
# Series.between is inclusive on both ends by default; the dates are normalized
# to midnight first so that any time-of-day component cannot push a game on the
# final day outside the window.
season_start = pd.Timestamp('2018-10-16')
season_end = pd.Timestamp('2019-04-10')
in_season = games['date'].dt.normalize().between(season_start, season_end)

# select the season's rows and sort the games by date
season1 = games.loc[in_season, :].sort_values('date')

season1.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,date,year,month,day
18989,2018-10-16,21800002,Final,1610612744,1610612760,2018,1610612744,108.0,0.442,0.944,...,0.363,0.649,0.27,21.0,45.0,1,2018-10-16,2018,10,16
18988,2018-10-16,21800001,Final,1610612738,1610612755,2018,1610612738,105.0,0.433,0.714,...,0.391,0.609,0.192,18.0,47.0,1,2018-10-16,2018,10,16
18986,2018-10-17,21800012,Final,1610612746,1610612743,2018,1610612746,98.0,0.398,0.833,...,0.379,0.786,0.333,20.0,56.0,0,2018-10-17,2018,10,17
18977,2018-10-17,21800003,Final,1610612766,1610612749,2018,1610612766,112.0,0.446,0.636,...,0.494,0.75,0.412,26.0,57.0,0,2018-10-17,2018,10,17
18978,2018-10-17,21800004,Final,1610612765,1610612751,2018,1610612765,103.0,0.424,0.864,...,0.488,0.682,0.185,28.0,39.0,1,2018-10-17,2018,10,17


In [5]:
# (rows, columns) of the single-season subset, including the date-derived columns
season1.shape

(1230, 25)

Looks like we have 1,230 games, which makes sense because that is exactly the number of games in a regular season: (82 games × 30 teams) / 2, dividing by two because every game involves two teams and would otherwise be counted twice.

Let's rename the columns to something nicer, and keep only the following features for each game:
- date
- home team id
- away team id
- is_home_win

From there, we will be able to use our lookup tables (LUTs) to add in the real features for each team involved in the game.

In [6]:
# Keep only the columns needed downstream and give them friendlier names.
# The rename is chained (not inplace=True) so the cell builds a fresh frame
# rather than mutating a slice, which avoids SettingWithCopyWarning.
season1 = (
    season1[['date', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'HOME_TEAM_WINS']]
    .rename(columns={'HOME_TEAM_ID': 'home_id',
                     'VISITOR_TEAM_ID': 'away_id',
                     'HOME_TEAM_WINS': 'is_home_win'})
)
season1.head()

Unnamed: 0,date,home_id,away_id,is_home_win
18989,2018-10-16,1610612744,1610612760,1
18988,2018-10-16,1610612738,1610612755,1
18986,2018-10-17,1610612746,1610612743,0
18977,2018-10-17,1610612766,1610612749,0
18978,2018-10-17,1610612765,1610612751,1


In [7]:
# (rows, columns) after trimming the frame down to the selected columns
season1.shape

(1230, 4)

Let's save this season to disk as a CSV file, so we can load it in our modelling script.

In [8]:
# Make sure 'date' is a datetime column before writing it out.
# pd.to_datetime is used instead of .astype('datetime64') because casting to a
# unit-less 'datetime64' dtype is rejected by pandas 2.x.
season1['date'] = pd.to_datetime(season1['date'])

# index=False leaves the row labels out of the file.
season1.to_csv('Data/games_2018-19.csv', index=False)