# Goal:
Explore the NBA player career stats data and determine the state of the dataset for its usage in our models

Import the libraries we need to connect to the database, manipulate the query results, and visualize them.

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import getpass
import psycopg2
import seaborn as sns
from psycopg2.extensions import adapt, register_adapter, AsIs
# Raise pandas' display limits so wide and long frames are shown in full
# (up to 500 columns / 500 rows) rather than being truncated.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

Grab password from user input and connect to the database

In [2]:
# Prompt for the password at run time so it is never written into the notebook.
mypasswd = getpass.getpass()

# Keyword arguments handed to psycopg2.connect.
connection_settings = {
    'database': 'cs20_group4',
    'user': 'fhfrf',  # replace with pawprint
    'host': 'pgsql.dsa.lan',
    'password': mypasswd,
}
conn = psycopg2.connect(**connection_settings)

········


Create an empty dataframe to store our query results. We needed to split our query into two separate queries to avoid memory errors.

In [3]:
# Start with an empty frame; the query cells below add their rows to it.
nba_player_box_scores = pd.DataFrame()

In [4]:
# Load the first slice of box scores: games dated from 1990-01-01 up to, but
# not including, 2010-01-01.
nba_player_box_1990 = pd.read_sql_query("""
SELECT *
FROM nba_player_boxscores
WHERE date >= TO_DATE('1990-01-01', 'YYYY-MM-DD') AND date < TO_DATE('2010-01-01', 'YYYY-MM-DD')
""", con = conn)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. ignore_index=True gives the combined frame a unique 0..n-1
# index instead of repeating each chunk's own row labels.
nba_player_box_scores = pd.concat(
    [nba_player_box_scores, nba_player_box_1990],
    ignore_index=True,
)
# Release the chunk now that its rows live in the combined frame.
del nba_player_box_1990

In [5]:
# Load the second slice of box scores: games dated 2010-01-01 or later.
nba_player_box_post2010 = pd.read_sql_query("""
SELECT *
FROM nba_player_boxscores
WHERE date >= TO_DATE('2010-01-01', 'YYYY-MM-DD')
""", con = conn)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. ignore_index=True gives the combined frame a unique 0..n-1
# index instead of repeating each chunk's own row labels.
nba_player_box_scores = pd.concat(
    [nba_player_box_scores, nba_player_box_post2010],
    ignore_index=True,
)
# Release the chunk now that its rows live in the combined frame.
del nba_player_box_post2010

Look at our nba_player_box_scores dataframe after its construction

In [6]:
# Preview the first rows of the combined frame.
nba_player_box_scores.head()

Unnamed: 0,player_id,player_name,boxscore,date,season,team_abbr,assist_percentage,assists,block_percentage,blocks,...,two_point_percentage,two_pointers,usage_percentage,home_game,birth_date,age,player_game_number_season,player_game_number_career,player_game_number_team,days_since_last_game
0,copella01,Lanard Copeland,199005030CLE,1990-05-03,1990.0,PHI,0.0,0.0,0.0,0.0,...,,,0.0,0.0,1965-07-16,24.797224,25.0,25.0,25.0,2.0
1,causwdu01,Duane Causwell,199011270SAC,1990-11-27,1991.0,SAC,0.0,0.0,0.0,0.0,...,,,0.0,1.0,1968-05-31,22.491906,12.0,12.0,12.0,2.0
2,payneke01,Kenny Payne,199011300DET,1990-11-30,1991.0,PHI,0.0,0.0,0.0,0.0,...,,,0.0,0.0,1966-11-25,24.014182,13.0,51.0,51.0,2.0
3,palmewa01,Walter Palmer,199011300UTA,1990-11-30,1991.0,UTA,0.0,0.0,0.0,0.0,...,,,0.0,1.0,1968-10-23,22.103123,7.0,7.0,7.0,2.0
4,kerrst01,Steve Kerr,199101040CLE,1991-01-04,1991.0,CLE,0.0,0.0,0.0,0.0,...,,,0.0,1.0,1965-09-27,25.270882,27.0,136.0,110.0,2.0


See how many rows and columns we have in our dataset

In [7]:
# (number of rows, number of columns)
nba_player_box_scores.shape

(809288, 50)

Let's see which columns have missing data so we are aware of which columns we may need to clean before inputting them into our model

In [9]:
# Share of missing values in each column (0 = fully populated, 1 = all missing).
# The mean of the boolean null mask is the null count divided by the row count.
nba_player_box_scores.isnull().mean()

player_id                          0.000000
player_name                        0.000000
boxscore                           0.000000
date                               0.000000
season                             0.000000
team_abbr                          0.000000
assist_percentage                  0.044826
assists                            0.044710
block_percentage                   0.044807
blocks                             0.044710
box_plus_minus                     0.358384
defensive_rating                   0.044801
defensive_rebound_percentage       0.044807
defensive_rebounds                 0.044710
effective_field_goal_percentage    0.094534
field_goal_attempts                0.044710
field_goal_percentage              0.094534
field_goals                        0.044710
free_throw_attempt_rate            0.094534
free_throw_attempts                0.044710
free_throw_percentage              0.423473
free_throws                        0.044710
minutes_played                  

Aggregate the game data per season to see which seasons have the most complete data

In [15]:
# Build one row per season showing, for every column, the share of records
# that are populated (1.0 = no missing values), plus the season's record count
# and its overall share of populated cells.
#
# A single groupby replaces filtering the full frame several times per season
# and growing the result with DataFrame.append, which was removed in
# pandas 2.0. groupby also returns the seasons in sorted order.
n_columns = nba_player_box_scores.shape[1]
is_populated = nba_player_box_scores.drop(columns='season').notnull()
by_season = is_populated.groupby(nba_player_box_scores['season'])

# Per-column completeness: the mean of the boolean "populated" mask.
nba_player_missing_df = by_season.mean()
n_records = by_season.size()
# 'season' is the grouping key, so it is populated in every row of its own
# group; count it back in so the overall figure still covers all columns.
populated_cells = by_season.sum().sum(axis=1) + n_records
nba_player_missing_df['n_records'] = n_records
nba_player_missing_df['total_pct_complete'] = populated_cells / (n_records * n_columns)
nba_player_missing_df

Unnamed: 0_level_0,player_id,player_name,boxscore,date,team_abbr,assist_percentage,assists,block_percentage,blocks,box_plus_minus,defensive_rating,defensive_rebound_percentage,defensive_rebounds,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,minutes_played,offensive_rating,offensive_rebound_percentage,offensive_rebounds,personal_fouls,points,steal_percentage,steals,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,home_game,birth_date,age,player_game_number_season,player_game_number_career,player_game_number_team,days_since_last_game,n_records,total_pct_complete
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
1990.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.9478,1.0,0.9478,1.0,0.9478,1.0,0.650275,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9478,1.0,0.321158,1.0,1.0,1.0,0.956377,0.965632,1.0,0.9478,0.939594,0.85895,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,16207,0.94862
1991.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.95118,1.0,0.95118,1.0,0.95118,1.0,0.643658,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.95118,1.0,0.328899,1.0,1.0,1.0,0.96009,0.969587,1.0,0.95118,0.943817,0.860484,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,23904,0.949249
1992.0,1.0,1.0,1.0,1.0,1.0,0.999958,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.953,1.0,0.953,1.0,0.953,1.0,0.63911,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.953,1.0,0.338229,1.0,1.0,1.0,0.960764,0.970206,1.0,0.953,0.944524,0.865128,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,23830,0.949658
1993.0,1.0,1.0,1.0,1.0,1.0,0.999958,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.955695,1.0,0.955695,1.0,0.955695,1.0,0.644105,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.955695,1.0,0.366836,1.0,1.0,1.0,0.963183,0.973084,1.0,0.955695,0.946876,0.865172,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,24038,0.950754
1994.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.949761,1.0,0.949761,1.0,0.949761,1.0,0.638604,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.949761,1.0,0.386162,1.0,1.0,1.0,0.95828,0.968668,1.0,0.949761,0.939788,0.858841,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,24065,0.949983
1995.0,1.0,1.0,1.0,1.0,1.0,0.999958,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.94849,1.0,0.94849,1.0,0.94849,1.0,0.644081,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.94849,1.0,0.487709,1.0,1.0,1.0,0.958038,0.96901,1.0,0.94849,0.931781,0.851627,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,23879,0.951693
1996.0,1.0,1.0,1.0,1.0,1.0,0.99996,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.94646,1.0,0.94646,1.0,0.94646,1.0,0.629457,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.94646,1.0,0.498793,1.0,1.0,1.0,0.955403,0.966167,1.0,0.94646,0.926635,0.846702,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,25271,0.951108
1997.0,1.0,1.0,1.0,1.0,1.0,0.999921,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.940877,1.0,0.940877,1.0,0.940877,1.0,0.615504,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.940877,1.0,0.519151,1.0,1.0,1.0,0.95085,0.962174,1.0,0.940877,0.916004,0.83936,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,25168,0.950147
1998.0,1.0,1.0,1.0,1.0,1.0,0.999921,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.944305,1.0,0.944305,1.0,0.944305,1.0,0.628815,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.944305,1.0,0.443243,1.0,1.0,1.0,0.953587,0.965623,1.0,0.944305,0.927745,0.841567,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,25424,0.94964
1999.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.944403,1.0,0.944403,1.0,0.944403,1.0,0.613346,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.944403,1.0,0.448278,1.0,1.0,1.0,0.954613,0.965744,1.0,0.944403,0.926015,0.83278,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,16260,0.949256


Let's look at the statistics behind the nba player box scores

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
nba_player_box_scores.describe()

Unnamed: 0,season,assist_percentage,assists,block_percentage,blocks,box_plus_minus,defensive_rating,defensive_rebound_percentage,defensive_rebounds,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,minutes_played,offensive_rating,offensive_rebound_percentage,offensive_rebounds,personal_fouls,points,steal_percentage,steals,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,home_game,age,player_game_number_season,player_game_number_career,player_game_number_team,days_since_last_game
count,809288.0,773011.0,773105.0,773026.0,773105.0,519252.0,773031.0,773026.0,773105.0,732783.0,773105.0,732783.0,773105.0,732783.0,773105.0,466576.0,773105.0,773102.0,773026.0,773026.0,773105.0,773105.0,773105.0,773026.0,773105.0,732783.0,773104.0,408825.0,773104.0,773026.0,773105.0,738741.0,746457.0,773105.0,732783.0,713254.0,657431.0,773026.0,809288.0,809288.0,809288.0,791035.0,791035.0,809288.0
mean,2005.909821,0.134319,2.177455,0.015512,0.486741,-0.002195,107.543951,0.141537,2.988902,0.474453,8.055567,0.439163,3.669699,0.335913,2.44007,0.741538,1.838304,23.619553,100.289409,0.056673,1.151141,2.116927,9.781661,0.016148,0.763631,0.211454,1.705839,0.318976,0.603959,0.099419,4.140043,0.512424,0.141491,1.389436,6.699129,0.466571,3.605154,0.191713,0.499885,27.45128,36.560165,385.33259,162.701705,6.202485
std,8.870471,0.162084,2.571731,0.033939,0.923408,10.507653,14.029906,0.120804,2.71095,0.258969,5.811039,0.237032,3.102818,0.498378,2.994018,0.284577,2.421983,11.76926,45.582108,0.082225,1.495857,1.539704,8.136938,0.026118,1.020225,0.257121,2.313236,0.319851,1.093086,0.080509,3.602848,0.246843,0.158782,1.436252,4.966014,0.26484,2.735395,0.090962,0.5,4.189539,23.649874,313.084639,195.266039,34.630933
min,1990.0,-10.0,0.0,0.0,0.0,-57.0,-1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.018166,1.0,1.0,1.0,0.0
25%,1998.0,0.0,0.0,0.0,0.0,-7.0,99.0,0.057,1.0,0.333,3.0,0.313,1.0,0.0,0.0,0.5,0.0,14.783333,77.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044,1.0,0.376,0.0,0.0,3.0,0.333,1.0,0.136,0.0,24.178457,16.0,126.0,36.0,2.0
50%,2006.0,0.1,1.0,0.0,0.0,0.0,108.0,0.123,2.0,0.5,7.0,0.444,3.0,0.2,2.0,0.8,1.0,24.0,103.0,0.033,1.0,2.0,8.0,0.0,0.0,0.111,1.0,0.333,0.0,0.086,3.0,0.515,0.113,1.0,6.0,0.5,3.0,0.188,0.0,26.927315,34.0,313.0,95.0,2.0
75%,2014.0,0.205,3.0,0.022,1.0,6.0,117.0,0.203,4.0,0.625,12.0,0.571,6.0,0.5,4.0,1.0,3.0,33.0,126.0,0.086,2.0,3.0,15.0,0.026,1.0,0.364,3.0,0.5,1.0,0.14,6.0,0.658,0.202,2.0,10.0,0.625,5.0,0.243,1.0,30.316844,55.0,577.0,214.0,3.0
max,2020.0,1.0,30.0,1.0,15.0,57.0,169.0,1.0,26.0,1.5,50.0,1.0,28.0,17.0,39.0,1.0,26.0,64.966667,300.0,1.0,18.0,7.0,81.0,1.0,11.0,1.0,24.0,1.0,14.0,1.0,34.0,1.5,1.0,14.0,46.0,1.0,26.0,1.0,1.0,44.614195,107.0,1795.0,1686.0,2776.0


Takeaway: The NBA player box scores are the dataframe we want to focus on. However, we want to aggregate the player data so that our dataset is not a game-by-game dataset, but rather a player-by-player dataset. This means we need to aggregate our NBA player box score data so that every row in our dataset represents an NBA player's career. Only then will we have the data we need to start building our forecast model.