1. Data Preprocessing
 Read the datasets.
 Convert appropriate columns to integers or dates
 Store the data in structured Pandas DataFrames.
2. Basic Data Exploration and Summary Statistics
 Present three conclusions with different statistical methods (e.g., correlation analysis, hypothesis testing, and summary statistics).
 Include one plot for each method.

# Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# 1) Data Preprocessing
## Importing our data

In [2]:
# Raw inputs, one DataFrame per CSV (descriptions follow the section headings below):
#   nba_players   - NBA Players Stats (1996-2022)
#   team_stats    - Regular Season Stats 2000-2023
#   team_pg_stats - Playoff Games Stats 2000-2021
nba_players = pd.read_csv(filepath_or_buffer='all_seasons.csv')
team_stats = pd.read_csv(filepath_or_buffer='nba_team_stats_00_to_23.csv')
team_pg_stats = pd.read_csv(filepath_or_buffer='nba_team_stats_playoffs_00_to_21.csv')

## Data Exploration

### Examining nba_players (NBA Players Stats (1996-2022))

In [3]:
# Column labels and dtypes of the player-level table
print("Columns: ")
print(f"{nba_players.columns} \n")

print("Column Types:")
print(f"{nba_players.dtypes} \n")

# Summary statistics, one column at a time
for column_name, column_values in nba_players.items():
  print(column_values.describe())

# Some things to note about units:
# - player_height is in cm, player_weight is in kg
# - pts, reb, ast are averaged

# Some things to note about cleaning:
# - 'draft_year', 'draft_round', and 'draft_number' are read as strings because of the
#   value 'Undrafted'; it can be substituted with a missing-value placeholder
#   (such as -1 for ints, 0000 for year)
# - Column 'Unnamed: 0' can be dropped as it is just an index
# - age could be converted to type int
nba_players

Columns: 
Index(['Unnamed: 0', 'player_name', 'team_abbreviation', 'age',
       'player_height', 'player_weight', 'college', 'country', 'draft_year',
       'draft_round', 'draft_number', 'gp', 'pts', 'reb', 'ast', 'net_rating',
       'oreb_pct', 'dreb_pct', 'usg_pct', 'ts_pct', 'ast_pct', 'season'],
      dtype='object') 

Column Types:
Unnamed: 0             int64
player_name           object
team_abbreviation     object
age                  float64
player_height        float64
player_weight        float64
college               object
country               object
draft_year            object
draft_round           object
draft_number          object
gp                     int64
pts                  float64
reb                  float64
ast                  float64
net_rating           float64
oreb_pct             float64
dreb_pct             float64
usg_pct              float64
ts_pct               float64
ast_pct              float64
season                object
dtype: object 

coun

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Randy Livingston,HOU,22.0,193.04,94.800728,Louisiana State,USA,1996,2,...,3.9,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97
1,1,Gaylon Nickerson,WAS,28.0,190.50,86.182480,Northwestern Oklahoma,USA,1994,2,...,3.8,1.3,0.3,8.9,0.030,0.111,0.174,0.497,0.043,1996-97
2,2,George Lynch,VAN,26.0,203.20,103.418976,North Carolina,USA,1993,1,...,8.3,6.4,1.9,-8.2,0.106,0.185,0.175,0.512,0.125,1996-97
3,3,George McCloud,LAL,30.0,203.20,102.058200,Florida State,USA,1989,1,...,10.2,2.8,1.7,-2.7,0.027,0.111,0.206,0.527,0.125,1996-97
4,4,George Zidek,DEN,23.0,213.36,119.748288,UCLA,USA,1995,1,...,2.8,1.7,0.3,-14.1,0.102,0.169,0.195,0.500,0.064,1996-97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12839,12839,Joel Embiid,PHI,29.0,213.36,127.005760,Kansas,Cameroon,2014,1,...,33.1,10.2,4.2,8.8,0.057,0.243,0.370,0.655,0.233,2022-23
12840,12840,John Butler Jr.,POR,20.0,213.36,86.182480,Florida State,USA,Undrafted,Undrafted,...,2.4,0.9,0.6,-16.1,0.012,0.065,0.102,0.411,0.066,2022-23
12841,12841,John Collins,ATL,25.0,205.74,102.511792,Wake Forest,USA,2017,1,...,13.1,6.5,1.2,-0.2,0.035,0.180,0.168,0.593,0.052,2022-23
12842,12842,Jericho Sims,NYK,24.0,208.28,113.398000,Texas,USA,2021,2,...,3.4,4.7,0.5,-6.7,0.117,0.175,0.074,0.780,0.044,2022-23


### Examining team_stats (Regular Season Stats 2000-2023)

In [4]:
# Column labels (followed by a blank line), then dtypes, of the regular-season table
print("Columns: ")
print(f"{team_stats.columns} \n")

print("Column Types:", team_stats.dtypes, sep="\n")

# Last expression of the cell: rich display of the frame itself
team_stats

Columns: 
Index(['teamstatspk', 'Team', 'games_played', 'wins', 'losses',
       'win_percentage', 'Min', 'points', 'field_goals_made',
       'field_goals_attempted', 'field_goal_percentage', 'three_pointers_made',
       'three_pointers_attempted', 'three_point_percentage',
       'free_throws_made', 'free_throw_attempted', 'free_throw_percentage',
       'offensive_rebounds', 'defensive_rebounds', 'rebounds', 'assists',
       'turnovers', 'steals', 'blocks', 'blocks_attempted', 'personal_fouls',
       'personal_fouls_drawn', 'plus_minus', 'season'],
      dtype='object') 

Column Types:
teamstatspk                   int64
Team                         object
games_played                  int64
wins                          int64
losses                        int64
win_percentage              float64
Min                           int64
points                        int64
field_goals_made              int64
field_goals_attempted         int64
field_goal_percentage       float64
three

Unnamed: 0,teamstatspk,Team,games_played,wins,losses,win_percentage,Min,points,field_goals_made,field_goals_attempted,...,rebounds,assists,turnovers,steals,blocks,blocks_attempted,personal_fouls,personal_fouls_drawn,plus_minus,season
0,0,Boston Celtics,82,64,18,0.780,3966,9887,3601,7396,...,3799,2207,979,557,538,304,1326,1416,930,2023-24
1,1,Denver Nuggets,82,57,25,0.695,3941,9418,3610,7279,...,3643,2415,1036,585,456,394,1489,1467,431,2023-24
2,2,Oklahoma City Thunder,82,57,25,0.695,3961,9847,3653,7324,...,3447,2223,1039,694,538,419,1545,1548,608,2023-24
3,3,Minnesota Timberwolves,82,56,26,0.683,3961,9264,3383,6974,...,3577,2184,1162,647,497,371,1544,1630,529,2023-24
4,4,LA Clippers,82,51,31,0.622,3941,9481,3473,7108,...,3523,2097,1078,640,413,384,1519,1537,269,2023-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,711,Atlanta Hawks,82,25,57,0.305,3946,7459,2876,6668,...,3518,1559,1368,634,387,513,1863,7,-427,2000-01
712,712,Vancouver Grizzlies,82,23,59,0.280,3956,7522,2870,6539,...,3325,1899,1291,586,359,476,1733,5,-470,2000-01
713,713,Washington Wizards,82,19,63,0.232,3936,7645,2833,6453,...,3386,1647,1391,630,383,511,1913,9,-547,2000-01
714,714,Golden State Warriors,82,17,65,0.207,3956,7584,2937,7175,...,3730,1788,1301,742,410,493,1727,6,-742,2000-01


### Examining team_pg_stats (Playoff Games Stats 2000-2021)

In [5]:
# Column labels (followed by a blank line), then dtypes, of the playoff table
print("Columns: ")
print(f"{team_pg_stats.columns} \n")

print("Column Types:", team_pg_stats.dtypes, sep="\n")

# Last expression of the cell: rich display of the frame itself
team_pg_stats

Columns: 
Index(['teamstatspk', 'team', 'games_played', 'wins', 'losses',
       'win_percentage', 'minutes', 'points', 'field_goals_made',
       'field_goals_attempted', 'field_goal_percentage', 'three_pointers_made',
       'three_pointers_attempted', 'three_point_percentage',
       'free_throws_made', 'free_throw_attempted', 'free_throw_percentage',
       'offensive_rebounds', 'defensive_rebounds', 'rebounds', 'assists',
       'turnovers', 'steals', 'blocks', 'blocks_attempted', 'personal_fouls',
       'personal_fouls_drawn', 'plus_minus', 'season'],
      dtype='object') 

Column Types:
teamstatspk                   int64
team                         object
games_played                  int64
wins                          int64
losses                        int64
win_percentage              float64
minutes                     float64
points                      float64
field_goals_made            float64
field_goals_attempted       float64
field_goal_percentage       float64
t

Unnamed: 0,teamstatspk,team,games_played,wins,losses,win_percentage,minutes,points,field_goals_made,field_goals_attempted,...,rebounds,assists,turnovers,steals,blocks,blocks_attempted,personal_fouls,personal_fouls_drawn,plus_minus,season
0,0,Milwaukee Bucks,23,16,7,0.696,48.4,110.3,42.1,91.1,...,49.0,22.8,12.7,7.8,4.2,3.8,18.0,20.1,5.1,2020-21
1,1,Phoenix Suns,22,14,8,0.636,48.0,109.0,40.9,85.0,...,42.7,23.2,11.9,6.5,4.2,3.6,19.7,18.5,4.5,2020-21
2,2,Brooklyn Nets,12,7,5,0.583,48.4,112.5,40.7,86.2,...,42.6,22.6,11.6,7.1,4.8,6.2,21.2,18.6,6.3,2020-21
3,3,Philadelphia 76ers,12,7,5,0.583,48.0,116.3,42.5,85.8,...,44.7,24.9,12.8,8.0,6.2,3.8,23.0,23.8,7.5,2020-21
4,4,Atlanta Hawks,18,10,8,0.556,48.0,106.3,38.9,86.8,...,42.4,20.2,12.4,6.7,4.4,4.3,21.2,20.3,-1.5,2020-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,331,Minnesota Timberwolves,4,1,3,0.250,48.0,82.0,30.8,79.5,...,42.5,18.5,12.8,7.5,5.8,4.5,24.3,0.0,-6.5,2000-01
332,332,Orlando Magic,4,1,3,0.250,49.3,102.8,39.0,89.8,...,43.8,22.0,11.3,8.0,6.3,5.3,26.5,0.3,-5.8,2000-01
333,333,Phoenix Suns,4,1,3,0.250,48.0,88.5,34.0,89.5,...,44.3,21.3,13.8,9.8,5.8,5.3,23.5,0.0,-9.5,2000-01
334,334,Miami Heat,3,0,3,0.000,48.0,78.3,29.7,72.3,...,35.0,16.7,18.7,5.0,5.0,4.0,25.3,0.3,-22.3,2000-01


## Parsing / Cleaning

### Checking for Missingness/Duplicates

In [6]:
# Helper functions
def missingness(df):
  """Print the fraction of missing (NaN/NaT/None) values for every column of `df`.

  One line is printed per column in the form ``<column>: <fraction>``, followed
  by a blank separator.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect. Column labels may be of any type (they are formatted,
      not concatenated). An empty frame reports ``nan`` for each column instead
      of raising ZeroDivisionError.

  Returns
  -------
  None
  """
  n_rows = len(df)
  for column in df.columns:
    if n_rows:
      # Vectorized count of missing cells; float() keeps the printed form a plain Python float
      fraction = float(df[column].isna().sum() / n_rows)
    else:
      # No rows: the fraction is undefined
      fraction = float("nan")
    print(f"{column}: {fraction}")
  print("\n")

def has_duplicates(df):
  """Return True if at least one row of `df` is an exact repeat of an earlier row."""
  repeated_rows = df.duplicated()
  return bool(repeated_rows.any())

# The three frames under inspection, keyed by the label used in the printed report
frames_by_name = {
  "nba_players": nba_players,
  "team_stats": team_stats,
  "team_pg_stats": team_pg_stats,
}

# Columns that should jointly identify a row (i.e. act as a primary key) in each frame
key_columns_by_name = {
  "nba_players": ['player_name', 'team_abbreviation', 'season'],
  "team_stats": ['Team', 'season'],
  "team_pg_stats": ['team', 'season'],
}

# Per-column missingness.
# Only column that contains missingness (from NaNs) is 'college' from nba_players w/ 14% missingness
for frame_name, frame in frames_by_name.items():
  print(f"Missingness in {frame_name}:")
  missingness(frame)

# Simple duplicate checking (fully identical rows)
for frame_name, frame in frames_by_name.items():
  print(f"{frame_name} duplicates: {has_duplicates(frame)}")

# More in-depth check: duplicates on the key columns only
for frame_name, frame in frames_by_name.items():
  distinct_keys = frame[key_columns_by_name[frame_name]].drop_duplicates()
  print(f"{frame_name} duplicates: {len(distinct_keys) != len(frame)}")

print("\nBased on our quick examination, we've observe that the only missingness came from nba_players' college column and there were no duplicates among the datasets")

Missingness in nba_players:
Unnamed: 0: 0.0
player_name: 0.0
team_abbreviation: 0.0
age: 0.0
player_height: 0.0
player_weight: 0.0
college: 0.14434755527872936
country: 0.0
draft_year: 0.0
draft_round: 0.0
draft_number: 0.0
gp: 0.0
pts: 0.0
reb: 0.0
ast: 0.0
net_rating: 0.0
oreb_pct: 0.0
dreb_pct: 0.0
usg_pct: 0.0
ts_pct: 0.0
ast_pct: 0.0
season: 0.0


Missingness in team_stats:
teamstatspk: 0.0
Team: 0.0
games_played: 0.0
wins: 0.0
losses: 0.0
win_percentage: 0.0
Min: 0.0
points: 0.0
field_goals_made: 0.0
field_goals_attempted: 0.0
field_goal_percentage: 0.0
three_pointers_made: 0.0
three_pointers_attempted: 0.0
three_point_percentage: 0.0
free_throws_made: 0.0
free_throw_attempted: 0.0
free_throw_percentage: 0.0
offensive_rebounds: 0.0
defensive_rebounds: 0.0
rebounds: 0.0
assists: 0.0
turnovers: 0.0
steals: 0.0
blocks: 0.0
blocks_attempted: 0.0
personal_fouls: 0.0
personal_fouls_drawn: 0.0
plus_minus: 0.0
season: 0.0


Missingness in team_pg_stats:
teamstatspk: 0.0
team: 0.0
games_p

### Dealing with Missingness

In [7]:
# Identify what type of missingness is occurring in the 'college' column
print("Unique Values under nba_players' college")
print(nba_players['college'].unique())

# From the unique values, it can be observed that most if not all of the
# universities recorded are within the United States. Two possible explanations
# for a missing value are:
# 1) The player did not play for a college team
# 2) The player did play for a college team, but it was outside the US

# We therefore treat this data as missing not at random: no other feature in the
# dataset appears to influence the missingness in 'college'.

# Since it is infeasible to look up every player's college history, the missing
# values are filled with the placeholder 'N/A'.
# NOTE(review): pandas.read_csv treats the literal string 'N/A' as missing by
# default, so if this frame is written to CSV and read back the placeholder would
# turn into NaN again - confirm that is acceptable for downstream use.

print("\n")
# (A bare `nba_players[nba_players['college'].isna()]` expression used to sit here;
# its result was discarded because it was not the last expression of the cell.)
nba_players.loc[nba_players['college'].isna(), 'college'] = "N/A"
missingness(nba_players)

# Furthermore, some values stood out: a few entries consist only of whitespace.
# These are replaced with 'N/A' as well.
nba_players.loc[nba_players['college'].str.isspace(), 'college'] = "N/A"


Unique Values under nba_players' college
['Louisiana State' 'Northwestern Oklahoma' 'North Carolina'
 'Florida State' 'UCLA' 'Tennessee-Chattanooga' nan 'Michigan' 'Purdue'
 'Duke' 'Ohio' 'Eastern Michigan' 'Nevada-Las Vegas' 'Kansas'
 'Texas-El Paso' 'Indiana' 'Louisville' 'Houston' 'Oklahoma'
 'Oral Roberts' 'Oregon State' 'Brigham Young' 'Washington' 'Memphis'
 'Notre Dame' 'Delaware State' 'Alabama' 'Wyoming' 'Pittsburgh'
 'Providence' 'Nebraska' 'Michigan State' 'Mississippi State'
 'New Orleans' 'Penn State' 'Western Carolina' 'Iowa State'
 "St. Mary's (TX)" 'Clemson' 'Ohio State' 'Georgetown' 'Marquette'
 'Virginia Tech' 'Southern Mississippi' 'McNeese State' 'Longwood'
 'Arkansas' 'Arkansas-Little Rock' 'Virginia' 'Detroit Mercy'
 'Oklahoma State' 'Gonzaga' 'Syracuse' 'Richmond' 'Georgia Tech'
 'Maryland' 'Pennsylvania' 'Grand Canyon' 'Tulane' 'Boston College'
 'Arizona State' 'Kentucky' "St. John's (NY)" 'South Carolina'
 'California' 'Texas Tech' 'Bradley' 'Temple' 'Illinois'

### Manipulating Features

In [8]:
# As mentioned earlier, some columns only duplicate the row index and are redundant.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
nba_players = nba_players.drop(columns=['Unnamed: 0'], errors='ignore')
team_stats = team_stats.drop(columns=['teamstatspk'], errors='ignore')
team_pg_stats = team_pg_stats.drop(columns=['teamstatspk'], errors='ignore')

# Convert draft_year from str to datetime: 'Undrafted' becomes NaT, then the whole
# column is parsed in one vectorized call instead of one pd.to_datetime call per row.
nba_players.loc[nba_players['draft_year'] == 'Undrafted', 'draft_year'] = pd.NaT
nba_players['draft_year'] = pd.to_datetime(nba_players['draft_year'], format='%Y')

# Convert draft_round and draft_number from str to int, using -1 as the
# placeholder for 'Undrafted'.
for draft_column in ['draft_round', 'draft_number']:
  nba_players.loc[nba_players[draft_column] == 'Undrafted', draft_column] = -1
  nba_players[draft_column] = nba_players[draft_column].astype(int)

# Show the resulting dtypes to confirm the conversions
nba_players.dtypes

Unnamed: 0,0
player_name,object
team_abbreviation,object
age,float64
player_height,float64
player_weight,float64
college,object
country,object
draft_year,datetime64[ns]
draft_round,int64
draft_number,int64


In [11]:
# The meaning of draft_round has evolved with how the draft works: the rows below
# have a draft_round above 2.
# Possible handling: keep these rows if draft information is not relevant to the
# analysis; split them out or remove them if it is.
# src: https://en.wikipedia.org/wiki/NBA_draft#:~:text=In%20the%20early%20years%20of,was%20shortened%20to%20seven%20rounds.
nba_players.loc[nba_players['draft_round'].gt(2)]

Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,draft_number,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
43,Frank Brickowski,BOS,37.0,205.74,108.86208,Penn State,USA,1981-01-01,3,57,...,4.8,2.0,0.9,-22.4,0.026,0.133,0.173,0.512,0.116,1996-97
81,Jack Haley,NJN,33.0,208.28,109.769264,UCLA,USA,1987-01-01,4,79,...,2.0,1.6,0.3,5.0,0.167,0.284,0.224,0.441,0.088,1996-97
115,Craig Ehlo,SEA,35.0,198.12,81.64656,Washington State,USA,1983-01-01,3,48,...,3.5,1.8,1.1,5.7,0.056,0.107,0.166,0.412,0.13,1996-97
120,Chris Dudley,POR,32.0,210.82,108.86208,Yale,USA,1987-01-01,4,75,...,3.9,7.3,0.5,-0.2,0.134,0.249,0.107,0.449,0.036,1996-97
139,Charles Jones,HOU,40.0,205.74,97.52228,Albany State (GA),USA,1979-01-01,8,165,...,0.3,1.1,0.3,2.6,0.068,0.103,0.025,0.4,0.046,1996-97
155,Donald Royal,CHH,31.0,203.2,98.883056,Notre Dame,USA,1987-01-01,3,52,...,3.5,2.5,0.4,-4.6,0.073,0.137,0.131,0.552,0.047,1996-97
167,Earl Cureton,TOR,39.0,205.74,95.25432,Detroit Mercy,USA,1979-01-01,3,58,...,0.8,1.0,0.4,-2.1,0.105,0.102,0.103,0.376,0.148,1996-97
203,Sedale Threatt,HOU,35.0,187.96,83.91452,West Virginia Tech,USA,1983-01-01,6,139,...,3.3,1.1,1.9,-3.5,0.017,0.063,0.122,0.451,0.185,1996-97
220,Sarunas Marciulionis,DEN,33.0,195.58,90.7184,,USA,1987-01-01,6,127,...,6.8,1.8,1.5,-7.6,0.057,0.086,0.27,0.496,0.188,1996-97
241,Sam Mitchell,MIN,33.0,200.66,97.52228,Mercer,USA,1985-01-01,3,54,...,9.3,4.0,1.0,-4.9,0.064,0.124,0.183,0.523,0.068,1996-97
