# NBA Draft Data Cleaning

In [1]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Pull CSV
players_df = pd.read_csv("Resources/players.csv")
players_df.head()

Unnamed: 0,index,_id,birthDate,birthPlace,career_AST,career_FG%,career_FG3%,career_FT%,career_G,career_PER,...,draft_pick,draft_round,draft_team,draft_year,height,highSchool,name,position,shoots,weight
0,0,abdelal01,"June 24, 1968","Cairo, Egypt",0.3,50.2,0.0,70.1,256,13.0,...,25th overall,1st round,Portland Trail Blazers,1990,6-10,"Bloomfield in Bloomfield, New Jersey",Alaa Abdelnaby,Power Forward,Right,240lb
1,1,abdulza01,"April 7, 1946","Brooklyn, New York",1.2,42.8,,72.8,505,15.1,...,5th overall,1st round,Cincinnati Royals,1968,6-9,"John Jay in Brooklyn, New York",Zaid Abdul-Aziz,Power Forward and Center,Right,235lb
2,2,abdulka01,"April 16, 1947","New York, New York",3.6,55.9,5.6,72.1,1560,24.6,...,1st overall,1st round,Milwaukee Bucks,1969,7-2,"Power Memorial in New York, New York",Kareem Abdul-Jabbar,Center,Right,225lb
3,3,abdulma02,"March 9, 1969","Gulfport, Mississippi",3.5,44.2,35.4,90.5,586,15.4,...,3rd overall,1st round,Denver Nuggets,1990,6-1,"Gulfport in Gulfport, Mississippi",Mahmoud Abdul-Rauf,Point Guard,Right,162lb
4,4,abdulta01,"November 3, 1974","Maisons Alfort, France",1.1,41.7,23.7,70.3,236,11.4,...,11th overall,1st round,Sacramento Kings,1997,6-6,"Lycee Aristide Briand in Evreux, France",Tariq Abdul-Wahad,Shooting Guard,Right,223lb


In [3]:
# Pull names of all columns for reference
players_df.columns

Index(['index', '_id', 'birthDate', 'birthPlace', 'career_AST', 'career_FG%',
       'career_FG3%', 'career_FT%', 'career_G', 'career_PER', 'career_PTS',
       'career_TRB', 'career_WS', 'career_eFG%', 'college', 'draft_pick',
       'draft_round', 'draft_team', 'draft_year', 'height', 'highSchool',
       'name', 'position', 'shoots', 'weight'],
      dtype='object')

In [4]:
# Print unique number of players
print(len(players_df["_id"].unique()))

4685


In [5]:
# Clean up misspelled and duplicated data
players_df["draft_round"] = players_df["draft_round"].replace(
    {
        '2nd roun': '2nd round',
        '3rd roun': '3rd round',
        '4th roun': '4th round',
        '5th roun': '5th round',
        '6th roun': '6th round',
        '7th roun': '7th round',
        '8th roun': '8th round',
        '9th roun': '9th round',
        '10th roun': '10th round',
        "1948 BAA Draf": "1948 BAA Draft",
        "1947 BAA Draf": "1947 BAA Draft",
        "1952 NBA Draf": "1952 NBA Draft",
        "1956 NBA Draf": "1956 NBA Draft",
        "1953 NBA Draf": "1953 NBA Draft",
        "1949 BAA Draf": "1949 BAA Draft",
        "1955 NBA Draf": "1955 NBA Draft" 
        
        
    }
)

In [6]:
# Check that renaming worked
players_df["draft_round"].value_counts()

draft_round
1st round                                             1433
2nd round                                             1052
3rd round                                              283
4th round                                              171
5th round                                              102
6th round                                               72
7th round                                               69
8th round                                               59
1948 BAA Draft                                          39
9th round                                               31
1947 BAA Draft                                          26
10th round                                              26
1952 NBA Draft                                          24
1956 NBA Draft                                          19
11th round                                              18
12th round                                              16
1953 NBA Draft                              

In [7]:
# Create column for career points scored
players_df["career_pts_total"] = players_df["career_G"] * players_df['career_PTS']
players_df["career_pts_total"] = players_df["career_pts_total"].astype(int)

In [8]:
# Drop columns we won't be looking at
clean_players_df = players_df[["name", "birthDate", "position", 'college','draft_year', 'draft_round', 'draft_pick', 'career_G', 'career_PTS', 'career_pts_total', 'birthPlace', '_id' ]]
clean_players_df

Unnamed: 0,name,birthDate,position,college,draft_year,draft_round,draft_pick,career_G,career_PTS,career_pts_total,birthPlace,_id
0,Alaa Abdelnaby,"June 24, 1968",Power Forward,Duke University,1990,1st round,25th overall,256,5.7,1459,"Cairo, Egypt",abdelal01
1,Zaid Abdul-Aziz,"April 7, 1946",Power Forward and Center,Iowa State University,1968,1st round,5th overall,505,9.0,4545,"Brooklyn, New York",abdulza01
2,Kareem Abdul-Jabbar,"April 16, 1947",Center,"University of California, Los Angeles",1969,1st round,1st overall,1560,24.6,38376,"New York, New York",abdulka01
3,Mahmoud Abdul-Rauf,"March 9, 1969",Point Guard,Louisiana State University,1990,1st round,3rd overall,586,14.6,8555,"Gulfport, Mississippi",abdulma02
4,Tariq Abdul-Wahad,"November 3, 1974",Shooting Guard,"University of Michigan, San Jose State University",1997,1st round,11th overall,236,7.8,1840,"Maisons Alfort, France",abdulta01
...,...,...,...,...,...,...,...,...,...,...,...,...
4680,Ante Zizic,"January 4, 1997",Center,,2016,1st round,23rd overall,91,6.4,582,"Split, Croatia",zizican01
4681,Jim Zoet,"December 20, 1953",Center,Kent State University,,,,7,0.3,2,"Uxbridge, Canada",zoetji01
4682,Bill Zopf,"June 7, 1948",Point Guard,Duquesne University,1970,2nd round,33rd overall,53,2.2,116,,zopfbi01
4683,Ivica Zubac,"March 18, 1997",Center,,2016,2nd round,32nd overall,140,6.9,966,"Mostar, Bosnia and Herzegovina",zubaciv01


In [9]:
# Rename columns
clean_players_df.columns = ['Name', 'Birth Date', 'Position', 'College', "Draft Year", "Draft Round", "Draft Pick", "Career Games Played", "Career Points Per Game", "Total Career Points", "Birthplace", "_id"]
clean_players_df

Unnamed: 0,Name,Birth Date,Position,College,Draft Year,Draft Round,Draft Pick,Career Games Played,Career Points Per Game,Total Career Points,Birthplace,_id
0,Alaa Abdelnaby,"June 24, 1968",Power Forward,Duke University,1990,1st round,25th overall,256,5.7,1459,"Cairo, Egypt",abdelal01
1,Zaid Abdul-Aziz,"April 7, 1946",Power Forward and Center,Iowa State University,1968,1st round,5th overall,505,9.0,4545,"Brooklyn, New York",abdulza01
2,Kareem Abdul-Jabbar,"April 16, 1947",Center,"University of California, Los Angeles",1969,1st round,1st overall,1560,24.6,38376,"New York, New York",abdulka01
3,Mahmoud Abdul-Rauf,"March 9, 1969",Point Guard,Louisiana State University,1990,1st round,3rd overall,586,14.6,8555,"Gulfport, Mississippi",abdulma02
4,Tariq Abdul-Wahad,"November 3, 1974",Shooting Guard,"University of Michigan, San Jose State University",1997,1st round,11th overall,236,7.8,1840,"Maisons Alfort, France",abdulta01
...,...,...,...,...,...,...,...,...,...,...,...,...
4680,Ante Zizic,"January 4, 1997",Center,,2016,1st round,23rd overall,91,6.4,582,"Split, Croatia",zizican01
4681,Jim Zoet,"December 20, 1953",Center,Kent State University,,,,7,0.3,2,"Uxbridge, Canada",zoetji01
4682,Bill Zopf,"June 7, 1948",Point Guard,Duquesne University,1970,2nd round,33rd overall,53,2.2,116,,zopfbi01
4683,Ivica Zubac,"March 18, 1997",Center,,2016,2nd round,32nd overall,140,6.9,966,"Mostar, Bosnia and Herzegovina",zubaciv01


In [10]:
# Filter data to beginning of modern era (1980)
modern_era_df = clean_players_df.loc[clean_players_df["Draft Year"] > "1985"]
modern_era_df.reset_index(drop=True, inplace=True)
modern_era_df

Unnamed: 0,Name,Birth Date,Position,College,Draft Year,Draft Round,Draft Pick,Career Games Played,Career Points Per Game,Total Career Points,Birthplace,_id
0,Alaa Abdelnaby,"June 24, 1968",Power Forward,Duke University,1990,1st round,25th overall,256,5.7,1459,"Cairo, Egypt",abdelal01
1,Mahmoud Abdul-Rauf,"March 9, 1969",Point Guard,Louisiana State University,1990,1st round,3rd overall,586,14.6,8555,"Gulfport, Mississippi",abdulma02
2,Tariq Abdul-Wahad,"November 3, 1974",Shooting Guard,"University of Michigan, San Jose State University",1997,1st round,11th overall,236,7.8,1840,"Maisons Alfort, France",abdulta01
3,Shareef Abdur-Rahim,"December 11, 1976",Center and Small Forward and Power Forward,University of California,1996,1st round,3rd overall,830,18.1,15023,"Marietta, Georgia",abdursh01
4,Alex Abrines,"August 1, 1993",Shooting Guard,,2013,2nd round,32nd overall,174,5.3,922,"Palma de Mallorca, Spain",abrinal01
...,...,...,...,...,...,...,...,...,...,...,...,...
1674,Derrick Zimmerman,"December 2, 1981",Point Guard,Mississippi State University,2003,2nd round,40th overall,2,2.0,4,"Monroe, Louisiana",zimmede01
1675,Stephen Zimmerman,"September 9, 1996",Center,"University of Nevada, Las Vegas",2016,2nd round,41st overall,19,1.2,22,"Hendersonville, Tennessee",zimmest01
1676,Paul Zipser,"February 18, 1994",Small Forward,,2016,2nd round,48th overall,98,4.7,460,"Heidelberg, Germany",zipsepa01
1677,Ante Zizic,"January 4, 1997",Center,,2016,1st round,23rd overall,91,6.4,582,"Split, Croatia",zizican01


In [11]:
# Check for null values
modern_era_df.isnull().sum()

Name                        0
Birth Date                  0
Position                    0
College                   219
Draft Year                  0
Draft Round                 0
Draft Pick                  0
Career Games Played         0
Career Points Per Game      0
Total Career Points         0
Birthplace                  0
_id                         0
dtype: int64

In [12]:
# Fill null values for players that didn't attend college
modern_era_df["College"].fillna('No college', inplace=True)

In [13]:
# Check that null fill worked
modern_era_df.isnull().sum()

Name                      0
Birth Date                0
Position                  0
College                   0
Draft Year                0
Draft Round               0
Draft Pick                0
Career Games Played       0
Career Points Per Game    0
Total Career Points       0
Birthplace                0
_id                       0
dtype: int64

In [14]:
# Check draft year column for strange values
modern_era_df["Draft Year"].value_counts()

Draft Year
1986    67
1988    58
2012    56
1998    56
1987    55
2005    55
2011    54
2016    54
2014    53
2017    53
2018    53
1990    52
2006    52
2010    51
2008    51
2013    51
2000    50
1995    50
2009    50
2007    49
2001    49
2002    48
1992    48
1989    48
2003    47
1996    47
1997    47
2004    46
1999    46
1994    45
1991    44
2015    44
1993    43
1st      3
2nd      3
8th      1
Name: count, dtype: int64

In [15]:
# Check strange values to see if they should be included in Dataset or not
first = modern_era_df[(modern_era_df["Draft Year"] == "1st ")]
second = modern_era_df[(modern_era_df["Draft Year"] == "2nd ")]
eighth = modern_era_df[(modern_era_df["Draft Year"] == "8th ")]

In [16]:
# Drop strange values as they were drafted outside scope of dataset
modern_era_df.drop(first.index, inplace=True)
modern_era_df.drop(second.index, inplace=True)
modern_era_df.drop(eighth.index, inplace=True)

In [17]:
# Check draft year column for strange values
modern_era_df["Draft Year"].value_counts()

Draft Year
1986    67
1988    58
2012    56
1998    56
1987    55
2005    55
2016    54
2011    54
2018    53
2014    53
2017    53
1990    52
2006    52
2010    51
2008    51
2013    51
1995    50
2000    50
2009    50
2001    49
2007    49
1989    48
2002    48
1992    48
2003    47
1997    47
1996    47
2004    46
1999    46
1994    45
2015    44
1991    44
1993    43
Name: count, dtype: int64

In [18]:
# Split birthdate and birthplace column for sorting
modern_era_df[["Date", "Year"]] = modern_era_df["Birth Date"].str.split(",", expand = True)
modern_era_df[["Month", "Day"]] = modern_era_df["Date"].str.split(" ", expand = True)
modern_era_df[["Birth City", "Birth State/Country"]] = modern_era_df["Birthplace"].str.split(",", expand = True)
modern_era_df

Unnamed: 0,Name,Birth Date,Position,College,Draft Year,Draft Round,Draft Pick,Career Games Played,Career Points Per Game,Total Career Points,Birthplace,_id,Date,Year,Month,Day,Birth City,Birth State/Country
0,Alaa Abdelnaby,"June 24, 1968",Power Forward,Duke University,1990,1st round,25th overall,256,5.7,1459,"Cairo, Egypt",abdelal01,June 24,1968,June,24,Cairo,Egypt
1,Mahmoud Abdul-Rauf,"March 9, 1969",Point Guard,Louisiana State University,1990,1st round,3rd overall,586,14.6,8555,"Gulfport, Mississippi",abdulma02,March 9,1969,March,9,Gulfport,Mississippi
2,Tariq Abdul-Wahad,"November 3, 1974",Shooting Guard,"University of Michigan, San Jose State University",1997,1st round,11th overall,236,7.8,1840,"Maisons Alfort, France",abdulta01,November 3,1974,November,3,Maisons Alfort,France
3,Shareef Abdur-Rahim,"December 11, 1976",Center and Small Forward and Power Forward,University of California,1996,1st round,3rd overall,830,18.1,15023,"Marietta, Georgia",abdursh01,December 11,1976,December,11,Marietta,Georgia
4,Alex Abrines,"August 1, 1993",Shooting Guard,No college,2013,2nd round,32nd overall,174,5.3,922,"Palma de Mallorca, Spain",abrinal01,August 1,1993,August,1,Palma de Mallorca,Spain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674,Derrick Zimmerman,"December 2, 1981",Point Guard,Mississippi State University,2003,2nd round,40th overall,2,2.0,4,"Monroe, Louisiana",zimmede01,December 2,1981,December,2,Monroe,Louisiana
1675,Stephen Zimmerman,"September 9, 1996",Center,"University of Nevada, Las Vegas",2016,2nd round,41st overall,19,1.2,22,"Hendersonville, Tennessee",zimmest01,September 9,1996,September,9,Hendersonville,Tennessee
1676,Paul Zipser,"February 18, 1994",Small Forward,No college,2016,2nd round,48th overall,98,4.7,460,"Heidelberg, Germany",zipsepa01,February 18,1994,February,18,Heidelberg,Germany
1677,Ante Zizic,"January 4, 1997",Center,No college,2016,1st round,23rd overall,91,6.4,582,"Split, Croatia",zizican01,January 4,1997,January,4,Split,Croatia


In [19]:
# Re-order columns after reformatting birthdate column
modern_era_dates = modern_era_df[['_id', 'Name', 'Month', 'Day', 'Year', 'Position',
                                  'College', "Draft Year", "Draft Round", "Draft Pick", 
                                  "Career Games Played", "Career Points Per Game", 
                                  "Total Career Points", "Birth City", 
                                  "Birth State/Country"]]
modern_era_dates

Unnamed: 0,_id,Name,Month,Day,Year,Position,College,Draft Year,Draft Round,Draft Pick,Career Games Played,Career Points Per Game,Total Career Points,Birth City,Birth State/Country
0,abdelal01,Alaa Abdelnaby,June,24,1968,Power Forward,Duke University,1990,1st round,25th overall,256,5.7,1459,Cairo,Egypt
1,abdulma02,Mahmoud Abdul-Rauf,March,9,1969,Point Guard,Louisiana State University,1990,1st round,3rd overall,586,14.6,8555,Gulfport,Mississippi
2,abdulta01,Tariq Abdul-Wahad,November,3,1974,Shooting Guard,"University of Michigan, San Jose State University",1997,1st round,11th overall,236,7.8,1840,Maisons Alfort,France
3,abdursh01,Shareef Abdur-Rahim,December,11,1976,Center and Small Forward and Power Forward,University of California,1996,1st round,3rd overall,830,18.1,15023,Marietta,Georgia
4,abrinal01,Alex Abrines,August,1,1993,Shooting Guard,No college,2013,2nd round,32nd overall,174,5.3,922,Palma de Mallorca,Spain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1674,zimmede01,Derrick Zimmerman,December,2,1981,Point Guard,Mississippi State University,2003,2nd round,40th overall,2,2.0,4,Monroe,Louisiana
1675,zimmest01,Stephen Zimmerman,September,9,1996,Center,"University of Nevada, Las Vegas",2016,2nd round,41st overall,19,1.2,22,Hendersonville,Tennessee
1676,zipsepa01,Paul Zipser,February,18,1994,Small Forward,No college,2016,2nd round,48th overall,98,4.7,460,Heidelberg,Germany
1677,zizican01,Ante Zizic,January,4,1997,Center,No college,2016,1st round,23rd overall,91,6.4,582,Split,Croatia


In [20]:
# Push clean csv files
filepath = Path('Resources/cleaned_draft_data.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
modern_era_dates.to_csv(filepath, index=False) 