## Clean Up Individual Player Data Set for year 2017-18

In [1]:
import pandas as pd

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

In [2]:
def clean_df(df):
    df = df.drop(labels = ['Unnamed: 19', 'Unnamed: 24'], axis = 1)
    df['VORP'] = df['VORP\\'].str.replace("\\", '')
    df.drop(labels = ['VORP\\'], axis = 1, inplace = True)
    
    # edit Player column, split string on \\, expand into two columns and drop second column that doesn't contain name
    a = df['Player'].str.split('\\', n = 1, expand = True)
    player_list = a.drop(1, axis = 1)
    df_copy = df.copy()
    
    # replace name columns with updated list player's name
    df_copy['Player'] = player_list
    df['Player'] = df_copy['Player']
    
    # drop 'Rk' column
    df.drop('Rk', axis = 1, inplace = True)
    
    return df

In [3]:
import pandas as pd

df = pd.read_csv('Data/advanced_player_stats_1718.csv')

In [4]:
df_1718 = clean_df(df)

df_1718.head(5)

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Alex Abrines,SG,24.0,OKC,75.0,1134.0,9.0,0.567,0.759,0.158,2.5,8.9,5.6,3.4,1.7,0.6,7.4,12.7,1.3,1.0,2.2,0.094,-0.5,-1.7,-2.2,-0.1
1,Quincy Acy,PF,27.0,BRK,70.0,1359.0,8.2,0.525,0.8,0.164,3.1,17.1,10.0,6.0,1.2,1.6,13.3,14.4,-0.1,1.1,1.0,0.036,-2.0,-0.2,-2.2,-0.1
2,Steven Adams,C,24.0,OKC,76.0,2487.0,20.6,0.63,0.003,0.402,16.6,13.9,15.3,5.5,1.8,2.8,13.3,16.7,6.7,3.0,9.7,0.187,2.2,1.1,3.3,3.3
3,Bam Adebayo,C,20.0,MIA,69.0,1368.0,15.7,0.57,0.021,0.526,9.7,21.6,15.6,11.0,1.2,2.5,13.6,15.9,2.3,1.9,4.2,0.148,-1.6,1.8,0.2,0.8
4,Arron Afflalo,SG,32.0,ORL,53.0,682.0,5.8,0.516,0.432,0.16,0.6,10.1,5.3,6.2,0.3,1.1,10.8,12.5,-0.1,0.2,0.1,0.009,-4.1,-1.8,-5.8,-0.7


In [5]:
# check for dupilcate entries (because some players played for more than one team during the season)
player_dup_bool = df_1718.duplicated(subset = 'Player')

In [6]:
df_1718[player_dup_bool]

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP


In [10]:
df_1718[df_1718.isnull().any(axis=1)]

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
306,Tyler Lydon,PF,21.0,DEN,1.0,2.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,-0.014,-5.7,-0.8,-6.5,0.0
328,Trey McKinney-Jones,SG,27.0,IND,1.0,1.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,-0.001,-5.8,-0.1,-5.9,0.0
347,Ben Moore,PF,22.0,IND,2.0,9.0,-2.3,,,,0.0,12.5,6.3,13.1,0.0,0.0,,0.0,0.0,0.0,0.0,0.072,-5.3,0.3,-5.0,0.0
540,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
drop_list = [306, 328, 347, 540]
df_1718.drop(df_1718.index[drop_list], inplace = True)

In [12]:
df_1718.isnull().sum()

Player    0
Pos       0
Age       0
Tm        0
G         0
MP        0
PER       0
TS%       0
3PAr      0
FTr       0
ORB%      0
DRB%      0
TRB%      0
AST%      0
STL%      0
BLK%      0
TOV%      0
USG%      0
OWS       0
DWS       0
WS        0
WS/48     0
OBPM      0
DBPM      0
BPM       0
VORP      0
dtype: int64

In [13]:
df_1718.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 537 entries, 0 to 539
Data columns (total 26 columns):
Player    537 non-null object
Pos       537 non-null object
Age       537 non-null float64
Tm        537 non-null object
G         537 non-null float64
MP        537 non-null float64
PER       537 non-null float64
TS%       537 non-null float64
3PAr      537 non-null float64
FTr       537 non-null float64
ORB%      537 non-null float64
DRB%      537 non-null float64
TRB%      537 non-null float64
AST%      537 non-null float64
STL%      537 non-null float64
BLK%      537 non-null float64
TOV%      537 non-null float64
USG%      537 non-null float64
OWS       537 non-null float64
DWS       537 non-null float64
WS        537 non-null float64
WS/48     537 non-null float64
OBPM      537 non-null float64
DBPM      537 non-null float64
BPM       537 non-null float64
VORP      537 non-null object
dtypes: float64(22), object(4)
memory usage: 113.3+ KB


## Clean Up Individual Player Data Set for year 2016-17

In [14]:
df = pd.read_csv('Data/advanced_player_stats_1617.csv')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 10, saw 29
