# 2 Data Wrangling

 

### 2.1 Loading and Import
### 2.2 Data Exploring
* 2.2.1 Merging 
* 2.2.2 Counting Categorical Variables 
* 2.2.3 Numeric Data Exploring

### 2.3 Examine Means and Medians
### 2.4 Data Profiling
### 2.5 Dropping Certain Missing Values


## 2.1 Loading and Import

In [33]:
#Load all the necessary packages for the wrangling part of the project
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from pathlib import Path
import pandas_profiling
from pandas_profiling.utils.cache import cache_file


In [None]:
import sys
!{sys.executable} -m pip install -U pandas-profiling==2.9.0
!jupyter nbextension enable --py widgetsnbextension





In [None]:
# Relative paths to the three raw CSV files that are read further down in this notebook.
nba1 = '../Capstone_Project2/Raw_data/NBA1.csv'
nba2 = '../Capstone_Project2/Raw_data/NBA2.csv'
nba3 = '../Capstone_Project2/Raw_data/NBA3.csv'

'''Files from pro basketball reference, they are stored as raw data in the directory'''

In [None]:
def loadNBA(file):
    """Read a CSV whose real column names sit in its first data row.

    The file is parsed with ``pd.read_csv``; the values of the first parsed
    row then become the column labels and that row is removed. The remaining
    rows keep their original index labels, so the result starts at label 1.

    Args:
        file: Anything ``pd.read_csv`` accepts (a path or a file-like object).

    Returns:
        pandas.DataFrame: The data rows, labelled with the promoted header.

    Raises:
        IndexError: If the parsed file has no row to promote.
    """
    raw = pd.read_csv(file)
    # Assigning the row Series itself would also copy its name (the row
    # label 0) onto the column index; a plain list keeps that index unnamed.
    header = raw.iloc[0].tolist()
    table = raw.drop(0)
    table.columns = header
    return table

### df1 holds the salary of each player; we will keep only its first salary column (2021-22).

In [10]:
# The salary file goes through loadNBA, which promotes its first data row to the header;
# the two stats files are read as-is.
df1 = loadNBA(nba1)
df2 = pd.read_csv(nba2)
df3 = pd.read_csv(nba3)



In [11]:
df1.head()

Unnamed: 0,Rk,Player,Tm,2021-22,2022-23,2023-24,2024-25,2025-26,2026-27,Signed Using,Guaranteed
1,1,Stephen Curry\curryst01,GSW,"$45,780,966","$48,070,014","$51,915,615","$55,761,216","$59,606,817",,Bird Rights,"$261,134,628"
2,2,John Wall\walljo01,HOU,"$44,310,840","$47,366,760",,,,,Bird Rights,"$44,310,840"
3,3,Russell Westbrook\westbru01,LAL,"$44,211,146","$47,063,478",,,,,Bird Rights,"$44,211,146"
4,4,James Harden\hardeja01,BRK,"$43,848,000","$46,872,000",,,,,Bird Rights,"$43,848,000"
5,5,LeBron James\jamesle01,LAL,"$41,180,544","$44,474,988",,,,,Bird,"$85,655,532"


In [12]:
df2.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP
0,1,Precious Achiuwa\achiupr01,PF,21,MIA,61,737,14.2,0.55,0.004,...,,0.3,1.0,1.3,0.085,,-3.6,-0.5,-4.1,-0.4
1,2,Jaylen Adams\adamsja01,PG,24,MIL,7,18,-6.5,0.125,0.25,...,,-0.1,0.0,-0.1,-0.252,,-15.1,-4.6,-19.8,-0.1
2,3,Steven Adams\adamsst01,C,27,NOP,58,1605,15.1,0.596,0.01,...,,2.3,1.7,4.0,0.119,,-0.4,0.1,-0.3,0.7
3,4,Bam Adebayo\adebaba01,C,23,MIA,64,2143,22.7,0.626,0.01,...,,5.6,3.2,8.8,0.197,,2.9,2.0,4.9,3.7
4,5,LaMarcus Aldridge\aldrila01,C,35,TOT,26,674,15.7,0.556,0.27,...,,0.5,0.6,1.1,0.08,,-0.2,-0.2,-0.3,0.3


In [13]:
df3.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Precious Achiuwa\achiupr01,PF,21,MIA,61,4,12.1,2.0,3.7,...,0.509,1.2,2.2,3.4,0.5,0.3,0.5,0.7,1.5,5.0
1,2,Jaylen Adams\adamsja01,PG,24,MIL,7,0,2.6,0.1,1.1,...,,0.0,0.4,0.4,0.3,0.0,0.0,0.0,0.1,0.3
2,3,Steven Adams\adamsst01,C,27,NOP,58,58,27.7,3.3,5.3,...,0.444,3.7,5.2,8.9,1.9,0.9,0.7,1.3,1.9,7.6
3,4,Bam Adebayo\adebaba01,C,23,MIA,64,64,33.5,7.1,12.5,...,0.799,2.2,6.7,9.0,5.4,1.2,1.0,2.6,2.3,18.7
4,5,LaMarcus Aldridge\aldrila01,C,35,TOT,26,23,25.9,5.4,11.4,...,0.872,0.7,3.8,4.5,1.9,0.4,1.1,1.0,1.8,13.5


## 2.2 Data Exploring

### 2.2.1 Merging

#### Since we have three datasets, we will do a left join first with df3 and df2, which represents their regular season stats and their advanced regular season stats. After that, we will do a left join with df1, which represents their salaries in 2021.

In [14]:
new_df = df3.merge(df2, how='left', on='Player', suffixes=['','_a'])
# Columns present in both tables keep their plain name for the regular season stats (df3)
# and get the suffix _a for the advanced stats (df2).
# NOTE(review): 'Player' does not look unique in either table (traded players seem to have a
# TOT row plus one row per team), so this join may pair every df3 row of a player with every
# df2 row of that player. Confirm whether 'Tm' should also be a join key.

new_df

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP
0,1,Precious Achiuwa\achiupr01,PF,21,MIA,61,4,12.1,2.0,3.7,...,,0.3,1.0,1.3,0.085,,-3.6,-0.5,-4.1,-0.4
1,2,Jaylen Adams\adamsja01,PG,24,MIL,7,0,2.6,0.1,1.1,...,,-0.1,0.0,-0.1,-0.252,,-15.1,-4.6,-19.8,-0.1
2,3,Steven Adams\adamsst01,C,27,NOP,58,58,27.7,3.3,5.3,...,,2.3,1.7,4.0,0.119,,-0.4,0.1,-0.3,0.7
3,4,Bam Adebayo\adebaba01,C,23,MIA,64,64,33.5,7.1,12.5,...,,5.6,3.2,8.8,0.197,,2.9,2.0,4.9,3.7
4,5,LaMarcus Aldridge\aldrila01,C,35,TOT,26,23,25.9,5.4,11.4,...,,0.5,0.6,1.1,0.080,,-0.2,-0.2,-0.3,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1216,536,Delon Wright\wrighde01,PG,28,SAC,27,8,25.8,3.9,8.3,...,,0.8,0.5,1.3,0.092,,0.5,0.6,1.2,0.6
1217,537,Thaddeus Young\youngth01,PF,32,CHI,68,23,24.3,5.4,9.7,...,,2.8,2.2,5.1,0.147,,1.9,1.4,3.3,2.2
1218,538,Trae Young\youngtr01,PG,22,ATL,63,63,33.7,7.7,17.7,...,,5.9,1.3,7.2,0.163,,5.3,-1.7,3.7,3.0
1219,539,Cody Zeller\zelleco01,C,28,CHO,48,21,20.9,3.8,6.8,...,,2.1,1.1,3.3,0.156,,-0.2,-0.2,-0.5,0.4


#### We can see that there are two columns called "Unnamed: 19" and "Unnamed: 24", which have null values for every observation

#### We could also remove some columns that came from the advanced data: the ones with the suffix '_a' duplicate columns from the regular season data, which hold the players' teams, positions, and other basic information

In [15]:
# Remove the two placeholder columns that contain nothing but nulls.
new_df = new_df.drop(
    columns=['Unnamed: 19', 'Unnamed: 24'],
)

In [16]:
# Remove the '_a' copies of the identifying columns; the unsuffixed versions stay.
new_df = new_df.drop(
    columns=['Rk_a', 'Pos_a', 'Age_a', 'Tm_a', 'G_a', 'MP_a'],
)

In [17]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1221 entries, 0 to 1220
Data columns (total 50 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      1221 non-null   int64  
 1   Player  1221 non-null   object 
 2   Pos     1221 non-null   object 
 3   Age     1221 non-null   int64  
 4   Tm      1221 non-null   object 
 5   G       1221 non-null   int64  
 6   GS      1221 non-null   int64  
 7   MP      1221 non-null   float64
 8   FG      1221 non-null   float64
 9   FGA     1221 non-null   float64
 10  FG%     1216 non-null   float64
 11  3P      1221 non-null   float64
 12  3PA     1221 non-null   float64
 13  3P%     1142 non-null   float64
 14  2P      1221 non-null   float64
 15  2PA     1221 non-null   float64
 16  2P%     1212 non-null   float64
 17  eFG%    1216 non-null   float64
 18  FT      1221 non-null   float64
 19  FTA     1221 non-null   float64
 20  FT%     1163 non-null   float64
 21  ORB     1221 non-null   float64
 22  

#### Now, we will do another left join with the salaries dataframe, df1. All we need from df1 is the salary for 2021-22.

In [18]:
# Left-join the 2021-22 salary column of df1 onto the merged stats, matching on 'Player'.
# NOTE(review): if a player has more than one row in df1, this join repeats that player's
# stats rows once per salary row. Confirm df1 is unique on 'Player'.
df = new_df.merge(df1[['Player', '2021-22']], on='Player', how='left')
%store df

Stored 'df' (DataFrame)


In [19]:
df.head()
# Give the 2021-22 salary column a descriptive name.
df = df.rename(columns={'2021-22': 'Salary'})

**Now we have merged all the data. Information about the columns can be found in the file Glossary.txt in the repository.**

### 2.2.2 Counting Categorical Variables

#### It looks like positions and teams are the categorical variables we have in the dataset. I believe that these categories might be really important for our data.

In [20]:
df.info()
# Strip the dollar sign and the thousands separators so the salaries can later be cast to
# integers. regex=True is spelled out because '[$,]' is a character class: pandas 2.0 changed
# the default of str.replace to a literal match, which would leave every value untouched.
df['Salary'] = df['Salary'].str.replace(r'[$,]', '', regex=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1345 entries, 0 to 1344
Data columns (total 51 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      1345 non-null   int64  
 1   Player  1345 non-null   object 
 2   Pos     1345 non-null   object 
 3   Age     1345 non-null   int64  
 4   Tm      1345 non-null   object 
 5   G       1345 non-null   int64  
 6   GS      1345 non-null   int64  
 7   MP      1345 non-null   float64
 8   FG      1345 non-null   float64
 9   FGA     1345 non-null   float64
 10  FG%     1340 non-null   float64
 11  3P      1345 non-null   float64
 12  3PA     1345 non-null   float64
 13  3P%     1263 non-null   float64
 14  2P      1345 non-null   float64
 15  2PA     1345 non-null   float64
 16  2P%     1336 non-null   float64
 17  eFG%    1340 non-null   float64
 18  FT      1345 non-null   float64
 19  FTA     1345 non-null   float64
 20  FT%     1283 non-null   float64
 21  ORB     1345 non-null   float64
 22  

  df['Salary'] = df['Salary'].str.replace('[$,]', '')


In [21]:
# We will drop the rows that have no salary value, since they will not help us to get a prediction

df = df.dropna(subset=['Salary'])

In [22]:
df['Player'].value_counts()

DeMarcus Cousins\couside01     27
Justin Jackson\jacksju01       27
Brad Wanamaker\wanambr01       18
Jabari Parker\parkeja01        18
Blake Griffin\griffbl01        18
                               ..
Tim Hardaway Jr.\hardati02      1
Tyrese Haliburton\halibty01     1
Rui Hachimura\hachiru01         1
Josh Green\greenjo02            1
Ivica Zubac\zubaciv01           1
Name: Player, Length: 395, dtype: int64

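#### Some players played for more than one team and, as a result, were recorded several times. The joins above match on `Player` only, so each of a player's rows in one table is paired with each of his rows in the other, which inflates the counts further. For example, DeMarcus Cousins appears 27 times.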

In [23]:
df['Salary'].isna().any()

False

In [24]:
# Salaries are still strings at this point, so cast them to integers
df['Salary'] = df['Salary'].astype(int)
df['Salary']

0        2711280
2       17073171
3       28103550
4        2641691
5        2641691
          ...   
1340     8526316
1341    14190000
1342     8326471
1343     2389641
1344     7518518
Name: Salary, Length: 992, dtype: int64

In [25]:
# Rank the rows by points, highest first, keeping only the player name and the points.
pts = df.loc[:, ['Player', 'PTS']].sort_values('PTS', ascending=False)
pts

Unnamed: 0,Player,PTS
289,Stephen Curry\curryst01,32.0
70,Bradley Beal\bealbr01,31.3
798,Damian Lillard\lillada01,28.8
352,Joel Embiid\embiijo01,28.5
35,Giannis Antetokounmpo\antetgi01,28.1
...,...,...
221,Gary Clark\clarkga01,0.0
220,Gary Clark\clarkga01,0.0
219,Gary Clark\clarkga01,0.0
216,Gary Clark\clarkga01,0.0


Now we are going to see which positions there are, and whether certain positions get paid more than others.

In [26]:
# Group the rows by playing position so that salary statistics can be computed per position
salaries_by_positions = df.groupby(['Pos'])

In [27]:
salaries_by_positions['Salary'].mean()

Pos
C        6.204436e+06
C-PF     1.219512e+07
PF       9.834783e+06
PF-C     1.782621e+06
PF-SF    4.347600e+06
PG       8.374691e+06
PG-SG    4.384800e+07
SF       5.633077e+06
SF-PF    1.239929e+06
SF-SG    7.179310e+06
SG       8.138949e+06
SG-PG    5.414678e+06
SG-SF    1.035862e+07
Name: Salary, dtype: float64

In [28]:
df['Pos'].value_counts()

C        248
SG       191
PG       182
PF       178
SF       153
SF-SG      9
SF-PF      7
SG-SF      6
SG-PG      6
PF-C       3
PG-SG      3
PF-SF      3
C-PF       3
Name: Pos, dtype: int64

### 2.2.3 Numeric Data Exploring

In [29]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rk,992.0,269.4688,154.4548,1.0,127.0,253.5,411.0,540.0
Age,992.0,26.82056,4.156143,19.0,23.0,27.0,30.0,40.0
G,992.0,37.43044,19.30192,1.0,21.0,36.0,54.0,72.0
GS,992.0,17.08367,19.73375,0.0,2.0,9.0,27.0,72.0
MP,992.0,21.61371,7.725259,2.0,16.3,21.5,27.5,37.6
FG,992.0,3.560484,2.044152,0.0,2.2,3.2,4.5,11.2
FGA,992.0,7.75,4.368978,0.0,4.7,6.8,9.5,23.0
FG%,988.0,0.4616457,0.09202866,0.0,0.408,0.446,0.503,1.0
3P,992.0,1.054738,0.8561039,0.0,0.375,0.9,1.6,5.3
3PA,992.0,2.97379,2.205433,0.0,1.2,2.6,4.4,12.7


3P% has a lower standard deviation than 2P%, which could be worth investigating.

## 2.3 Examine the Means and Medians

Examine the means and medians for points, rebounds, and assists

In [30]:
df[['PTS','TRB', 'AST']].mean()

PTS    9.671270
TRB    4.061089
AST    2.229234
dtype: float64

In [31]:
df[['PTS','TRB', 'AST']].median()

PTS    8.45
TRB    3.50
AST    1.70
dtype: float64

## 2.4 Profiling

In [32]:
report = df.profile_report(sort='None', html={'style':{'full_width': True}}, progress_bar=False)
report

exception calling callback for <Future at 0x7fb53413d310 state=finished returned list>
Traceback (most recent call last):
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backe

exception calling callback for <Future at 0x7fb534286670 state=finished raised PicklingError>
Traceback (most recent call last):
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/queues.py", line 153, in _feed
    obj_ = dumps(obj, reducers=reducers)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/reduction.py", line 271, in dumps
    dump(obj, buf, reducers=reducers, protocol=protocol)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/reduction.py", line 264, in dump
    _LokyPickler(file, reducers=reducers, protocol=protocol).dump(obj)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/cloudpickle/cloudpickle_fast.py", line 602, in dump
    return Pickler.dump(self, obj)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 267, in __reduce__
    self._reducer_callbac

    self.terminate_broken(bpe)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 742, in terminate_broken
    for work_id, work_item in self.pending_work_items.items():
RuntimeError: dictionary changed size during iteration
exception calling callback for <Future at 0x7fb5338b7760 state=finished raised PicklingError>
Traceback (most recent call last):
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/queues.py", line 153, in _feed
    obj_ = dumps(obj, reducers=reducers)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/reduction.py", line 271, in dumps
    dump(obj, buf, reducers=reducers, protocol=protocol)
  File "/Users/liqingyang/opt/anaconda3/lib/python3.9/site-packages/joblib/externals/loky/backend/reduction.py", line 264, in dump
    _LokyPickler(file, reducers=reducers, protocol=protocol).dump(obj)
  File "/Users/liqi

KeyboardInterrupt: 

In [74]:
profile_report = df.profile_report(explorative=True, html={'style': {'full_width': True}})
profile_report

KeyboardInterrupt: 

## 2.5 Dropping Certain Missing Values

In [75]:
# Collect the observations without a field goal percentage; these are the ones we want to get rid of
missing_fg_mask = df['FG%'].isna()
temp = df[missing_fg_mask]
temp

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary


**It seems that Gary Clark appears on this list several times; we should eliminate him from the dataset.**

In [76]:
# Keep only the observations that have a field goal percentage.
df = df[df['FG%'].notna()]
df

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary
0,1,Precious Achiuwa\achiupr01,PF,21,MIA,61,4,12.1,2.0,3.7,...,19.5,0.3,1.0,1.3,0.085,-3.6,-0.5,-4.1,-0.4,2711280
2,3,Steven Adams\adamsst01,C,27,NOP,58,58,27.7,3.3,5.3,...,11.7,2.3,1.7,4.0,0.119,-0.4,0.1,-0.3,0.7,17073171
3,4,Bam Adebayo\adebaba01,C,23,MIA,64,64,33.5,7.1,12.5,...,23.7,5.6,3.2,8.8,0.197,2.9,2.0,4.9,3.7,28103550
7,5,LaMarcus Aldridge\aldrila01,C,35,SAS,21,18,25.9,5.5,11.8,...,22.2,0.5,0.6,1.1,0.080,-0.2,-0.2,-0.3,0.3,2641691
10,5,LaMarcus Aldridge\aldrila01,C,35,BRK,5,5,26.0,5.0,9.6,...,22.2,0.5,0.6,1.1,0.080,-0.2,-0.2,-0.3,0.3,2641691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338,536,Delon Wright\wrighde01,PG,28,SAC,27,8,25.8,3.9,8.3,...,16.3,2.5,1.8,4.3,0.117,0.6,1.5,2.2,1.8,8526316
1341,537,Thaddeus Young\youngth01,PF,32,CHI,68,23,24.3,5.4,9.7,...,22.3,2.8,2.2,5.1,0.147,1.9,1.4,3.3,2.2,14190000
1342,538,Trae Young\youngtr01,PG,22,ATL,63,63,33.7,7.7,17.7,...,33.0,5.9,1.3,7.2,0.163,5.3,-1.7,3.7,3.0,8326471
1343,539,Cody Zeller\zelleco01,C,28,CHO,48,21,20.9,3.8,6.8,...,18.3,2.1,1.1,3.3,0.156,-0.2,-0.2,-0.5,0.4,2389641


In [77]:
df["eFG%"].isna().sort_values()

0       False
905     False
904     False
903     False
902     False
        ...  
410     False
408     False
407     False
492     False
1344    False
Name: eFG%, Length: 454, dtype: bool

In [78]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rk,454.0,270.6894,157.7435,1.0,128.25,267.5,406.75,540.0
Age,454.0,25.96696,4.248491,19.0,22.0,26.0,29.0,40.0
G,454.0,43.9163,19.59033,1.0,26.25,47.0,61.0,72.0
GS,454.0,22.96035,23.08786,0.0,3.0,13.0,40.75,72.0
MP,454.0,22.85022,8.123187,3.0,17.4,23.35,29.675,37.6
FG,454.0,3.919604,2.269588,0.0,2.3,3.5,5.075,11.2
FGA,454.0,8.419604,4.745942,0.5,4.8,7.6,10.975,23.0
FG%,454.0,0.4667423,0.08682955,0.0,0.418,0.45,0.50475,1.0
3P,454.0,1.180396,0.937089,0.0,0.4,1.0,1.8,5.3
3PA,454.0,3.247577,2.36724,0.0,1.325,2.85,4.9,12.7


***It seems that a lot of players are missing a three-point percentage; let's see if we should keep them.***

In [79]:
df.isna().any()

Rk        False
Player    False
Pos       False
Age       False
Tm        False
G         False
GS        False
MP        False
FG        False
FGA       False
FG%       False
3P        False
3PA       False
3P%        True
2P        False
2PA       False
2P%       False
eFG%      False
FT        False
FTA       False
FT%        True
ORB       False
DRB       False
TRB       False
AST       False
STL       False
BLK       False
TOV       False
PF        False
PTS       False
PER       False
TS%       False
3PAr      False
FTr       False
ORB%      False
DRB%      False
TRB%      False
AST%      False
STL%      False
BLK%      False
TOV%      False
USG%      False
OWS       False
DWS       False
WS        False
WS/48     False
OBPM      False
DBPM      False
BPM       False
VORP      False
Salary    False
dtype: bool

In [80]:
# Sort by three-point percentage, then show the rows where it is missing.
a = df.sort_values(by='3P%')
a[a['3P%'].isna()]

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary
19,9,Jarrett Allen\allenja01,C,22,BRK,12,5,26.7,3.7,5.4,...,16.6,4.3,2.1,6.4,0.166,1.3,-0.2,1.1,1.4,20000000
54,23,Udoka Azubuike\azubuud01,C,21,UTA,15,0,3.8,0.3,0.6,...,12.4,0.0,0.1,0.1,0.119,-5.5,1.7,-3.8,0.0,2075880
164,75,Moses Brown\brownmo01,C,21,OKC,43,32,21.4,3.4,6.2,...,16.9,1.3,1.3,2.6,0.138,-1.3,-1.2,-2.4,-0.1,1701593
188,91,Clint Capela\capelca01,C,26,ATL,63,63,30.1,6.6,11.0,...,19.9,4.9,3.3,8.2,0.207,2.7,0.0,2.7,2.2,17103448
220,100,Gary Clark\clarkga01,PF,26,PHI,2,0,6.5,0.0,0.5,...,10.1,-0.3,0.4,0.1,0.008,-4.2,-0.7,-4.9,-0.5,377645
292,119,Ed Davis\davised01,C,31,MIN,23,7,13.0,0.8,1.9,...,7.9,0.5,0.4,0.8,0.135,-3.3,1.1,-2.2,0.0,2641691
341,137,Andre Drummond\drumman01,C,27,LAL,21,21,24.8,4.9,9.1,...,27.6,-0.6,2.5,2.0,0.077,-1.7,0.5,-1.2,0.3,2401537
393,168,Daniel Gafford\gaffoda01,PF,22,CHI,31,11,12.4,1.9,2.8,...,16.7,2.1,1.3,3.5,0.209,0.6,1.6,2.2,0.8,1782621
396,168,Daniel Gafford\gaffoda01,C,22,WAS,23,0,17.7,4.1,6.0,...,16.7,2.1,1.3,3.5,0.209,0.6,1.6,2.2,0.8,1782621
521,211,Isaiah Hartenstein\harteis01,C,22,DEN,30,0,9.1,1.4,2.7,...,19.2,0.7,0.9,1.6,0.134,-2.4,1.5,-0.9,0.2,3430810


**Upon inspection, 3P% is only missing for some big men, since most of them play C (center) or PF. We will keep these missing values, but we do need to get rid of some duplicates. Moreover, it seems that some of these rows are here only because of their advanced stats, so we will get rid of some of these observations later.**

In [81]:
df[df.duplicated(subset=['Player', 'Tm'])].head(20)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary


***We can get rid of rows that are duplicated on player name and team. Certain players were definitely traded, but it appears that some of them show up multiple times. I decided to get rid of the rows for TOT, which stands for total. I think a player's performance on different teams might affect his salary.***

In [82]:
# Flag the TOT rows, then count the distinct team codes that remain without them.
condition = df['Tm'].eq('TOT')
len(df.loc[~condition, 'Tm'].unique())

30

In [83]:
# Keep every row that is not flagged by the TOT condition.
df = df[~condition]

In [84]:
# Keep only the first row for each (Player, Tm) pair.
# NOTE(review): which row counts as "first" depends on the row order produced by the earlier
# joins. Confirm that the kept row carries the intended stats.
df = df.drop_duplicates(subset=['Player', 'Tm'])

### We got rid of the observations that repeated the same pair of player and team.

In [85]:
df[df.duplicated(subset=['Player', 'Tm'])]

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary


In [None]:
# Show the rows that have no free throw percentage.
df[df['FT%'].isna()]

It seems that these four players are missing a free throw percentage because they barely played any games. We can keep them, since they have relatively low salaries as well.

### After inspecting all missing values, we decided to keep the current data frame and move forward.

In [None]:
# Bind a second name to the cleaned frame (the same object, not a copy)
df_after_wrangling = df

In [None]:
datapath = '../Capstone_Project2/Raw_data/'
# DataFrame.to_csv returns None when it is given a path, so the frame and the write are kept
# separate; assigning the call's result would replace df_after_wrangling with None.
df_after_wrangling = df
# The target is built from datapath and resolves to the same file as before.
df_after_wrangling.to_csv(datapath + 'df1.csv')

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Salary
0,1,Precious Achiuwa\achiupr01,PF,21,MIA,61,4,12.1,2.0,3.7,...,19.5,0.3,1.0,1.3,0.085,-3.6,-0.5,-4.1,-0.4,2711280
2,3,Steven Adams\adamsst01,C,27,NOP,58,58,27.7,3.3,5.3,...,11.7,2.3,1.7,4.0,0.119,-0.4,0.1,-0.3,0.7,17073171
3,4,Bam Adebayo\adebaba01,C,23,MIA,64,64,33.5,7.1,12.5,...,23.7,5.6,3.2,8.8,0.197,2.9,2.0,4.9,3.7,28103550
7,5,LaMarcus Aldridge\aldrila01,C,35,SAS,21,18,25.9,5.5,11.8,...,22.2,0.5,0.6,1.1,0.080,-0.2,-0.2,-0.3,0.3,2641691
10,5,LaMarcus Aldridge\aldrila01,C,35,BRK,5,5,26.0,5.0,9.6,...,22.2,0.5,0.6,1.1,0.080,-0.2,-0.2,-0.3,0.3,2641691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338,536,Delon Wright\wrighde01,PG,28,SAC,27,8,25.8,3.9,8.3,...,16.3,2.5,1.8,4.3,0.117,0.6,1.5,2.2,1.8,8526316
1341,537,Thaddeus Young\youngth01,PF,32,CHI,68,23,24.3,5.4,9.7,...,22.3,2.8,2.2,5.1,0.147,1.9,1.4,3.3,2.2,14190000
1342,538,Trae Young\youngtr01,PG,22,ATL,63,63,33.7,7.7,17.7,...,33.0,5.9,1.3,7.2,0.163,5.3,-1.7,3.7,3.0,8326471
1343,539,Cody Zeller\zelleco01,C,28,CHO,48,21,20.9,3.8,6.8,...,18.3,2.1,1.1,3.3,0.156,-0.2,-0.2,-0.5,0.4,2389641
