# Wczytanie danych oraz wstępny wybór

In [None]:
import pandas as pd
import numpy as np
# Raise pandas' display limits so the wide stats table is shown without truncation.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_info_columns", 50)
pd.set_option("display.max_rows", 100)

# Columns kept from the raw season-stats file, in the order they are used downstream.
cols = [
    'Year', 'Player', 'Pos', 'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
    'PF', 'PTS', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
    'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]

# Read the file (decoded as latin-1), keep only the selected columns, restrict to
# 1982 <= Year < 2022, and renumber the rows from 0.
data_source = (
    pd.read_csv(r"data_2022/seasons_stats.csv", encoding='latin-1')[cols]
    .query("Year >= 1982 and Year < 2022")
    .reset_index(drop=True)
)
data_source.info()  # the author recorded 20935 rows for this step

In [None]:
# Preview rows 15-30 (by position) of the entries whose (Year, Player) pair appears
# more than once; keep=False marks every occurrence, not only the repeats.
repeated_player_seasons = data_source.duplicated(subset=['Year', 'Player'], keep=False)
data_source.loc[repeated_player_seasons].iloc[15:31]

# Usunięcie duplikatów

In [69]:
# Start from a new frame (data_source itself is left untouched) without the rows
# in which every column is missing.
data = data_source.dropna(axis=0, how='all')

# Diagnostics only: neither value is displayed, because they are not the cell's
# last expression. The results noted here are the ones the author recorded.
data.duplicated().any()                            # fully identical rows -> False
data.duplicated(subset=['Year', 'Player']).any()   # repeated (Year, Player) pairs -> True

# Keep only the first row of every (Year, Player) pair.
data = data.drop_duplicates(subset=['Year', 'Player'], keep='first')

# Dodanie kolumny MVP

In [70]:
# Load the MVP winners sheet; it has no header row, so the two columns are named here.
mvp_winners_data = pd.read_excel(r"data/mvp_winners.xlsx",header=None,names=['Year','Player'])

# The season label is read as a 4-digit start year followed by a 2-digit end year
# (presumably "1981-82" -- TODO confirm against the sheet). The stats table is keyed
# by the season's END year, so use start year + 1. Gluing the century digits onto
# the 2-digit suffix would turn "1999-00" into 1900 and silently drop that season's
# MVP from the merge below.
mvp_winners_data['Year'] = mvp_winners_data['Year'].str[:4].astype('double') + 1
mvp_winners_data['MVP'] = 1

# Strip the '*' marker from player names so they can match the winners list.
# '*' alone is not a valid regular expression, so request a literal replacement
# explicitly instead of relying on the version-dependent default of `regex`.
# `assign` builds a new frame rather than writing into the filtered one.
data = data.assign(Player=data['Player'].str.replace('*', '', regex=False))
data = pd.merge(data,mvp_winners_data,on=['Year','Player'],how='left')

# The 2021 winner is flagged by hand and matched on the name prefix only
# (presumably because the accented last letter does not survive the latin-1
# decoding -- TODO confirm). na=False keeps rows with a missing name out of the match.
is_2021_jokic = (data['Year'] == 2021) & data['Player'].str.contains('Nikola Joki', regex=False, na=False)
data.loc[is_2021_jokic, 'MVP'] = 1

# Every player-season without a match is a non-MVP season.
data['MVP'] = data['MVP'].fillna(0)


# Wypełnienie brakujących wartości 

In [71]:
# Percentage of empty cells in the whole table, rounded to 4 decimal places.
missing_cels = data.isna().to_numpy().sum()   # count of NaN cells
all_cels = data.size                          # rows * columns
print("Missing values:", round(missing_cels / all_cels * 100,4),"%")


Missing values: 0.411 %


In [73]:
# Drop the rows that have no usage rate (the original note says this removes 3
# insignificant players). `.copy()` makes the result an independent frame so the
# assignment below is not applied to a slice of the old one.
data = data[data['USG%'].notna()].copy()

# Percentage / rate columns that still contain NaN; presumably the player had no
# attempts, so the ratio was undefined in the source -- TODO confirm. These rows are
# kept and the missing ratios are set to 0.
ratio_cols = ['FG%', '2P%', '3P%', 'eFG%', 'FT%', 'TS%', '3PAr', 'FTr', 'TOV%']

# Assign the filled columns back in one step. Calling `fillna(..., inplace=True)`
# on `data[col]` is chained assignment: it raises warnings and leaves `data`
# unchanged when pandas Copy-on-Write is active.
data[ratio_cols] = data[ratio_cols].fillna(0)

# Share of missing values per column after cleaning (expected to be 0 everywhere).
data.isnull().mean()


Year      0.0
Player    0.0
Pos       0.0
Age       0.0
G         0.0
GS        0.0
MP        0.0
FG        0.0
FGA       0.0
FG%       0.0
3P        0.0
3PA       0.0
3P%       0.0
2P        0.0
2PA       0.0
2P%       0.0
eFG%      0.0
FT        0.0
FTA       0.0
FT%       0.0
ORB       0.0
DRB       0.0
TRB       0.0
AST       0.0
STL       0.0
BLK       0.0
TOV       0.0
PF        0.0
PTS       0.0
PER       0.0
TS%       0.0
3PAr      0.0
FTr       0.0
ORB%      0.0
DRB%      0.0
TRB%      0.0
AST%      0.0
STL%      0.0
BLK%      0.0
TOV%      0.0
USG%      0.0
OWS       0.0
DWS       0.0
WS        0.0
WS/48     0.0
OBPM      0.0
DBPM      0.0
BPM       0.0
VORP      0.0
MVP       0.0
dtype: float64

# Zapisanie gotowych danych

In [None]:
# Persist the cleaned table. The DataFrame index is written as the first, unnamed
# CSV column (pandas' default, spelled out here).
data.to_csv(r"data/cleared_data.csv", index=True)

# Print column dtypes and non-null counts of the frame that was just saved.
data.info()

In [75]:
# Show the first five rows of the cleaned table (five is head()'s default).
data.head()

Unnamed: 0,Year,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,MVP
0,1982,Kareem Abdul-Jabbar,C,34.0,76,76.0,2677.0,753,1301,0.579,0.0,3.0,0.0,753,1298,0.58,0.579,312,442,0.706,172.0,487.0,659.0,225,63.0,207.0,230.0,224,1818,23.4,0.608,0.002,0.34,7.3,19.1,13.4,11.9,1.1,4.1,13.3,25.6,6.9,3.9,10.7,0.192,2.9,1.4,4.3,4.2,0.0
1,1982,Alvan Adams,C,27.0,79,75.0,2393.0,507,1027,0.494,0.0,1.0,0.0,507,1026,0.494,0.494,182,233,0.781,138.0,448.0,586.0,356,114.0,78.0,196.0,269,1196,18.6,0.529,0.001,0.227,6.6,20.2,13.6,22.1,2.3,1.9,14.8,22.8,2.5,4.7,7.2,0.144,0.9,2.9,3.7,3.5,0.0
2,1982,Mark Aguirre,SF,22.0,51,20.0,1468.0,381,820,0.465,25.0,71.0,0.352,356,749,0.475,0.48,168,247,0.68,89.0,160.0,249.0,164,37.0,22.0,135.0,152,955,17.3,0.514,0.087,0.301,6.7,12.9,9.7,18.6,1.2,0.9,12.7,29.8,1.0,0.8,1.9,0.061,1.8,-1.7,0.1,0.8,0.0
3,1982,Danny Ainge,SG,22.0,53,1.0,564.0,79,221,0.357,5.0,17.0,0.294,74,204,0.363,0.369,56,65,0.862,25.0,31.0,56.0,87,37.0,3.0,53.0,86,219,10.1,0.439,0.077,0.294,5.0,5.9,5.5,19.7,3.1,0.3,17.5,21.5,-0.3,0.8,0.5,0.042,-2.6,-0.1,-2.6,-0.1,0.0
4,1982,Tiny Archibald,PG,33.0,68,51.0,2167.0,308,652,0.472,6.0,16.0,0.375,302,636,0.475,0.477,236,316,0.747,25.0,91.0,116.0,541,52.0,3.0,178.0,131,858,14.3,0.542,0.025,0.485,1.3,4.5,2.9,31.9,1.1,0.1,18.4,17.9,3.4,1.8,5.2,0.115,0.1,-2.5,-2.5,-0.2,0.0
