# Data Wrangling for Random Sampling of Players

In [9]:
import pandas as pd
# Load the raw per-season player statistics from a CSV in the working directory.
df = pd.read_csv('Seasons_Stats.csv')

In [10]:
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Clean up the dataset for sampling purposes:
#   1. discard the first column by position,
#   2. keep only rows whose Year is 1993 or later,
#   3. remove the 'blanl' and 'blank2' columns,
#   4. copy Year into Year1 so the value stays available as a regular column
#      once Year is moved into the index further down.
df = (
    df.iloc[:, 1:]
      .loc[lambda frame: frame.Year >= 1993]
      .drop(columns=['blanl', 'blank2'])
      .assign(Year1=lambda frame: frame.Year)
)

In [11]:
# Cast the listed columns to integer dtype in a single astype call.

l = ['Year','Age', 'G', 'GS']
df = df.astype({feature: 'int' for feature in l})

In [12]:
# Convert the season-total stats to per-game stats for readability.
#
# Every column listed in `to_per_game` is divided by the player's games
# played (G). The division is done for all rows and columns at once;
# iterating with iterrows() and reading/writing one cell at a time through
# .loc gives the same numbers but is dramatically slower on a table this size.

to_per_game = [ 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA','FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL','BLK', 'TOV', 'PF', 'PTS']
# axis=0 aligns the G series with the rows, so each row is divided by its own G.
df[to_per_game] = df[to_per_game].div(df['G'], axis=0)

In [13]:
# Read in the team_win_ratio csv file that was scraped in the other jupyter
# notebook and merge it with the Season Stats dataframe, so each player row
# also carries the W/L% of its team for that year.

df_team = pd.read_csv('team_win_ratio.csv')
df_team = df_team.iloc[:, 2:]
df_team1 = df_team[['Team','Year', 'W/L%']]
# Rename so the team column matches the key name used in the player table.
df_team1 = df_team1.rename(columns={'Team':'Tm'})
# Left join: player rows without a matching (Tm, Year) are kept with NaN W/L%.
df = pd.merge(df, df_team1, how ='left', on=['Tm', 'Year'])

# Take out players who were traded: count the rows per (Year, Player) and keep
# only the pairs that have exactly one non-null Tm entry. Pairs with more than
# one row are removed entirely.

x = df.groupby(['Year','Player']).count()
df = df.set_index(['Year','Player'])
# Align the per-pair flag with the (possibly repeated) row index explicitly
# rather than relying on pandas to reindex a boolean Series key implicitly,
# which emits a UserWarning and fails on keys it cannot align. Rows whose key
# is missing from the grouped counts are treated as "not kept".
single_team = (x.Tm == 1).reindex(df.index, fill_value=False)
df = df[single_team.values]

  


In [14]:
# Create a new column called MVP: each player-season gets a 1 if the player
# won MVP that season and a 0 otherwise.

df['MVP'] = 0
MVP = ["Shaquille O'Neal*", 'Allen Iverson*', 'Tim Duncan','Tim Duncan', 'Kevin Garnett', 'Steve Nash', 'Steve Nash', 'Dirk Nowitzki', 'Kobe Bryant', 'LeBron James','LeBron James','Derrick Rose', 'LeBron James','LeBron James','Kevin Durant','Stephen Curry','Stephen Curry','Russell Westbrook']
Season = [2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017]

# zip() silently truncates to the shorter list, so make sure the two lists
# really pair up one-to-one before using them.
if len(MVP) != len(Season):
    raise ValueError('MVP and Season must have the same number of entries')
MVP_winner = list(zip(MVP,Season))

# Assigning through .loc with a (Year, Player) key that is not in the index
# does not fail: it appends a brand-new, otherwise empty row. Check every key
# first so a misspelled name or a filtered-out player is reported instead of
# quietly corrupting the table.
missing = [(year, mvp) for mvp, year in MVP_winner if (year, mvp) not in df.index]
if missing:
    raise KeyError('MVP rows not found in df index (Year, Player): {}'.format(missing))

for mvp, year in MVP_winner:
    df.loc[(year,mvp),'MVP'] = 1

# Separate MVPs and non-MVPs into two different dataframes.

df_mvps = df[df.MVP == 1]
df_non_mvps = df[df.MVP == 0]
# Only the MVP frame drops the helper Year1 column; df_non_mvps keeps it.
df_mvps = df_mvps.drop('Year1', axis = 1)

The following cell randomly samples 99 non-MVP players from each season (2000 through 2017) in the `df_non_mvps` dataframe. The sampled rows are then concatenated with the MVP dataframe to produce a dataset that is ready for classification.

In [15]:
# df1 contains the randomly sampled data that will be used to train our model
# to classify MVPs and non MVPs.

# Collect one sample per season and concatenate once at the end, instead of
# growing a frame with pd.concat on every iteration (which re-copies all
# previously collected rows each time and starts from an empty DataFrame).
yearly_samples = []
for year in range(2000,2018):
    # 99 non-MVP rows for this season, drawn without replacement; the fixed
    # random_state makes the draw reproducible.
    random_sample = df_non_mvps.loc[(year,slice(None))].sample(99, replace = False, random_state = 4)
    yearly_samples.append(random_sample)
df1 = pd.concat(yearly_samples, axis = 0)

# Rebuild a (Player, Year) index for the samples, taking the season from the
# Year1 helper column.
df1 = df1.reset_index()
df1 = df1.rename(columns={'Year1':'Year'})
df1= df1.set_index(['Player','Year'])
# Put the MVP rows in the same (Player, Year) level order. Ordering by level
# name (rather than swapping) gives the same result on the first run and does
# not flip the levels back if this cell is executed again.
df_mvps = df_mvps.reorder_levels(['Player','Year'])
df1 = pd.concat([df1, df_mvps], axis = 0)

In [16]:
# Combine the MVP and non-MVP rows into one clean data set, written to disk so
# that statistical analysis and EDA can be done on it.

df_mvp1 = df_mvps
# df_non_mvps is indexed by (Year, Player). Order the MVP index levels by name
# so the two frames line up no matter which order df_mvps currently has them in
# (a plain swaplevel() is only right if the previous cell ran exactly once).
df_mvp1 = df_mvp1.reorder_levels(['Year','Player'])
df_clean = pd.concat([df_mvp1,df_non_mvps], axis = 0)
# NOTE(review): df_non_mvps still has the Year1 column while the MVP frame had
# it dropped, so Year1 will be NaN for the MVP rows in the file - confirm this
# is intended before relying on that column downstream.
df_clean.to_csv('Clean NBA Player Data.csv')

In [17]:
# Preview the first rows of the combined frame (sampled non-MVP rows followed by the MVP rows).
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,W/L%,MVP
Player,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
Kevin Willis,2000.0,PF,37,TOR,79,1,21.253165,12.3,0.471,0.005,0.288,13.0,19.4,16.1,4.8,1.1,2.0,13.3,19.5,0.7,1.6,2.3,0.066,-3.1,-1.2,-4.4,-1.0,2.987342,7.202532,0.415,0.012658,0.037975,0.333,2.974684,7.164557,0.415,0.416,1.658228,2.075949,0.799,2.544304,3.556962,6.101266,0.620253,0.455696,0.607595,1.240506,3.240506,7.64557,0.549,0
Kenny Anderson,2000.0,PG,29,BOS,82,82,31.621951,17.4,0.524,0.223,0.257,2.3,7.9,4.9,26.7,2.7,0.2,10.6,20.5,5.4,1.9,7.3,0.136,2.6,-1.4,1.2,2.1,5.292683,12.02439,0.44,1.036585,2.682927,0.386,4.256098,9.341463,0.456,0.483,2.390244,3.085366,0.775,0.670732,2.073171,2.743902,5.121951,1.695122,0.097561,1.585366,2.804878,14.012195,0.427,0
Randy Brown,2000.0,PG,31,CHI,59,55,27.542373,7.3,0.402,0.014,0.193,1.6,8.9,5.2,22.6,2.0,0.7,18.2,16.2,-2.0,1.2,-0.8,-0.024,-5.2,-0.5,-5.7,-1.5,2.661017,7.372881,0.361,0.050847,0.101695,0.5,2.610169,7.271186,0.359,0.364,1.050847,1.423729,0.738,0.389831,2.050847,2.440678,3.423729,1.033898,0.254237,1.779661,2.033898,6.423729,0.207,0
Allen Iverson*,2000.0,SG,24,PHI,70,70,40.757143,20.0,0.496,0.151,0.358,2.7,7.7,5.2,23.0,2.6,0.1,10.3,34.4,3.3,3.6,6.9,0.116,3.4,-0.8,2.6,3.3,10.414286,24.757143,0.421,1.271429,3.728571,0.341,9.142857,21.028571,0.435,0.446,6.314286,8.857143,0.713,1.014286,2.8,3.814286,4.685714,2.057143,0.071429,3.285714,2.314286,28.414286,0.598,0
Vince Carter,2000.0,SF,23,TOR,82,82,38.121951,23.4,0.543,0.139,0.325,5.2,12.1,8.5,20.5,1.8,2.1,8.4,30.0,9.1,2.7,11.8,0.182,5.2,-0.6,4.6,5.2,9.609756,20.682927,0.465,1.158537,2.878049,0.403,8.45122,17.804878,0.475,0.493,5.317073,6.719512,0.791,1.829268,3.97561,5.804878,3.926829,1.341463,1.121951,2.170732,3.207317,25.695122,0.549,0
