# Imports

In [192]:
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Concat Function

In [44]:
def concat(year, data_dir='./total'):
    """Load two consecutive season files and stack them into one DataFrame.

    Reads ``total_{year-1}_{year}.csv`` and ``total_{year}_{year+1}.csv`` from
    ``data_dir`` and concatenates them row-wise, earlier season first.

    Parameters
    ----------
    year : int
        The calendar year shared by the two seasons (the year the first
        season ends in and the second one starts in).
    data_dir : str or path-like, default './total'
        Directory that holds the per-season CSV files. The default keeps the
        original hard-coded location.

    Returns
    -------
    pandas.DataFrame
        Rows of the earlier season followed by rows of the later season,
        re-indexed from 0.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when either season file is missing.
    """
    season_files = (
        f'{data_dir}/total_{year - 1}_{year}.csv',
        f'{data_dir}/total_{year}_{year + 1}.csv',
    )
    season_frames = [pd.read_csv(path) for path in season_files]
    return pd.concat(season_frames, ignore_index=True)

In [176]:
# Each even year from 1986 to 2012 pulls in the season ending that year and
# the season starting that year, so the 1985-86 through 2012-13 files are
# each read exactly once.
all_data = []
for year in range(1986, 2014, 2):
    all_data.append(concat(year))

# Stack the two-season chunks into one table and persist it for later cells.
df_final = pd.concat(objs=all_data, ignore_index=True)
df_final.to_csv('./df_final.csv', index=False)

In [177]:
# Reload the combined table from the CSV written by the cell above.
df_final = pd.read_csv('./df_final.csv')

In [178]:
# name, season, and team will all be dropped before modeling. There are no null
# values; the predicted (award) columns still need to be encoded and cast to int.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11736 entries, 0 to 11735
Data columns (total 76 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   name                               11736 non-null  object 
 1   mpg                                11736 non-null  float64
 2   fgm_per_g                          11736 non-null  float64
 3   fga_per_g                          11736 non-null  float64
 4   fg_pct                             11736 non-null  float64
 5   fg3m_per_g                         11736 non-null  float64
 6   fg3a_per_g                         11736 non-null  float64
 7   fg3_pct                            11736 non-null  float64
 8   fg2m_per_g                         11736 non-null  float64
 9   fg2a_per_g                         11736 non-null  float64
 10  fg2_pct                            11736 non-null  float64
 11  efg_pct                            11736 non-null  flo

In [179]:
# The trailing seven columns are treated as the award (target) flags;
# cast all of them to int in a single step.
award_columns = df_final.columns[-7:]
df_final[award_columns] = df_final[award_columns].astype(int)

In [180]:
# Re-inspect the dtypes after casting the award columns.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11736 entries, 0 to 11735
Data columns (total 76 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   name                               11736 non-null  object 
 1   mpg                                11736 non-null  float64
 2   fgm_per_g                          11736 non-null  float64
 3   fga_per_g                          11736 non-null  float64
 4   fg_pct                             11736 non-null  float64
 5   fg3m_per_g                         11736 non-null  float64
 6   fg3a_per_g                         11736 non-null  float64
 7   fg3_pct                            11736 non-null  float64
 8   fg2m_per_g                         11736 non-null  float64
 9   fg2a_per_g                         11736 non-null  float64
 10  fg2_pct                            11736 non-null  float64
 11  efg_pct                            11736 non-null  flo

In [181]:
# Build one 'target' label per row: the names of every award flag equal to 1,
# joined with ', ' in column order (an empty string when the row has none).
# The flag columns are only read, never overwritten, so re-running this cell
# yields the same labels instead of turning every row into "no award".
award_names = [str(name) for name in award_columns]
award_won = df_final[list(award_columns)].eq(1).to_numpy()
df_final['target'] = [
    ', '.join(name for name, won in zip(award_names, row) if won)
    for row in award_won
]

In [182]:
# Rows without any award have an empty target string; label them 'no_award'.
df_final['target'] = df_final['target'].replace('', 'no_award')

In [183]:
# Check how many rows fall into each combined award label (class balance).
df_final['target'].value_counts()

target
no_award          11271
team_2nd            131
team_3rd            117
team_1st            105
MVP, team_1st        28
SMOY                 27
MIP                  23
DPOY, team_1st        9
DPOY, team_2nd        7
DPOY                  7
DPOY, team_3rd        5
MIP, team_2nd         3
MIP, team_3rd         2
SMOY, team_3rd        1
Name: count, dtype: int64

# Label encode the target column

In [184]:
# Map each distinct target string to an integer code.
# NOTE(review): LabelEncoder numbers the classes in sorted order, so the codes
# are nominal identifiers and carry no ordinal meaning.
le = LabelEncoder()
df_final['target_encoded'] = le.fit_transform(df_final['target'])

In [185]:
# Class names in code order: the label at position i is encoded as integer i.
le.classes_

array(['DPOY', 'DPOY, team_1st', 'DPOY, team_2nd', 'DPOY, team_3rd',
       'MIP', 'MIP, team_2nd', 'MIP, team_3rd', 'MVP, team_1st', 'SMOY',
       'SMOY, team_3rd', 'no_award', 'team_1st', 'team_2nd', 'team_3rd'],
      dtype=object)

In [190]:
# Drop everything that must not reach the analysis: the raw award flag columns,
# the season/team/name columns, and the string 'target' label (its integer
# form 'target_encoded' is kept). One non-mutating drop replaces the two
# in-place drops, so the frame is never left half-modified if a label is missing.
columns_to_drop = list(award_columns) + ['season', 'team', 'name', 'target']
df_final = df_final.drop(columns=columns_to_drop)

# Check Correlations

In [199]:
# Correlation of every numeric column with the encoded target, strongest
# positive first.
# NOTE(review): 'target_encoded' holds alphabetical LabelEncoder codes, so these
# coefficients depend on an arbitrary class ordering; interpret with caution.
corr = (
    df_final
    .corr(numeric_only=True)[['target_encoded']]
    .sort_values(by='target_encoded', ascending=False)
)
corr

Unnamed: 0,target_encoded
target_encoded,1.000000
ftm_per_g,0.103688
ppg,0.099278
made_free_throws,0.098822
fga_per_g,0.096884
...,...
offensive_rebound_percentage,-0.025651
positions_Center,-0.030777
block_percentage,-0.039647
blk_per_g,-0.049625


# Standard Scale the Data

In [194]:
# Feature matrix: every remaining column except the encoded target.
X = df_final.drop(columns = 'target_encoded')

In [197]:
# Standardise every feature column to zero mean and unit variance, then show
# the result with the original column names.
# NOTE(review): the scaler is fit on all rows; if a train/test split is added
# later, fit it on the training rows only to avoid leakage.
ss = StandardScaler().fit(X)
X_scaled = ss.transform(X)
pd.DataFrame(data=X_scaled, columns=X.columns)

Unnamed: 0,mpg,fgm_per_g,fga_per_g,fg_pct,fg3m_per_g,fg3a_per_g,fg3_pct,fg2m_per_g,fg2a_per_g,fg2_pct,...,steals,blocks,turnovers,personal_fouls,points,positions_Center,positions_Point Guard,positions_Power Forward,positions_Shooting Guard,positions_Small Forward
0,-1.731342,-1.212437,-1.284439,-0.101471,-0.688791,-0.755499,-1.134624,-1.119031,-1.179238,-0.280748,...,-1.101595,-0.673671,-1.155184,-1.408938,-1.081310,-0.515109,-0.489833,1.929932,-0.493574,-0.483137
1,-0.158322,-0.332295,-0.394757,0.313749,-0.511809,-0.555012,0.220333,-0.220010,-0.270688,0.278352,...,-0.411501,-0.648009,-0.394852,-0.509094,-0.633030,-0.515109,2.041511,-0.518153,-0.493574,-0.483137
2,-1.270946,-1.036409,-0.903147,-1.598447,-0.688791,-0.688670,-1.134624,-0.929764,-0.772781,-1.599380,...,-1.076036,-0.699333,-1.080642,-1.321453,-1.052912,-0.515109,-0.489833,-0.518153,2.026038,-0.483137
3,-1.568285,-1.168430,-1.157342,-0.975618,-0.688791,-0.755499,-1.134624,-1.071715,-1.035783,-1.124672,...,-0.999359,-0.211752,-1.006099,-1.058999,-0.994088,1.941338,-0.489833,-0.518153,-0.493574,-0.483137
4,-0.100772,-0.244281,-0.140562,-0.254446,-0.511809,-0.621841,0.415447,-0.125376,0.040132,-0.375690,...,0.738655,-0.648009,0.529473,0.040811,-0.168524,-0.515109,2.041511,-0.518153,-0.493574,-0.483137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11731,0.311666,0.283804,0.473742,-0.276300,1.611974,1.650344,0.800254,-0.125376,-0.055505,-0.069767,...,-0.181470,-0.340063,-0.439578,-0.446605,0.180364,-0.515109,-0.489833,-0.518153,2.026038,-0.483137
11732,-0.791366,-0.948394,-0.924330,-0.505764,-0.157845,-0.154038,0.534683,-0.929764,-0.987964,-0.185807,...,-0.718210,-0.494036,-0.693022,-0.721557,-0.779076,-0.515109,-0.489833,-0.518153,2.026038,-0.483137
11733,1.337966,1.560009,1.193960,1.013066,-0.688791,-0.688670,-0.457145,1.861935,1.594230,0.837452,...,2.297756,0.712085,0.156761,0.890663,1.192542,-0.515109,-0.489833,1.929932,-0.493574,-0.483137
11734,-1.635426,-1.124423,-1.136159,-1.008398,-0.511809,-0.555012,-0.050658,-1.071715,-1.083601,-0.787103,...,-1.101595,-0.699333,-1.185001,-1.433934,-1.054940,1.941338,-0.489833,-0.518153,-0.493574,-0.483137
