# Final Project
Clay Kindiger and Haoyang Pang

## Load Libraries

In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,mean_squared_error, mean_absolute_error, r2_score

## Load Original Data

In [None]:
# Read the four raw CSV inputs into DataFrames.
# NOTE(review): this first file is read from the working directory while the other
# three live under data/ — confirm the path is intentional.
players_team = pd.read_csv('full_player_stats.csv')
advanced_stats = pd.read_csv('data/advanced_players.csv')
mvp_voting = pd.read_csv('data/mvp_voting_shares.csv')
dpoy_voting = pd.read_csv('data/dpoy_voting.csv')

In [None]:
players_team.head()

In [None]:
advanced_stats.head()

In [None]:
mvp_voting.head()

In [None]:
dpoy_voting.head()

## Clean Data

In [None]:
# Remove the 'mvp' and 'dpoy' columns from the player/team table.
players_team = players_team.drop(['mvp', 'dpoy'], axis=1)

In [None]:
# Label every ballot row with a season. The counter starts at 2023 and drops by one
# each time a row whose Rank is the string '1' is reached, so the running count of
# such rows (including the current one) is subtracted from 2023.
first_place_count = (dpoy_voting['Rank'] == '1').cumsum()
season = (2023 - first_place_count).tolist()
dpoy_voting['Season'] = season

In [None]:
# Keep only the season, player and voting columns, then preview the result.
dpoy_keep = ['Season', 'Player', 'Pts Won', 'Pts Max', 'Share']
dpoy_voting = dpoy_voting.loc[:, dpoy_keep]
dpoy_voting.head()

In [None]:
# Keep every row whose 'Player' value is not the literal string 'Player'.
is_data_row = advanced_stats['Player'].ne('Player')
advanced_stats = advanced_stats[is_data_row]

In [None]:
# For every (Player, Season) group with more than one row, map the index label of the
# group's first row to the team listed on the group's last row. Single-row groups are
# not recorded.
players_teams = {}
for ply, group in advanced_stats.groupby(['Player', 'Season']):
    if len(group) < 2:
        continue
    tm = group['Tm'].iloc[-1]
    players_teams[group.index[0]] = tm
    if tm == 'TOT':
        # Report groups whose last row carries the 'TOT' label so they can be checked by hand.
        print(ply)

In [None]:
# Hand-assigned teams, keyed by index label. They apply only to rows that have no
# entry in players_teams.
manual_team_by_index = {
    3613: 'UTA',
    21: 'MIA',
    4189: 'WAS',
    3037: 'BOS',
    11099: 'CHI',
    6128: 'NJN',
}

# Resolve each row's team: the players_teams entry when there is one, otherwise the
# manual override for that index, otherwise the row's existing 'Tm' value.
teams_list = [
    players_teams[idx] if idx in players_teams else manual_team_by_index.get(idx, current_tm)
    for idx, current_tm in zip(advanced_stats.index, advanced_stats['Tm'])
]
advanced_stats['Tm'] = teams_list

In [None]:
# Keep only the first row for each (Player, Season) pair.
advanced_stats = advanced_stats.drop_duplicates(subset=['Player', 'Season'], keep='first')

In [None]:
players_team.shape

In [None]:
advanced_stats.shape

In [None]:
advanced_stats.columns

In [None]:
players_team.columns

In [None]:
# Restrict the advanced table to its join keys plus the advanced metrics used later.
advanced_keys = ['Season', 'Player', 'Tm']
advanced_metrics = [
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]
advanced_stats = advanced_stats.loc[:, advanced_keys + advanced_metrics]

In [None]:
# Inner-join the player/team table with the advanced metrics on player, team and season,
# then show the size of the result.
full_players = pd.merge(
    players_team,
    advanced_stats,
    how='inner',
    on=['Player', 'Tm', 'Season'],
)
full_players.shape

In [None]:
full_players.head()

In [None]:
# Left-join the MVP voting table so every player row is kept; rows without a match
# get missing values in the voting columns.
full_players_mvp = pd.merge(
    full_players,
    mvp_voting,
    how='left',
    on=['Season', 'Player'],
)
full_players_mvp.shape

In [None]:
full_players_mvp

In [None]:
full_players_mvp[~full_players_mvp['Share'].isnull()]

In [None]:
# Left-join the DPOY voting table. Column names present on both sides are
# disambiguated with the '_mvp' (left) and '_dpoy' (right) suffixes.
full_players_awards = pd.merge(
    full_players_mvp,
    dpoy_voting,
    how='left',
    on=['Season', 'Player'],
    suffixes=('_mvp', '_dpoy'),
)
full_players_awards.shape

In [None]:
# Replace every missing value in the merged table with 0.
full_players_awards = full_players_awards.fillna(0)

In [None]:
# Write the merged player/award table to CSV without the index column.
full_players_awards.to_csv('full_players_dataset.csv',index = False)

In [None]:
# Read the merged table back from disk and preview its first rows.
full_dataset = pd.read_csv('full_players_dataset.csv')
full_dataset.head()

In [None]:
full_dataset.columns

In [None]:
# Binary targets: 1 when the player's vote share for the award is non-zero, else 0.
full_dataset['received_mvp_votes'] = full_dataset['Share_mvp'].ne(0).astype(int)
full_dataset['received_dpoy_votes'] = full_dataset['Share_dpoy'].ne(0).astype(int)

In [None]:
# Columns that are standardised within each season in the cells below; each one is
# later replaced by a '<name>_normalized' column.
cols_to_normalize = ['G','GS','W','L','FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%']

In [None]:
def _season_zscore(values):
    """Subtract the group's mean from a column and divide by the group's standard deviation."""
    return (values - values.mean()) / values.std()


# Standardise each selected column separately within every season.
by_season = full_dataset.groupby('Season')[cols_to_normalize]
normalized = by_season.transform(_season_zscore)

In [None]:
# Align the standardised columns with the original rows by index. The standardised
# copies carry a '_normalized' suffix; the original columns keep their names.
full_normalized = pd.merge(
    full_dataset,
    normalized,
    left_index=True,
    right_index=True,
    suffixes=('', '_normalized'),
)

In [None]:
full_normalized.head()

In [None]:
# Drop the un-standardised versions of the normalised columns.
full_normalized = full_normalized.drop(columns=cols_to_normalize)

In [None]:
full_normalized.head()

In [None]:
# Write the normalised table to CSV without the index column.
full_normalized.to_csv("normalized_data.csv",index=False)

## Load Cleaned Data

In [2]:
# Load the normalised table from disk, rebinding full_dataset, and preview its first rows.
full_dataset = pd.read_csv('normalized_data.csv')
full_dataset.head()

Unnamed: 0,Season,Player,Pos,Age,Tm,MP,Year,Team,W/L%,GB,...,3PAr_normalized,FTr_normalized,ORB%_normalized,DRB%_normalized,TRB%_normalized,AST%_normalized,STL%_normalized,BLK%_normalized,TOV%_normalized,USG%_normalized
0,1998,Mahmoud Abdul-Rauf,PG,28.0,SAC,530.0,1998,SAC,0.329,34.0,...,-0.223145,-1.479163,-1.061591,-1.082926,-1.256266,0.678781,-0.270151,-0.824493,-1.271023,1.028959
1,1998,Tariq Abdul-Wahad,SG,23.0,SAC,959.0,1998,SAC,0.329,34.0,...,-0.577654,0.059612,-0.333466,-0.775932,-0.686891,-0.475951,0.010661,-0.35274,-0.260471,0.499256
2,1998,Terry Dehere,PG,26.0,SAC,1410.0,1998,SAC,0.329,34.0,...,0.817135,-0.627813,-0.984946,-1.050611,-1.195261,0.967465,0.010661,-0.765524,0.118486,-0.118731
3,1998,Lawrence Funderburke,PF,27.0,SAC,1094.0,1998,SAC,0.329,34.0,...,-0.781061,0.403325,0.24137,0.419729,0.37052,-0.358339,-0.925378,-0.35274,-0.513109,0.358002
4,1998,Derek Grimm,SF,23.0,SAC,34.0,1998,SAC,0.329,34.0,...,4.094891,-1.034981,-1.291525,-0.000368,-0.72756,-1.427536,2.537968,0.29592,0.188663,0.728794


## Feature Selection

In [None]:
full_dataset.columns

## Create Models

In [3]:
# Column groups for the MVP vote-getter classifier, assembled in the order the model sees them.
MVP_TARGET = 'received_mvp_votes'
mvp_raw_features = ['Playoffs', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
mvp_scaled_stats = [
    'G', 'GS', 'W', 'L',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
]
mvp_columns = mvp_raw_features + [MVP_TARGET] + [f'{stat}_normalized' for stat in mvp_scaled_stats]
mvp_dataset = full_dataset[mvp_columns]

# Feature matrix (every column except the target) and the binary target vector.
Xm = np.array(mvp_dataset.drop(columns=[MVP_TARGET]))
ym = np.array(mvp_dataset[MVP_TARGET])

# Hold out 10% of the rows for testing, with a fixed seed.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    Xm, ym, test_size=0.1, random_state=42
)

In [4]:
# Column groups for the DPOY vote-getter classifier, assembled in the order the model sees them.
DPOY_TARGET = 'received_dpoy_votes'
dpoy_raw_features = ['Playoffs', 'DWS', 'WS', 'WS/48', 'DBPM', 'VORP']
dpoy_scaled_stats = [
    'G', 'GS', 'W', 'L',
    'ORB', 'DRB', 'TRB', 'STL', 'BLK',
    'PF', 'PTS', 'PER',
    'DRB%', 'TRB%', 'STL%', 'BLK%',
]
dpoy_columns = dpoy_raw_features + [DPOY_TARGET] + [f'{stat}_normalized' for stat in dpoy_scaled_stats]
dpoy_dataset = full_dataset[dpoy_columns]

# Feature matrix (every column except the target) and the binary target vector.
Xd = np.array(dpoy_dataset.drop(columns=[DPOY_TARGET]))
yd = np.array(dpoy_dataset[DPOY_TARGET])

# Hold out 10% of the rows for testing, with a fixed seed.
Xd_train, Xd_test, yd_train, yd_test = train_test_split(
    Xd, yd, test_size=0.1, random_state=42
)

### Logistic Regression

#### MVP

In [5]:
# Logistic-regression classifier for "received MVP votes", capped at 500 solver iterations.
mvp_logreg = LogisticRegression(max_iter=500)
mvp_logreg.fit(Xm_train, ym_train)

#### DPOY

In [6]:
# Logistic-regression classifier for "received DPOY votes", capped at 500 solver iterations.
dpoy_logreg = LogisticRegression(max_iter=500)
dpoy_logreg.fit(Xd_train, yd_train)

### Random Forest

#### MVP

In [7]:
# 30-tree random forest for "received MVP votes". Out-of-bag scoring is enabled so
# oob_score_ is available afterwards, and the seed is fixed so a re-run rebuilds
# the same forest.
mvp_clf = RandomForestClassifier(n_estimators=30, oob_score=True, random_state=42).fit(Xm_train, ym_train)

#### DPOY

In [8]:
# 30-tree random forest for "received DPOY votes". Out-of-bag scoring is enabled so
# oob_score_ is available afterwards, and the seed is fixed so a re-run rebuilds
# the same forest.
dpoy_clf = RandomForestClassifier(n_estimators=30, oob_score=True, random_state=42).fit(Xd_train, yd_train)

### Regression (predict vote share)

#### MVP

In [55]:
# Two-stage ("pipe") MVP model: regress vote share only on rows the classifier flags.

# Regression target: MVP vote share for every row, aligned with the rows of Xm.
ym_regr = np.array(full_dataset['Share_mvp'])

# Row positions the classifier predicts as vote-getters.
# NOTE(review): predictions are made over all of Xm, which includes the rows mvp_clf
# was fitted on — confirm this selection is intended.
pred_votes = np.where(mvp_clf.predict(Xm) == 1)

Xm_pipe = Xm[pred_votes]
ym_pipe = ym_regr[pred_votes]

# 80/20 split of the flagged rows with a fixed seed.
Xm_train_regr, Xm_test_regr, ym_train_regr, ym_test_regr = train_test_split(Xm_pipe, ym_pipe, test_size=0.2, random_state=42)

# The regressor is seeded so the fitted forest, and therefore the score, is reproducible.
mvp_regr_pipe = RandomForestRegressor(n_estimators=50, oob_score=True, random_state=42).fit(Xm_train_regr, ym_train_regr)

# R² on the held-out flagged rows.
mvp_regression_score = mvp_regr_pipe.score(Xm_test_regr, ym_test_regr)

In [58]:
# Single-stage MVP model: regress vote share on every row.

# Regression target: MVP vote share for every row, aligned with the rows of Xm.
ym_regr = np.array(full_dataset['Share_mvp'])

# 80/20 split of all rows with a fixed seed. This rebinds the *_regr split names.
Xm_train_regr, Xm_test_regr, ym_train_regr, ym_test_regr = train_test_split(Xm, ym_regr, test_size=0.2, random_state=42)

# The regressor is seeded so the fitted forest, and therefore the score, is reproducible.
mvp_regr_full = RandomForestRegressor(n_estimators=50, oob_score=True, random_state=42).fit(Xm_train_regr, ym_train_regr)

# R² on the held-out rows.
mvp_regression_score = mvp_regr_full.score(Xm_test_regr, ym_test_regr)

#### DPOY

In [None]:
# Two-stage ("pipe") DPOY model: regress vote share only on rows the classifier flags.

# Regression target: DPOY vote share for every row, aligned with the rows of Xd.
yd_regr = np.array(full_dataset['Share_dpoy'])

# Row positions the DPOY classifier predicts as vote-getters. dpoy_clf was fitted on
# the DPOY feature matrix, so it must be given Xd; Xm holds the MVP feature columns
# and has a different width.
pred_votes = np.where(dpoy_clf.predict(Xd) == 1)

Xd_pipe = Xd[pred_votes]
yd_pipe = yd_regr[pred_votes]

# 80/20 split of the flagged rows with a fixed seed.
Xd_train_regr, Xd_test_regr, yd_train_regr, yd_test_regr = train_test_split(Xd_pipe, yd_pipe, test_size=0.2, random_state=42)

# The regressor is seeded so the fitted forest, and therefore the score, is reproducible.
dpoy_regr = RandomForestRegressor(n_estimators=50, oob_score=True, random_state=42).fit(Xd_train_regr, yd_train_regr)

# R² on the held-out flagged rows.
dpoy_regression_score = dpoy_regr.score(Xd_test_regr, yd_test_regr)

In [52]:
# Single-stage DPOY model: regress vote share on every row.

# Regression target: DPOY vote share for every row, aligned with the rows of Xd.
yd_regr = np.array(full_dataset['Share_dpoy'])

# 80/20 split of all rows with a fixed seed.
Xd_train_regr, Xd_test_regr, yd_train_regr, yd_test_regr = train_test_split(Xd, yd_regr, test_size=0.2, random_state=42)

# The regressor is seeded so the fitted forest, and therefore the score, is reproducible.
dpoy_regr = RandomForestRegressor(n_estimators=50, oob_score=True, random_state=42).fit(Xd_train_regr, yd_train_regr)

# R² on the held-out rows.
dpoy_regression_score = dpoy_regr.score(Xd_test_regr, yd_test_regr)

## Evaluate Models

### Logistic Regression

#### MVP

In [None]:
# Score the MVP logistic regression on the held-out rows: accuracy, confusion matrix, F1.
ym_pred_logreg = mvp_logreg.predict(Xm_test)
acc_mvp = accuracy_score(ym_test, ym_pred_logreg)
mvp_logreg_f1 = f1_score(ym_test, ym_pred_logreg)
print(f'The accuracy score was: {acc_mvp}')
print(confusion_matrix(ym_test, ym_pred_logreg))
print(f'F1 score = {mvp_logreg_f1}')

#### DPOY

In [None]:
# Score the DPOY logistic regression on the held-out rows: accuracy, confusion matrix, F1.
yd_pred_logreg = dpoy_logreg.predict(Xd_test)
acc_dpoy = accuracy_score(yd_test, yd_pred_logreg)
dpoy_logreg_f1 = f1_score(yd_test, yd_pred_logreg)
print(f'The accuracy score was: {acc_dpoy}')
print(confusion_matrix(yd_test, yd_pred_logreg))
print(f'F1 score = {dpoy_logreg_f1}')

### Random Forest

#### MVP

In [None]:
# Training-set accuracy versus the out-of-bag estimate for the MVP forest.
mvp_train_accuracy = mvp_clf.score(Xm_train, ym_train)
mvp_oob_accuracy = mvp_clf.oob_score_
print(f'The training accuracy: {mvp_train_accuracy}')
print(f'The out of bag classification accuracy: {mvp_oob_accuracy}')

In [None]:
mvp_clf.score(Xm_test,ym_test)

In [None]:
# Sweep the forest size from 20 to 95 trees (step 5) and print held-out accuracy.
# Each candidate gets its own name so the sweep does not rebind `mvp_clf`, and the
# seed is fixed so differences between settings come from the tree count rather than
# from random variation between fits.
for n_trees in range(20, 100, 5):
    mvp_candidate = RandomForestClassifier(n_estimators=n_trees, oob_score=True, random_state=42).fit(Xm_train, ym_train)
    print(f'Test Accuracy with {n_trees} trees = {mvp_candidate.score(Xm_test,ym_test)}')

In [None]:
# Pair every feature column (each mvp_dataset column except the target) with its
# importance in the fitted forest, keeping the pairs in feat_imp and printing them.
cols = list(mvp_dataset.columns)
feature_names = [name for name in cols if name != 'received_mvp_votes']
feat_imp = {}
for name, val in zip(feature_names, mvp_clf.feature_importances_):
    feat_imp[name] = val
    print(f'The imporance of {name} = {val}')

#### DPOY

In [None]:
# Training-set accuracy versus the out-of-bag estimate for the DPOY forest.
dpoy_train_accuracy = dpoy_clf.score(Xd_train, yd_train)
dpoy_oob_accuracy = dpoy_clf.oob_score_
print(f'The training accuracy: {dpoy_train_accuracy}')
print(f'The out of bag classification accuracy: {dpoy_oob_accuracy}')

In [None]:
dpoy_clf.score(Xd_test,yd_test)

In [None]:
dpoy_clf.feature_importances_

In [None]:
# Sweep the forest size from 20 to 95 trees (step 5) and print held-out accuracy.
# Each candidate gets its own name so the sweep does not rebind `dpoy_clf`, and the
# seed is fixed so differences between settings come from the tree count rather than
# from random variation between fits.
for n_trees in range(20, 100, 5):
    dpoy_candidate = RandomForestClassifier(n_estimators=n_trees, oob_score=True, random_state=42).fit(Xd_train, yd_train)
    print(f'Test Accuracy with {n_trees} trees = {dpoy_candidate.score(Xd_test,yd_test)}')

In [None]:
# Print the importance of every feature column (each dpoy_dataset column except the target).
cols = list(dpoy_dataset.columns)
feature_names = [name for name in cols if name != 'received_dpoy_votes']
for name, val in zip(feature_names, dpoy_clf.feature_importances_):
    print(f'The imporance of {name} = {val}')

### Regression (voting share)

#### MVP

In [56]:
# Evaluate the classifier-filtered ("pipe") MVP regressor on its own hold-out rows.
# The *_regr split names are reassigned by the full-dataset regression cell, so when
# the notebook is run top to bottom they no longer hold the pipeline split. Rebuild
# that split here from Xm_pipe / ym_pipe with the same test_size and random_state,
# which yields the same rows the pipeline model was held out from.
pipe_split = train_test_split(Xm_pipe, ym_pipe, test_size=0.2, random_state=42)
Xm_test_pipe, ym_test_pipe = pipe_split[1], pipe_split[3]

# Predictions for regression
ym_pred_regr = mvp_regr_pipe.predict(Xm_test_pipe)

# Regression metrics
mse_mvp_regr = mean_squared_error(ym_test_pipe, ym_pred_regr)
mae_mvp_regr = mean_absolute_error(ym_test_pipe, ym_pred_regr)
r2_mvp_regr = r2_score(ym_test_pipe, ym_pred_regr)

print("Mean Squared Error (MSE) for Regression:", mse_mvp_regr)
print("Mean Absolute Error (MAE) for Regression:", mae_mvp_regr)
print("R² Score for Regression:", r2_mvp_regr)

Mean Squared Error (MSE) for Regression: 0.03539832014444444
Mean Absolute Error (MAE) for Regression: 0.1360872222222222
R² Score for Regression: 0.4325309197501487


In [59]:
# Held-out predictions from the full-dataset MVP regressor.
ym_pred_regr = mvp_regr_full.predict(Xm_test_regr)

# Error metrics comparing those predictions with the true vote shares.
mse_mvp_regr = mean_squared_error(ym_test_regr, ym_pred_regr)
mae_mvp_regr = mean_absolute_error(ym_test_regr, ym_pred_regr)
r2_mvp_regr = r2_score(ym_test_regr, ym_pred_regr)

mvp_metric_report = (
    ("Mean Squared Error (MSE) for Regression:", mse_mvp_regr),
    ("Mean Absolute Error (MAE) for Regression:", mae_mvp_regr),
    ("R² Score for Regression:", r2_mvp_regr),
)
for label, value in mvp_metric_report:
    print(label, value)

Mean Squared Error (MSE) for Regression: 0.001308327486340641
Mean Absolute Error (MAE) for Regression: 0.004158043844856661
R² Score for Regression: 0.4572886016136759


In [43]:
# Sweep the regressor's forest size from 20 to 95 trees (step 5) and print held-out R².
# The seed is fixed so differences between settings come from the tree count rather
# than from random variation between fits.
for n_trees in range(20, 100, 5):
    mvp_regr = RandomForestRegressor(n_estimators=n_trees, oob_score=True, random_state=42).fit(Xm_train_regr, ym_train_regr)
    print(f'R² Score with {n_trees} trees = {mvp_regr.score(Xm_test_regr,ym_test_regr)}')

R² Score with 20 trees = 0.4427807999657215
R² Score with 25 trees = 0.42830128631142494
R² Score with 30 trees = 0.40047591518849923
R² Score with 35 trees = 0.4515963008382411
R² Score with 40 trees = 0.42437504889071953
R² Score with 45 trees = 0.4227677215110299
R² Score with 50 trees = 0.45331102142899105
R² Score with 55 trees = 0.4995465757251851
R² Score with 60 trees = 0.43857246465531885
R² Score with 65 trees = 0.4494617795147118
R² Score with 70 trees = 0.4809882462098565
R² Score with 75 trees = 0.47120828867960374
R² Score with 80 trees = 0.38335753763203095
R² Score with 85 trees = 0.47461713853353715
R² Score with 90 trees = 0.4739213867139478
R² Score with 95 trees = 0.4621364967609739


#### DPOY

In [53]:
# Held-out predictions from the DPOY regressor.
yd_pred_regr = dpoy_regr.predict(Xd_test_regr)

# Error metrics comparing those predictions with the true vote shares.
mse_dpoy_regr = mean_squared_error(yd_test_regr, yd_pred_regr)
mae_dpoy_regr = mean_absolute_error(yd_test_regr, yd_pred_regr)
r2_dpoy_regr = r2_score(yd_test_regr, yd_pred_regr)

dpoy_metric_report = (
    ("Mean Squared Error (MSE) for Regression:", mse_dpoy_regr),
    ("Mean Absolute Error (MAE) for Regression:", mae_dpoy_regr),
    ("R² Score for Regression:", r2_dpoy_regr),
)
for label, value in dpoy_metric_report:
    print(label, value)

Mean Squared Error (MSE) for Regression: 0.00098907938381113
Mean Absolute Error (MAE) for Regression: 0.0035267453625632376
R² Score for Regression: 0.2973082652866236


In [54]:
# Sweep the regressor's forest size from 20 to 95 trees (step 5) and print held-out R².
# Each candidate gets its own name so the sweep does not rebind `dpoy_regr`, and the
# seed is fixed so differences between settings come from the tree count rather than
# from random variation between fits.
for n_trees in range(20, 100, 5):
    dpoy_regr_candidate = RandomForestRegressor(n_estimators=n_trees, oob_score=True, random_state=42).fit(Xd_train_regr, yd_train_regr)
    print(f'R² Score with {n_trees} trees = {dpoy_regr_candidate.score(Xd_test_regr,yd_test_regr)}')

R² Score with 20 trees = 0.3754818185444495
R² Score with 25 trees = 0.22309048633244755
R² Score with 30 trees = 0.2981880563880748
R² Score with 35 trees = 0.27673014136189455
R² Score with 40 trees = 0.3237310780658885
R² Score with 45 trees = 0.3050185940455301
R² Score with 50 trees = 0.36290698077349237
R² Score with 55 trees = 0.3583953062936369
R² Score with 60 trees = 0.3259605291391725
R² Score with 65 trees = 0.2918322640585421
R² Score with 70 trees = 0.3218805385221173
R² Score with 75 trees = 0.32218313790438546
R² Score with 80 trees = 0.32484818860749864
R² Score with 85 trees = 0.2947793135141977
R² Score with 90 trees = 0.3104740504564768
R² Score with 95 trees = 0.29379707650842946
