## What Does It Take To Be an NBA All-Star? 
## A Data Science Analysis
## Samuel Frankel

Ever since 1951, the NBA All-Star Game has been held annually. Twenty-four of the best players in the NBA are selected, by a mix of fan voting and head coaches, to compete in this prestigious game. (There can end up being more than 24 because of injuries.) Each year, as the teams are announced, there is an abundance of discussion regarding the choices. Why did this player make it over that player? Why was this player chosen? I think that if we look at the analytics behind which players are selected and which are left out, we can better understand the process, as well as predict who will make it before the teams are announced.

## Step 1: Getting the data

In [6]:
import pandas as pd
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression


In [7]:
#Function that drops players who averaged fewer than 10 ppg
def drop_under_10(data):
    """Return `data` without the rows whose 'PTS' value is below 10.0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric 'PTS' column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows with their original index
        labels; the input frame is not modified.
    """
    # A boolean mask filters by position in a single pass. The previous
    # row-by-row drop(label) removed every row sharing a label, so with
    # repeated index labels it could discard qualifying rows or raise KeyError.
    # Negating `< 10.0` (instead of testing `>= 10.0`) keeps rows whose 'PTS'
    # is missing, exactly as the row-by-row comparison did.
    return data[~(data['PTS'] < 10.0)]

# Build one table of player-seasons: read each season file
# (NBA10.csv ... NBA20.csv), stamp it with its year, pass it through
# drop_under_10, then stack everything under a fresh 0..n-1 index.
season_frames = []
for year in range(2010, 2021):
    data2 = pd.read_csv('NBA{}.csv'.format(year % 100), sep=',')
    data2['Year'] = year
    data2 = drop_under_10(data2)
    season_frames.append(data2)

data = pd.concat(season_frames).reset_index(drop=True)

In [8]:
#https://basketball.realgm.com/nba/allstar/game/rosters/2011
# Read each season's roster file (AllStars10.csv ... AllStars20.csv) exactly
# once and tag it with its year. Looping over the years removes the copy-paste
# slip that loaded AllStars17.csv twice and duplicated every 2017 row.
roster_frames = []
for year in range(2010, 2021):
    allstars2 = pd.read_csv('AllStars{}.csv'.format(year % 100), sep=',')
    allstars2['Year'] = year
    roster_frames.append(allstars2)

# Plain concat: each file's own row labels are kept, so labels repeat across seasons.
allstars = pd.concat(roster_frames)


In [9]:
# Start every player-season as a non-All-Star (0); matching rows are set to 1 below.
data['All Star'] = 0
def was_allstar(name, year, roster=None):
    r"""Return True if `name` belongs to an All-Star of season `year`.

    Parameters
    ----------
    name : str
        Player name as stored in the stats table. It may carry extra text
        after the name (e.g. 'Bam Adebayo\adebaba01'), so a roster entry
        matches when it occurs anywhere inside `name`.
    year : int
        Season to look up in the roster's 'Year' column.
    roster : pandas.DataFrame, optional
        Table with 'Player' and 'Year' columns. Defaults to the notebook-level
        `allstars` frame.

    Returns
    -------
    bool
        True when a roster entry for `year` is a substring of `name`,
        otherwise False (previously the no-match case fell through to None).
    """
    if roster is None:
        roster = allstars
    # Narrow to the requested season first so only that year's names are
    # compared, instead of walking every roster row with iterrows().
    season_names = roster.loc[roster['Year'] == year, 'Player']
    return any(entry in name for entry in season_names)
# Set 'All Star' to 1 for each player-season that was_allstar() recognises
made_roster = [
    bool(was_allstar(player, season))
    for player, season in zip(data['Player'], data['Year'])
]
data.loc[made_roster, 'All Star'] = 1


In [11]:
# Cleaning out the names: keep only the text before the first
# backslash-plus-lowercase-letter marker
# (e.g. 'Precious Achiuwa\achiupr01' -> 'Precious Achiuwa')
id_marker = re.compile(r'\\[a-z]')
for row_label in data.index:
    raw_name = data.at[row_label, 'Player']
    if id_marker.search(raw_name):
        data.at[row_label, 'Player'] = id_marker.split(raw_name)[0]

In [12]:
# Save the labelled player-season table as a tab-separated file
data.to_csv('PlayerStats.csv', sep='\t')

In [13]:
# Remove the stat columns that are not used as model features below
unused_columns = [
    'Pos', 'Rk', 'G', 'GS', 'MP', 'FG', 'FGA', '3P', '3P%', '2P', '2PA', '2P%',
    '3PA', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'PF',
]
data = data.drop(columns=unused_columns)

In [14]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Fit a random forest on a 70/30 train/test split of the player-season table.
# Features are every remaining column except the identifiers and the label.
inputs = data.drop(columns=['Player','Tm','Year','All Star'])
target = data['All Star']
X_train, X_test, y_train, y_test = train_test_split(inputs, target, random_state=1, test_size= 0.3)
# Seed the forest as well as the split: without random_state the bootstrap
# samples and feature subsets change on every run, so the fitted model and
# the reported accuracy are not reproducible.
model = RandomForestClassifier(random_state=1)
model.fit(X_train,y_train)


RandomForestClassifier()

In [20]:
# Predicted labels for the held-out test rows
y_pred = model.predict(X_test)

In [21]:
# Fraction of held-out rows whose predicted label equals the true label
metrics.accuracy_score(y_test, y_pred)

0.912621359223301

In [77]:
# Load the season to score and append a float column that will hold the
# predicted All-Star probability (0.0 until it is filled in)
nba21 = pd.read_csv('NBA21.csv', sep=',').assign(**{'All Star %': 0.0})
nba21.head()


Unnamed: 0,Rk,Player,Pos,Age,Tm,FG%,3P%,TRB,AST,STL,BLK,TOV,PTS,All Star %
0,1,Precious Achiuwa\achiupr01,C,22,TOR,0.386,0.269,8.2,1.6,0.5,0.6,1.1,8.0,0.0
1,2,Steven Adams\adamsst01,C,28,MEM,0.535,,8.8,2.6,0.9,0.6,1.8,7.0,0.0
2,3,Bam Adebayo\adebaba01,C,24,MIA,0.519,0.0,10.2,3.2,1.1,0.3,2.9,18.7,0.0
3,4,Santi Aldama\aldamsa01,PF,21,MEM,0.364,0.111,2.6,0.8,0.1,0.2,0.3,3.6,0.0
4,5,LaMarcus Aldridge\aldrila01,C,36,BRK,0.573,0.367,5.7,0.9,0.4,1.2,0.8,14.0,0.0


In [90]:
# Keep only fully populated rows, then strip everything that is not a model feature.
# NOTE(review): dropna() looks at every column, so a row with a missing '3P%'
# is excluded from scoring even though '3P%' is removed from the features on
# the next lines - confirm that is intended.
nba21 = nba21.dropna()
non_feature_cols = ['Pos','Rk', 'Player', 'Tm', '3P%', 'All Star %']
inputs = nba21.drop(columns=non_feature_cols).dropna()

In [91]:
# Per-row class probabilities for `inputs`; rows are in the same order as `inputs`
probabilities = model.predict_proba(inputs)


In [100]:
# Attach each row's probability of the positive class (second column of
# `probabilities`) to the table.
nba21 = nba21.dropna()
nba21 = nba21.reset_index()  # old labels move to an 'index' column; rows become 0..n-1
# `probabilities` is positional, so it must have exactly one row per table row.
# Checking this up front replaces the hard-coded stop at row 441, which
# silently left 0.0 for any rows after it.
if len(probabilities) != len(nba21):
    raise ValueError(
        'probabilities has {} rows but nba21 has {}; re-run the prediction cells'.format(
            len(probabilities), len(nba21)))
nba21['All Star %'] = probabilities[:, 1]


In [110]:
# Teams listed in `east` are tagged 'East'; every other 'Tm' value is tagged 'West'
east = ['PHI', 'BRK', 'MIL', 'NYK', 'ATL', 'MIA', 'BOS', 'WAS','IND', 'CHA', 'CHI', 'TOR', 'CLE', 'ORL', 'DET']
in_east = nba21['Tm'].isin(east)
nba21['Conference'] = in_east.map({True: 'East', False: 'West'})

In [113]:
# Descending on both keys: 'West' rows come before 'East', and within each
# conference the highest 'All Star %' comes first. Then export as tab-separated text.
sort_keys = ['Conference', 'All Star %']
nba21 = nba21.sort_values(by=sort_keys, ascending=False)
nba21.to_csv('AllStarPredictions.csv', sep='\t')

In [115]:
# Display the sorted prediction table
nba21

Unnamed: 0,index,Rk,Player,Pos,Age,Tm,FG%,3P%,TRB,AST,STL,BLK,TOV,PTS,All Star %,Conference
202,222,223,LeBron James\jamesle01,SF,37,LAL,0.504,0.358,5.9,6.9,1.8,0.9,3.6,26.1,0.95,West
399,435,436,Karl-Anthony Towns\townska01,C,26,MIN,0.508,0.423,9.0,3.3,1.1,1.1,3.5,24.3,0.89,West
98,108,109,Anthony Davis\davisan02,C,28,LAL,0.520,0.182,10.2,3.0,1.2,2.1,2.2,23.8,0.88,West
212,233,234,Nikola Jokić\jokicni01,C,26,DEN,0.588,0.379,13.6,7.5,1.3,0.7,3.4,26.6,0.86,West
106,118,119,Luka Dončić\doncilu01,PG,22,DAL,0.447,0.326,8.0,8.5,1.0,0.5,4.6,25.6,0.85,West
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,455,456,Yuta Watanabe\watanyu01,SF,27,TOR,0.360,0.343,4.0,1.2,0.4,0.7,0.2,6.1,0.00,East
422,459,460,Coby White\whiteco01,PG,21,CHI,0.349,0.226,2.6,1.3,0.1,0.1,0.9,6.1,0.00,East
429,467,468,Lou Williams\willilo02,PG,35,ATL,0.395,0.341,1.6,1.2,0.7,0.1,1.0,6.8,0.00,East
432,471,472,Dylan Windler\windldy01,SF,25,CLE,0.442,0.379,1.7,0.9,0.3,0.1,0.4,2.7,0.00,East
