# <ins>Recreating the 2023 NBA All-Star Pool - Logistic Regression Classifier</ins>


In [42]:
from bs4 import BeautifulSoup as bs
import pandas as pd
from urllib.request import urlopen
import re
import import_ipynb
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

- ### **Build table of players' per-game stats from 2012 - 2022 with added features of Season and All-Star selection outcome (1 for selected, 0 for not selected)**

In [53]:
def allstar_names(year):
    '''
    Returns the list of NBA all-stars for a given season 
    
    input: int - NBA season/year of choice
    
    returns: array - list of NBA all-stars
    '''  
    # extract data from webpage 
    url = f'https://www.basketball-reference.com/allstar/NBA_{year}.html'   
    page = urlopen(url)    
    soup = bs(page)    
    names = list(soup.body.findAll('a'))    
    
    # clean and compile into list   
    flag = ['players' in str(name) for name in names]   
    player_names = []
    for i, elem in enumerate(flag):
        if elem==True:
            player_names.append(str(names[i]))
    
    result = []    
    for player in player_names[1:]:     
        name = player.split('<')    
        name = name[1].split('>')[1]    
        if name not in result and len(result) < 24:
            result.append(name)           
    return result


def player_stats(year): 
    '''
    Generates a table of all players per-game stats for a given season
    
    inputs: int - NBA season/year of choice   
    
    returns: DataFrame - individual player per-game stats
    '''   
    # extract data from webpage    
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    html = urlopen(url)
    soup = bs(html)   
    headers = ([th.getText() for th in soup.findAll('tr', limit=1)[0].findAll('th')])[1:]   
    rows = soup.findAll('tr')[1:]  
    player_stats = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]
        
    # compile into table and clean
    stats = pd.DataFrame(player_stats, columns = headers)
    stats = stats.dropna(how='all')  
    stats['Player'] = stats['Player'].str.replace('*', '')       
    return stats


def table(year):   
    '''
    Generates a table of all players per-game stats for a given season
    with the inclusion of binary all-star attribute
    
    inputs: int - NBA season/year of choice
    
    returns: DataFrame - individual player per-game stats with all-star attribute
    '''  
    # remove player duplicates to only include stats with last team of season   
    df = player_stats(year).drop_duplicates(subset=['Player'], keep='last')
       
    # list of allstars for given season   
    allstars = allstar_names(year)
        
    # add season columns    
    df['Season'] = year
       
    # add allstar column
    df['Allstar'] = 0
    
    # set table index as player name   
    df = df.set_index('Player')
      
    # flag players who made allstar team   
    for name in allstars:
        df.loc[name, 'Allstar'] = 1      
    df = df.reset_index()
    
    return df


def dataset(years):   
    '''
    Generates a cumulative table of NBA player per-game stats for a
    given range of seasons.
    
    inputs: array - list of years for which to include player stats  
    
    returns: DataFrame - cumulative table of player stats
    '''   
    # remove 1999 season if applicable (no all-star game)    
    if 1999 in years:
        years.remove(1999)
    
    # instantiate dataframe as accumulator with first season    
    df = table(years[0])
    
    # create and concatenate remaining season stats    
    for year in years[1:]:
        df = pd.concat([df, table(year)])         
    return df

In [63]:
data = dataset(list(range(2012, 2024)))
data.to_csv('data.csv', index = False)
df = pd.read_csv('data.csv')
df = df[df['Season'] < 2024]
df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season,Allstar
0,Jeff Adrien,PF,25.0,HOU,8.0,0.0,7.9,0.9,2.0,0.438,...,2.1,2.8,0.1,0.0,0.3,0.3,1.6,2.6,2012.0,0.0
1,Arron Afflalo,SG,26.0,DEN,62.0,62.0,33.6,5.3,11.3,0.471,...,2.5,3.2,2.4,0.6,0.2,1.4,2.2,15.2,2012.0,0.0
2,Blake Ahearn,PG,27.0,UTA,4.0,0.0,7.5,1.0,3.5,0.286,...,0.5,0.5,0.3,0.0,0.0,1.3,1.0,2.5,2012.0,0.0
3,Solomon Alabi,C,23.0,TOR,14.0,0.0,8.7,0.9,2.6,0.361,...,2.3,3.4,0.2,0.1,0.6,0.4,0.8,2.4,2012.0,0.0
4,Cole Aldrich,C,23.0,OKC,26.0,0.0,6.7,0.8,1.6,0.524,...,1.3,1.8,0.1,0.3,0.6,0.3,0.8,2.2,2012.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6126,Delon Wright,PG,30.0,WAS,29.0,2.0,22.5,2.3,5.2,0.440,...,2.2,3.2,3.7,2.0,0.3,1.0,1.4,6.2,2023.0,0.0
6127,McKinley Wright IV,PG,24.0,DAL,20.0,1.0,10.3,1.2,2.5,0.469,...,1.0,1.3,1.9,0.4,0.2,0.6,0.9,2.9,2023.0,0.0
6128,Thaddeus Young,PF,34.0,TOR,47.0,9.0,16.0,2.2,3.9,0.562,...,1.9,3.3,1.5,1.1,0.1,0.8,1.8,4.9,2023.0,0.0
6129,Trae Young,PG,24.0,ATL,52.0,52.0,35.4,8.4,19.7,0.428,...,2.2,2.9,10.3,1.0,0.2,4.1,1.5,26.7,2023.0,0.0


- ### **Create balanced training set with SMOTE oversampling**
- ### **Train logistic regression classifier on player stats from previous 10 seasons**
- ### **Predict probability of All-Star selection for players in 2023 season**
- ### **Return table of player's per-game stats with added features of *Conference*, *G/F* (Guard/Foward position), and *Allstar* (probability of All-Star selection)**

In [55]:
def smote(X, y):
    '''
    Generates a balanced DataFrame using SMOTE oversampling. 
    
    input: training data and target variables    
    
    returns: balanced DataFrame     
    '''    
    sm = SMOTE()    
    X_smote, y_smote = sm.fit_resample(X, y)    
    return pd.concat([X_smote, y_smote], axis=1)


def proba_table(df):   
    '''
    Creates a table of player's per game stats with inclusing of conference,
    Forward/Guard position and probability of allstar selection.
    
    inputs: DataFrame - table of player's per game stats for a given season.
    
    returns: DataFrame - player's per game stats with inclusing of conference,
                         F/G position and probability of selection.   
    '''
    
    # year for which team will be drafted    
    season = df['Season'].max()
        
    # drop non-useful columns    
    df = df.drop(columns = ['Pos', 'Player', 'Tm', 'Age'])
       
    # create training set using player data from previous 10 seasons    
    train = df[(df['Season'] >= (season - 10)) & (df['Season'] < season)].dropna()

    # drop season attribute
    train = train.drop(columns = 'Season')

    # extract explanatory variables and labels    
    X_train = train.drop(columns = 'Allstar')
    y_train = train['Allstar']
        
    # extract data from season for test set
    test = df[df['Season'] == season].dropna()

    # drop season attribute
    test = test.drop(columns = 'Season')

    # extract explanatory variables and labels
    X_test = test.drop(columns = 'Allstar')
    y_test = test['Allstar']    
    
    # create balanced training set     
    train_balanced = smote(X_train, y_train)
        
    # isolate explanatory variables and labels
    X_train_balanced = train_balanced.drop(columns = 'Allstar')
    y_train_balanced = train_balanced['Allstar']
        
    # instantiate and train model
    lr = LogisticRegression(random_state = 0).fit(X_train_balanced, y_train_balanced)
        
    # predict and isolate probabilities of allstar selection
    probas = lr.predict_proba(X_test)
    probas = [proba[1] for proba in probas]
        
    # reattach names and attach probabilities
    data = pd.read_csv('data.csv').dropna()
    df_season = data[data['Season']==season].drop(columns='Allstar')

    # add probability columns
    df_season['Allstar'] = probas
        
    # sort by probability of selection    
    df_season = df_season.sort_values(by = ['Allstar'], ascending = False)    
    
    # add conference tags     
    east = ['MIL', 'CLE', 'ATL', 'CHO', 'CHI', 'DET', 'IND', 'MIA', 'MIL', 'BRK', 'ORL',
            'PHI', 'TOR', 'BOS', 'WAS', 'NYK', 'NJN']

    west = ['DAL', 'DEN', 'GSW', 'HOU', 'LAC', 'LAL', 'MEM', 'MIN', 'NOP', 'PHO', 
            'POR', 'SAS', 'SAC', 'OKC', 'UTA', 'SEA']
    
    team = list(df_season['Tm'])   
    conf_flag = []
    for tm in team:    
        if tm in east:
            conf_flag.append('E')   
        if tm in west:
            conf_flag.append('W')     
            
    df_season['Conference'] = ['E' if tm in east else 'W' for tm in team]    
   
    # add guard/forward tags
    position = df_season['Pos']
    guard = ['PG', 'SG']
    gf_flag = []
    for pos in position:
        if pos in guard:
            gf_flag.append('G')
        else:
            gf_flag.append('F')   
    df_season['G/F'] = gf_flag    
    return df_season

In [56]:
df_2023 = proba_table(df)
df_2023

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,AST,STL,BLK,TOV,PF,PTS,Season,Allstar,Conference,G/F
5743,Luka Dončić,PG,23.0,DAL,50.0,50.0,36.5,11.2,22.2,0.505,...,8.1,1.5,0.5,3.7,2.7,33.3,2023.0,9.999699e-01,W,G
5861,Nikola Jokić,C,27.0,DEN,51.0,51.0,33.6,9.4,14.9,0.632,...,10.1,1.3,0.7,3.5,2.6,24.7,2023.0,9.999552e-01,W,F
5760,Joel Embiid,C,28.0,PHI,45.0,45.0,34.9,11.0,20.5,0.537,...,4.1,1.2,1.5,3.5,3.2,33.1,2023.0,9.999335e-01,E,F
5635,Giannis Antetokounmpo,PF,28.0,MIL,47.0,47.0,33.1,11.3,21.0,0.538,...,5.4,0.8,0.8,4.0,3.4,31.8,2023.0,9.998545e-01,E,F
5851,LeBron James,PF,38.0,LAL,45.0,45.0,36.3,11.5,22.6,0.508,...,7.0,1.0,0.6,3.2,1.6,30.0,2023.0,9.997521e-01,W,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6123,Justise Winslow,SF,26.0,POR,29.0,11.0,26.8,2.8,6.8,0.409,...,3.4,1.0,0.4,1.5,3.1,6.8,2023.0,1.750842e-05,W,F
6091,Ish Wainright,SF,28.0,PHO,42.0,1.0,15.5,1.5,3.8,0.384,...,1.0,0.8,0.3,0.5,2.1,4.3,2023.0,1.552186e-05,W,F
5814,Isaiah Hartenstein,C,24.0,NYK,60.0,7.0,19.7,2.2,4.3,0.518,...,0.9,0.6,0.8,0.8,2.5,5.2,2023.0,8.400245e-06,E,F
5763,Dorian Finney-Smith,PF,29.0,BRK,4.0,4.0,29.8,2.3,6.8,0.333,...,1.0,0.0,0.0,1.0,2.8,6.3,2023.0,3.742256e-06,E,F


- ## **Assemble All-Star player pool for each conference according to players' predicted probability of All-Star selection**

In [57]:
def draft(df_season):
    
    '''
    Drafts Eastern and Western conference allstar teams according to probability
    of selection via logistic regression. 
    
    inputs: DataFrame - table of player's per game stats with inclusing of conference,
                        F/G position and probability of selection.
                        
    returns: dictionary - western conference starters and reserves 
             dictionary - eastern conference starters and reserves  
    ''' 
    # instantiate teams
    team_W = {'Starters': [], 'Reserves': []}
    team_E = {'Starters': [], 'Reserves': []}
        
    # divide players by east and west
    W = df_season[df_season['Conference']=='W']
    E = df_season[df_season['Conference']=='E']

    # separate guards and forwards    
    WG = W[W['G/F']=='G']
    WF = W[W['G/F']=='F']
    EG = E[E['G/F']=='G']
    EF = E[E['G/F']=='F']
        
    # add starting guards    
    for i in range(2):        
        team_W['Starters'].append(list(WG['Player'])[i])        
        team_E['Starters'].append(list(EG['Player'])[i])
        
        
    # add starting forwards   
    for i in range(3):        
        team_W['Starters'].append(list(WF['Player'])[i])        
        team_E['Starters'].append(list(EF['Player'])[i])
        
    # add reserve guards    
    for i in range(2, 4):        
        team_W['Reserves'].append(list(WG['Player'])[i])        
        team_E['Reserves'].append(list(EG['Player'])[i])
                
    # add reserve forwards    
    for i in range(3, 6):        
        team_W['Reserves'].append(list(WF['Player'])[i])        
        team_E['Reserves'].append(list(EF['Player'])[i])
        
    # add wildcards
    for player in list(W['Player']):        
        if player not in team_W['Reserves'] and player not in team_W['Starters']:                
            team_W['Reserves'].append(player)            
        if len(team_W['Reserves']) == 7:                
            break
    
    for player in list(E['Player']):        
        if player not in team_E['Reserves'] and player not in team_E['Starters']:               
            team_E['Reserves'].append(player)            
        if len(team_E['Reserves']) == 7:                
            break
 
    print(f'''Western Confernce \n  
              Starters - {team_W['Starters']} \n 
              Reserves - {team_W['Reserves']} \n\n
              
Eastern Conference \n 
              Starters - {team_E['Starters']} \n 
              Reserves - {team_E['Reserves']}
              ''')

In [58]:
draft(df_2023)

Western Confernce 
  
              Starters - ['Luka Dončić', 'Shai Gilgeous-Alexander', 'Nikola Jokić', 'LeBron James', 'Anthony Davis'] 
 
              Reserves - ['Damian Lillard', 'Ja Morant', 'Zion Williamson', 'Lauri Markkanen', 'Kawhi Leonard', 'Stephen Curry', "De'Aaron Fox"] 


              
Eastern Conference 
 
              Starters - ['Trae Young', 'James Harden', 'Joel Embiid', 'Giannis Antetokounmpo', 'Kevin Durant'] 
 
              Reserves - ['Tyrese Haliburton', 'Jalen Brunson', 'Jayson Tatum', 'Jimmy Butler', 'DeMar DeRozan', 'Donovan Mitchell', 'Julius Randle']
              


- ## **Replace injured players**
    - ### **Stephen Curry**
    - ### **Kevin Durant**
    - ### **Zion WIlliamson**

### **Select the Western Conference guard,  Western Conference forward, and Eastern Conference forward with the next highest probabilities of selection**

In [59]:
list(df_2023[(df_2023['Conference']=='W') & (df_2023['G/F']=='G')]['Player'])[6]

'Anthony Edwards'

In [61]:
list(df_2023[(df_2023['Conference']=='W') & (df_2023['G/F']=='G')]['Player'])[7]

'Jamal Murray'

In [62]:
list(df_2023[(df_2023['Conference']=='E') & (df_2023['G/F']=='F')]['Player'])[8]

'Nikola Vučević'

# <ins>Final Pools<ins>

    Western Confernce 
  
              Starters - ['Luka Dončić', 'Shai Gilgeous-Alexander', 'Nikola Jokić', 'LeBron James', 'Anthony Davis'] 
 
              Reserves - ['Damian Lillard', 'Ja Morant', 'Anthony Edwards', 'Lauri Markkanen', 'Kawhi Leonard', 'Jamal Murray', "De'Aaron Fox"] 


              
    Eastern Conference 
 
              Starters - ['Trae Young', 'James Harden', 'Joel Embiid', 'Giannis Antetokounmpo', 'Jayson Tatum'] 
 
              Reserves - ['Tyrese Haliburton', 'Jalen Brunson', 'Jimmy Butler', 'DeMar DeRozan', 'Donovan Mitchell', 'Julius Randle', 'Nikola Vučević']
              