# <ins>Recreating the 2023 NBA All-Star Pool - Logistic Regression Classifier</ins>


In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
from urllib.request import urlopen
import re
import import_ipynb
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

- **Build table of players' per-game stats from 2012 - 2022 with added features of Season and All-Star selection outcome (1 for selected, 0 for not selected)**

In [2]:
def allstar_names(year):
    '''
    Returns the list of NBA all-stars for a given season.

    Downloads the basketball-reference all-star page for the season, keeps
    every anchor whose markup contains the substring 'players', skips the
    first such anchor and collects the first 24 distinct link texts.

    input: int - NBA season/year of choice

    returns: list - names of NBA all-stars (at most 24, no duplicates)
    '''
    # extract data from webpage; the context manager closes the HTTP response
    # as soon as the page has been parsed instead of leaving it open
    url = f'https://www.basketball-reference.com/allstar/NBA_{year}.html'
    with urlopen(url) as page:
        soup = bs(page)
    anchors = soup.body.findAll('a')

    # keep the markup of every anchor that mentions 'players' anywhere
    player_links = [str(anchor) for anchor in anchors if 'players' in str(anchor)]

    # clean and compile into list
    result = []
    # NOTE(review): the first matching anchor is skipped - presumably a
    # navigation link rather than a player; confirm against the page.
    for link in player_links[1:]:
        # text between the end of the opening tag and the next '<'
        name = link.split('<')[1].split('>')[1]
        # NOTE(review): 24 is presumably the all-star roster size - confirm.
        if name not in result and len(result) < 24:
            result.append(name)
    return result


def player_stats(year):
    '''
    Generates a table of all players per-game stats for a given season.

    Every cell is kept as the text scraped from the page; no numeric
    conversion happens here.

    inputs: int - NBA season/year of choice

    returns: DataFrame - individual player per-game stats
    '''
    # extract data from webpage; the context manager closes the HTTP response
    # once the page has been parsed
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    with urlopen(url) as html:
        soup = bs(html)

    # column names come from the first table row; the first header cell is
    # dropped because only the <td> cells of each row are collected below
    # (NOTE(review): presumably that first column is a <th> rank cell - confirm)
    headers = [th.getText() for th in soup.findAll('tr', limit=1)[0].findAll('th')][1:]
    rows = soup.findAll('tr')[1:]
    records = [[td.getText() for td in row.findAll('td')] for row in rows]

    # compile into table and clean
    stats = pd.DataFrame(records, columns=headers)
    # rows without any <td> cell become all-empty records; drop them
    stats = stats.dropna(how='all')
    # strip literal '*' characters from names; regex=False makes the literal
    # match explicit ('*' on its own is not a valid regular expression, and
    # the default of `regex` differs between pandas versions)
    stats['Player'] = stats['Player'].str.replace('*', '', regex=False)
    return stats


def table(year):
    '''
    Generates a table of all players per-game stats for a given season
    with the inclusion of binary all-star attribute

    inputs: int - NBA season/year of choice

    returns: DataFrame - individual player per-game stats with added
                         'Season' and 'Allstar' (1 selected, 0 not) columns
    '''
    # remove player duplicates to only include stats with last team of season
    df = player_stats(year).drop_duplicates(subset=['Player'], keep='last')

    # list of allstars for given season
    allstars = allstar_names(year)

    # add season column
    df['Season'] = year

    # set table index as player name
    df = df.set_index('Player')

    # flag players who made allstar team with a membership test: a scraped
    # all-star name that has no stats row is simply ignored, whereas assigning
    # through .loc with a missing label would append a new, all-NaN row
    df['Allstar'] = df.index.isin(allstars).astype('int64')

    # move the player name back to a regular (first) column
    df = df.reset_index()

    return df


def dataset(years):
    '''
    Generates a cumulative table of NBA player per-game stats for a
    given range of seasons.

    inputs: iterable of int - years for which to include player stats
            (a list, tuple or range; the argument itself is never modified)

    returns: DataFrame - cumulative table of player stats; each season keeps
                         the row index produced by table()

    raises: ValueError - if no season is left after removing 1999
    '''
    # remove 1999 season if applicable (no all-star game); filtering into a
    # new list leaves the caller's sequence untouched
    seasons = [year for year in years if year != 1999]

    if not seasons:
        raise ValueError('dataset() needs at least one season other than 1999')

    # build every season table, then concatenate once instead of growing the
    # frame inside the loop
    return pd.concat([table(year) for year in seasons])

In [3]:
# scrape the 2012 through 2022 seasons (the range end is exclusive)
data = dataset(list(range(2012, 2023)))
# persist the scraped table, then reload it from disk; the scraped cells are
# text, so the reloaded frame carries whatever dtypes read_csv infers
data.to_csv('data.csv', index = False)
df = pd.read_csv('data.csv')
df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season,Allstar
0,Jeff Adrien,PF,25,HOU,8,0,7.9,0.9,2.0,0.438,...,2.1,2.8,0.1,0.0,0.3,0.3,1.6,2.6,2012,0
1,Arron Afflalo,SG,26,DEN,62,62,33.6,5.3,11.3,0.471,...,2.5,3.2,2.4,0.6,0.2,1.4,2.2,15.2,2012,0
2,Blake Ahearn,PG,27,UTA,4,0,7.5,1.0,3.5,0.286,...,0.5,0.5,0.3,0.0,0.0,1.3,1.0,2.5,2012,0
3,Solomon Alabi,C,23,TOR,14,0,8.7,0.9,2.6,0.361,...,2.3,3.4,0.2,0.1,0.6,0.4,0.8,2.4,2012,0
4,Cole Aldrich,C,23,OKC,26,0,6.7,0.8,1.6,0.524,...,1.3,1.8,0.1,0.3,0.6,0.3,0.8,2.2,2012,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5620,Thaddeus Young,PF,33,TOR,26,0,18.3,2.6,5.5,0.465,...,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,2022,0
5621,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,0.460,...,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2022,1
5622,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,0.526,...,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,2022,0
5623,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,0.567,...,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,2022,0


- **Create balanced training set with SMOTE oversampling**
- **Train logistic regression classifier on player stats from previous 10 seasons**
- **Predict probability of All-Star selection for players in 2023 season**
- **Return table of players' per-game stats with added features of *Conference*, *G/F* (Guard/Forward position), and *Allstar* (probability of All-Star selection)**

In [4]:
def smote(X, y, random_state=0):
    '''
    Generates a balanced DataFrame using SMOTE oversampling.

    input: X - training data (explanatory variables)
           y - target variable
           random_state - seed handed to SMOTE so that the synthetic samples,
                          and everything trained on them, are reproducible
                          between runs (default 0)

    returns: balanced DataFrame - resampled X with the resampled target
                                  attached as an additional column
    '''
    sampler = SMOTE(random_state=random_state)
    X_smote, y_smote = sampler.fit_resample(X, y)
    return pd.concat([X_smote, y_smote], axis=1)


def proba_table(df):
    '''
    Creates a table of players' per-game stats with inclusion of conference,
    Forward/Guard position and probability of allstar selection.

    The most recent season in df is scored by a logistic regression trained
    on the (up to) 10 seasons before it, after balancing the training set
    with smote(). Rows with a missing value in any model column are left out
    of both training and scoring. The result is built from df itself, so the
    predicted probabilities always line up with the rows that were scored.

    inputs: DataFrame - table of players' per-game stats; must contain the
                        columns 'Player', 'Pos', 'Tm', 'Age', 'Season' and
                        'Allstar'; every other column is used as a feature.

    returns: DataFrame - scored season's per-game stats with 'Allstar'
                         replaced by the probability of selection, plus
                         'Conference' and 'G/F', sorted by descending
                         probability.
    '''

    # year for which team will be drafted
    season = df['Season'].max()

    # drop non-useful columns
    features = df.drop(columns=['Pos', 'Player', 'Tm', 'Age'])

    # rows usable by the model: no missing value in any remaining column
    complete = features.notna().all(axis=1)

    # training rows: previous 10 seasons; test rows: the season being scored.
    # Plain boolean arrays are used so row selection is positional and does
    # not depend on the index labels of df.
    in_train = ((features['Season'] >= (season - 10))
                & (features['Season'] < season)
                & complete).to_numpy()
    in_test = ((features['Season'] == season) & complete).to_numpy()

    # extract explanatory variables and labels
    train = features[in_train].drop(columns='Season')
    X_train = train.drop(columns='Allstar')
    y_train = train['Allstar']
    X_test = features[in_test].drop(columns=['Season', 'Allstar'])

    # create balanced training set
    train_balanced = smote(X_train, y_train)

    # isolate explanatory variables and labels
    X_train_balanced = train_balanced.drop(columns='Allstar')
    y_train_balanced = train_balanced['Allstar']

    # instantiate and train model
    lr = LogisticRegression(random_state=0).fit(X_train_balanced, y_train_balanced)

    # predict and isolate probabilities of allstar selection (second column
    # of predict_proba is the probability of class 1)
    probas = [proba[1] for proba in lr.predict_proba(X_test)]

    # reattach names: take exactly the rows that were scored, in the same
    # order, from the input frame rather than from a file on disk
    df_season = df[in_test].drop(columns='Allstar')

    # add probability column
    df_season['Allstar'] = probas

    # sort by probability of selection
    df_season = df_season.sort_values(by=['Allstar'], ascending=False)

    # add conference tags
    # NOTE(review): every team abbreviation that is not listed here is tagged
    # 'W' - confirm all Eastern abbreviations present in the data are covered.
    east = {'MIL', 'CLE', 'ATL', 'CHO', 'CHI', 'DET', 'IND', 'MIA', 'BRK', 'ORL',
            'PHI', 'TOR', 'BOS', 'WAS', 'NYK', 'NJN'}
    df_season['Conference'] = ['E' if tm in east else 'W' for tm in df_season['Tm']]

    # add guard/forward tags: PG and SG are guards, everything else is 'F'
    guard = ('PG', 'SG')
    df_season['G/F'] = ['G' if pos in guard else 'F' for pos in df_season['Pos']]
    return df_season

In [7]:
# score the most recent season found in df; the returned rows are sorted by
# descending probability of selection
df_2023 = proba_table(df)
df_2023

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,AST,STL,BLK,TOV,PF,PTS,Season,Allstar,Conference,G/F
5181,Joel Embiid,C,27,PHI,68,68,33.8,9.8,19.6,0.499,...,4.2,1.1,1.5,3.1,2.7,30.6,2022,9.999962e-01,E,F
5309,Nikola Jokić,C,26,DEN,74,74,33.5,10.3,17.7,0.583,...,7.9,1.5,0.9,3.8,2.6,27.1,2022,9.999740e-01,W,F
5031,Giannis Antetokounmpo,PF,27,MIL,67,67,32.9,10.3,18.6,0.553,...,5.8,1.1,1.4,3.3,3.2,29.9,2022,9.999734e-01,E,F
5160,Luka Dončić,PG,22,DAL,65,65,35.4,9.9,21.6,0.457,...,8.7,1.2,0.6,4.5,2.2,28.4,2022,9.999479e-01,W,G
5173,Kevin Durant,PF,33,BRK,55,55,37.2,10.5,20.3,0.518,...,6.4,0.9,0.9,3.5,2.1,29.9,2022,9.998578e-01,E,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5286,Wes Iwundu,SF,27,ATL,3,1,27.3,2.7,6.0,0.444,...,0.0,0.3,0.0,0.3,2.7,7.3,2022,1.658114e-05,E,F
5209,Taj Gibson,C,36,NYK,52,4,18.2,1.7,3.2,0.518,...,0.6,0.4,0.8,0.5,2.6,4.4,2022,9.561888e-06,E,F
5428,Georges Niang,PF,28,PHI,76,7,22.8,3.2,7.4,0.437,...,1.3,0.4,0.2,0.8,2.5,9.2,2022,4.622987e-06,E,F
5380,Mac McClung,SG,23,LAL,1,0,22.0,2.0,5.0,0.400,...,1.0,1.0,1.0,2.0,5.0,6.0,2022,5.742076e-07,W,G


- **Assemble the All-Star player pool for each conference according to players' predicted probability of All-Star selection (following traditional All-Star roster construction).**

In [8]:
def draft(df_season):

    '''
    Drafts Eastern and Western conference allstar teams according to probability
    of selection via logistic regression and prints both rosters.

    Per conference: the top 2 guards and top 3 forwards start, the next 2
    guards and next 3 forwards are reserves, and the reserves are then filled
    up to 7 with the best remaining players regardless of position. A
    conference with too few players simply gets a shorter roster.

    inputs: DataFrame - table of players' per game stats with inclusion of
                        conference ('Conference'), F/G position ('G/F') and
                        probability of selection. Rows must already be ordered
                        from most to least likely selection; the draft follows
                        row order.

    returns: dictionary - western conference starters and reserves
             dictionary - eastern conference starters and reserves
    '''
    def _pick_team(conference):
        # Builds the {'Starters', 'Reserves'} roster for one conference.
        pool = df_season[df_season['Conference'] == conference]
        ranked = list(pool['Player'])
        guards = list(pool[pool['G/F'] == 'G']['Player'])
        forwards = list(pool[pool['G/F'] == 'F']['Player'])

        # slicing (rather than indexing) tolerates pools that are too short
        team = {'Starters': guards[:2] + forwards[:3],
                'Reserves': guards[2:4] + forwards[3:6]}

        # add wildcards: best remaining players until there are 7 reserves
        for player in ranked:
            if len(team['Reserves']) >= 7:
                break
            if player not in team['Starters'] and player not in team['Reserves']:
                team['Reserves'].append(player)
        return team

    team_W = _pick_team('W')
    team_E = _pick_team('E')

    print(f'''Western Confernce \n  
              Starters - {team_W['Starters']} \n 
              Reserves - {team_W['Reserves']} \n\n
              
              Eastern Conference \n 
              Starters - {team_E['Starters']} \n 
              Reserves - {team_E['Reserves']}
              ''')

    return team_W, team_E

In [9]:
# print the drafted starters and reserves for both conferences
draft(df_2023)

Western Confernce 
  
              Starters - ['Luka Dončić', 'Ja Morant', 'Nikola Jokić', 'LeBron James', 'Anthony Davis'] 
 
              Reserves - ['Dejounte Murray', 'Stephen Curry', 'Rudy Gobert', 'Brandon Ingram', 'Paul George', 'Shai Gilgeous-Alexander', 'Devin Booker'] 


              
              Eastern Conference 
 
              Starters - ['Trae Young', 'James Harden', 'Joel Embiid', 'Giannis Antetokounmpo', 'Kevin Durant'] 
 
              Reserves - ['Zach LaVine', 'Darius Garland', 'DeMar DeRozan', 'Jayson Tatum', 'Jimmy Butler', 'Julius Randle', 'Kyle Kuzma']
              


- **Replace injured players**
    - **Stephen Curry**
    - **Kevin Durant**

**Select the Western Conference guard and Eastern Conference forward with the next highest probabilities of selection.**

In [10]:
# Western guards in ranked order; position 6 is the 7th-ranked guard.
# NOTE(review): presumably the six guards ranked above are the ones already
# on the drafted Western roster - re-check this index if the draft changes.
list(df_2023[(df_2023['Conference']=='W') & (df_2023['G/F']=='G')]['Player'])[6]

'Damian Lillard'

In [11]:
# Eastern forwards in ranked order; position 8 is the 9th-ranked forward.
# NOTE(review): presumably the eight forwards ranked above are the ones already
# on the drafted Eastern roster - re-check this index if the draft changes.
list(df_2023[(df_2023['Conference']=='E') & (df_2023['G/F']=='F')]['Player'])[8]

'Nikola Vučević'

# <ins>Final Pools</ins>

    Western Conference 
  
              Starters - ['Luka Dončić', 'Ja Morant', 'Nikola Jokić', 'LeBron James', 'Anthony Davis'] 
 
              Reserves - ['Dejounte Murray', 'Damian Lillard', 'Rudy Gobert', 'Paul George', 'Brandon Ingram', 'Devin Booker', 'Shai Gilgeous-Alexander']
              

     Eastern Conference 
 
              Starters - ['Trae Young', 'Zach LaVine', 'Joel Embiid', 'Giannis Antetokounmpo', 'Jayson Tatum'] 
 
              Reserves - ['James Harden', 'Darius Garland', 'Nikola Vučević', 'DeMar DeRozan', 'Jimmy Butler', 'Julius Randle', 'Kyle Kuzma']