In [183]:
import bs4
import pandas as pd
import numpy as np
from src.util.util_dicts import nba_teams_post_2000, home_map
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import time
import random

## Scraping Player logs

#### Oct 24th Thinking

- Got way to search player game logs by season, next need to clean up the returned table into a single clean df
- Showing need to extract all selenium related functions into their own specific python file in the future (selenium_helper.py)


#### Oct 23rd Thinking

Need to start with some base functions that will get called a ton, these will include:

- Open, navigate, pull single year player game log (input player, year, output raw HTML table)
- Take HTML table, parse into rough df that needs formatting
- Take rough df and clean, output single year player stats

Once this is ready, create list of players that played since 2000, could be its own table? (player name, first year, last year)

For now lets use LBJ as our first player to put this in practice on

In [5]:
first = "Lebron"
last = "James"

In [7]:
# TODO extract into player specific helpers/util

def player_name_to_bbref_code(first, last):
    if len(last) >= 5:
        return "{}/{}{}01".format(last[:1], last[:5], first[:2]).lower()
    else:
        return "{}/{}{}01".format(last[:1], last, first[:2]).lower()

In [28]:
# TODO extract into player specific helpers/util


def get_player_season_games_log(first, last, season):
    player_code = player_name_to_bbref_code(first, last)
    url = f"https://www.basketball-reference.com/players/{player_code}/gamelog/{season}"
    html = get_webpage_html(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    table = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == 'pgl_basic')
    df = pd.read_html(str(table))[0]
    return df

In [33]:
# TODO extract into selenium specific helper/util

# Options for selenium
options = Options()
options.page_load_strategy = 'eager'  # Faster load so it does not wait for video ads to render
options.add_argument("--headless")  # Run Chrome in headless mode

def get_webpage_html(url):
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    html = driver.page_source
    driver.close()
    return html

In [139]:
df = get_player_season_games_log(first, last, 2023)

In [142]:
df = clean_player_games_log(df)

In [40]:
df_original = df.copy()

In [141]:
# Specific for the player game log table

# TODO, investigate try/catch for this in case missing values?
def column_type_conversion(df):
    df.loc[:, 'tm_game_num'] = df['tm_game_num'].astype(int)
    df.loc[:, 'p_game_num'] = df['p_game_num'].astype(int)
    df.loc[:, 'home'] = df['home'].astype(bool)
    df.loc[:, 'GS'] = df['GS'].astype(bool)
    df.loc[:, 'FG'] = df['FG'].astype(int)
    df.loc[:, 'FGA'] = df['FGA'].astype(int)
    df.loc[:, 'FG%'] = df['FG%'].astype(float)
    df.loc[:, '3P'] = df['3P'].astype(int)
    df.loc[:, '3PA'] = df['3PA'].astype(int)
    df.loc[:, '3P%'] = df['3P%'].astype(float)
    df.loc[:, 'FT'] = df['FT'].astype(int)
    df.loc[:, 'FTA'] = df['FTA'].astype(int)
    df.loc[:, 'FT%'] = df['FT%'].astype(float)
    df.loc[:, 'ORB'] = df['ORB'].astype(int)
    df.loc[:, 'DRB'] = df['DRB'].astype(int)
    df.loc[:, 'TRB'] = df['TRB'].astype(int)
    df.loc[:, 'AST'] = df['AST'].astype(int)
    df.loc[:, 'STL'] = df['STL'].astype(int)
    df.loc[:, 'BLK'] = df['BLK'].astype(int)
    df.loc[:, 'TOV'] = df['TOV'].astype(int)
    df.loc[:, 'PF'] = df['PF'].astype(int)
    df.loc[:, 'PTS'] = df['PTS'].astype(int)
    df.loc[:, 'GmSc'] = df['GmSc'].astype(float)
    # df.loc[:, 'PM'] = df['PM'].astype(int)
    df.loc[:, 'age_yrs'] = df['age_yrs'].astype(int)
    df.loc[:, 'age_days'] = df['age_days'].astype(int)
    df.loc[:, 'score_diff'] = df['score_diff'].astype(int)    
    if type(df.date[1]) == str:
        df.loc[:, 'date'] = pd.to_datetime(df['date'], format='mixed')
    return df

In [137]:
col_order = ['gameID', 'tm_game_num', 'p_game_num', 'date', 'age_yrs', 'age_days', 'Tm', 
             'Opp', 'home','result', 'score_diff', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', 
             '3PA', '3P%', 'FT','FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 
             'TOV', 'PF','PTS', 'GmSc', 'PM',] # Specific column order because cleaner

home_map = {np.nan: 1, "@": 0}

def extract_date(dt): # To create gameID column
    return dt.date().strftime("%Y%m%d")

def clean_player_games_log(df):
    
    df = df[df['G'] != 'G'].copy() # remove formatting rows
    column_rename = {'Rk':'tm_game_num', 'G':'p_game_num', 'Date': 'date', 'Age':'age','Unnamed: 5':'home', 'Unnamed: 7':'result','+/-':'PM'} 
    df.rename(columns=column_rename, inplace=True) # Rename necessary columns

    df = df[df['GS'] != 'Inactive'].copy() # Remove rows where the player was inactive for the game
    df = df[df['GS'] != 'Did Not Dress'].copy() # Remove rows where player was not active (for player stats research would want to keep, but for ML not needed
    df = df[df['GS'] != 'Did Not Play'].copy() # Remove rows where player was not active, mutual decision? lol
    df = df[df['GS'] != 'Not With Team'].copy() # Remove rows where player was not with team?


    # NOTE: Will be more of these to drop, e.g. did not dress
    
    df.loc[:, 'home'] = df['home'].replace(home_map) # Map home to a bool column, true = home

    # Split compound columns
    df[['age_yrs', 'age_days']] = df['age'].str.split('-', expand=True)
    df[['result', 'score_diff']] = df['result'].str.split('(', expand=True)
    df['score_diff'] = df['score_diff'].str.rstrip(')')

    df = df.drop(columns = ['age'])

    
    df = column_type_conversion(df)

    df['gameID'] = df['date'].apply(extract_date) + df["Tm"] + df["Opp"]

    df = df[col_order]
    return df
    

In [124]:
df = df_original.copy()

In [125]:
df = clean_player_games_log(df)

In [145]:
# Test to import all lebron james seasons to single df

df = pd.DataFrame()
first = "Lebron"
last = "James"

for year in range(2004, 2024):
    temp = get_player_season_games_log(first, last, year)
    temp = clean_player_games_log(temp)
    df = pd.concat([df, temp])  
    print('appended ', year)

df

appended  2004
appended  2005
appended  2006
appended  2007
appended  2008
appended  2009
appended  2010
appended  2011
appended  2012
appended  2013
appended  2014
appended  2015
appended  2016
appended  2017
appended  2018
appended  2019
appended  2020
appended  2021
appended  2022
appended  2023


Unnamed: 0,gameID,tm_game_num,p_game_num,date,age_yrs,age_days,Tm,Opp,home,result,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,PM
0,20031029CLESAC,1,1,2003-10-29 00:00:00,18,303,CLE,SAC,False,L,...,4,6,9,4,0,2,3,25,24.7,-9
1,20031030CLEPHO,2,2,2003-10-30 00:00:00,18,304,CLE,PHO,False,L,...,10,12,8,1,0,7,1,21,14.7,-3
2,20031101CLEPOR,3,3,2003-11-01 00:00:00,18,306,CLE,POR,False,L,...,4,4,6,2,0,2,3,8,5.0,-21
3,20031105CLEDEN,4,4,2003-11-05 00:00:00,18,310,CLE,DEN,True,L,...,9,11,7,2,3,2,1,7,11.2,-3
4,20031107CLEIND,5,5,2003-11-07 00:00:00,18,312,CLE,IND,False,L,...,5,5,3,0,0,7,2,23,9.0,-7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,20230402LALHOU,78,51,2023-04-02 00:00:00,38,93,LAL,HOU,False,W,...,8,10,11,0,1,1,1,18,19.4,+23
81,20230404LALUTA,79,52,2023-04-04 00:00:00,38,95,LAL,UTA,False,W,...,5,5,6,1,1,5,2,37,25.3,-7
82,20230405LALLAC,80,53,2023-04-05 00:00:00,38,96,LAL,LAC,False,L,...,8,8,7,1,1,6,0,33,26.4,-10
84,20230407LALPHO,81,54,2023-04-07 00:00:00,38,98,LAL,PHO,True,W,...,6,6,6,0,0,5,1,16,5.3,+11


In [146]:
df_lebron = df.copy()

In [147]:
# Test to import all Jokic seasons to single df

df = pd.DataFrame()
first = "Nikola"
last = "Jokic"

for year in range(2016, 2024):
    temp = get_player_season_games_log(first, last, year)
    temp = clean_player_games_log(temp)
    df = pd.concat([df, temp])  
    print('appended ', year)

df_jokic = df.copy()
df_jokic

appended  2016
appended  2017
appended  2018
appended  2019
appended  2020
appended  2021
appended  2022
appended  2023


Unnamed: 0,gameID,tm_game_num,p_game_num,date,age_yrs,age_days,Tm,Opp,home,result,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,PM
0,20151028DENHOU,1,1,2015-10-28 00:00:00,20,251,DEN,HOU,False,W,...,0,0,0,0,0,1,0,2,0.7,+1
1,20151030DENMIN,2,2,2015-10-30 00:00:00,20,253,DEN,MIN,True,L,...,6,9,1,0,0,2,3,10,7.8,+10
2,20151101DENOKC,3,3,2015-11-01 00:00:00,20,255,DEN,OKC,False,L,...,1,4,0,0,0,1,2,8,3.9,-2
4,20151105DENUTA,5,4,2015-11-05 00:00:00,20,259,DEN,UTA,True,L,...,2,3,0,0,1,1,1,5,2.8,-6
5,20151106DENGSW,6,5,2015-11-06 00:00:00,20,260,DEN,GSW,False,L,...,2,4,1,1,1,1,1,4,4.2,-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,20230322DENWAS,73,65,2023-03-22 00:00:00,28,31,DEN,WAS,False,W,...,7,12,7,3,0,2,1,31,35.7,+28
76,20230325DENMIL,74,66,2023-03-25 00:00:00,28,34,DEN,MIL,True,W,...,5,6,11,1,0,3,2,31,27.7,+19
77,20230327DENPHI,75,67,2023-03-27 00:00:00,28,36,DEN,PHI,True,W,...,14,17,12,0,2,3,3,25,31.6,+11
81,20230404DENHOU,79,68,2023-04-04 00:00:00,28,44,DEN,HOU,False,L,...,8,10,4,2,3,8,1,14,9.6,-21


In [None]:
'2021-01-01'

In [154]:
jokic_recent = df_jokic[df_jokic['date'] >= pd.to_datetime('2021-01-01')]

In [155]:
jokic_bt = len(df_jokic[df_jokic['PTS'] < 15])
jokic_total = df_jokic.shape[0]
lebron_bt = len(df_lebron[df_lebron['PTS'] < 15])
lebron_total = df_lebron.shape[0]
print(f"Jokic games under threshold {jokic_bt} out of {jokic_total}, below threshold {jokic_bt/jokic_total}% of the time")
print(f"Lebron games under threshold {lebron_bt} out of {lebron_total}, below threshold {lebron_bt/lebron_total}% of the time")

Jokic games under threshold 173 out of 596, below threshold 0.2902684563758389% of the time
Lebron games under threshold 59 out of 1421, below threshold 0.04152005629838142% of the time


In [156]:
jokic_bt = len(jokic_recent[jokic_recent['PTS'] < 15])
jokic_total = jokic_recent.shape[0]
lebron_bt = len(df_lebron[df_lebron['PTS'] < 15])
lebron_total = df_lebron.shape[0]
print(f"Jokic games under threshold {jokic_bt} out of {jokic_total}, below threshold {jokic_bt/jokic_total}% of the time")
print(f"Lebron games under threshold {lebron_bt} out of {lebron_total}, below threshold {lebron_bt/lebron_total}% of the time")

Jokic games under threshold 18 out of 211, below threshold 0.08530805687203792% of the time
Lebron games under threshold 59 out of 1421, below threshold 0.04152005629838142% of the time


## Scrape NBA Players list

Need to create a list of all NBA players that have played in the league since 2000, use the BBref player directory, pull each last name letter (A->Z), clean the table, will have list of all players in nba/aba history along with their first and last year in the league

In [179]:
def get_player_years_active(letter):
    url = f"https://www.basketball-reference.com/players/{letter}"
    html = get_webpage_html(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    table = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == 'players')
    df = pd.read_html(str(table))[0]
    return df

In [224]:
def clean_players_table(df):
    df = df[df['From'] != 'From']
    cols_to_drop = ['Colleges', 'Birth Date', 'Wt', 'Ht', 'Pos']
    df = df.drop(columns = cols_to_drop)
    df[['first_name', 'last_name']] = df['Player'].str.split(' ', n=1, expand=True)
    df['last_name'] = df['last_name'].str.rstrip('*')
    df['From'] = df['From'].astype(int)
    df['To'] = df['To'].astype(int)
    df = df[['first_name', 'last_name','From','To']]
    return df

In [185]:
players_df = pd.DataFrame()

for i in range(ord('a'), ord('z')+1):
    print(chr(i))
    if(chr(i) == 'x'):
        continue
    temp = get_player_years_active(chr(i))
    players_df = pd.concat([players_df, temp])
    num = random.randint(1, 3) # Otherwise might have basketball ref find out you are a bot
    print(f"sleeping {num}")
    time.sleep(num)

players_df = clean_players_table(players_df)
players_df = players_df[(players_df['From'] > 2000) | (players_df['To'] > 2000)]
player_df.to_csv('players_active_in_2000s.csv')

a
sleeping 4
b
sleeping 5
c
sleeping 1
d
sleeping 2
e
sleeping 1
f
sleeping 2
g
sleeping 4
h
sleeping 1
i
sleeping 3
j
sleeping 2
k
sleeping 2
l
sleeping 2
m
sleeping 5
n
sleeping 3
o
sleeping 5
p
sleeping 3
q
sleeping 5
r
sleeping 3
s
sleeping 5
t
sleeping 5
u
sleeping 2
v
sleeping 1
w
sleeping 4
x
y
sleeping 2
z
sleeping 3


In [190]:
players_df

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240,"June 24, 1968",Duke
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235,"April 7, 1946",Iowa State
2,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225,"April 16, 1947",UCLA
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162,"March 9, 1969",LSU
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223,"November 3, 1974","Michigan, San Jose State"
...,...,...,...,...,...,...,...,...
15,Ante Žižić,2018,2020,F-C,6-10,266,"January 4, 1997",
16,Jim Zoet,1983,1983,C,7-1,240,"December 20, 1953",Kent State University
17,Bill Zopf,1971,1971,G,6-1,170,"June 7, 1948",Duquesne
18,Ivica Zubac,2017,2023,C,7-0,240,"March 18, 1997",


In [188]:
players_df_orig = players_df.copy()

In [225]:
test = clean_players_table(players_df)

In [228]:
players_df = clean_players_table(players_df)
players_df

Unnamed: 0,first_name,last_name,From,To
0,Alaa,Abdelnaby,1991,1995
1,Zaid,Abdul-Aziz,1969,1978
2,Kareem,Abdul-Jabbar,1970,1989
3,Mahmoud,Abdul-Rauf,1991,2001
4,Tariq,Abdul-Wahad,1998,2003
...,...,...,...,...
15,Ante,Žižić,2018,2020
16,Jim,Zoet,1983,1983
17,Bill,Zopf,1971,1971
18,Ivica,Zubac,2017,2023


In [231]:
start_post_2000 = players_df[(players_df['From'] > 2000) | (players_df['To'] > 2000)]


In [232]:
start_post_2000

Unnamed: 0,first_name,last_name,From,To
3,Mahmoud,Abdul-Rauf,1991,2001
4,Tariq,Abdul-Wahad,1998,2003
5,Shareef,Abdur-Rahim,1997,2008
9,Álex,Abrines,2017,2019
10,Precious,Achiuwa,2021,2023
...,...,...,...,...
12,Derrick,Zimmerman,2006,2006
13,Stephen,Zimmerman,2017,2017
14,Paul,Zipser,2017,2018
15,Ante,Žižić,2018,2020
