In [9]:
import datetime as dt
import logging
import os
import re
import time
from io import StringIO
from pprint import pprint

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests
from bs4 import BeautifulSoup, Tag
from dotenv import dotenv_values

# Route DEBUG-and-above log records to nba_scraping.log (append mode, UTF-8) using a
# "timestamp - level - message" layout.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', filename='nba_scraping.log', filemode='a', encoding='utf-8', datefmt='%Y-%m-%d %H:%M:%S')

# Load the key/value pairs stored in the project's .env file.
# NOTE(review): absolute, machine-specific path; `secrets` is not referenced by any other visible cell.
secrets = dotenv_values(r'C:\Users\lianz\Python\personal_projects\nba-airflow\.env')

In [2]:
#TODO: perform analysis in Jupyter notebook - decide on information to query
#TODO: choose information to store in RDBMS
#TODO: define schema in RDBMS
#TODO: build airflow pipeline to extract, transform and load into RDBMS
#TODO: build dash (dash over streamlit for adaptability)
#TODO: deploy dash
#TODO: have highlighted areas in basketball court that shows top 10 players in every range of shooting based on FG%

># Request for Traditional Endpoint Data

# Scrape from NBA.com

In [10]:
# Endpoint queried by this cell; the trailing comment lists the other endpoints that were tried.
url = "https://stats.nba.com/stats/leaguedashplayerstats?" # /leaguedashplayerstats?, /playerestimatedmetrics?, /leaguedashplayerclutch?, /leagueleaders? /playergamelogs?, /leaguedashplayershotlocations?

# Query-string parameters. Empty strings mean "no filter"; the inline comments record the
# accepted values that were discovered while exploring the API.
parameters = {
'College': '', 
'Conference': '', # East, West
'Country': '',
'DateFrom': '',
'DateTo': '',
'DistanceRange': '', # (Only for leaguedashplayershotlocations) 5ft Range, 8ft Range, By Zone
'Division': '', # Central, Atlantic, Southeast, Southwest, Northwest, Pacific 
'DraftPick': '', # 1st Round, 2nd Round, 1st Pick, Lottery Pick, Top 5 Pick, Top 10 Pick, Top 15 Pick, Top 20 Pick, Top 25 Pick, Picks 11 Thru 20, Picks 21 Thru 30, Undrafted
'DraftYear': '',
'GameScope': '',
'GameSegment': '', # First Half, Second Half, Overtime
'Height': '', # LT 6-0, GT 6-9
'LastNGames': '0',
'LeagueID': '00',
'Location': '', # Home, Road
'MeasureType': 'Base', # Base, Advanced, Misc, Scoring, Usage, Opponent, Defense, 
'Month': '0', # January=4, February=5, ..., September=9, October=1, November=2, December=3
'OpponentTeamID': '0',
'Outcome': '', # W, L
'PORound': '0', #Conference Quarter-Finals=1, Conference Semi-Finals=2, Conference Finals=3, Finals=4
'PaceAdjust': 'N',
'PerMode': 'Totals', #Per Mode: Totals, PerGame, Per100Possessions, Per100Plays, Per48, Per40, Per36, PerMinute, PerPossession, PerPlay, MinutesPer
'Period': '0', #Quarter: 1,2,3,4,5,6,7,8,9,10
'PlayerExperience': '', # Rookie, Sophomore, Veteran
'PlayerPosition': '', # Center=C, Forward=F, Guard=G
'PlusMinus': 'N',
'Rank': 'N',
'Season': '2022-23', # YYYY-YY, All Time for leagueleaders endpoint
'SeasonSegment': '', # Pre All-Star, Post All-Star
'SeasonType': 'Regular Season', # Season Type: Regular Season, Playoffs, Pre Season, All Star, PlayIn
'StatCategory': '', # (Only for leagueleaders endpoint) PTS, MIN, FGA, FGM, TOV, FG_PCT, FG3_PCT, FG3A, FG3M, FTM, FTA, FT_PCT, OREB, DREB, REB, AST, BLK, STL, 
'ShotClockRange': '', # 24-22, 22-18 Very Early, 18-15 Very Early, 15-7 Average, 7-4 Late, 4-0 Very Late
'StarterBench': '', # Starters, Bench
'TeamID': '0',
'VsConference': '', # East, West
'VsDivision': '', # Central, Atlantic, Southeast, Southwest, Northwest, Pacific 
'Weight': '', # GT 200, LT 200, 
'AheadBehind': '', # (Only for clutch endpoint) Ahead or Behind, Behind or Tied, Ahead or Tied
'ActiveFlag': '', # (Only for leagueleaders endpoint) No, Yes
'ClutchTime': '', # (Only for clutch endpoint) Last 5 mins, Last 4 mins, Last 3 mins, Last 2 mins, Last 1 mins, Last 30 Seconds, Last 10 Seconds 
'PointDiff': '', # (Only for clutch endpoint) 5, 4, 3, 2, 1
}

# Browser-like request headers (Chrome 112 on Windows, Origin/Referer set to nba.com).
# NOTE(review): presumably required for stats.nba.com to answer non-browser clients — confirm.
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Connection': 'keep-alive',
'Host': 'stats.nba.com',
'Origin': 'https://www.nba.com',
'Referer': 'https://www.nba.com/',
'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
}

# Issue the GET request; `r` holds the raw Response until it is decoded.
# NOTE(review): no timeout is passed, so a stalled connection would block this cell indefinitely.
r = requests.request('GET', url=url, headers=headers, params=parameters)

In [13]:
# Decode the JSON body and rebind `r` from the Response object to the parsed payload.
# NOTE(review): because the name is reused, running this cell twice without re-running the
# request cell calls .json() on the already-decoded payload and fails.
r = r.json()

In [None]:
# Show the headers of the first result set (treats `resultSets` as a sequence).
# NOTE(review): the cells below index `resultSets` by key ('headers', 'rowSet') instead, so the
# two access patterns cannot both work on one response — presumably the endpoint URL was
# switched between runs; confirm which endpoint `r` is expected to hold here.
r['resultSets'][0]['headers']

In [None]:
# Split the flat stat column names (everything after the first six columns of the second
# header entry) into consecutive chunks of `n_group` names each.
n_group = 3
to_be_grouped = r['resultSets']['headers'][1]['columnNames'][6:]
grouped = [
    to_be_grouped[offset:offset + n_group]
    for offset in range(0, len(to_be_grouped), n_group)
]

grouped

In [None]:
# Top-level column labels come from the FIRST header entry of the payload. `resultSets`
# is indexed by key here, exactly as in the grouping cell (`['headers'][1]`) and the
# DataFrame cell (`['rowSet']`); the previous `[0]['headers']['columnNames']` lookup could
# not succeed for that shape.
shotDistance = r['resultSets']['headers'][0]['columnNames']
# The first chunk produced by the grouping cell is the per-distance stat triple.
stats_category = grouped[0]

In [None]:
# Build a two-level column index as the cartesian product (shot distance x stat category).
# NOTE(review): `index` is reassigned to a hand-written list of tuples in the following cell,
# so this MultiIndex is overwritten before anything uses it.
new_column_index = [shotDistance, stats_category]
index = pd.MultiIndex.from_product(new_column_index, names=['shotDistance','category'])

In [7]:
# Two-level column labels for the shot-location table: six identifier columns with an
# empty top level, followed by FGM / FGA / FG_PCT for each of the nine distance ranges.
index = [
    ("", identifier)
    for identifier in (
        'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'NICKNAME',
    )
] + [
    (distance, stat)
    for distance in (
        'Less Than 5 ft.', '5-9 ft.', '10-14 ft.', '15-19 ft.', '20-24 ft.',
        '25-29 ft.', '30-34 ft.', '35-39 ft.', '40+ ft.',
    )
    for stat in ('FGM', 'FGA', 'FG_PCT')
]

In [None]:
# Turn the payload rows into a DataFrame whose columns are the two-level labels in `index`,
# then display it.
df = pd.DataFrame.from_records(r['resultSets']['rowSet'], columns=pd.MultiIndex.from_tuples(index))
df

In [None]:
# Print each two-level column label flattened to a single string: levels joined by a space,
# with the leading blank left by an empty top level stripped.
for i in df.columns.to_flat_index():
    print(" ".join(i).strip())

### Define years to scrape

In [11]:
# Inclusive range of season start years to scrape.
start_year = 2022
end_year = 2023

# Season labels in the 'YYYY-YY' form the API expects (2022 -> '2022-23').
# Built from a range instead of incrementing `start_year`, so the configuration value is
# left untouched and re-running the cell always produces the same list.
years = [
    f'{season_start}-{str(season_start + 1)[-2:]}'
    for season_start in range(start_year, end_year + 1)
]

### Function for Scraping

In [13]:
def scrape_nba(year, **kwargs):
    """Fetch one season of player shot-location stats from stats.nba.com.

    Sends a GET request to the ``leaguedashplayershotlocations`` endpoint
    (``DistanceRange='5ft Range'``, ``PerMode='Totals'``,
    ``SeasonType='Regular Season'``) and returns the decoded JSON payload.

    Args:
        year: Season label sent as the ``Season`` query parameter, in the
            ``YYYY-YY`` form (e.g. ``'2022-23'``).
        **kwargs: Accepted so existing call sites keep working; not used.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
    """
    url = "https://stats.nba.com/stats/leaguedashplayershotlocations?" # /leaguedashplayerstats?, /playerestimatedmetrics?, /leaguedashplayerclutch?, /leagueleaders? /playergamelogs?, /leaguedashplayershotlocations?

    # Empty strings mean "no filter"; inline comments list the accepted values.
    parameters = {
    # 'AheadBehind': '', # (Only for clutch endpoint) Ahead or Behind, Behind or Tied, Ahead or Tied
    # 'ActiveFlag': '', # (Only for leagueleaders endpoint) No, Yes
    # 'ClutchTime': '', # (Only for clutch endpoint) Last 5 mins, Last 4 mins, Last 3 mins, Last 2 mins, Last 1 mins, Last 30 Seconds, Last 10 Seconds
    'College': '',
    'Conference': '', # East, West
    'Country': '',
    'DateFrom': '',
    'DateTo': '',
    'DistanceRange': '5ft Range', # (Only for leaguedashplayershotlocations) 5ft Range, 8ft Range, By Zone
    'Division': '', # Central, Atlantic, Southeast, Southwest, Northwest, Pacific
    'DraftPick': '', # 1st Round, 2nd Round, 1st Pick, Lottery Pick, Top 5 Pick, Top 10 Pick, Top 15 Pick, Top 20 Pick, Top 25 Pick, Picks 11 Thru 20, Picks 21 Thru 30, Undrafted
    'DraftYear': '',
    'GameScope': '',
    'GameSegment': '', # First Half, Second Half, Overtime
    'Height': '', # LT 6-0, GT 6-9
    'LastNGames': '0',
    'LeagueID': '00',
    'Location': '', # Home, Road
    'MeasureType': 'Base', # Base, Advanced, Misc, Scoring, Usage, Opponent, Defense,
    'Month': '0', # January=4, February=5, ..., September=9, October=1, November=2, December=3
    'OpponentTeamID': '0',
    'Outcome': '', # W, L
    'PORound': '0', #Conference Quarter-Finals=1, Conference Semi-Finals=2, Conference Finals=3, Finals=4
    'PaceAdjust': 'N',
    'PerMode': 'Totals', #Per Mode: Totals, PerGame, Per100Possessions, Per100Plays, Per48, Per40, Per36, PerMinute, PerPossession, PerPlay, MinutesPer
    'Period': '0', #Quarter: 1,2,3,4,5,6,7,8,9,10
    'PlayerExperience': '', # Rookie, Sophomore, Veteran
    'PlayerPosition': '', # Center=C, Forward=F, Guard=G
    'PlusMinus': 'N',
    # 'PointDiff': '', # (Only for clutch endpoint) 5, 4, 3, 2, 1
    'Rank': 'N',
    'Season': year, # YYYY-YY, All Time for leagueleaders endpoint
    'SeasonSegment': '', # Pre All-Star, Post All-Star
    'SeasonType': 'Regular Season', # Season Type: Regular Season, Playoffs, Pre Season, All Star, PlayIn
    'StatCategory': '', # (Only for leagueleaders endpoint) PTS, MIN, FGA, FGM, TOV, FG_PCT, FG3_PCT, FG3A, FG3M, FTM, FTA, FT_PCT, OREB, DREB, REB, AST, BLK, STL,
    'ShotClockRange': '', # 24-22, 22-18 Very Early, 18-15 Very Early, 15-7 Average, 7-4 Late, 4-0 Very Late
    'StarterBench': '', # Starters, Bench
    'TeamID': '0',
    'VsConference': '', # East, West
    'VsDivision': '', # Central, Atlantic, Southeast, Southwest, Northwest, Pacific
    'Weight': '', # GT 200, LT 200,
    }

    # Browser-like headers (Chrome 112 on Windows, Origin/Referer set to nba.com).
    headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'stats.nba.com',
    'Origin': 'https://www.nba.com',
    'Referer': 'https://www.nba.com/',
    'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': "Windows",
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    }

    # requests waits forever by default; bound the wait (seconds) so one stalled
    # connection cannot hang the whole multi-season scrape.
    response = requests.request('GET', url=url, headers=headers, params=parameters, timeout=30)
    # Surface HTTP errors explicitly instead of failing later while decoding an error page.
    response.raise_for_status()
    return response.json()

In [14]:
def process_scraped_data(result, index):
    """Convert a shot-location payload into a DataFrame with two-level columns.

    Args:
        result: Decoded payload; its rows are read from ``result['resultSets']['rowSet']``.
        index: Sequence of ``(top_level, column_name)`` tuples, one per column.

    Returns:
        A DataFrame with one row per payload row and MultiIndex columns built from ``index``.
    """
    rows = result['resultSets']['rowSet']
    column_labels = pd.MultiIndex.from_tuples(index)
    return pd.DataFrame.from_records(rows, columns=column_labels)

### Scrape

In [None]:
# Scrape every season in `years`, tag each resulting frame with its season label and
# collect the frames in `combined_df` (a list, concatenated in a later cell).
combined_df = []
begin_time = time.time()
lag_time = 0  # running total of the random pauses, in seconds
for year in years:
    r = scrape_nba(year)
    df = process_scraped_data(result=r, index=index)
    df['year'] = year
    combined_df.append(df)
    print(f'Successfully retrieved data for Season {year}')
    # Random pause between requests; randint's upper bound is exclusive, so 5-24 seconds.
    lag = np.random.randint(low=5, high=25)
    lag_time += lag
    print(f'Delaying for: {lag}s\nCurrent total lag time: {lag_time}s')
    time.sleep(lag)
# Wall-clock duration of the whole loop, pauses included.
time_taken = time.time() - begin_time
print(f'Total time taken: {round(time_taken/60, 2)}mins')

In [None]:
# Stack the per-season frames row-wise into a single frame.
final_df = pd.concat(combined_df, axis=0)

In [None]:
# Write the combined frame to the working directory without the row index.
# NOTE(review): the columns are two-level, so the file presumably gets two header rows —
# confirm before reading it back.
final_df.to_csv('player_shotLocations.csv', index=False)

In [None]:
# Player lookup: id, name and nickname, keeping the first row seen for each PLAYER_ID.
player_ID = final_df[[("", 'PLAYER_ID'),("", 'PLAYER_NAME'), ("", 'NICKNAME')]].drop_duplicates(subset=("", 'PLAYER_ID'))
# Team lookup: distinct (TEAM_ID, TEAM_ABBREVIATION) pairs.
team_ID = final_df[[("", 'TEAM_ID'),("","TEAM_ABBREVIATION")]].drop_duplicates()

In [None]:
# Report the size of the two lookup tables. The season span in the label is derived
# from `years` (the seasons actually scraped) instead of a hard-coded "1996-2023",
# which did not match the configured range.
print(f'''
Number of players from {years[0]} to {years[-1]}: {len(player_ID)}
Number of teams from {years[0]} to {years[-1]}: {len(team_ID)}
''')

In [None]:
from nba_api import NBA

In [None]:
# Count the {"text", "stretch"} dicts built from the flattened column labels
# (one dict per entry of `index`).
len([
            {"text": name, "stretch": False} for name in [" ".join(i).strip()
                                                          for i in index]
        ])

In [None]:
# Read the CSV back using its second line as the header and its first column as the index.
# NOTE(review): if the file carries two header rows, header=1 keeps only the second one
# (the stat names), so the distance level would be lost — verify.
pd.read_csv('player_shotLocations.csv', header=1, index_col=0)

In [None]:
# Materialise every row of `df` as a plain tuple; itertuples() is called with its defaults,
# so the row index is the first element of each tuple.
[tuple(row) for row in df.itertuples()]

# Scrape from Basketball Reference

In [22]:
# Full team name -> three-letter code. process_schedule() looks up the Visitor and Home
# names here to build `game_id`; the home code is later inserted into the shot-chart URL.
# The tail of the mapping covers former franchise names (e.g. Seattle SuperSonics).
# NOTE(review): some codes (BKN, PHX, CHH for every Charlotte Hornets era, CAB, NCH) look
# different from the codes basketball-reference uses in its own URLs — verify against the
# site before relying on them for shot-chart requests.
team_abbs = {
"Atlanta Hawks": 'ATL',
"Brooklyn Nets": 'BKN',
"Boston Celtics": 'BOS',
"Charlotte Hornets": 'CHH',
"Chicago Bulls": 'CHI',
"Cleveland Cavaliers": 'CLE',
"Dallas Mavericks": 'DAL',
"Denver Nuggets": 'DEN',
"Detroit Pistons": 'DET',
"Golden State Warriors": 'GSW',
"Houston Rockets": 'HOU',
"Indiana Pacers": 'IND',
"Los Angeles Clippers": 'LAC',
"Los Angeles Lakers": 'LAL',
"Memphis Grizzlies": 'MEM',
"Miami Heat": 'MIA',
"Milwaukee Bucks": 'MIL',
"Minnesota Timberwolves": 'MIN',
"New Orleans Pelicans": 'NOP',
"New York Knicks": 'NYK',
"Oklahoma City Thunder": 'OKC',
"Orlando Magic": 'ORL',
"Philadelphia 76ers": 'PHI',
"Phoenix Suns": 'PHX',
"Portland Trail Blazers": 'POR',
"Sacramento Kings": 'SAC',
"San Antonio Spurs": 'SAS',
"Toronto Raptors": 'TOR',
"Utah Jazz": 'UTA',
"Washington Wizards": 'WAS',
'Seattle SuperSonics': 'SEA',
'Vancouver Grizzlies': 'VAN',
'New Jersey Nets': 'NJN',
'New Orleans Hornets': 'NOH',
'Charlotte Bobcats': 'CAB',
'New Orleans/Oklahoma City Hornets': 'NCH',
}

## Schedule Scraping

In [23]:
def get_schedule_html(year: str, month: str,):
    """Download and parse one month of a season's schedule page.

    Args:
        year: Year used in the ``NBA_{year}_games-{month}`` page name.
        month: Lower-case month name used in the page name (e.g. ``'april'``).

    Returns:
        The parsed ``BeautifulSoup`` document, or ``False`` when the page does not exist.

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with an error status other than 404.
    """
    # Bound the wait (seconds); requests would otherwise block forever on a stalled connection.
    games = requests.get(f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html#schedule", timeout=30)
    if games.status_code == 404:
        return False
    # Any other error status (rate limiting, server error) is reported instead of parsed.
    games.raise_for_status()
    soup = BeautifulSoup(games.content, "html.parser")
    # Fallback for a "not found" page served without a 404 status; a page with no <h1>
    # no longer raises AttributeError here.
    heading = soup.find('h1')
    if heading is not None and heading.text == 'Page Not Found (404 error)':
        return False
    return soup

In [24]:
def get_schedule_table(soup,):
    """Extract the first HTML table of a schedule page as a DataFrame.

    Args:
        soup: Parsed schedule page exposing ``find_all('table')``.

    Returns:
        The first table found, with the ``'Unnamed: 6'`` column removed.
    """
    # Passing literal HTML to read_html is deprecated in recent pandas; a file-like
    # wrapper is accepted by every version.
    tables_html = StringIO(str(soup.find_all('table')))
    df = pd.read_html(tables_html)[0].drop(['Unnamed: 6',], axis=1)
    return df

In [25]:
def process_schedule(df: pd.DataFrame):
    """Clean a raw schedule table and derive the per-game identifier columns.

    Rows whose ``Date`` is the literal ``'Playoffs'`` separator are dropped. The tip-off
    timestamp is parsed from ``Date`` plus ``Start (ET)`` (with ``'m'`` appended), the
    score/overtime/attendance columns are renamed, team names are mapped to codes through
    the module-level ``team_abbs`` and ``game_id`` is built as
    ``DateStr + Visitor_short + Home_short``.

    Args:
        df: Raw schedule table; it is not modified.

    Returns:
        A new DataFrame indexed by ``DateTime`` with ``year`` and ``month`` columns added.

    Raises:
        KeyError: A team name is missing from ``team_abbs``.
    """
    games = df.loc[df['Date'] != 'Playoffs'].copy()

    tipoff_text = games['Date'] + ' ' + games['Start (ET)'] + 'm'
    games['DateTime'] = pd.to_datetime(tipoff_text)
    games['DateStr'] = [dt.datetime.strftime(stamp, "%Y%m%d%H%M") for stamp in games['DateTime']]

    games = games.drop(columns=['Date', 'Start (ET)', 'Notes'])
    games = games.rename(columns={
        'Visitor/Neutral': 'Visitor',
        'PTS': 'Visitor PTS',
        'Home/Neutral': 'Home',
        'PTS.1': 'Home PTS',
        'Unnamed: 7': 'OT',
        'Attend.': 'Attendance',
    })

    games['Visitor_short'] = [team_abbs[team] for team in games['Visitor']]
    games['Home_short'] = [team_abbs[team] for team in games['Home']]
    games['game_id'] = games['DateStr'] + games['Visitor_short'] + games['Home_short']

    games = games.set_index('DateTime')
    games['year'] = games.index.year
    games['month'] = games.index.month
    return games

In [None]:
# Manual check of the three schedule helpers on a single page (2001, April).
# NOTE(review): rebinds the notebook-level names `soup` and `df`.
soup = get_schedule_html('2001','april')
df = get_schedule_table(soup)
df = process_schedule(df)
df

In [29]:
# Month slugs used in the schedule page names, listed from October through September.
MONTHS = [
    'october', 'november', 'december',
    'january', 'february', 'march',
    'april', 'may', 'june',
    'july', 'august', 'september',
]
# Every year from 2000 through the current calendar year, as strings.
YEARS = list(map(str, range(2000, dt.date.today().year + 1)))

In [21]:
# Scrape every (year, month) schedule page, pausing a few seconds before each request,
# and collect the cleaned monthly tables in `df_list`.
df_list = []
request_count = 0  # pages that existed and were downloaded
start = time.time()
for year in YEARS:
    for month in MONTHS:
        # Random pause before each request; randint's upper bound is exclusive (4 or 5 s).
        lag = np.random.randint(4,6)
        logging.info(f'Cycle lag time: {lag}')
        time.sleep(lag)
        logging.info(f'Processing {month}-{year} request!')
        try:
            soup = get_schedule_html(year, month)
        except Exception:
            logging.error(f'Could not process {month}-{year} request!', exc_info=True)
            continue
        # A falsy result means the page does not exist (no games that month).
        if not soup:
            continue
        request_count += 1
        try:
            df = get_schedule_table(soup)
            df = process_schedule(df)
            df_list.append(df)
            logging.info(f'Finished {month} request!\n')
        # `except Exception` rather than a bare except, so Ctrl-C / kernel interrupts
        # are not swallowed and the long-running loop can still be stopped.
        except Exception:
            logging.error(f'Error while processing {month}-{year}\n', exc_info=True)
            continue
end = time.time()
logging.info(f'Total time taken: {round(end-start)}s')
logging.info(f'Total requests count: {request_count}')
# The value is elapsed seconds divided by requests, so it is labelled as such; the guard
# avoids a ZeroDivisionError when no page could be fetched.
if request_count:
    logging.info(f'Average seconds per request: {round((end-start)/request_count)}')
full_df = pd.concat(df_list, axis=0)

In [27]:
# Display the combined schedule (one row per game, indexed by tip-off DateTime).
full_df

Unnamed: 0_level_0,Visitor,Visitor PTS,Home,Home PTS,OT,Attendance,Arena,DateStr,Visitor_short,Home_short,game_id,year,month
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2000-10-31 19:30:00,Charlotte Hornets,106,Atlanta Hawks,82,,18465,Philips Arena,200010311930,CHH,ATL,200010311930CHHATL,2000,10
2000-10-31 19:30:00,Cleveland Cavaliers,86,New Jersey Nets,82,,14505,Continental Airlines Arena,200010311930,CLE,NJN,200010311930CLENJN,2000,10
2000-10-31 19:30:00,Washington Wizards,86,Orlando Magic,97,,13349,TD Waterhouse Centre,200010311930,WAS,ORL,200010311930WASORL,2000,10
2000-10-31 20:00:00,Milwaukee Bucks,93,Dallas Mavericks,97,,16600,Reunion Arena,200010312000,MIL,DAL,200010312000MILDAL,2000,10
2000-10-31 20:00:00,Philadelphia 76ers,101,New York Knicks,72,,19763,Madison Square Garden (IV),200010312000,PHI,NYK,200010312000PHINYK,2000,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-01 20:30:00,Miami Heat,93,Denver Nuggets,104,,19528,Ball Arena,202306012030,MIA,DEN,202306012030MIADEN,2023,6
2023-06-04 20:00:00,Miami Heat,111,Denver Nuggets,108,,19537,Ball Arena,202306042000,MIA,DEN,202306042000MIADEN,2023,6
2023-06-07 20:30:00,Denver Nuggets,109,Miami Heat,94,,20019,Kaseya Center,202306072030,DEN,MIA,202306072030DENMIA,2023,6
2023-06-09 20:30:00,Denver Nuggets,108,Miami Heat,95,,20184,Kaseya Center,202306092030,DEN,MIA,202306092030DENMIA,2023,6


In [30]:
# Persist the schedule, merging with any previously saved file.
# The file lives in the project's data\ folder, which is where the next cell reads it from.
games_csv = r'C:\Users\lianz\Python\personal_projects\nba-airflow\data\games_played.csv'
if os.path.isfile(games_csv):
    # Read DateTime as an ordinary column so both frames have the same layout when stacked.
    read_df = pd.read_csv(games_csv, parse_dates=['DateTime'], dtype={'DateStr': 'str'})
    final_df = (
        pd.concat([read_df, full_df.reset_index()], axis=0, ignore_index=True)
        # keep='last' prefers the freshly scraped row, so games first saved before they
        # were played get their scores on a later run.
        .drop_duplicates(subset='game_id', keep='last')
        # Restore DateTime as the index so the file keeps the layout written by the
        # else-branch; without this an extra integer column was added on every merge.
        .set_index('DateTime')
    )
    final_df.to_csv(games_csv)
else:
    full_df.to_csv(games_csv)

In [32]:
# Load the saved schedule: first column as the index, DateTime parsed as dates, DateStr kept
# as text, rows ordered by DateTime.
# NOTE(review): reads from the data\ sub-folder of the project directory.
df_final = pd.read_csv(r'C:\Users\lianz\Python\personal_projects\nba-airflow\data\games_played.csv', index_col=0, parse_dates=['DateTime'], dtype={'DateStr':'str'}).sort_values('DateTime')

## Game shots Scraping

In [30]:
def request_soup_for_game(year: str, month: str, day: str, team: str):
    """Download and parse the shot-chart page of one game.

    Args:
        year: Four-digit year of the game date.
        month: Two-digit month of the game date.
        day: Two-digit day of the game date.
        team: Team code placed at the end of the page name (the caller passes the home team).

    Returns:
        The parsed ``BeautifulSoup`` document, or ``False`` when the page does not exist.

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with an error status other than 404.
    """
    url = f'https://www.basketball-reference.com/boxscores/shot-chart/{year}{month}{day}0{team}.html'
    # Bound the wait (seconds); requests would otherwise block forever on a stalled connection.
    page = requests.get(url=url, timeout=30)
    if page.status_code == 404:
        return False
    # Any other error status (rate limiting, server error) is reported instead of parsed.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    # Fallback for a "not found" page served without a 404 status; a page with no <h1>
    # no longer raises AttributeError here.
    heading = soup.find('h1')
    if heading is not None and heading.text == 'Page Not Found (404 error)':
        return False
    return soup


def get_shot_area(soup):
    """Return every ``<div class="shot-area">`` element found in the parsed page."""
    return soup.find_all("div", class_="shot-area")


def get_game_meta(soup):
    """Return the text of the first ``<div class="scorebox_meta">`` element of the page.

    Raises:
        IndexError: The page contains no such element.
    """
    meta_blocks = soup.find_all("div", class_='scorebox_meta')
    first_block = meta_blocks[0]
    return first_block.text


def get_customdata(df: pd.DataFrame) -> list:
    """Return the rows of ``df`` as a list of plain tuples, row index excluded."""
    return [tuple(record) for record in df.itertuples(index=False)]

def create_df(shot_area, game_meta):
    """Parse shot-chart markup into a DataFrame with one row per shot.

    Every ``Tag`` child of the given elements, except the half-court image, is read as
    one shot. Its ``style`` attribute supplies two ``px`` values (y is ``470 - first``,
    x is ``second``), its ``tip`` attribute supplies the descriptive text, and the last
    entry of its ``class`` attribute is stored as the shot status.

    Args:
        shot_area: Iterable of elements exposing ``.contents`` (the shot-area divs).
        game_meta: Scorebox text containing the tip-off time, e.g.
            ``'10:00 PM, May 1, 2023'``.

    Returns:
        DataFrame with columns player_name, time_left, team_name, score_status,
        x_shot_pos, y_shot_pos, quarter, shot_status, full_text and datetime.
        ``quarter`` and ``shot_status`` are categorical.

    Raises:
        IndexError: A tooltip or ``game_meta`` does not match the expected text layout.
    """
    records = dict(player_name=[], time_left=[], team_name=[], score_status=[],
                   x_shot_pos=[], y_shot_pos=[], quarter=[], shot_status=[], full_text=[])
    for elem in shot_area:
        for content in elem.contents:
            # Skip bare text nodes and the half-court background image.
            if not isinstance(content, Tag) or re.search(r'alt="nbahalfcourt"', str(content)):
                continue
            shot_pos = re.findall(
                r'\d+(?=px)', str(content.attrs.get('style')))
            tooltip = content.attrs.get('tip')
            status = content.attrs.get('class')
            records['x_shot_pos'].append(int(shot_pos[1]))
            # Flip the vertical coordinate against a 470px-high court image.
            # NOTE(review): presumably the first px value is the CSS `top` offset — confirm.
            records['y_shot_pos'].append(470 - int(shot_pos[0]))
            records['quarter'].append(tooltip.split(',')[0])
            records['time_left'].append(re.findall(
                r'(?!\s)(\d+:\d+.\d)(?= remaining)', tooltip)[0])
            records['shot_status'].append(status[-1])
            # Group 1 starts with '<br>' (4 characters), which is sliced off.
            records['player_name'].append(re.findall(
                r"(?=<br>)(.*)((?= missed)|(?= made))", tooltip)[0][0][4:])
            # Group 1 starts with 'ft<br>' (6 characters). The greedy match keeps a
            # trailing 'now' ("Denver now leads ..."); removing it used to leave
            # "Denver " with a trailing space, so the result is stripped to give the
            # same team name whether or not 'now' was present.
            records['team_name'].append(re.findall(
                r"(?=ft<br>)(.*)((?= tied)|(?= now trails)|(?= trails)|(?= leads))", tooltip)[0][0][6:].replace('now', '').strip())
            records['score_status'].append(re.findall(
                r"(?=ft<br>).*", tooltip)[0][6:])
            records['full_text'].append(tooltip)

    df = pd.DataFrame.from_dict(records,).astype({'quarter': 'category',
                                                 'shot_status': 'category', })
    # The tip-off timestamp is the same for every shot of the game.
    df['datetime'] = dt.datetime.strptime(re.findall(
        r'\d+:\d+ \w+, \w+ \d+, \d+', game_meta)[0], "%I:%M %p, %B %d, %Y")

    return df

In [34]:
# Fresh accumulators for the shot-chart scrape.
df_list = []          # one DataFrame of shots per successfully processed game
failed_games = []     # game ids whose request or parsing failed
total_lag_time = 0    # running total of the random pauses, in seconds
# Select the ids by column name (the scraping loop also filters on 'game_id') instead of
# by position, so a change in column order cannot silently pick a different column.
game_ids_to_scrape = df_final['game_id']

In [None]:
# Scrape the shot chart of every game in `game_ids_to_scrape`, pausing before each request.
start_time = time.time()
logging.warning('\n\n\n-------------------------------------Start of Scraping Shot Locations!-------------------------------------\n\n\n')
for x in game_ids_to_scrape:
    # Random pause before each request; randint's upper bound is exclusive (4 or 5 s).
    lag = np.random.randint(4,6)
    total_lag_time += lag
    logging.info(f'Cycle lag time: {lag}')
    time.sleep(lag)
    game_rows = df_final.loc[df_final['game_id'] == x]
    # .iloc[0] is explicitly positional; a bare [0] on a Series with a non-integer
    # index relied on a deprecated label-to-position fallback.
    team = game_rows['Home_short'].iloc[0]
    # The frame's index supplies the game date used to build the page name.
    date = game_rows.index[0]
    year = '{:04d}'.format(date.year)
    month = '{:02d}'.format(date.month)
    day = '{:02d}'.format(date.day)
    logging.info(f'Processing {x} request!')
    try:
        test_soup = request_soup_for_game(year=year, month=month, day=day, team=team)
    except Exception:
        logging.error(f'Request failed for {x}', exc_info=True)
        failed_games.append(x)
        # Skip parsing: `test_soup` would still hold the PREVIOUS game's page, whose
        # shots would be stored under this game's id.
        continue
    if not test_soup:
        # The helper returns False when the page does not exist.
        logging.error(f'No shot chart page found for {x}\n')
        failed_games.append(x)
        continue
    logging.info(f'Successfully processed {x} request!')

    try:
        shot_area = get_shot_area(test_soup)
        game_meta = get_game_meta(test_soup)
        df = create_df(shot_area, game_meta)
        df['game_id'] = x
        df_list.append(df)
        logging.info(f'Successfully processed {x} shots!\n')

    except Exception:
        logging.error(f'Could not get and process shots for {x}\n', exc_info=True)
        failed_games.append(x)

elapsed = time.time() - start_time
logging.info(f'Total lag time: {total_lag_time} seconds | {round(total_lag_time / 60, 2)} minutes | {round(total_lag_time / 3600, 2)} hours')
logging.info(f'Failed games: {len(failed_games)} games out of {len(game_ids_to_scrape)} games')
logging.info(f'Time elapsed: {elapsed} seconds | {round(elapsed / 60, 2)} minutes | {round(elapsed / 3600, 2)} hours')
# Guard against an empty id list (would otherwise divide by zero).
if len(game_ids_to_scrape):
    logging.info(f'Averaged {round(elapsed / len(game_ids_to_scrape), 2)} seconds per game')

logging.warning('-------------------------------------End of Scraping Shot Locations!-------------------------------------')

pbp_all = pd.concat(df_list, axis=0)

### Write to Parquet (Append if exists)

In [85]:
# Append the freshly scraped shots to the parquet file, creating it on first use.
# The project folder is spelled 'nba-airflow' (hyphen), as in the .env path and in the
# cell that reads this file back; the previous 'nba_airflow' spelling wrote to a
# different directory than the one being read.
pbp_path = r'C:\Users\lianz\Python\personal_projects\nba-airflow\data\play_by_play.parquet'
if os.path.exists(pbp_path):
    pbp_df = pd.read_parquet(pbp_path)
    pbp_df = pd.concat([pbp_df, pbp_all])
    pbp_df.to_parquet(pbp_path)
else:
    pbp_all.to_parquet(pbp_path)

# Get NBA Players Info

In [10]:
def get_players_html(letter: str):
    """Download and parse the player index page for one letter.

    Args:
        letter: Lower-case letter used as the ``/players/{letter}/`` path segment.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``False`` when the page does not exist.

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with an error status other than 404.
    """
    # Bound the wait (seconds); requests would otherwise block forever on a stalled connection.
    players = requests.get(f"https://www.basketball-reference.com/players/{letter}/", timeout=30)
    if players.status_code == 404:
        return False
    # Any other error status (rate limiting, server error) is reported instead of parsed.
    players.raise_for_status()
    soup = BeautifulSoup(players.content, "html.parser")
    # Fallback for a "not found" page served without a 404 status; a page with no <h1>
    # no longer raises AttributeError here.
    heading = soup.find('h1')
    if heading is not None and heading.text == 'Page Not Found (404 error)':
        return False
    return soup

In [11]:
# The 26 lower-case ASCII letters, 'a' through 'z', one per player index page.
all_letters = list(map(chr, range(ord('a'), ord('z') + 1)))


# Position of 'a' in the list.
all_letters.index('a')

0

## Scraping NBA Players Info

In [70]:
# Scrape the player index table for every letter and save the combined table as CSV.
all_letters = [chr(i) for i in range(ord('a'), ord('z') + 1)]
failed_letters = []
letter_frames = []      # one table per successfully scraped letter
players_scraped = 0     # running row count across the collected tables
total_lag_time = 0

logging.warning('\n\n\n-------------------------------------Start of Scraping Player Info!-------------------------------------\n\n\n')

start_time = time.time()

for cycle, i in enumerate(all_letters, start=1):
    logging.info(f'Scraping letter {i}')

    try:
        letter_soup = get_players_html(i)
        if not letter_soup:
            # The helper returns False when the page does not exist.
            raise LookupError(f'no player index page for letter {i}')
        # read_html gets a file-like object; passing literal HTML is deprecated in recent pandas.
        loop_players_df = pd.read_html(StringIO(str(letter_soup.find_all('table', id='players')[0])))[0]

    except Exception as e:
        logging.error(f'Failed to scrape letter {i}: {e}\n')
        failed_letters.append(i)

    else:
        # Only reached on success, so a failed letter can no longer re-append the
        # previous letter's table (or hit an undefined name on the very first letter).
        logging.info(f'Scraped letter {i} | {len(loop_players_df)} players')
        letter_frames.append(loop_players_df)
        players_scraped += len(loop_players_df)
        logging.info(f'Concatenated letter {i} | {players_scraped} players')

    # Random pause between requests; randint's upper bound is exclusive (4 or 5 s).
    lag = np.random.randint(4,6)
    total_lag_time += lag

    logging.info(f'Cycle lag time: {lag}')
    logging.info(f'Total lag time after {cycle} cycles: {total_lag_time}\n')

    time.sleep(lag)

# Concatenate once at the end instead of growing a DataFrame inside the loop.
players_df = pd.concat(letter_frames, axis=0) if letter_frames else pd.DataFrame()
# set_index + reset_index moves 'Player' to the first column and renumbers the rows.
players_df = players_df.set_index('Player').reset_index()
# The project folder is spelled 'nba-airflow' (hyphen), matching the cell that reads this
# file back; the previous 'nba_airflow' spelling wrote to a different directory.
players_df.to_csv(r'C:\Users\lianz\Python\personal_projects\nba-airflow\data\players.csv', index=False)

elapsed = time.time() - start_time
logging.info(f'Total lag time: {total_lag_time} seconds | {round(total_lag_time / 60, 2)} minutes | {round(total_lag_time / 3600, 2)} hours')
logging.info(f'Failed letters: {len(failed_letters)} out of {len(all_letters)} letters')
logging.info(f'Time elapsed: {elapsed} seconds | {round(elapsed / 60, 2)} minutes | {round(elapsed / 3600, 2)} hours')
logging.info(f'Averaged {round(elapsed / len(all_letters), 2)} seconds per letter')

In [3]:
# Read the saved play-by-play (shot) data back into `df`.
# NOTE(review): pandas is imported again here, presumably so this cell can run on a fresh
# kernel without the import cell at the top; `df` is rebound to the shot data.
import pandas as pd
df = pd.read_parquet(r'C:\Users\lianz\Python\personal_projects\nba-airflow\data\play_by_play.parquet')

In [4]:
# Show every shot whose player name contains 'Nikola'.
df.loc[df['player_name'].str.contains('Nikola')]

Unnamed: 0,player_name,time_left,team_name,score_status,x_shot_pos,y_shot_pos,quarter,shot_status,full_text,datetime,game_id
95,Nikola Jokić,11:26.0,Denver,Denver trails 0-2,207,438,1st quarter,miss,"1st quarter, 11:26.0 remaining<br>Nikola Jokić...",2023-05-01 22:00:00,202305012200PHXDEN
96,Nikola Jokić,10:58.0,Denver,Denver now leads 3-2,329,207,1st quarter,make,"1st quarter, 10:58.0 remaining<br>Nikola Jokić...",2023-05-01 22:00:00,202305012200PHXDEN
97,Nikola Jokić,10:13.0,Denver,Denver now leads 5-4,219,434,1st quarter,make,"1st quarter, 10:13.0 remaining<br>Nikola Jokić...",2023-05-01 22:00:00,202305012200PHXDEN
98,Nikola Jokić,9:09.0,Denver,Denver trails 5-6,334,206,1st quarter,miss,"1st quarter, 9:09.0 remaining<br>Nikola Jokić ...",2023-05-01 22:00:00,202305012200PHXDEN
100,Nikola Jokić,8:24.0,Denver,Denver now leads 7-6,197,429,1st quarter,make,"1st quarter, 8:24.0 remaining<br>Nikola Jokić ...",2023-05-01 22:00:00,202305012200PHXDEN
...,...,...,...,...,...,...,...,...,...,...,...
164,Nikola Jokić,10:03.0,Denver,Denver now leads 77-73,246,405,4th quarter,make,"4th quarter, 10:03.0 remaining<br>Nikola Jokić...",2023-06-12 20:30:00,202306122030MIADEN
165,Nikola Jokić,9:18.0,Denver,Denver now leads 79-76,283,354,4th quarter,make,"4th quarter, 9:18.0 remaining<br>Nikola Jokić ...",2023-06-12 20:30:00,202306122030MIADEN
170,Nikola Jokić,4:43.0,Denver,Denver now leads 83-76,204,390,4th quarter,make,"4th quarter, 4:43.0 remaining<br>Nikola Jokić ...",2023-06-12 20:30:00,202306122030MIADEN
174,Nikola Jokić,2:24.0,Denver,Denver now leads 88-87,274,423,4th quarter,make,"4th quarter, 2:24.0 remaining<br>Nikola Jokić ...",2023-06-12 20:30:00,202306122030MIADEN


## Clean Players Data

In [5]:
# Load the scraped player index table.
players = pd.read_csv(r'C:\Users\lianz\Python\personal_projects\nba-airflow\data\players.csv')

In [8]:
# Join player info to shots on the player's name, restricted on both sides to names containing
# 'Nikola'; merge() uses its default join type.
# NOTE(review): joining on the display name assumes names are unique per player — verify.
players.loc[players['Player'].str.contains('Nikola')].merge(df.loc[df['player_name'].str.contains('Nikola')], left_on='Player', right_on='player_name')

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,player_name,time_left,team_name,score_status,x_shot_pos,y_shot_pos,quarter,shot_status,full_text,datetime,game_id
0,Nikola Jokić,2016,2023,C-F,6-11,284.0,"February 19, 1995",,Nikola Jokić,11:26.0,Denver,Denver trails 0-2,207,438,1st quarter,miss,"1st quarter, 11:26.0 remaining<br>Nikola Jokić...",2023-05-01 22:00:00,202305012200PHXDEN
1,Nikola Jokić,2016,2023,C-F,6-11,284.0,"February 19, 1995",,Nikola Jokić,10:58.0,Denver,Denver now leads 3-2,329,207,1st quarter,make,"1st quarter, 10:58.0 remaining<br>Nikola Jokić...",2023-05-01 22:00:00,202305012200PHXDEN
2,Nikola Jokić,2016,2023,C-F,6-11,284.0,"February 19, 1995",,Nikola Jokić,10:13.0,Denver,Denver now leads 5-4,219,434,1st quarter,make,"1st quarter, 10:13.0 remaining<br>Nikola Jokić...",2023-05-01 22:00:00,202305012200PHXDEN
3,Nikola Jokić,2016,2023,C-F,6-11,284.0,"February 19, 1995",,Nikola Jokić,9:09.0,Denver,Denver trails 5-6,334,206,1st quarter,miss,"1st quarter, 9:09.0 remaining<br>Nikola Jokić ...",2023-05-01 22:00:00,202305012200PHXDEN
4,Nikola Jokić,2016,2023,C-F,6-11,284.0,"February 19, 1995",,Nikola Jokić,8:24.0,Denver,Denver now leads 7-6,197,429,1st quarter,make,"1st quarter, 8:24.0 remaining<br>Nikola Jokić ...",2023-05-01 22:00:00,202305012200PHXDEN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26652,Nikola Vučević,2012,2023,C,6-10,260.0,"October 24, 1990",USC,Nikola Vučević,4:17.0,Chicago,Chicago now trails 34-36,80,327,2nd quarter,make,"2nd quarter, 4:17.0 remaining<br>Nikola Vučevi...",2023-04-14 19:00:00,202304141900CHIMIA
26653,Nikola Vučević,2012,2023,C,6-10,260.0,"October 24, 1990",USC,Nikola Vučević,2:34.0,Chicago,Chicago now trails 38-41,271,370,2nd quarter,make,"2nd quarter, 2:34.0 remaining<br>Nikola Vučevi...",2023-04-14 19:00:00,202304141900CHIMIA
26654,Nikola Vučević,2012,2023,C,6-10,260.0,"October 24, 1990",USC,Nikola Vučević,9:30.0,Chicago,Chicago now trails 48-54,268,417,3rd quarter,make,"3rd quarter, 9:30.0 remaining<br>Nikola Vučevi...",2023-04-14 19:00:00,202304141900CHIMIA
26655,Nikola Vučević,2012,2023,C,6-10,260.0,"October 24, 1990",USC,Nikola Vučević,8:26.0,Chicago,Chicago now trails 52-56,268,397,3rd quarter,make,"3rd quarter, 8:26.0 remaining<br>Nikola Vučevi...",2023-04-14 19:00:00,202304141900CHIMIA
