1. Define garbage time
    Garbage time will be defined as the team being up having a ten point lead when the time is 3 minutes or under
2. Work with one game first, scrape game box score
3. Delete all rows that happen after garbage time
4. Record winner and loser of the game
5. Record all statistics before the rows that were deleted in garbage time
6. Run this code with all games of the 2023-2024 regular season to see where players actually stacked up and where teams statistics actually stacked up

Actual statistics that need to be recorded(repeat with three substitutions and four substitutions)
1. Change the box score to account for garbage time statistics
2. Record who won the game and who lost the game
3. Create an excel spreadsheet that keeps track of all players and their statistics. Do the same for teams.
4. Find some way to account for buckets that are made in the last 10 seconds but do not affect the outcome of the game

In [1]:
import os
from bs4 import BeautifulSoup
import time
from urllib.request import urlopen
import pandas as pd
from numpy import array
import numpy as np
import requests
import certifi

In [2]:
# All of the functions used in the code
 
# Converts starter's name into {First Initial}. {Last Name}

def starter_conversion(starter):
    parts = starter.split()
    first_name, last_name = parts[0], parts[1]
    
    converted_name = f"{first_name[0]}. {last_name}"
    return converted_name
    
    #print(converted_name)
    
# Finds all indices where substitutions are made and adds them to the list specified

def sub_check(array, list):
    for i in range(array.size):
        if not pd.isna(array[i]):
            if 'enters the game for' in array[i]:
                list.append(i)

# Finds the set of values within the away/home indices with longest sequence of consecuive numbers, which signifies the beginning of garbage time as it pertains to games where entire lineups are pulled. This code was written completely by Chat GPT.

def consecutive_reduce(array, combined_list):
    consecutive_sequences = []
    
    i = 0
    while i < len(array):
        start = array[i]
        end = start
        
        while i + 1 < len(array) and array[i + 1] == end + 1:
            end = array[i + 1]
            i += 1
        
        consecutive_sequences.append(list(range(start, end + 1)))
        i += 1

    longest_sequence = max(consecutive_sequences, key=len) if consecutive_sequences else []
    
    if len(longest_sequence) < 4: # this needs to be tested with both 3 and 4 to see how many starters are pulled at a time
        longest_sequence = []

    print("Consective array: " + str(longest_sequence))
    
    for i in range(len(longest_sequence)):
        combined_list.append(longest_sequence[i])
    
    return longest_sequence

# Creates an array with all of the indices that need to be dropped from the play-by-play array. The indices go from the first garbage time substitution to the end of the data frame.
def drop_indices(combined_list, full_list, df):
    for i in range(combined_list[0], len(df)):
        full_list.append(i)
    full_list.reverse()
    df = df.drop(full_list, inplace = True) 
    print("Dropped rows: " + str(full_list))


In [3]:
# Scrapes play-by-play in a new way. Scrapes the pbp data frame and replaces all player names with the player's Basketball Reference code. Also scrapes the box score.

def scrapegame_new(url, array):
    # Creates the file name for the play by play file
    file_name = url.split('/')[-1].replace('.html', '')
    array.append(file_name)

    # Send a GET request to the page
    response = requests.get(url, verify = certifi.where())

    # Sleep delay
    time.sleep(3)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Dictionary to store player full names to their player codes
    player_dict = {}
    player_names = []
    player_codes = []

    # Find all anchor tags with href attribute
    anchors = soup.find_all('a', href=True)

    # Filter out player links, store the full URLs and corresponding text
    for anchor in anchors:
        href = anchor['href']
        if '/players/' in href:
            player_code = href.split('/')[-1].split('.')[0]
            player_name = anchor.text.strip()
            player_dict[player_name] = player_code
            player_names.append(player_name)
            player_codes.append(player_code)


    # Find the play-by-play table on the page
    table = soup.find('table', {'id': 'pbp'})

    if table:
        count = 1
        # Iterate through each row in the table
        data = []
        for row in table.find_all('tr'):
            # Find all cells in the row
            cells = row.find_all(['th', 'td'])
            row_data = []
            for cell in cells:
                # Extract the text from the cell
                cell_text = cell.get_text(strip=True)
                # Check if the cell contains player names with links
                if cell.find('a', href=True):
                    # Replace player names with their corresponding player codes
                    for anchor in cell.find_all('a', href=True):
                        player_name = anchor.text.strip()
                        player_link = anchor['href']
                        # Replace the player name with the player code, adding spaces around it
                        if player_name in player_dict:
                            player_code = player_codes[count]
                            count += 1
                            # Ensure a space before and after the player code
                            cell_text = cell_text.replace(player_name, f" {player_code} ")
                row_data.append(cell_text)
            data.append(row_data)

        # Convert the list of lists into a DataFrame
        df = pd.DataFrame(data)

        # Display the DataFrame
        pd.set_option('display.max_columns', None)
        pd.set_option('display.expand_frame_repr', False)
    else:
        print("Play-by-play table not found.")

    columns_to_drop = [2, 4]
    df = df.drop(df.columns[columns_to_drop], axis = 1)
    df = df.drop(index = [0])

    os.makedirs(# Create directory)
    df.to_csv(# Send file to csv)

    df = pd.read_csv(# Read in file)

    df.to_csv(# Send file to csv)

    df = pd.read_csv(# Read in file)
    df.columns = df.iloc[0]
    df = df.drop(0)

    for i in range(len(df)):
        away_play = df.iloc[i, 1]
        away_play = str(away_play)
        if away_play[0] == ' ':
            df.iloc[i, 1] = away_play[1:]
        home_play = df.iloc[i, 3]
        home_play = str(home_play)
        if home_play[0] == ' ':
            df.iloc[i, 3] = home_play[1:]

    df.to_csv(# Send file to csv)

def scrape_box_new(url, home_array, away_array):
    # Contains all NBA Teams and abbreviations
    teams = pd.read_csv('Teams.csv')
    all_teams = teams.iloc[:, 0].values
    all_teams = all_teams.tolist()
    
    response = requests.get(url, verify = certifi.where())
    response.raise_for_status()

    # Time delay
    time.sleep(3)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find titlw
    title_element = soup.find('h1')
    title = title_element.get_text().strip()

    # Find index of at to get home team
    index_at = title.find(' at ')

    # Get away team
    away_team = title[:index_at].strip()
    away_index = all_teams.index(away_team)
    away_abbreviation = teams.iloc[away_index, 1]
    away_array.append(away_abbreviation)

    # Get index of box to find home team
    index_box = title.find('Box')

    # Get home team
    home_team = title[index_at + 3:index_box].strip()
    home_index = all_teams.index(home_team)
    home_abbreviation = teams.iloc[home_index, 1] 
    home_array.append(home_abbreviation)
      
    df_list = pd.read_html(response.text)
    away_scrape = df_list[0]
    home_scrape = df_list[8]
    
    away = pd.DataFrame(away_scrape)
    home = pd.DataFrame(home_scrape)
        
    folder_name = url.split('/')[-1].replace('.html', '')
    folder_path = os.path.join('games', folder_name)

    away_file_name = url.split('/')[-1].replace('.html', '') + '_' + away_abbreviation + '.csv'
    home_file_name = url.split('/')[-1].replace('.html', '') + '_' + home_abbreviation + '.csv'

    away_file_path = os.path.join(folder_path, away_file_name)
    home_file_path = os.path.join(folder_path, home_file_name)

    away.to_csv(away_file_path, index = False)
    home.to_csv(home_file_path, index = False)
    
    
    away = pd.read_csv(# Read away file as csv)
    away.columns = away.iloc[0]
    away = away.drop([0, 6])
    away = away.drop(away.columns[[1, 18, 20]], axis = 1)

    home = pd.read_csv(# Read home file as csv)
    home.columns = home.iloc[0]
    home = home.drop([0, 6])
    home = home.drop(home.columns[[1, 18, 20]], axis = 1)
    
    away.to_csv(# Saves away file as csv)
    home.to_csv(# Saves home file as csv)

# Used to find player's Basketball Reference unique player code
def find_player_code(name):
    df = pd.read_csv('Players_True.csv')

    for i in range(len(df)):
        if df.iloc[i, 0] == name:
            return df.iloc[i, 25]
    

In [10]:
# URL of the page to scrape
all_game_names = []
home_team_names = []
away_team_names = []

box_score = ["https://www.basketball-reference.com/boxscores/202310250UTA.html", "https://www.basketball-reference.com/boxscores/202310270CHO.html"]
play_by_play = ["https://www.basketball-reference.com/boxscores/pbp/202310250UTA.html", "https://www.basketball-reference.com/boxscores/pbp/202310270CHO.html"]

for url in play_by_play:    
    scrapegame_new(url, all_game_names)
for url in box_score:
    scrape_box_new(url, home_team_names, away_team_names)

  df_list = pd.read_html(response.text)
  df_list = pd.read_html(response.text)


In [11]:
# Clears all box score data so that statistics can be read in from the play-by-play data frame after it has been cleaned for garbage time values

for i in range(0, len(all_game_names)):
    home_box = pd.read_csv(# Read in home box score)
    away_box = pd.read_csv(# Read in away box score)

    home_box.iloc[:, 1:18] = 0
    away_box.iloc[:, 1:18] = 0

    home_box.to_csv(# Save home box score as csv)
    away_box.to_csv(# Save away box score as csv)

In [12]:
# Sometimes the starters are not substituted out in a row. Sometimes they are substituted out at different times in the last two quarters. Therefore, I need to iterate through the plays in the third and fourth quarters to see if the starter is subbed out and not subbed back in

for i in range(0, len(all_game_names)):
    game_code = all_game_names[i]
    home_team = home_team_names[i]
    away_team = away_team_names[i]

    third_index = 0

    home_starter_substitutions = []
    away_starter_substitutions = []

    home_second_subs = []
    home_first_subs = []
    home_non_repeated_subs = []

    away_second_subs = []
    away_first_subs = []
    away_non_repeated_subs = []

    complete_indices = []

    home_resub_count = 0
    away_resub_count = 0

    total_non_repeated_subs = []

    # Used for checking scores:
    fourth_away_scores = []
    fourth_home_scores = []
    true_away_scores = []
    true_home_scores = []

    df = pd.read_csv(# Read in play by play file)

    home_players = pd.read_csv(f'games/{game_code}/{game_code}_{home_team}.csv')
    away_players = pd.read_csv(f'games/{game_code}/{game_code}_{away_team}.csv')

    # Finds final score(in the foruth quarter) of the game and adds final away and home scores to arrays that will later be used to determine accuracy of the code
    fourth_score = df.iloc[len(df) - 2, 2]
    fourth_split = fourth_score.split('-')

    fourth_away_score = int(fourth_split[0])
    fourth_away_scores.append(fourth_away_score)

    fourth_home_score = int(fourth_split[1])
    fourth_home_scores.append(fourth_home_score)

    quarter_array = df.iloc[:, 1].values

    home_play_array = df.iloc[:, 3].values
    away_play_array = df.iloc[:, 1].values

    home_player_array = home_players.iloc[:, 0].values
    away_player_array = away_players.iloc[:, 0].values

    # How to convert numpy arrays to lists, all data frame columns are automatically converted to lists
    quarter_list = quarter_array.tolist()

    home_play_list = home_play_array.tolist()
    away_play_list = away_play_array.tolist()

    home_player_list = home_player_array.tolist()
    away_player_list = away_player_array.tolist()

    # Only need an array of starters
    home_starter_list = home_player_list[:5]
    away_starter_list = away_player_list[:5]

    # Find the index of the start of the third quarter
    for value in quarter_list:
        if 'Start of 3rd quarter' in str(value):
            third_index = quarter_list.index(value)

    # Converts starters names to the proper format
    for i in range(0, len(home_starter_list)):
        home_starter_list[i] = find_player_code(home_starter_list[i])
        away_starter_list[i] = find_player_code(away_starter_list[i])

    # Finds all indices where starter is substituted for the home team
    for i in range(0, len(home_play_list)):
        play = str(home_play_list[i])
        for starter in home_starter_list:
            if starter in play and 'enters the game for' in play:
                home_starter_substitutions.append(i)

    # Finds all indices where starter is substituted for the away team
    for i in range(0, len(away_play_list)):
        play = str(away_play_list[i])
        for starter in away_starter_list:
            if starter in play and 'enters the game for' in play:
                away_starter_substitutions.append(i)
    
    # Deletes all indices from home substitution array if row is before the start of the third quarter
    i = len(home_starter_substitutions) - 1

    while i >= 0:
        if home_starter_substitutions[i] < third_index:
            home_starter_substitutions.pop(i)
        i -= 1

    # Deletes all indices from away substitution array if row is before the start of the third quarter
    i = len(away_starter_substitutions) - 1

    while i >= 0:
        if away_starter_substitutions[i] < third_index:
            away_starter_substitutions.pop(i)
        i -= 1

    # Removes duplicate values from the home play indices array
    home_starter_substitutions = list(set(home_starter_substitutions))
    home_starter_substitutions.sort()

    # Removes duplicate values from the away play indices array
    away_starter_substitutions = list(set(away_starter_substitutions))
    away_starter_substitutions.sort()
    
    # Finds the plays where a home starter is substituted
    for i in range(0, len(home_starter_substitutions)):
        sub_play = home_play_list[home_starter_substitutions[i]]
        split = sub_play.split(' enters the game for ')
        
        first_sub = split[0]
        second_sub = split[1]

        home_first_subs.append(first_sub)
        home_second_subs.append(second_sub)

    # Finds the plays where an away starter is substituted
    for i in range(0, len(away_starter_substitutions)):
        sub_play = away_play_list[away_starter_substitutions[i]]
        split = sub_play.split(' enters the game for ')
        
        first_sub = split[0]
        second_sub = split[1]

        away_first_subs.append(first_sub)
        away_second_subs.append(second_sub)

    # Only need substitutions after the first substitution
    home_first_subs = home_first_subs[1:]
    away_first_subs = away_first_subs[1:]

    # Stripping the whitespace from all values
    home_first_subs = [value.strip() for value in home_first_subs]
    home_second_subs = [value.strip() for value in home_second_subs]

    away_first_subs = [value.strip() for value in away_first_subs]
    away_second_subs = [value.strip() for value in away_second_subs]
    
    # Seeing if home starter was resubed back in 
    for value in home_second_subs:
        if value in home_first_subs:
            home_resub_count += 1
        if value not in home_first_subs:
            home_non_repeated_subs.append(home_starter_substitutions[home_second_subs.index(value)])

    # Seeing if away starter was resubed back in 
    for value in away_second_subs:
        if value in away_first_subs:
            away_resub_count += 1
        if value not in away_first_subs:
            away_non_repeated_subs.append(away_starter_substitutions[away_second_subs.index(value)])

    # Add values of non_repeated_sub arrays to final total array if there are an appropriate amount of substitutions
    if len(home_non_repeated_subs) != 0:
        if home_resub_count <= 2:
            total_non_repeated_subs += home_non_repeated_subs
    if len(away_non_repeated_subs) != 0:
        if away_resub_count <= 2:
            total_non_repeated_subs += away_non_repeated_subs

    total_non_repeated_subs.sort()

    if len(total_non_repeated_subs) != 0:
        print("Game Code: " + game_code)
        drop_indices(total_non_repeated_subs, complete_indices, df)
    
    # Finds true score(by disregarding garbage time plays) and adds those scores to separate lists
    true_score = df.iloc[len(df) - 2, 2]
    true_split = true_score.split('-')

    true_away_score = int(true_split[0])
    true_away_scores.append(true_away_score)

    true_home_score = int(true_split[1])
    true_home_scores.append(true_home_score)

    print('\n')
    
    print("Home Information: ")
    print("Home Starter sub moments: " + str(home_starter_substitutions))
    print("Home second person subbed: " + str(home_second_subs))
    print("Home first person subbed: " + str(home_first_subs))

    print("Home non-repeated subs: " + str(home_non_repeated_subs))
    print("Home number of starters resubbed: " + str(home_resub_count))
    print('\n')

    print("Away Information: ")
    print("Away Starter sub moments: " + str(away_starter_substitutions))
    print("Away second person subbed: " + str(away_second_subs))
    print("Away first person subbed: " + str(away_first_subs))

    print("Away non-repeated subs: " + str(away_non_repeated_subs))
    print("Away number of starters resubbed: " + str(away_resub_count))
    print('\n')

    print("Total non-repeated substitution array: " + str(total_non_repeated_subs))

    print("Dropped rows: " + str(complete_indices))

    print("True away score: " + str(true_away_score))
    print("True home score: " + str(true_home_score))

    df.to_csv(f'games/{game_code}/{game_code}.csv', index = False)
    




Home Information: 
Home Starter sub moments: [288, 297, 338, 339, 357, 358, 414, 415, 448, 492, 493, 495]
Home second person subbed: ['hortota01', 'kesslwa01', 'clarkjo01', 'markkla01', 'collijo01', 'olynyke01', 'dunnkr01', 'kesslwa01', 'hortota01', 'clarkjo01', 'collijo01', 'markkla01']
Home first person subbed: ['olynyke01', 'agbajoc01', 'dunnkr01', 'georgke01', 'kesslwa01', 'clarkjo01', 'markkla01', 'collijo01', 'hendrita01', 'samanlu01', 'yurtsom01']
Home non-repeated subs: [288, 288]
Home number of starters resubbed: 10


Away Information: 
Away Starter sub moments: [314, 320, 321, 337, 349, 350, 381, 411, 413, 423, 430, 459, 490, 505, 506, 507, 508]
Away second person subbed: ['huertke01', 'foxde01', 'murrake02', 'barneha02', 'mitchda01', 'sabondo01', 'monkma01', 'foxde01', 'vezenal01', 'monkma01', 'duartch01', 'mitchda01', 'sabondo01', 'barneha02', 'foxde01', 'huertke01', 'murrake02']
Away first person subbed: ['mitchda01', 'vezenal01', 'duartch01', 'foxde01', 'mcgeeja01', 'ba

In [13]:
# Trying to work with iterating through the data frames to record statistics

for a in range(0, len(all_game_names)):
    play = # include path to play by play files 
    home = # include path to home box scores
    away = # include path to away box scores

    away_plays = play.iloc[:, 1].values
    home_plays = play.iloc[:, 3].values

    away_players = away.iloc[:, 0].values
    home_players = home.iloc[:, 0].values

    for i in range(0, len(home_players)):
        home_players[i] = find_player_code(home_players[i])

    for i in range(0, len(away_players)):
        away_players[i] = find_player_code(away_players[i])


    FG_MADE_INDEX = 1
    FG_ATTEMPT_INDEX  = 2
    THREE_PTR_MADE_INDEX = 4
    THREE_PTR_ATTEMPT_INDEX = 5
    FREE_THROW_MADE_INDEX = 7
    FREE_THROW_ATTEMPT_INDEX = 8
    OFFENSIVE_REBOUND_INDEX = 10
    DEFENSIVE_REBOUND_INDEX = 11
    ASSSITS_INDEX = 13
    STEAL_INDEX = 14
    BLOCK_INDEX = 15
    TURNOVER_INDEX = 16

    # Function to find the index of the player in the player array, written by Chat GPT
    def find_player_index(player_array, name):
        array_index = np.where(player_array == name)
        if len(array_index[0]) == 0:
            return None
        return int(array_index[0][0])

    # Looks for two-pointers in the plays
    def two_made(play, box_df, player_array):
        part = play.split(" makes 2-pt")
        name = part[0]
        index = find_player_index(player_array, name)

        if index is not None:
            box_df.iloc[index, FG_MADE_INDEX] += 1
            box_df.iloc[index, FG_ATTEMPT_INDEX] += 1

    def two_missed(play, box_df, player_array):
        part = play.split(" misses 2-pt")
        name = part[0]
        index = find_player_index(player_array, name)

        if index is not None:
            box_df.iloc[index, FG_ATTEMPT_INDEX] += 1

    def three_made(play, box_df, player_array):
        part = play.split(" makes 3-pt")
        name = part[0]
        index = find_player_index(player_array, name)

        if index is not None:
            box_df.iloc[index, FG_MADE_INDEX] += 1
            box_df.iloc[index, FG_ATTEMPT_INDEX] += 1
            box_df.iloc[index, THREE_PTR_MADE_INDEX] += 1
            box_df.iloc[index, THREE_PTR_ATTEMPT_INDEX] += 1

    def three_miss(play, box_df, player_array):
        part = play.split(" misses 3-pt")
        name = part[0]
        index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, FG_ATTEMPT_INDEX] += 1
            box_df.iloc[index, THREE_PTR_ATTEMPT_INDEX] += 1

    def ft_made(play, box_df, player_array):
        part = play.split(" makes free throw")
        name = part[0]
        index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, FREE_THROW_MADE_INDEX] += 1
            box_df.iloc[index, FREE_THROW_ATTEMPT_INDEX] += 1

    def ft_tech_made(play, box_df, player_array):
        part = play.split(" makes technical free throw")
        name = part[0]
        index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, FREE_THROW_MADE_INDEX] += 1
            box_df.iloc[index, FREE_THROW_ATTEMPT_INDEX] += 1

    def ft_flag_made(play, box_df, player_array):
        part = play.split(" makes flagrant free throw")
        name = part[0]
        index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, FREE_THROW_MADE_INDEX] += 1
            box_df.iloc[index, FREE_THROW_ATTEMPT_INDEX] += 1

    def ft_miss(play, box_df, player_array):
        part = play.split(" misses free throw")
        name = part[0]
        index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, FREE_THROW_ATTEMPT_INDEX] += 1

    def tech_ft_miss(play, box_df, player_array):
        part = play.split(" misses technical free throw")
        name = part[0]
        index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, FREE_THROW_ATTEMPT_INDEX] += 1

    def flag_ft_miss(play, box_df, player_array):
        part = play.split(" misses flagrant free throw")
        name = part[0]
        index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, FREE_THROW_ATTEMPT_INDEX] += 1

    def offensive_reb(play, box_df, player_array):
        part = play.split()
        name = part[-1]
        if name != " Team":
            index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, OFFENSIVE_REBOUND_INDEX] += 1

    def defenseive_reb(play, box_df, player_array):
        part = play.split()
        name = part[-1]
        if name != " Team":
            index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, DEFENSIVE_REBOUND_INDEX] += 1

    def assist_made(play, box_df, player_array):
        part = play.split("assist by")
        name = part[1].strip().strip("() ")
        index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, ASSSITS_INDEX] += 1

    def turnover_caused(play, box_df, player_array):
        part = play.split("Turnover by ")[1]
        name = part.split(" ")[0]
        if name != " Team":
            index = find_player_index(player_array, name)
        
        if index is not None:
            box_df.iloc[index, TURNOVER_INDEX] += 1


    # Doing this code structure for steals and blocks because the other team's player's code is in a team's play-by-play information
    for play in away_plays:
        play = str(play)
        if "steal by " in play:
            part = play.split("steal by")
            name = part[1].strip().strip("() ")
            index = find_player_index(home_players, name)

            if index is not None:
                home.iloc[index, STEAL_INDEX] += 1
    for play in home_plays:
        play = str(play)
        if "steal by " in play:
            part = play.split("steal by")
            name = part[1].strip().strip("() ")
            index = find_player_index(away_players, name)

            if index is not None:
                away.iloc[index, STEAL_INDEX] += 1

    for play in away_plays:
        play = str(play)
        if "block by " in play:
            part = play.split("block by")
            name = part[1].strip().strip("() ")
            index = find_player_index(home_players, name)

            if index is not None:
                home.iloc[index, BLOCK_INDEX] += 1
    for play in home_plays:
        play = str(play)
        if "block by " in play:
            part = play.split("block by")
            name = part[1].strip().strip("() ")
            index = find_player_index(away_players, name)

            if index is not None:
                away.iloc[index, BLOCK_INDEX] += 1


    two_pointer = " makes 2-pt"
    two_miss = " misses 2-pt"
    three_pointer_made = " makes 3-pt"
    three_pointer_missed = " misses 3-pt"
    free_throw_made = " makes free throw"
    tech_free_throw_made = " makes technical free throw"
    free_throw_missed = " misses free throw"
    tech_free_throw_missed = " misses technical free throw"
    flagrant_free_throw_made = " makes flagrant free throw"
    flagrant_free_throw_missed = " misses flagrant free throw"
    offensive_rebound = "Offensive rebound by "
    defensive_rebound = "Defensive rebound by "
    assist = "assist by"
    turnover = "Turnover by "


    def play_recorder(away_plays, away_players, away, home_plays, home_players, home, function, string):
        for play in away_plays:
            if string in str(play):
                function(play, away, away_players)
        for play in home_plays:
            if string in str(play):
                function(play, home, home_players)

    play_recorder(away_plays, away_players, away, home_plays, home_players, home, two_made, two_pointer)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, two_missed, two_miss)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, three_made, three_pointer_made)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, three_miss, three_pointer_missed)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, ft_made, free_throw_made)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, ft_miss, free_throw_missed)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, ft_tech_made, tech_free_throw_made)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, tech_ft_miss, tech_free_throw_missed)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, ft_flag_made, flagrant_free_throw_made)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, flag_ft_miss, flagrant_free_throw_missed)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, offensive_reb, offensive_rebound)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, defenseive_reb, defensive_rebound)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, assist_made, assist)
    play_recorder(away_plays, away_players, away, home_plays, home_players, home, turnover_caused, turnover)

    # Writes up a fucntion to complete the rest of the dataframe
    def fill_stats(df):
        for i in range(len(df)):
            # Fills in points category
            two_ptrs = df.iloc[i, 1] - df.iloc[i, 4]
            three_ptrs = df.iloc[i, 4]
            free_throws = df.iloc[i, 7]

            pts = (two_ptrs * 2) + (three_ptrs * 3) + free_throws

            df.iloc[i, 17] = pts

            # Fills in field goal percentage category
            if (df.iloc[i, 2]) != 0:
                df.iloc[i, 3] = df.iloc[i, 1] / df.iloc[i, 2]
            else:
                df.iloc[i, 3] = pd.NA

            # Fills in 3-point percentage category
            if (df.iloc[i, 5]) != 0:
                df.iloc[i, 6] = round(df.iloc[i, 4] / df.iloc[i, 5], 3)
            else:
                df.iloc[i, 6] = pd.NA

            # Fills in free throw percentage category
            if (df.iloc[i, 8]) != 0:
                df.iloc[i, 9] = round(df.iloc[i, 7] / df.iloc[i, 8], 3)
            else:
                df.iloc[i, 8] = pd.NA

            # Fills in total rebound category
            df.iloc[i, 12] = df.iloc[i, 10] + df.iloc[i, 11]

    fill_stats(home)
    fill_stats(away)

    home = home.fillna(0)
    away = away.fillna(0)

    home.to_csv # Incorrect syntax but saves files to csv
    away.to_csv


  df.iloc[i, 3] = df.iloc[i, 1] / df.iloc[i, 2]
  df.iloc[i, 6] = round(df.iloc[i, 4] / df.iloc[i, 5], 3)
  df.iloc[i, 9] = round(df.iloc[i, 7] / df.iloc[i, 8], 3)
  df.iloc[i, 3] = df.iloc[i, 1] / df.iloc[i, 2]
  df.iloc[i, 6] = round(df.iloc[i, 4] / df.iloc[i, 5], 3)
  df.iloc[i, 9] = round(df.iloc[i, 7] / df.iloc[i, 8], 3)


In [14]:
# Scrapes the true box scores for each game and cleans the data frame.

for i in range(len(all_game_names)):
    url = f"https://www.basketball-reference.com/boxscores/{all_game_names[i]}.html"
    
    teams = pd.read_csv('Teams.csv')
    all_teams = teams.iloc[:, 0].values
    all_teams = all_teams.tolist()

    response = requests.get(url, verify = certifi.where())
    response.raise_for_status()

    time.sleep(3)

    soup = BeautifulSoup(response.content, 'html.parser')
    
    df_list = pd.read_html(response.text)
    away_scrape = df_list[0]
    home_scrape = df_list[8]

    away = pd.DataFrame(away_scrape)
    home = pd.DataFrame(home_scrape)

    away.to_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{away_team_names[i]}_true.csv", index = False)
    home.to_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{home_team_names[i]}_true.csv", index = False)

    away = pd.read_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{away_team_names[i]}_true.csv")
    away.columns = away.iloc[0]
    away = away.drop([0, 6])
    away = away.drop(away.columns[[1, 18, 20]], axis = 1)

    home = pd.read_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{home_team_names[i]}_true.csv")
    home.columns = home.iloc[0]
    home = home.drop([0, 6])
    home = home.drop(home.columns[[1, 18, 20]], axis = 1)

    for j in range(len(away)):
        if (away.iloc[j, 1:18] == "Did Not Play").any():
            away.iloc[j, 1:18] = pd.NA
        if (away.iloc[j, 1:18] == "Did Not Dress").any():
            away.iloc[j, 1:18] = pd.NA
        if (away.iloc[j, 1:18] == "Not With Team").any():
            away.iloc[j, 1:18] = pd.NA
        if (away.iloc[j, 1:18] == "Player Suspended").any():
            away.iloc[j, 1:18] = pd.NA
    for j in range(len(home)):
        if (home.iloc[j, 1:18] == "Did Not Play").any():
            home.iloc[j, 1:18] = pd.NA
        if (home.iloc[j, 1:18] == "Did Not Dress").any():
            home.iloc[j, 1:18] = pd.NA
        if (home.iloc[j, 1:18] == "Not With Team").any():
            home.iloc[j, 1:18] = pd.NA
        if (home.iloc[j, 1:18] == "Player Suspended").any():
            home.iloc[j, 1:18] = pd.NA
    
    away.iloc[len(away) - 1, :] = pd.NA
    home.iloc[len(home) - 1, :] = pd.NA

    away = away.fillna(0)
    home = home.fillna(0)

    away.to_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{away_team_names[i]}_true.csv", index = False)
    home.to_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{home_team_names[i]}_true.csv", index = False)

  df_list = pd.read_html(response.text)
  df_list = pd.read_html(response.text)


In [15]:
# This is a piece of code that checks to see if the box score developed is the same as the box score that was generated by reading the play by play information

for i in range(len(all_game_names)):
    away_edited = pd.read_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{away_team_names[i]}.csv")
    away_true = pd.read_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{away_team_names[i]}_true.csv")

    home_edited = pd.read_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{home_team_names[i]}.csv")
    home_true = pd.read_csv(f"games/{all_game_names[i]}/{all_game_names[i]}_{home_team_names[i]}_true.csv")

    away_edited = away_edited.round(3)
    away_true = away_true.round(3)

    home_edited = home_edited.round(3)
    home_true = home_true.round(3)

    print(all_game_names[i])
    for j in range(len(away_edited)):
        if not away_edited.iloc[j, 1:18].equals(away_true.iloc[j, 1:18]):
            print(f"Error in edited away data frame: Row {j + 1}")
            for col in range(1, 18):
                if away_edited.iloc[j, col] != away_true.iloc[j, col]:
                    print(f"Column {col + 1}: Edited value = {away_edited.iloc[j, col]}, True value = {away_true.iloc[j, col]}")

    for j in range(len(home_edited)):
        if not home_edited.iloc[j, 1:18].equals(home_true.iloc[j, 1:18]):
            print(f"Error in edited home data frame: Row {j + 1}")
            for col in range(1, 18):
                if home_edited.iloc[j, col] != home_true.iloc[j, col]:
                    print(f"Column {col + 1}: Edited value = {home_edited.iloc[j, col]}, True value = {home_true.iloc[j, col]}")


202310250UTA
202310270CHO


1. Read in box score data frame
2. Read in plays data frame
3. Iterate through every value in the plays data frame. When a sring is found that contributes to a player's statistics, search for that player in the rows of the 'Starters' column in the data frame and add one to the cell under the appropriate statistic

In [None]:
def play_recorder(array, player_df, box_dir):
    for filename in os.listdir(box_dir):
        if filename.endswith('.csv'):
            file_path = os.path.join(box_dir, filename)
            df = pd.read_csv(file_path)
    
        for i in range(0, len(array)):
            play = array[i]



1. Record true box score for the game
2. Add all statistics to a csv containing all players and all stats for the entire season

Stats to record/String template(directly from the raw play-by-play)
1. FG Made - {Name} makes
2. FG Miss - {Name} misses
3. 3P Makes - {Name} makes 3-pt...
4. 3P Misses - {Name} misses 3-pt...
5. FT Makes - {Name} makes free throw...
6. FT Misses - {Name} misses free throw...
7. Offensive Rebounds - Offensive rebound by {Name}
8. Defensive Rebounds - Defensive rebound by {Name}
9. Assists - ...(assist by {Name})
10. Steals - ...(...; steal by {Name})
11. Blocks - ...(block by {Name})
12. Turnovers - Turnover by {Name}
13. Personal Fouls - 3 options:
        1. Offensive foul by {Name}
        2. Personal foul by {Name}
        3. Loose ball foul by {Name}
        4. Turnover by {Name}(offensive foul)


In [49]:
directory = 'pbp'

def find_fourth(directory):
    fourth_index = 0
    for filename in os.listdir(directory):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path)
            
            for index, name in df.iloc[:, 2].items():
                if df.iloc[index, 2] == "Start of 4th quarter":
                    fourth_index = index
                    print(fourth_index)
                    
            for i in range(0, fourth_index + 1):
                df = df.drop(i)
                
            print(df)

In [50]:
find_fourth(directory)

339
        Time                                             Dallas  \
340  11:44.0       T. Hardaway misses 3-pt jump shot from 23 ft   
341  11:40.0                     Offensive rebound by L. Dončić   
342  11:38.0  P. Washington makes 3-pt jump shot from 25 ft ...   
343  11:19.0                                                NaN   
344  11:16.0                   Defensive rebound by T. Hardaway   
..       ...                                                ...   
450   0:53.0                   J. Green makes free throw 2 of 2   
451   0:35.0                                                NaN   
452   0:24.0          J. Hardy misses 3-pt jump shot from 25 ft   
453   0:21.0                                                NaN   
454   0:00.0                                 End of 4th quarter   

                  Score                                 LA Clippers  
340               64-87                                         NaN  
341               64-87                            

done
/var/folders/4t/y4bl6hxn2_3dh0qt1ms8dpyr0000gn/T/ipykernel_46477/1320253704.py:13: FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  df_list = pd.read_html(response.text)

done
/var/folders/4t/y4bl6hxn2_3dh0qt1ms8dpyr0000gn/T/ipykernel_46477/1320253704.py:13: FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  df_list = pd.read_html(response.text)

/var/folders/4t/y4bl6hxn2_3dh0qt1ms8dpyr0000gn/T/ipykernel_867/2045480126.py:257: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '0.46153846153846156' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.iloc[i, 3] = df.iloc[i, 1] / df.iloc[i, 2]