Disclaimer: ChatGPT and Grok AI were used to help resolve syntax and logic problems.
All credit for statistical data goes to Basketball Reference. 
The team abbreviation file was adapted from a file created by Tgemayel, found on GitHub.

In [None]:
import os
from bs4 import BeautifulSoup
import time
from urllib.request import urlopen
import pandas as pd
import numpy as np
import requests
import certifi
from fuzzywuzzy import process
from collections import defaultdict

In [None]:
# Code utility functions

# Converts starter's name into {First Initial}. {Last Name}
def starter_conversion(starter):
    """Abbreviate a player's name to "{First Initial}. {Second Word}".

    Args:
        starter: Full player name, e.g. "LeBron James".

    Returns:
        The abbreviated name, e.g. "L. James". Only the first two
        whitespace-separated words are used, so "Jaren Jackson Jr." becomes
        "J. Jackson". A name with fewer than two words (a mononym or an empty
        string) is returned stripped but otherwise unchanged instead of
        raising IndexError.
    """
    parts = starter.split()
    if len(parts) < 2:
        # Nothing to abbreviate: there is no second word to keep.
        return starter.strip()
    return f"{parts[0][0]}. {parts[1]}"

# Finds all indices where substitutions are made and adds them to the list specified
def sub_check(array, sub_list):
    """Append to sub_list the position of every substitution entry in array.

    An entry counts as a substitution when it is not missing (per pd.notna)
    and contains the text 'enters the game for'. sub_list is modified in
    place; nothing is returned.
    """
    marker = 'enters the game for'
    for position, entry in enumerate(array):
        if pd.notna(entry) and marker in entry:
            sub_list.append(position)

# Creates an array with all of the indices that need to be dropped from the play-by-play array...
def drop_indices(combined_list, full_list, df, offset):
    """Drop every row of df from a starting label through the end, in place.

    The starting label is combined_list[0] + offset. The labels from there up
    to len(df) - 1 are appended to full_list, full_list is reversed, and those
    index labels are dropped from df. The reversed list is printed.

    Args:
        combined_list: Sequence whose first element gives the base row label.
        full_list: List that receives the dropped labels (modified in place).
        df: DataFrame to trim (modified in place).
        offset: Amount added to combined_list[0] to get the first dropped label.
    """
    first_dropped = combined_list[0] + offset
    row_count = len(df)
    full_list.extend(range(first_dropped, row_count))
    full_list.reverse()
    df.drop(index=full_list, inplace=True)
    print(f"Dropped rows: {full_list}")

# Splits the score string into integer scores
def split_score(string):
    """Parse a score string of the form "<away>-<home>" into two integers.

    Returns:
        A tuple (away, home).

    Raises:
        ValueError: If the string does not split into exactly two parts on
            '-' or a part is not an integer.
    """
    away_text, home_text = string.split('-')
    return int(away_text), int(home_text)

# Scrapes play-by-play in a new way
def _cell_text_with_codes(cell):
    """Return a table cell's text with each linked player name replaced by its code."""
    text = cell.get_text(strip=True)
    for link in cell.find_all('a', href=True):
        name = link.text.strip()
        # Skip non-player links, and empty labels: str.replace("") would
        # insert the code between every character of the text.
        if not name or '/players/' not in link['href']:
            continue
        code = link['href'].split('/')[-1].split('.')[0]
        # get_text(strip=True) glues the name to its neighbours, so the code
        # is padded with spaces to keep it a separate word.
        text = text.replace(name, f" {code} ")
    return text.strip()


def scrapegame_new(url, array):
    """Scrape one play-by-play page and save it as games/<game>_pbp.csv.

    The game code (the URL's file name without ".html") is appended to array
    before the request is made. Within each cell, player names that are links
    to a '/players/' page are replaced by the player code taken from the
    link target.

    The saved CSV has no index column. Its header is the table's second row
    and its columns are the table's columns 0, 1, 3 and 5 (columns 2 and 4
    are dropped). On Basketball Reference these are presumably Time, away
    plays, Score and home plays; the later cells read them at positions 0-3.

    Args:
        url: Address of the play-by-play page.
        array: List that receives the game code (modified in place).

    Returns:
        None. If the page has no table with id "pbp", a message is printed
        and no file is written.
    """
    file_name = url.split('/')[-1].replace('.html', '')
    array.append(file_name)

    response = requests.get(url, verify=certifi.where())
    time.sleep(3)  # pause between requests to the site
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'pbp'})
    if not table:
        print("Play-by-play table not found.")
        return

    # One list per table row, one entry per cell, so the column layout of the
    # page is preserved.
    data = [[_cell_text_with_codes(cell) for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')]

    # Row 0 is dropped; the next row supplies the column names.
    df = pd.DataFrame(data).drop(columns=[2, 4]).drop(index=0)
    df.columns = df.iloc[0]
    df = df.iloc[1:]

    os.makedirs("games", exist_ok=True)  # Create directory
    df.to_csv(f"games/{file_name}_pbp.csv", index=False)

# Scrapes the box score
def _clean_box_score(raw):
    """Reduce a scraped basic box-score table to the columns later cells expect.

    The two-level header that pandas.read_html builds for these tables is
    flattened to its bottom level, repeated in-table header rows (first cell
    "Reserves") are removed, and the columns at positions 1, 18 and 20 are
    dropped. NOTE(review): those positions are presumably MP, PF and GmSc on
    the current page layout - confirm against a live page.
    """
    box = raw.copy()
    if isinstance(box.columns, pd.MultiIndex):
        box.columns = box.columns.get_level_values(-1)
    box = box[box.iloc[:, 0] != "Reserves"]
    return box.drop(columns=box.columns[[1, 18, 20]])


def scrape_box_new(url, home_array, away_array):
    """Scrape both teams' basic box scores for one game and save them as CSV.

    Team names are read from the page's <h1> ("<away> at <home> Box ...") and
    mapped to abbreviations via Teams.csv (first column -> second column).
    The files are written to games/<game code>/<abbreviation>_box.csv, where
    the game code is the URL's file name without ".html". This is the same
    folder the later cells read from.

    Args:
        url: Address of the box-score page.
        home_array: List that receives the home team's abbreviation.
        away_array: List that receives the away team's abbreviation.

    Raises:
        KeyError: If a team name from the page is not in Teams.csv.
    """
    from io import StringIO  # local import: read_html expects a file-like object

    teams = pd.read_csv('Teams.csv')
    team_dict = dict(zip(teams.iloc[:, 0], teams.iloc[:, 1]))

    response = requests.get(url, verify=certifi.where())
    time.sleep(3)  # pause between requests to the site
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find('h1').text.strip()
    matchup = title.split(' at ')
    away_team = matchup[0]
    home_team = matchup[1].split('Box')[0].strip()

    away_array.append(team_dict[away_team])
    home_array.append(team_dict[home_team])

    df_list = pd.read_html(StringIO(response.text))
    # Assumes the page lists all of the away team's tables first and then the
    # same number of home tables, so the home basic box score is the first
    # table of the second half. A regulation game gives index 8; overtime
    # adds tables and shifts it. TODO confirm against an overtime game.
    home_position = len(df_list) // 2
    away, home = df_list[0], df_list[home_position]

    game_code = url.split('/')[-1].replace('.html', '')
    folder_path = os.path.join('games', game_code)
    os.makedirs(folder_path, exist_ok=True)

    for box, abbreviation in [(away, away_array[-1]), (home, home_array[-1])]:
        path = os.path.join(folder_path, f"{abbreviation}_box.csv")
        _clean_box_score(box).to_csv(path, index=False)

# Function to replace player's name with code
def find_player_code(name, player_df=None):
    """Look up the code stored for a player name.

    Args:
        name: Value to look for in the first column of player_df.
        player_df: Player table; when None it is read from
            'CSV with all Players'.

    Returns:
        The value in column 25 of the first row whose first column equals
        name, or name itself when no row matches.
    """
    if player_df is None:
        player_df = pd.read_csv('CSV with all Players')

    known_names = player_df.iloc[:, 0]
    if name not in known_names.values:
        return name

    matching_rows = player_df[known_names == name]
    return matching_rows.iloc[0, 25]

# Function to replace code with player's name
def find_player_name(code, player_df=None):
    """Look up the player name stored for a code.

    Args:
        code: Value to look for in column 25 of player_df.
        player_df: Player table; when None it is read from
            'CSV with all Players'.

    Returns:
        The value in the first column of the first row whose column 25
        equals code, or code itself when no row matches.
    """
    if player_df is None:
        player_df = pd.read_csv('CSV with all Players')

    known_codes = player_df.iloc[:, 25]
    if code not in known_codes.values:
        return code

    matching_rows = player_df[known_codes == code]
    return matching_rows.iloc[0, 0]

In [None]:
# Utilizing functions

# Reference list of correctly spelled player names (the "Player" column)
all_true_names = pd.read_csv("Players_True.csv")["Player"].astype(str).tolist()

# Accumulators filled by the scrapers, one entry per game
all_game_names = []
home_team_names = []
away_team_names = []

# Box-score pages to scrape
box_score = [
    "https://www.basketball-reference.com/boxscores/202310250CHO.html",
    "https://www.basketball-reference.com/boxscores/202404230LAC.html",
    "https://www.basketball-reference.com/boxscores/202403030BOS.html",
    "https://www.basketball-reference.com/boxscores/202404030MIN.html"
]

# Each play-by-play page uses the same file name under /boxscores/pbp/
play_by_play = []
for url in box_score:
    play_by_play.append(url.replace('/boxscores/', '/boxscores/pbp/'))

# Scrape every play-by-play page first, then every box score
for url in play_by_play:
    scrapegame_new(url, all_game_names)
for url in box_score:
    scrape_box_new(url, home_team_names, away_team_names)
# Code to match names with messed-up accents with correct names
def find_closest_match(messed_up_name, all_true_names_list):
    """Return the entry of all_true_names_list that best matches messed_up_name.

    The match is chosen by fuzzywuzzy's process.extractOne; only the matched
    entry (the first element of its result) is returned.
    """
    best_match, *_ = process.extractOne(messed_up_name, all_true_names_list)
    return best_match

# For every saved box score: zero out columns 1-17 and replace each name in
# the first column (except the "Team Totals" row) with its closest true name.
for i, game in enumerate(all_game_names):
    box_files = [
        (home_team_names[i], f"games/{game}/{home_team_names[i]}_box.csv"),
        (away_team_names[i], f"games/{game}/{away_team_names[i]}_box.csv"),
    ]
    for team, path in box_files:
        box = pd.read_csv(path)
        box.iloc[:, 1:18] = 0
        scraped_names = box.iloc[:, 0]
        box.iloc[:, 0] = scraped_names.apply(
            lambda x: x if x == "Team Totals" else find_closest_match(x, all_true_names)
        )
        box.to_csv(path, index=False)

# Sometimes the starters are not substituted out in a row
def process_substitutions(df, home_players, away_players, third_index):
    """Find the substitution rows at or after third_index that mention a starter.

    Args:
        df: Play-by-play frame; column 3 is scanned for the home team and
            column 1 for the away team.
        home_players: Home box score; its first five rows are the starters.
        away_players: Away box score; its first five rows are the starters.
        third_index: Rows before this position are ignored.

    Returns:
        (home_subs, away_subs, home_starter_list, away_starter_list): the
        sorted, de-duplicated row positions per team, and each team's starter
        list after passing every starter through find_player_code.
    """
    home_column, away_column = 3, 1

    home_subs = []
    sub_check(df.iloc[:, home_column].values, home_subs)
    away_subs = []
    sub_check(df.iloc[:, away_column].values, away_subs)

    home_starter_list = [find_player_code(name) for name in home_players.iloc[:5, 0]]
    away_starter_list = [find_player_code(name) for name in away_players.iloc[:5, 0]]

    def starter_rows(candidates, column, starters):
        # Keep rows from the cut-off onward whose text mentions any starter.
        kept = set()
        for row in candidates:
            if row < third_index:
                continue
            if any(code in str(df.iloc[row, column]) for code in starters):
                kept.add(row)
        return sorted(kept)

    home_subs = starter_rows(home_subs, home_column, home_starter_list)
    away_subs = starter_rows(away_subs, away_column, away_starter_list)

    return home_subs, away_subs, home_starter_list, away_starter_list

# For each game: locate second-half substitutions that involve starters and,
# when the conditions below hold, drop every play-by-play row from the first
# "non repeated" substitution to the end. The frame is then written back.
for i, game in enumerate(all_game_names):
    df = pd.read_csv(f"games/{game}_pbp.csv")
    home_players = pd.read_csv(f"games/{game}/{home_team_names[i]}_box.csv")
    away_players = pd.read_csv(f"games/{game}/{away_team_names[i]}_box.csv")
    
    # Finding start of third quarter
    # (index label of the first row whose column 1 contains the marker text;
    # raises IndexError if no row matches)
    third_index = df[df.iloc[:, 1].str.contains('Start of 3rd quarter', na=False)].index[0]
    home_subs, away_subs, home_starters, away_starters = process_substitutions(df, home_players, away_players, third_index)
    
    # Finds first and second subs for home and away teams
    # "first" = text before ' enters the game for ' (player coming in),
    # "second" = text after it (player going out). The first qualifying
    # substitution of each team is skipped via [1:].
    home_first_subs = [df.iloc[i, 3].split(' enters the game for ')[0].strip() for i in home_subs[1:]]
    home_second_subs = [df.iloc[i, 3].split(' enters the game for ')[1].strip() for i in home_subs[1:]]
    away_first_subs = [df.iloc[i, 1].split(' enters the game for ')[0].strip() for i in away_subs[1:]]
    away_second_subs = [df.iloc[i, 1].split(' enters the game for ')[1].strip() for i in away_subs[1:]]
    
    # Counting home and away resubs: outgoing players who also appear as an
    # incoming player in the same team's list
    home_resub_count = sum(1 for val in home_second_subs if val in home_first_subs)
    away_resub_count = sum(1 for val in away_second_subs if val in away_first_subs)
    
    # Counting non repeated subs: rows whose outgoing player never appears as
    # an incoming player.
    # NOTE(review): the *_second_subs lists are built from *_subs[1:], but the
    # position found by .index(val) is used to index the full *_subs list, so
    # the row picked may be one substitution earlier than intended - confirm.
    total_non_repeated_subs = [home_subs[home_second_subs.index(val)] for val in home_second_subs if val not in home_first_subs]
    total_non_repeated_subs.extend(away_subs[away_second_subs.index(val)] for val in away_second_subs if val not in away_first_subs)
    total_non_repeated_subs.sort()
    
    # Rows are dropped only when at least one such substitution exists and
    # either team has two or fewer resubs.
    if total_non_repeated_subs and (home_resub_count <= 2 or away_resub_count <= 2):
        print("Game Code: " + game)
        drop_indices(total_non_repeated_subs, [], df, 0)
    
    # Score is read from column 2 of the second-to-last remaining row
    true_away, true_home = split_score(df.iloc[-2, 2])
    print(f"True away score: {true_away}\nTrue home score: {true_home}")
    df.to_csv(f"games/{game}_pbp.csv", index=False)

# End-of-game trim: once the leading team makes a "positive play" (a made shot
# or a rebound) at or after the first row of the 4th quarter with 24 seconds or
# less on the clock, while ahead by more than 3, the rows after that play are
# dropped - unless the margin later falls below 4 again, in which case the
# search resumes from the row where it did.
pos_plays = [" makes ", " rebound "]


def _clock_to_seconds(value):
    """Convert an "M:SS.s" clock string to seconds; return anything else unchanged."""
    text = str(value)
    if ':' not in text:
        return value
    try:
        # float, not int: the seconds part carries tenths (e.g. "45.0").
        return sum(float(part) * 60 ** (1 - position)
                   for position, part in enumerate(text.split(':')))
    except ValueError:
        return value


def _seconds_or_none(value):
    """Return value as whole seconds, or None when it is not a usable number."""
    try:
        seconds = float(value)
    except (TypeError, ValueError):
        return None  # header text such as "4th Q" or "Time"
    return None if pd.isna(seconds) else int(seconds)


def _score_or_none(value):
    """Return (away, home) parsed from a score cell, or None if it has no score."""
    try:
        return split_score(value)
    except (AttributeError, ValueError):
        return None  # missing cell (NaN) or non-score text


for i, game in enumerate(all_game_names):
    df = pd.read_csv(f"games/{game}_pbp.csv")
    print("Game code: " + game)

    final_away, final_home = split_score(df.iloc[-2, 2])
    actual_winner = "home" if final_home > final_away else "away" if final_away > final_home else "tie"

    # Replace the clock strings in column 0 with seconds remaining.
    df.iloc[:, 0] = df.iloc[:, 0].apply(_clock_to_seconds)

    # First row after the 4th-quarter headers with 24 seconds or less left;
    # 0 means "nothing to examine" (also used when there is no 4th quarter).
    under_24 = 0
    fourth_rows = df.index[df.iloc[:, 0] == '4th Q']
    if len(fourth_rows):
        fourth_index = fourth_rows[0]
        for row in range(fourth_index + 2, len(df)):
            seconds = _seconds_or_none(df.iloc[row, 0])
            if seconds is not None and seconds <= 24:
                under_24 = row
                break

    scoring_drop_indices = []
    winning_team = ""

    def end_check(index, winning_team):
        """Find the first decisive play at or after index.

        Returns (True, team) and records the row in scoring_drop_indices when
        the leading team has a positive play while ahead by more than 3;
        otherwise returns (False, winning_team).
        """
        if not index:
            print("No need to examine end-of-game closeness.")
            return False, winning_team

        for y in range(index, len(df) - 1):
            score = _score_or_none(df.iloc[y, 2])
            if score is None:
                continue
            away, home = score
            for play, team in [(df.iloc[y, 1], "away"), (df.iloc[y, 3], "home")]:
                leading = (away > home and team == "away") or (home > away and team == "home")
                if leading and abs(away - home) > 3 and any(p in str(play) for p in pos_plays):
                    scoring_drop_indices.append(y)
                    return True, team
        return False, winning_team

    def margin_check(index, winning_team):
        """Return the first row at or after index where winning_team leads by
        fewer than 4, or index itself if that never happens."""
        for y in range(index, len(df) - 1):
            score = _score_or_none(df.iloc[y, 2])
            if score is None:
                continue
            away, home = score
            margin = away - home if winning_team == "away" else home - away
            if margin < 4:
                return y
        return index

    index = under_24
    while True:
        decided, winning_team = end_check(index, winning_team)
        if not decided:
            break
        index = margin_check(scoring_drop_indices[-1], winning_team)
        if index == scoring_drop_indices[-1]:
            break  # the lead never shrank below 4 again: the cut-off stands
        scoring_drop_indices.pop()  # game became close again: keep searching

    if scoring_drop_indices and winning_team == actual_winner:
        drop_indices(scoring_drop_indices, [], df, 1)
    else:
        print("No Dropped Rows")

    df.to_csv(f"games/{game}_pbp.csv", index=False)

In [None]:
# Trying to work with iterating through the data frames to record statistics
# Column position of each counting stat in the cleaned box-score CSVs.
_STAT_COLUMNS = {
    'FG_MADE': 1, 'FG_ATTEMPT': 2, '3PT_MADE': 4, '3PT_ATTEMPT': 5,
    'FT_MADE': 7, 'FT_ATTEMPT': 8, 'OFF_REB': 10, 'DEF_REB': 11,
    'ASSISTS': 13, 'STEALS': 14, 'BLOCKS': 15, 'TURNOVERS': 16
}


def _token_before(text, marker):
    """Return the word immediately before marker in text, or None if absent."""
    head, found, _ = text.partition(marker)
    words = head.split()
    return words[-1] if found and words else None


def _token_after(text, marker):
    """Return the word immediately after marker (parentheses stripped), or None."""
    _, found, tail = text.partition(marker)
    words = tail.split()
    return words[0].strip("()") if found and words else None


def _credit(box, rows, code, stats):
    """Add 1 to each named stat for code; codes not on the roster are ignored."""
    row = rows.get(code)
    if row is None:
        return  # e.g. a team rebound, or a name that was never converted
    for stat in stats:
        box.iat[row, _STAT_COLUMNS[stat]] += 1


def _tally_play(text, own, own_rows, opp, opp_rows):
    """Record every stat described by one play string from own's column."""
    # Shots and free throws: "<code> makes|misses <2-pt|3-pt|... free throw> ..."
    for verb, made in ((" makes ", True), (" misses ", False)):
        shooter = _token_before(text, verb)
        if shooter is None:
            continue
        outcome = text.partition(verb)[2]
        if outcome.startswith("2-pt"):
            attempted, scored = ('FG_ATTEMPT',), ('FG_MADE',)
        elif outcome.startswith("3-pt"):
            attempted, scored = ('FG_ATTEMPT', '3PT_ATTEMPT'), ('FG_MADE', '3PT_MADE')
        elif "free throw" in outcome:
            attempted, scored = ('FT_ATTEMPT',), ('FT_MADE',)
        else:
            continue
        _credit(own, own_rows, shooter, attempted + (scored if made else ()))
        break

    # Events credited to the team whose column the play appears in.
    for marker, stat in (("Offensive rebound by", 'OFF_REB'),
                         ("Defensive rebound by", 'DEF_REB'),
                         ("assist by", 'ASSISTS'),
                         ("Turnover by", 'TURNOVERS')):
        _credit(own, own_rows, _token_after(text, marker), (stat,))

    # Steals and blocks are noted inside the other team's play.
    for marker, stat in (("steal by", 'STEALS'), ("block by", 'BLOCKS')):
        _credit(opp, opp_rows, _token_after(text, marker), (stat,))


def _fill_derived_stats(box):
    """Compute points, shooting percentages and total rebounds in place."""
    labels = box.columns
    fg, fga = box.iloc[:, 1], box.iloc[:, 2]
    threes, threes_att = box.iloc[:, 4], box.iloc[:, 5]
    ft, fta = box.iloc[:, 7], box.iloc[:, 8]

    # Whole columns are replaced by label so float results can go into
    # columns that were read as integers.
    box[labels[17]] = (fg - threes) * 2 + threes * 3 + ft           # Points
    box[labels[3]] = (fg / fga.where(fga != 0)).round(3)            # FG%
    box[labels[6]] = (threes / threes_att.where(threes_att != 0)).round(3)  # 3PT%
    box[labels[9]] = (ft / fta.where(fta != 0)).round(3)            # FT%
    box[labels[12]] = box.iloc[:, 10] + box.iloc[:, 11]             # Total Rebounds


def process_game_stats(game, home_team, away_team):
    """Rebuild both box scores of a game from its play-by-play file.

    Reads games/<game>_pbp.csv and the two games/<game>/<team>_box.csv files,
    converts the names in each box score's first column with
    find_player_code, and adds one to the matching stat for every shot, free
    throw, rebound, assist, turnover, steal and block found in the plays.
    Away plays are read from column 1 and home plays from column 3. Points,
    percentages and total rebounds are then derived, missing values are
    replaced by 0, and both box scores are written back to the same files.

    Assumes the stat columns already hold numbers (the earlier cell zeroes
    them) and that players appear in the play text as the same codes
    find_player_code returns. Plays naming anyone else are skipped.

    Args:
        game: Game code used in the file names.
        home_team: Home team abbreviation used in the box-score file name.
        away_team: Away team abbreviation used in the box-score file name.
    """
    plays = pd.read_csv(f"games/{game}_pbp.csv")

    paths = {
        "home": f"games/{game}/{home_team}_box.csv",
        "away": f"games/{game}/{away_team}_box.csv",
    }
    boxes, rosters = {}, {}
    for side, path in paths.items():
        box = pd.read_csv(path)
        box.iloc[:, 0] = box.iloc[:, 0].apply(find_player_code)
        roster = {}
        for position, code in enumerate(box.iloc[:, 0]):
            roster.setdefault(code, position)  # first occurrence wins
        boxes[side], rosters[side] = box, roster

    for column, own, opp in ((1, "away", "home"), (3, "home", "away")):
        for text in plays.iloc[:, column]:
            if isinstance(text, str):  # skip empty (NaN) cells
                _tally_play(text, boxes[own], rosters[own], boxes[opp], rosters[opp])

    for side, path in paths.items():
        _fill_derived_stats(boxes[side])
        boxes[side].fillna(0).to_csv(path, index=False)

# Rebuild the box scores of every scraped game; the home and away team lists
# are indexed in parallel with all_game_names.
for i, game in enumerate(all_game_names):
    process_game_stats(game, home_team_names[i], away_team_names[i])