Disclaimer: ChatGPT was used to help with syntax and logic problems.
All credit for statistical data goes to Basketball Reference.
The team abbreviation file was adapted from a file by Tgemayel found on GitHub.

In [None]:
import io
import os
import time

import pandas as pd
import numpy as np
import requests
import certifi
from bs4 import BeautifulSoup
from fuzzywuzzy import process

In [None]:
# All of the functions used in the code

def starter_conversion(starter):
    """Abbreviate a starter's name to "{First Initial}. {Last Name}".

    Only the first two whitespace-separated words of ``starter`` are used;
    anything after them (for example a suffix) is ignored. A name with fewer
    than two words raises ``ValueError`` from the unpacking below.
    """
    name_parts = starter.split()
    given, family = name_parts[:2]
    initial = given[0]
    return initial + ". " + family

def sub_check(array, sub_list):
    """Append to ``sub_list`` the positions in ``array`` that are substitutions.

    An entry counts as a substitution when it is not missing (per
    ``pd.notna``) and contains the text 'enters the game for'. ``sub_list``
    is extended in place; nothing is returned.
    """
    for position, entry in enumerate(array):
        if not pd.notna(entry):
            # Missing cells (None / NaN) cannot describe a substitution.
            continue
        if 'enters the game for' in entry:
            sub_list.append(position)

def drop_indices(combined_list, full_list, df, offset):
    """Drop the collected rows plus everything from the garbage-time start on.

    The first entry of ``combined_list`` shifted by ``offset`` marks the first
    row to discard; every label from there up to ``len(df) - 1`` is added to
    ``full_list``. All labels in ``full_list`` are then removed from ``df``
    in place, and the dropped labels are printed.
    """
    first_dropped = combined_list[0] + offset
    for row_label in range(first_dropped, len(df)):
        full_list.append(row_label)

    df.drop(index=full_list, inplace=True)
    print(f"Dropped rows: {full_list}")

def split_score(score_string):
    """Split a score string such as "101-99" into a tuple of integers.

    A tuple is returned (rather than a lazy ``map`` object) so the result can
    be indexed, measured with ``len`` and iterated more than once; unpacking
    such as ``away, home = split_score(s)`` keeps working.

    Raises ``ValueError`` if any '-'-separated part is not an integer.
    """
    return tuple(int(part) for part in score_string.split('-'))

def scrapegame_new(url, game_list):
    """Scrape a play-by-play page and save it as ``games/<game id>.csv``.

    The game id is the last path segment of ``url`` without ``.html``. Player
    links in the table are replaced with the player code taken from the
    link's href (the file name without its extension).

    ``game_list`` receives the game id only after the CSV has been written,
    so it never contains a game whose file does not exist. If the request
    fails or the page has no ``pbp`` table, a message is printed and the
    function returns without touching ``game_list``.
    """
    file_name = url.split('/')[-1].replace('.html', '')

    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url, verify=certifi.where(), timeout=30)
    # Pause after every request, successful or not, to throttle the scraper.
    time.sleep(3)

    if not response.ok:
        # An error page (e.g. rate limiting) would otherwise be parsed as if
        # it were the game page.
        print(f"Request for {url} failed with status {response.status_code}.")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Create player name to player code mapping
    player_dict = {
        anchor.text.strip(): anchor['href'].split('/')[-1].split('.')[0]
        for anchor in soup.find_all('a', href=True) if '/players/' in anchor['href']
    }

    table = soup.find('table', {'id': 'pbp'})
    if table is None:
        print("Play-by-play table not found.")
        return

    # Extract table data.
    # NOTE(review): a cell that contains links is reduced to just the codes of
    # the linked names, joined by spaces; the rest of the cell's text is not
    # kept. Confirm this is the intended output format.
    data = []
    for row in table.find_all('tr'):
        row_values = []
        for cell in row.find_all(['th', 'td']):
            anchors = cell.find_all('a', href=True)
            if anchors:
                row_values.append(' '.join(
                    player_dict.get(a.text.strip(), a.text.strip()) for a in anchors
                ))
            else:
                row_values.append(cell.get_text(strip=True))
        data.append(row_values)

    # Drop the first table row, then the third and fifth columns.
    df = pd.DataFrame(data).drop([0], axis=0)
    df.drop(df.columns[[2, 4]], axis=1, inplace=True)

    os.makedirs("games", exist_ok=True)
    file_path = os.path.join("games", f"{file_name}.csv")
    df.to_csv(file_path, index=False)

    # Record the game only now that its CSV is on disk.
    game_list.append(file_name)

    print(f"Saved play-by-play data to {file_path}")

def scrape_box_new(url, home_array, away_array):
    """Scrape a box-score page, record the teams and save both box scores.

    Team names are read from the page's ``<h1>`` ("<away> at <home> Box ...")
    and translated through ``Teams.csv`` (first column: name, second column:
    value appended to ``away_array`` / ``home_array``). The first table on
    the page is saved as ``games/<game id>/away_box.csv`` and the table at
    the midpoint of the table list as ``home_box.csv``.

    ``home_array`` and ``away_array`` are only extended once the page has
    been fetched and parsed successfully.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the title cannot be parsed, a team is missing from
            ``Teams.csv``, or fewer than two tables are found.
    """
    teams = pd.read_csv('Teams.csv')
    # Compare names with surrounding whitespace removed on both sides, so a
    # stray space in the page title or in the CSV cannot break the lookup.
    # setdefault keeps the first occurrence of a duplicated name.
    team_lookup = {}
    for name, value in zip(teams.iloc[:, 0], teams.iloc[:, 1]):
        team_lookup.setdefault(str(name).strip(), value)

    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url, verify=certifi.where(), timeout=30)
    # Pause after every request, successful or not, to throttle the scraper.
    time.sleep(3)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    heading = soup.find('h1')
    if heading is None:
        raise ValueError(f"No <h1> title found on {url}")
    title = heading.get_text(strip=True)

    title_parts = title.split(' at ')
    if len(title_parts) < 2:
        raise ValueError(f"Unexpected box score title: {title!r}")
    away_team = title_parts[0].strip()
    # The text before 'Box' ends with a space; strip it before the lookup.
    home_team = title_parts[1].split('Box')[0].strip()

    for team in (away_team, home_team):
        if team not in team_lookup:
            raise ValueError(f"Team {team!r} not found in Teams.csv")

    # Passing literal HTML to read_html is deprecated; wrap it in StringIO.
    df_list = pd.read_html(io.StringIO(response.text))
    if len(df_list) < 2:
        raise ValueError(f"Expected box score tables for two teams on {url}")
    # NOTE(review): assumes each team contributes the same number of tables,
    # away team first, so the home team's first table sits at the midpoint
    # (index 8 when there are 16 tables). Confirm against an overtime game.
    away_df, home_df = df_list[0], df_list[len(df_list) // 2]

    away_array.append(team_lookup[away_team])
    home_array.append(team_lookup[home_team])

    folder_path = os.path.join("games", url.split('/')[-1].replace('.html', ''))
    os.makedirs(folder_path, exist_ok=True)

    away_path = os.path.join(folder_path, "away_box.csv")
    home_path = os.path.join(folder_path, "home_box.csv")
    away_df.to_csv(away_path, index=False)
    home_df.to_csv(home_path, index=False)

    print(f"Saved box scores: {away_path}, {home_path}")

def _clock_to_seconds(clock):
    """Convert a clock string like '11:42.0' to seconds; NaN if not a clock."""
    minutes, colon, seconds = str(clock).partition(':')
    if not colon:
        return float('nan')
    try:
        # Seconds may carry tenths ("42.0"), which int() cannot parse.
        return int(minutes) * 60 + float(seconds)
    except ValueError:
        return float('nan')


def _score_margin(score):
    """Return the absolute margin of an 'A-B' score, or None if unparseable."""
    first, dash, second = str(score).partition('-')
    if not dash:
        return None
    try:
        return abs(int(first) - int(second))
    except ValueError:
        return None


def analyze_winning_margin(df, min_margin=4):
    """Remove rows that fall inside a sustained lead of ``min_margin`` or more.

    The first column is read as the game clock and the third as an "A-B"
    score string. A ``time_seconds`` column is added to ``df`` (the passed
    frame is modified), rows are ordered by it from highest to lowest, and a
    row is removed when both it and the row after it show a margin of at
    least ``min_margin``. The last row is therefore never removed. Rows whose
    clock or score cannot be parsed (header rows, missing cells) get NaN
    seconds, sort to the end and are never treated as lopsided.

    NOTE(review): ordering by clock value alone interleaves rows from
    different periods if the clock restarts each period — confirm that this
    ordering is intended.

    Returns the sorted, filtered DataFrame; the dropped index labels are
    printed.
    """
    df['time_seconds'] = df.iloc[:, 0].apply(_clock_to_seconds)
    # mergesort is stable, so rows sharing a clock value keep their order.
    df = df.sort_values(by='time_seconds', ascending=False, kind='mergesort')

    lopsided = []
    for score in df.iloc[:, 2]:
        margin = _score_margin(score)
        lopsided.append(margin is not None and margin >= min_margin)

    # Positions in the sorted frame where this row and the next are lopsided.
    drop_positions = [
        position
        for position, (current, following) in enumerate(zip(lopsided, lopsided[1:]))
        if current and following
    ]

    # After sorting, positions no longer match index labels, so report the
    # labels but select the surviving rows by position.
    drop_rows = df.index[drop_positions].tolist()
    dropped = set(drop_positions)
    keep_positions = [position for position in range(len(df)) if position not in dropped]
    df = df.iloc[keep_positions]

    print(f"Dropped garbage time rows: {drop_rows}")
    return df

In [None]:
# Runs the full pipeline for the example games: scrape play-by-play and box
# scores, then filter each saved play-by-play file with analyze_winning_margin.

# Player names from the "Player" column of Players_True.csv, as strings.
# Not used within this cell.
all_true_names_list = pd.read_csv("Players_True.csv")["Player"].astype(str).tolist()

# Box-score URLs of the games to scrape
box_scores = [
    "https://www.basketball-reference.com/boxscores/202310250CHO.html",
    "https://www.basketball-reference.com/boxscores/202404230LAC.html",
    "https://www.basketball-reference.com/boxscores/202403030BOS.html",
    "https://www.basketball-reference.com/boxscores/202404030MIN.html"
]

# The play-by-play URL of a game is its box-score URL with a 'pbp/' path segment inserted
play_by_play_urls = [url.replace('/boxscores/', '/boxscores/pbp/') for url in box_scores]

# Filled in by the scraping functions below: game_names by scrapegame_new,
# home_teams / away_teams by scrape_box_new
game_names, home_teams, away_teams = [], [], []

for url in play_by_play_urls:
    scrapegame_new(url, game_names)
for url in box_scores:
    scrape_box_new(url, home_teams, away_teams)

# Process winning margin analysis: each play-by-play CSV is read back,
# filtered, and overwritten in place with the filtered result
for game_name in game_names:
    play_by_play_file = os.path.join("games", f"{game_name}.csv")
    df = pd.read_csv(play_by_play_file)
    df = analyze_winning_margin(df)
    df.to_csv(play_by_play_file, index=False)
    print(f"Processed and saved adjusted play-by-play data for {game_name}")