In [27]:
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm

In [28]:
def linkfix(links):
    transformed_links = []
    for link in links:            
            transformed_link = link.replace('/boxscores/', '/boxscores/pbp/')
            transformed_links.append(transformed_link)
    return transformed_links

In [29]:
# Step 1: Splice the dataframe based on the string value
def splice_dataframe(df, column_name, string_values):
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the dataframe.")
    indices = df[df[column_name].isin(string_values)].index.tolist()
    splits = [df.iloc[i:j].assign(Quarter= index) for index, (i, j) in enumerate(zip([0]+indices, indices+[None]), start=1)]

    for i in range(1, len(splits)):
        splits[i].index = [splits[i-1].index[-1]] + list(splits[i].index[1:])

    return splits

# Step 2: Remove the first four rows from each spliced dataframe
def remove_first_four_rows(df_list):
    modified_dfs = [df.drop(df.index[:4]) if index > 1 else df for index, df in enumerate(df_list, start=1)]
    return modified_dfs

# Step 3: Concatenate the modified spliced dataframes back together
def concatenate_dataframes(df_list):
    combined_df = pd.concat(df_list)
    return combined_df

In [30]:
def process_data(df):
    
    # Apply home_or_away function to 'away' column
    df['home_team'] = df['Away_Play'].apply(lambda x: 1 if pd.isnull(x) else 0)
    df['Play'] = df['Away_Play'].combine_first(df['Home_Play'])

    strings_to_drop = ['1st Q', '2nd Q', '3rd Q', '4th Q', '1st OT', '2nd OT', 'Score', 'quarter']
    df = df[~df['Score'].astype(str).str.contains('|'.join(strings_to_drop))]


    # Apply time_convert function to 'timeleft' column
    df['Seconds_left'] = df['Time'].apply(lambda x: int(x[:x.index(':')]) * 60 + int(x[x.index(':')+1:x.index('.')]) if isinstance(x, str) else x)

    # Apply score_away function to 'score' column
    df['score_away'] = df['Score'].apply(lambda x: int(x[:x.index('-')]) if isinstance(x, str) and '-' in x and len(x) <= 7 else x)
    df['score_home'] = df['Score'].apply(lambda x: int(x[x.index('-')+1:]) if isinstance(x, str) and '-' in x and len(x) <= 7 else x)

    # Apply pts_scored_away function to 'pts_scored' column
    df['Away_Pts_Scored'] = df['Away_Pts_Scored'].apply(lambda x: int(x[x.index('+')+1:]) if isinstance(x, str) and '+' in x else x)
    df['Home_Pts_Scored'] = df['Home_Pts_Scored'].apply(lambda x: int(x[x.index('+')+1:]) if isinstance(x, str) and '+' in x else x)
    
    df['score_away'] = pd.to_numeric(df['score_away'], errors='coerce')
    df['score_away'].fillna(method='ffill', inplace=True)
    df['score_away'].fillna(0, inplace=True)
    df['score_away'] = df['score_away'].astype(int)

    df['score_home'] = pd.to_numeric(df['score_home'], errors='coerce')
    df['score_home'].fillna(method='ffill', inplace=True)
    df['score_home'].fillna(0, inplace=True)
    df['score_home'] = df['score_home'].astype(int)

    df['Away_Pts_Scored'] = pd.to_numeric(df['Away_Pts_Scored'], errors='coerce')
    df['Away_Pts_Scored'].fillna(0, inplace=True)
    df['Away_Pts_Scored'] = df['Away_Pts_Scored'].astype(int)

    
    df['Home_Pts_Scored'] = pd.to_numeric(df['Home_Pts_Scored'], errors='coerce')
    df['Home_Pts_Scored'].fillna(0, inplace=True)
    df['Home_Pts_Scored'] = df['Home_Pts_Scored'].astype(int)
	
    df = df.drop(columns=['Time','Away_Play', 'Score', 'Home_Play'])

    target_words = ['makes 2-pt jump shot', 'makes 2-pt layup', 'makes 2-pt dunk', 'makes 2-pt hook shot', 'makes 3-pt jump shot','makes free throw', 
                    'misses 2-pt jump shot', 'misses 2-pt layup', 'misses 2-pt dunk', 'misses 2-pt hook shot', 'misses 3-pt jump shot','misses free throw',
                    'Turnover', 'Personal foul', 'Offensive foul', 'Technical foul', 'Flagrant foul', 'Shooting foul', 'Loose ball foul', 'Defensive rebound', 'Offensive rebound', 'timeout']

    for word in target_words:
        df[word] = df['Play'].apply(lambda x: 1 if word in x else 0)
    
    target_words2 = ['NBA Finals', 'Conference Finals', 'Conference Semi-Finals', 'First Round', 'Play-in']

    for word in target_words2:
        df[word] = df['game_title'].apply(lambda x: 1 if word in x else 0)

    df['playoffs'] = df['game_title'].apply(lambda x: 1 if any(word in x for word in target_words2) else 0)
    df = df.replace(',', ' ', regex=True).applymap(lambda x: f'"{x}"' if isinstance(x, str) else x)


    return df

In [31]:
SEASONS = [2021, 2022, 2023]
all_tables = []

for season in SEASONS:
    games = pd.DataFrame()
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    html = requests.get(url)

    soup = BeautifulSoup(html.text, 'html.parser')
    month_links = soup.find_all("a")
    href = [l["href"] for l in month_links]
    href = [l for l in href if f'/leagues/NBA_{season}_games-' in l]
    data = [f"https://www.basketball-reference.com{l}" for l in href]

    for url in data:

        month = requests.get(url)
        soup = BeautifulSoup(month.text, 'html.parser')
        game_schedule = soup.select('table.stats_table')[0]
        links = game_schedule.find_all("a")
        links = [l.get("href") for l in links]
        links = [l for l in links if '/boxscores/' in l]
        links = [l for l in links if not '/boxscores/index' in l]
        links = linkfix(links)

        df = pd.DataFrame({'links': links})
        games = pd.concat([games, df])

        time.sleep(3)

    games = games.reset_index(drop=True)
    games['game_id'] = games['links'].str[15:27]
    total_iterations = len(games)

        
    for index, row in tqdm(games.iterrows(), total=total_iterations, desc="Processing games"):    

        pbp = requests.get(f"https://www.basketball-reference.com{row['links']}")
        html_tables = pd.read_html(pbp.text, match = "Time", header=None)[0]
        html_tables.columns = range(html_tables.shape[1])
        
        soup = BeautifulSoup(pbp.text, 'html.parser')
        h1 = soup.find('h1')
        
        new_columns = ['Time', 'Away_Play', 'Away_Pts_Scored', 'Score', 'Home_Pts_Scored', 'Home_Play']
        html_tables.columns = new_columns
        html_tables['game_id'] = row['game_id']
        html_tables['game_title'] = h1.text


        column_name = 'Score'  # Actual column name containing the string values
        string_values = ['End of 1st quarter', 'End of 2nd quarter', 'End of 3rd quarter', 'End of 4th quarter', 'End of 1st overtime', 'End of 2nd overtime', 'End of 3rd overtime']  # List of string values to splice on
        
        # Splice the dataframe
        spliced_dfs = splice_dataframe(html_tables, column_name, string_values)
        # Remove the first three rows from each spliced dataframe
        modified_dfs = remove_first_four_rows(spliced_dfs)
        # Concatenate the modified spliced dataframes back together
        result = concatenate_dataframes(modified_dfs)
        cleaned = process_data(result)
        
        all_tables.append(cleaned)
        time.sleep(5)

    combined_df = pd.concat(all_tables, ignore_index=True)
    combined_df.to_csv(f'/Users/gtbut/OneDrive/Documents/NBA/playbyplay/scores/nbaplaybyplay{season}.csv') 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Seconds_left'] = df['Time'].apply(lambda x: int(x[:x.index(':')]) * 60 + int(x[x.index(':')+1:x.index('.')]) if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['score_away'] = df['Score'].apply(lambda x: int(x[:x.index('-')]) if isinstance(x, str) and '-' in x and len(x) <= 7 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs