In [None]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Box-score statistics that are listed twice in the final layout: once for the
# team itself and once, prefixed with "opp_", for the opponent.
_box_score_stats = [
    'fg', 'fga', 'fgpct',
    '3p', '3pa', '3ppct',
    'ft', 'fta', 'ftpct',
    'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
]

# Full, ordered list of column names (38 entries): game metadata, the team's
# stats, the same stats for the opponent, and finally the team name.
master_column_names = (
    ['date', 'opp_name', 'wl', 'score', 'opp_score']
    + _box_score_stats
    + [f"opp_{stat}" for stat in _box_score_stats]
    + ['team_name']
)

In [None]:
# Instantiate an empty dictionary mapping each school's display name to the
# path of its page on the site (with the ".html" extension removed)
team_name_id_dict = {}

# URL for data from all schools
all_schools_url = 'https://www.sports-reference.com/cbb/seasons/men/2023-school-stats.html'
response = requests.get(all_schools_url, timeout = 10)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# otherwise only surface later as a confusing KeyError on the lookup below
response.raise_for_status()
# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed in the environment
soup = BeautifulSoup(response.content, 'html.parser')
# Find every table row ('tr') on the page
table = soup.find_all('tr')

# Iterate through rows
for row in table:
    # First link in the row; None if the row contains no link
    search = row.find('a', href=True)
    if search is None:
        continue

    # Read the target and the label from the parsed tag rather than from its
    # string form: this does not depend on attribute order and yields decoded
    # text (e.g. "A&M" instead of "A&amp;M")
    url_suffix = search['href'].replace(".html","")
    team_name = search.get_text().strip()
    # Update the dictionary
    team_name_id_dict[team_name] = url_suffix

# Sanity check: a KeyError here means the school table was not scraped
print(team_name_id_dict['Purdue'])

In [None]:
# Cleaned per-team game logs. They are collected in a list and concatenated
# in one go, because growing a DataFrame with pd.concat on every iteration
# re-copies all previously collected rows each time.
team_frames = []
# Combined result; rebuilt from team_frames at every checkpoint and at the end
master_df = pd.DataFrame()
# Create an iteration counter
counter = 0
# Random number of teams (20 to 99) to process between long rests, so as to
# not overload the site
stop_to_rest_point = np.random.randint(20,100)

url_prefix = 'https://www.sports-reference.com'

# Create the output directory up front so that a missing folder cannot abort
# the scrape at the first checkpoint
output_path = Path('parquet_files/box_scores_sports_reference.gzip')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Iterate through the dictionary
for team_name,url_suffix in tqdm(team_name_id_dict.items()):

    full_url = f"{url_prefix}{url_suffix}-gamelogs.html"
    # First table on the team's game-log page
    temp_df = pd.read_html(full_url)[0]
    # Flatten the two-level header: keep the lower-level name for columns under
    # an unnamed group or the "School" group, and prefix every other column
    # with "opp_"
    temp_df.columns = [col2 if (col1.startswith('Unnamed') or col1 == "School") else f"opp_{col2}" for col1,col2 in temp_df.columns]
    # Surface-level data cleaning: drop unnamed columns and the 'G' column,
    # drop rows with missing values and repeated header rows, then tag each
    # row with the team. assign() returns a new frame, so nothing is written
    # onto a filtered slice.
    temp_df = (
        temp_df.iloc[:,~temp_df.columns.str.startswith('Unnamed')]
        .drop('G',axis = 1)
        .dropna()
        .query("Date != 'Date'")
        .assign(team_name = team_name)
    )
    # Positional rename to the shared column layout
    temp_df.columns = master_column_names
    team_frames.append(temp_df)

    # Increment counter
    counter +=1
    if counter % stop_to_rest_point == 0:
        # Every stop_to_rest_point teams: checkpoint what has been collected so
        # far, then take a long rest (60 to 119 seconds)
        master_df = pd.concat(team_frames)
        master_df.to_parquet(output_path,compression='gzip')
        time.sleep(np.random.randint(60,120))
    else:
        # Otherwise sleep for 3 to 6 seconds (randint's upper bound is exclusive)
        time.sleep(np.random.randint(3,7))

# Final save; pd.concat raises on an empty list, so keep the empty frame if
# nothing was scraped
if team_frames:
    master_df = pd.concat(team_frames)
master_df.to_parquet(output_path,compression='gzip')