In [1]:
import pickle
import time
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Final column labels for every cleaned game-log frame, in positional order.
# The grouping below is purely visual; the list is flat.
master_column_names = [
    # Game information
    'date', 'opp_name', 'wl', 'score', 'opp_score',
    # The team's own box-score columns
    'fg', 'fga', 'fgpct',
    '3p', '3pa', '3ppct',
    'ft', 'fta', 'ftpct',
    'orb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
    # The same columns for the opponent
    'opp_fg', 'opp_fga', 'opp_fgpct',
    'opp_3p', 'opp_3pa', 'opp_3ppct',
    'opp_ft', 'opp_fta', 'opp_ftpct',
    'opp_orb', 'opp_trb', 'opp_ast', 'opp_stl', 'opp_blk', 'opp_tov', 'opp_pf',
    # Name of the team the game log belongs to
    'team_name',
]

# Site root; page paths are appended to this when building request URLs
url_prefix = "https://www.sports-reference.com"

In [None]:
# Instantiate an empty dictionary that will map each team name to its school
# URL path (the link target with the ".html" extension removed)
team_name_id_dict = {}

# URL for data from all schools
all_schools_url = 'https://www.sports-reference.com/cbb/seasons/men/2023-school-stats.html'
response = requests.get(all_schools_url, timeout = 10)

# Stop on an error page instead of silently building an empty dictionary
if response.status_code != 200:
    raise Exception(f"Status code was {response.status_code} for {all_schools_url}")

# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "guessed parser" warning)
soup = BeautifulSoup(response.content, 'html.parser')
# Find every table row ('tr')
table = soup.find_all('tr')

# Iterate through rows
for row in table:
    # First link in the row; returns a None object if nothing is found
    search = row.find('a',href = True)
    if search is None:
        continue

    # Read the link target and the link text straight from the parsed tag.
    # Splitting str(search) depended on the exact attribute order of the
    # markup and left names entity-escaped (e.g. "&amp;" instead of "&").
    url_suffix = search['href'].replace(".html","")
    team_name = search.get_text(strip = True)
    # Update the dictionary
    team_name_id_dict[team_name] = url_suffix

# Spot check a single well-known entry
print(team_name_id_dict['Purdue'])

In [None]:
# Cleaned per-team frames are collected in a list and concatenated only when
# a snapshot is needed; re-concatenating a growing master frame on every
# iteration copies all previously collected rows each time
team_frames = []
# Create an iteration counter
counter = 0
# Create a random number between 20 and 99.  Every time this many teams have
# been processed the loop takes a long pause (so as to not overload the site)
# and writes a checkpoint.  The original compared with `==`, so the pause and
# checkpoint happened at most once per run.
stop_to_rest_point = np.random.randint(20,100)

# Make sure the output folder exists before any scraping time is spent
box_scores_path = Path('parquet_files/box_scores_sports_reference.gzip')
box_scores_path.parent.mkdir(parents = True, exist_ok = True)

# Iterate through the dictionary
for team_name,url_suffix in tqdm(team_name_id_dict.items()):

    full_url = f"{url_prefix}{url_suffix}-gamelogs.html"
    temp_df = pd.read_html(full_url)[0]
    # Surface-level data cleaning
    # Flatten the two-level header: columns under an unnamed or "School"
    # group keep their own name, every other column gets the "opp_" prefix
    temp_df.columns = [col2 if (col1.startswith('Unnamed') or col1 == "School") else f"opp_{col2}" for col1,col2 in temp_df.columns]
    temp_df = (
        temp_df.iloc[:,~temp_df.columns.str.startswith('Unnamed')]
        .drop('G',axis = 1)
        .dropna()
        # Drop header rows that are repeated inside the table body
        .query("Date != 'Date'")
        # assign() returns a new frame, which avoids writing a column into
        # the filtered result (a SettingWithCopyWarning candidate)
        .assign(team_name = team_name)
    )
    temp_df.columns = master_column_names
    team_frames.append(temp_df)

    # Increment counter
    counter +=1
    # Long sleep and checkpoint every `stop_to_rest_point` teams
    if counter % stop_to_rest_point == 0:

        time.sleep(np.random.randint(60,120))
        master_df = pd.concat(team_frames)
        master_df.to_parquet(box_scores_path,compression='gzip')
        continue

    # Sleep for 3 to 6 seconds (the upper bound of randint is exclusive)
    time.sleep(np.random.randint(3,7))

# pd.concat raises on an empty list, so fall back to an empty frame
master_df = pd.concat(team_frames) if team_frames else pd.DataFrame()
master_df.to_parquet(box_scores_path,compression='gzip')

Individual game box scores

In [None]:
# Every game identifier seen, across all teams
all_game_uis = set()
# Team name -> set of game identifiers found on that team's game-log page
team_game_id_dict = {team:set() for team in team_name_id_dict.keys()}

counter = 0
# Every time this many teams have been processed the loop takes a long pause.
# The original compared with `==`, so the long pause happened at most once.
stop_to_rest_point = np.random.randint(25,70)

# Only links that point into the box-score section identify a game
boxscore_href_prefix = '/cbb/boxscores/'

for team_name,team_url_section in tqdm(team_name_id_dict.items()):

    url = f'{url_prefix}{team_url_section}-gamelogs.html'
    # Time out instead of hanging forever on a stalled connection
    response = requests.get(url, timeout = 10)

    if response.status_code != 200:
        raise Exception(f"Status code was {response.status_code} for {url}")

    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find_all("tr")

    for row in table:

        # First link in the row, or None when the row has no links
        search = row.find('a',href = True)

        if search is None:
            continue

        # Rows whose first link is not a box-score link do not describe a
        # scrapeable game.  The original sliced str(search), so any such link
        # that did not end in '2023' was stored as a garbage identifier.
        href = search['href']
        if not href.startswith(boxscore_href_prefix):
            continue

        game_ui = href[len(boxscore_href_prefix):].replace('.html',"")

        # Games that were forfeited end with '2023', so we want to remove those
        # NOTE(review): probably redundant now that non-box-score links are
        # filtered out above; kept to preserve the original filter - confirm
        if game_ui.endswith('2023'):
            continue

        all_game_uis.add(game_ui)
        team_game_id_dict[team_name].add(game_ui)

    counter +=1
    if counter % stop_to_rest_point == 0:

        time.sleep(np.random.randint(60,120))
        continue

    # Sleep for 2 to 6 seconds (the upper bound of randint is exclusive)
    time.sleep(np.random.randint(2,7))

# Save the dict as a pickle file, creating the folder first if it is missing
team_game_ids_path = Path('pickle_files/sports_reference_cbb_teams_and_game_uis.pickle')
team_game_ids_path.parent.mkdir(parents = True, exist_ok = True)
with open(team_game_ids_path,'wb') as f:
    pickle.dump(team_game_id_dict,f)
    

In [2]:
boxscores_pickle_path = 'pickle_files/game_ids_with_boxscores.pickle'

# Game id -> dict holding at least a 'raw_dataframes' list (filled in below).
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open(boxscores_pickle_path,'rb') as f:
    game_ui_boxscore_dict = pickle.load(f)

counter = 0
# Write a checkpoint every `save_point` newly fetched games
save_point = 100

try:
    # Iterate through the game IDs
    for game_id, sub_dict in tqdm(game_ui_boxscore_dict.items()):

        # Confirmed bad ID
        if game_id == "2022-11-29-20-oral-roberts":
            continue

        # Skip if we've already gotten data for this game (list is not empty)
        if sub_dict['raw_dataframes']:
            continue

        url = f"{url_prefix}/cbb/boxscores/{game_id}.html"
        # Time out instead of hanging forever on a stalled connection
        response = requests.get(url, timeout = 10)

        # Stop if status code is bad
        if response.status_code != 200:
            raise Exception(f'Status code returned was {response.status_code} from URL {url}')

        # Name the parser explicitly so the parse does not depend on which
        # optional parsers happen to be installed
        soup = BeautifulSoup(response.content, 'html.parser')

        # The page title is expected to read "<first> vs. <second> Box ..."
        title = str(soup.find('title'))
        if 'vs.' not in title.split("Box")[0]:
            raise ValueError(f'Could not find the two team names in the page title of {url}')
        first_team = title.split('vs.')[0].replace('<title>',"").strip()
        second_team = title.split("Box")[0].split('vs.')[1].strip()

        # The original `assert first_team,second_team in team_game_id_dict`
        # parsed as `assert first_team` with the membership test as its
        # message, so the second name was never checked at all.
        # NOTE(review): membership in team_game_id_dict is deliberately not
        # enforced here - that dict is only built by an earlier cell, and
        # whether every opponent appears in it is unverified.  Confirm before
        # tightening this check.
        if not first_team or not second_team:
            raise ValueError(f'Empty team name parsed from the page title of {url}')
        sub_dict['first_team'] = first_team
        sub_dict['second_team'] = second_team

        # The last four dataframes of the 20+ returned are what we want.
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas versions.
        target_dfs = pd.read_html(StringIO(response.text))[-4:]
        sub_dict['raw_dataframes'].extend(target_dfs)

        counter += 1
        # Pickle it every so often just in case something happens.  The
        # original tested `counter < save_point`, which rewrote the file on
        # each of the first 99 fetches and then never again until the end.
        if counter % save_point == 0:
            with open(boxscores_pickle_path,'wb') as f:
                pickle.dump(game_ui_boxscore_dict,f)

        # Sleep for 3 to 4 seconds (the upper bound of randint is exclusive)
        time.sleep(np.random.randint(3,5))
finally:
    # Pickle it - also when the loop stops early (bad status code, timeout,
    # manual interrupt) so that everything fetched so far is kept.  The dict
    # was loaded from this same file, so the write only ever adds data.
    with open(boxscores_pickle_path,'wb') as f:
        pickle.dump(game_ui_boxscore_dict,f)

100%|██████████| 6222/6222 [1:18:15<00:00,  1.33it/s]
