In [None]:
import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
from tqdm import tqdm 
import regex as re
import pickle
import numpy as np

import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output

In [11]:
# Browser-like User-Agent sent with every request; the NCAA site blocks
# requests that arrive without one.
_NCAA_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
}
_NCAA_BASE_URL = "http://stats.ncaa.org"
# Upper bound (seconds) on any single request so one stalled connection
# cannot hang the whole scrape indefinitely.
_NCAA_TIMEOUT_SECONDS = 30


def _fetch_soup(url):
    """Download ``url`` and return its body parsed with BeautifulSoup's html.parser."""
    response = requests.get(url, headers=_NCAA_HEADERS, timeout=_NCAA_TIMEOUT_SECONDS)
    return BeautifulSoup(response.content, 'html.parser')


def _table_rows(table):
    """Return the stripped text of every ``td`` cell in ``table``, one list per ``tr`` row."""
    return [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')
    ]


def NCAAFH_SCRAPE(start_date, season_length_in_days=90, season_division_id=17902):
    """Scrape box scores and play-by-play data from stats.ncaa.org.

    For every day in the requested window the daily scoreboard page is fetched
    and every "Box Score" link on it is collected. Each box score page is then
    fetched together with its "Play by Play" page.

    Parameters
    ----------
    start_date : datetime.date or datetime.datetime
        First day to scrape. The month, day AND year of each generated date are
        used in the scoreboard query, so seasons other than 2022 (and windows
        that cross a year boundary) request the correct dates.
    season_length_in_days : int, default 90
        Number of consecutive days, starting at ``start_date``, to scrape.
    season_division_id : int or str, default 17902
        Value placed in the ``/season_divisions/<id>/`` part of the scoreboard
        URL. NOTE(review): presumably this id is specific to one
        sport/season/division, so it likely needs to change together with
        ``start_date`` when scraping a different season -- confirm on the site.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns ``url`` (the play-by-play page URL),
        ``df_sum``, ``df_home``, ``df_away`` (DataFrames built from the first
        three ``mytable`` tables of the box score page), ``pbp`` (list of
        play-by-play rows, each a list of cell strings) and ``date`` (game date
        text as shown on the box score page). Empty if no games were found.

    Raises
    ------
    AttributeError, IndexError
        If a box score page does not have the expected structure (no
        "Play by Play" link, no "Game Date:" cell, or fewer than three
        ``mytable`` tables).
    requests.exceptions.RequestException
        If a request fails or exceeds the timeout.
    """
    # One scoreboard URL per day in the window.
    date_list = [start_date + datetime.timedelta(days=offset) for offset in range(season_length_in_days)]
    pages = [
        f"{_NCAA_BASE_URL}/season_divisions/{season_division_id}/livestream_scoreboards"
        f"?utf8=✓&season_division_id=&game_date={day.month}%2F{day.day}%2F{day.year}"
        "&conference_id=0&tournament_id=&commit=Submit"
        for day in date_list
    ]

    print('dates parsed')
    print('finding games')

    # Collect the box score URL of every game listed on each daily scoreboard.
    box_scores = []
    for page in tqdm(pages):
        soup = _fetch_soup(page)
        for row in soup.find_all('tr'):
            link = row.find('a', string='Box Score')  # first such link in the row, if any
            if link is not None:
                box_scores.append(f"{_NCAA_BASE_URL}{link.get('href')}")

    all_games = []

    print('games found')

    for target in tqdm(box_scores):

        clear_output(wait=True)
        print(f'retriving game data for: {target}')

        soup = _fetch_soup(target)

        # Link to the play-by-play page of this game.
        pbp_link = f"{_NCAA_BASE_URL}{soup.find('a',string='Play by Play').get('href')}"

        # The date text sits two siblings after the "Game Date:" label cell
        # inside the third table of the page.
        date = soup.find_all('table')[2].find('td',string='Game Date:').next_sibling.next_sibling.text.strip()

        # NOTE(review): going by the variable names, the first three "mytable"
        # tables are the score summary and the two teams' box scores -- confirm.
        tables = soup.find_all('table', class_='mytable')
        summary_table, home_table, away_table = tables[0], tables[1], tables[2]

        df_sum = pd.DataFrame(_table_rows(summary_table))
        # Drop the first two rows and the last row of each team table
        # (presumably header and totals rows -- confirm against the page).
        df_home = pd.DataFrame(_table_rows(home_table))[2:-1]
        df_away = pd.DataFrame(_table_rows(away_table))[2:-1]

        # Second request for this game: the play-by-play page.
        pbp_soup = _fetch_soup(pbp_link)

        pbp = []
        for table in pbp_soup.find_all('table', class_='mytable'):
            pbp.extend(_table_rows(table))

        pbp = pbp[4:-1]  # chop head and tail off
        # Keep only rows with more than one cell that are not column-header rows.
        pbp = [row for row in pbp if len(row) > 1 and row[0] != 'Time']

        # 'url' holds the play-by-play URL (not the box score URL); this matches
        # what earlier versions of this function stored under that key.
        all_games.append({
            'url': pbp_link,
            'df_sum': df_sum,
            'df_home': df_home,
            'df_away': df_away,
            'pbp': pbp,
            'date': date,
        })

    return pd.DataFrame(all_games)


In [None]:
# Scrape window: first day of the season and the number of days to cover.
start_date = datetime.datetime(2022, 8, 26)
season_length_in_days = 87

# Pass the season length explicitly; previously the value defined above was
# never handed to the scraper, so it silently ran with its 90-day default.
df = NCAAFH_SCRAPE(start_date, season_length_in_days=season_length_in_days)

In [None]:
# Persist the scraped games DataFrame to disk as a pickle file.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the project directory so the notebook runs on other machines.
# Only load this pickle back from a trusted location (unpickling executes code).
path = '/Users/euan_brown/Documents/GitHub/NCAA-fh-project/results.pkl'

serialized_games = pickle.dumps(df)
with open(path, mode='wb') as results_file:
    results_file.write(serialized_games)