In [25]:
import requests
from bs4 import BeautifulSoup

# PRE: given specific start url for the web database sports-reference
# POST: returns arrays of tournament years, links to tournament pages
def getTourneyYearLinks(url):
#     print("-> START 'getTourneyYearLinks'")
    
    baseUrl = 'https://www.sports-reference.com'
    
    # open given url
#     print("opening url: " + url + " ...")
    response = requests.get(url)
#     print("url opened!")

    # parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # find the link to the specific tournament page
    tourney_table = soup.find(id='seasons_NCAAW')
    if tourney_table == None:
        print("Table with id='seasons_NCAAW' not found on page: " + url)
        return [], []
    
    tourneys = tourney_table.find_all('a')
    # check variable exists and is not empty
    if tourneys:
        print("tourneys links found!")
    else: 
        print("could not find tourneys")
        return [], []

    tourneyYears = []
    tourneyLinks = []
    for tourney in tourneys:
        tourney_text = tourney.text
        tourney_link = tourney['href']
        if 'NCAA' in tourney_text:
            tourneyYears.append(tourney_text)
            tourneyLinks.append(baseUrl + tourney_link)

#     print("FINISH 'getTourneyYearLinks'")
#     print()
    
    return tourneyYears, tourneyLinks


In [26]:
import requests
from bs4 import BeautifulSoup

#PRE: given url to tournament page, headlessMode bool
#POST: returns array of links to all games in that tournament season
def getTourneyGameLinks(url):
#     print("-> START 'getTourneyGameLinks'")

    baseUrl = 'https://www.sports-reference.com'
    
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    page = requests.get(url, headers=headers)

    soup = BeautifulSoup(page.content, 'html.parser')

    # find all divs with class "winner"
    winner_divs = soup.find_all('div', class_='winner')
    if winner_divs == None:
        print("divs with class='winner' not found on page: " + url)

    # extract links from second "a" tag in each "winner" div
    winner_links = []
    for div in winner_divs:
        if div.find_all('a')[1]['href'] != None: 
            winner_links.append(baseUrl + div.find_all('a')[1]['href'])
        else: 
            print("links in winner divs not found on page: " + url)
            return []

#     print("game box score links gotten!")
#     print("quitting driver")

#     print("FINISH 'getTourneyGameLinks'")
#     print()

    return winner_links


In [15]:
#DEFUNCT

import requests
from bs4 import BeautifulSoup
import pandas as pd

def getGameBoxScores(url):
    print("-> START 'getGameBoxScores'")

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get team names
    team1_elem = soup.select_one('#content > div.scorebox > div:nth-child(1) > div:nth-child(2) > strong > a')
    team2_elem = soup.select_one('#content > div.scorebox > div:nth-child(3) > div:nth-child(2) > strong > a')
    if team1_elem is None or team2_elem is None:
        print("ERROR: Could not find team names.")
        return None

    team1 = team1_elem.text
    team2 = team2_elem.text
    
    print("team 1: " + team1)
    print("team 2: " + team2)
    
    
    # Get basic stats tables
    basic_stats1 = soup.select_one('#box-' + team1.lower().replace(' ', '-') + '-game-basic')
    basic_stats2 = soup.select_one('#box-' + team2.lower().replace(' ', '-') + '-game-basic')
    if basic_stats1 is None or basic_stats2 is None:
        print("ERROR: Could not find basic stats tables.")
        return None

    # Get advanced stats tables
    adv_stats1 = soup.select_one('#box-' + team1.lower().replace(' ', '-') + '-game-advanced')
    adv_stats2 = soup.select_one('#box-' + team2.lower().replace(' ', '-') + '-game-advanced')
    if adv_stats1 is None or adv_stats2 is None:
        print("ERROR: Could not find advanced stats tables.")
        return None

    # Parse basic stats tables
    headers = [th.text for th in basic_stats1.select('thead th')]
    rows1 = [[td.text for td in row.find_all('td')] for row in basic_stats1.select('tbody tr')]
    rows2 = [[td.text for td in row.find_all('td')] for row in basic_stats2.select('tbody tr')]

    # Combine basic stats data for both teams
    basic_stats = [[team1, headers, rows1], [team2, headers, rows2]]

    # Parse advanced stats tables
    headers = [th.text for th in adv_stats1.select('thead th')]
    rows1 = [[td.text for td in row.find_all('td')] for row in adv_stats1.select('tbody tr')]
    rows2 = [[td.text for td in row.find_all('td')] for row in adv_stats2.select('tbody tr')]

    # Combine advanced stats data for both teams
    adv_stats = [[team1, headers, rows1], [team2, headers, rows2]]

    gameStats = [basic_stats, adv_stats]

    print("FINISH 'getGameBoxScores'")
    print()

    return gameStats


In [16]:
#DEFUNCT

import requests 
from bs4 import BeautifulSoup
import pandas as pd
from lxml import etree

def getGameBoxScoresV2(url):
    HEADERS = ({'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
            (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',\
            'Accept-Language': 'en-US, en;q=0.5'})

    webpage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")
    dom = etree.HTML(str(soup))
    
    #get playing teams and respective records
    nameSep = '('
    recordSep = '-'\
    
    team1 = dom.xpath('//*[@id="boxes"]/div[2]/div/h2')[0].text.split(nameSep, 1)[0]
    team1Record = dom.xpath('//*[@id="boxes"]/div[2]/div/h2')[0].text.split(nameSep, 1)[1][:-1]
    team1WinLoss = [team1Record.split(recordSep, 1)[0], team1Record.split(recordSep, 1)[1]]
    
    team2 = dom.xpath('//*[@id="boxes"]/div[5]/div/h2')[0].text.split(nameSep, 1)[0]
    team2Record = dom.xpath('//*[@id="boxes"]/div[5]/div/h2')[0].text.split(nameSep, 1)[1][:-1]
    team2WinLoss = [team2Record.split(recordSep, 1)[0], team1Record.split(recordSep, 1)[1]]
    
    gameTitle = team1 + " vs. " + team2
    print(gameTitle, team1WinLoss, team2WinLoss)
    
    #get line score
    
    tableSelector = "line-score"
#     for table in soup.find_all('table'):
#         print(table.get('id'))
#     print(soup.find_all('table', id=tableSelector))

#     print("\n",soup)
    
    lineScore = dom.xpath('//*[@id="line-score"]')
    print(lineScore)
    
    
    
    #from line score, designate winner and loser
    
    
    #get Four Factors
    
    #get team 1 basic box score
    
    #get team 1 advanced box score
    
    
    #get team 2 basic box score
    
    #get team 2 advanced box score
    
    
    

In [27]:
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
from lxml import etree
import pandas as pd
import time

# PRE: given url to game in tournament
# POST: returns dictionary with game title, line score, four factors, and basic/advanced box scores for each team
# each table formatted as a pandas dataframe
def getGameInfoStats(url):
    response = requests.get(url)

    soup = BeautifulSoup(response.content, 'html.parser')
    dom = etree.HTML(str(soup))
    
    #get playing teams and respective records
    nameSep = '('
    recordSep = '-'\
    
    team1 = dom.xpath('//*[@id="boxes"]/div[2]/div/h2')[0].text.split(nameSep, 1)[0]
    team1Record = dom.xpath('//*[@id="boxes"]/div[2]/div/h2')[0].text.split(nameSep, 1)[1][:-1]
    team1WinLoss = [team1Record.split(recordSep, 1)[0], team1Record.split(recordSep, 1)[1]]    
    
    team2 = dom.xpath('//*[@id="boxes"]/div[5]/div/h2')[0].text.split(nameSep, 1)[0]
    team2Record = dom.xpath('//*[@id="boxes"]/div[5]/div/h2')[0].text.split(nameSep, 1)[1][:-1]
    team2WinLoss = [team2Record.split(recordSep, 1)[0], team1Record.split(recordSep, 1)[1]]
    
    gameTitle = team1 + " vs. " + team2
    

    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    lineScoreFind = 'id="line-score"'
    fourFactorFind = 'id="four-factors"'
    extractedTables = ''
    for c in comments:
        c = str(c)
        if (lineScoreFind in c) or (fourFactorFind in c):
            extractedTables = extractedTables + c + "\n"
    newTables = BeautifulSoup(extractedTables, 'html.parser')

    # Find the first table with caption 'Line Score Table'
    lineScoreTable = newTables.find('caption', text='Line Score Table').find_parent('table')

    # Find the first table with caption 'Four Factors Table'
    fourFactorsTable = newTables.find('caption', text='Four Factors Table').find_parent('table')

    # Find all tables with first row table header containing 'Basic Box Score Stats' or 'Advanced Box Score Stats'
    boxScoreTables = soup.find_all('table', class_="sortable stats_table")
    
    dfTables = []
    dfTables.append(pd.read_html(str(lineScoreTable))[0])
    dfTables.append(pd.read_html(str(fourFactorsTable))[0])
    for t in boxScoreTables:
        df = pd.read_html(str(t))[0]
        df.drop(df.index[5], inplace=True)
        dfTables.append(df)
        
    dfTables[2].rename(columns={'Starters': team1}, inplace=True)
    dfTables[3].rename(columns={'Starters': team1}, inplace=True)
    dfTables[4].rename(columns={'Starters': team2}, inplace=True)
    dfTables[5].rename(columns={'Starters': team2}, inplace=True)
    
    output = {
        'Game Title' : gameTitle,
        'Line Score' : dfTables[0],
        'Four Factors' : dfTables[1],
        'Team 1 Basic BS' : dfTables[2],
        'Team 1 Adv BS' : dfTables[3],
        'Team 2 Basic BS' : dfTables[4],
        'Team 2 Adv BS' : dfTables[5],
        
    }
    return output

In [38]:
#TESTING FUNCTIONS:
import time

def testingFunctions():
    baseUrl = 'https://www.sports-reference.com'
    url='https://www.sports-reference.com/cbb/seasons/'

    print("TESTING 'getTourneyYearLinks' function...")
    years, links = getTourneyYearLinks(url)

    #only want years 2010-2023 (excluding 2020)
    useableLinks = []
    useableYears = []
    if len(years) < 13:
        print("not found all the years/links needed. found " + str(len(years)) + " links")
        return
    for i in range(13):
        #skipping nonexistant 2020 tourney
        if(years[i]=='2020 NCAA'):
            continue
        useableLinks.append(links[i])
        useableYears.append(years[i])

    # #checking to see if arrays have been shortened correctly
    # print("tournament years and respective links being used:")
    # for i in range( len(useableLinks)):
    #     print(useableYears[i] + ": " + useableLinks[i])

    print("FINISH TESTING 'getTourneyYearLinks' function. \n")


    #setting test link
    print("TESTING 'getTourneyGameLinks' function...")
    index = 5
    newUrl = useableLinks[index]
    newYear = useableYears[index]
    #get the game links for the specified year to test
    print("\n go to: " + newUrl)
    gameLinks = getTourneyGameLinks(newUrl)

    # #checking to see if links were gathered correctly
    # print("games in " + newYear)
    # for gamelink in gameLinks:
    #     print(gamelink)

    print("FINISH TESTING 'getTourneyYearLinks' function. \n")

    # print("TESTING 'getGameBoxScoresV2' function...")
    # gameUrl = gameLinks[index]
    # print("game url: " + gameUrl)
    # gameBoxScores = getGameBoxScoresV2(gameUrl)



    # print("FINISH TESTING 'getGameBoxScores' function.")

    print("TESTING 'getGameInfoStats' function...")
    
#     gameUrl = gameLinks[index]
#     print("going to: " + gameUrl)
#     gameInfo = getGameInfoStats(gameUrl)
#     print("\nGathered Info:")
#     print('Game Title', gameInfo["Game Title"])
#     display(gameInfo["Line Score"])
#     display(gameInfo["Four Factors"])
#     display(gameInfo["Team 1 Basic BS"])
#     display(gameInfo["Team 1 Adv BS"])
#     display(gameInfo["Team 2 Basic BS"])
#     display(gameInfo["Team 2 Adv BS"])
    
    gamesInfo = []
    for i in range(3):
        gamesInfo.append( getGameInfoStats(gameLinks[i]) )
        print("got game info " + str(i+1) + "/3")
        time.sleep(3)
    
    print("displaying all gathered games:\n============================================")
    for i in range( len(gamesInfo) ):
            print("\nGathered Info:")
            print('Game Title', gamesInfo[i]["Game Title"])
            display(gamesInfo[i]["Line Score"])
            display(gamesInfo[i]["Four Factors"])
            display(gamesInfo[i]["Team 1 Basic BS"])
            display(gamesInfo[i]["Team 1 Adv BS"])
            display(gamesInfo[i]["Team 2 Basic BS"])
            display(gamesInfo[i]["Team 2 Adv BS"])
            print("===================================================")


    

    print("FINISH TESTING 'getGameInfoStats' function.")

    print("testing sleep functionality...")
    time.sleep(4)
    print("sleep finished.")



    print("\n ---FINISHED TESTING FUNCTIONS.---")
    
    
testingFunctions()

TESTING 'getTourneyYearLinks' function...
tourneys links found!
FINISH TESTING 'getTourneyYearLinks' function. 

TESTING 'getTourneyGameLinks' function...

 go to: https://www.sports-reference.com/cbb/postseason/women/2016-ncaa.html
FINISH TESTING 'getTourneyYearLinks' function. 

TESTING 'getGameInfoStats' function...
got game info 1/3
got game info 2/3
got game info 3/3


In [41]:
#DATA GATHERING
import time

baseUrl = 'https://www.sports-reference.com'
url='https://www.sports-reference.com/cbb/seasons/'

#getting tourney season links
print("START Data collection... \n")

# 1 server call
years, links = getTourneyYearLinks(url)
print("-> GOT tourney links, years")

#only want years 2010-2023 (excluding 2020) -> 13 seasons of data 
useableLinks = []
useableYears = []
for i in range(13):
    #skipping nonexistent 2020 tourney
    if(years[i]=='2020 NCAA'):
        continue
    useableLinks.append(links[i])
    useableYears.append(years[i])
print("-> CLEANED links, years")
    
print("-> START Game Data Collection...")
allData = []
seasonsToCollect = 1
for i in range( seasonsToCollect ):
    print("--> START " + useableYears[i] + " season collection...")
    # 1 server call
    gameLinks = getTourneyGameLinks( useableLinks[i] )
    time.sleep(7)
    seasonData = []
    for j in range( len(gameLinks) ):
        # 1 server call
        seasonData.append( getGameInfoStats( gameLinks[j] ) )
        print("---> game data collected " + str(j+1) + "/" + str(len(gameLinks)))
        time.sleep(7)
    seasonOUT = [useableYears[i], seasonData]
    allData.append(seasonOUT)
    print("--> FINISH " + useableYears[i] + " season collection. \n")
print("-> FINISH Game Data Collection.")

print("\nFINISH Data Collection.")
    

START Data collection... 

tourneys links found!
-> GOT tourney links, years
-> CLEANED links, years
-> START Game Data Collection...
--> START 2022 NCAA season collection...
---> game data collected 1/63
---> game data collected 2/63
---> game data collected 3/63
---> game data collected 4/63
---> game data collected 5/63
---> game data collected 6/63
---> game data collected 7/63
---> game data collected 8/63
---> game data collected 9/63
---> game data collected 10/63
---> game data collected 11/63
---> game data collected 12/63
---> game data collected 13/63
---> game data collected 14/63
---> game data collected 15/63
---> game data collected 16/63
---> game data collected 17/63
---> game data collected 18/63
---> game data collected 19/63
---> game data collected 20/63
---> game data collected 21/63
---> game data collected 22/63
---> game data collected 23/63
---> game data collected 24/63
---> game data collected 25/63
---> game data collected 26/63
---> game data collected 27/

In [45]:
print(allData[0])

['2022 NCAA', [{'Game Title': 'Longwood  vs. NC State ', 'Line Score':              Scoring                    
  Unnamed: 0_level_1   1   2   3   4   T
0           Longwood  15   6  26  21  68
1           NC State  19  23  26  28  96, 'Four Factors':   Unnamed: 0_level_0 Unnamed: 1_level_0 Four Factors                     \
  Unnamed: 0_level_1               Pace         eFG%  TOV%  ORB% FT/FGA   
0           Longwood               76.0        0.399  15.5  36.6  0.188   
1           NC State               75.6        0.556  12.7  46.2  0.222   

  Unnamed: 6_level_0  
                ORtg  
0               89.5  
1              126.3  , 'Team 1 Basic BS':      Unnamed: 0_level_0 Basic Box Score Stats                                 \
              Longwood                     MP  FG FGA   FG%  2P 2PA   2P% 3P   
0       Tra'dayja Smith                    37   8  20  .400   3   7  .429  5   
1          Kyla McMakin                    37   7  20  .350   5   9  .556  2   
2           Aki