In [150]:
import requests
from bs4 import BeautifulSoup

# PRE: given specific start url for the web database sports-reference
# POST: returns arrays of tournament years, links to tournament pages
def getTourneyYearLinks(url):
#     print("-> START 'getTourneyYearLinks'")
    
    baseUrl = 'https://www.sports-reference.com'
    
    # open given url
#     print("opening url: " + url + " ...")
    response = requests.get(url)
#     print("url opened!")

    # parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # find the link to the specific tournament page
    tourney_table = soup.find(id='seasons_NCAAW')
    if tourney_table == None:
        print("Table with id='seasons_NCAAW' not found on page: " + url)
        return [], []
    
    tourneys = tourney_table.find_all('a')
    # check variable exists and is not empty
    if tourneys:
        print("tourneys links found!")
    else: 
        print("could not find tourneys")
        return [], []

    tourneyYears = []
    tourneyLinks = []
    for tourney in tourneys:
        tourney_text = tourney.text
        tourney_link = tourney['href']
        if 'NCAA' in tourney_text:
            tourneyYears.append(tourney_text.split()[0])
            tourneyLinks.append(baseUrl + tourney_link)

    return tourneyYears, tourneyLinks


In [149]:
import requests
from bs4 import BeautifulSoup

#PRE: given url to tournament page
#POST: returns array of links to all games in that tournament season
def getTourneyGameLinks(url):
    baseUrl = 'https://www.sports-reference.com'
    
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    page = requests.get(url, headers=headers)

    soup = BeautifulSoup(page.content, 'html.parser')

    # find all divs with class "winner"
    winner_divs = soup.find_all('div', class_='winner')
    if winner_divs == None:
        print("divs with class='winner' not found on page: " + url)

    # extract links from second "a" tag in each "winner" div
    winner_links = []
    for div in winner_divs:
        if div.find_all('a')[1]['href'] != None: 
            winner_links.append(baseUrl + div.find_all('a')[1]['href'])
        else: 
            print("links in winner divs not found on page: " + url)
            return []

    return winner_links


In [148]:
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
from lxml import etree
import pandas as pd
import time

# PRE: given url to game in tournament
# POST: returns dictionary with game title, winner, line score, four factors, and basic/advanced box scores for each team
# each table formatted as a pandas dataframe
def getGameInfoStats(url):
    response = requests.get(url)

    soup = BeautifulSoup(response.content, 'html.parser')
    dom = etree.HTML(str(soup))
    
    #get playing teams and respective records
    nameSep = '('
    recordSep = '-'\
    
    try:
        team1 = dom.xpath('//*[@id="boxes"]/div[2]/div/h2')[0].text.split(nameSep, 1)[0]
        team1Record = dom.xpath('//*[@id="boxes"]/div[2]/div/h2')[0].text.split(nameSep, 1)[1][:-1]
        team1WinLoss = [team1Record.split(recordSep, 1)[0], team1Record.split(recordSep, 1)[1]]    
    except:
        print("--->> could not find team one data, skipping collection of game. > link: " + url)
        return None
    
    try: 
        team2 = dom.xpath('//*[@id="boxes"]/div[5]/div/h2')[0].text.split(nameSep, 1)[0]
        team2Record = dom.xpath('//*[@id="boxes"]/div[5]/div/h2')[0].text.split(nameSep, 1)[1][:-1]
        team2WinLoss = [team2Record.split(recordSep, 1)[0], team1Record.split(recordSep, 1)[1]]
    except:
        print("--->> could not find team two data, skipping collection of game. > link: " + url)
        return None
    
    gameTitle = team1 + " vs. " + team2
    

    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    lineScoreFind = 'id="line-score"'
    fourFactorFind = 'id="four-factors"'
    extractedTables = ''
    for c in comments:
        c = str(c)
        if (lineScoreFind in c) or (fourFactorFind in c):
            extractedTables = extractedTables + c + "\n"
    newTables = BeautifulSoup(extractedTables, 'html.parser')

    # Find the first table with caption 'Line Score Table'
    lineScoreTable = newTables.find('caption', text='Line Score Table').find_parent('table')

    # Find the first table with caption 'Four Factors Table'
    fourFactorsTable = newTables.find('caption', text='Four Factors Table').find_parent('table')

    # Find all tables with first row table header containing 'Basic Box Score Stats' or 'Advanced Box Score Stats'
    boxScoreTables = soup.find_all('table', class_="sortable stats_table")
    
    lineScoreDF = pd.read_html(str(lineScoreTable))[0]
    team1Score = lineScoreDF['Scoring']['T'].values[0]
    team2Score = lineScoreDF['Scoring']['T'].values[1]
    
    team1Win = True if(team1Score > team2Score) else (False)
    team2Win = not team1Win 
        
    winnerDict = {'Team': [team1, team2], 'Winner?':[team1Win, team2Win] }
    winnerDf = pd.DataFrame(data=winnerDict)
    
    dfTables = []
    dfTables.append(winnerDf)
    dfTables.append(pd.read_html(str(lineScoreTable))[0])
    dfTables.append(pd.read_html(str(fourFactorsTable))[0])
    for t in boxScoreTables:
        df = pd.read_html(str(t))[0]
        df.drop(df.index[5], inplace=True)
        dfTables.append(df)
        
    dfTables[2].rename(columns={'Starters': team1}, inplace=True)
    dfTables[3].rename(columns={'Starters': team1}, inplace=True)
    dfTables[4].rename(columns={'Starters': team2}, inplace=True)
    dfTables[5].rename(columns={'Starters': team2}, inplace=True)
    
    output = {
        'Game Title' : gameTitle,
        'Game Outcome' : dfTables[0],
        'Line Score' : dfTables[1],
        'Four Factors' : dfTables[2],
        'Team 1 Basic BS' : dfTables[3],
        'Team 1 Adv BS' : dfTables[4],
        'Team 2 Basic BS' : dfTables[5],
        'Team 2 Adv BS' : dfTables[6]
        
    }
    return output

In [96]:
import pandas as pd
import os

# #PRE: df is a pandas datafrome, folderLocation is the folder where it is to be saved,
#       fileName is the name of the file saved, fileType is the file type, including the '.'
# #POST: df has been converted to csv format and saved in the given folder with the given name as the given file type
def dataSave(df, folderLocation, fileName, fileType):
    path = folderLocation + '/' + fileName + fileType
    df.to_csv(path)
    
d = {"col1": [1, 2, 3], "col2": [4,5,6], "col3": [7,8,9]}
df = pd.DataFrame(data=d)

location = 'C:/Users/danna/Documents/GitHub/WBBTournamentPredictions/Web Scraping Code/GatheredData'
fName = "test"
fType = ".csv"
dataSave(df, location, fName, fType)

In [40]:
#TESTING FUNCTIONS:
import time

def testingFunctions():
    baseUrl = 'https://www.sports-reference.com'
    url='https://www.sports-reference.com/cbb/seasons/'

    print("TESTING 'getTourneyYearLinks' function...")
    years, links = getTourneyYearLinks(url)

    #only want years 2010-2023 (excluding 2020)
    useableLinks = []
    useableYears = []
    if len(years) < 13:
        print("not found all the years/links needed. found " + str(len(years)) + " links")
        return
    for i in range(13):
        #skipping nonexistant 2020 tourney
        if(years[i]=='2020'):
            continue
        useableLinks.append(links[i])
        useableYears.append(years[i])

    # #checking to see if arrays have been shortened correctly
    # print("tournament years and respective links being used:")
    # for i in range( len(useableLinks)):
    #     print(useableYears[i] + ": " + useableLinks[i])

    print("FINISH TESTING 'getTourneyYearLinks' function. \n")


    #setting test link
    print("TESTING 'getTourneyGameLinks' function...")
    index = 5
    newUrl = useableLinks[index]
    newYear = useableYears[index]
    #get the game links for the specified year to test
    print("\n go to: " + newUrl)
    gameLinks = getTourneyGameLinks(newUrl)

    # #checking to see if links were gathered correctly
    # print("games in " + newYear)
    # for gamelink in gameLinks:
    #     print(gamelink)

    print("FINISH TESTING 'getTourneyYearLinks' function. \n")

    # print("TESTING 'getGameBoxScoresV2' function...")
    # gameUrl = gameLinks[index]
    # print("game url: " + gameUrl)
    # gameBoxScores = getGameBoxScoresV2(gameUrl)



    # print("FINISH TESTING 'getGameBoxScores' function.")

    print("TESTING 'getGameInfoStats' function...")
    
    gameUrl = gameLinks[index]
    print("going to: " + gameUrl)
    gameInfo = getGameInfoStats(gameUrl)
    print("\nGathered Info:")
    print('Game Title', gameInfo["Game Title"])
    display(gameInfo["Game Outcome"])
    display(gameInfo["Line Score"])
    display(gameInfo["Four Factors"])
    display(gameInfo["Team 1 Basic BS"])
    display(gameInfo["Team 1 Adv BS"])
    display(gameInfo["Team 2 Basic BS"])
    display(gameInfo["Team 2 Adv BS"])
    
#     gamesInfo = []
#     for i in range(3):
#         gamesInfo.append( getGameInfoStats(gameLinks[i]) )
#         print("got game info " + str(i+1) + "/3")
#         time.sleep(3)
    
#     print("displaying all gathered games:\n============================================")
#     for i in range( len(gamesInfo) ):
#             print("\nGathered Info:")
#             print('Game Title', gamesInfo[i]["Game Title"])
#             display(gamesInfo[i]["Game Outcome"])
#             display(gamesInfo[i]["Line Score"])
#             display(gamesInfo[i]["Four Factors"])
#             display(gamesInfo[i]["Team 1 Basic BS"])
#             display(gamesInfo[i]["Team 1 Adv BS"])
#             display(gamesInfo[i]["Team 2 Basic BS"])
#             display(gamesInfo[i]["Team 2 Adv BS"])
#             print("===================================================")


    

    print("FINISH TESTING 'getGameInfoStats' function.")

    print("testing sleep functionality...")
    time.sleep(4)
    print("sleep finished.")
    
    print("TESTING 'dataSave' function...")
    
    d = {"col1": [1,2,3], "col2": [4,5,6], "col3": [7,8,9]}
    df = pd.DataFrame(data=d)
    display(df)
    loc = 'C:/Users/danna/Documents/GitHub/WBBTournamentPredictions/Web Scraping Code/GatheredData'
    fileName = 'testFile'
    fileType = '.csv'
    path = loc + '/' + fileName + fileType
    dataSave(df, loc, fileName, fileType)
    
    print("FINISH TESTING 'dataSave' function.")



    print("\n ---FINISHED TESTING FUNCTIONS.---")
    
    
testingFunctions()

TESTING 'getTourneyYearLinks' function...
tourneys links found!
FINISH TESTING 'getTourneyYearLinks' function. 

TESTING 'getTourneyGameLinks' function...

 go to: https://www.sports-reference.com/cbb/postseason/women/2016-ncaa.html
FINISH TESTING 'getTourneyYearLinks' function. 

TESTING 'getGameInfoStats' function...
going to: https://www.sports-reference.com/cbb/boxscores/2016-03-19-brigham-young_w.html

Gathered Info:
Game Title Missouri  vs. Brigham Young 


Unnamed: 0,Team,Winner?
0,Missouri,True
1,Brigham Young,False


Unnamed: 0_level_0,Scoring,Scoring,Scoring,Scoring,Scoring,Scoring
Unnamed: 0_level_1,Unnamed: 0_level_1,1,2,3,4,T
0,Missouri,15,20,25,18,78
1,BYU,17,14,18,20,69


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Four Factors,Four Factors,Four Factors,Four Factors,Unnamed: 6_level_0
Unnamed: 0_level_1,Unnamed: 0_level_1.1,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg
0,Missouri,71.4,0.625,23.9,20.0,0.523,109.9
1,Brigham Young,71.4,0.509,18.5,14.8,0.283,97.2


Unnamed: 0_level_0,Unnamed: 0_level_0,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats
Unnamed: 0_level_1,Missouri,MP,FG,FGA,FG%,2P,2PA,2P%,3P,3PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Sophie Cunningham,34,7,10,0.7,7,8,0.875,0,2,...,0.857,0,5,5,2,3,0,4,4,20
1,Morgan Stock,32,4,5,0.8,1,1,1.0,3,4,...,0.75,1,1,2,1,0,0,0,1,14
2,Jordan Frericks,30,8,13,0.615,8,13,0.615,0,0,...,0.6,0,9,9,2,1,0,5,3,19
3,Lindsey Cunningham,25,1,2,0.5,1,2,0.5,0,0,...,,0,2,2,5,0,0,3,5,2
4,Cierra Porter,10,1,4,0.25,1,3,0.333,0,1,...,,1,0,1,0,0,0,1,3,2
6,Kayla Michael,29,2,4,0.5,2,2,1.0,0,2,...,1.0,2,5,7,3,0,0,0,1,8
7,Juanita Robinson,21,2,5,0.4,0,0,,2,5,...,0.833,0,0,0,1,0,0,2,3,11
8,Lianna Doty,10,0,0,,0,0,,0,0,...,,0,0,0,1,1,0,1,1,0
9,Sierra Michaelis,6,0,1,0.0,0,0,,0,1,...,1.0,0,0,0,0,1,0,2,2,2
10,Maddie Stock,2,0,0,,0,0,,0,0,...,,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,Unnamed: 0_level_0,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats
Unnamed: 0_level_1,Brigham Young,MP,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM
0,Sophie Cunningham,34,0.75,0.7,0.2,0.7,0.0,21.8,12.5,14.0,4.9,0.0,23.4,27.0,118.0,91.0,
1,Morgan Stock,32,1.014,1.1,0.8,0.8,6.3,4.6,5.3,6.3,0.0,0.0,0.0,11.4,203.0,105.0,
2,Jordan Frericks,30,0.618,0.615,0.0,0.385,0.0,44.4,25.5,18.6,1.9,0.0,24.8,36.2,94.0,95.0,
3,Lindsey Cunningham,25,0.5,0.5,0.0,0.0,0.0,11.9,6.8,34.2,0.0,0.0,60.0,10.8,82.0,104.0,
4,Cierra Porter,10,0.25,0.25,0.25,0.0,20.0,0.0,8.5,0.0,0.0,0.0,20.0,26.9,56.0,105.0,
6,Kayla Michael,29,0.678,0.5,0.5,1.0,13.8,25.5,20.5,18.6,0.0,0.0,0.0,10.7,168.0,102.0,
7,Juanita Robinson,21,0.701,0.6,1.0,1.2,0.0,0.0,0.0,9.0,0.0,0.0,20.7,24.7,118.0,105.0,
8,Lianna Doty,10,,,,,0.0,0.0,0.0,16.0,5.6,0.0,100.0,5.4,51.0,93.0,
9,Sierra Michaelis,6,0.513,0.0,1.0,2.0,0.0,0.0,0.0,0.0,9.3,0.0,51.5,34.8,54.0,85.0,
10,Maddie Stock,2,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,105.0,


Unnamed: 0_level_0,Unnamed: 0_level_0,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats
Unnamed: 0_level_1,Brigham Young,MP,FG,FGA,FG%,2P,2PA,2P%,3P,3PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Kalani Purcell,40,7,10,0.7,6,9,0.667,1,1,...,0.8,2,7,9,4,3,0,4,4,19
1,Makenzi Pulsipher,37,3,10,0.3,1,4,0.25,2,6,...,0.833,1,3,4,0,2,0,1,4,13
2,Lexi Rydalch,36,8,17,0.471,7,12,0.583,1,5,...,1.0,0,1,1,1,1,0,7,5,22
3,Kylie Maeda,30,3,6,0.5,3,3,1.0,0,3,...,,0,1,1,1,1,0,2,2,6
4,Micaelee Orton,16,1,1,1.0,1,1,1.0,0,0,...,,1,0,1,0,0,1,0,4,2
6,Cassie Devashrayee,16,1,3,0.333,1,3,0.333,0,0,...,,0,1,1,0,1,0,0,5,2
7,Amanda Wayment,15,2,2,1.0,2,2,1.0,0,0,...,0.5,0,3,3,0,0,1,0,5,5
8,Kristine Nielson,8,0,3,0.0,0,1,0.0,0,2,...,,0,0,0,0,0,0,0,1,0
9,Jasmine Moody,2,0,1,0.0,0,1,0.0,0,0,...,,0,0,0,0,0,0,0,0,0
10,School Totals,200,25,53,0.472,21,36,0.583,4,17,...,0.833,4,16,20,6,8,2,14,30,69


Unnamed: 0_level_0,Unnamed: 0_level_0,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats,Advanced Box Score Stats
Unnamed: 0_level_1,Starters,MP,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM
0,Kalani Purcell,40,0.768,0.75,0.1,0.5,7.4,35.0,19.1,22.2,4.2,0.0,24.7,21.6,128.0,107.0,
1,Makenzi Pulsipher,37,0.506,0.4,0.6,0.6,4.0,16.2,9.2,0.0,3.0,0.0,7.3,19.7,104.0,111.0,
2,Lexi Rydalch,36,0.568,0.5,0.294,0.294,0.0,5.6,2.4,6.9,1.6,0.0,26.7,38.9,86.0,116.0,
3,Kylie Maeda,30,0.5,0.5,0.5,0.0,0.0,6.7,2.8,6.3,1.9,0.0,25.0,14.2,80.0,115.0,
4,Micaelee Orton,16,1.0,1.0,0.0,0.0,9.3,0.0,5.3,0.0,0.0,8.6,0.0,3.3,205.0,114.0,
6,Cassie Devashrayee,16,0.333,0.333,0.0,0.0,0.0,12.5,5.3,0.0,3.5,0.0,0.0,10.0,70.0,110.0,
7,Amanda Wayment,15,0.847,1.0,0.0,1.0,0.0,40.0,17.0,0.0,0.0,9.2,0.0,10.3,175.0,110.0,
8,Kristine Nielson,8,0.0,0.0,0.667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,120.0,
9,Jasmine Moody,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.7,0.0,120.0,
10,School Totals,200,0.561,0.509,0.321,0.34,14.8,80.0,42.6,24.0,11.3,6.9,18.5,100.0,97.2,109.9,


FINISH TESTING 'getGameInfoStats' function.
testing sleep functionality...
sleep finished.

 ---FINISHED TESTING FUNCTIONS.---


In [138]:
#DATA GATHERING
import time

baseUrl = 'https://www.sports-reference.com'
url='https://www.sports-reference.com/cbb/seasons/'

#getting tourney season links
print("\n===========================\nSTART Data collection... \n")

# 1 server call
years, links = getTourneyYearLinks(url)
print("-> GOT tourney links, years")

#only want years 2010-2023 (excluding 2020) -> 13 seasons of data 
useableLinks = []
useableYears = []
for i in range(13):
    #skipping nonexistent 2020 tourney
    if(years[i]=='2020'):
        continue
    useableLinks.append(links[i])
    useableYears.append(years[i])
print("-> CLEANED links, years")
    
print("-> START Game Data Collection...")
allData = []
seasonsToCollect = len(useableYears)
for i in range( seasonsToCollect ):
    print("--> START " + useableYears[i] + " season collection...")
    # 1 server call
    gameLinks = getTourneyGameLinks( useableLinks[i] )
    time.sleep(7)
    seasonData = []
    for j in range( len(gameLinks) ):
        # 1 server call
        if( getGameInfoStats( gameLinks[j] ) == None): 
            continue
        seasonData.append( getGameInfoStats( gameLinks[j] ) )
        print("---> game data collected " + str(j+1) + "/" + str(len(gameLinks)))
        time.sleep(7)
    seasonOUT = [useableYears[i], seasonData]
    allData.append(seasonOUT)
    print("--> FINISH " + useableYears[i] + " season collection. \n")
print("-> FINISH Game Data Collection.")

print("\nFINISH Data Collection.")

print("\n===========================\nSTART Data Saving...")

dataPathBase = 'C:/Users/danna/Documents/GitHub/WBBTournamentPredictions/Web Scraping Code/GatheredData'
dataPath = dataPathBase + '/' + '3.14.23 1am Data Scrape'

dfDefns = ['Line Score','Four Factors','Team 1 Basic BS','Team 1 Adv BS','Team 2 Basic BS','Team 2 Adv BS']
fileType = '.csv'

#go through all years collected
for i in range ( len(allData) ):
    year = str(allData[0][i])\
    #make folder for season
    sznPath = dataPath + '/' + year
    if not os.path.isdir(sznPath):
        os.makedirs(sznPath)  
    #go through each game's data for season
    for j in range ( len(allData[i][1]) ):
        gameTitle = str(allData[i][1][j]['Game Title'])
        #make folder for game's data to be saved at 
        gamePath = sznPath + '/' + gameTitle.strip()
        if not os.path.isdir(gamePath):
            os.makedirs(gamePath)  
        #save each data table as a csv in the game's folder 
        for defn in dfDefns:
            dataSave(allData[i][1][j][defn], gamePath, defn, fileType)
        
print("\n===========================\nFINISH Data Saving.")
        
        


START Data collection... 

tourneys links found!
-> GOT tourney links, years
-> CLEANED links, years
-> START Game Data Collection...
--> START 2022 season collection...


KeyboardInterrupt: 

In [139]:
import time

#PRE: yrIndex is the index associated with useableYears and useableLinks to get necessary data to save,
#     scrapeFolderName is the name of the folder the data will be saved to 
#POST: saves each piece of game data in .csv format (separately) to folder scrapeNameFolder
def gatherData(yrIndex, useableYears, useableLinks, scrapeFolderName):
    print("-> START Game Data Collection...")
    allData = []
    print("--> START " + useableYears[yrIndex] + " season collection...")
    # 1 server call
    gameLinks = getTourneyGameLinks( useableLinks[yrIndex] )
    time.sleep(7)
    seasonData = []
    for j in range( len(gameLinks) ):
        # 1 server call
        if( getGameInfoStats( gameLinks[j] ) == None ):
            continue
        seasonData.append( getGameInfoStats( gameLinks[j] ) )
        print("---> game data collected " + str(j+1) + "/" + str(len(gameLinks)))
        time.sleep(7)
    seasonOUT = [useableYears[yrIndex], seasonData]
    allData.append(seasonOUT)
    print("--> FINISH " + useableYears[yrIndex] + " season collection. \n")

    print("\nFINISH Data Collection.")

    print("\n===========================\nSTART Data Saving...")

    dataPathBase = 'C:/Users/danna/Documents/GitHub/WBBTournamentPredictions/Web Scraping Code/GatheredData'
    dataPath = dataPathBase + '/' + scrapeFolderName

    dfDefns = ['Line Score','Four Factors','Team 1 Basic BS','Team 1 Adv BS','Team 2 Basic BS','Team 2 Adv BS']
    fileType = '.csv'

    #go through all years collected
    for i in range ( len(allData) ):
        year = str(allData[0][i])
        #make folder for season
        sznPath = dataPath + '/' + year
        if not os.path.isdir(sznPath):
            os.makedirs(sznPath)  
        #go through each game's data for season
        for j in range ( len(allData[i][1]) ):
            gameTitle = str(allData[i][1][j]['Game Title'])
            #make folder for game's data to be saved at 
            gamePath = sznPath + '/' + gameTitle.strip()
            if not os.path.isdir(gamePath):
                os.makedirs(gamePath)  
            #save each data table as a csv in the game's folder 
            for defn in dfDefns:
                dataSave(allData[i][1][j][defn], gamePath, defn, fileType)

    print("\n===========================\nFINISH Data Saving.")



In [146]:
# Gather Data
baseUrl = 'https://www.sports-reference.com'
url='https://www.sports-reference.com/cbb/seasons/'

#getting tourney season links
print("\n===========================\nSTART Data collection... \n")

# 1 server call
years, links = getTourneyYearLinks(url)
print("-> GOT tourney links, years")

#only want years 2010-2023 (excluding 2020) -> 13 seasons of data 
useableLinks = []
useableYears = []
for i in range(13):
    #skipping nonexistent 2020 tourney
    if(years[i]=='2020'):
        continue
    useableLinks.append(links[i])
    useableYears.append(years[i])
print("-> CLEANED links, years")

yrIndex = 6
print(str(yrIndex+1) + '/' + str(len(useableYears)) + ' seasons => ' + str(useableYears[yrIndex]) + '\n\n')
scrapeFolderName = '3.14.23 230pm Scrape'
gatherData(yrIndex, useableYears, useableLinks, scrapeFolderName)




START Data collection... 

tourneys links found!
-> GOT tourney links, years
-> CLEANED links, years
7/12 seasons => 2015


-> START Game Data Collection...
--> START 2015 season collection...
---> game data collected 1/63
---> game data collected 2/63
---> game data collected 3/63
---> game data collected 4/63
---> game data collected 5/63
---> game data collected 6/63
---> game data collected 7/63
---> game data collected 8/63
---> game data collected 9/63
---> game data collected 10/63
---> game data collected 11/63
---> game data collected 12/63
---> game data collected 13/63
---> game data collected 14/63
---> game data collected 15/63
--->> could not find team two data, skipping collection of game.
---> game data collected 17/63
---> game data collected 18/63
---> game data collected 19/63
---> game data collected 20/63
---> game data collected 21/63
---> game data collected 22/63
---> game data collected 23/63
---> game data collected 24/63
---> game data collected 25/63
---> g