Purpose: Scrape past NCAA tournament brackets and get the new matchups for the current tournament

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')


Process postseason data

In [16]:
# For every season from 2017 back to 2004: download that season's NCAA
# tournament bracket page, read each game (teams, seeds, winner) out of the
# bracket markup, join the games with the per-team statistics stored in
# data/combined_team_stats_<year>.csv, and write
#   data/games/tourn_games_<year>.csv  - one row per game
#   data/team/tourn_teams_<year>.csv   - first-round teams per region and seed


def _team_slug(tag):
    """Return the team identifier taken from the first link inside *tag*.

    The identifier is the fourth '/'-separated piece of the link's href,
    lower-cased, with spaces replaced by hyphens.
    """
    # presumably hrefs look like /cbb/schools/<school>/<year>.html - verify
    return tag.find('a')['href'].split('/')[3].lower().replace(' ', '-')


def _team_stats(team_data, stat_columns, team, year):
    """Return the *stat_columns* values of the first row named *team*.

    Raises:
        IndexError: if *team_data* has no row whose 'name' equals *team*.
            The exception type is the same one the bare ``.values[0]``
            lookup raised; only the message is more specific.
    """
    rows = team_data[team_data['name'] == team][stat_columns].values
    if len(rows) == 0:
        raise IndexError(
            "team '" + team + "' not found in data/combined_team_stats_"
            + year + ".csv"
        )
    return list(rows[0])


for year in [str(yr) for yr in range(2017, 2003, -1)]:
    # get page content
    url = 'https://www.sports-reference.com/cbb/postseason/' + year + '-ncaa.html'
    response = requests.get(url, timeout=30)
    # Fail here with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")

    # get region names and ordering
    regionNames = []
    switcher = soup.find_all("div", {"data-controls": '#brackets'})[0]
    for reg in switcher.find_all('div'):
        label = reg.find('a').text
        print(label)
        regionNames.append(label.lower().replace(' ', '').replace('.', ''))
    # The last entry ("Final Four" in the printed output) is looked up below
    # under the element id 'national'.
    regionNames[-1] = 'national'

    # Pull team information directly from bracket
    all_games = []   # [team1, team2, outcome, round, region] per game
    initRound = {}   # region -> {seed: team} for the region's first round
    teamSeed = {}    # team -> seed text

    for regionName in regionNames:
        initRound[regionName] = {}
        bracket = soup.find_all("div", {"id": regionName})[0]
        rounds = bracket.find_all('div', {'class': 'round'})
        # National rounds are numbered after the four regional rounds.
        roundOffset = 4 if regionName == 'national' else 0

        # The final 'round' div is skipped, as in range(len(rounds) - 1).
        for iround, divround in enumerate(rounds[:-1]):
            roundNumber = iround + 1 + roundOffset

            # get names and seeds; find_all also yields wrapper divs whose
            # first link repeats a team already seen, so keep first sightings
            seedByTeam = {}
            for t in divround.find_all('div'):
                team = _team_slug(t)
                if team not in seedByTeam:
                    seedByTeam[team] = t.find('span').text
            teamNames = list(seedByTeam)

            # determine winner
            winners = {_team_slug(t)
                       for t in divround.find_all('div', {'class': 'winner'})}

            # fill list for games: consecutive teams are paired as opponents
            for t1, t2 in zip(teamNames[0::2], teamNames[1::2]):
                if t2 in winners:
                    result = 0
                elif t1 in winners:
                    result = 1
                else:
                    # No winner marked for this game: record a missing
                    # outcome rather than reusing the previous game's value.
                    result = None

                s1 = seedByTeam[t1]
                s2 = seedByTeam[t2]
                teamSeed[t1] = s1
                teamSeed[t2] = s2
                all_games.append([t1, t2, result, roundNumber, regionName])

                # fill list for only initial teams
                if iround == 0:
                    initRound[regionName][s1] = t1
                    initRound[regionName][s2] = t2

    team_data = pd.read_csv('data/combined_team_stats_' + year + '.csv')
    # Team identifier parsed from the schedule URL, used to match bracket teams.
    team_data['name'] = [row.sched_url.split('/')[3]
                         for row in team_data.itertuples()]
    # Every column except the first, including the 'name' column just added.
    statColumns = team_data.columns[1:]

    combinedDF = []
    print('Looping over games in ',year)
    for game in all_games:
        combinedDF.append(
            _team_stats(team_data, statColumns, game[0], year)
            + _team_stats(team_data, statColumns, game[1], year)
            + [game[2], game[3], game[4], teamSeed[game[0]], teamSeed[game[1]]]
        )
    combinedColumns = ([c + '_1' for c in statColumns]
                       + [c + '_2' for c in statColumns]
                       + ['outcome', 'round', 'region', 'seed_1', 'seed_2'])

    game_data = pd.DataFrame(combinedDF, columns=combinedColumns)
    game_data.to_csv("data/games/tourn_games_" + year + ".csv")

    # First-round teams of the four regions; the national bracket is skipped.
    teamList = []
    for regionName, seededTeams in initRound.items():
        if regionName == 'national':
            continue
        for iseed, teamName in seededTeams.items():
            teamList.append([regionName, iseed, teamName])
    tourneyTeams = pd.DataFrame(teamList, columns=['regionName', 'seed', 'teamName'])
    tourneyTeams.to_csv("data/team/tourn_teams_" + year + ".csv")


East
Midwest
South
West
Final Four
Looping over games in  2017
East
South
West
Midwest
Final Four
Looping over games in  2016
East
Midwest
South
West
Final Four
Looping over games in  2015
Midwest
South
East
West
Final Four
Looping over games in  2014
East
Midwest
West
South
Final Four
Looping over games in  2013
Midwest
South
West
East
Final Four
Looping over games in  2012
East
Southeast
Southwest
West
Final Four
Looping over games in  2011
South
East
Midwest
West
Final Four
Looping over games in  2010
Midwest
East
South
West
Final Four
Looping over games in  2009
East
Midwest
South
West
Final Four
Looping over games in  2008
West
East
Midwest
South
Final Four
Looping over games in  2007
Minneapolis
Atlanta
Oakland
Washington
Final Four
Looping over games in  2006
Syracuse
Albuquerque
Austin
Chicago
Final Four
Looping over games in  2005
St. Louis
Atlanta
East Rutherford
Phoenix
Final Four
Looping over games in  2004
