In [1]:
from bs4 import BeautifulSoup
import urllib.request
import numpy as np
import pandas as pd
import re
import helper_functions as hf

## Get tournament teams

Because the scrape takes a while to run, I only scrape teams that were in the tournament in a given year (our training set).

In [2]:
# Read the full tournament-seed history, then keep only seasons 2003 and later.
seeds_all = pd.read_csv('data/kaggle_data/MNCAATourneySeeds.csv')
seeds = seeds_all.loc[seeds_all['Season'] >= 2003]

In [3]:
# Preview the first rows of the filtered seed table.
seeds.head()

Unnamed: 0,Season,Seed,TeamID
1154,2003,W01,1328
1155,2003,W02,1448
1156,2003,W03,1393
1157,2003,W04,1257
1158,2003,W05,1280


In [5]:
# Kaggle lookup of name spellings (TeamNameSpelling) for each TeamID.
team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')

In [6]:
# Attach every known spelling to each seeded team (one row per seed/spelling pair).
team_table = pd.merge(seeds, team_spellings, on=['TeamID'])

In [7]:
# Spellings of the teams seeded in the 2021 tournament.
team_names = team_table.loc[team_table['Season'] == 2021, 'TeamNameSpelling']

In [120]:
# Build the list of name spellings for every team seeded in season `yr`.
yr = 2021
seeds_all = pd.read_csv('data/kaggle_data/MNCAATourneySeeds.csv')
team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
# Merge from `seeds_all` (loaded just above) rather than the filtered `seeds`
# variable, which other cells rebind with different season cut-offs. This keeps
# the cell independent of execution order and matches what get_team_info does.
team_table = seeds_all.merge(team_spellings, on=['TeamID'])
tourney_team_names = list(team_table.loc[team_table['Season'] == yr, 'TeamNameSpelling'])
tourney_team_names

['oklahoma',
 'syracuse',
 'oklahoma st',
 'oklahoma st.',
 'oklahoma state',
 'oklahoma-st',
 'oklahoma-state',
 'texas',
 'florida',
 'connecticut',
 'uconn',
 'maryland',
 'michigan st',
 'michigan st.',
 'michigan state',
 'michigan-st',
 'michigan-state',
 'louisiana state',
 'louisiana-state',
 'lsu',
 'purdue',
 'colorado',
 'brigham young',
 'brigham-young',
 'byu',
 'texas southern',
 'texas-southern',
 'tx southern',
 'wisconsin',
 'missouri',
 'oregon',
 'alabama',
 'kansas',
 'illinois',
 'creighton',
 'gonzaga',
 'utah st',
 'utah st.',
 'utah state',
 'utah-st',
 'utah-state',
 'north carolina',
 'north-carolina',
 'unc',
 'texas tech',
 'texas-tech',
 'va commonwealth',
 'vcu',
 'vcu(va. commonwealth',
 'virginia commonwealth',
 'virginia-commonwealth',
 'e washington',
 'eastern wash.',
 'eastern washington',
 'eastern-wash',
 'eastern-washington',
 'liberty',
 'georgia tech',
 'georgia-tech',
 'west virginia',
 'west-virginia',
 'ucla',
 'winthrop',
 'iowa',
 'villanov

In [59]:
def _parse_roster_footer(text_div, default_height=76):
    """Pull the roster summary stats out of a team page's roster-footer text.

    Parameters
    ----------
    text_div : str
        Text content of the ``tfooter_roster`` div (may be empty).
    default_height : int
        Height in inches used when no plausible height can be parsed.

    Returns
    -------
    tuple
        ``(year_avg, returning_min_pct, returning_score_pct, height_avg)``.
        The first three are floats, or ``None`` when the text does not contain
        exactly three one-decimal numbers. ``height_avg`` is always an int.
    """
    # The three stats are read positionally, so they are only trusted when
    # exactly three "digits.digit" numbers are present.
    decimals = re.findall(r'\d+\.\d', text_div)
    if len(decimals) == 3:
        # First value is the avg. yr weighted for time played.
        year_avg, returning_min_pct, returning_score_pct = (float(d) for d in decimals)
    else:
        year_avg = returning_min_pct = returning_score_pct = None

    # Height is written feet-inches. Capture ALL inch digits (so "6-10" is
    # 82 in, not 6-1) and use groups instead of split('-') so a repeated dash
    # cannot produce an empty piece.
    height_avg = default_height
    height_match = re.search(r'(\d+)-+(\d+)', text_div)
    if height_match is not None:
        total_inches = 12 * int(height_match.group(1)) + int(height_match.group(2))
        # Values above 100 in are treated as a false match and replaced by
        # the default.
        if total_inches <= 100:
            height_avg = total_inches
    return year_avg, returning_min_pct, returning_score_pct, height_avg


def get_team_info(yr):
    """Scrape roster summary stats for every tournament team of season ``yr``.

    Reads the Kaggle seed and spelling CSVs to find which school names were
    seeded in ``yr``, walks the sports-reference coaches table for that season,
    and fetches the team page of each school whose URL slug is one of those
    spellings. Prints each matched school slug as progress.

    Parameters
    ----------
    yr : int
        Season to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per matched school with columns ``Season``, ``school``,
        ``returning_min_pct``, ``returning_score_pct``, ``avg_height`` (inches)
        and ``avg_yr``. The three stat columns are missing when the team
        page's roster footer could not be parsed; ``avg_height`` then falls
        back to 76.

    Notes
    -----
    Performs one HTTP request for the season page plus one per matched team,
    so a call is slow.
    """
    # get seeds
    seeds_all = pd.read_csv('data/kaggle_data/MNCAATourneySeeds.csv')
    team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
    team_table = seeds_all.merge(team_spellings, on=['TeamID'])
    # A set gives constant-time membership checks inside the row loop.
    tourney_team_names = set(team_table.loc[team_table['Season'] == yr, 'TeamNameSpelling'])

    link = 'https://www.sports-reference.com/cbb/seasons/' + str(yr) + '-coaches.html'
    with urllib.request.urlopen(link) as url:
        page = url.read()
    soup = BeautifulSoup(page, "html.parser")
    coach_table = soup.find('table', {'id': 'coaches'})
    coach_rows = coach_table.find('tbody').find_all('tr')

    cols = ['Season', 'school', 'returning_min_pct', 'returning_score_pct', 'avg_height', 'avg_yr']
    records = []
    for coach_row in coach_rows:
        tds = coach_row.find_all('td')
        # Rows with 10 or fewer cells are skipped (presumably header/divider
        # rows -- TODO confirm against the page markup).
        if len(tds) <= 10:
            continue
        team_stub = tds[0].find('a')['href']
        # href looks like /cbb/schools/<slug>/<yr>.html, so piece 3 is the slug.
        school = team_stub.split('/')[3]
        if school not in tourney_team_names:
            continue
        team_link = 'https://www.sports-reference.com' + team_stub
        print(school)
        with urllib.request.urlopen(team_link) as url:
            team_page = url.read()
        team_soup = BeautifulSoup(team_page, "html.parser")
        footer = team_soup.find('div', {'id': 'tfooter_roster'})
        # A page without the roster footer gets the same fallbacks as an
        # unparseable footer instead of aborting the whole scrape.
        text_div = footer.text if footer is not None else ''
        year_avg, returning_min_pct, returning_score_pct, height_avg = _parse_roster_footer(text_div)
        records.append({
            'Season': yr,
            'school': school,
            'returning_min_pct': returning_min_pct,
            'returning_score_pct': returning_score_pct,
            'avg_height': height_avg,
            'avg_yr': year_avg,
        })
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame for every row.
    return pd.DataFrame(records, columns=cols)

In [58]:
# Try the scraper on a single season; it makes one HTTP request per matched team, so this is slow.
get_team_info(2004)

      Season  Seed  TeamID   TeamNameSpelling
0       1985   W01    1207         georgetown
1       1986   X04    1207         georgetown
2       1987   X01    1207         georgetown
3       1988   W08    1207         georgetown
4       1989   W01    1207         georgetown
...      ...   ...     ...                ...
6220    2021   X15    1213       grand-canyon
6221    2021   Z16    1216           hartford
6222    2022   Z14    1255           longwood
6223    2022  Z16a    1136             bryant
6224    2022  Z16a    1136  bryant university

[6225 rows x 4 columns]
      Season Seed  TeamID    TeamNameSpelling
44      2004  Z03    1210        georgia tech
45      2004  Z03    1210        georgia-tech
69      2004  W05    1228            illinois
140     2004  X05    1393            syracuse
203     2004  X07    1177              depaul
...      ...  ...     ...                 ...
5828    2004  Y15    1186        e washington
5829    2004  Y15    1186       eastern wash.
5830    2

/cbb/schools/gonzaga/2004.html
gonzaga
gonzaga
/cbb/schools/grambling/2004.html
grambling
/cbb/schools/green-bay/2004.html
green-bay
/cbb/schools/hampton/2004.html
hampton
/cbb/schools/hartford/2004.html
hartford
/cbb/schools/harvard/2004.html
harvard
/cbb/schools/hawaii/2004.html
hawaii
/cbb/schools/high-point/2004.html
high-point
/cbb/schools/hofstra/2004.html
hofstra
/cbb/schools/holy-cross/2004.html
holy-cross
/cbb/schools/houston/2004.html
houston
/cbb/schools/howard/2004.html
howard
/cbb/schools/idaho/2004.html
idaho
/cbb/schools/idaho-state/2004.html
idaho-state
/cbb/schools/illinois/2004.html
illinois
illinois
/cbb/schools/illinois-chicago/2004.html
illinois-chicago
illinois-chicago
/cbb/schools/illinois-state/2004.html
illinois-state
/cbb/schools/indiana/2004.html
indiana
/cbb/schools/indiana-state/2004.html
indiana-state
/cbb/schools/iona/2004.html
iona
/cbb/schools/iowa/2004.html
iowa
/cbb/schools/iowa-state/2004.html
iowa-state
/cbb/schools/ipfw/2004.html
ipfw
/cbb/schools/

/cbb/schools/stephen-f-austin/2004.html
stephen-f-austin
/cbb/schools/stetson/2004.html
stetson
/cbb/schools/stony-brook/2004.html
stony-brook
/cbb/schools/syracuse/2004.html
syracuse
syracuse
/cbb/schools/temple/2004.html
temple
/cbb/schools/tennessee/2004.html
tennessee
/cbb/schools/tennessee-martin/2004.html
tennessee-martin
/cbb/schools/tennessee-state/2004.html
tennessee-state
/cbb/schools/tennessee-tech/2004.html
tennessee-tech
/cbb/schools/texas/2004.html
texas
texas
/cbb/schools/texas-am/2004.html
texas-am
/cbb/schools/texas-am-corpus-christi/2004.html
texas-am-corpus-christi
/cbb/schools/texas-arlington/2004.html
texas-arlington
/cbb/schools/texas-christian/2004.html
texas-christian
/cbb/schools/texas-el-paso/2004.html
texas-el-paso
texas-el-paso
/cbb/schools/texas-pan-american/2004.html
texas-pan-american
/cbb/schools/texas-san-antonio/2004.html
texas-san-antonio
texas-san-antonio
/cbb/schools/texas-southern/2004.html
texas-southern
/cbb/schools/texas-state/2004.html
texas-st

KeyboardInterrupt: 

In [60]:
def sports_ref_join_to_kaggle(coach_df):
    """Attach Kaggle TeamIDs to scraped rows by matching school slug to a spelling.

    ``coach_df`` must have a ``school`` column. Rows whose ``school`` equals a
    ``TeamNameSpelling`` in the Kaggle spellings file are kept (inner join);
    the spelling column itself is dropped from the result.
    """
    spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
    matched = pd.merge(
        spellings,
        coach_df,
        left_on='TeamNameSpelling',
        right_on='school',
    )
    return matched.drop(columns='TeamNameSpelling')

In [61]:
# Scrape every season from 2022 back to 2003 and map each scraped school onto
# its Kaggle TeamID. All seasons go through the same loop (2022 is no longer
# special-cased), and the per-season frames are concatenated once at the end
# rather than re-copying the accumulated frame on every iteration.
season_frames = []
for yr in range(2022, 2002, -1):
    print(yr)
    season_frames.append(sports_ref_join_to_kaggle(get_team_info(yr)))
team_info_joined = pd.concat(season_frames, ignore_index=True)
    

akron
alabama
alabama-birmingham
arizona
arkansas
auburn
baylor
boise-state
bryant
cal-state-fullerton
chattanooga
colgate
colorado-state
connecticut
creighton
davidson
delaware
duke
georgia-state
gonzaga
houston
illinois
indiana
iowa
iowa-state
jacksonville-state
kansas
kentucky
longwood
louisiana-state
louisiana-state
loyola-il
marquette
memphis
miami-fl
michigan
michigan-state
montana-state
murray-state
new-mexico-state
norfolk-state
north-carolina
notre-dame
ohio-state
providence
purdue
richmond
rutgers
saint-marys-ca
saint-peters
san-diego-state
san-francisco
seton-hall
south-dakota-state
southern-california
tennessee
texas
texas-am-corpus-christi
texas-christian
texas-southern
texas-tech
ucla
vermont
villanova
virginia-tech
wisconsin
wright-state
wyoming
yale
2021
abilene-christian
alabama
appalachian-state
arkansas
baylor
brigham-young
california-santa-barbara
clemson
cleveland-state
colgate
colorado
connecticut
creighton
drake
drexel
eastern-washington
florida
florida-state
geo

hampton
illinois
indiana-state
kansas
kansas-state
kentucky
long-island-university
louisville
marquette
memphis
michigan
michigan-state
missouri
morehead-state
nevada-las-vegas
north-carolina
north-carolina-asheville
northern-colorado
notre-dame
oakland
ohio-state
old-dominion
penn-state
pittsburgh
princeton
purdue
richmond
saint-peters
san-diego-state
southern-california
st-johns-ny
syracuse
temple
tennessee
texas
texas-am
texas-san-antonio
ucla
utah-state
vanderbilt
villanova
virginia-commonwealth
washington
west-virginia
wisconsin
wofford
xavier
2010
arkansas-pine-bluff
baylor
brigham-young
butler
california
california-santa-barbara
clemson
cornell
duke
east-tennessee-state
florida
florida-state
georgetown
georgia-tech
gonzaga
houston
kansas
kansas-state
kentucky
lehigh
louisville
marquette
maryland
michigan-state
minnesota
missouri
montana
morgan-state
murray-state
nevada-las-vegas
new-mexico
new-mexico-state
north-texas
northern-iowa
notre-dame
oakland
ohio
ohio-state
oklahoma-sta

In [62]:
# Display the combined per-season team features.
team_info_joined

Unnamed: 0,TeamID,Season,school,returning_min_pct,returning_score_pct,avg_height,avg_yr
0,1103,2022,akron,60.1,53.1,76,2.2
1,1104,2022,alabama,46.6,49.3,77,1.3
2,1412,2022,alabama-birmingham,60.0,63.2,77,3.1
3,1112,2022,arizona,48.3,45.9,80,1.5
4,1116,2022,arkansas,37.5,37.4,77,2.5
...,...,...,...,...,...,...,...
1269,1448,2003,wake-forest,39.3,37.3,76,1.0
1270,1451,2003,weber-state,83.6,82.6,77,2.0
1271,1443,2003,western-kentucky,71.8,77.7,77,1.7
1272,1458,2003,wisconsin,66.5,72.1,77,1.2


In [63]:
# Non-null count per column for each season. Lower counts in the returning_*/avg_yr
# columns mark teams where get_team_info did not find exactly three decimal stats.
team_info_joined.groupby('Season').count()

Unnamed: 0_level_0,TeamID,school,returning_min_pct,returning_score_pct,avg_height,avg_yr
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2003,65,65,59,59,65,59
2004,66,66,65,65,66,65
2005,65,65,65,65,65,65
2006,65,65,65,65,65,65
2007,65,65,63,63,65,63
2008,66,66,65,65,66,65
2009,65,65,63,63,65,63
2010,65,65,65,65,65,65
2011,68,68,68,68,68,68
2012,69,69,69,69,69,69


In [64]:
# Persist the combined features; index=False keeps the row index out of the CSV.
team_info_joined.to_csv('data/generated_data/additional_team_features.csv', index=False)

In [35]:
def check_for_missing_spellings(df, joined):
    """Return the rows of ``df`` whose TeamID has no counterpart in ``joined``.

    ``df`` is left-joined to ``joined`` on ``TeamID`` only, and the rows where
    the joined ``school`` value is null are returned. ``joined`` must therefore
    carry ``TeamID`` and ``school`` columns.
    """
    merged = pd.merge(df, joined, how='left', on='TeamID')
    unmatched = merged['school'].isnull()
    return merged.loc[unmatched]

In [36]:
# Rebind `seeds` to seasons 2009 and later (note: this replaces the earlier 2003+ filter).
seeds = seeds_all.loc[seeds_all['Season'] >= 2009]

In [37]:
# Left-join on TeamID only, so each seed row pairs with every scraped season for that
# team (hence the Season_x / Season_y columns in the result).
seeds.merge(team_info_joined, on='TeamID', how='left')

Unnamed: 0,Season_x,Seed,TeamID,Season_y,school,returning_min_pct,returning_score_pct,avg_height,avg_yr
0,2009,W01,1338,2016,pittsburgh,73.0,76.5,79,2.1
1,2009,W01,1338,2014,pittsburgh,50.7,48.8,78,1.8
2,2009,W01,1338,2013,pittsburgh,57.6,56.3,78,1.9
3,2009,W01,1338,2011,pittsburgh,84.3,86.0,77,2.3
4,2009,W01,1338,2010,pittsburgh,42.5,33.4,76,1.4
...,...,...,...,...,...,...,...,...,...
5460,2022,Z15,1174,2022,delaware,100.0,100.0,77,2.4
5461,2022,Z15,1174,2014,delaware,62.4,67.1,77,1.7
5462,2022,Z16a,1136,2022,bryant,75.4,76.8,77,2.4
5463,2022,Z16b,1460,2022,wright-state,73.2,68.5,77,1.8


In [38]:
# Rows of `seeds` whose TeamID matched no scraped row; an empty result means every
# seeded TeamID was matched in at least one season.
check_for_missing_spellings(seeds, team_info_joined)

Unnamed: 0,Season_x,Seed,TeamID,Season_y,school,returning_min_pct,returning_score_pct,avg_height,avg_yr
