In [78]:
from bs4 import BeautifulSoup
import urllib.request
import numpy as np
import pandas as pd
import re
import helper_functions as hf

## Get tournament teams

Because this script takes a while to run, I only want to scrape for teams that were in the tournament in a given year (our training set).

In [101]:
seeds_all = pd.read_csv('data/kaggle_data/MNCAATourneySeeds.csv')
seeds = seeds_all[seeds_all['Season'] >= 2003]

In [102]:
seeds.head()

Unnamed: 0,Season,Seed,TeamID
1154,2003,W01,1328
1155,2003,W02,1448
1156,2003,W03,1393
1157,2003,W04,1257
1158,2003,W05,1280


In [103]:
team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')

In [106]:
team_table = seeds.merge(team_spellings, on=['TeamID'])

In [111]:
team_names = team_table[team_table['Season'] == 2021]['TeamNameSpelling']

In [116]:
'oklahoma' in list(team_names)

True

In [120]:
yr = 2021
seeds_all = pd.read_csv('data/kaggle_data/MNCAATourneySeeds.csv')
team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
team_table = seeds.merge(team_spellings, on=['TeamID'])
tourney_team_names = list(team_table[team_table['Season'] == yr]['TeamNameSpelling'])
tourney_team_names

['oklahoma',
 'syracuse',
 'oklahoma st',
 'oklahoma st.',
 'oklahoma state',
 'oklahoma-st',
 'oklahoma-state',
 'texas',
 'florida',
 'connecticut',
 'uconn',
 'maryland',
 'michigan st',
 'michigan st.',
 'michigan state',
 'michigan-st',
 'michigan-state',
 'louisiana state',
 'louisiana-state',
 'lsu',
 'purdue',
 'colorado',
 'brigham young',
 'brigham-young',
 'byu',
 'texas southern',
 'texas-southern',
 'tx southern',
 'wisconsin',
 'missouri',
 'oregon',
 'alabama',
 'kansas',
 'illinois',
 'creighton',
 'gonzaga',
 'utah st',
 'utah st.',
 'utah state',
 'utah-st',
 'utah-state',
 'north carolina',
 'north-carolina',
 'unc',
 'texas tech',
 'texas-tech',
 'va commonwealth',
 'vcu',
 'vcu(va. commonwealth',
 'virginia commonwealth',
 'virginia-commonwealth',
 'e washington',
 'eastern wash.',
 'eastern washington',
 'eastern-wash',
 'eastern-washington',
 'liberty',
 'georgia tech',
 'georgia-tech',
 'west virginia',
 'west-virginia',
 'ucla',
 'winthrop',
 'iowa',
 'villanov

In [139]:
def get_team_info(yr):
    # get seeds
    seeds_all = pd.read_csv('data/kaggle_data/MNCAATourneySeeds.csv')
    team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
    team_table = seeds.merge(team_spellings, on=['TeamID'])
    tourney_team_names = list(team_table[team_table['Season'] == yr]['TeamNameSpelling'])
    
    link = 'https://www.sports-reference.com/cbb/seasons/' + str(yr) + '-coaches.html'
    with urllib.request.urlopen(link) as url:
        page = url.read()
    soup = BeautifulSoup(page, "html.parser")
    coach_table = soup.find('table', {'id': 'coaches'})
    coach_tbody = coach_table.find('tbody')
    coach_rows = coach_tbody.find_all('tr')
    cols = ['Season','school','returning_min_pct', 'returning_score_pct', 'avg_height', 'avg_yr']
    team_info_df = pd.DataFrame(columns=cols)
    for coach_row in coach_rows:
        tds = coach_row.find_all('td')
        if len(tds) > 10:
            team_stub = tds[0].find('a')['href']
            school = team_stub.split('/')[3]
            if school not in tourney_team_names:
                continue
            team_link = 'https://www.sports-reference.com' + team_stub
            print(school)
            with urllib.request.urlopen(team_link) as url:
                page = url.read()
            team_soup = BeautifulSoup(page, "html.parser")
            text_div = team_soup.find('div', {'id':'tfooter_roster'}).text
            decimals = re.findall('\d+\.\d', text_div)
            if len(decimals) == 3:
                # avg. yr weighted for time played
                year_avg = float(decimals[0])
                returning_min_pct = float(decimals[1])
                returning_score_pct = float(decimals[2])
            else:
                year_avg, returning_min_pct, returning_score_pct = None, None, None
            # avg height
            height_str = re.search('\d+\-+\d', text_div)
            height_ints = height_str.group().split('-')
            height_avg = 12*int(height_ints[0]) + int(height_ints[1])
            if height_avg > 100:
                height_avg = 76
            new_row = pd.Series([yr, school, returning_min_pct, returning_score_pct, height_avg, year_avg], index=cols)
            team_info_df = team_info_df.append(new_row, ignore_index=True)
    return(team_info_df)

In [122]:
def sports_ref_join_to_kaggle(coach_df):
    team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
    coach_joined = team_spellings.merge(coach_df, left_on='TeamNameSpelling', right_on='school')
    coach_joined.drop('TeamNameSpelling', axis=1, inplace=True)
    return(coach_joined)

In [124]:
team_info_df = get_team_info(2021)
team_info_joined = sports_ref_join_to_kaggle(team_info_df)
for yr in range(2020,2002,-1):
    print(yr)
    new_team_info_df = get_team_info(yr)
    new_team_info_joined = sports_ref_join_to_kaggle(new_team_info_df)
    team_info_joined = pd.concat([team_info_joined, new_team_info_joined], ignore_index=True)
    

abilene-christian
alabama
appalachian-state
arkansas
baylor
brigham-young
california-santa-barbara
clemson
cleveland-state
colgate
colorado
connecticut
creighton
drake
drexel
eastern-washington
florida
florida-state
georgetown
georgia-tech
gonzaga
grand-canyon
hartford
houston
illinois
iona
iowa
kansas
liberty
louisiana-state
loyola-il
maryland
michigan
michigan-state
missouri
morehead-state
mount-st-marys
norfolk-state
north-carolina
north-carolina-greensboro
north-texas
ohio
ohio-state
oklahoma
oklahoma-state
oral-roberts
oregon
oregon-state
purdue
rutgers
san-diego-state
southern-california
st-bonaventure
syracuse
tennessee
texas
texas-southern
texas-tech
ucla
utah-state
villanova
virginia
virginia-commonwealth
virginia-tech
west-virginia
wichita-state
winthrop
wisconsin
2020
2019
abilene-christian
arizona-state
auburn
baylor
belmont
bradley
buffalo
california-irvine
central-florida
cincinnati
colgate
duke
fairleigh-dickinson
florida
florida-state
gardner-webb
georgia-state
gonzaga


lehigh
louisville
marquette
maryland
michigan-state
minnesota
missouri
montana
morgan-state
murray-state
nevada-las-vegas
new-mexico
new-mexico-state
north-texas
northern-iowa
notre-dame
oakland
ohio
ohio-state
oklahoma-state
old-dominion
pittsburgh
purdue
richmond
robert-morris
saint-marys-ca
sam-houston-state
san-diego-state
siena
syracuse
temple
tennessee
texas
texas-am
texas-el-paso
utah-state
vanderbilt
vermont
villanova
wake-forest
washington
west-virginia
winthrop
wisconsin
wofford
xavier
2009
akron
alabama-state
american
arizona
arizona-state
binghamton
boston-college
brigham-young
butler
cal-state-northridge
california
chattanooga
clemson
cleveland-state
connecticut
cornell
dayton
duke
east-tennessee-state
florida-state
gonzaga
illinois
kansas
louisiana-state
louisville
marquette
maryland
memphis
michigan
michigan-state
minnesota
mississippi-state
missouri
morehead-state
morgan-state
north-carolina
north-dakota-state
northern-iowa
ohio-state
oklahoma
oklahoma-state
pittsburgh


AttributeError: 'NoneType' object has no attribute 'group'

In [125]:
team_info_joined

Unnamed: 0,TeamID,Season,school,returning_min_pct,returning_score_pct,avg_height,avg_yr
0,1101,2021,abilene-christian,72.7,74.3,75,1.6
1,1104,2021,alabama,52.7,54.2,78,1.6
2,1111,2021,appalachian-state,50.0,54.9,76,1.5
3,1116,2021,arkansas,18.3,15.3,78,1.7
4,1124,2021,baylor,67.8,71.7,77,2.0
...,...,...,...,...,...,...,...
808,1449,2009,washington,68.9,71.9,76,1.4
809,1452,2009,west-virginia,58.8,59.2,78,1.2
810,1443,2009,western-kentucky,50.4,39.4,77,1.5
811,1458,2009,wisconsin,65.8,61.9,77,1.9


In [142]:
team_info_joined.loc[team_info_joined.avg_height > 100, 'avg_height'] = 76

In [144]:
team_info_joined.to_csv('data/generated_data/additional_team_features.csv', index=False)

In [134]:
def check_for_missing_spellings(df, joined):
    comp = df.merge(joined, on='TeamID', how='left')
    return(comp[comp['school'].isna()])

In [131]:
seeds = seeds_all[seeds_all['Season'] >= 2009]

In [135]:
seeds.merge(team_info_joined, on='TeamID', how='left')

Unnamed: 0,Season_x,Seed,TeamID,Season_y,school,returning_min_pct,returning_score_pct,avg_height,avg_yr
0,2009,W01,1338,2016,pittsburgh,73.0,76.5,79,2.1
1,2009,W01,1338,2014,pittsburgh,50.7,48.8,78,1.8
2,2009,W01,1338,2013,pittsburgh,57.6,56.3,78,1.9
3,2009,W01,1338,2011,pittsburgh,84.3,86.0,77,2.3
4,2009,W01,1338,2010,pittsburgh,42.5,33.4,76,1.4
...,...,...,...,...,...,...,...,...,...
4735,2021,Z13,1317,2010,north-texas,66.5,68.1,77,1.5
4736,2021,Z14,1159,2021,colgate,68.6,65.6,76,1.8
4737,2021,Z14,1159,2019,colgate,61.5,60.5,78,1.6
4738,2021,Z15,1331,2021,oral-roberts,40.7,44.2,24230,0.9


In [136]:
check_for_missing_spellings(seeds, team_info_joined)

Unnamed: 0,Season_x,Seed,TeamID,Season_y,school,returning_min_pct,returning_score_pct,avg_height,avg_yr
