In [8]:
from bs4 import BeautifulSoup
import urllib.request
import numpy as np
import pandas as pd
import helper_functions as hf

## Add some team spellings to MTeamSpellings.csv manually

This is needed for joining our scraped data (sports-reference) with our provided data (kaggle). Think the best/quickest way to do this is just manually right now.

"Spellings" I'm manually adding to the MTeamSpellings.csv (which didn't have matches with the sports-reference dataset):
- california-baptist (1465)
- dixie-state (1469)
- north-alabama (1466)
- st-thomas-mn (1472)
- tarleton-state (1460)
- california-san-diego (1471)

In [10]:
unmatched_spellings_lst = [('california-baptist', 1465), ('dixie-state', 1469), ('north-alabama', 1466),
                           ('st-thomas-mn', 1472), ('tarleton-state', 1460), ('california-san-diego', 1471)]
hf.add_team_spellings(unmatched_spellings_lst)

AttributeError: module 'helper_functions' has no attribute 'add_team_spellings'

In [20]:
team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')

In [21]:
unmatched_spellings_lst = [('california-baptist', 1465), ('dixie-state', 1469), ('north-alabama', 1466),
                           ('st-thomas-mn', 1472), ('tarleton-state', 1460), ('california-san-diego', 1471)]
unmatched_spellings = pd.DataFrame({
    'TeamNameSpelling': [spelling for spelling, _ in unmatched_spellings_lst],
    'TeamID': [teamid for _, teamid in unmatched_spellings_lst]
})
unmatched_spellings

Unnamed: 0,TeamNameSpelling,TeamID
0,california-baptist,1465
1,dixie-state,1469
2,north-alabama,1466
3,st-thomas-mn,1472
4,tarleton-state,1460
5,california-san-diego,1471


In [22]:
full_spellings = pd.concat([team_spellings, unmatched_spellings], ignore_index=True)
full_spellings.drop_duplicates(inplace=True)
full_spellings.to_csv('data/kaggle_data/MTeamSpellings.csv', index=False)

In [23]:
team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
team_spellings

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,1394
1,a&m-corpus christi,1394
2,abilene chr,1101
3,abilene christian,1101
4,abilene-christian,1101
...,...,...
1158,dixie-state,1469
1159,north-alabama,1466
1160,st-thomas-mn,1472
1161,tarleton-state,1460


## Scrape coaching data from sports-reference

In [24]:
def get_coach_df(yr):
    link = 'https://www.sports-reference.com/cbb/seasons/' + str(yr) + '-coaches.html'
    with urllib.request.urlopen(link) as url:
        page = url.read()
    soup = BeautifulSoup(page, "html.parser")
    coach_table = soup.find('table', {'id': 'coaches'})
    coach_tbody = coach_table.find('tbody')
    coach_rows = coach_tbody.find_all('tr')
    cols = ['Season','school','is_ap_pre_top_5', 'is_ap_pre_top_15', 'is_ap_pre_top_25','coach_WL_car','tourneys_car','sw16_car','ff_car','champ_car']
    coach_df = pd.DataFrame(columns=cols)
    for coach_row in coach_rows:
        tds = coach_row.find_all('td')
        if len(tds) > 10:
            school = tds[0].find('a')['href'].split('/')[3]
            is_ap_pre_top_5 = False if tds[6].text == '' else int(tds[6].text) <= 5
            is_ap_pre_top_15 = False if tds[6].text == '' else 5 < int(tds[6].text) <= 15
            is_ap_pre_top_25 = False if tds[6].text == '' else 15 < int(tds[6].text) <= 25
            # arbitrarily choosing 0.300 winning % if the coach for some reason has no winning % listed (probably a bad sign)
            coach_WL_car = 0.3 if tds[21].text == '' else float(tds[21].text)
            tourneys_car = 0 if tds[22].text == '' else int(tds[22].text)
            sw16_car = 0 if tds[23].text == '' else int(tds[23].text)
            ff_car = 0 if tds[24].text == '' else int(tds[24].text)
            champ_car = 0 if tds[25].text == '' else int(tds[25].text)
            new_row = pd.Series([yr, school, is_ap_pre_top_5, is_ap_pre_top_15, is_ap_pre_top_25, coach_WL_car, tourneys_car, sw16_car, ff_car, champ_car], index=cols)
            coach_df = coach_df.append(new_row, ignore_index=True)
    return(coach_df)

In [25]:
def sports_ref_join_to_kaggle(coach_df):
    team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
    coach_joined = team_spellings.merge(coach_df, left_on='TeamNameSpelling', right_on='school')
    coach_joined.drop('TeamNameSpelling', axis=1, inplace=True)
    return(coach_joined)

In [26]:
def check_for_missing_spellings(coach_df, coach_joined):
    coach_comp = coach_df.merige(coach_joined, on='school', how='left')
    return(coach_comp[coach_comp['TeamNameSpelling'].isna()])

## Compiling dataset

In [28]:
coach_df = get_coach_df(2003)
coach_joined = sports_ref_join_to_kaggle(coach_df)
for yr in range(2004,2023):
    print(yr)
    new_coach_df = get_coach_df(yr)
    new_coach_joined = sports_ref_join_to_kaggle(new_coach_df)
    coach_joined = pd.concat([coach_joined, new_coach_joined], ignore_index=True)

2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [29]:
coach_joined.head()

Unnamed: 0,TeamID,Season,school,is_ap_pre_top_5,is_ap_pre_top_15,is_ap_pre_top_25,coach_WL_car,tourneys_car,sw16_car,ff_car,champ_car
0,1102,2003,air-force,False,False,False,0.341,0,0,0,0
1,1103,2003,akron,False,False,False,0.462,0,0,0,0
2,1104,2003,alabama,False,True,False,0.66,4,0,0,0
3,1105,2003,alabama-am,False,False,False,0.518,0,0,0,0
4,1412,2003,alabama-birmingham,False,False,False,0.618,0,0,0,0


In [30]:
coach_joined.to_csv('data/generated_data/coach_features.csv', index=False)