In [19]:
from bs4 import BeautifulSoup
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandasql import sqldf 
import helper_functions as hf

In [3]:
# Tournament year to scrape; it is interpolated into the ESPN "Who Picked Whom" URL.
yr = 2022
link = f'https://fantasy.espn.com/tournament-challenge-bracket/{yr}/en/whopickedwhom'

In [28]:
# Download the "Who Picked Whom" page and parse its HTML.
with urllib.request.urlopen(link) as url:
    page = url.read()
soup = BeautifulSoup(page, "html.parser")
wpw_table = soup.find('table', {'class':'wpw-table'})
if wpw_table is None:
    # Fail with an explicit message instead of an AttributeError on None below.
    raise ValueError(f"No <table class='wpw-table'> found at {link}")

cols = ['school', 'pct']
trs = wpw_table.find_all('tr')

# Collect plain (school, pct) records and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration.
records = []
for tr in trs[1:]:  # the first row is skipped (presumably the header row)
    for td in tr.find_all('td'):
        # Each cell carries a team name span and a percentage span such as '97.0%'.
        school = td.find('span', {'class':'teamName'}).text.lower()
        pct = float(td.find('span', {'class':'percentage'}).text.rstrip('%'))
        records.append((school, pct))
team_pcts = pd.DataFrame(records, columns=cols)

In [29]:
# Eyeball the scraped school/pct rows (pandas truncates the display of long frames).
team_pcts

Unnamed: 0,school,pct
0,arizona,97.0
1,gonzaga,90.4
2,gonzaga,78.5
3,gonzaga,58.2
4,gonzaga,41.6
...,...,...
379,wrst/bry,1.4
380,montana state,0.7
381,csu fullerton,0.3
382,csu fullerton,0.1


In [34]:
# Derive a round number for every row by ranking each school's percentages
# from highest to lowest: the largest pct gets rd = 1, the next rd = 2, etc.
# NOTE(review): this presumes a school's pick percentage never increases in
# later rounds, so rank order equals round order -- confirm.
q = '''
SELECT
    school, 
    pct AS pct_win_rd,
    ROW_NUMBER() OVER(PARTITION BY school ORDER BY pct DESC) rd
FROM team_pcts
'''
# pandasql runs the query against the `team_pcts` DataFrame named in the FROM clause.
team_pct_by_rd = sqldf(q)
team_pct_by_rd

Unnamed: 0,school,pct_win_rd,rd
0,akron,9.1,1
1,akron,3.3,2
2,akron,0.8,3
3,akron,0.3,4
4,akron,0.1,5
...,...,...,...
379,yale,2.4,2
380,yale,0.8,3
381,yale,0.3,4
382,yale,0.1,5


In [44]:
def preseason_join_to_team(scraped_table, spellings_path='data/kaggle_data/MTeamSpellings.csv'):
    """Attach the spellings-file columns to a scraped table by school name.

    Parameters
    ----------
    scraped_table : pandas.DataFrame
        Scraped data; must contain a 'school' column.
    spellings_path : str or path-like, optional
        CSV file with a 'TeamNameSpelling' column (plus whatever other
        columns it carries, e.g. 'TeamID'). Defaults to the location that
        was previously hard-coded, so existing calls behave the same.

    Returns
    -------
    pandas.DataFrame
        Inner join of the spellings file with ``scraped_table`` on
        ``TeamNameSpelling == school``. Scraped rows whose school has no
        matching spelling are absent from the result.
    """
    team_spellings = pd.read_csv(spellings_path)
    return team_spellings.merge(
        scraped_table, left_on='TeamNameSpelling', right_on='school'
    )

In [45]:
def check_for_missing_spellings(df, joined):
    """Return the rows of ``df`` whose school found no spelling match.

    ``df`` is left-merged with ``joined`` on 'school'; rows where the
    merged 'TeamNameSpelling' is missing are returned. Non-key columns
    present in both frames come back with pandas' ``_x`` / ``_y`` suffixes.
    """
    merged = pd.merge(df, joined, how='left', on='school')
    has_no_spelling = merged['TeamNameSpelling'].isna()
    return merged.loc[has_no_spelling]

In [46]:
# Join the per-round percentages to the spellings file, then list any scraped
# schools that found no entry there.
joined = preseason_join_to_team(team_pct_by_rd)
check_for_missing_spellings(team_pct_by_rd, joined)

Unnamed: 0,school,pct_win_rd_x,rd_x,TeamNameSpelling,TeamID,pct_win_rd_y,rd_y
1512,rutg/nd,28.8,1,,,,
1513,rutg/nd,5.6,2,,,,
1514,rutg/nd,1.4,3,,,,
1515,rutg/nd,0.4,4,,,,
1516,rutg/nd,0.2,5,,,,
1517,rutg/nd,0.1,6,,,,
2202,wrst/bry,3.0,1,,,,
2203,wrst/bry,1.4,2,,,,
2204,wrst/bry,0.8,3,,,,
2205,wrst/bry,0.5,4,,,,


In [43]:
# Add scraped spellings that the spellings file lacks, then write the combined
# list back over the same CSV.
team_spellings = pd.read_csv('data/kaggle_data/MTeamSpellings.csv')
unmatched_spellings_lst = [('csu fullerton', 1168), ("j'ville st", 1240)]
unmatched_spellings = pd.DataFrame(
    unmatched_spellings_lst, columns=['TeamNameSpelling', 'TeamID']
)
# Rows already present in the file collapse here, so re-running the cell does
# not keep growing the CSV.
full_spellings = pd.concat(
    [team_spellings, unmatched_spellings], ignore_index=True
).drop_duplicates()
full_spellings.to_csv('data/kaggle_data/MTeamSpellings.csv', index=False)

In [48]:
# Drop the join-key duplicate of 'school'. Rebinding (instead of inplace=True)
# and errors='ignore' keep this cell safe to re-run: the original raised
# KeyError on a second execution because the column was already gone.
joined = joined.drop(columns='TeamNameSpelling', errors='ignore')

In [49]:
# Inspect the joined frame after the spelling column has been removed.
joined

Unnamed: 0,TeamID,school,pct_win_rd,rd
0,1103,akron,9.1,1
1,1103,akron,3.3,2
2,1103,akron,0.8,3
3,1103,akron,0.3,4
4,1103,akron,0.1,5
...,...,...,...,...
367,1240,j'ville st,1.9,2
368,1240,j'ville st,0.9,3
369,1240,j'ville st,0.3,4
370,1240,j'ville st,0.1,5


In [50]:
# Persist the joined table. No index=False is passed, so the DataFrame index
# is written as a leading unnamed column.
joined.to_csv('data/generated_data/who_picked_whom.csv')

In [8]:
# Scratch exploration: collect every <tr> of the scraped table.
trs = wpw_table.find_all('tr')

In [10]:
# Scratch exploration: cells of the second row (index 1), the first row the scraping loop reads.
tds = trs[1].find_all('td')

In [15]:
# Scratch exploration: raw team-name text of the first cell (original casing).
tds[0].find('span', {'class':'teamName'}).text

'Arizona'

In [16]:
# Scratch exploration: raw percentage text of the first cell, including the trailing '%'.
tds[0].find('span', {'class':'percentage'}).text

'97.0%'