## WSA BeautifulSoup Demo Pt. 2

### Imports
For the necessary libraries to be available, make sure you have run in your terminal:
* `pip install requests`
* `pip install bs4`

In [8]:
import requests
from bs4 import BeautifulSoup

### Example 2: 2011-2024 Michigan Football Game Logs
**Website:** https://www.sports-reference.com/cfb/schools/michigan

In [10]:
all_rows = []
start_year = 2011
end_year = 2024

In [13]:
# Here, we pass the year into the URL via string interpolation

# 'table' -> 'tbody' -> 'tr' -> 'td'
def get_rows(year):
    url = requests.get(f'https://www.sports-reference.com/cfb/schools/michigan/{year}/gamelog/')
    soup = BeautifulSoup(url.text, 'html.parser') 
    table = soup.find('div', attrs = {'id': 'div_offense'}).find('table')
    year_rows = table.find('tbody').find_all('tr')
    for row in year_rows:
        all_rows.append(row)

In [15]:
for year in range(start_year, end_year+1):
    get_rows(str(year))

In [21]:
# Example to display format of data in table
ex_game = all_rows[162].find_all('td')
for col in ex_game:
    ind = ex_game.index(col)
    print('index', ind, ' ' * (2-len(str(ind))), ' | ', col.text, ' ' * (10-len(col.text)), ' | ', col)

# column   |  data         |  HTML 

index 0    |  2023-12-02   |  <td class="left" data-stat="date_game"><a href="/cfb/boxscores/2023-12-02-iowa.html">2023-12-02</a></td>
index 1    |  N            |  <td class="left" data-stat="game_location">N</td>
index 2    |  Iowa         |  <td class="left" data-stat="opp_name"><a href="/cfb/schools/iowa/2023.html">Iowa</a></td>
index 3    |  W (26-0)     |  <td class="left" csk="26" data-stat="game_result">W (26-0)</td>
index 4    |  22           |  <td class="right" data-stat="pass_cmp">22</td>
index 5    |  30           |  <td class="right" data-stat="pass_att">30</td>
index 6    |  73.3         |  <td class="right" data-stat="pass_cmp_pct">73.3</td>
index 7    |  147          |  <td class="right" data-stat="pass_yds">147</td>
index 8    |  0            |  <td class="right iz" data-stat="pass_td">0</td>
index 9    |  34           |  <td class="right" data-stat="rush_att">34</td>
index 10   |  66           |  <td class="right" data-stat="rush_yds">66</td>
index 11   |  1.9       

In [23]:
def get_stats(row):
    columns = row.find_all('td')
    date = columns[0].find('a').text
    year = int(date.split('-')[0])

    # To make sure bowl games are counted in correct season
    if date.split('-')[1] == '01':
        year -= 1
    
    location = columns[1].text
    if location == '':
        site = 'Home'
    elif location == '@':
        site = 'Away'
    else:
        # location == 'N'
        site = 'Neutral'
    
    opponent = columns[2].find('a').text
    
    # We are getting multiple data points of interest from this single column, so we must split it
    result_list = columns[3].text.split(' ')
    result = result_list[0]
    points_scored = int(result_list[1].split('-')[0][1:])
    points_against = int(result_list[1].split('-')[1][:-1])
    
    pass_cmp = float(columns[4].text)
    pass_att = int(columns[5].text)
    pass_pct = float(columns[6].text)
    pass_yrds = int(columns[7].text)
    pass_td = int(columns[8].text)
    pass_1st_down = int(columns[16].text)
    
    rush_att = int(columns[9].text)
    rush_yrds = int(columns[10].text)
    rush_td = int(columns[12].text)
    rush_1st_down = int(columns[17].text)
    
    total_offense = int(columns[14].text)
    fumbles = int(columns[22].text)
    ints = int(columns[23].text)
    
    return([date, year, opponent, site, result, points_scored, points_against, pass_cmp, pass_att, pass_pct,
             pass_yrds, pass_td, pass_1st_down, rush_att, rush_yrds, rush_td, rush_1st_down, total_offense, fumbles, ints])

In [25]:
game_stats = []

for row in all_rows:
    game_stats.append(get_stats(row))

In [27]:
for game in game_stats:
    print(game)

['2011-09-03', 2011, 'Western Michigan', 'Home', 'W', 34, 10, 9.0, 13, 69.2, 98, 0, 5, 26, 190, 3, 9, 288, 0, 0]
['2011-09-10', 2011, 'Notre Dame', 'Home', 'W', 35, 31, 11.0, 24, 45.8, 338, 4, 10, 26, 114, 1, 5, 452, 0, 3]
['2011-09-17', 2011, 'Eastern Michigan', 'Home', 'W', 31, 3, 7.0, 18, 38.9, 95, 2, 5, 50, 376, 2, 19, 471, 0, 1]
['2011-09-24', 2011, 'San Diego State', 'Home', 'W', 28, 7, 8.0, 17, 47.1, 93, 0, 3, 45, 320, 4, 14, 413, 2, 2]
['2011-10-01', 2011, 'Minnesota', 'Home', 'W', 58, 0, 18.0, 25, 72.0, 217, 3, 10, 48, 363, 3, 19, 580, 0, 0]
['2011-10-08', 2011, 'Northwestern', 'Away', 'W', 42, 24, 19.0, 28, 67.9, 362, 2, 14, 50, 179, 4, 8, 541, 0, 3]
['2011-10-15', 2011, 'Michigan State', 'Away', 'L', 14, 28, 12.0, 31, 38.7, 168, 1, 6, 36, 82, 1, 8, 250, 0, 1]
['2011-10-29', 2011, 'Purdue', 'Home', 'W', 36, 14, 10.0, 17, 58.8, 196, 0, 7, 53, 339, 4, 17, 535, 0, 2]
['2011-11-05', 2011, 'Iowa', 'Away', 'L', 16, 24, 18.0, 38, 47.4, 196, 2, 10, 37, 127, 0, 10, 323, 1, 1]
['2011-1

### Exercise 2: Michigan Football Roster on ESPN
**Website:** https://www.espn.com/college-football/team/roster/_/id/130/michigan-wolverines

This example scrapes the ESPN website, which sometimes needs a workaround. This involves creating a variable called headers and using it as a parameter in the `requests.get()` function, as shown below. We also access multiple tables from within the same webpage in this example.

We will gather the following player data:
* Name
* Number
* Position
* Height
* Weight
* Year
* Birth City
* Birth State

In [29]:
# The headers variable is a workaround for ESPN (because they are a little finicky with their data)
headers = {'User-Agent': '...'}
url = requests.get('https://www.espn.com/college-football/team/roster/_/id/130/michigan-wolverines', headers = headers)
soup = BeautifulSoup(url.text, 'html.parser')

In [31]:
# By inspecting the HTML, what class attributes can we use to locate the tables?
offense_table = soup.find('div', attrs = {'class': 'REPLACE'}).find('tbody').find_all('tr')
defense_table = soup.find('div', attrs = {'class': 'REPLACE'}).find('tbody').find_all('tr')
special_table = soup.find('div', attrs = {'class': 'REPLACE'}).find('tbody').find_all('tr')

tables = [offense_table, defense_table, special_table]

In [33]:
# Example to display format of data in table

ex_columns = offense_table[3].find_all('td')
for col in ex_columns:
    print('index', ex_columns.index(col), ' | ', col.text, ' ' * (11-len(col.text)), ' | ', col)

# column |  data          |  HTML 

index 0  |                |  <td class="Table__TD--headshot Table__TD"><div class="inline Table__TD--headshot" style="min-width:40px"><a class="AnchorLink" href="https://www.espn.com/college-football/player/_/id/4685495/alex-orji" tabindex="0"><div class="headshot inline-block relative TableHeadshot roster-headshot headshot--sm athlete silo"><figure class="Image aspect-ratio--parent"><div class="RatioFrame aspect-ratio--1x1"></div><div class="Image__Wrapper aspect-ratio--child"></div></figure></div></a></div></td>
index 1  |  Alex Orji10   |  <td class="Table__TD"><div class="inline" style="min-width:140px"><a class="AnchorLink" href="https://www.espn.com/college-football/player/_/id/4685495/alex-orji" tabindex="0">Alex Orji</a><span class="pl2 n10">10</span></div></td>

In [None]:
players = []

# The .find() and .split() functions will be useful here within columns!
def get_player_data(table):
    for row in table:
        columns = row.find_all('td')
        
        # name = 
        # number = 
        # position = 
        
        # height_total = 
        # height_ft = 
        # height_in = 
        # height = 
        
        # weight = 
        # year = 
        # birth_city, birth_state =
        
        players.append([name, number, position, height, weight, year, birth_city, birth_state])

In [None]:
for table in tables:
    get_player_data(table)

for player in players:
    print(player)