In [239]:
import requests as reqs
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as BS
import matplotlib.pyplot as plt

In [240]:
# Download the Oklahoma roster page (sorted by jersey number) and parse it.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# letting the parsing cells run against an error page.
ou_url = 'https://soonersports.com/sports/football/roster?sort=jersey'
ou_response = reqs.get(ou_url, timeout=30)
ou_response.raise_for_status()
# NOTE(review): `soup` is reassigned further down for the Texas roster, so
# re-run this cell before re-running the OU parsing cell.
soup = BS(ou_response.content, 'html.parser')

In [241]:
# Run each CSS query once and derive the per-player columns from the results.
jersey_tags = soup.select('div.s-stamp>span.s-stamp__text')
name_tags = soup.select('div.s-person-details__personal>div>a>span')
# The bio-stats spans are read in strides of four:
# position, class level, height, weight.
bio_tags = soup.select('div.s-person-details__bio-stats>span')
# Every second location item is treated as the "City, State" hometown.
hometown_tags = soup.select('span.s-person-card__content__person__location-item')[1::2]

jerseys = [int(tag.text) for tag in jersey_tags]
names = [tag.text for tag in name_tags]
positions = [tag.text for tag in bio_tags[0::4]]
levels = [tag.text for tag in bio_tags[1::4]]
# Drop the foot/inch tick marks so only the two numbers remain in the string.
heights = [tag.text.strip().replace("'", "") for tag in bio_tags[2::4]]
weights = [int(tag.text.strip().replace(" lbs", "")) for tag in bio_tags[3::4]]

# Split each hometown once, then take city and state from the pieces.
hometown_parts = [tag.text.strip().split(', ') for tag in hometown_tags]
cities = [parts[0] for parts in hometown_parts]
states = [parts[1] for parts in hometown_parts]

In [242]:
# Assemble the Oklahoma roster table: one column per scraped field.
# pandas raises if the lists differ in length, which doubles as a check that
# every selector matched the same number of players.
ou_columns = dict(
    jersey=jerseys,
    name=names,
    position=positions,
    level=levels,
    height=heights,
    weight=weights,
    city=cities,
    state=states,
)
ou = pd.DataFrame(ou_columns)

In [243]:
# Translate the roster page's state abbreviations into full state names.
ou_state_map = {
    # Names that need no expansion map to themselves so that a lookup on
    # them still succeeds.
    **{state: state for state in ('Hawaii', 'Ohio', 'Texas')},
    # Colorado is accepted both with and without the trailing period.
    'Colo': 'Colorado',
    'Colo.': 'Colorado',
    'Calif.': 'California',
    'Conn.': 'Connecticut',
    'Fla.': 'Florida',
    'Ga.': 'Georgia',
    'Ill.': 'Illinois',
    'Ind.': 'Indiana',
    'Kan.': 'Kansas',
    'Ky.': 'Kentucky',
    'Md.': 'Maryland',
    'Mich.': 'Michigan',
    'Mo.': 'Missouri',
    'N.C.': 'North Carolina',
    'N.J.': 'New Jersey',
    'Neb.': 'Nebraska',
    'Nev.': 'Nevada',
    'Okla.': 'Oklahoma',
    'Pa.': 'Pennsylvania',
    'S.C.': 'South Carolina',
    'Tenn.': 'Tennessee',
    'Va.': 'Virginia',
    'Wash.': 'Washington',
}

# Translate class-year abbreviations into readable labels.
_base_levels = {
    'Fr.': 'Freshman',
    'So.': 'Sophomore',
    'Jr.': 'Junior',
    'Sr.': 'Senior',
}
ou_level_map = {
    **_base_levels,
    # Redshirt variants are the base abbreviation prefixed with "R-".
    **{f'R-{abbr}': f'Redshirt {label}' for abbr, label in _base_levels.items()},
    'Red 5th': 'Redshirt 5th Year',
}

In [244]:
# Normalize the scraped Oklahoma columns in place.

# Height: "<feet> <inches>" strings -> total inches. The dtype guard makes the
# cell safe to re-run; without it a second run would call .split() on the
# already-converted integers and raise.
if not pd.api.types.is_numeric_dtype(ou['height']):
    ou['height'] = ou['height'].str.split(' ').apply(
        lambda parts: int(parts[0]) * 12 + int(parts[1])
    )

# Level / state: expand the abbreviations. Series.map yields NaN for any value
# missing from the mapping, so fall back to the original value. That keeps an
# unexpected abbreviation visible instead of silently dropping it, and leaves
# already-expanded values untouched when the cell is run again.
ou['level'] = ou['level'].map(ou_level_map).fillna(ou['level'])
ou['state'] = ou['state'].map(ou_state_map).fillna(ou['state'])

In [245]:
# Preview the first five rows to sanity-check the normalized columns.
ou.head(n=5)

Unnamed: 0,jersey,name,position,level,height,weight,city,state
0,0,Kalib Hicks,RB,Freshman,71,199,Denton,Texas
1,0,Derrick LeBlanc,DL,Freshman,77,278,Kissimmee,Florida
2,1,Jayden Gibson,WR,Sophomore,77,189,Winter Garden,Florida
3,1,Dasan McCullough,LB,Sophomore,77,222,Bloomington,Indiana
4,2,Jovantae Barnes,RB,Sophomore,72,206,Las Vegas,Nevada


In [246]:
# How many players are from Texas compared to Oklahoma?
ou_roster_count = len(ou)
# Summing the boolean mask counts the matching rows.
tx_ou_player_count = int((ou['state'] == 'Texas').sum())
ok_ou_player_count = int((ou['state'] == 'Oklahoma').sum())

# The "%" format spec multiplies by 100 and appends the percent sign.
print(f"Texas Roster Pct: {tx_ou_player_count / ou_roster_count:.0%}")
print(f"Okla. Roster Pct: {ok_ou_player_count / ou_roster_count:.0%}")

Texas Roster Pct: 31%
Okla. Roster Pct: 17%


In [247]:
# Five most common home states on the Oklahoma roster.
ou['state'].value_counts().head(5)

state
Texas             31
Oklahoma          17
Florida            6
Nevada             4
North Carolina     4
Name: count, dtype: int64

In [248]:
# Download the Texas roster page and parse it.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# letting the parsing cells run against an error page.
ut_url = 'https://texassports.com/sports/football/roster'
ut_response = reqs.get(ut_url, timeout=30)
ut_response.raise_for_status()
# NOTE(review): this rebinds `soup`, which previously held the Oklahoma page.
soup = BS(ut_response.content, 'html.parser')

In [249]:
# Jersey numbers and names are used unsliced, so the jersey count defines how
# many rows the roster table has.
jerseys = [int(tag.text.strip()) for tag in soup.select('span.sidearm-roster-player-jersey-number')]
names = [tag.text.strip().split('\n')[-1] for tag in soup.select('div.sidearm-roster-player-name')]
roster_size = len(jerseys)

# The remaining fields are read with a stride of two (presumably the page
# renders each value twice -- TODO confirm against the page markup).
position_tags = soup.select('span.sidearm-roster-player-position-long-short')[0::2]
height_tags = soup.select('span.sidearm-roster-player-height')[::2]
weight_tags = soup.select('span.sidearm-roster-player-weight')[::2]

# Academic year and hometown yield more matches than there are jerseys, so
# they are trimmed to the roster size. Deriving the cut-off from the jersey
# count (instead of a hard-coded 99) keeps the columns aligned when the roster
# changes size. Trimming before parsing also means entries past the roster
# are never split, so one without a ", " separator cannot raise.
level_tags = soup.select('div>div>span.sidearm-roster-player-academic-year')[1::2][:roster_size]
hometown_tags = soup.select('span.sidearm-roster-player-hometown')[::2][:roster_size]

positions = [tag.text.strip() for tag in position_tags]
levels = [tag.text for tag in level_tags]
# Turn e.g. 6'3" into "6 3" so feet and inches can be split on the space.
heights = [tag.text.replace('"', '').replace("'", " ") for tag in height_tags]
weights = [int(tag.text.replace(' lbs', '')) for tag in weight_tags]

# Split each "City, State" hometown once, then take both pieces from it.
hometown_parts = [tag.text.split(', ') for tag in hometown_tags]
cities = [parts[0] for parts in hometown_parts]
states = [parts[1] for parts in hometown_parts]

In [250]:
# Translate the Texas roster's state abbreviations into full state names.
ut_state_map = {
    # Names that need no expansion map to themselves so that a lookup on
    # them still succeeds.
    **{state: state for state in ('Hawaii', 'Texas')},
    'Ala.': 'Alabama',
    'Ariz.': 'Arizona',
    'Calif.': 'California',
    'Colo.': 'Colorado',
    'D.C.': 'District of Columbia',
    'Fla.': 'Florida',
    'La.': 'Louisiana',
    'Miss.': 'Mississippi',
    'N.J.': 'New Jersey',
    'Wash.': 'Washington',
}

In [251]:
# Assemble the Texas roster table: one column per scraped field.
# pandas raises if the lists differ in length, which doubles as a check that
# the slicing in the parsing cell left every column the same size.
ut_columns = dict(
    jersey=jerseys,
    name=names,
    position=positions,
    level=levels,
    height=heights,
    weight=weights,
    city=cities,
    state=states,
)
ut = pd.DataFrame(ut_columns)

In [252]:
# Preview the first five rows as scraped, before heights and states are normalized.
ut.head(n=5)

Unnamed: 0,jersey,name,position,level,height,weight,city,state
0,0,Anthony Hill Jr.,LB,Freshman,6 3,229,Denton,Texas
1,0,Ja'Tavion Sanders,TE,Junior,6 2,250,Denton,Texas
2,1,Justice Finkley,EDGE,Sophomore,6 0,184,Trussville,Ala.
3,1,Xavier Worthy,WR,Junior,6 0,199,Fresno,Calif.
4,2,Johntay Cook II,WR,Freshman,6 0,202,DeSoto,Texas


In [253]:
# Normalize the scraped Texas columns in place.

# Height: "<feet> <inches>" strings -> total inches. The dtype guard makes the
# cell safe to re-run; without it a second run would call .split() on the
# already-converted integers and raise.
if not pd.api.types.is_numeric_dtype(ut['height']):
    ut['height'] = ut['height'].str.split(' ').apply(
        lambda parts: int(parts[0]) * 12 + int(parts[1])
    )

# State: expand the abbreviations. Series.map yields NaN for any value missing
# from the mapping, so fall back to the original value. That keeps an
# unexpected abbreviation visible instead of silently dropping it, and leaves
# already-expanded values untouched when the cell is run again.
ut['state'] = ut['state'].map(ut_state_map).fillna(ut['state'])

In [254]:
# Preview the first five rows again to confirm heights are in inches and states are expanded.
ut.head(n=5)

Unnamed: 0,jersey,name,position,level,height,weight,city,state
0,0,Anthony Hill Jr.,LB,Freshman,75,229,Denton,Texas
1,0,Ja'Tavion Sanders,TE,Junior,74,250,Denton,Texas
2,1,Justice Finkley,EDGE,Sophomore,72,184,Trussville,Alabama
3,1,Xavier Worthy,WR,Junior,72,199,Fresno,California
4,2,Johntay Cook II,WR,Freshman,72,202,DeSoto,Texas


In [255]:
# Share of the Texas roster whose home state is Texas.
ut_roster_count = len(ut)
# Summing the boolean mask counts the matching rows.
tx_ut_player_count = int((ut['state'] == 'Texas').sum())

# The "%" format spec multiplies by 100 and appends the percent sign.
print(f"Texas Roster Pct: {tx_ut_player_count / ut_roster_count:.0%}")

Texas Roster Pct: 75%
