# Scraping weekly PFR data


## Process
1. Hit the weekly URL Tuesday morning at 1 am: `https://www.pro-football-reference.com/years/{year}/week_{week number}.htm`
2. Turn the content of the weekly page into some beautiful soup
3. Find the list of all games that week
4. For all games in the week, hit the `Final` URL: `https://www.pro-football-reference.com/boxscores/{id}.htm`
5. Turn the game content into some beautiful soup
6. Parse the `Passing, Rushing, & Receiving` table
7. Parse the `Advanced passing` table
8. Parse the `Advanced rushing` table
9. Parse the `Advanced receiving` table
10. Parse the `Snap Counts` table for all offensive player snaps > 0
11. For each player, save to DB

In [170]:
import requests
from bs4 import BeautifulSoup, Comment
import json
import hashlib

### Hitting weekly URL, getting list of games, pulling link for each game

In [171]:
res = requests.get('https://www.pro-football-reference.com/years/2021/week_1.htm')
if res.status_code == 200:
    page = BeautifulSoup(res.content, 'html.parser')
    print(page)


<!DOCTYPE html>

<html class="no-js" data-root="/home/pfr/build" data-version="klecko-" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
<link href="https://d2p3bygnnzw9w3.cloudfront.net/req/202208193" rel="dns-prefetch"/>
<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
<script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://cmp.quantcast.com'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, 
		    '/choice.js?tag_version=V2');
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function makeStub() {
	    var TCF_LOCATOR_N

In [172]:
games = page.find('div', { "class": "game_summaries"})
gamesList = games.find_all('table', { 'class': 'teams'})
for game in gamesList:
    linkTD = game.find('td', { 'class': 'gamelink' })
    link = linkTD.find('a')['href']
    print(link)

/boxscores/202109090tam.htm
/boxscores/202109120atl.htm
/boxscores/202109120buf.htm
/boxscores/202109120car.htm
/boxscores/202109120cin.htm
/boxscores/202109120clt.htm
/boxscores/202109120det.htm
/boxscores/202109120htx.htm
/boxscores/202109120oti.htm
/boxscores/202109120was.htm
/boxscores/202109120kan.htm
/boxscores/202109120nor.htm
/boxscores/202109120nwe.htm
/boxscores/202109120nyg.htm
/boxscores/202109120ram.htm
/boxscores/202109130rai.htm


### Parsing information from box score

In [173]:
statPage = requests.get('https://www.pro-football-reference.com/boxscores/202109090tam.htm')
if statPage.status_code == 200:
    statPageSoup = BeautifulSoup(statPage.content, 'html.parser')

In [174]:
statTables = statPageSoup.find_all('div', { 'class': 'table_wrapper'} )
print(len(statTables))

20


In [175]:
for table in statTables:
    print(table['id'])

all_scoring
all_game_info
all_officials
all_expected_points
all_team_stats
all_player_offense
all_player_defense
all_returns
all_kicking
all_passing_advanced
all_rushing_advanced
all_receiving_advanced
all_defense_advanced
all_home_starters
all_vis_starters
all_home_snap_counts
all_vis_snap_counts
all_home_drives
all_vis_drives
all_pbp


#### Parsing basic offense data

In [176]:
offenseDiv = None
for table in statTables:
    tableId = table['id']
    if tableId == 'all_player_offense':
        offenseDiv = table
print(offenseDiv)

<div class="table_wrapper" id="all_player_offense">
<div class="section_heading assoc_player_offense" id="player_offense_sh">
<span class="section_anchor" data-label="Passing, Rushing, &amp; Receiving" id="player_offense_link"></span><h2>Passing, Rushing, &amp; Receiving</h2> <div class="section_heading_text">
<ul>
</ul>
</div>
</div>
<div class="table_container" id="div_player_offense">
<table class="sortable stats_table" data-cols-to-freeze=",1" id="player_offense">
<caption>Passing, Rushing, &amp; Receiving Table</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr class="over_header">
<th aria-label="" class="over_header center" colspan="2" data-stat=""></th>
<th aria-label="" class="over_header center" colspan="9" data-stat="header_pass">Passing</th>
<th aria-label="" class="over_header center" colspan="4" data-stat="header_rush">Rushing</th>
<th aria-label=""

In [177]:
offenseTable = offenseDiv.find('table')
oTableHead = offenseTable.find('thead')
oTableBody = offenseTable.find('tbody')

overheader = oTableHead.find_all('tr')[0].find_all('th')
colCounts = []
for col in overheader:
    colCounts.append(int(col['colspan']))
numInfo = colCounts[0]
numPassing = sum(colCounts[:2])
numRushing = sum(colCounts[:3])
numReceiving = sum(colCounts[:4])
numFumbles = sum(colCounts)

stats = oTableHead.find_all('tr')[-1].find_all('th')
statKeys = []
for th in stats:
    statKeys.append(th.text.lower())


In [178]:
bodyRows = oTableBody.find_all('tr')
offenseList = []
for row in bodyRows:
    if not row.has_attr('class'):
        cells = row.find_all(recursive=False)
        player = {
            'info': {},
            'passing': {},
            'rushing': {},
            'receiving': {},
            'fumbles': {}
        }
        count = 0
        for stat, cell in zip(statKeys, cells):
            value = cell.text
            if value.isnumeric():
                value = float(value)
                if value.is_integer():
                    value = int(value)
            if count == 0:
                atag = cell.find('a')
                link = atag['href']
                player['_id'] = hashlib.md5(link.encode()).hexdigest()
            if count < numInfo:
                player['info'][stat] = value
            elif count < numPassing:
                player['passing'][stat] = value
            elif count < numRushing:
                player['rushing'][stat] = value
            elif count < numReceiving:
                player['receiving'][stat] = value
            elif count < numFumbles:
                player['fumbles'][stat] = value
            else:
                print(f'Should never get here.\nstat: {stat}, value: {value}')
            count += 1
        offenseList.append(player)
for player in offenseList:
    print(json.dumps(player, indent=2, sort_keys=True))


{
  "_id": "34532b329ccca52d2a94fca0611461f2",
  "fumbles": {
    "fl": 0,
    "fmb": 1
  },
  "info": {
    "player": "Dak Prescott",
    "tm": "DAL"
  },
  "passing": {
    "att": 58,
    "cmp": 42,
    "int": 1,
    "lng": 31,
    "rate": "101.4",
    "sk": 1,
    "td": 3,
    "yds": 12
  },
  "receiving": {
    "lng": 0,
    "rec": 0,
    "td": 0,
    "tgt": 0,
    "yds": 0
  },
  "rushing": {
    "att": 4,
    "lng": 7,
    "td": 0,
    "yds": 13
  }
}
{
  "_id": "bb705d3384b3d866577be4b02e1b09d0",
  "fumbles": {
    "fl": 0,
    "fmb": 0
  },
  "info": {
    "player": "Ezekiel Elliott",
    "tm": "DAL"
  },
  "passing": {
    "att": 0,
    "cmp": 0,
    "int": 0,
    "lng": 0,
    "rate": "",
    "sk": 0,
    "td": 0,
    "yds": 0
  },
  "receiving": {
    "lng": 3,
    "rec": 2,
    "td": 0,
    "tgt": 2,
    "yds": 6
  },
  "rushing": {
    "att": 11,
    "lng": 13,
    "td": 0,
    "yds": 33
  }
}
{
  "_id": "22c20b61e9b389b043c624e0f48b01c5",
  "fumbles": {
    "fl": 0,
    "

In [200]:
def findPlayerByName(name, players):
    # Wow i need to have better variable names in actual implementation
    # Shout out to stack oerflow.  I may want to do a longhanded version of this with try/except
    return next((player for player in players if player['info']['player'] == name), None)

def findPlayerById(id, players):
    return next((player for player in players if player['_id'] == id), None)
def convertStatIfNecessary(stat):
    if stat == '':
        stat = '0'
    stat = stat.replace('%', '')
    if stat.replace('.','',1).isnumeric():
        stat = float(stat)
        if stat.is_integer():
            stat = int(stat)
    return stat
print(findPlayerByName('Blake Jarwin', offenseList))
print(findPlayerById('9a80ff286b625c709e452f291797fc70', offenseList))

{'info': {'player': 'Blake Jarwin', 'tm': 'DAL'}, 'passing': {'cmp': 0, 'att': 0, 'yds': 0, 'td': 0, 'int': 0, 'sk': 0, 'lng': 0, 'rate': ''}, 'rushing': {'att': 0, 'yds': 0, 'td': 0, 'lng': 0}, 'receiving': {'tgt': 3, 'rec': 3, 'yds': 20, 'td': 0, 'lng': 7}, 'fumbles': {'fmb': 0, 'fl': 0}, '_id': '9a80ff286b625c709e452f291797fc70'}
{'info': {'player': 'Blake Jarwin', 'tm': 'DAL'}, 'passing': {'cmp': 0, 'att': 0, 'yds': 0, 'td': 0, 'int': 0, 'sk': 0, 'lng': 0, 'rate': ''}, 'rushing': {'att': 0, 'yds': 0, 'td': 0, 'lng': 0}, 'receiving': {'tgt': 3, 'rec': 3, 'yds': 20, 'td': 0, 'lng': 7}, 'fumbles': {'fmb': 0, 'fl': 0}, '_id': '9a80ff286b625c709e452f291797fc70'}


#### Parsing Advanced Passing

In [180]:
advancedPassingDiv = None
for table in statTables:
    tableId = table['id']
    if tableId == 'all_passing_advanced':
        advancedPassingDiv = table


In [181]:
advancedPassingTable = advancedPassingDiv.find(text=lambda text:isinstance(text, Comment))
advancedPassingTable = BeautifulSoup(advancedPassingTable, 'html.parser')
apTableHead = advancedPassingTable.find('thead')
apTableBody = advancedPassingTable.find('tbody')

In [182]:
keys = [column.text.lower() for column in apTableHead.findAll('th')]
print(keys)

['player', 'tm', 'cmp', 'att', 'yds', '1d', '1d%', 'iay', 'iay/pa', 'cay', 'cay/cmp', 'cay/pa', 'yac', 'yac/cmp', 'drops', 'drop%', 'badth', 'bad%', 'sk', 'bltz', 'hrry', 'hits', 'prss', 'prss%', 'scrm', 'yds/scr']


In [186]:
for row in apTableBody.findAll('tr'):
    if not row.has_attr('class'):
        count = 0
        advancedPassing = {}
        playerID = None
        for key, stat in zip(keys,row.findAll(recursive=False)):
            if count == 0:
                atag = stat.find('a')
                playerID = hashlib.md5(atag['href'].encode()).hexdigest()
            if count < 2:
                count += 1
                continue
            value = stat.text
            # for some reason python cannot recognized floats after I have removed a '%' from the string.
            # will need to make a function to handle this
            convertedValue = convertStatIfNecessary(value)
            advancedPassing[key] = convertedValue
        player = findPlayerById(playerID, offenseList)
        player['advanced-passing'] = advancedPassing
        player = findPlayerById(playerID, offenseList)
        print(player['advanced-passing'])

{'cmp': 42, 'att': 58, 'yds': 403, '1d': 21, '1d%': 35.6, 'iay': 385, 'iay/pa': 6.6, 'cay': 205, 'cay/cmp': 4.9, 'cay/pa': 3.5, 'yac': 198, 'yac/cmp': 4.7, 'drops': 2, 'drop%': 3.5, 'badth': 9, 'bad%': 15.8, 'sk': 1, 'bltz': 32, 'hrry': 2, 'hits': 6, 'prss': 9, 'prss%': 14.5, 'scrm': 3, 'yds/scr': 3.3}
{'cmp': 32, 'att': 50, 'yds': 379, '1d': 22, '1d%': 44, 'iay': 454, 'iay/pa': 9.1, 'cay': 203, 'cay/cmp': 6.3, 'cay/pa': 4.1, 'yac': 176, 'yac/cmp': 5.5, 'drops': 3, 'drop%': 6.4, 'badth': 7, 'bad%': 14.9, 'sk': 0, 'bltz': 11, 'hrry': 1, 'hits': 2, 'prss': 3, 'prss%': 6, 'scrm': 0, 'yds/scr': ''}


#### Parsing Advanced Rushing

In [193]:
# Get the div.  The idiots hide the advanced stats in the comments lmao
advancedRushingDiv = None
for table in statTables:
    tableId = table['id']
    if tableId == 'all_rushing_advanced':
        advancedRushingDiv = table

advancedRushingTable = advancedRushingDiv.find(text=lambda text:isinstance(text, Comment))
advancedRushingTable = BeautifulSoup(advancedRushingTable, 'html.parser')
advancedRushingTableHead = advancedRushingTable.find('thead')
advancedRushingTableBody = advancedRushingTable.find('tbody')
print(advancedRushingTableBody)


<tbody><tr><th class="left" data-append-csv="ElliEz00" data-stat="player" scope="row"><a href="/players/E/ElliEz00.htm">Ezekiel Elliott</a></th><td class="left" data-stat="team">DAL</td><td class="right" data-stat="rush_att">11</td><td class="right" data-stat="rush_yds">33</td><td class="right" data-stat="rush_first_down">2</td><td class="right" data-stat="rush_yds_before_contact">20</td><td class="right" data-stat="rush_yds_bc_per_rush">1.8</td><td class="right" data-stat="rush_yac">13</td><td class="right" data-stat="rush_yac_per_rush">1.2</td><td class="right iz" data-stat="rush_broken_tackles">0</td><td class="right iz" data-stat="rush_broken_tackles_per_rush"></td></tr>
<tr><th class="left" data-append-csv="PresDa01" data-stat="player" scope="row"><a href="/players/P/PresDa01.htm">Dak Prescott</a></th><td class="left" data-stat="team">DAL</td><td class="right" data-stat="rush_att">4</td><td class="right" data-stat="rush_yds">13</td><td class="right iz" data-stat="rush_first_down">

In [210]:
keys = [column.text.lower() for column in advancedRushingTableHead.findAll('th')]
print(keys)

['player', 'tm', 'att', 'yds', '1d', 'ybc', 'ybc/att', 'yac', 'yac/att', 'brktkl', 'att/br']


In [213]:
for row in advancedRushingTableBody.findAll('tr'):
    if not row.has_attr('class'):
        count = 0
        advancedPassing = {}
        playerID = None
        for key, stat in zip(keys,row.findAll(recursive=False)):
            if count == 0:
                atag = stat.find('a')
                playerID = hashlib.md5(atag['href'].encode()).hexdigest()
            if count < 2:
                count += 1
                continue
            value = stat.text
            # for some reason python cannot recognized floats after I have removed a '%' from the string.
            # will need to make a function to handle this
            convertedValue = convertStatIfNecessary(value)
            advancedPassing[key] = convertedValue
        player = findPlayerById(playerID, offenseList)
        player['advanced-rushing'] = advancedPassing
        player = findPlayerById(playerID, offenseList)

#### Advanced Receiving

In [214]:
# Get the div.  The idiots hide the advanced stats in the comments lmao
advancedReceivingDiv = None
for table in statTables:
    tableId = table['id']
    if tableId == 'all_receiving_advanced':
        advancedReceivingDiv = table

advancedReceivingTable = advancedReceivingDiv.find(text=lambda text:isinstance(text, Comment))
advancedReceivingTable = BeautifulSoup(advancedReceivingTable, 'html.parser')
advancedReceivingTableHead = advancedReceivingTable.find('thead')
advancedReceivingTableBody = advancedReceivingTable.find('tbody')


In [215]:
keys = [column.text.lower() for column in advancedReceivingTableHead.findAll('th')]
print(keys)

['player', 'tm', 'tgt', 'rec', 'yds', 'td', '1d', 'ybc', 'ybc/r', 'yac', 'yac/r', 'adot', 'brktkl', 'rec/br', 'drop', 'drop%', 'int', 'rat']


In [217]:
for row in advancedReceivingTableBody.findAll('tr'):
    if not row.has_attr('class'):
        count = 0
        advancedReceiving = {}
        playerID = None
        for key, stat in zip(keys,row.findAll(recursive=False)):
            if count == 0:
                atag = stat.find('a')
                playerID = hashlib.md5(atag['href'].encode()).hexdigest()
            if count < 2:
                count += 1
                continue
            value = stat.text
            # for some reason python cannot recognized floats after I have removed a '%' from the string.
            # will need to make a function to handle this
            convertedValue = convertStatIfNecessary(value)
            advancedReceiving[key] = convertedValue
        player = findPlayerById(playerID, offenseList)
        player['advanced-receiving'] = advancedReceiving
        player = findPlayerById(playerID, offenseList)

In [220]:
for player in offenseList:
    print(json.dumps(player, indent=4, sort_keys=True))

{
    "_id": "34532b329ccca52d2a94fca0611461f2",
    "advanced-passing": {
        "1d": 21,
        "1d%": 35.6,
        "att": 58,
        "bad%": 15.8,
        "badth": 9,
        "bltz": 32,
        "cay": 205,
        "cay/cmp": 4.9,
        "cay/pa": 3.5,
        "cmp": 42,
        "drop%": 3.5,
        "drops": 2,
        "hits": 6,
        "hrry": 2,
        "iay": 385,
        "iay/pa": 6.6,
        "prss": 9,
        "prss%": 14.5,
        "scrm": 3,
        "sk": 1,
        "yac": 198,
        "yac/cmp": 4.7,
        "yds": 403,
        "yds/scr": 3.3
    },
    "advanced-rushing": {
        "1d": 0,
        "att": 4,
        "att/br": 0,
        "brktkl": 0,
        "yac": 5,
        "yac/att": 1.3,
        "ybc": 8,
        "ybc/att": 2,
        "yds": 13
    },
    "fumbles": {
        "fl": 0,
        "fmb": 1
    },
    "info": {
        "player": "Dak Prescott",
        "tm": "DAL"
    },
    "passing": {
        "att": 58,
        "cmp": 42,
        "int": 1,
        

In [221]:
print(len(offenseList))

18
