In [2]:
import requests, os, time
import pandas as pd
import datetime as dt
import numpy as np
from datetime import date, timedelta
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Load variables from a local .env file (the stathead credentials are read
# from the environment in the configuration cell).
load_dotenv()
# Disable pandas' display limits so frames are rendered without truncation
# of columns, rows, or cell contents.
for display_option in ('display.max_columns',
                       'display.max_rows',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# --- Stathead login -------------------------------------------------------
# Credentials come from the environment (populated from a .env file); either
# value is None when its variable is not set.
stat_login_url = "https://stathead.com/users/login.cgi"
stat_payload = {
    'username': os.environ.get('statheadusername'),
    'password': os.environ.get('statheadpassword'),
}
stat_user_name = stat_payload['username']
stat_password = stat_payload['password']

# --- Team game-log queries ------------------------------------------------
# Two stathead query templates keyed '1' and '2'; each takes {Season} and a
# paging {page} offset.
# NOTE(review): template '1' uses game_type=E while '2' uses game_type=R --
# confirm that the difference is intentional.
team_data_url = {
    '1': 'https://stathead.com/football/tgl_finder.cgi?request=1&temperature_gtlt=lt&game_num_max=99&week_num_max=99&order_by=points&match=game&year_max={Season}&order_by_asc=0&week_num_min=0&game_type=E&game_num_min=0&year_min={Season}&cstat[1]=all_td_team&ccomp[1]=gt&cval[1]=0&cstat[2]=third_down_att&ccomp[2]=gt&cval[2]=0&cstat[3]=vegas_line&ccomp[3]=gt&cval[3]=-50&cstat[4]=penalties&ccomp[4]=gt&cval[4]=0&cstat[5]=rush_att&ccomp[5]=gt&cval[5]=0&cstat[6]=tot_yds&ccomp[6]=gt&cval[6]=0&cstat[7]=first_down&ccomp[7]=gt&cval[7]=0&cstat[8]=punt&ccomp[8]=gt&cval[8]=0&cstat[9]=pass_cmp&ccomp[9]=gt&cval[9]=0&offset={page}',
    '2': 'https://stathead.com/football/tgl_finder.cgi?request=1&temperature_gtlt=lt&game_num_max=99&week_num_max=99&order_by=all_td_opp&match=game&year_max={Season}&order_by_asc=0&week_num_min=0&game_type=R&game_num_min=0&year_min={Season}&cstat[1]=tot_yds_opp&ccomp[1]=gt&cval[1]=0&cstat[2]=rush_yds_diff&ccomp[2]=gt&cval[2]=-500&cstat[3]=score_diff_thru_1&ccomp[3]=gt&cval[3]=-500&cstat[4]=rush_att_opp&ccomp[4]=gt&cval[4]=0&cstat[5]=kick_ret_td_tgl&ccomp[5]=gt&cval[5]=0&cstat[6]=pass_cmp_opp&ccomp[6]=gt&cval[6]=0&cstat[7]=first_down_opp&ccomp[7]=gt&cval[7]=0&cstat[8]=score_diff_1_qtr&ccomp[8]=gt&cval[8]=-500&cstat[9]=third_down_att_opp&ccomp[9]=gt&cval[9]=0&offset={page}',
}

# --- pro-football-reference pages -----------------------------------------
# Per-team, per-season injury report.
injury_url = 'https://www.pro-football-reference.com/teams/{team}/{Season}_injuries.htm'
# Player index, one page per initial letter.
player_url = "https://www.pro-football-reference.com/players/{abcd}/"

# --- Scrape parameters ----------------------------------------------------
# Result offset at which the team scrape starts.
page = 0
# Season to scrape.
s = 2020
# Team abbreviations as pro-football-reference spells them in its URLs.
teams = ('crd atl rav buf car chi cin cle '
         'dal den det gnb htx clt jax kan '
         'sdg ram mia min nor nwe nyg nyj '
         'rai phi pit sea sfo tam oti was').split()
# Initial letters used to walk the player index.
abcd = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

In [None]:
# Scrape the player index (one page per initial letter) into a player
# database CSV at ../data/database-players.csv.
start = time.time()


def _parse_player_entry(entry_text, href):
    """Split one index paragraph into the fields stored in the database.

    The slicing assumes text shaped like ``Name (POS) YYYY-YYYY``
    (presumably the layout of the index page -- TODO confirm): the name is
    everything before " (", the position sits between the parentheses, and
    the first/last years are the four characters on either side of the last
    hyphen. ``href`` loses its final four characters (the ".htm" suffix on
    the expected links).
    """
    open_paren = entry_text.find("(")
    close_paren = entry_text.find(")")
    last_dash = entry_text.rfind("-")
    return {"Address": href[:-4],
            "Player": entry_text[:open_paren - 1],
            "Position": entry_text[open_paren + 1:close_paren],
            "First_Year": entry_text[last_dash - 4:last_dash],
            "Last_Year": entry_text[last_dash + 1:last_dash + 5]}


players = []
for letter in abcd:
    res = requests.get(player_url.format(abcd=letter))
    # Fail loudly on an error response instead of parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.content,
                         'html.parser')
    for paragraph in soup.find_all('p'):
        link = paragraph.find('a')
        # Paragraphs without a link cannot be player entries; skip them
        # rather than crash on a missing anchor.
        if link is None or not link.has_attr('href'):
            continue
        players.append(_parse_player_entry(paragraph.get_text(), link['href']))

# Naming the columns keeps the frame well-formed even if nothing was scraped.
players_df = pd.DataFrame(players,
                          columns=["Address", "Player", "Position",
                                   "First_Year", "Last_Year"])
# Drop linked non-player paragraphs whose "last year" slice reads "Ever";
# .copy() so the column assignments below act on an independent frame.
players_df = players_df[players_df['Last_Year'] != "Ever"].copy()
# '+' is meant literally here, so disable regex interpretation explicitly.
players_df['Player'] = players_df['Player'].str.replace('+', '', regex=False)
players_df['Last_Year'] = players_df['Last_Year'].astype('int32')
players_df.to_csv('../data/database-players.csv',
                  index=False)
end = time.time()
# time.time() is in seconds, so divide by 60 to report minutes.
print('Done creating player database.',
      f'Time to complete: {(end-start)/60} minutes',
      sep='\n')

In [None]:
# Scrape all team data (requires stathead subscription).
# Each query in team_data_url is paged through independently and appended,
# page by page, to ../data/raw-data/nfl-team-data-{s}-{key}-raw.csv.
start = time.time()
page_step = 100       # offset increment between consecutive requests
max_offset = 100000   # safety cap so a query can never loop forever
with requests.Session() as session:
    # Log in once; the session keeps the cookies for the following requests.
    login_response = session.post(stat_login_url,
                                  data=stat_payload)
    login_response.raise_for_status()
    for url in team_data_url:
        out_path = f'../data/raw-data/nfl-team-data-{s}-{url}-raw.csv'
        # Every query starts from the configured offset; `page` itself is
        # not modified, so the cell can be re-run without resetting it.
        offset = page
        while offset < max_offset:
            website = session.get(team_data_url[url].format(Season=s,
                                                            page=offset)).text
            # 'html' names a markup type, not a parser: BeautifulSoup picks
            # an installed HTML parser itself.
            soup = BeautifulSoup(website,
                                 'html')
            table = soup.find('table',
                              attrs={'class': 'sortable',
                                     'id': 'results'})
            if table is None:
                # No results table on this page: this query is finished.
                break
            table_headers = [header.text for header in table.find('thead').find_all('th')]
            final_data = [[cell.text for cell in table_row.find_all('td')]
                          for table_row in table.find_all('tr')]
            page_rows = final_data[1:]
            if not any(page_rows):
                # A table without a single data cell also means we are done.
                break
            # NOTE(review): the first 12 header cells are skipped --
            # presumably over-header cells plus the leading rank column,
            # which is not a <td>; confirm against the page markup.
            df = pd.DataFrame(page_rows,
                              columns=table_headers[12:])
            # Write the header only when starting a new file so that appended
            # pages do not repeat it in the middle of the CSV.
            write_header = (not os.path.exists(out_path)
                            or os.path.getsize(out_path) == 0)
            df.to_csv(out_path,
                      mode='a',
                      header=write_header,
                      index=False)
            offset += page_step
        end = time.time()
        print(f'Done: Team {s}, {url}, {offset}',
              f'Time to complete: {end-start}',
              sep='\n')

In [None]:
# Scrape weekly NFL injury reports: one table per team for season `s`,
# combined into ../data/raw-data/nfl-injury-report-{s}-raw.csv.
start = time.time()
injury_frames = []
for team in teams:
    res = requests.get(injury_url.format(Season=s,
                                         team=team))
    # Fail loudly on an error response instead of parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.content,
                         'lxml')
    table = soup.find('table',
                      attrs={'class': 'sortable',
                             'id': 'team_injuries'})
    if table is None:
        raise ValueError(f'No injury table found for team {team!r}, season {s}')
    final_data = []
    for table_row in table.find_all('tr'):
        cells = table_row.find_all(['th', 'td'])
        # Prefer a cell's data-tip attribute over its visible text when the
        # attribute is present.
        final_data.append([cell['data-tip'] if cell.has_attr('data-tip') else cell.text
                           for cell in cells])
    # The first row is the header; the remaining rows are the body.
    df = pd.DataFrame(final_data[1:],
                      columns=final_data[0])
    df.insert(loc=1,
              column='Team',
              value=team)
    df.insert(loc=2,
              column='Season',
              value=s)
    injury_frames.append(df)
# Concatenate once instead of growing a frame inside the loop.
injury_final_df = pd.concat(injury_frames) if injury_frames else pd.DataFrame()
# The player header may carry a trailing non-breaking space (or its mis-decoded
# 'Â' form); normalise any such variant to plain 'Player' and leave every
# other column name untouched.
injury_final_df = injury_final_df.rename(
    columns=lambda name: ('Player'
                          if str(name).replace('Â', '').replace('\xa0', ' ').strip() == 'Player'
                          else name))
injury_final_df.to_csv(f'../data/raw-data/nfl-injury-report-{s}-raw.csv',
                       index=False)
end = time.time()
print(f'Done: Injury Reports {s}',
      f'Time to complete: {end-start}',
      sep='\n')

In [7]:
s = 2020
# Scrape player weekly stats: fetch the season-`s` gamelog table for every
# address in the player database and stack them into player_stats_df.
start = time.time()
players = pd.read_csv('../data/database-players.csv')
player_frames = []
skipped_addresses = []   # addresses whose gamelog could not be read
for p_add in players['Address']:
    try:
        tables = pd.read_html(f'https://www.pro-football-reference.com{p_add}/gamelog/{s}',
                              header=[0,1],
                              attrs={'id': 'stats'})
        # Drop the table's last row; .copy() so the column assignments below
        # act on an independent frame.
        df = tables[0].head(-1).copy()
        # Flatten the two header rows into (top, bottom) tuples.
        df.columns = df.columns.to_flat_index()
        df['Player_Address'] = p_add
        df['Season'] = s
    except Exception:
        # Best effort: a page that cannot be fetched or parsed is skipped.
        # Catching Exception rather than everything keeps the loop
        # interruptible (KeyboardInterrupt is not swallowed).
        skipped_addresses.append(p_add)
        continue
    player_frames.append(df)
# Concatenate once at the end. The list is reversed so the most recently
# scraped player comes first, matching the order produced by prepending each
# new frame to the running result.
player_stats_df = pd.concat(player_frames[::-1]) if player_frames else pd.DataFrame()
# player_stats_df.to_csv(f'player-weekly-stats-{s}-raw.csv',
#                        index=False)
end = time.time()
print(f'Done: Player stats scrape (including snap counts) for {s} season',
      f'{(end-start)/60}: minutes to complete.',
      f'Players skipped (gamelog could not be read): {len(skipped_addresses)}',
      sep='\n')

Done: Player stats scrape (including snap counts) for 2019 season
0.14686747392018637: minutes to complete.


In [8]:
# Preview the first five rows of the scraped player stats.
player_stats_df.head()

Unnamed: 0,"(Unnamed: 0_level_0, Rk)","(Unnamed: 1_level_0, Date)","(Unnamed: 2_level_0, G#)","(Unnamed: 3_level_0, Week)","(Unnamed: 4_level_0, Age)","(Unnamed: 5_level_0, Tm)","(Unnamed: 6_level_0, Unnamed: 6_level_1)","(Unnamed: 7_level_0, Opp)","(Unnamed: 8_level_0, Result)","(Unnamed: 9_level_0, GS)","(Unnamed: 10_level_0, Sk)","(Tackles, Solo)","(Tackles, Ast)","(Tackles, Comb)","(Tackles, TFL)","(Tackles, QBHits)","(Def Interceptions, Int)","(Def Interceptions, Yds)","(Def Interceptions, TD)","(Def Interceptions, PD)","(Fumbles, Fmb)","(Fumbles, FL)","(Fumbles, FF)","(Fumbles, FR)","(Fumbles, Yds)","(Fumbles, TD)","(Off. Snaps, Num)","(Off. Snaps, Pct)","(Def. Snaps, Num)","(Def. Snaps, Pct)","(ST Snaps, Num)","(ST Snaps, Pct)",Player_Address,Season,"(Receiving, Tgt)","(Receiving, Rec)","(Receiving, Yds)","(Receiving, Y/R)","(Receiving, TD)","(Receiving, Ctch%)","(Receiving, Y/Tgt)","(Scoring, TD)","(Scoring, Pts)","(Rushing, Att)","(Rushing, Yds)","(Rushing, Y/A)","(Rushing, TD)","(Kick Returns, Rt)","(Kick Returns, Yds)","(Kick Returns, Y/Rt)","(Kick Returns, TD)","(Punt Returns, Ret)","(Punt Returns, Yds)","(Punt Returns, Y/R)","(Punt Returns, TD)","(Unnamed: 31_level_0, Sk)","(Scoring, 2PM)","(Unnamed: 12_level_0, Sk)"
0,1.0,2019-09-08,1.0,1.0,25.036,SFO,@,TAM,W 31-17,*,0.0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0%,20,29%,1,4%,/players/A/AlexKw00,2019,,,,,,,,,,,,,,,,,,,,,,,,
1,2.0,2019-09-15,2.0,2.0,25.043,SFO,@,CIN,W 41-17,*,0.0,3,3,6,0,0,1,0,0,3,0,0,0,0,0,0,0,0%,51,74%,1,3%,/players/A/AlexKw00,2019,,,,,,,,,,,,,,,,,,,,,,,,
2,3.0,2019-09-22,3.0,3.0,25.05,SFO,,PIT,W 24-20,*,0.0,5,2,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0%,52,98%,2,7%,/players/A/AlexKw00,2019,,,,,,,,,,,,,,,,,,,,,,,,
3,4.0,2019-10-07,4.0,5.0,25.065,SFO,,CLE,W 31-3,*,0.5,0,2,2,0,1,0,0,0,1,0,0,0,0,0,0,0,0%,46,100%,1,4%,/players/A/AlexKw00,2019,,,,,,,,,,,,,,,,,,,,,,,,
4,5.0,2019-10-13,5.0,6.0,25.071,SFO,@,LAR,W 20-7,*,0.0,4,3,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0%,52,100%,1,4%,/players/A/AlexKw00,2019,,,,,,,,,,,,,,,,,,,,,,,,
