In [1]:
import requests, os, time
import pandas as pd
import datetime as dt
import numpy as np
from datetime import date, timedelta
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment;
# team_data() reads its Stathead credentials from os.environ.
load_dotenv()

# Remove pandas' display limits so frames are rendered without truncating
# columns, rows or cell contents.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# scrape a player database
def player_database():
    """Scrape the Pro-Football-Reference player index into a CSV file.

    Requests the index page for every letter A-Z, turns each ``<p>`` entry
    into a record (address, name, position, first year, last year) and writes
    the table to ``../data/database-players.csv``. Prints the elapsed time in
    minutes when finished.

    Returns:
        None. The result is written to disk.
    """
    start = time.time()
    player_url = "https://www.pro-football-reference.com/players/{abcd}/"
    columns = ["Address", "Player", "Position", "First_Year", "Last_Year"]
    players = []
    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        res = requests.get(player_url.format(abcd=letter))
        soup = BeautifulSoup(res.content,
                             'html.parser')
        for entry in soup.find_all('p'):
            link = entry.find('a')
            if link is None or not link.has_attr('href'):
                # Stop at the first paragraph without a link. This is the same
                # stopping rule the previous bare ``except`` applied, made
                # explicit so that unrelated errors (and Ctrl-C) are no longer
                # swallowed.
                break
            text = entry.get_text()
            open_paren = text.find("(")
            last_dash = text.rfind("-")
            players.append({
                # Drop the last four characters of the href
                # (presumably the ".htm" extension).
                "Address": link['href'][:-4],
                # Name is everything before " (".
                "Player": text[:open_paren - 1],
                # Position is the text between the first "(" and first ")".
                "Position": text[open_paren + 1: text.find(")")],
                # Years are the four characters on either side of the last "-".
                "First_Year": text[last_dash - 4: last_dash],
                "Last_Year": text[last_dash + 1: last_dash + 5],
            })

    # Explicit columns keep the frame well-formed even if nothing was scraped.
    players_df = pd.DataFrame(players, columns=columns)
    # Non-player paragraphs can slip through with "Ever" parsed as the last
    # year; drop them. ``.copy()`` so the assignments below act on an
    # independent frame rather than a filtered view.
    players_df = players_df[players_df.Last_Year != "Ever"].copy()
    # "+" is a regex metacharacter: force a literal replacement so the result
    # does not depend on the pandas version's default for ``regex``.
    players_df['Player'] = players_df['Player'].str.replace('+', '', regex=False)
    players_df['Last_Year'] = players_df.Last_Year.astype('int32')
    players_df.to_csv('../data/database-players.csv',
                      index=False)
    end = time.time()
    print('Done creating player database.',
          f'Time to complete: {(end-start)/60} minutes',
          sep='\n')

In [None]:
#scrape all team data (requires stathead subscription)
def team_data(s):
    """Scrape Stathead team game logs for one season into a raw CSV.

    Logs in to Stathead with the ``statheadusername`` / ``statheadpassword``
    environment variables, then requests the game finder in steps of 100
    (``offset`` query parameter) until a page no longer contains the
    ``results`` table. Each page is appended to
    ``../data/raw-data/nfl-team-data-{s}-raw.csv``.

    Args:
        s: Season (year) substituted into the query as both the minimum and
            maximum year.

    Returns:
        None. Pages are appended to the CSV; a summary is printed at the end.
    """
    start = time.time()
    page = 0
    stat_login_url = "https://stathead.com/users/login.cgi"
    team_data_url = "https://stathead.com/football/tgl_finder.cgi?request=1&match=game&order_by=pass_td&year_min={Season}&year_max={Season}&game_type=R&ccomp%5B1%5D=gt&cval%5B1%5D=-500&cstat%5B1%5D=first_down&ccomp%5B2%5D=gt&cval%5B2%5D=-500&cstat%5B2%5D=rush_att&ccomp%5B3%5D=gt&cval%5B3%5D=-500&cstat%5B3%5D=third_down_att&ccomp%5B4%5D=gt&cval%5B4%5D=-500&cstat%5B4%5D=punt&ccomp%5B5%5D=gt&cval%5B5%5D=-500&cstat%5B5%5D=pass_cmp&ccomp%5B6%5D=gt&cval%5B6%5D=-500&cstat%5B6%5D=all_td_team&ccomp%5B7%5D=gt&cval%5B7%5D=-500&cstat%5B7%5D=points&ccomp%5B8%5D=gt&cval%5B8%5D=-500&cstat%5B8%5D=tot_yds&ccomp%5B9%5D=gt&cval%5B9%5D=-500&cstat%5B9%5D=kick_ret_td_tgl&ccomp%5B10%5D=gt&cval%5B10%5D=-500&cstat%5B10%5D=penalties&game_num_min=0&game_num_max=99&week_num_min=0&week_num_max=99&temperature_gtlt=lt&offset={page}"
    stat_payload = {
        'username': os.environ.get('statheadusername'),
        'password': os.environ.get('statheadpassword')
    }
    with requests.Session() as session:
        # The login response itself is not needed; the session keeps the cookies.
        session.post(stat_login_url,
                     data=stat_payload)
        try:
            while page < 100000:
                website = session.get(team_data_url.format(Season=s,
                                                           page=page)).text
                soup = BeautifulSoup(website,
                                     'html')
                table = soup.find('table',
                                  attrs={'id': 'results'})
                if table is None:
                    # No results table on this page: treated as the end of
                    # the data (previously detected via a swallowed exception).
                    break
                table_headers = [header.text for header in table.find('thead').find_all('th')]
                final_data = [[cell.text for cell in tr.find_all('td')]
                              for tr in table.find_all('tr')]
                # The first <tr> and the first 12 header cells are skipped;
                # presumably these are the header row and the grouping/rank
                # header cells that have no matching <td> -- TODO confirm.
                df = pd.DataFrame(final_data[1:],
                                  columns=table_headers[12:])
                # NOTE(review): mode='a' with the default header means every
                # page (and every re-run) appends its own header line to the
                # file. Left as-is so the raw file format stays unchanged.
                df.to_csv(f'../data/raw-data/nfl-team-data-{s}-raw.csv',
                          mode='a',
                          index=False)
                page += 100
        except Exception as exc:
            # Keep the original best-effort behaviour (stop and report) but
            # surface the reason, and let KeyboardInterrupt propagate.
            print(f'Stopped early at offset {page}: {exc!r}')
    end = time.time()
    print(f'Done: Team {s}, {page}',
          f'Time to complete: {end-start}',
          sep='\n')

In [None]:
# scrape weekly nfl injury reports
# season to scrape
def injuy_reports(s):
    """Scrape every team's Pro-Football-Reference injury table for a season.

    For each team code the ``team_injuries`` table is parsed row by row. A
    cell's ``data-tip`` attribute is used when present, otherwise its text.
    The href of the first link in each row (minus its last four characters)
    is inserted as the second value of that row. ``Team`` and ``Season``
    columns are added, all teams are concatenated and the result is written
    to ``../data/raw-data/nfl-injury-report-{s}-raw.csv``.

    The function keeps its original (misspelled) name for compatibility;
    ``injury_reports`` is provided as an alias.

    Args:
        s: Season (year) to scrape.

    Returns:
        None. The result is written to disk and a summary is printed.
    """
    start = time.time()
    injury_url = 'https://www.pro-football-reference.com/teams/{team}/{Season}_injuries.htm'
    teams = ['crd', 'atl', 'rav', 'buf', 'car', 'chi', 'cin', 'cle',
             'dal', 'den', 'det', 'gnb', 'htx', 'clt', 'jax', 'kan',
             'sdg', 'ram', 'mia', 'min', 'nor', 'nwe', 'nyg', 'nyj',
             'rai', 'phi', 'pit', 'sea', 'sfo', 'tam', 'oti', 'was']
    team_frames = []
    for team in teams:
        res = requests.get(injury_url.format(Season=s,
                                             team=team))
        soup = BeautifulSoup(res.content,
                             'lxml')
        table = soup.find('table',
                          attrs={'class': 'sortable',
                                 'id': 'team_injuries'})
        if table is None:
            # Previously this surfaced as an opaque AttributeError on None.
            print(f'Skipping {team} {s}: no injury table found')
            continue
        parsed_rows = []
        for tr in table.find_all('tr'):
            cells = tr.find_all(['th', 'td'])
            row = [cell['data-tip'] if cell.has_attr("data-tip") else cell.text
                   for cell in cells]
            # First link of the row, without its last four characters. This
            # also runs on the header row, whose value becomes a column label
            # starting with "/boxscores" (renamed below).
            row.insert(1, tr.find('a')['href'][:-4])
            parsed_rows.append(row)
        # First parsed row is the header, the rest is data.
        df = pd.DataFrame(parsed_rows[1:],
                          columns=parsed_rows[0])
        df.insert(loc=1,
                  column='Team',
                  value=team)
        df.insert(loc=2,
                  column='Season',
                  value=s)
        team_frames.append(df)
    # Concatenate once instead of growing a frame inside the loop.
    injury_final_df = pd.concat(team_frames) if team_frames else pd.DataFrame()
    injury_final_df = injury_final_df.rename(columns={'Player\xa0': 'Player'})
    # NOTE(review): the "/boxscores..." label comes from each team's own
    # header row and may differ between teams, in which case several columns
    # end up named "Player_Address". Left unchanged to keep the raw CSV
    # schema stable -- confirm against the downstream cleaning code.
    injury_final_df = injury_final_df.rename(
        columns={i: "Player_Address" for i in injury_final_df.columns if i.startswith("/boxscores")})
    injury_final_df.to_csv(f'../data/raw-data/nfl-injury-report-{s}-raw.csv',
                           index=False)
    end = time.time()
    print(f'Done: Injury Reports {s}',
          f'Time to complete: {end-start}',
          sep='\n')


# Correctly spelled alias; the original name is kept for existing callers.
injury_reports = injuy_reports

In [None]:
# scrape player weekly stats
def _read_player_table(url, table_id, p_add, s):
    """Read one HTML table for a player and tag it with player and season.

    Reads the table with the given ``id`` from ``url`` using a two-row
    header, drops its last row, flattens the two-level column header into
    tuples and adds ``Player_Address`` and ``Season`` columns.
    """
    table = pd.read_html(url,
                         header=[0, 1],
                         attrs={'id': table_id})[0]
    # Copy so the column assignments below do not act on a slice.
    table = table.head(-1).copy()
    table.columns = table.columns.to_flat_index()
    table['Player_Address'] = p_add
    table['Season'] = s
    return table


def player_stats(s):
    """Scrape weekly game-log and fantasy tables for every relevant player.

    Players are read from ``../data/database-players.csv`` and limited to
    those whose ``Last_Year`` is at least ``s``. For each one the ``stats``
    table of the game-log page and the ``player_fantasy`` table of the
    fantasy page are fetched. Results are written to
    ``../data/raw-data/player-weekly-stats-{s}-raw.csv`` and
    ``../data/raw-data/player-weekly-fantasy-{s}-raw.csv``.

    Args:
        s: Season (year) to scrape.

    Returns:
        None. Both CSV files are written and a summary is printed.
    """
    start = time.time()
    players = pd.read_csv('../data/database-players.csv')
    players = players[players['Last_Year'] >= s]
    base_url = 'https://www.pro-football-reference.com'
    player_stats_df = pd.DataFrame()
    player_fan_df = pd.DataFrame()
    for p_add in players['Address']:
        try:
            df = _read_player_table(f'{base_url}{p_add}/gamelog/{s}/',
                                    'stats', p_add, s)
            # New rows are prepended, as before; the accumulation is kept
            # incremental on purpose so row order and dtype coercion of the
            # raw output stay exactly as they were.
            player_stats_df = pd.concat([df,
                                         player_stats_df])

            # The fantasy page is only requested when the game log was read
            # successfully (same dependency as the original single try block).
            df_fan = _read_player_table(f'{base_url}{p_add}/fantasy/{s}/',
                                        'player_fantasy', p_add, s)
            player_fan_df = pd.concat([df_fan,
                                       player_fan_df])
        except Exception:
            # Best effort: players without a usable page are skipped. Catching
            # Exception (not a bare except) lets Ctrl-C / kernel interrupt
            # stop this long-running loop.
            continue
    player_stats_df.to_csv(f'../data/raw-data/player-weekly-stats-{s}-raw.csv',
                           index=False)
    player_fan_df.to_csv(f'../data/raw-data/player-weekly-fantasy-{s}-raw.csv',
                         index=False)
    end = time.time()
    print(f'Done: Player stats scrape (including snap counts) for {s} season',
          f'{(end-start)/60}: minutes to complete.',
          sep='\n')

In [None]:
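# Full scrape driver -- uncomment to rebuild the raw files for seasons 2015-2020.
# player_database()
# for s in range(2015, 2021):
#     team_data(s)
#     injuy_reports(s)  # spelled as in the function definition
#     player_stats(s)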

In [2]:
# Re-scrape only the weekly fantasy tables for one season and overwrite that
# season's raw fantasy CSV.
start = time.time()
s = 2015
players = pd.read_csv('../data/database-players.csv')
# Only players whose recorded last year is at least the season being scraped.
players = players[players['Last_Year'] >= s]
player_fan_df = pd.DataFrame()
for p_add in players['Address']:
    try:
        df_fan = pd.read_html(f'https://www.pro-football-reference.com{p_add}/fantasy/{s}/',
                              header=[0, 1],
                              attrs={'id': 'player_fantasy'})
        df_fan = df_fan[0]
        # Drop the last row; copy so the assignments below do not act on a slice.
        df_fan = df_fan.head(-1).copy()
        # Flatten the two-level header into tuples.
        df_fan.columns = df_fan.columns.to_flat_index()
        df_fan['Player_Address'] = p_add
        df_fan['Season'] = s
        # New rows are prepended to the accumulated frame.
        player_fan_df = pd.concat([df_fan,
                                   player_fan_df])
    except Exception:
        # Best effort: skip players whose page cannot be read. Exception
        # (not a bare except) keeps the loop interruptible.
        continue
player_fan_df.to_csv(f'../data/raw-data/player-weekly-fantasy-{s}-raw.csv',
                     index=False)
end = time.time()
print(f'Done: Player stats scrape (including snap counts) for {s} season',
      f'{(end-start)/60}: minutes to complete.',
      sep='\n')

SyntaxError: 'continue' not properly in loop (<ipython-input-2-8dc4d44ace9a>, line 18)