In [1]:
import pandas as pd
import copy
import datetime

In [2]:
# download one stats table per season from pro-football-reference (pfr)
# every season is stored as its own dataframe inside receiving_dfs

receiving_dfs = []

# valid stat_type values:
# passing, rushing, receiving, scrimmage, defense, returns (doesn't start until 1941), or scoring

stat_type = 'receiving'

# clamp the requested first season to the earliest season used for the
# chosen stat type (1941 for returns, 1932 for everything else)
start_year = 1932
earliest_year = 1941 if stat_type == 'returns' else 1932
start_year = max(earliest_year, start_year)

# seasons are pulled up to, but not including, the current calendar year
current_year = datetime.datetime.now().year

num_years = current_year - start_year

# url template: first slot is the season, second slot is the stat type
url = 'https://www.pro-football-reference.com/years/{}/{}.htm'
receiving_dfs.extend(
    pd.read_html(url.format(season, stat_type))[0]
    for season in range(start_year, current_year)
)

In [3]:
# remove '*' / '+' characters and surrounding whitespace from the player
# names of every season's dataframe

for year_idx in range(num_years):
    season_df = receiving_dfs[year_idx]
    season_df['Player'] = (
        season_df['Player']
        .replace(to_replace='[*+]+', value='', regex=True)
        .map(lambda name: name.strip())
    )

In [4]:
# keep an untouched deep copy of every season's dataframe so the full tables
# are still available after individual stats are selected from receiving_dfs

receiving_dfs_original = [copy.deepcopy(season_df) for season_df in receiving_dfs]

In [5]:
# drop all columns i'm not going to use, leaving each season's dataframe with
# the 'Rec' column indexed by player name
#
# the labels are passed through the `columns=` keyword: giving the axis
# positionally (`df.drop(labels, 1)`) was deprecated in pandas 1.3 and raises
# a TypeError from pandas 2.0 on

for df in receiving_dfs:
    # already trimmed by an earlier run of this cell; running set_index again
    # would raise KeyError because 'Player' is no longer a column
    if df.index.name == 'Player':
        continue

    unused_columns = df.columns.difference(['Player', 'Rec'])
    df.drop(columns=unused_columns, inplace=True)
    df.set_index('Player', inplace=True)

In [6]:
# build the (still empty) master dataframe
# index   = player name
# columns = 'image', then one column per season from start_year up to
#           (not including) current_year
# each season column will represent rec for that season

season_columns = list(range(start_year, current_year))
master_df = pd.DataFrame(columns=season_columns)
master_df.insert(0, 'Player', '')
master_df.insert(1, 'image', '')
master_df = master_df.set_index('Player')

In [7]:
# collect every index label (player name) that appears in any season

all_players = set()
for year_idx in range(num_years):
    all_players.update(receiving_dfs[year_idx].index)

In [8]:
# dictionary
# key   = player name
# value = list with one rec value per season (position 0 is start_year);
#         seasons without a value for the player stay at 0

dictionary = {player: [0] * num_years for player in all_players}

# build dictionary through each year
for i in range(0, num_years):
    # the rec values are the first column of the season's dataframe; it is
    # selected with .iloc because integer keys on a label-indexed row
    # (`row[1][0]`) rely on a positional fallback that pandas 2.x deprecates
    rec_by_player = receiving_dfs[i].iloc[:, 0]

    # NOTE(review): if a name occurs on more than one row of a season, the
    # last row overwrites the earlier ones -- confirm this is intended
    for player_name, rec in rec_by_player.items():
        # skip rows that only repeat the column headers
        if rec == 'Rec' or player_name == 'Player':
            continue

        # int() cannot convert a missing value; leave that season at 0
        if pd.isna(rec):
            continue

        dictionary[player_name][i] = int(rec)

In [9]:
# turn every player's per-season values into a running total, in place;
# seasons after a player's last non-zero value simply carry the total forward

for totals in dictionary.values():
    for pos in range(1, len(totals)):
        totals[pos] += totals[pos - 1]

In [10]:
# convert the dictionary into the final dataframe:
# one row per player, one column per season labelled with the season's year

new_list = list(range(start_year, current_year))

df_final = pd.DataFrame.from_dict(dictionary).T
df_final.columns = new_list

In [11]:
# gather every player found among the 10 largest values of any season column
# of df_final

players = set()

for season in range(start_year, current_year):
    ranked = df_final.sort_values(by=season, ascending=False)
    players.update(ranked.index[:10])

In [13]:
import bs4
import urllib
# `import urllib` alone does not load the submodules used below
import urllib.error
import urllib.request
import re
import sys
import time
from socket import timeout
# add column for player images
# the nfl.com player page is looked up through a slug built from the name:
# names with '.' or "'" will be replaced by '-'
# spaces replaced by '-'
# example: D.J. Moore will be d-j-moore
# note: some names on pfr don't have a '.' like DJ Chark
# jr is not used
# players with same name will have first-last-2 after the first player
# (that suffix is not generated here, so only the plain first-last slug is tried)


def _nfl_slug(player):
    """Return the nfl.com url slug derived from a player name."""
    slug = player.lower()
    slug = re.sub(r"['. ]", '-', slug)
    slug = re.sub(r'-{2,}', '-', slug).strip('-')
    # remove "jr" only when it is a whole '-'-separated token, so the letters
    # "jr" inside another part of the name are left alone
    slug = re.sub(r'(?:^|-)jr(?=-|$)', '', slug)
    return re.sub(r'-{2,}', '-', slug).strip('-')


def _fetch_headshot_src(link):
    """Return the headshot image url found on the page at `link`, or None.

    None is returned when the page cannot be fetched (HTTP error, URL error
    or timeout) or when no headshot <img> with a `src` attribute is found.
    """
    try:
        # the context manager closes the response even if read() fails
        with urllib.request.urlopen(link, timeout=3) as response:
            doc = response.read().decode('utf-8')
    except (urllib.error.HTTPError, urllib.error.URLError, timeout):
        return None

    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # pin it once the parser this selector was written against is confirmed
    soup = bs4.BeautifulSoup(doc)
    matches = soup.select('#main-content > div > div > div > figure.nfl-c-player-header__headshot > picture > img')
    if not matches:
        return None

    src = matches[0].get('src')
    if not src:
        return None

    return re.sub('t_lazy/', '', src)


# make sure the column exists even when no image is found at all, so the
# column reordering done before writing the csv cannot fail on a missing label
if 'image' not in df_final.columns:
    df_final['image'] = None

i = 0  # number of players for which an image was stored
n = len(players)
for player in players:
    link = 'https://www.nfl.com/players/{}/'.format(_nfl_slug(player))

    src = _fetch_headshot_src(link)
    if src is None:
        continue

    df_final.loc[player, 'image'] = src
    i += 1
    # '\r' rewrites the same output line: "<stored> <total> <player>"
    sys.stdout.write("\r{} {} {}".format(i, n, player))
    sys.stdout.flush()

87 97 Raymond Berry

In [14]:
# move the 'image' column to the front, then write the table to out.csv
leading = ['image']
remaining = [label for label in df_final.columns if label != 'image']
df_final = df_final[leading + remaining]
df_final.to_csv('out.csv')

In [15]:
# list the selected players, one name per line
if players:
    print(*players, sep='\n')

Red Grange
Carl Brumbaugh
Billy Howton
Kyle Rote
Frank Gifford
Charley Malone
Andre Reed
Elroy Hirsch
Tommy McDonald
Johnny Blood
Pete Retzlaff
Marvin Harrison
Hugh Taylor
Milt Gantenbein
Tony Gonzalez
Dick Todd
Jim Keane
Mal Kutner
Larry Centers
Jimmy Smith
Mike Ditka
Ernie Caddel
Larry Fitzgerald
Harry Ebding
Tom Fears
Jim Benton
Elbie Nickel
Gaynell Tinsley
Keenan McCardell
Perry Schwartz
Red Badgro
Anquan Boldin
Gary Clark
Bobby Mitchell
Paul Riblett
Art Monk
Dante Lavelli
Ahmad Rashad
Bobby Joe Conrad
Reggie Wayne
Bob Mann
Andre Johnson
Jackie Smith
Ray Flaherty
Bobby Walston
Red Phillips
Kellen Winslow
Carroll Dale
James Lofton
Randy Moss
Dutch Clark
Ray McLean
Irving Fryar
Don Hutson
Harold Jackson
Lavvie Dilweg
Steve Largent
Shipwreck Kelly
Bill Sortet
Drew Pearson
Bill Hewitt
Charley Taylor
Ozzie Newsome
Henry Ellard
Dale Burnett
Cris Carter
Wes Chandler
Harold Carmichael
Bill Smith
Luke Johnsos
Cliff Branch
Boyd Dowler
George Wilson
John Greene
Billy Dewell
Drew Hill
Pete Pih