In [1]:
import pandas as pd
import requests
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#### Scrape from TeamRankings

In [2]:
# Create the download directory together with any missing parent directories;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs('./nfl_data/receiving', exist_ok=True)

In [3]:
# One page is downloaded per (stat, split) pair.
stats = ['receiving-receptions', 'receiving-yards', 'receiving-yards-per-catch', 'receiving-touchdowns',
         'receiving-longest-yards', 'receiving-targeted', 'receiving-catch-rate', 'receiving-2pt-conversions-succeeded']

sub_cats = ['home', 'division', 'last_2_weeks', 'last_4_weeks', 'top_10_nfl']

# Paths of every file written below, in download order; later cells read and merge them.
file_list = []

for stat in stats:
    for cat in sub_cats:
        url = f'https://www.teamrankings.com/nfl/player-stat/{stat}?split={cat}&rate=per-game'
        path = f'./nfl_data/receiving/{stat}_{cat}.xls'

        resp = requests.get(url, timeout=30)
        # Fail loudly on an HTTP error instead of saving an error page to disk.
        resp.raise_for_status()

        # The response body is stored as-is and parsed with read_html below; the
        # '.xls' name is kept because later cells refer to these exact paths.
        with open(path, 'wb') as output:
            output.write(resp.content)
        file_list.append(path)

        # Sanity check: parse the first table of the saved page and report its shape.
        df = pd.read_html(path)[0]
        print((stat, cat), df.shape)

('receiving-receptions', 'home') (100, 5)
('receiving-receptions', 'division') (100, 5)
('receiving-receptions', 'last_2_weeks') (100, 5)
('receiving-receptions', 'last_4_weeks') (100, 5)
('receiving-receptions', 'top_10_nfl') (100, 5)
('receiving-yards', 'home') (100, 5)
('receiving-yards', 'division') (100, 5)
('receiving-yards', 'last_2_weeks') (100, 5)
('receiving-yards', 'last_4_weeks') (100, 5)
('receiving-yards', 'top_10_nfl') (100, 5)
('receiving-yards-per-catch', 'home') (100, 5)
('receiving-yards-per-catch', 'division') (100, 5)
('receiving-yards-per-catch', 'last_2_weeks') (12, 5)
('receiving-yards-per-catch', 'last_4_weeks') (39, 5)
('receiving-yards-per-catch', 'top_10_nfl') (100, 5)
('receiving-touchdowns', 'home') (100, 5)
('receiving-touchdowns', 'division') (100, 5)
('receiving-touchdowns', 'last_2_weeks') (100, 5)
('receiving-touchdowns', 'last_4_weeks') (100, 5)
('receiving-touchdowns', 'top_10_nfl') (100, 5)
('receiving-longest-yards', 'home') (100, 5)
('receiving-l

In [4]:
# Seed the merged frame from the first downloaded file
# ('./nfl_data/receiving/receiving-receptions_home.xls') so the remaining files
# have something to merge onto.
first_table = pd.read_html('./nfl_data/receiving/receiving-receptions_home.xls')[0]

# Slice off the leading './nfl_data/' (11 characters) and the trailing '.xls'
# so the value column is named after the file it came from.
col_name = file_list[0][11:-4]

df_wr = (
    pd.DataFrame(first_table)
    .drop(columns='Rank')
    .rename(columns={'Value': col_name})
)

In [5]:
# Read each remaining stat file, name its value column after the file, and
# outer-merge it onto df_wr by player name.

# Lower-case the join key once, before the loop: every row added by a merge
# comes from `df`, whose Player column is already lower-cased below.
df_wr['Player'] = df_wr['Player'].str.lower()

for file in file_list[1:]:
    df = pd.read_html(file)[0]
    # df = df.loc[(df['Pos'] == 'WR')].copy() # filter to only WR's. Uncomment this if only wanting to see WR's
    col_name = file[11:-4]  # drop the leading './nfl_data/' and the trailing '.xls'
    df = (
        df
        .drop(columns=['Rank', 'Team', 'Pos'])
        .rename(columns={'Value': col_name})
    )
    df['Player'] = df['Player'].str.lower()  # cast column to lowercase for join
    df_wr = pd.merge(df_wr, df, on='Player', how='outer')
    print(f'The shape of the merged df is {df_wr.shape}')

The shape of the merged df is (125, 5)
The shape of the merged df is (217, 6)
The shape of the merged df is (272, 7)
The shape of the merged df is (282, 8)
The shape of the merged df is (291, 9)
The shape of the merged df is (294, 10)
The shape of the merged df is (294, 11)
The shape of the merged df is (298, 12)
The shape of the merged df is (304, 13)
The shape of the merged df is (319, 14)
The shape of the merged df is (332, 15)
The shape of the merged df is (332, 16)
The shape of the merged df is (332, 17)
The shape of the merged df is (350, 18)
The shape of the merged df is (367, 19)
The shape of the merged df is (378, 20)
The shape of the merged df is (379, 21)
The shape of the merged df is (392, 22)
The shape of the merged df is (401, 23)
The shape of the merged df is (409, 24)
The shape of the merged df is (413, 25)
The shape of the merged df is (413, 26)
The shape of the merged df is (413, 27)
The shape of the merged df is (416, 28)
The shape of the merged df is (416, 29)
The s

In [6]:
# Attempt to convert every column to float; columns that cannot be converted
# are collected in numcols_to_change2 so they can be cleaned and retried.
numcols_to_change = df_wr.columns
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_wr[col] = df_wr[col].astype(float)
    except (ValueError, TypeError):
        # Only a failed conversion means "needs cleaning"; any other error
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')
    else:
        print('success!')

need to clean column: Player
need to clean column: Team
need to clean column: Pos
success!
success!
need to clean column: receiving/receiving-receptions_last_2_weeks
need to clean column: receiving/receiving-receptions_last_4_weeks
success!
success!
success!
need to clean column: receiving/receiving-yards_last_2_weeks
need to clean column: receiving/receiving-yards_last_4_weeks
success!
success!
success!
success!
success!
success!
success!
success!
need to clean column: receiving/receiving-touchdowns_last_2_weeks
need to clean column: receiving/receiving-touchdowns_last_4_weeks
success!
success!
success!
success!
success!
success!
success!
success!
need to clean column: receiving/receiving-targeted_last_2_weeks
need to clean column: receiving/receiving-targeted_last_4_weeks
success!
need to clean column: receiving/receiving-catch-rate_home
need to clean column: receiving/receiving-catch-rate_division
need to clean column: receiving/receiving-catch-rate_last_2_weeks
need to clean column

In [7]:
# Failed to convert to float are stored in this list
numcols_to_change2

['Player',
 'Team',
 'Pos',
 'receiving/receiving-receptions_last_2_weeks',
 'receiving/receiving-receptions_last_4_weeks',
 'receiving/receiving-yards_last_2_weeks',
 'receiving/receiving-yards_last_4_weeks',
 'receiving/receiving-touchdowns_last_2_weeks',
 'receiving/receiving-touchdowns_last_4_weeks',
 'receiving/receiving-targeted_last_2_weeks',
 'receiving/receiving-targeted_last_4_weeks',
 'receiving/receiving-catch-rate_home',
 'receiving/receiving-catch-rate_division',
 'receiving/receiving-catch-rate_last_2_weeks',
 'receiving/receiving-catch-rate_last_4_weeks',
 'receiving/receiving-catch-rate_top_10_nfl',
 'receiving/receiving-2pt-conversions-succeeded_home',
 'receiving/receiving-2pt-conversions-succeeded_division',
 'receiving/receiving-2pt-conversions-succeeded_last_2_weeks',
 'receiving/receiving-2pt-conversions-succeeded_last_4_weeks',
 'receiving/receiving-2pt-conversions-succeeded_top_10_nfl']

In [8]:
# Drop rows with '--' in row
def drop_rows(position):
    """Remove, in place, every row of ``position`` that contains a "--" cell.

    Parameters
    ----------
    position : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # Build the mask from the frame that was passed in, not from a global, and
    # pass `axis` by keyword (positional use is rejected by newer pandas).
    has_placeholder = position.eq("--").any(axis=1)
    index_list = position[has_placeholder].index
    position.drop(labels=index_list, axis=0, inplace=True)
drop_rows(df_wr)

In [9]:
df_wr.shape

(625, 43)

In [10]:
# Second attempt to convert columns to float
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_wr[col] = df_wr[col].astype(float)
    except (ValueError, TypeError):
        # Only a failed conversion means "needs cleaning"; any other error
        # is allowed to propagate.
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')
    else:
        print('success!')

need to clean column: Player
need to clean column: Team
need to clean column: Pos
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
need to clean column: receiving/receiving-catch-rate_home
need to clean column: receiving/receiving-catch-rate_division
need to clean column: receiving/receiving-catch-rate_last_2_weeks
need to clean column: receiving/receiving-catch-rate_last_4_weeks
need to clean column: receiving/receiving-catch-rate_top_10_nfl
success!
success!
success!
success!
success!


In [11]:
# Remove punctuation from `rate` columns
def remove_punctuation(column, punc):
    """Strip every match of ``punc`` from the values of ``column``, in place.

    Parameters
    ----------
    column : pandas.Series
        Series to clean; it is modified in place.
    punc : str
        Pattern to remove. It is handed to ``Series.replace`` with
        ``regex=True``, so it is interpreted as a regular expression.

    Returns
    -------
    Whatever ``Series.replace`` returns for an in-place call; callers in this
    notebook ignore it and rely on the in-place change.
    """
    outcome = column.replace(to_replace=punc, value='', regex=True, inplace=True)
    return outcome

In [13]:
# Strip the '%' sign from every catch-rate column. The cleaned column is
# assigned back explicitly rather than mutated through a column selection,
# which is not guaranteed to write through to df_wr.
catch_rate_cols = [
    'receiving/receiving-catch-rate_home',
    'receiving/receiving-catch-rate_division',
    'receiving/receiving-catch-rate_last_2_weeks',
    'receiving/receiving-catch-rate_last_4_weeks',
    'receiving/receiving-catch-rate_top_10_nfl',
]
for rate_col in catch_rate_cols:
    df_wr[rate_col] = df_wr[rate_col].replace('%', '', regex=True)

In [14]:
# Third attempt at casting columns to float
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_wr[col] = df_wr[col].astype(float)
    except (ValueError, TypeError):
        # Only a failed conversion means "needs cleaning"; any other error
        # is allowed to propagate.
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')
    else:
        print('success!')

need to clean column: Player
need to clean column: Team
need to clean column: Pos
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!


In [15]:
# Fill NaN's with 0
df_wr.fillna(0, inplace=True)

In [16]:
# Removing suffixes from player name to join df's correctly
def _strip_tr_suffix(name):
    """Return the first two words of ``name`` when its last word is a known suffix."""
    words = name.split()
    if words[-1] in ['iii', 'ii', 'jr', 'jr.']:
        return words[0] + ' ' + words[1]
    return name

df_wr['Player'] = df_wr['Player'].map(_strip_tr_suffix)

In [17]:
# Removes periods in player names to join df's correctly
df_wr['Player'] = df_wr['Player'].map(lambda x: x.replace('.', ''))

In [18]:
df_wr.shape

(625, 43)

#### Bring in Football Reference

In [19]:
# re.sub(r'[^\. a-zA-Z]',' ', name).rstrip()

In [20]:
# Load the locally saved Football Reference receiving table (first table in the file).
fr_tables = pd.read_html('./nfl_data/receiving/2021_receiving_stats.xls')
fr_wr = (
    pd.DataFrame(fr_tables[0])
    .fillna(0)
    .drop(columns=['Rk', 'Age'])
)

# Trim trailing marker characters from the player names.
trailing_marks = '_+!*@#$?^'
fr_wr['Player'] = fr_wr['Player'].map(lambda name: name.rstrip(trailing_marks))
fr_wr.head()

Unnamed: 0,Player,Tm,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,1D,Lng,Y/Tgt,R/G,Y/G,Fmb
0,Cooper Kupp,LAR,WR,17,17,191,145,75.9%,1947,13.4,16,89,59,10.2,8.5,114.5,0
1,Davante Adams,GNB,WR,16,16,169,123,72.8%,1553,12.6,11,84,59,9.2,7.7,97.1,0
2,Tyreek Hill,KAN,WR,17,16,159,111,69.8%,1239,11.2,9,75,75,7.8,6.5,72.9,2
3,Justin Jefferson,MIN,WR,17,17,167,108,64.7%,1616,15.0,10,75,56,9.7,6.4,95.1,1
4,Mark Andrews,BAL,te,17,9,153,107,69.9%,1361,12.7,9,75,43,8.9,6.3,80.1,1


In [21]:
# Remove % from Ctch% column. The cleaned column is assigned back explicitly
# rather than mutated through a column selection, which is not guaranteed to
# write through to fr_wr.
fr_wr['Ctch%'] = fr_wr['Ctch%'].replace('%', '', regex=True)

In [22]:
fr_wr.head()

Unnamed: 0,Player,Tm,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,1D,Lng,Y/Tgt,R/G,Y/G,Fmb
0,Cooper Kupp,LAR,WR,17,17,191,145,75.9,1947,13.4,16,89,59,10.2,8.5,114.5,0
1,Davante Adams,GNB,WR,16,16,169,123,72.8,1553,12.6,11,84,59,9.2,7.7,97.1,0
2,Tyreek Hill,KAN,WR,17,16,159,111,69.8,1239,11.2,9,75,75,7.8,6.5,72.9,2
3,Justin Jefferson,MIN,WR,17,17,167,108,64.7,1616,15.0,10,75,56,9.7,6.4,95.1,1
4,Mark Andrews,BAL,te,17,9,153,107,69.9,1361,12.7,9,75,43,8.9,6.3,80.1,1


In [23]:
fr_wr.dtypes

Player    object
Tm        object
Pos       object
G         object
GS        object
Tgt       object
Rec       object
Ctch%     object
Yds       object
Y/R       object
TD        object
1D        object
Lng       object
Y/Tgt     object
R/G       object
Y/G       object
Fmb       object
dtype: object

In [24]:
def drop_rows(df):
    """Remove, in place, rows that repeat the column headers inside the data.

    A row is dropped when, in any column, its value equals that column's name.

    NOTE: this redefines the ``drop_rows`` declared earlier in the notebook
    (the one that removes "--" rows); from this cell on the name refers to
    this version.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # Check every column, not just the first one.
    header_rows = pd.Series(False, index=df.index)
    for header in df.columns:
        header_rows = header_rows | df[header].eq(header)
    index_list = df.loc[header_rows].index
    df.drop(labels=index_list, axis=0, inplace=True)
drop_rows(fr_wr)

# Every column from position 3 onward is cast to float in one call.
float_cols = fr_wr.columns[3:]
fr_wr = fr_wr.astype({stat_col: float for stat_col in float_cols})

In [25]:
fr_wr.dtypes

Player     object
Tm         object
Pos        object
G         float64
GS        float64
Tgt       float64
Rec       float64
Ctch%     float64
Yds       float64
Y/R       float64
TD        float64
1D        float64
Lng       float64
Y/Tgt     float64
R/G       float64
Y/G       float64
Fmb       float64
dtype: object

In [26]:
# Removing suffixes from player name to join df's correctly.
# The suffix set and the case-insensitive match mirror the TeamRankings cell
# (which runs on lower-cased names and also accepts 'jr' without a period), so
# both sides reduce the same names to the same join key.
def _strip_fr_suffix(name):
    """Return the first two words of ``name`` when its last word is a known suffix."""
    words = name.split()
    if words[-1].lower() in ('iii', 'ii', 'jr', 'jr.'):
        return words[0] + ' ' + words[1]
    return name

fr_wr['Player'] = fr_wr['Player'].map(_strip_fr_suffix)

In [27]:
# Removes periods in player names to join df's correctly
fr_wr['Player'] = fr_wr['Player'].map(lambda x: x.replace('.', ''))

In [28]:
# Cast Players to lowercase to ensure joining df's is correct
fr_wr['Player'] = fr_wr['Player'].str.lower()
fr_wr.head()

Unnamed: 0,Player,Tm,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,1D,Lng,Y/Tgt,R/G,Y/G,Fmb
0,cooper kupp,LAR,WR,17.0,17.0,191.0,145.0,75.9,1947.0,13.4,16.0,89.0,59.0,10.2,8.5,114.5,0.0
1,davante adams,GNB,WR,16.0,16.0,169.0,123.0,72.8,1553.0,12.6,11.0,84.0,59.0,9.2,7.7,97.1,0.0
2,tyreek hill,KAN,WR,17.0,16.0,159.0,111.0,69.8,1239.0,11.2,9.0,75.0,75.0,7.8,6.5,72.9,2.0
3,justin jefferson,MIN,WR,17.0,17.0,167.0,108.0,64.7,1616.0,15.0,10.0,75.0,56.0,9.7,6.4,95.1,1.0
4,mark andrews,BAL,te,17.0,9.0,153.0,107.0,69.9,1361.0,12.7,9.0,75.0,43.0,8.9,6.3,80.1,1.0


#### Merging both dataframes into All

In [29]:
df_wr_all = pd.merge(fr_wr, df_wr, on = ['Player'], how = 'outer')
df_wr_all.shape

(797, 59)

In [30]:
# TeamRankings stat columns are named after their source file and therefore
# all start with this prefix; testing the prefix replaces the previous
# name-length heuristic (len < 18 / len >= 18).
tr_prefix = 'receiving/'
unsuffixed = ['Player', 'Tm', 'Pos_x', 'Pos_y', 'Team']

# Football Reference stat columns get a '_Tot' suffix.
tot_names = {
    col: f'{col}_Tot'
    for col in df_wr_all.columns
    if not col.startswith(tr_prefix) and col not in unsuffixed
}
# TeamRankings stat columns get an '_Avg' suffix.
avg_names = {
    col: f'{col}_Avg'
    for col in df_wr_all.columns
    if col.startswith(tr_prefix)
}

df_wr_all = (
    df_wr_all
    .rename(columns={**tot_names, **avg_names, 'Pos_y': 'Pos', 'Tm': 'Tm_Abr'})
    .drop(columns='Pos_x')  # Drop extra position column
)
df_wr_all['Player'] = df_wr_all['Player'].str.title()  # Convert players back to titlecase
df_wr_all.shape

(797, 58)

In [31]:
# Make sure the output directory exists before writing; index=False leaves the
# row index out of the CSV.
os.makedirs('./clean_data', exist_ok=True)
df_wr_all.to_csv('./clean_data/receiving.csv', index=False)