In [1]:
import pandas as pd
import requests
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#### Scrape from TeamRankings

In [2]:
if not os.path.exists('./nfl_data/rushing'):
    os.mkdir('./nfl_data/rushing')

In [4]:
stats = ['rushing-plays', 'rushing-net-yards', 'rushing-yards-per-attempt', 'rushing-touchdowns', 
        'rushing-longest-yards', 'rushing-2pt-conversions-succeeded']

sub_cats = ['home', 'division', 'last_2_weeks','last_4_weeks', 'top_10_nfl']

file_list = []

for stat in stats:
    for cat in sub_cats:
        url = f'https://www.teamrankings.com/nfl/player-stat/{stat}?split={cat}&rate=per-game'
        resp = requests.get(url)
        output = open(f'./nfl_data/rushing/{stat}_{cat}.xls', 'wb')
        output.write(resp.content)
        file_list.append(f'./nfl_data/rushing/{stat}_{cat}.xls')
        output.close() 
        data = pd.read_html(f'./nfl_data/rushing/{stat}_{cat}.xls')
        df = pd.DataFrame(data[0]) 
        print((stat, cat), df.shape)

('rushing-plays', 'home') (100, 5)
('rushing-plays', 'division') (100, 5)
('rushing-plays', 'last_2_weeks') (100, 5)
('rushing-plays', 'last_4_weeks') (100, 5)
('rushing-plays', 'top_10_nfl') (100, 5)
('rushing-net-yards', 'home') (100, 5)
('rushing-net-yards', 'division') (100, 5)
('rushing-net-yards', 'last_2_weeks') (100, 5)
('rushing-net-yards', 'last_4_weeks') (100, 5)
('rushing-net-yards', 'top_10_nfl') (100, 5)
('rushing-yards-per-attempt', 'home') (67, 5)
('rushing-yards-per-attempt', 'division') (88, 5)
('rushing-yards-per-attempt', 'last_2_weeks') (2, 5)
('rushing-yards-per-attempt', 'last_4_weeks') (11, 5)
('rushing-yards-per-attempt', 'top_10_nfl') (100, 5)
('rushing-touchdowns', 'home') (100, 5)
('rushing-touchdowns', 'division') (100, 5)
('rushing-touchdowns', 'last_2_weeks') (100, 5)
('rushing-touchdowns', 'last_4_weeks') (100, 5)
('rushing-touchdowns', 'top_10_nfl') (100, 5)
('rushing-longest-yards', 'home') (100, 5)
('rushing-longest-yards', 'division') (100, 5)
('rush

In [5]:
# get df started with the 1st file './nfl_data/rushing-yards-per-attempt_top_10_nfl.xls'.  This way we have something to merge to.

df_rb = pd.read_html('./nfl_data/rushing/rushing-plays_home.xls')
df_rb = pd.DataFrame(df_rb[0])
# df_rb = df_rb.loc[(df_rb['Pos'] == 'RB')].copy() # filter out non-RBs if wanting to model for ONLY RB's (non-Flex)
df_rb.drop(columns = 'Rank', inplace = True) 
col_name = file_list[0][27:-4]
df_rb.rename(columns = {'Value': col_name}, inplace = True)


In [6]:
df_rb.shape

(100, 4)

In [8]:
for file in file_list[1:]:
    data = pd.read_html(file)
    df = pd.DataFrame(data[0])
    # df = df.loc[(df['Pos'] == 'RB')].copy() # filter out non-RBs if wanting to model for ONLY RB's (non-Flex)
    df.drop(columns = ['Rank', 'Team', 'Pos'], inplace = True)
    col_name = file[27:-4]
    df.rename(columns = {'Value': col_name}, inplace = True)
    df_rb['Player'] = df_rb['Player'].str.lower() # cast column to lowercase for join
    df['Player'] = df['Player'].str.lower() # cast column to lowercase for join
    df_rb = pd.merge(df_rb, df, on = 'Player', how = 'outer')
    print(f'The shape of the merged df is {df_rb.shape}')

The shape of the merged df is (114, 5)
The shape of the merged df is (209, 6)
The shape of the merged df is (272, 7)
The shape of the merged df is (285, 8)
The shape of the merged df is (288, 9)
The shape of the merged df is (291, 10)
The shape of the merged df is (292, 11)
The shape of the merged df is (297, 12)
The shape of the merged df is (301, 13)
The shape of the merged df is (301, 14)
The shape of the merged df is (301, 15)
The shape of the merged df is (301, 16)
The shape of the merged df is (301, 17)
The shape of the merged df is (307, 18)
The shape of the merged df is (318, 19)
The shape of the merged df is (330, 20)
The shape of the merged df is (330, 21)
The shape of the merged df is (338, 22)
The shape of the merged df is (349, 23)
The shape of the merged df is (356, 24)
The shape of the merged df is (364, 25)
The shape of the merged df is (364, 26)
The shape of the merged df is (364, 27)
The shape of the merged df is (371, 28)
The shape of the merged df is (450, 29)
The s

In [9]:
df_rb.columns

Index(['Player', 'Team', 'Pos', 'plays_home', 'plays_division',
       'plays_last_2_weeks', 'plays_last_4_weeks', 'plays_top_10_nfl',
       'net-yards_home', 'net-yards_division', 'net-yards_last_2_weeks',
       'net-yards_last_4_weeks', 'net-yards_top_10_nfl',
       'yards-per-attempt_home', 'yards-per-attempt_division',
       'yards-per-attempt_last_2_weeks', 'yards-per-attempt_last_4_weeks',
       'yards-per-attempt_top_10_nfl', 'touchdowns_home',
       'touchdowns_division', 'touchdowns_last_2_weeks',
       'touchdowns_last_4_weeks', 'touchdowns_top_10_nfl',
       'longest-yards_home', 'longest-yards_division',
       'longest-yards_last_2_weeks', 'longest-yards_last_4_weeks',
       'longest-yards_top_10_nfl', '2pt-conversions-succeeded_home',
       '2pt-conversions-succeeded_division',
       '2pt-conversions-succeeded_last_2_weeks',
       '2pt-conversions-succeeded_last_4_weeks',
       '2pt-conversions-succeeded_top_10_nfl'],
      dtype='object')

In [10]:
df_rb.shape

(587, 33)

In [11]:
numcols_to_change = df_rb.columns
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_rb[col] = df_rb[col].astype(float)
        print('success!')
    except:
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')

need to clean column: Player
need to clean column: Team
need to clean column: Pos
success!
success!
need to clean column: plays_last_2_weeks
need to clean column: plays_last_4_weeks
success!
success!
success!
need to clean column: net-yards_last_2_weeks
need to clean column: net-yards_last_4_weeks
success!
success!
success!
success!
success!
success!
success!
success!
need to clean column: touchdowns_last_2_weeks
need to clean column: touchdowns_last_4_weeks
need to clean column: touchdowns_top_10_nfl
success!
success!
success!
success!
success!
need to clean column: 2pt-conversions-succeeded_home
need to clean column: 2pt-conversions-succeeded_division
need to clean column: 2pt-conversions-succeeded_last_2_weeks
need to clean column: 2pt-conversions-succeeded_last_4_weeks
need to clean column: 2pt-conversions-succeeded_top_10_nfl


In [12]:
numcols_to_change2

['Player',
 'Team',
 'Pos',
 'plays_last_2_weeks',
 'plays_last_4_weeks',
 'net-yards_last_2_weeks',
 'net-yards_last_4_weeks',
 'touchdowns_last_2_weeks',
 'touchdowns_last_4_weeks',
 'touchdowns_top_10_nfl',
 '2pt-conversions-succeeded_home',
 '2pt-conversions-succeeded_division',
 '2pt-conversions-succeeded_last_2_weeks',
 '2pt-conversions-succeeded_last_4_weeks',
 '2pt-conversions-succeeded_top_10_nfl']

In [13]:
def drop_rows(position):
    for header in position.columns:
        index_list = df_rb[df_rb.eq("--").any(1)].index
        position.drop(labels=index_list, axis=0, inplace=True)
        return

In [14]:
drop_rows(df_rb)

In [15]:
df_rb.shape

(543, 33)

In [16]:
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_rb[col] = df_rb[col].astype(float)
        print('success!')
    except:
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')

need to clean column: Player
need to clean column: Team
need to clean column: Pos
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!


In [17]:
df_rb.dtypes

Player                                     object
Team                                       object
Pos                                        object
plays_home                                float64
plays_division                            float64
plays_last_2_weeks                        float64
plays_last_4_weeks                        float64
plays_top_10_nfl                          float64
net-yards_home                            float64
net-yards_division                        float64
net-yards_last_2_weeks                    float64
net-yards_last_4_weeks                    float64
net-yards_top_10_nfl                      float64
yards-per-attempt_home                    float64
yards-per-attempt_division                float64
yards-per-attempt_last_2_weeks            float64
yards-per-attempt_last_4_weeks            float64
yards-per-attempt_top_10_nfl              float64
touchdowns_home                           float64
touchdowns_division                       float64


In [18]:
df_rb.fillna(0, inplace=True)

In [19]:
df_rb.shape

(543, 33)

In [20]:
# Removing suffixes from player name to join df's correctly
df_rb['Player'] = df_rb['Player'].map(lambda x: x.split()[0] + ' ' + x.split()[1] if x.split()[-1] in ['iii', 'ii', 'jr', 'jr.'] else x)

In [21]:
# Removes periods in player names to join df's correctly
df_rb['Player'] = df_rb['Player'].map(lambda x: x.replace('.', ''))

In [22]:
df_rb.shape

(543, 33)

#### Bring in Football Reference

In [23]:
#bring in football reference data
fr_rb = pd.read_html('./nfl_data/rushing/2021_rushing_stats.xls')
fr_rb = pd.DataFrame(fr_rb[0]) # Saves df var to dataframe
fr_rb.fillna(0, inplace=True)
fr_rb = fr_rb.droplevel(0, axis=1)
fr_rb['Player'] = fr_rb['Player'].map(lambda x: x.rstrip('_+!*@#$?^'))
fr_rb.drop(columns = ['Rk', 'Age'], inplace = True)
fr_rb.head(15)

Unnamed: 0,Player,Tm,Pos,G,GS,Att,Yds,TD,1D,Lng,Y/A,Y/G,Fmb
0,Jonathan Taylor,IND,RB,17,17,332,1811,18,107,83,5.5,106.5,4
1,Najee Harris,PIT,RB,17,17,307,1200,7,62,37,3.9,70.6,0
2,Joe Mixon,CIN,RB,16,16,292,1205,13,60,32,4.1,75.3,2
3,Antonio Gibson,WAS,RB,16,14,258,1037,7,65,27,4.0,64.8,6
4,Dalvin Cook,MIN,RB,13,13,249,1159,6,57,66,4.7,89.2,3
5,Alvin Kamara,NOR,rb,13,10,240,898,4,42,30,3.7,69.1,0
6,Ezekiel Elliott,DAL,RB,17,17,237,1002,10,55,47,4.2,58.9,1
7,Nick Chubb,CLE,RB,14,14,228,1259,8,61,70,5.5,89.9,2
8,David Montgomery,CHI,RB,13,13,225,849,7,55,41,3.8,65.3,1
9,Derrick Henry,TEN,rb,8,8,219,937,10,49,76,4.3,117.1,1


In [24]:
fr_rb.shape

(383, 13)

In [25]:
fr_rb.dtypes

Player    object
Tm        object
Pos       object
G         object
GS        object
Att       object
Yds       object
TD        object
1D        object
Lng       object
Y/A       object
Y/G       object
Fmb       object
dtype: object

In [26]:
def drop_rows(df):
    for header in df.columns:
        index_list = df.loc[(df[header] == header)].index
        df.drop(labels=index_list, axis=0, inplace = True)
        return
drop_rows(fr_rb)

float_cols = fr_rb.columns[3:]

for float_col in float_cols:
    fr_rb[f'{float_col}'] = fr_rb[f'{float_col}'].astype(float)

In [27]:
fr_rb.shape

(371, 13)

In [28]:
# Removing suffixes from player name to join df's correctly
fr_rb['Player'] = fr_rb['Player'].map(lambda x: x.split()[0] + ' ' + x.split()[1] if x.split()[-1] in ['III', 'II', 'Jr.'] else x)

In [29]:
# Removes periods in player names to join df's correctly
fr_rb['Player'] = fr_rb['Player'].map(lambda x: x.replace('.', ''))

In [30]:
# Cast Players to lowercase to ensure joining df's is correct
fr_rb['Player'] = fr_rb['Player'].str.lower()
fr_rb.head()

Unnamed: 0,Player,Tm,Pos,G,GS,Att,Yds,TD,1D,Lng,Y/A,Y/G,Fmb
0,jonathan taylor,IND,RB,17.0,17.0,332.0,1811.0,18.0,107.0,83.0,5.5,106.5,4.0
1,najee harris,PIT,RB,17.0,17.0,307.0,1200.0,7.0,62.0,37.0,3.9,70.6,0.0
2,joe mixon,CIN,RB,16.0,16.0,292.0,1205.0,13.0,60.0,32.0,4.1,75.3,2.0
3,antonio gibson,WAS,RB,16.0,14.0,258.0,1037.0,7.0,65.0,27.0,4.0,64.8,6.0
4,dalvin cook,MIN,RB,13.0,13.0,249.0,1159.0,6.0,57.0,66.0,4.7,89.2,3.0


#### Merging both dataframes into All

In [31]:
df_rb_all = pd.merge(fr_rb, df_rb, on = ['Player'], how = 'outer')
df_rb_all.shape

(685, 45)

In [32]:
df_rb_all.head()

Unnamed: 0,Player,Tm,Pos_x,G,GS,Att,Yds,TD,1D,Lng,Y/A,Y/G,Fmb,Team,Pos_y,plays_home,plays_division,plays_last_2_weeks,plays_last_4_weeks,plays_top_10_nfl,net-yards_home,net-yards_division,net-yards_last_2_weeks,net-yards_last_4_weeks,net-yards_top_10_nfl,yards-per-attempt_home,yards-per-attempt_division,yards-per-attempt_last_2_weeks,yards-per-attempt_last_4_weeks,yards-per-attempt_top_10_nfl,touchdowns_home,touchdowns_division,touchdowns_last_2_weeks,touchdowns_last_4_weeks,touchdowns_top_10_nfl,longest-yards_home,longest-yards_division,longest-yards_last_2_weeks,longest-yards_last_4_weeks,longest-yards_top_10_nfl,2pt-conversions-succeeded_home,2pt-conversions-succeeded_division,2pt-conversions-succeeded_last_2_weeks,2pt-conversions-succeeded_last_4_weeks,2pt-conversions-succeeded_top_10_nfl
0,jonathan taylor,IND,RB,17.0,17.0,332.0,1811.0,18.0,107.0,83.0,5.5,106.5,4.0,Indianapolis Colts,RB,18.56,18.0,0.0,0.0,17.71,107.89,102.5,0.0,0.0,88.0,5.8,5.7,0.0,0.0,5.0,1.0,1.0,0.0,0.0,1.0,83.0,83.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0
1,najee harris,PIT,RB,17.0,17.0,307.0,1200.0,7.0,62.0,37.0,3.9,70.6,0.0,Pittsburgh Steelers,RB,20.0,18.0,0.0,0.0,15.0,80.56,73.5,0.0,0.0,48.88,4.0,4.1,0.0,0.0,3.3,0.33,0.33,0.0,0.0,0.13,37.0,37.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0
2,joe mixon,CIN,RB,16.0,16.0,292.0,1205.0,13.0,60.0,32.0,4.1,75.3,2.0,Cincinnati Bengals,HB,18.0,17.8,15.0,16.67,15.0,72.7,88.6,72.0,71.33,58.5,4.0,5.0,4.8,4.3,3.9,0.9,1.2,0.0,0.33,0.33,32.0,32.0,14.0,23.0,23.0,0.0,0.0,0.0,0.0,0.0
3,antonio gibson,WAS,RB,16.0,14.0,258.0,1037.0,7.0,65.0,27.0,4.0,64.8,6.0,Washington Commanders,RB,18.0,13.0,0.0,0.0,15.0,67.71,61.2,0.0,0.0,52.29,3.8,4.7,0.0,0.0,3.5,0.57,0.4,0.0,0.0,0.29,27.0,27.0,0.0,0.0,17.0,0.14,0.0,0.0,0.0,0.14
4,dalvin cook,MIN,RB,13.0,13.0,249.0,1159.0,6.0,57.0,66.0,4.7,89.2,3.0,Minnesota Vikings,RB,18.0,18.25,0.0,0.0,15.8,96.4,66.75,0.0,0.0,55.4,5.4,3.7,0.0,0.0,3.5,0.6,0.25,0.0,0.0,0.4,30.0,29.0,0.0,0.0,30.0,0.2,0.25,0.0,0.0,0.2


In [33]:
x = {i : f'{i}_Tot' for i in df_rb_all.columns if len(i) < 18 and i not in ['Player', 'Pos_x', 'Pos_y', 'Tm', 'Team']}
df_rb_all = df_rb_all.rename(columns = x)
df_rb_all.drop(columns='Pos_x', inplace=True) # Drop extra position column
df_rb_all.rename(columns={'Pos_y': 'Pos'}, inplace=True)

x = {i : f'{i}_Avg' for i in df_rb_all.columns if len(i) >= 18}
df_rb_all = df_rb_all.rename(columns = x)
df_rb_all.rename(columns={'Tm': 'Tm_Abr'}, inplace=True)
df_rb_all['Player'] = df_rb_all['Player'].str.title() # Convert players back to titlecase
df_rb_all.shape

(685, 44)

In [34]:
df_rb_all.head()

Unnamed: 0,Player,Tm_Abr,G_Tot,GS_Tot,Att_Tot,Yds_Tot,TD_Tot,1D_Tot,Lng_Tot,Y/A_Tot,Y/G_Tot,Fmb_Tot,Team,Pos,plays_home_Tot,plays_division_Tot_Avg,plays_last_2_weeks_Avg,plays_last_4_weeks_Avg,plays_top_10_nfl_Tot_Avg,net-yards_home_Tot_Avg,net-yards_division_Avg,net-yards_last_2_weeks_Avg,net-yards_last_4_weeks_Avg,net-yards_top_10_nfl_Avg,yards-per-attempt_home_Avg,yards-per-attempt_division_Avg,yards-per-attempt_last_2_weeks_Avg,yards-per-attempt_last_4_weeks_Avg,yards-per-attempt_top_10_nfl_Avg,touchdowns_home_Tot_Avg,touchdowns_division_Avg,touchdowns_last_2_weeks_Avg,touchdowns_last_4_weeks_Avg,touchdowns_top_10_nfl_Avg,longest-yards_home_Avg,longest-yards_division_Avg,longest-yards_last_2_weeks_Avg,longest-yards_last_4_weeks_Avg,longest-yards_top_10_nfl_Avg,2pt-conversions-succeeded_home_Avg,2pt-conversions-succeeded_division_Avg,2pt-conversions-succeeded_last_2_weeks_Avg,2pt-conversions-succeeded_last_4_weeks_Avg,2pt-conversions-succeeded_top_10_nfl_Avg
0,Jonathan Taylor,IND,17.0,17.0,332.0,1811.0,18.0,107.0,83.0,5.5,106.5,4.0,Indianapolis Colts,RB,18.56,18.0,0.0,0.0,17.71,107.89,102.5,0.0,0.0,88.0,5.8,5.7,0.0,0.0,5.0,1.0,1.0,0.0,0.0,1.0,83.0,83.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0
1,Najee Harris,PIT,17.0,17.0,307.0,1200.0,7.0,62.0,37.0,3.9,70.6,0.0,Pittsburgh Steelers,RB,20.0,18.0,0.0,0.0,15.0,80.56,73.5,0.0,0.0,48.88,4.0,4.1,0.0,0.0,3.3,0.33,0.33,0.0,0.0,0.13,37.0,37.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0
2,Joe Mixon,CIN,16.0,16.0,292.0,1205.0,13.0,60.0,32.0,4.1,75.3,2.0,Cincinnati Bengals,HB,18.0,17.8,15.0,16.67,15.0,72.7,88.6,72.0,71.33,58.5,4.0,5.0,4.8,4.3,3.9,0.9,1.2,0.0,0.33,0.33,32.0,32.0,14.0,23.0,23.0,0.0,0.0,0.0,0.0,0.0
3,Antonio Gibson,WAS,16.0,14.0,258.0,1037.0,7.0,65.0,27.0,4.0,64.8,6.0,Washington Commanders,RB,18.0,13.0,0.0,0.0,15.0,67.71,61.2,0.0,0.0,52.29,3.8,4.7,0.0,0.0,3.5,0.57,0.4,0.0,0.0,0.29,27.0,27.0,0.0,0.0,17.0,0.14,0.0,0.0,0.0,0.14
4,Dalvin Cook,MIN,13.0,13.0,249.0,1159.0,6.0,57.0,66.0,4.7,89.2,3.0,Minnesota Vikings,RB,18.0,18.25,0.0,0.0,15.8,96.4,66.75,0.0,0.0,55.4,5.4,3.7,0.0,0.0,3.5,0.6,0.25,0.0,0.0,0.4,30.0,29.0,0.0,0.0,30.0,0.2,0.25,0.0,0.0,0.2


In [None]:
df_rb_all.to_csv('./clean_data/rushing.csv', index=0)