In [44]:
import pandas as pd
import requests
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#### Scrape from TeamRankings

In [45]:
if not os.path.exists('./nfl_data/passing'):
    os.mkdir('./nfl_data/passing')

In [46]:
stats = ['passing-plays-completed', 'passing-plays-attempted', 'passing-gross-yards', 'passing-touchdowns', 
        'passing-plays-intercepted', 'passing-longest-yards', 'qb-rating-nfl', 'passing-2pt-conversions-succeeded']

sub_cats = ['home', 'away', 'division', 'last_2_weeks','last_4_weeks', 'top_10_nfl']

file_list = []

for stat in stats:
    for cat in sub_cats:
        url = f'https://www.teamrankings.com/nfl/player-stat/{stat}?split={cat}&rate=per-game'
        resp = requests.get(url)
        output = open(f'./nfl_data/passing/{stat}_{cat}.xls', 'wb')
        output.write(resp.content)
        file_list.append(f'./nfl_data/passing/{stat}_{cat}.xls')
        output.close() 
        data = pd.read_html(f'./nfl_data/passing/{stat}_{cat}.xls')
        df = pd.DataFrame(data[0]) 
        print((stat, cat), df.shape)

('passing-plays-completed', 'home') (100, 5)
('passing-plays-completed', 'away') (100, 5)
('passing-plays-completed', 'division') (100, 5)
('passing-plays-completed', 'last_2_weeks') (100, 5)
('passing-plays-completed', 'last_4_weeks') (100, 5)
('passing-plays-completed', 'top_10_nfl') (100, 5)
('passing-plays-attempted', 'home') (100, 5)
('passing-plays-attempted', 'away') (100, 5)
('passing-plays-attempted', 'division') (100, 5)
('passing-plays-attempted', 'last_2_weeks') (100, 5)
('passing-plays-attempted', 'last_4_weeks') (100, 5)
('passing-plays-attempted', 'top_10_nfl') (100, 5)
('passing-gross-yards', 'home') (100, 5)
('passing-gross-yards', 'away') (100, 5)
('passing-gross-yards', 'division') (100, 5)
('passing-gross-yards', 'last_2_weeks') (100, 5)
('passing-gross-yards', 'last_4_weeks') (100, 5)
('passing-gross-yards', 'top_10_nfl') (100, 5)
('passing-touchdowns', 'home') (100, 5)
('passing-touchdowns', 'away') (100, 5)
('passing-touchdowns', 'division') (100, 5)
('passing-to

In [47]:
# get df started with the 1st file './nfl_data/passing-plays-completed_home.xls'.  This way we have something to merge to.

df_qb = pd.read_html('./nfl_data/passing/passing-plays-completed_home.xls')
df_qb = pd.DataFrame(df_qb[0]) 
df_qb = df_qb.loc[(df_qb['Pos'] == 'QB')].copy() # filter out non-QBs
df_qb.drop(columns = 'Rank', inplace = True) 
col_name = file_list[0][19:-4]
df_qb.rename(columns = {'Value': col_name}, inplace = True)

In [48]:
for file in file_list[1:]:
    data = pd.read_html(file)
    df = pd.DataFrame(data[0])
    df = df.loc[(df['Pos'] == 'QB')].copy()
    df.drop(columns = 'Rank', inplace = True) 
    col_name = file[19:-4]
    df.rename(columns = {'Value': col_name}, inplace = True)    
    df_qb = pd.merge(df_qb, df, on = ['Player', 'Team', 'Pos'], how = 'outer')
    print(f'The shape of the merged df is {df_qb.shape}')

The shape of the merged df is (74, 5)
The shape of the merged df is (75, 6)
The shape of the merged df is (76, 7)
The shape of the merged df is (76, 8)
The shape of the merged df is (77, 9)
The shape of the merged df is (78, 10)
The shape of the merged df is (80, 11)
The shape of the merged df is (80, 12)
The shape of the merged df is (80, 13)
The shape of the merged df is (80, 14)
The shape of the merged df is (80, 15)
The shape of the merged df is (80, 16)
The shape of the merged df is (80, 17)
The shape of the merged df is (80, 18)
The shape of the merged df is (80, 19)
The shape of the merged df is (80, 20)
The shape of the merged df is (80, 21)
The shape of the merged df is (80, 22)
The shape of the merged df is (81, 23)
The shape of the merged df is (81, 24)
The shape of the merged df is (81, 25)
The shape of the merged df is (81, 26)
The shape of the merged df is (81, 27)
The shape of the merged df is (82, 28)
The shape of the merged df is (82, 29)
The shape of the merged df is 

In [49]:
df_qb.columns

Index(['Player', 'Team', 'Pos', 'passing-plays-completed_home',
       'passing-plays-completed_away', 'passing-plays-completed_division',
       'passing-plays-completed_last_2_weeks',
       'passing-plays-completed_last_4_weeks',
       'passing-plays-completed_top_10_nfl', 'passing-plays-attempted_home',
       'passing-plays-attempted_away', 'passing-plays-attempted_division',
       'passing-plays-attempted_last_2_weeks',
       'passing-plays-attempted_last_4_weeks',
       'passing-plays-attempted_top_10_nfl', 'passing-gross-yards_home',
       'passing-gross-yards_away', 'passing-gross-yards_division',
       'passing-gross-yards_last_2_weeks', 'passing-gross-yards_last_4_weeks',
       'passing-gross-yards_top_10_nfl', 'passing-touchdowns_home',
       'passing-touchdowns_away', 'passing-touchdowns_division',
       'passing-touchdowns_last_2_weeks', 'passing-touchdowns_last_4_weeks',
       'passing-touchdowns_top_10_nfl', 'passing-plays-intercepted_home',
       'passing-pl

In [50]:
numcols_to_change = df_qb.columns

In [51]:
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_qb[col] = df_qb[col].astype(float)
        print('success!')
    except:
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')

need to clean column: Player
need to clean column: Team
need to clean column: Pos
success!
success!
need to clean column: passing-plays-completed_division
need to clean column: passing-plays-completed_last_2_weeks
need to clean column: passing-plays-completed_last_4_weeks
need to clean column: passing-plays-completed_top_10_nfl
success!
need to clean column: passing-plays-attempted_away
success!
need to clean column: passing-plays-attempted_last_2_weeks
need to clean column: passing-plays-attempted_last_4_weeks
success!
success!
need to clean column: passing-gross-yards_away
need to clean column: passing-gross-yards_division
need to clean column: passing-gross-yards_last_2_weeks
need to clean column: passing-gross-yards_last_4_weeks
success!
success!
success!
need to clean column: passing-touchdowns_division
need to clean column: passing-touchdowns_last_2_weeks
need to clean column: passing-touchdowns_last_4_weeks
success!
need to clean column: passing-plays-intercepted_home
need to cl

In [52]:
numcols_to_change2

['Player',
 'Team',
 'Pos',
 'passing-plays-completed_division',
 'passing-plays-completed_last_2_weeks',
 'passing-plays-completed_last_4_weeks',
 'passing-plays-completed_top_10_nfl',
 'passing-plays-attempted_away',
 'passing-plays-attempted_last_2_weeks',
 'passing-plays-attempted_last_4_weeks',
 'passing-gross-yards_away',
 'passing-gross-yards_division',
 'passing-gross-yards_last_2_weeks',
 'passing-gross-yards_last_4_weeks',
 'passing-touchdowns_division',
 'passing-touchdowns_last_2_weeks',
 'passing-touchdowns_last_4_weeks',
 'passing-plays-intercepted_home',
 'passing-plays-intercepted_away',
 'passing-plays-intercepted_division',
 'passing-plays-intercepted_last_2_weeks',
 'passing-plays-intercepted_last_4_weeks',
 'passing-plays-intercepted_top_10_nfl',
 'passing-2pt-conversions-succeeded_home',
 'passing-2pt-conversions-succeeded_away',
 'passing-2pt-conversions-succeeded_division',
 'passing-2pt-conversions-succeeded_last_2_weeks',
 'passing-2pt-conversions-succeeded_las

In [53]:
def drop_rows(position):
    for header in position.columns:
        index_list = df_qb[df_qb.eq("--").any(1)].index
        position.drop(labels=index_list, axis=0, inplace=True)
        return

In [54]:
drop_rows(df_qb)

In [55]:
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_qb[col] = df_qb[col].astype(float)
        print('success!')
    except:
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')

need to clean column: Player
need to clean column: Team
need to clean column: Pos
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!


In [56]:
df_qb.dtypes

Player                                             object
Team                                               object
Pos                                                object
passing-plays-completed_home                      float64
passing-plays-completed_away                      float64
passing-plays-completed_division                  float64
passing-plays-completed_last_2_weeks              float64
passing-plays-completed_last_4_weeks              float64
passing-plays-completed_top_10_nfl                float64
passing-plays-attempted_home                      float64
passing-plays-attempted_away                      float64
passing-plays-attempted_division                  float64
passing-plays-attempted_last_2_weeks              float64
passing-plays-attempted_last_4_weeks              float64
passing-plays-attempted_top_10_nfl                float64
passing-gross-yards_home                          float64
passing-gross-yards_away                          float64
passing-gross-

In [57]:
df_qb.fillna(0, inplace=True)

In [58]:
# Removing suffixes from player name to join df's correctly
df_qb['Player'] = df_qb['Player'].map(lambda x: x.split()[0] + ' ' + x.split()[1])

#### Bring in Football Reference

In [59]:
#bring in football reference data
fr_qb = pd.read_html('./nfl_data/passing/2021_passing_stats.xls')
fr_qb = pd.DataFrame(fr_qb[0]) # Saves df var to dataframe
fr_qb.fillna(0, inplace=True)
fr_qb['Player'] = fr_qb['Player'].map(lambda x: x.lstrip('*').rstrip('*').rstrip('+').rstrip('*'))
fr_qb.drop(columns = ['Rk', 'Age', 'QBrec', '4QC', 'GWD', 'G', 'GS'], inplace = True)
fr_qb.head(15)

Unnamed: 0,Player,Tm,Pos,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,1D,Lng,Y/A,AY/A,Y/C,Y/G,Rate,QBR,Sk,Yds.1,Sk%,NY/A,ANY/A
0,Tom Brady,TAM,QB,485,719,67.5,5316,43,6.0,12,1.7,269,62,7.4,7.8,11.0,312.7,102.1,68.1,22,144,3.0,6.98,7.41
1,Justin Herbert,LAC,QB,443,672,65.9,5014,38,5.7,15,2.2,256,72,7.5,7.6,11.3,294.9,97.7,65.6,31,214,4.4,6.83,6.95
2,Matthew Stafford,LAR,QB,404,601,67.2,4886,41,6.8,17,2.8,233,79,8.1,8.2,12.1,287.4,102.9,63.8,30,243,4.8,7.36,7.45
3,Patrick Mahomes,KAN,QB,436,658,66.3,4839,37,5.6,13,2.0,260,75,7.4,7.6,11.1,284.6,98.5,62.2,28,146,4.1,6.84,7.07
4,Derek Carr,LVR,QB,428,626,68.4,4804,23,3.7,14,2.2,217,61,7.7,7.4,11.2,282.6,94.0,52.4,40,241,6.0,6.85,6.6
5,Joe Burrow,CIN,QB,366,520,70.4,4611,34,6.5,14,2.7,202,82,8.9,9.0,12.6,288.2,108.3,54.3,51,370,8.9,7.43,7.51
6,Dak Prescott,DAL,QB,410,596,68.8,4449,37,6.2,10,1.7,227,51,7.5,8.0,10.9,278.1,104.2,54.6,30,144,4.8,6.88,7.34
7,Josh Allen,BUF,QB,409,646,63.3,4407,36,5.6,15,2.3,234,61,6.8,6.9,10.8,259.2,92.2,60.7,26,164,3.9,6.31,6.38
8,Kirk Cousins,MIN,QB,372,561,66.3,4221,33,5.9,7,1.2,192,64,7.5,8.1,11.3,263.8,103.1,52.3,28,197,4.8,6.83,7.42
9,Aaron Rodgers,GNB,QB,366,531,68.9,4115,37,7.0,4,0.8,213,75,7.7,8.8,11.2,257.2,111.9,69.1,30,188,5.3,7.0,8.0


In [60]:
def drop_rows(df):
    for header in df.columns:
        index_list = df.loc[(df[header] == header)].index
        df.drop(labels=index_list, axis=0, inplace = True)
        return
drop_rows(fr_qb)

int_cols = ['Cmp', 'Att', 'Yds', 'TD', 'Int', '1D', 'Lng', 'Sk', 'Yds.1']
float_cols = ['Cmp%', 'TD%', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 'Rate', 'NY/A', 'ANY/A', 'Sk%']

for int_col in int_cols:
    fr_qb[f'{int_col}'] = fr_qb[f'{int_col}'].astype(int)

for float_col in float_cols:
    fr_qb[f'{float_col}'] = fr_qb[f'{float_col}'].astype(float)

In [61]:
# Removing suffixes from player name to join df's correctly
fr_qb['Player'] = fr_qb['Player'].map(lambda x: x.split()[0] + ' ' + x.split()[1])

#### Merge all into one df

In [62]:
df_qb_all = pd.merge(fr_qb, df_qb, on = ['Player'], how = 'inner')

x = {i : f'{i}_Tot' for i in df_qb_all.columns if len(i) < 18 and i not in ['Player', 'Tm', 'Pos', 'Team']}

df_qb_all = df_qb_all.rename(columns = x)

df_qb_all.rename(columns={'Pos_y_Tot': 'Pos'}, inplace=True)

df_qb_all.drop(columns='Pos_x_Tot', inplace=True)

x = {i : f'{i}_Avg' for i in df_qb_all.columns if len(i) > 18}
df_qb_all = df_qb_all.rename(columns = x)

df_qb_all.rename(columns={'Tm': 'Tm_Abr'}, inplace=True)
df_qb_all.shape

(71, 73)

In [63]:
df_qb_all.head()

Unnamed: 0,Player,Tm_Abr,Cmp_Tot,Att_Tot,Cmp%_Tot,Yds_Tot,TD_Tot,TD%_Tot,Int_Tot,Int%_Tot,1D_Tot,Lng_Tot,Y/A_Tot,AY/A_Tot,Y/C_Tot,Y/G_Tot,Rate_Tot,QBR_Tot,Sk_Tot,Yds.1_Tot,Sk%_Tot,NY/A_Tot,ANY/A_Tot,Team,Pos,passing-plays-completed_home_Avg,passing-plays-completed_away_Avg,passing-plays-completed_division_Avg,passing-plays-completed_last_2_weeks_Avg,passing-plays-completed_last_4_weeks_Avg,passing-plays-completed_top_10_nfl_Avg,passing-plays-attempted_home_Avg,passing-plays-attempted_away_Avg,passing-plays-attempted_division_Avg,passing-plays-attempted_last_2_weeks_Avg,passing-plays-attempted_last_4_weeks_Avg,passing-plays-attempted_top_10_nfl_Avg,passing-gross-yards_home_Avg,passing-gross-yards_away_Avg,passing-gross-yards_division_Avg,passing-gross-yards_last_2_weeks_Avg,passing-gross-yards_last_4_weeks_Avg,passing-gross-yards_top_10_nfl_Avg,passing-touchdowns_home_Avg,passing-touchdowns_away_Avg,passing-touchdowns_division_Avg,passing-touchdowns_last_2_weeks_Avg,passing-touchdowns_last_4_weeks_Avg,passing-touchdowns_top_10_nfl_Avg,passing-plays-intercepted_home_Avg,passing-plays-intercepted_away_Avg,passing-plays-intercepted_division_Avg,passing-plays-intercepted_last_2_weeks_Avg,passing-plays-intercepted_last_4_weeks_Avg,passing-plays-intercepted_top_10_nfl_Avg,passing-longest-yards_home_Avg,passing-longest-yards_away_Avg,passing-longest-yards_division_Avg,passing-longest-yards_last_2_weeks_Avg,passing-longest-yards_last_4_weeks_Avg,passing-longest-yards_top_10_nfl_Avg,qb-rating-nfl_home,qb-rating-nfl_away,qb-rating-nfl_division_Avg,qb-rating-nfl_last_2_weeks_Avg,qb-rating-nfl_last_4_weeks_Avg,qb-rating-nfl_top_10_nfl_Avg,passing-2pt-conversions-succeeded_home_Avg,passing-2pt-conversions-succeeded_away_Avg,passing-2pt-conversions-succeeded_division_Avg,passing-2pt-conversions-succeeded_last_2_weeks_Avg,passing-2pt-conversions-succeeded_last_4_weeks_Avg,passing-2pt-conversions-succeeded_top_10_nfl_Avg
0,Tom Brady,TAM,485,719,67.5,5316,43,6.0,12,1.7,269,62,7.4,7.8,11.0,312.7,102.1,68.1,22,144,3.0,6.98,7.41,Tampa Bay Buccaneers,QB,28.1,29.22,27.17,0.0,30.0,33.5,43.1,42.11,40.33,0.0,54.0,51.25,308.7,314.33,298.5,0.0,329.0,375.75,2.8,2.0,2.83,0.0,1.0,2.0,0.5,0.89,0.67,0.0,1.0,0.75,62.0,62.0,62.0,0.0,55.0,58.0,103.1,98.0,105.6,0.0,72.2,94.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Justin Herbert,LAC,443,672,65.9,5014,38,5.7,15,2.2,256,72,7.5,7.6,11.3,294.9,97.7,65.6,31,214,4.4,6.83,6.95,Los Angeles Chargers,QB,24.11,28.25,26.17,0.0,0.0,26.25,36.89,42.5,42.17,0.0,0.0,38.0,278.44,313.5,277.0,0.0,0.0,293.0,2.33,2.13,2.67,0.0,0.0,2.5,0.78,1.0,0.67,0.0,0.0,1.0,72.0,49.0,47.0,0.0,0.0,47.0,100.3,95.1,95.7,0.0,0.0,102.7,0.11,0.5,0.33,0.0,0.0,0.25
2,Matthew Stafford,LAR,404,601,67.2,4886,41,6.8,17,2.8,233,79,8.1,8.2,12.1,287.4,102.9,63.8,30,243,4.8,7.36,7.45,Los Angeles Rams,QB,24.55,23.2,23.25,26.0,28.33,25.7,35.91,34.6,34.0,40.0,41.0,38.6,288.27,290.3,274.5,283.0,328.67,301.5,2.55,2.2,2.0,3.0,2.33,2.2,0.82,1.1,1.0,2.0,1.0,1.2,75.0,79.0,68.0,35.0,70.0,79.0,106.6,100.9,100.1,89.9,101.9,96.2,0.09,0.1,0.0,0.0,0.0,0.1
3,Patrick Mahomes,KAN,436,658,66.3,4839,37,5.6,13,2.0,260,75,7.4,7.6,11.1,284.6,98.5,62.2,28,146,4.1,6.84,7.07,Kansas City Chiefs,QB,25.5,27.38,25.83,0.0,29.5,25.86,38.42,39.88,39.67,0.0,41.5,40.14,277.25,321.13,298.0,0.0,326.5,259.43,2.17,2.75,2.5,0.0,3.0,1.57,0.83,0.75,0.67,0.0,1.0,0.86,75.0,69.0,69.0,0.0,64.0,64.0,97.2,108.0,101.7,0.0,108.2,86.8,0.0,0.13,0.17,0.0,0.0,0.0
4,Derek Carr,LVR,428,626,68.4,4804,23,3.7,14,2.2,217,61,7.7,7.4,11.2,282.6,94.0,52.4,40,241,6.0,6.85,6.6,Las Vegas Raiders,QB,25.0,25.78,22.83,0.0,0.0,26.0,36.56,39.0,33.67,0.0,0.0,40.0,273.56,294.67,241.33,0.0,0.0,284.4,1.33,1.33,1.67,0.0,0.0,1.2,0.78,0.89,0.67,0.0,0.0,0.8,44.0,61.0,51.0,0.0,0.0,56.0,93.5,90.5,96.7,0.0,0.0,87.5,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
df_qb_all.to_csv('./clean_data/passing.csv', index=0)