# Collecting Player Data by Team

This is meant to collect player data for each team in the NFL since the 2020 season. 

For each year, player ratings were structured differently or came from different data sources, thus the way in which each players data is extracted for each distinct year varies.

For each year, a data frame is created that shows the *Position*, *Name*, *Rating*, *Team*, and *Season* for each player. This allows us to merge it with our data much easier in the later steps.

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


# 2020 Player Data

In [2]:
html_fp = '/Users/epainter/Desktop/bet_model_v2/data/raw/pr_2020.html'

with open(html_fp, "r") as file:
    html_content = file.read()
    
soup = BeautifulSoup(html_content, 'html.parser')

td_tags = soup.find_all('td')

td_text_list = []

for td in td_tags:
    td_text_list.append(td.text.strip())
    
player_list = td_text_list

In [3]:
# Initialize lists to store components
positions = []
names = []
ratings = []

# Iterate through each item in player_list
for item in player_list:
    # Split each item to extract position, name, and rating
    parts = item.split()
    
    # Extract position (first part)
    position = parts[0]
    
    # Extract name (rest of the parts except last part which is rating)
    name = ' '.join(parts[1:-1])
    
    # Extract rating (last part), remove '**' if present
    rating = parts[-1].replace('**', '').replace('*', '').strip('()')
    
    # Append to respective lists
    positions.append(position)
    names.append(name)
    ratings.append(rating)

# Create DataFrame
df_2020 = pd.DataFrame({
    'Position': positions,
    'Name': names,
    'Rating': ratings
})

In [4]:
df_2020

Unnamed: 0,Position,Name,Rating
0,QB,Lamar Jackson,90.1
1,DT,Derek Wolfe,68.5
2,RB,Mark Ingram II,78.7
3,DT,Brandon Williams,64.8
4,RB,J.K. Dobbins,83.8
...,...,...,...
763,CB,D.J. Hayden,76.9
764,RG,A.J. Cann,55.3
765,S,Ronnie Harrison,60.9
766,RT,Jawaan Taylor,63.8


In [5]:
team_names_list = ['Baltimore Ravens', 'New Orleans Saints', 'San Francisco 49ers', 'Kansas City Chiefs', 
                   'Tampa Bay Buccaneers', 'Dallas Cowboys', 'Buffalo Bills', 'Tennessee Titans', 
                   'Philadelphia Eagles', 'Pittsburgh Steelers', 'Cleveland Browns', 'Green Bay Packers', 
                   'Indianapolis Colts', 'Denver Broncos', 'Seattle Seahawks', 'Los Angeles Chargers', 
                   'Minnesota Vikings', 'New England Patriots', 'Detroit Lions', 'Atlanta Falcons',
                   'Chicago Bears', 'Los Angeles Rams', 'Houston Texans', 'Arizona Cardinals', 'Las Vegas Raiders', 'Cincinnati Bengals',
                   'New York Giants', 'New York Jets', 'Carolina Panthers', 'Miami Dolphins', 
                   'Washington Commanders', 'Jacksonville Jaguars']

team_names = []

for team_name in team_names_list:
    
    # Repeat the team name 24 times (assuming 24 players per team)
    team_names.extend([team_name] * 24)

df_2020['Team'] = team_names
df_2020['Season'] = 2020

### Some cleaning 2020

There are some issues with the naming of some of the positions that does not match the rest.

In [6]:
# Replace values
df_2020['Position'] = df_2020['Position'].replace({
    'DT': 'DI',
    'EDGE': 'Edge',
    'FS': 'S',
    'EDGE/DT': 'Edge',
    'DT/EDGE': 'DI',
    'S/CB': 'S',
    'CB/S': 'CB',
    'SS': 'S',
    
})

df_2020.Position.unique()

array(['QB', 'DI', 'RB', 'TE', 'Edge', 'WR', 'LB', 'LT', 'CB', 'LG', 'C',
       'RG', 'S', 'RT', 'FB'], dtype=object)

# 2021 Player Data

In [7]:
html_fp = '/Users/epainter/Desktop/bet_model_v2/data/raw/pr_2021.html'

with open(html_fp, "r") as file:
    html_content = file.read()
    
soup = BeautifulSoup(html_content, 'html.parser')

td_tags = soup.find_all('td')

td_text_list = []

for td in td_tags:
    td_text_list.append(td.text.strip())
    
player_list = td_text_list

In [8]:
# Initialize lists to store components
positions = []
names = []
ratings = []

# Iterate through each item in player_list
for item in player_list:
    # Split each item to extract position, name, and rating
    parts = item.split()
    
    # Extract position (first part)
    position = parts[0]
    
    # Extract name (rest of the parts except last part which is rating)
    name = ' '.join(parts[1:-1])
    
    # Extract rating (last part), remove '**' if present
    rating = parts[-1].replace('**', '').replace('*', '').strip('()')
    
    # Append to respective lists
    positions.append(position)
    names.append(name)
    ratings.append(rating)

# Create DataFrame
df_2021 = pd.DataFrame({
    'Position': positions,
    'Name': names,
    'Rating': ratings
})

In [9]:
team_names_list = ['Tampa Bay Buccaneers', 'Kansas City Chiefs', 'Cleveland Browns', 'Buffalo Bills', 
                   'Baltimore Ravens', 'Green Bay Packers', 'Los Angeles Rams', 'Dallas Cowboys', 
                   'Minnesota Vikings', 'Denver Broncos', 'San Francisco 49ers', 'Washington Commanders', 
                   'Tennessee Titans', 'Indianapolis Colts', 'Seattle Seahawks','Pittsburgh Steelers', 
                   'New Orleans Saints', 'New England Patriots', 'New York Giants', 'Los Angeles Chargers',
                   'Arizona Cardinals', 'Miami Dolphins', 'Chicago Bears', 'Cincinnati Bengals', 
                   'Atlanta Falcons', 'Las Vegas Raiders','Carolina Panthers', 'Jacksonville Jaguars', 
                   'Philadelphia Eagles', 'New York Jets', 'Detroit Lions', 'Houston Texans']

team_names = []

for team_name in team_names_list:
    
    # Repeat the team name 24 times (assuming 24 players per team)
    team_names.extend([team_name] * 24)

df_2021['Team'] = team_names
df_2021['Season'] = 2021

In [10]:
df_2021

Unnamed: 0,Position,Name,Rating,Team,Season
0,QB,Tom Brady,93.3,Tampa Bay Buccaneers,2021
1,DI,Ndamukong Suh,62.4,Tampa Bay Buccaneers,2021
2,RB,Leonard Fournette,65.5,Tampa Bay Buccaneers,2021
3,DI,Vita Vea,89.9,Tampa Bay Buccaneers,2021
4,WR,Mike Evans,74.9,Tampa Bay Buccaneers,2021
...,...,...,...,...,...
763,CB,Desmond King II,67.0,Houston Texans,2021
764,RG,Marcus Cannon,70.1,Houston Texans,2021
765,S,Justin Reid,60.7,Houston Texans,2021
766,RT,Tytus Howard,62.1,Houston Texans,2021


### Some 2021 cleaning

In [11]:
df_2021.Position.unique()

array(['QB', 'DI', 'RB', 'WR', 'EDGE', 'TE', 'LB', 'LT', 'CB', 'LG', 'C',
       'RG', 'S', 'RT', 'FB', 'ED'], dtype=object)

In [12]:
# Replace values
df_2021['Position'] = df_2021['Position'].replace({
    'EDGE': 'Edge',
    'ED': 'Edge'  
})

df_2021.Position.unique()

array(['QB', 'DI', 'RB', 'WR', 'Edge', 'TE', 'LB', 'LT', 'CB', 'LG', 'C',
       'RG', 'S', 'RT', 'FB'], dtype=object)

There is an issue with Franklin-Myers Rankings

In [13]:
df_2021[df_2021['Rating'] == 'Franklin-Myers(71.5']

Unnamed: 0,Position,Name,Rating,Team,Season
703,Edge,John,Franklin-Myers(71.5,New York Jets,2021


In [14]:
# Some issues with player at index 635
df_2021.loc[703, 'Position'] = 'Edge'
df_2021.loc[703, 'Name'] = 'John Franklin-Myers'
df_2021.loc[703, 'Rating'] = 71.5
df_2021.loc[703, 'Team'] = 'New York Jets'

In [15]:
df_2021.iloc[703]

Position                   Edge
Name        John Franklin-Myers
Rating                     71.5
Team              New York Jets
Season                     2021
Name: 703, dtype: object

# 2022 Player Data

In [16]:
html_fp = '/Users/epainter/Desktop/bet_model_v2/data/raw/pr_2022.html'

with open(html_fp, "r") as file:
    html_content = file.read()
    
soup = BeautifulSoup(html_content, 'html.parser')

td_tags = soup.find_all('td')

td_text_list = []

for td in td_tags:
    td_text_list.append(td.text.strip())
    
player_list = td_text_list

In [17]:
# Initialize lists to store components
positions = []
names = []
ratings = []

# Iterate through each item in player_list
for item in player_list:
    # Split each item to extract position, name, and rating
    parts = item.split()
    
    # Extract position (first part)
    position = parts[0]
    
    # Extract name (rest of the parts except last part which is rating)
    name = ' '.join(parts[1:-1])
    
    # Extract rating (last part), remove '**' if present
    rating = parts[-1].replace('**', '').replace('*', '').strip('()')
    
    # Append to respective lists
    positions.append(position)
    names.append(name)
    ratings.append(rating)

# Create DataFrame
df_2022 = pd.DataFrame({
    'Position': positions,
    'Name': names,
    'Rating': ratings
})

In [18]:
team_names_list = ['Buffalo Bills', 'Tampa Bay Buccaneers', 'Los Angeles Chargers', 'Los Angeles Rams', 
                   'Philadelphia Eagles', 'Green Bay Packers', 'Cincinnati Bengals', 'Kansas City Chiefs',
                  'Baltimore Ravens', 'San Francisco 49ers', 'Miami Dolphins', 'New Orleans Saints', 
                   'Cleveland Browns', 'Denver Broncos', 'Indianapolis Colts', 'Dallas Cowboys', 
                  'Minnesota Vikings', 'Washington Commanders', 'Tennessee Titans', 'New England Patriots',
                  'Las Vegas Raiders', 'Pittsburgh Steelers', 'Arizona Cardinals', 'Carolina Panthers', 
                  'Detroit Lions', 'New York Jets', 'New York Giants', 'Jacksonville Jaguars', 'Seattle Seahawks',
                  'Atlanta Falcons', 'Houston Texans', 'Chicago Bears']

team_names = []

for team_name in team_names_list:
    
    # Repeat the team name 24 times (assuming 24 players per team)
    team_names.extend([team_name] * 24)

df_2022['Team'] = team_names
df_2022['season'] = 2022

### Some 2022 cleaning

In [19]:
df_2022.Position.unique()

array(['QB', 'DI', 'RB', 'Edge', 'WR', 'LB', 'TE', 'LT', 'CB', 'LG', 'C',
       'RG', 'S', 'RT', 'FB', 'LB/DB', 'Micah'], dtype=object)

In [20]:
# Replace values
df_2022['Position'] = df_2022['Position'].replace({
    'LB/DB': 'LB'  
})

In [21]:
# Issue with Derek Carr's rating
df_2022['Rating'] = df_2022['Rating'].replace({'Carr': 82.0})
df_2022['Rating'] = df_2022['Rating'].replace('N/A', 65.3)

df_2022['Rating'] = df_2022['Rating'].astype(float)
df_2022['Rating'] = df_2022['Rating'].apply(lambda x: f"{x:.1f}")
df_2022['Rating'] = df_2022['Rating'].astype(float)

In [22]:
# Some issues with player at index 635
df_2022.loc[635, 'Position'] = 'LB'
df_2022.loc[635, 'Name'] = 'Micah McFadden'
df_2022.loc[635, 'Rating'] = 74.8
df_2022.loc[635, 'Team'] = 'New York Giants'

In [23]:
df_2022.Position.unique()

array(['QB', 'DI', 'RB', 'Edge', 'WR', 'LB', 'TE', 'LT', 'CB', 'LG', 'C',
       'RG', 'S', 'RT', 'FB'], dtype=object)

# 2023 Player Data

In [24]:
html_fp = '/Users/epainter/Desktop/bet_model_v2/data/raw/pr_2023.html'

with open(html_fp, "r") as file:
    html_content = file.read()
    
soup = BeautifulSoup(html_content, 'html.parser')

td_tags = soup.find_all('td')

td_text_list = []

for td in td_tags:
    td_text_list.append(td.text.strip())
    
player_list = td_text_list

In [25]:
# Initialize lists to store components
positions = []
names = []
ratings = []

# Iterate through each item in player_list
for item in player_list:
    
    if item in ['OFFENSE', 'DEFENSE']:
        continue
    # Split each item to extract position, name, and rating
    parts = item.split()
    
    # Extract position (first part)
    position = parts[0]
    
    # Extract name (rest of the parts except last part which is rating)
    name = ' '.join(parts[1:-1])
    
    # Extract rating (last part), remove '**' if present
    rating = parts[-1].replace('**', '').replace('*', '').strip('()')
    
    # Append to respective lists
    positions.append(position)
    names.append(name)
    ratings.append(rating)

# Create DataFrame
df_2023 = pd.DataFrame({
    'Position': positions,
    'Name': names,
    'Rating': ratings
})

In [26]:
team_names_list = ['Philadelphia Eagles', 'San Francisco 49ers', 'Cincinnati Bengals', 'Kansas City Chiefs', 
                   'Dallas Cowboys', 'Buffalo Bills', 'New York Jets', 'Baltimore Ravens',
                  'Los Angeles Chargers', 'Pittsburgh Steelers', 'Seattle Seahawks', 'Miami Dolphins', 
                   'Detroit Lions', 'Cleveland Browns', 'Jacksonville Jaguars', 'Minnesota Vikings', 
                  'New England Patriots', 'Washington Commanders', 'New Orleans Saints', 'New York Giants',
                  'Las Vegas Raiders', 'Chicago Bears', 'Green Bay Packers', 'Carolina Panthers', 
                  'Tennessee Titans', 'Denver Broncos', 'Atlanta Falcons', 'Tampa Bay Buccaneers', 'Houston Texans',
                  'Indianapolis Colts', 'Los Angeles Rams', 'Arizona Cardinals']

team_names = []

for team_name in team_names_list:
    
    # Repeat the team name 24 times (assuming 24 players per team)
    team_names.extend([team_name] * 24)

df_2023['Team'] = team_names
df_2023['Season'] = 2023

### Some 2023 cleaning

In [27]:
df_2023.Position.unique()

array(['QB', 'DI', 'RB', 'Edge', 'WR', 'LB', 'TE', 'LT', 'CB', 'LG', 'C',
       'RG', 'S', 'RT'], dtype=object)

In [28]:
df_2023.Rating.unique()

array(['88.2', '60.2', '78.1', '92.3', '63.5', '84.6', '87.7', '89.8',
       '80.4', '83.8', '64.6', '78.6', '78.2', '60.3', '81.7', '71.7',
       '72.1', '73.1', '89.4', '72.2', '72.7', '79.4', '84.8', '69.1',
       '77.7', '80.1', '84.4', '63.4', '90.6', '74.1', '64.1', '78.7',
       '63.8', '68.1', '85.7', '84.7', '79.6', '91.7', '81.0', '59.5',
       '61.0', '62.9', '77.9', '51.8', '69.4', '69.7', '92.0', '85.2',
       '80.8', '68.7', '83.9', '82.9', '78.4', '69.9', '70.2', '79.0',
       '56.4', '52.3', '74.6', '75.4', '53.7', '67.1', '65.1', '64.5',
       '67.6', '56.0', '81.3', '76.2', '37.0', '57.5', '50.2', '64.8',
       '78.0', '67.7', '77.3', '91.3', '59.3', '61.2', '89.9', '74.9',
       '72.3', '70.3', '71.8', '68.2', '90.2', '79.9', '91.8', '86.2',
       '73.9', '62.6', '66.1', '65.5', '60.9', '73.3', '79.1', '60.5',
       '66.0', '72.8', '71.9', '67.5', '73.2', '73.6', '89.1', '85.8',
       '66.4', '80.9', '47.9', '64.2', '77.1', '66.5', '48.4', '73.8',
      

In [29]:
df_2023[df_2023['Rating'] == 'Ebiketie(64.5']

Unnamed: 0,Position,Name,Rating,Team,Season
631,Edge,Arnold,Ebiketie(64.5,Atlanta Falcons,2023


In [30]:
df_2023.loc[631, 'Name'] = 'Arnold Ebiketie'
df_2023.loc[631, 'Rating'] = 64.5

In [31]:
df_2023.Position.unique()

array(['QB', 'DI', 'RB', 'Edge', 'WR', 'LB', 'TE', 'LT', 'CB', 'LG', 'C',
       'RG', 'S', 'RT'], dtype=object)

# Player Rating Preliminary Cleaning

We now have player ratings for every NFL team for each NFL season going back to the 2020 season.

There are a few things to clean up before we can save these files.

1. For 2023, PFF did not record stats for FB's. So we can get rid of this column for all other seasons. This ensures that we have the same columns across all data frames.

2. Make the *Rating* column values floats.

In [32]:
df_2020.isna().sum(), df_2021.isna().sum(), df_2022.isna().sum(), df_2023.isna().sum()

(Position    0
 Name        0
 Rating      0
 Team        0
 Season      0
 dtype: int64,
 Position    0
 Name        0
 Rating      0
 Team        0
 Season      0
 dtype: int64,
 Position    0
 Name        0
 Rating      0
 Team        0
 season      0
 dtype: int64,
 Position    0
 Name        0
 Rating      0
 Team        0
 Season      0
 dtype: int64)

## 1. Get rid of rows with FB

In [34]:
# Drop rows in place

df_2020.drop(df_2020[df_2020['Position'] == 'FB'].index, inplace=True)
df_2021.drop(df_2021[df_2021['Position'] == 'FB'].index, inplace=True)
df_2022.drop(df_2022[df_2022['Position'] == 'FB'].index, inplace=True)


## 2. Convert Rating to float

In [35]:
df_2020['Rating'] = df_2020['Rating'].astype(float)
df_2021['Rating'] = df_2021['Rating'].astype(float)
df_2022['Rating'] = df_2022['Rating'].astype(float)
df_2023['Rating'] = df_2023['Rating'].astype(float)

# Write to CSV

In [37]:

df_2020.to_csv('/Users/epainter/Desktop/bet_model_v2/data/raw/pr_2020_raw.csv', index=False)

df_2021.to_csv('/Users/epainter/Desktop/bet_model_v2/data/raw/pr_2021_raw.csv', index=False)

df_2022.to_csv('/Users/epainter/Desktop/bet_model_v2/data/raw/pr_2022_raw.csv', index=False)

df_2023.to_csv('/Users/epainter/Desktop/bet_model_v2/data/raw/pr_2023_raw.csv', index=False)
