In [1]:
!pip install requests beautifulsoup4 pandas



In [None]:
## Ideas

# 1. Scrape the draft page for each year (changing the year in the URL) and keep only the rows where the position is QB.
# 2. Build a list of the QBs drafted each year, then scrape each QB's stats at the career, season, and game level.
# 3. Combine the per-QB data into a single dataset covering all QBs.

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time  # Import the time module for adding delays

# Draft classes to scrape, newest first (2024 back through 2009).
years = list(range(2024, 2008, -1))

# Seconds to wait on a request before giving up; without an explicit timeout
# requests.get() may wait indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Output column name -> zero-based position of the <td> cell inside a row of the
# draft table. The draft round is stored in the row's <th>, not in a <td>, so it
# is read separately in _parse_draft_row().
CELL_INDEX_BY_COLUMN = {
    'Pick': 0,
    'Team': 1,
    'Name': 2,
    'Position': 3,
    'Age': 4,
    'GP': 11,
    'Pass Comps': 12,
    'Pass Atts': 13,
    'Pass Yards': 14,
    'Pass TDs': 15,
    'Ints': 16,
    'Rush Atts': 17,
    'Rush Yards': 18,
    'Rush TDs': 19,
}

# A row must hold at least this many <td> cells for every index above to exist.
MIN_CELLS_PER_ROW = max(CELL_INDEX_BY_COLUMN.values()) + 1

# Column order of the resulting DataFrame (same order the dicts are built in).
OUTPUT_COLUMNS = ['Year', 'Draft Round', *CELL_INDEX_BY_COLUMN]


def _parse_draft_row(row, year):
    """Turn one <tr> of the draft table into a dict of player fields.

    Parameters:
        row: a BeautifulSoup <tr> element taken from the draft table.
        year: the draft year being scraped; stored under the 'Year' key.

    Returns:
        A dict keyed by OUTPUT_COLUMNS with the raw cell text as values, or
        None when the row has fewer than MIN_CELLS_PER_ROW <td> cells (for
        example header rows, which contain no <td> cells at all).
    """
    cells = row.find_all('td')

    # Guard against short rows: every index in CELL_INDEX_BY_COLUMN must exist,
    # otherwise indexing below would raise IndexError and abort the whole scrape.
    if len(cells) < MIN_CELLS_PER_ROW:
        return None

    # The round number is the text of the row's <th data-stat="draft_round">.
    draft_round = row.find('th', {'data-stat': 'draft_round'})

    player_data = {
        'Year': year,  # Include the draft year for reference
        'Draft Round': draft_round.get_text() if draft_round else None,
    }
    for column, index in CELL_INDEX_BY_COLUMN.items():
        player_data[column] = cells[index].get_text()
    return player_data


# Accumulates one dict per drafted player across all years.
draft_data = []

for year in years:
    # Construct the URL for this year's draft page
    url = f'https://www.pro-football-reference.com/years/{year}/draft.htm'

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Connection errors and timeouts should not abort the remaining years.
        print(f"Failed to retrieve data for {year}: {exc}")
    else:
        if response.status_code == 200:
            # Parse the HTML content of the page using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the table with the draft data. `table` is intentionally kept
            # as a notebook-level name so it can be inspected from other cells.
            table = soup.find('table', {'id': 'drafts'})

            if table:
                # Skip the first <tr> (header row); rows without enough data
                # cells are dropped by _parse_draft_row().
                for row in table.find_all('tr')[1:]:
                    player_data = _parse_draft_row(row, year)
                    if player_data is not None:
                        draft_data.append(player_data)
            else:
                print(f"Table not found for {year}")
        else:
            print(f"Failed to retrieve data for {year}. Status code: {response.status_code}")

    # Pause between requests to avoid rate limiting. `year % 3` is 0, 1 or 2,
    # so the delay is 1 to 3 seconds and varies from year to year.
    time.sleep(1 + (year % 3))

# Convert the list of dictionaries into a pandas DataFrame. Passing `columns`
# keeps the expected columns present even when nothing was scraped, so the
# filters below do not fail with KeyError on an empty result.
df = pd.DataFrame(draft_data, columns=OUTPUT_COLUMNS)

# Filter the DataFrame for quarterbacks (Position == 'QB')
df_qb = df[df['Position'] == 'QB']

# Filter for first round picks. The round is scraped as text, hence the string '1'.
df_first_round = df_qb[df_qb['Draft Round'] == '1']

# Display the first 10 rows of the filtered DataFrame for first-round quarterbacks
print(df_first_round.head(10))

# Save the filtered DataFrame to a CSV file
df_first_round.to_csv('fr_qb_draft_data.csv', index=False)

     Year Draft Round Pick Team                Name Position Age  GP  \
0    2024           1    1  CHI      Caleb Williams       QB  22   9   
1    2024           1    2  WAS      Jayden Daniels       QB  23  10   
2    2024           1    3  NWE          Drake Maye       QB  22   6   
7    2024           1    8  ATL       Michael Penix       QB  24   1   
9    2024           1   10  MIN       J.J. McCarthy       QB  21       
11   2024           1   12  DEN              Bo Nix       QB  24  10   
257  2023           1    1  CAR         Bryce Young       QB  22  23   
258  2023           1    2  HOU         C.J. Stroud       QB  21  25   
260  2023           1    4  IND  Anthony Richardson       QB  21  10   
535  2022           1   20  PIT       Kenny Pickett       QB  24  27   

    Pass Comps Pass Atts Pass Yards Pass TDs Ints Rush Atts Rush Yards  \
0          178       294       1785        9    5        40        236   
1          180       262       2147        9    2        85

In [3]:
# Debugging aid: print every <td> cell of each draft-table row so the cell
# index of each stat can be worked out by eye.
# NOTE(review): `table` is not defined in this cell. It must already exist in
# the kernel (presumably the last table parsed by the scraping loop), so this
# cell fails on a fresh kernel unless the scraping cell has been run first.
# Loop through the rows in the table (skip the header row)
for row in table.find_all('tr')[1:]:
    cells = row.find_all('td')
    
    # Print the number of cells and their contents
    print(f"Number of cells in this row: {len(cells)}")
    for cell in cells:
        print(cell.get_text())  # This will print the content of each cell
    
    # Only read the stats when the row has every index used below (0-19).
    # A looser check such as `> 5` lets short rows through and then raises
    # IndexError at cells[11] and beyond.
    if len(cells) > 19:
        pick = cells[0].get_text()
        team = cells[1].get_text()
        name = cells[2].get_text()
        position = cells[3].get_text()
        age = cells[4].get_text()
        games = cells[11].get_text()
        p_comp = cells[12].get_text()
        p_att = cells[13].get_text()
        p_yds = cells[14].get_text()
        p_tds = cells[15].get_text()
        ints = cells[16].get_text()
        r_att = cells[17].get_text()
        r_yds = cells[18].get_text()
        r_tds = cells[19].get_text()

        # Cell index reference. The printed output shows 28 cells per data row,
        # i.e. valid indices are 0-27.
        #cell 0 - pick
        #cell 1 - team
        #cell 2 - name
        #cell 3 - position
        #cell 4 - age
        #cell 5 - to (a year value in the output, e.g. 2024)
        #cell 6 - ap1
        #cell 7 - pb
        #cell 8 - st
        #cell 9 - wAV
        #cell 10 - DrAV
        #cell 11 - games played
        #cell 12 - completions
        #cell 13 - pass attempts
        #cell 14 - pass yds
        #cell 15 - pass tds
        #cell 16 - ints
        #cell 17 - rush atts
        #cell 18 - rush yds
        #cell 19 - rush tds
        #cell 20 - presumably receptions (inferred from output) - TODO confirm
        #cell 21 - presumably receiving yds (inferred from output) - TODO confirm
        #cell 22 - presumably receiving tds (inferred from output) - TODO confirm
        #cell 23 - presumably tackles (inferred from output) - TODO confirm
        #cell 24 - presumably defensive ints (blank in the sampled rows) - TODO confirm
        #cell 25 - presumably sacks (inferred from output) - TODO confirm
        #cell 26 - college name
        #cell 27 - "College Stats" link text
        # There is no cell 28: rows contain 28 cells, so 27 is the last index.

Number of cells in this row: 0
Number of cells in this row: 28
1
DET
Matthew Stafford
QB
21
2024
0
2
14
121
98
215
5048
7971
58309
366
187
419
1308
15
2
-3
0
1


Georgia
College Stats
Number of cells in this row: 28
2
STL
Jason Smith
T
23
2012
0
0
2
10
9
45
0
0
0
0
0
0
0
0
0
0
0



Baylor
College Stats
Number of cells in this row: 28
3
KAN
Tyson Jackson
DE
23
2016
0
0
7
34
23
122
0
0
0
0
0
0
0
0
0
0
0
167

9.0
LSU
College Stats
Number of cells in this row: 28
4
SEA
Aaron Curry
LB
23
2012
0
0
3
16
12
48
0
0
0
0
0
0
0
0
0
0
0
163

5.5
Wake Forest
College Stats
Number of cells in this row: 28
5
NYJ
Mark Sanchez
QB
22
2018
0
0
4
32
26
79
1314
2320
15357
86
89
170
457
13
0
0
0



USC
College Stats
Number of cells in this row: 28
6
CIN
Andre Smith
T
22
2021
0
0
8
41
37
121
0
0
0
0
0
0
0
0
0
0
0



Alabama
College Stats
Number of cells in this row: 28
7
OAK
Darrius Heyward-Bey
WR
22
2018
0
0
5
23
17
144
0
0
0
0
0
15
194
2
202
2897
16
17


Maryland
College Stats
Number of cells in this row: 28