In [4]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
import time
import pandas_gbq
import numpy as np

In [46]:
#All NFL draft data is being pulled from sports-reference.com. Scraping and using their data is currently permitted under their terms of use
# provided you give credit to the site. 
#https://www.sports-reference.com/termsofuse.html

#They also request no more than twenty bot requests in one minute. Because I am referencing 25 pages, I am going to put a 10 second time delay between requests.
#https://www.sports-reference.com/bot-traffic.html

url_part1 = 'https://www.pro-football-reference.com/years/'

url_part2 = '/draft.htm'

year_range = range(2000, 2024, 1)

counter = 1

data = []

In [6]:
for year in year_range:
    url = url_part1 + str(year) + url_part2

    response = requests.get(url)

    html_content = response.content

    soup = BeautifulSoup(html_content, 'html.parser')

    table = soup.find('table', {'id': 'drafts'})

    headers = [th.text.strip() for th in table.find('thead').find_all('th')]

    for row in table.find('tbody').find_all('tr'):
        row_data = [td.text.strip() for td in row.find_all(['th', 'td'])]
        row_data.append(year)
        data.append(row_data)
    

    time.sleep(10)


remove_header_values = ['', 'Misc', 'Approx Val', 'Passing', 'Rushing', 'Receiving']

for value in remove_header_values:
    if value in headers:
        headers.remove(value)

headers = [x for x in headers if x != '']
headers.append('extraRow')
headers.append('Year')

In [27]:
counter = 0
for i in range(len(headers)):
    if headers[i] == 'TD':
        if counter == 0:
            headers[i] = 'passingTouchdowns'
            counter += 1
        elif counter == 1:
            headers[i] = 'rushingTouchdowns'
            counter += 1
        elif counter == 2:
            headers[i] = 'receivingTouchdowns'
            counter += 1

counter = 0
for i in range(len(headers)):
    if headers[i] == 'Att':
        if counter == 0:
            headers[i] = 'passingAttempts'
            counter += 1
        elif counter == 1:
            headers[i] = 'rushingAttempts'
            counter += 1

counter = 0
for i in range(len(headers)):
    if headers[i] == 'Yds':
        if counter == 0:
            headers[i] = 'passingYards'
            counter += 1
        elif counter == 1:
            headers[i] = 'rushingYards'
            counter += 1
        elif counter == 2:
            headers[i] = 'receivingYards'
            counter += 1


counter = 0
for i in range(len(headers)):
    if headers[i] == 'Int':
        if counter == 0:
            headers[i] = 'thrownInterceptions'
            counter += 1
        elif counter == 1:
            headers[i] = 'caughtInterceptions'
            counter += 1


In [28]:
df = pd.DataFrame(data, columns = headers)

In [29]:
df.head()

Unnamed: 0,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,St,...,rushingTouchdowns,Rec,receivingYards,receivingTouchdowns,Solo,caughtInterceptions,Sk,College/Univ,extraRow,Year
0,1,1,CLE,Courtney Brown,DE,22,2005,0,0,4,...,0,0,0,0,156.0,,19.0,Penn St.,College Stats,2000
1,1,2,WAS,LaVar Arrington,LB,22,2006,0,3,5,...,0,0,0,0,338.0,3.0,23.5,Penn St.,College Stats,2000
2,1,3,WAS,Chris Samuels,T,23,2009,0,6,9,...,0,0,0,0,,,,Alabama,College Stats,2000
3,1,4,CIN,Peter Warrick,WR,23,2005,0,0,4,...,2,275,2991,18,3.0,,,Florida St.,College Stats,2000
4,1,5,BAL,Jamal Lewis,RB,21,2009,1,1,9,...,58,221,1879,4,,,,Tennessee,College Stats,2000


In [30]:
for column_name in df.columns:
    print(column_name)

Rnd
Pick
Tm
Player
Pos
Age
To
AP1
PB
St
wAV
DrAV
G
Cmp
passingAttempts
passingYards
passingTouchdowns
thrownInterceptions
rushingAttempts
rushingYards
rushingTouchdowns
Rec
receivingYards
receivingTouchdowns
Solo
caughtInterceptions
Sk
College/Univ
extraRow
Year


In [31]:
df = df.drop(columns = ['College/Univ', 'extraRow'])

In [32]:
df = df.rename(columns = {
                            'Rnd':'draftRound',
                            'Pick':'draftPick',
                            'Tm': 'draftedBy',
                            'Player':'playerName',
                            'Pos': 'position',
                            'Age': 'age',
                            'To': 'playedUntilYear',
                            'AP1': 'timesFirstTeamAllPro',
                            'PB': 'timesProBowl',
                            'St': 'gamesStarted',
                            'wAV': 'weightedCareerApproximateValue',
                            'DrAV': 'draftedTeamApproximateValue',
                            'G':'gamesPlayed',
                            'Cmp':'passingCompletions',
                            'Rec': 'receptions',
                            'Solo':'soloTackles',
                            'Sk': 'sacks'})

In [33]:
for column_name in df.columns:
    print(column_name)

draftRound
draftPick
draftedBy
playerName
position
age
playedUntilYear
timesFirstTeamAllPro
timesProBowl
gamesStarted
weightedCareerApproximateValue
draftedTeamApproximateValue
gamesPlayed
passingCompletions
passingAttempts
passingYards
passingTouchdowns
thrownInterceptions
rushingAttempts
rushingYards
rushingTouchdowns
receptions
receivingYards
receivingTouchdowns
soloTackles
caughtInterceptions
sacks
Year


In [34]:
table_schema = [
    {'name': 'draftRound', 'type': 'INTEGER', 'mode': 'REQUIRED', 'description': 'The round that the player was selected in in the NFL draft'},
    {'name': 'draftPick', 'type': 'INTEGER', 'mode': 'REQUIRED', 'description' : 'The pick in the draft used to select the player'}, 
    {'name': 'draftedBy', 'type': 'STRING', 'mode': 'REQUIRED', 'description':'The NFL team that selected the player in the draft'},
    {'name': 'playerName', 'type': 'STRING', 'mode': 'REQUIRED', 'description':'The name of the player'},
    {'name': 'position', 'type': 'STRING', 'mode': 'NULLABLE', 'description':'The football position played by the player'},
    {'name': 'age', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The age of the player when they were drafted'},
    {'name': 'playedUntilYear', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The last year the player played in the NFL'},
    {'name': 'timeFirstTeamAllPro', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The number of times the player was selected to the all pro first team'},
    {'name': 'timesProBowl', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The number of times the player was selected to the Pro Bowl'},
    {'name': 'gamesStarted', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The number of games started by the player'},
    {'name': 'weightedCareerApproximateValue', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'A calculated value by profootballreference stating how well the players career panned out'},
    {'name': 'draftedTeamApproximateValue', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of career value the player provided to the team that drafted them'},
    {'name': 'gamesPlayed', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'Total number of games a player has played in their career'},
    {'name': 'passingCompletions', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of passes thrown by the player that were completed'},
    {'name': 'passingAttempts', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of passes thrown by the player'},
    {'name': 'passingYards', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of yards gained by passing'},
    {'name': 'passingTouchdowns', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of touchdowns thrown by the player'},
    {'name': 'thrownInterceptions', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of passes thrown by the player'},
    {'name': 'rushingAttempts', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of times the player has ran the ball'},
    {'name': 'rushingYards', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of yards the player has ran the ball for'},\
    {'name': 'rushingTouchdowns', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of touchdowns scored by the player when running the ball'},
    {'name': 'receptions', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of passes caught by the player'},
    {'name': 'receivingYards', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of yards gained by the player when catching the ball'},
    {'name': 'receivingTouchdowns', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of passes caught by the player resulting in a touchdown'},
    {'name': 'soloTackles', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of tackles made by the player singlehandedly'},
    {'name': 'sacks', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The amount of times the player tackled the quarterback'},
    {'name': 'Year', 'type': 'INTEGER', 'mode': 'NULLABLE', 'description':'The year the player was drafted'}
]   

In [35]:
from google.cloud import bigquery
import os

In [36]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/gshoda/key.json'

In [37]:
client = bigquery.Client()

In [44]:
dataset = 'sp24-i535-gshoda-nfldraft.NFL_Draft'

#unmute if creating the table for the firs time
#table_ref = client.dataset(dataset).table('Draft_Data')

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6274 entries, 0 to 6273
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   draftRound                      6274 non-null   object
 1   draftPick                       6274 non-null   object
 2   draftedBy                       6274 non-null   object
 3   playerName                      6274 non-null   object
 4   position                        6274 non-null   object
 5   age                             6274 non-null   object
 6   playedUntilYear                 6274 non-null   object
 7   timesFirstTeamAllPro            6274 non-null   object
 8   timesProBowl                    6274 non-null   object
 9   gamesStarted                    6274 non-null   object
 10  weightedCareerApproximateValue  6274 non-null   object
 11  draftedTeamApproximateValue     6274 non-null   object
 12  gamesPlayed                     6274 non-null   

In [42]:
df['draftRound'] = pd.to_numeric(df['draftRound'], errors = 'coerce').fillna('0').astype(int)
df['draftPick'] = pd.to_numeric(df['draftPick'], errors = 'coerce').fillna('0').astype(int)
df['age'] = pd.to_numeric(df['age'], errors = 'coerce').fillna('0').astype(int)
df['playedUntilYear'] = pd.to_numeric(df['playedUntilYear'], errors = 'coerce').fillna('0').astype(int)
df['timesFirstTeamAllPro'] = pd.to_numeric(df['timesFirstTeamAllPro'], errors = 'coerce').fillna('0').astype(int)
df['timesProBowl'] = pd.to_numeric(df['timesProBowl'], errors = 'coerce').fillna('0').astype(int)
df['gamesStarted'] = pd.to_numeric(df['gamesStarted'], errors = 'coerce').fillna('0').astype(int)
df['weightedCareerApproximateValue'] = pd.to_numeric(df['weightedCareerApproximateValue'], errors = 'coerce').fillna('0').astype(int)
df['draftedTeamApproximateValue'] = pd.to_numeric(df['draftedTeamApproximateValue'], errors = 'coerce').fillna('0').astype(int)
df['gamesPlayed'] = pd.to_numeric(df['gamesPlayed'], errors = 'coerce').fillna('0').astype(int)
df['passingCompletions'] = pd.to_numeric(df['passingCompletions'], errors = 'coerce').fillna('0').astype(int)
df['passingAttempts'] = pd.to_numeric(df['passingAttempts'], errors = 'coerce').fillna('0').astype(int)
df['passingYards'] = pd.to_numeric(df['passingYards'], errors = 'coerce').fillna('0').astype(int)
df['passingTouchdowns'] = pd.to_numeric(df['passingTouchdowns'], errors = 'coerce').fillna('0').astype(int)
df['thrownInterceptions'] = pd.to_numeric(df['thrownInterceptions'], errors = 'coerce').fillna('0').astype(int)
df['rushingAttempts'] = pd.to_numeric(df['rushingAttempts'], errors = 'coerce').fillna('0').astype(int)
df['rushingYards'] = pd.to_numeric(df['rushingYards'], errors = 'coerce').fillna('0').astype(int)
df['rushingTouchdowns'] = pd.to_numeric(df['rushingTouchdowns'], errors = 'coerce').fillna('0').astype(int)
df['receptions'] = pd.to_numeric(df['receptions'], errors = 'coerce').fillna('0').astype(int)
df['receivingYards'] = pd.to_numeric(df['receivingYards'], errors = 'coerce').fillna('0').astype(int)
df['receivingTouchdowns'] = pd.to_numeric(df['receivingTouchdowns'], errors = 'coerce').fillna('0').astype(int)
df['soloTackles'] = pd.to_numeric(df['soloTackles'], errors = 'coerce').fillna('0').astype(int)
df['caughtInterceptions'] = pd.to_numeric(df['caughtInterceptions'], errors = 'coerce').fillna('0').astype(int)
df['sacks'] = pd.to_numeric(df['sacks'], errors = 'coerce').fillna('0').astype(int)

In [43]:
pandas_gbq.to_gbq(df, 'NFL_Draft.Draft_Data', if_exists='replace', table_schema = table_schema)