In [1]:
import os
import sqlite3
from contextlib import closing

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Source page: the 2022 passing table on pro-football-reference.com.
url = 'https://www.pro-football-reference.com/years/2022/passing.htm'

# Without a timeout a stalled connection would hang the notebook indefinitely.
response = requests.get(url, timeout=30)

# Stop here on anything but a plain 200 so later cells never parse an error page.
assert response.status_code == 200, f'unexpected HTTP status {response.status_code} for {url}'

# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser happens
# to be installed (and warns), which makes the parse environment-dependent.
bs = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Labels applied, in order, to the <td> cells of each scraped table row.
# 'Yds2' is a second yards column; it sits between 'Sk' and 'Sk%', so it is
# presumably sack yardage (TODO confirm) and is suffixed to avoid clashing with 'Yds'.
columns = [
    'Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec',
    'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%',
    '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR',
    'Sk', 'Yds2', 'Sk%', 'NY/A', 'ANY/A',
    '4QC', 'GWD',
]

In [4]:
# One list of cell texts per table row. Rows whose <td> count differs from the
# number of expected columns are dropped rather than padded or truncated.
data = [
    [cell.text for cell in cells]
    for cells in (tr.select('td') for tr in bs.select('#div_passing tbody tr'))
    if len(cells) == len(columns)
]

df = pd.DataFrame(data, columns=columns)
df.head()


Unnamed: 0,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,...,Y/G,Rate,QBR,Sk,Yds2,Sk%,NY/A,ANY/A,4QC,GWD
0,Matt Ryan,IND,37,QB,5,5,2-2-1,128,195,65.6,...,275.2,79.8,,21,168,9.7,5.59,4.6,3.0,2.0
1,Russell Wilson,DEN,34,QB,5,5,2-3-0,101,170,59.4,...,250.8,82.8,,16,99,8.6,6.21,5.91,2.0,2.0
2,Justin Herbert,LAC,24,QB,4,4,2-2-0,111,166,66.9,...,312.5,102.2,,4,21,2.4,7.23,7.76,,
3,Josh Allen,BUF,26,QB,4,4,3-1-0,113,168,67.3,...,306.8,101.0,,8,39,4.5,6.75,7.12,0.0,1.0
4,Jared Goff,DET,28,QB,4,4,1-3-0,92,151,60.9,...,281.5,99.9,,5,41,3.2,6.96,7.5,,


In [5]:
def _sql_literal(value, quoted):
    """Render one cell value as a SQL literal: NULL, quoted text, or a bare number."""
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return 'NULL'
    if quoted:
        # Standard SQL string literals use single quotes (double quotes denote
        # identifiers); an embedded single quote is escaped by doubling it.
        return "'" + str(value).strip().replace("'", "''") + "'"
    return str(value)


def build_bulk_insert_sql(df, table_name):
    """Build one multi-row ``INSERT INTO ... VALUES`` statement from a DataFrame.

    Values in numeric (and boolean) columns are written bare; values in every
    other column are stripped of surrounding whitespace and written as
    single-quoted SQL string literals. Missing values (``None``, ``NaN``,
    ``pd.NA``, ``NaT``) become ``NULL``.

    The table name and the column names are interpolated into the statement
    verbatim, so they must come from trusted code, never from user input.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to insert. Column labels must be strings; they are used as the
        SQL column list in their existing order.
    table_name : str
        Name of the target table.

    Returns
    -------
    str
        The statement, one ``(...)`` tuple per line, with no trailing comma
        or semicolon.

    Raises
    ------
    ValueError
        If ``df`` has no rows, because an INSERT with an empty VALUES list is
        not valid SQL.
    """
    if df.empty:
        raise ValueError('cannot build an INSERT statement from an empty DataFrame')

    header = f"INSERT INTO {table_name} ({', '.join(df.columns)}) VALUES"

    # Decide quoting once per column, from the dtype rather than from each value.
    quoted_flags = [not pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes]

    # itertuples keeps each column's own type; iterrows would upcast integers to
    # floats whenever every column is numeric.
    value_rows = [
        '(' + ', '.join(_sql_literal(v, q) for v, q in zip(values, quoted_flags)) + ')'
        for values in df.itertuples(index=False, name=None)
    ]

    return header + '\n' + ',\n'.join(value_rows)

In [6]:
# Scraped header -> database column name, for the stats that get loaded.
stat_names = {
    'Player': 'name',
    'Cmp': 'completions',
    'Att': 'attempts',
    'Yds': 'yards',
    'Int': 'interceptions',
    'TD': 'touchdowns',
}

# Every selected column except the player name is cast from scraped text to int.
int_columns = [new for new in stat_names.values() if new != 'name']

quarterbacks = (
    df[list(stat_names)]
    .rename(columns=stat_names)
    .astype({column: int for column in int_columns})
)

quarterbacks.head()

Unnamed: 0,name,completions,attempts,yards,interceptions,touchdowns
0,Matt Ryan,128,195,1376,7,5
1,Russell Wilson,101,170,1254,3,4
2,Justin Herbert,111,166,1250,2,9
3,Josh Allen,113,168,1227,3,10
4,Jared Goff,92,151,1126,3,11


In [7]:
# Check the result of the integer casts: the stat columns should report an
# integer dtype and only `name` should remain object.
quarterbacks.dtypes

name             object
completions       int64
attempts          int64
yards             int64
interceptions     int64
touchdowns        int64
dtype: object

In [8]:
# Render every row of `quarterbacks` as one INSERT statement targeting the
# table named 'quarterbacks'.
insert_sql = build_bulk_insert_sql(quarterbacks, 'quarterbacks')

In [9]:
# NOTE: the table has no season column, so it cannot tell one season's rows from
# another's. That is acceptable here only because just the current season is loaded.
create_table_sql = """
  CREATE TABLE quarterbacks(
    name VARCHAR(200) NOT NULL,
    completions NUMERIC NOT NULL,
    attempts NUMERIC NOT NULL,
    yards NUMERIC NOT NULL,
    interceptions NUMERIC NOT NULL,
    touchdowns NUMERIC NOT NULL
  );
"""

In [10]:
# Rebuild the database file from scratch on every run, so re-executing this cell
# never fails on an existing table or inserts the rows a second time.
db_path = '../../data/nfl/quarterbacks.sqlite3'

if os.path.exists(db_path):
    os.remove(db_path)

# `with conn:` only commits (or rolls back on error); it does not close the
# connection. closing() releases the file handle when the block ends.
with closing(sqlite3.connect(db_path)) as conn:
    with conn:
        cur = conn.cursor()
        cur.execute(create_table_sql)
        cur.execute(insert_sql)

In [11]:
# Sanity check: read back and print the first five rows of the quarterbacks table.
# closing() is needed because a sqlite3 connection used as a context manager is
# not closed when the block exits.
with closing(sqlite3.connect('../../data/nfl/quarterbacks.sqlite3')) as conn:
    cur = conn.cursor()
    cur.execute('select * from quarterbacks;')

    # fetchmany(5) pulls only the rows that are printed, not the whole table.
    for row in cur.fetchmany(5):
        print(row)

('Matt Ryan', 128, 195, 1376, 7, 5)
('Russell Wilson', 101, 170, 1254, 3, 4)
('Justin Herbert', 111, 166, 1250, 2, 9)
('Josh Allen', 113, 168, 1227, 3, 10)
('Jared Goff', 92, 151, 1126, 3, 11)
