In [1]:
#Dependencies 
import pandas as pd 
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

In [2]:
# Page holding the 2018 NFL attendance figures; pandas scrapes the HTML tables found on it.
data = "https://www.pro-football-reference.com/years/2018/attendance.htm"
nfl_attendance = pd.read_html(io=data)

In [3]:
# Keep the first scraped table as the attendance frame.
nfl_attendance = nfl_attendance[0]
# Print each column label on its own line to see what is available.
for column_label in nfl_attendance.columns:
    print(column_label)

Tm
Total
Home
Away
Week 1
Week 2
Week 3
Week 4
Week 5
Week 6
Week 7
Week 8
Week 9
Week 10
Week 11
Week 12
Week 13
Week 14
Week 15
Week 16
Week 17


In [4]:
# Remove every per-week column (label contains 'Week'); only the season totals are kept.
weekly_columns = [label for label in nfl_attendance.columns if 'Week' in label]
nfl_attendance = nfl_attendance.drop(columns=weekly_columns)
nfl_attendance.head()

Unnamed: 0,Tm,Total,Home,Away
0,Arizona Cardinals,1018127,496111.0,522016.0
1,Atlanta Falcons,1119143,583184.0,535959.0
2,Baltimore Ravens,1053383,563451.0,489932.0
3,Buffalo Bills,1072899,519695.0,553204.0
4,Carolina Panthers,1102756,590182.0,512574.0


In [5]:
# Discard the home/away split so only the team and its total attendance remain.
nfl_attendance = nfl_attendance.drop(columns=["Home", "Away"])
nfl_attendance.head()

Unnamed: 0,Tm,Total
0,Arizona Cardinals,1018127
1,Atlanta Falcons,1119143
2,Baltimore Ravens,1053383
3,Buffalo Bills,1072899
4,Carolina Panthers,1102756


In [6]:
# Relabel the two remaining columns (assigned by position) with snake_case names for the ETL steps.
attendance_labels = ["team", "total_attendance"]
nfl_attendance.columns = attendance_labels
nfl_attendance.head()

Unnamed: 0,team,total_attendance
0,Arizona Cardinals,1018127
1,Atlanta Falcons,1119143
2,Baltimore Ravens,1053383
3,Buffalo Bills,1072899
4,Carolina Panthers,1102756


In [7]:
# Display the attendance frame to check its ordering: the team rows shown are
# already in alphabetical order, so no sort is applied here.
nfl_attendance

Unnamed: 0,team,total_attendance
0,Arizona Cardinals,1018127
1,Atlanta Falcons,1119143
2,Baltimore Ravens,1053383
3,Buffalo Bills,1072899
4,Carolina Panthers,1102756
5,Chicago Bears,1045568
6,Cincinnati Bengals,911289
7,Cleveland Browns,1045441
8,Dallas Cowboys,1303393
9,Denver Broncos,1092324


In [8]:
# Attendance is extracted and needs no further transformation for now.
# Next source: the stadium comparison page, whose first table holds the capacities
# that will later be combined with the attendance data.
url = "https://www.stadiumsofprofootball.com/comparisons/"
nfl_capacities = pd.read_html(url)[0]
nfl_capacities

Unnamed: 0,0,1,2,3,4,5
0,Name,Team(s),Capacity,Opened,Turf,Cost
1,Lambeau Field,Green Bay Packers,80735,9/29/1957,Grass,"$960,000"
2,RingCentral Coliseum,Oakland Raiders,53250,9/18/1966,Grass,$25.5 Million
3,Arrowhead Stadium,Kansas City Chiefs,76416,8/12/1972,Grass,$43 Million
4,New Era Field,Buffalo Bills,73967,8/17/1973,FieldTurf,$22 Million
5,Superdome,New Orleans Saints,76468,9/28/1975,FieldTurf,$134 Million
6,Hard Rock Stadium,Miami Dolphins,65326,8/16/1987,Grass,$115 Million
7,TIAA Bank Field,Jacksonville Jaguars,67264,8/18/1995,Grass,$134 Million
8,Bank of America Stadium,Carolina Panthers,73778,9/14/1996,Grass,$242 Million
9,FedEx Field,Washington Redskins,79000,9/14/1997,Grass,$250 Million


In [9]:
# Check what type the capacity column labels are before dropping columns by label.
for column_label in nfl_capacities.columns:
    print(type(column_label))

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


In [10]:
# Drop the columns that are not needed (labels 0, 3, 4 and 5), including the
# stadium name, leaving only columns 1 and 2.
nfl_capacities = nfl_capacities.drop(columns=[0, 3, 4, 5])
nfl_capacities.head()

Unnamed: 0,1,2
0,Team(s),Capacity
1,Green Bay Packers,80735
2,Oakland Raiders,53250
3,Kansas City Chiefs,76416
4,Buffalo Bills,73967


In [11]:
# Replace the integer column labels (assigned by position) with descriptive names.
capacity_labels = ["team", "stadium_capacity"]
nfl_capacities.columns = capacity_labels
nfl_capacities.head()

Unnamed: 0,team,stadium_capacity
0,Team(s),Capacity
1,Green Bay Packers,80735
2,Oakland Raiders,53250
3,Kansas City Chiefs,76416
4,Buffalo Bills,73967


In [12]:
# Drop the scraped header row, which repeats the column titles as data
# ("Team(s)" / "Capacity"). Filtering on the header label instead of slicing
# with iloc[1:] keeps this cell safe to re-run: a positional slice would
# silently remove one more real team row on every extra execution.
nfl_capacities = nfl_capacities[nfl_capacities["team"] != "Team(s)"]
nfl_capacities

Unnamed: 0,team,stadium_capacity
1,Green Bay Packers,80735.0
2,Oakland Raiders,53250.0
3,Kansas City Chiefs,76416.0
4,Buffalo Bills,73967.0
5,New Orleans Saints,76468.0
6,Miami Dolphins,65326.0
7,Jacksonville Jaguars,67264.0
8,Carolina Panthers,73778.0
9,Washington Redskins,79000.0
10,Baltimore Ravens,71008.0


In [13]:
# Sort the capacity rows alphabetically by team and renumber them from 0.
# drop=True discards the old index instead of inserting it as an extra "index"
# column, so re-running this cell leaves the frame's columns unchanged.
nfl_capacities = nfl_capacities.sort_values(by="team").reset_index(drop=True)
# Capacity values in team-sorted order. The Series is keyed only by row position,
# so it is only meaningful next to another table if that table lists the same
# teams in the same order.
nfl_capacities_cap = nfl_capacities["stadium_capacity"]
nfl_capacities_cap

0     63400
1     71000
2     71008
3     73967
4     73778
5     61500
6     65515
7     68000
8     80000
9     76125
10    65000
11    80735
12    71500
13    63000
14    67264
15    76416
16    30000
17    77500
18    65326
19    66200
20    68756
21    76468
22    82500
23    53250
24    69176
25    65500
26    68500
27    67000
28    65890
29    69143
30    79000
31      NaN
Name: stadium_capacity, dtype: object

In [14]:
# Attach stadium capacity by matching on team name instead of by row position.
# The capacity table does not line up row-for-row with the attendance table
# (its sorted capacity column ends in a NaN entry), so a positional assignment
# can give a team another team's capacity.
capacity_by_team = (
    nfl_capacities
    .dropna(subset=["team"])          # rows without a team name cannot be matched
    .drop_duplicates(subset="team")   # Series.map needs a unique lookup index
    .set_index("team")["stadium_capacity"]
)
# Teams with no exact name match in the capacity table get NaN here.
# NOTE(review): a stadium shared by several teams may be listed under one
# combined label in the capacity table; verify those rows after this step.
nfl_attendance["stadium_capacity"] = nfl_attendance["team"].map(capacity_by_team)
nfl_attendance

Unnamed: 0,team,total_attendance,stadium_capacity
0,Arizona Cardinals,1018127,63400.0
1,Atlanta Falcons,1119143,71000.0
2,Baltimore Ravens,1053383,71008.0
3,Buffalo Bills,1072899,73967.0
4,Carolina Panthers,1102756,73778.0
5,Chicago Bears,1045568,61500.0
6,Cincinnati Bengals,911289,65515.0
7,Cleveland Browns,1045441,68000.0
8,Dallas Cowboys,1303393,80000.0
9,Denver Broncos,1092324,76125.0


In [15]:
# Preview the first rows of the attendance frame, which now carries the stadium_capacity column.
nfl_attendance.head()

Unnamed: 0,team,total_attendance,stadium_capacity
0,Arizona Cardinals,1018127,63400
1,Atlanta Falcons,1119143,71000
2,Baltimore Ravens,1053383,71008
3,Buffalo Bills,1072899,73967
4,Carolina Panthers,1102756,73778


In [16]:
# Keep the combined attendance/capacity data under its own name. A copy is taken
# so that this frame is independent of nfl_attendance: with a plain assignment
# both names would refer to the same object and a later change to one would
# show up in the other.
nfl_attendance_capacity = nfl_attendance.copy()
nfl_attendance_capacity.head()

Unnamed: 0,team,total_attendance,stadium_capacity
0,Arizona Cardinals,1018127,63400
1,Atlanta Falcons,1119143,71000
2,Baltimore Ravens,1053383,71008
3,Buffalo Bills,1072899,73967
4,Carolina Panthers,1102756,73778


In [17]:
# The cleaned attendance/capacity data now lives in its own frame; write it to nfl.csv.
# Default to_csv settings are used, so the row index is written as the first column.
nfl_attendance_capacity.to_csv(path_or_buf="nfl.csv")