### Imports

In [1]:
import os
import re

import numpy as np
import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup

### Load FCS stadiums from Wikipedia and make a dataframe

In [2]:
# Wikipedia page that get_fcs_stadiums downloads and parses for FCS stadium data.
url = 'https://en.wikipedia.org/wiki/List_of_NCAA_Division_I_FCS_football_stadiums'
def get_fcs_stadiums(url, n_columns=10, timeout=30):
    """
    Scrapes information from the wikipedia website and returns a dataframe of all stadium names and information

    The text of every <td> cell on the page is collected in document order and
    the flat list is folded into rows of n_columns values. Rows are therefore
    only aligned while each table row contributes exactly n_columns cells.

    INPUTS:
    url - a string of the desired website for scraping
    n_columns - how many cells make up one row of the result (default 10)
    timeout - seconds to wait on the server before giving up (default 30)

    OUTPUTS:
    fcs_stadium_df - a dataframe of all of the stadium information; each value is
                     the text of one <td> cell and the columns are unnamed (0..n_columns-1)

    RAISES:
    requests.HTTPError - if the server answers with an error status
    ValueError - if no cells were found or their count is not a multiple of n_columns
    """
    # Without a timeout, requests may block indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as stadium data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")
    records = [cell.text for cell in soup.find_all("td")]
    if not records or len(records) % n_columns != 0:
        raise ValueError(
            'Scraped {} <td> cells from {}, which cannot be arranged into rows of {}'
            .format(len(records), url, n_columns)
        )
    stadiums_array = np.array(records).reshape(-1, n_columns)
    fcs_stadium_df = pd.DataFrame(stadiums_array)
    return fcs_stadium_df

# Download and parse the page now; the returned frame still has unnamed columns.
fcs_stadiums = get_fcs_stadiums(url)

##### Change column names and clean dataframe

In [3]:
# Give the ten positional columns produced by the scrape readable names.
fcs_column_names = [
    'PIC', 'STADIUM_NAME', 'CITY', 'STATE', 'TEAM',
    'CONFERENCE', 'CAPACITY', 'RECORD', 'BUILT', 'EXPANDED',
]
fcs_stadiums.columns = fcs_column_names
fcs_stadiums.head(n=5)

Unnamed: 0,PIC,STADIUM_NAME,CITY,STATE,TEAM,CONFERENCE,CAPACITY,RECORD,BUILT,EXPANDED
0,,Ace W. Mumford Stadium,Baton Rouge,Louisiana,Southern Jaguars,SWAC,"7004285000000000000♠28,500","7004320000000000000♠32,000+",1928,"1980, 2009"
1,,Aggie Stadium,Davis,California,UC Davis Aggies,Big Sky,"7004108490000000000♠10,849",,2007,
2,,Aggie Stadium,Greensboro,North Carolina,North Carolina A&T Aggies,MEAC,"7004215000000000000♠21,500",,1981,
3,,Alerus Center,Grand Forks,North Dakota,North Dakota Fighting Hawks,Big Sky,"7004122830000000000♠12,283[1]",,2001[1],
4,,Alex G. Spanos Stadium,San Luis Obispo,California,Cal Poly Mustangs,Big Sky,"7004110750000000000♠11,075[2]",,2006[2],


In [4]:
# Narrow the frame to the three columns kept for the combined dataset.
fcs_keep_columns = ['TEAM', 'CONFERENCE', 'CAPACITY']
fcs_stadiums = fcs_stadiums[fcs_keep_columns]
fcs_stadiums.tail(n=5)

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
121,Liberty Flames,Big South,"7004192000000000000♠19,200[104]"
122,Yale Bulldogs,Ivy,"7004614460000000000♠61,446[106]"
123,William & Mary Tribe,CAA,"7004122590000000000♠12,259[107]"
124,Big Sky,"7004160000000000000♠16,000","7004176000000000000♠17,600\n(November 18, 1989..."
125,"7004142150000000000♠14,215",,1945 (?)


In [5]:
# The last two scraped rows (labels 124 and 125) are misaligned: capacity
# strings sit in the TEAM/CONFERENCE columns, so they are not real teams.
# Dropping by label in one call with errors='ignore' removes the same rows as
# two positional drops would, but the cell can be re-run safely because
# labels that are already gone are simply skipped.
misaligned_row_labels = [124, 125]
fcs_stadiums = fcs_stadiums.drop(misaligned_row_labels, errors='ignore')
fcs_stadiums.tail()

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
119,East Tennessee State Buccaneers,Southern,"7003769400000000000♠7,694"
120,Howard Bison,MEAC,"7004100000000000000♠10,000"
121,Liberty Flames,Big South,"7004192000000000000♠19,200[104]"
122,Yale Bulldogs,Ivy,"7004614460000000000♠61,446[106]"
123,William & Mary Tribe,CAA,"7004122590000000000♠12,259[107]"


### Write a function to clean the extra characters in the capacities

In [6]:
def capacity_cleaning(df):
    """
    Cleans a dataframe column consisting of extra characters from scraping

    Each scraped CAPACITY value looks like '7004285000000000000♠28,500[1]': a long
    numeric sort key, the '♠' marker, the visible number, and optionally a
    footnote or extra text. Only the visible number is kept, with its thousands
    separators removed. Values that carry no sort key or no comma (including
    values already cleaned by an earlier call) are accepted as well, so the
    function can be applied repeatedly.

    INPUTS:
    df - the dataframe needing cleaning; must have a 'CAPACITY' column

    OUTPUTS:
    df - the original dataframe (modified in place) after the capacity values have
         been properly cleaned; capacities are stored as digit-only strings

    RAISES:
    ValueError - if a CAPACITY value contains no number at all
    """
    sort_key_marker = '\u2660'  # the '♠' that ends the hidden sort key
    # Prefer a comma-grouped number ("1,000,000"); otherwise take a plain run of digits.
    number_regex = re.compile(r'\d{1,3}(?:,\d{3})+|\d+')

    cleaned_capacities = []
    for capacity in df['CAPACITY'].values:
        text = str(capacity)
        # Discard the sort key so its digits are never mistaken for the capacity.
        _sort_key, marker, visible_text = text.partition(sort_key_marker)
        if marker:
            text = visible_text
        match = number_regex.search(text)
        if match is None:
            raise ValueError('No capacity number found in {!r}'.format(capacity))
        cleaned_capacities.append(match.group().replace(',', ''))

    df['CAPACITY'] = cleaned_capacities
    return df

# capacity_cleaning rewrites CAPACITY on the frame it receives and returns that
# same frame, so fcs_final is the cleaned fcs_stadiums.
fcs_final = capacity_cleaning(df=fcs_stadiums)
fcs_final.head(n=5)

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
0,Southern Jaguars,SWAC,28500
1,UC Davis Aggies,Big Sky,10849
2,North Carolina A&T Aggies,MEAC,21500
3,North Dakota Fighting Hawks,Big Sky,12283
4,Cal Poly Mustangs,Big Sky,11075


In [7]:
# Check the end of the cleaned FCS frame as well as the start.
fcs_final.tail(n=5)

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
119,East Tennessee State Buccaneers,Southern,7694
120,Howard Bison,MEAC,10000
121,Liberty Flames,Big South,19200
122,Yale Bulldogs,Ivy,61446
123,William & Mary Tribe,CAA,12259


### Load FBS stadiums from Wikipedia and make a dataframe

In [8]:
# Wikipedia page that get_fbs_stadiums downloads and parses for FBS stadium data.
# NOTE: this rebinds the same `url` name used earlier for the FCS page.
url = 'https://en.wikipedia.org/wiki/List_of_NCAA_Division_I_FBS_football_stadiums'
def get_fbs_stadiums(url, n_columns=11, n_trailing_cells=18, timeout=30):
    """
    Scrapes information from the wikipedia website and returns a dataframe of all stadium names and information

    The text of every <td> cell on the page is collected in document order, the
    last n_trailing_cells cells are discarded, and the remaining flat list is
    folded into rows of n_columns values.

    INPUTS:
    url - a string of the desired website for scraping
    n_columns - how many cells make up one row of the result (default 11)
    n_trailing_cells - how many cells at the end of the page to discard before
                       reshaping (default 18; presumably cells that are not part
                       of the stadium table - TODO confirm against the page)
    timeout - seconds to wait on the server before giving up (default 30)

    OUTPUTS:
    fbs_stadium_df - a dataframe of all of the stadium information; each value is
                     the text of one <td> cell and the columns are unnamed (0..n_columns-1)

    RAISES:
    requests.HTTPError - if the server answers with an error status
    ValueError - if no cells remain or their count is not a multiple of n_columns
    """
    # Without a timeout, requests may block indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as stadium data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")
    records = [cell.text for cell in soup.find_all("td")]
    # Computing the end index explicitly keeps n_trailing_cells=0 meaningful
    # (a literal records[:-0] would be an empty list).
    keep_count = max(len(records) - n_trailing_cells, 0)
    records = records[:keep_count]
    if not records or len(records) % n_columns != 0:
        raise ValueError(
            'Kept {} <td> cells from {}, which cannot be arranged into rows of {}'
            .format(len(records), url, n_columns)
        )
    stadiums_array = np.array(records).reshape(-1, n_columns)
    fbs_stadium_df = pd.DataFrame(stadiums_array)
    return fbs_stadium_df

# Download and parse the page now; the returned frame still has unnamed columns.
fbs_stadiums = get_fbs_stadiums(url)

##### Change column names and clean dataframe

In [9]:
# Give the eleven positional columns produced by the scrape readable names.
fbs_column_names = [
    'PIC', 'STADIUM_NAME', 'CITY', 'STATE', 'TEAM', 'CONFERENCE',
    'CAPACITY', 'RECORD', 'BUILT', 'EXPANDED', 'SURFACE',
]
fbs_stadiums.columns = fbs_column_names
fbs_stadiums.head(n=5)

Unnamed: 0,PIC,STADIUM_NAME,CITY,STATE,TEAM,CONFERENCE,CAPACITY,RECORD,BUILT,EXPANDED,SURFACE
0,,Aggie Memorial Stadium,Las Cruces,NM,New Mexico State,Sun Belt,"7004303430000000000♠30,343[2]","7004329930000000000♠32,993\n(September 26, 199...",1978[2],2005[2],S5-M Synthetic Turf
1,,Alamodome,San Antonio,TX,UTSA,C-USA,"7004650000000000000♠65,000","7004567430000000000♠56,743\n(September 3, 2011...",1993,,SportField
2,,Alaska Airlines Field at Husky Stadium,Seattle,WA,Washington,Pac-12,"7004705000000000000♠70,500[3]","7004761250000000000♠76,125\n(September 23, 199...",1920,2013,FieldTurf
3,,Albertsons Stadium,Boise,ID,Boise State,Mountain West,"7004363870000000000♠36,387[4]","7004368640000000000♠36,864\n(September 20, 201...",1970[4],2012[4],FieldTurf
4,,Allen E. Paulson Stadium,Statesboro,GA,Georgia Southern,Sun Belt,"7004250000000000000♠25,000","7004257350000000000♠25,735\n(September 17, 201...",1984,2014,


In [10]:
# Keep only the columns used downstream. The explicit .copy() makes the result
# an independent frame, so the in-place CAPACITY assignment that
# capacity_cleaning performs on it does not raise pandas' SettingWithCopyWarning.
fbs_stadiums = fbs_stadiums[['TEAM', 'CONFERENCE', 'CAPACITY']].copy()
fbs_stadiums.head()

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
0,New Mexico State,Sun Belt,"7004303430000000000♠30,343[2]"
1,UTSA,C-USA,"7004650000000000000♠65,000"
2,Washington,Pac-12,"7004705000000000000♠70,500[3]"
3,Boise State,Mountain West,"7004363870000000000♠36,387[4]"
4,Georgia Southern,Sun Belt,"7004250000000000000♠25,000"


In [12]:
# capacity_cleaning rewrites CAPACITY on the frame it receives and returns that
# same frame, so fbs_final is the cleaned fbs_stadiums.
fbs_final = capacity_cleaning(df=fbs_stadiums)
fbs_final.head(n=5)

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
0,New Mexico State,Sun Belt,30343
1,UTSA,C-USA,65000
2,Washington,Pac-12,70500
3,Boise State,Mountain West,36387
4,Georgia Southern,Sun Belt,25000


In [13]:
# Show the cleaned FCS frame again for comparison with the FBS frame.
fcs_final.head(n=5)

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
0,Southern Jaguars,SWAC,28500
1,UC Davis Aggies,Big Sky,10849
2,North Carolina A&T Aggies,MEAC,21500
3,North Dakota Fighting Hawks,Big Sky,12283
4,Cal Poly Mustangs,Big Sky,11075


### Combine the FCS and FBS dataframes and pickle the combined dataframe

In [14]:
# Stack the FCS rows on top of the FBS rows and renumber the index from 0.
frames_to_stack = [fcs_final, fbs_final]
capacities_combined_df = pd.concat(frames_to_stack, ignore_index=True)

In [15]:
# The combined frame starts with the FCS rows.
capacities_combined_df.head(n=5)

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
0,Southern Jaguars,SWAC,28500
1,UC Davis Aggies,Big Sky,10849
2,North Carolina A&T Aggies,MEAC,21500
3,North Dakota Fighting Hawks,Big Sky,12283
4,Cal Poly Mustangs,Big Sky,11075


In [19]:
# Spot-check: look up the row(s) whose TEAM contains 'Notre Dame'.
is_notre_dame = capacities_combined_df['TEAM'].str.contains('Notre Dame')
capacities_combined_df[is_notre_dame]

Unnamed: 0,TEAM,CONFERENCE,CAPACITY
214,Notre Dame,Independent,80795


In [20]:
# Output directory for the pickle. It defaults to the original location but can
# be redirected with the STADIUM_CONF_PATH environment variable, so the notebook
# also runs on machines that do not have that home directory.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenation below relies on.
stadium_conf_path = os.path.join(
    os.environ.get('STADIUM_CONF_PATH', '/Users/murdock/Documents/PDF_files/'), '')
capacities_combined_df.to_pickle(stadium_conf_path + 'stadiums_conf_df.pkl')