In [1]:
# Import Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
from pprint import pprint

import pandas as pd
import numpy as np

In [2]:
# Launch the Chrome-driven splinter session that every scraping cell below reuses
browser = Browser(driver_name='chrome')

In [3]:
# Seasons covered by the analysis: 2000 through 2022 inclusive
years = np.arange(2000, 2023, 1)

# One page per season for each of the three tables that get scraped.
# Each list is ordered the same way as `years`.
offense_urls = [
    f'https://www.sports-reference.com/cfb/years/{season}-team-offense.html'
    for season in years
]
defense_urls = [
    f'https://www.sports-reference.com/cfb/years/{season}-team-defense.html'
    for season in years
]
standings_urls = [
    f'https://www.sports-reference.com/cfb/years/{season}-standings.html'
    for season in years
]

In [4]:
####################
# Offense
####################

# Accumulates one record per table row across every yearly offense page
combined_data = []

# Visit each yearly offense page in turn
for url in offense_urls:

    # Visit the page and create the soup object from the rendered HTML
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # The stats table is taken to be the last <tbody> on the page
    tables = soup.find_all('tbody')
    if not tables:
        # Same exception type as the bare tables[-1] lookup, but it names the page
        raise IndexError(f'No <tbody> element found at {url}; the page may not have loaded')
    table = tables[-1]

    # The HTML snapshot is already taken, so cancel anything still loading
    browser.execute_script("window.stop();")

    # The season label is the text of the <span> inside the page's <h1>
    year = soup.find('h1').find('span').text

    # Iterate through the rows in the table
    for row in table.find_all('tr'):

        # Defensive: ignore any row that sits under a <tfoot>
        if row.find_parent('tfoot'):
            continue

        cells = row.find_all('td')

        # A row without <td> cells (presumably a header row repeated inside the
        # body) would become a record holding only the year, which pandas pads
        # into an otherwise empty line of the CSV -- skip it.
        if not cells:
            continue

        # Record layout: the season first, then the text of every data cell
        combined_data.append([year] + [cell.get_text() for cell in cells])

# Column labels, in the order the values appear in each record.
# NOTE: 'intceptions' (sic) is left as-is because it is the CSV header that
# readers of the file may already depend on.
columns = ['year', 'school', 'games', 'points', 'passing_cmp', 'passing_att', 'passing_pct', 'passing_yds', 'passing_td', 
           'rushing_att', 'rushing_yds', 'rushing_avg', 'rushing_td', 
           'total_plays', 'total_yds', 'total_avg', 
           'first_down_pass', 'first_down_rush', 'first_down_pen', 'first_down_total',
           'penalties', 'penalty_yds', 
           'fumbles', 'intceptions', 'turnovers' 
]

# Build the DataFrame once from the full list of records
all_offense_df = pd.DataFrame(combined_data, columns=columns)

# Write the combined offense table to a .csv
all_offense_df.to_csv('../resources/all_offense.csv', index=False)

In [5]:
####################
# Defense
####################

# Accumulates one record per table row across every yearly defense page
combined_data = []

# Visit each yearly defense page in turn
for url in defense_urls:

    # Visit the page and create the soup object from the rendered HTML
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # The defensive stats table is taken to be the last <tbody> on the page
    tables = soup.find_all('tbody')
    if not tables:
        # Same exception type as the bare tables[-1] lookup, but it names the page
        raise IndexError(f'No <tbody> element found at {url}; the page may not have loaded')
    table = tables[-1]

    # The HTML snapshot is already taken, so cancel anything still loading
    browser.execute_script("window.stop();")

    # The season label is the text of the <span> inside the page's <h1>
    year = soup.find('h1').find('span').text

    # Iterate through the rows in the table
    for row in table.find_all('tr'):

        # Defensive: ignore any row that sits under a <tfoot>
        if row.find_parent('tfoot'):
            continue

        cells = row.find_all('td')

        # A row without <td> cells (presumably a header row repeated inside the
        # body) would become a record holding only the year, which pandas pads
        # into an otherwise empty line of the CSV -- skip it.
        if not cells:
            continue

        # Record layout: the season first, then the text of every data cell
        combined_data.append([year] + [cell.get_text() for cell in cells])

# Column labels, in the order the values appear in each record. The labels are
# the same ones used for the offense table.
# NOTE: 'intceptions' (sic) is left as-is because it is the CSV header that
# readers of the file may already depend on.
columns = ['year', 'school', 'games', 'points', 'passing_cmp', 'passing_att', 'passing_pct', 'passing_yds', 'passing_td', 
           'rushing_att', 'rushing_yds', 'rushing_avg', 'rushing_td', 
           'total_plays', 'total_yds', 'total_avg', 
           'first_down_pass', 'first_down_rush', 'first_down_pen', 'first_down_total',
           'penalties', 'penalty_yds', 
           'fumbles', 'intceptions', 'turnovers' 
]

# Build the DataFrame once from the full list of records
all_defense_df = pd.DataFrame(combined_data, columns=columns)

# Write the combined defense table to a .csv
all_defense_df.to_csv('../resources/all_defense.csv', index=False)

In [8]:
####################
# Standings
####################

# Accumulates one record per table row across every yearly standings page
combined_data = []

# Visit each yearly standings page in turn
for url in standings_urls:

    # Visit the page and create the soup object from the rendered HTML
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # The standings table is taken to be the last <tbody> on the page
    tables = soup.find_all('tbody')
    if not tables:
        # Same exception type as the bare tables[-1] lookup, but it names the page
        raise IndexError(f'No <tbody> element found at {url}; the page may not have loaded')
    table = tables[-1]

    # The HTML snapshot is already taken, so cancel anything still loading
    browser.execute_script("window.stop();")

    # The season label is the text of the <span> inside the page's <h1>
    year = soup.find('h1').find('span').text

    # Iterate through the rows in the table
    for row in table.find_all('tr'):

        # Defensive: ignore any row that sits under a <tfoot>
        if row.find_parent('tfoot'):
            continue

        cells = row.find_all('td')

        # A row without <td> cells (presumably a header row repeated inside the
        # body) would become a record holding only the year, which pandas pads
        # into an otherwise empty line of the CSV -- skip it.
        if not cells:
            continue

        # Record layout: the season first, then the text of every data cell
        combined_data.append([year] + [cell.get_text() for cell in cells])

# Column labels, in the order the values appear in each record
columns = ['year', 'school', 'conference', 
           'wins', 'losses', 'winning_pct', 
           'conf_wins', 'conf_losses', 'conf_winning_pct',
           'ppg_offense', 'ppg_defense', 
           'SRS', 'SOS', 'ap_pre', 'ap_high', 'ap_post', 'notes' 
]

# Build the DataFrame once from the full list of records
standings_df = pd.DataFrame(combined_data, columns=columns)

# Write the combined standings table to a .csv
standings_df.to_csv('../resources/standings.csv', index=False)