In [1]:
# Import Dependencies
from bs4 import BeautifulSoup
from pprint import pprint
from splinter import Browser

import pandas as pd
import numpy as np

In [2]:
# Load the list of schools from the local CSV file.
# TODO: add a reference to the GitHub script that generates this file.
# The lookup performed later in this notebook reads its `school` and `link` columns.
schools_csv_path = 'resources/schools.csv'
school_list_df = pd.read_csv(schools_csv_path)

In [3]:
# Get user inputs for school and year
# Surrounding whitespace is stripped so a stray space does not break the exact-match lookup below
school = input('Select the school to analyze (capitalize first letter of each word) ').strip()

# Years for analysis (int() raises ValueError when the text is not a whole number)
year_i = int(input('Starting Year (YYYY format)'))
year_f = int(input('Ending Year (type in same year for single year analysis)'))

# A reversed range would silently produce no urls and, later, empty output files
if year_f < year_i:
    raise ValueError(f'Ending year {year_f} is earlier than starting year {year_i}')

# Generate the url list
base_url = 'https://www.sports-reference.com'

# Create the schools url portion
# Fail with a clear message instead of an IndexError when the school is not in the list
matching_links = school_list_df.loc[school_list_df['school'] == school, 'link']
if matching_links.empty:
    raise ValueError(
        f'School "{school}" was not found in the schools list; '
        'check the spelling and capitalization'
    )
school_url = matching_links.values[0]

# Create the inclusive range of years (year_i through year_f)
years = np.arange(year_i, year_f + 1, 1)

# Create a list with one gamelog url per year
# NOTE(review): assumes each `link` value starts and ends with '/' - confirm against schools.csv
urls = [f'{base_url}{school_url}{year}/gamelog/' for year in years]

print(f'You selected {school} from years {year_i} through {year_f}')

Select the school to analyze (capitalize first letter of each word) Nebraska
Starting Year (YYYY format)2020
Ending Year (type in same year for single year analysis)2021
You selected Nebraska from years 2020 through 2021


In [4]:
# Launch a splinter-controlled Chrome session; the scraping cells below reuse it
browser = Browser(driver_name='chrome')

In [7]:
# Script to automate browsing for Offense
# Visits every url in `urls`, collects the <td> text of each game row in the
# table with id "offense", and writes the combined rows to resources/offense.csv
combined_data = []

# Visit each year in the url list
for url in urls:

    # Visit the page and create the soup object
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Scrape the offensive table from the page
    table = soup.find('table', {'class': 'sortable', 'id': 'offense'})

    # soup.find returns None when the page has no such table; report and move on
    # instead of crashing with an AttributeError on table.find_all
    if table is None:
        print(f'No offense table found at {url}; skipping')
        continue

    # Iterate through the rows in the table
    for row in table.find_all('tr'):

        # Check if the row belongs to the <tfoot> section and exclude
        if row.find_parent('tfoot'):
            continue

        # Collect the text of every <td> cell in the current row
        row_data = [cell.get_text() for cell in row.find_all('td')]

        # Rows without any <td> cells (e.g. header rows made of <th> cells) carry
        # no game data and would otherwise become all-empty rows in the DataFrame
        if not row_data:
            continue

        combined_data.append(row_data)

## Create a Pandas DataFrame by using the list of rows and a list of the column names
columns = ['Date', 'Home_Away', 'Opponent', 'Score', 'Passing Cmp', 
           'Passing Att', 'Passing Pct', 'Passing Yds', 'Passing TD', 'Rushing Att', 
           'Rushing Yds', 'Rushing Avg', 'Rushing TD', 'Total Plays', 'Total Yds', 
           'Total Avg', 'First Down Pass', 'First Down Rush', 'First Down Pen', 'First Down Tot',
           'Penalties', 'Penalty Yds', 'Fum', 'Int', 'TO'
]

offense_df = pd.DataFrame(combined_data, columns=columns)

# Print the Files to a .csv
offense_df.to_csv('resources/offense.csv', index=False)

In [None]:
# Script to automate browsing for Defense
# Visits every url in `urls`, collects the <td> text of each game row in the
# table with id "defense", and writes the combined rows to resources/defense.csv
combined_data = []

# Visit each year in the url list
for url in urls:

    # Visit the page and create the soup object
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Scrape the defensive table
    table = soup.find('table', {'class': 'sortable', 'id': 'defense'})

    # soup.find returns None when the page has no such table; report and move on
    # instead of crashing with an AttributeError on table.find_all
    if table is None:
        print(f'No defense table found at {url}; skipping')
        continue

    # Iterate through the rows in the table
    for row in table.find_all('tr'):

        # Check if the row belongs to the <tfoot> section and exclude
        if row.find_parent('tfoot'):
            continue

        # Collect the text of every <td> cell in the current row
        row_data = [cell.get_text() for cell in row.find_all('td')]

        # Rows without any <td> cells (e.g. header rows made of <th> cells) carry
        # no game data and would otherwise become all-empty rows in the DataFrame
        if not row_data:
            continue

        combined_data.append(row_data)

## Create a Pandas DataFrame by using the list of rows and a list of the column names
columns = ['Date', 'Home_Away', 'Opponent', 'Score', 'Passing Cmp', 
           'Passing Att', 'Passing Pct', 'Passing Yds', 'Passing TD', 'Rushing Att', 
           'Rushing Yds', 'Rushing Avg', 'Rushing TD', 'Total Plays', 'Total Yds', 
           'Total Avg', 'First Down Pass', 'First Down Rush', 'First Down Pen', 'First Down Tot',
           'Penalties', 'Penalty Yds', 'Fum', 'Int', 'TO'
]

defense_df = pd.DataFrame(combined_data, columns=columns)

# Print the Files to a .csv
defense_df.to_csv('resources/defense.csv', index=False)