In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

In [2]:
def scrape_baseball_reference(url, writer):
    """Download one awards page and write its AL and NL MVP voting rows.

    Args:
        url: Address of the page to fetch. It is also handed to
            ``scrape_table``, which derives the year column from it.
        writer: Object exposing ``writerow`` (e.g. a ``csv.writer``); it is
            passed through to ``scrape_table`` unchanged.

    Returns:
        None. When the request fails or the response status is not 200, a
        message is printed and nothing is written.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole multi-page run.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Treat connection-level failures like a bad status: report and skip.
        print(f"Failed to fetch page: {exc}")
        return
    if response.status_code != 200:
        print("Failed to fetch page")
        return

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # The AL and NL MVP tables are handled identically; a table that is not
    # found on the page is silently skipped.
    for table_id in ('AL_MVP_voting', 'NL_MVP_voting'):
        mvp_table = soup.find('table', {'id': table_id})
        if mvp_table:
            scrape_table(mvp_table, writer, url)

def scrape_table(table, writer, year_url):
    """Write one output row per data row of *table*, prefixed with the year.

    Args:
        table: Object whose ``find_all('tr')`` returns the table's rows; each
            row's ``find_all('td')`` returns cells exposing a ``text``
            attribute (presumably a BeautifulSoup ``Tag`` -- confirm at the
            call site).
        writer: Object exposing ``writerow`` (e.g. a ``csv.writer``).
        year_url: URL whose final underscore-separated segment starts with
            the year, e.g. ``.../awards_1983.shtml`` yields ``"1983"``.

    Returns:
        None. Rows are emitted through ``writer.writerow``.
    """
    # Text between the last underscore and the first following dot.
    year = year_url.split('_')[-1].split('.')[0]

    # The first row is always skipped, as before (it is treated as a header).
    for tr in table.find_all('tr')[1:]:
        cells = [td.text.strip() for td in tr.find_all('td')]
        if not cells:
            # Rows without any <td> (e.g. header rows built only from <th>)
            # would otherwise be written as a row holding nothing but the year.
            continue
        writer.writerow([year, *cells])

def gen_base_urls(start_yr, end_yr):
    """Return the awards-page URL for every year from start_yr to end_yr.

    Both endpoints are included; the list is empty when ``end_yr`` is
    smaller than ``start_yr``.
    """
    return [
        f"https://www.baseball-reference.com/awards/awards_{season}.shtml"
        for season in range(start_yr, end_yr + 1)
    ]
    
# Destination file for every scraped row
output_file = 'baseball_awards.csv'

# Header row: the year, followed by one label per scraped cell
column_headers = [
    "Year", "Name", "Tm", "Vote Pts", "1st Place", "Share", "WAR",
    "G", "AB", "R", "H", "HR", "RBI", "SB", "BB",
    "BA", "OBP", "SLG", "OPS",
    "W", "L", "ERA", "WHIP", "G", "GS", "SV", "IP", "H", "HR", "BB", "SO",
]

# newline='' lets the csv module control line endings itself
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(column_headers)

    # One awards page per season, 1983 through 2023 inclusive
    base_urls = gen_base_urls(1983, 2023)

    for base_url in base_urls:
        scrape_baseball_reference(base_url, writer)

        # The season is the text between the last underscore and the next dot
        year = base_url.rsplit('_', 1)[-1].partition('.')[0]
        print(f"Data scraped for {year}")

        # Pause between requests so the server is not overwhelmed
        time.sleep(3)  # Adjust as needed

Data scraped for 1983
Data scraped for 1984
Data scraped for 1985
Data scraped for 1986
Data scraped for 1987
Data scraped for 1988
Data scraped for 1989
Data scraped for 1990
Data scraped for 1991
Data scraped for 1992
Data scraped for 1993
Data scraped for 1994
Data scraped for 1995
Data scraped for 1996
Data scraped for 1997
Data scraped for 1998
Data scraped for 1999
Data scraped for 2000
Data scraped for 2001
Data scraped for 2002
Data scraped for 2003
Data scraped for 2004
Data scraped for 2005
Data scraped for 2006
Data scraped for 2007
Data scraped for 2008
Data scraped for 2009
Data scraped for 2010
Data scraped for 2011
Data scraped for 2012
Data scraped for 2013
Data scraped for 2014
Data scraped for 2015
Data scraped for 2016
Data scraped for 2017
Data scraped for 2018
Data scraped for 2019
Data scraped for 2020
Data scraped for 2021
Data scraped for 2022
Data scraped for 2023
