In [1]:
# Import libraries
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import requests
from bs4 import BeautifulSoup

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Functions
def scrape_batting_table(season_id):
    """Fetch the team batting table for one season from pointstreak.com.

    Args:
        season_id (str): Season identifier; it is inserted verbatim right after
            the ``?`` of the stats URL and copied into every returned row

    Returns:
        pd.DataFrame: One row per table row that contains ``td`` cells, with the
            ``th`` texts as column names plus a trailing ``season_id`` column.
            None if the page has no ``#bat_first > table:nth-child(1)`` element.
    """
    url = f'http://pointstreak.com/baseball/stats.html?{season_id}&view=teambatting'
    response = requests.get(url)
    document = BeautifulSoup(response.content, 'html.parser')

    matches = document.select('#bat_first > table:nth-child(1)')
    if not matches:
        print(f'No table found for season_id {season_id}')
        return None
    batting_table = matches[0]

    # Column names come from the header cells; season_id is added as the last column
    column_names = [cell.text.strip() for cell in batting_table.find_all('th')]
    column_names.append('season_id')

    records = []
    for table_row in batting_table.find_all('tr'):
        values = [cell.text.strip() for cell in table_row.find_all('td')]
        # Rows without any td (for example the header row) carry no data
        if not values:
            continue
        records.append(values + [season_id])

    return pd.DataFrame(records, columns=column_names)

def scrape_team_tables(season_id):
    """Fetch the team batting table for one season from pointstreak.com.

    NOTE(review): this currently requests the same ``view=teambatting`` URL and
    parses the same ``#bat_first`` table as ``scrape_batting_table``.

    Args:
        season_id (str): Season identifier; it is inserted verbatim right after
            the ``?`` of the stats URL and copied into every returned row

    Returns:
        pd.DataFrame: The parsed table with a trailing ``season_id`` column, or
            None if the page has no ``#bat_first > table:nth-child(1)`` element
    """
    response = requests.get(
        f'http://pointstreak.com/baseball/stats.html?{season_id}&view=teambatting'
    )
    parsed = BeautifulSoup(response.content, 'html.parser')

    found = parsed.select('#bat_first > table:nth-child(1)')
    if not found:
        print(f'No table found for season_id {season_id}')
        return None
    stats_table = found[0]

    # Header texts become the column names, followed by the season_id column
    header_names = [th.text.strip() for th in stats_table.find_all('th')] + ['season_id']

    # Keep only rows that actually contain td cells and tag each with the season
    data_rows = []
    for tr in stats_table.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        if cells:
            data_rows.append([*cells, season_id])

    return pd.DataFrame(data_rows, columns=header_names)



# %%
# Function to perform the scraping
def scrape_page(url, scrape_func):
    """Scrape every season listed on a page and combine the results.

    The page at ``url`` must contain a ``<select id="seasonid">`` element. Each
    of its ``<option>`` values is passed to ``scrape_func`` and the frames that
    come back are concatenated.

    Args:
        url (str): The url of the page holding the season drop-down
        scrape_func (function): Called once per option value; expected to return
            a DataFrame with a ``season_id`` column, or None when nothing was found

    Returns:
        pd.DataFrame: All scraped rows, with ``season_id`` replaced by a
            ``season`` column holding the text of the matching option. When no
            season produced a frame, an empty frame with only a ``season``
            column is returned.

    Raises:
        ValueError: If the page has no select element with id ``seasonid``
    """
    # Get the page and parse it
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get the select element with the id seasonid
    select = soup.find('select', {'id': 'seasonid'})
    if select is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError(f'No select element with id "seasonid" found at {url}')

    # Map each option value (the season_id) to the option text (the season name)
    seasons = {option['value']: option.text for option in select.find_all('option')}
    print(seasons)

    # Collect the per-season frames and concatenate once at the end; scrape
    # functions return None when a season has no table, so skip those
    frames = []
    for season in seasons:
        temp_df = scrape_func(season)
        if temp_df is not None:
            frames.append(temp_df)

    # Nothing was scraped: there is no season_id column to map, so return early
    if not frames:
        return pd.DataFrame(columns=['season'])

    df = pd.concat(frames)

    # Look up the season name for every row, then drop the raw season_id column
    df['season'] = df['season_id'].map(seasons)
    return df.drop('season_id', axis=1)

def scrape_pitching_table(season_id):
    """Fetch the team pitching table for one season from pointstreak.com.

    Args:
        season_id (str): Season identifier; it is inserted verbatim right after
            the ``?`` of the stats URL and copied into every returned row

    Returns:
        pd.DataFrame: One row per table row that contains ``td`` cells, with the
            ``th`` texts as column names plus a trailing ``season_id`` column.
            None if the page has no ``#pitch_first > table:nth-child(1)`` element.
    """
    url = f'http://pointstreak.com/baseball/stats.html?{season_id}&view=teampitching'
    markup = requests.get(url).content
    document = BeautifulSoup(markup, 'html.parser')

    candidates = document.select('#pitch_first > table:nth-child(1)')
    if not candidates:
        print(f'No table found for season_id {season_id}')
        return None
    pitching_table = candidates[0]

    # Header cells give the column names; season_id is added as the final column
    columns = [th.text.strip() for th in pitching_table.find_all('th')]
    columns.append('season_id')

    # Extract the td texts of every row, then keep the non-empty ones
    all_rows = [
        [td.text.strip() for td in tr.find_all('td')]
        for tr in pitching_table.find_all('tr')
    ]
    records = [cells + [season_id] for cells in all_rows if cells]

    return pd.DataFrame(records, columns=columns)

def scrape_standings_table(season_id):
    """Fetch the standings table for one season from pointstreak.com.

    Args:
        season_id (str): Season identifier; it is inserted verbatim right after
            the ``?`` of the standings URL and copied into every returned row

    Returns:
        pd.DataFrame: One row per table row that contains ``td`` cells, with the
            ``th`` texts as column names plus a trailing ``season_id`` column.
            None if the page has no ``#psbb_standings > table:nth-child(1)`` element.
    """
    standings_url = f'http://pointstreak.com/baseball/standings.html?{season_id}&stype=l'
    response = requests.get(standings_url)
    parsed = BeautifulSoup(response.content, 'html.parser')

    hits = parsed.select('#psbb_standings > table:nth-child(1)')
    if not hits:
        print(f'No table found for season_id {season_id}')
        return None
    standings = hits[0]

    # Column names: every th text in the table, then the season_id column
    column_names = [header.text.strip() for header in standings.find_all('th')]
    column_names.append('season_id')

    # Data rows: every tr that has at least one td, tagged with the season_id
    records = []
    for line in standings.find_all('tr'):
        cells = [cell.text.strip() for cell in line.find_all('td')]
        if not cells:
            continue
        cells.append(season_id)
        records.append(cells)

    return pd.DataFrame(records, columns=column_names)

def scrape_qual_batters_table(season_id):
    """Fetch the individual batting table for one season from pointstreak.com.

    Args:
        season_id (str): Season identifier; it is inserted verbatim right after
            the ``?`` of the stats URL and copied into every returned row

    Returns:
        pd.DataFrame: One row per ``tr`` that contains ``td`` cells inside the
            first element matching ``#battingresults``, with the ``th`` texts as
            column names plus a trailing ``season_id`` column.
            None if the page has no element matching ``#battingresults``.
    """
    url = f'http://pointstreak.com/baseball/stats.html?{season_id}&view=batting'
    response = requests.get(url)
    document = BeautifulSoup(response.content, 'html.parser')

    results = document.select('#battingresults')
    if not results:
        print(f'No table found for season_id {season_id}')
        return None
    batters = results[0]

    # Header texts first, season_id last
    field_names = [th.text.strip() for th in batters.find_all('th')] + ['season_id']

    # Collect the td texts per row; rows without td cells are dropped
    entries = []
    for tr in batters.find_all('tr'):
        stats = [td.text.strip() for td in tr.find_all('td')]
        if stats:
            entries.append([*stats, season_id])

    return pd.DataFrame(entries, columns=field_names)

def scrape_qual_pitchers_table(season_id):
    """Fetch the individual pitching table for one season from pointstreak.com.

    Args:
        season_id (str): Season identifier; it is inserted verbatim right after
            the ``?`` of the stats URL and copied into every returned row

    Returns:
        pd.DataFrame: One row per ``tr`` that contains ``td`` cells inside the
            first element matching ``#pitchingresults``, with the ``th`` texts as
            column names plus a trailing ``season_id`` column.
            None if the page has no element matching ``#pitchingresults``.
    """
    url = f'http://pointstreak.com/baseball/stats.html?{season_id}&view=pitching'
    markup = requests.get(url).content
    parsed = BeautifulSoup(markup, 'html.parser')

    results = parsed.select('#pitchingresults')
    if not results:
        print(f'No table found for season_id {season_id}')
        return None
    pitchers = results[0]

    # Build the column list from the header cells and append season_id
    field_names = [head.text.strip() for head in pitchers.find_all('th')]
    field_names.append('season_id')

    # Pull the td texts of each row and keep only rows that have any
    raw_rows = (
        [cell.text.strip() for cell in line.find_all('td')]
        for line in pitchers.find_all('tr')
    )
    entries = [stats + [season_id] for stats in raw_rows if stats]

    return pd.DataFrame(entries, columns=field_names)

In [8]:
def scrape_team_batting(season_id, team_id):
    """Scrape the batting table of one team for one season from pointstreak.com.

    Args:
        season_id (str): Value sent as the ``seasonid`` query parameter; it is
            also copied into every returned row
        team_id (str): Value sent as the ``teamid`` query parameter

    Returns:
        pd.DataFrame: One row per table row that contains ``td`` cells, with the
            ``th`` texts as column names plus a trailing ``season_id`` column.
            None if no matching table is found on the page.
    """
    # Create the path string using the team_id and season_id
    path = f'https://baseball.pointstreak.com/team_stats.html?teamid={team_id}&seasonid={season_id}'

    # Get the page
    page = requests.get(path)

    # Create the soup object
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find the first table whose class attribute is exactly this string; a
    # multi-class string passed to class_ is matched against the full attribute
    # value, so the order of the class names matters
    table = soup.find('table', class_ = 'table table-hover table-striped nova-stats-table nova-stats-table--fixed-first-col')

    # Fail safe if no table is found; name the team too so that failures for
    # different teams of the same season can be told apart
    if not table:
        print(f'No table found for season_id {season_id}, team_id {team_id}')
        return None

    # Get the headers from the table (th) and add the season_id column
    headers = [th.text.strip() for th in table.find_all('th')]
    headers.append('season_id')

    # Create an empty list to store the rows
    rows = []

    # Get the rows from the table (tr)
    for tr in table.find_all('tr'):
        # Get the data from the row (td); rows without td cells are skipped
        row = [td.text.strip() for td in tr.find_all('td')]
        if row:
            row.append(season_id)
            rows.append(row)
    return pd.DataFrame(rows, columns=headers)

# Smoke test: scrape the batting table for one hard-coded season/team pair
season_id = '33653'
team_id = '142674'

# NOTE(review): scrape_team_batting returns None when no table is found, in
# which case the .head() call below raises AttributeError
test_df = scrape_team_batting(season_id, team_id)
# Preview the first rows of the scraped table
test_df.head()

Unnamed: 0,Player,P,AVG,G,AB,R,H,2B,3B,HR,...,BB,HBP,SO,SF,SH,SB,CS,DP,E,season_id
0,"Morris, B",IF,0.342,31,117,20,40,4,2,1,...,15,3,17,1,0,19,4,1,10,33653
1,"Robertson, J",OF,0.333,3,3,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,33653
2,"Bourget, R",OF,0.327,31,107,32,35,10,1,2,...,14,6,23,2,1,11,3,3,4,33653
3,"Popik, T",OF,0.273,14,22,3,6,0,0,0,...,4,6,7,0,0,1,0,0,3,33653
4,"Browning, D",IF,0.256,24,82,15,21,3,0,0,...,11,4,15,1,1,5,3,1,4,33653
