In [44]:
import requests
from bs4 import BeautifulSoup
import csv
import time

In [45]:
# Index page that lists managers; passed to get_manager_urls in a later cell.
url = "https://www.baseball-reference.com/managers/"
# NOTE(review): this response is not read by any visible cell
# (get_manager_urls issues its own request for the same page) — confirm
# nothing else depends on it before removing the extra request.
response = requests.get(url)

In [46]:
def get_manager_urls(url, min_year=2007) -> dict:
    """Collect profile URLs of managers whose final season is recent enough.

    Downloads the page at ``url``, walks the rows of the ``sortable`` table
    inside the first ``table_container`` div and keeps every manager whose
    ``year_max`` cell is at least ``min_year``.

    Args:
        url: Address of the manager index page.
        min_year: Smallest ``year_max`` value (inclusive) a row must have to
            be kept. Defaults to 2007, the cutoff previously hard-coded here.

    Returns:
        Dict mapping the manager's link text to an absolute profile URL.
        Rows that share the same link text overwrite each other (last wins).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
        AttributeError: If the expected container/table is not on the page.
    """
    manager_urls = {}
    response = requests.get(url, timeout=30)
    # Fail with a clear HTTP error instead of a confusing AttributeError on
    # None when the server returns an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    table_rows = soup.find('div', 'table_container').find('table', 'sortable').find_all('tr')

    # Skip the first row (presumably the column header), as before.
    for row in table_rows[1:]:
        # Rows carrying the 'thead' class are not data rows.
        if 'thead' in row.get('class', []):
            continue

        year_max_td = row.find('td', {'data-stat': 'year_max'})
        if year_max_td is None:
            continue
        year_max = year_max_td.text.strip()
        # A blank or non-numeric cell cannot satisfy the cutoff; skip it
        # rather than letting int() raise.
        if not year_max.isdigit() or int(year_max) < min_year:
            continue

        manager_td = row.find('td', {'data-stat': 'manager'})
        manager_tag = manager_td.find('a') if manager_td is not None else None
        if manager_tag is None or not manager_tag.get('href'):
            continue

        manager_urls[manager_tag.text.strip()] = (
            "https://www.baseball-reference.com" + manager_tag['href']
        )

    return manager_urls

In [47]:
# Manager name -> profile URL for every manager kept by get_manager_urls.
managers_url = get_manager_urls(url)
# Show how many manager pages the scraping loop will visit.
print(len(managers_url))

122


In [48]:
# Former franchise names -> the name each is normalized to. Without this, a
# rename or relocation would look like a change of team when
# get_managerial_data counts consecutive seasons with the same team.
team_name_mapping = {
    'Anaheim Angels': 'Los Angeles Angels',
    'Los Angeles Angels of Anaheim': 'Los Angeles Angels',
    'Florida Marlins': 'Miami Marlins',
    'Tampa Bay Devil Rays': 'Tampa Bay Rays',
    'Montreal Expos': 'Washington Nationals',
    # Renamed ahead of the 2022 season.
    'Cleveland Indians': 'Cleveland Guardians',
}


def normalize_team_name(team_name):
    """Return the normalized franchise name for ``team_name``.

    Args:
        team_name: Team name as scraped from the page.

    Returns:
        The mapped name when ``team_name`` is a key of ``team_name_mapping``;
        otherwise ``team_name`` unchanged.
    """
    return team_name_mapping.get(team_name, team_name)

In [49]:
def get_managerial_data(manager_name, manager_url, max_year=2023) -> list:
    """Scrape one manager's per-season rows and tendency figures.

    Reads the ``all_manager_stats`` and ``all_manager_tendencies`` tables from
    the manager's page and pairs their body rows positionally.

    Args:
        manager_name: Name stored in every returned row.
        manager_url: Address of the manager's page.
        max_year: Latest season (inclusive) to keep. Defaults to 2023, the
            cutoff previously hard-coded here.

    Returns:
        A list of rows, one per kept season, each laid out as
        ``[Year, Team, Manager_name, Manager_team_career, Manager_MLB_career,
        steal_2nd, steal_3rd, sac_bunts, intentional_walks,
        Substitutions_PH/G, Substitutions_PR/G, Substitutions_P/G]``.
        ``Year`` and the two career counters are ints; the tendency values
        are the stripped cell texts. Returns ``[]`` when either table is
        missing from the page.

    Raises:
        requests.Timeout: If the server does not answer within 30 seconds.
        AttributeError: If a kept row lacks one of the expected cells.
    """
    def cell_text(row, stat):
        """Return the stripped text of the row's <td> with this data-stat."""
        return row.find('td', {'data-stat': stat}).text.strip()

    response = requests.get(manager_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        stats_rows = soup.find('div', id='all_manager_stats').find('div', 'table_container').find('tbody').find_all('tr')
        tendency_rows = soup.find('div', id='all_manager_tendencies').find('div', 'table_container').find('tbody').find_all('tr')
    except AttributeError:
        # One of the tables (or its container/body) is absent: nothing to report.
        return []

    rows = []
    team_history = {}  # year -> normalized team name of the kept rows so far
    # NOTE(review): rows are matched by position, which assumes both tables
    # list the same seasons in the same order — confirm against the page.
    for stats_row, tendency_row in zip(stats_rows, tendency_rows):
        year_text = cell_text(stats_row, 'year_ID')
        if not year_text or int(year_text) > max_year:
            continue
        year = int(year_text)

        team = normalize_team_name(cell_text(stats_row, 'team_ID'))
        team_history[year] = team

        # Count this season plus the unbroken run of immediately preceding
        # seasons already recorded with the same team.
        team_career = 1
        previous_year = year - 1
        while team_history.get(previous_year) == team:
            team_career += 1
            previous_year -= 1

        # 1-based position of this row among the rows kept for this manager.
        mlb_career = len(rows) + 1

        rows.append([
            year,
            team,
            manager_name,
            team_career,
            mlb_career,
            cell_text(tendency_row, 'steal_2b_rate'),
            cell_text(tendency_row, 'steal_3b_rate'),
            cell_text(tendency_row, 'sac_bunt_rate'),
            cell_text(tendency_row, 'ibb_rate'),
            cell_text(tendency_row, 'pinch_hitters'),
            cell_text(tendency_row, 'pinch_runners'),
            cell_text(tendency_row, 'pitchers_used_per_game'),
        ])
    return rows


In [50]:
def write_to_csv(data: list, path: str = "L:/RA_work/JAY/datasets/final/manager_data.csv") -> None:
    """Write the manager rows to a CSV file, preceded by a header row.

    Args:
        data: Iterable of rows; each row is written as one CSV line after
            the header.
        path: Destination file. Defaults to the location this notebook has
            always written to; pass another path to write elsewhere.

    The file is opened in write mode, so an existing file is overwritten.
    """
    header = ["Year", "Team", "Manager_name", "Manager_team_career", "Manager_MLB_career",
              "steal_2nd", "steal_3rd", "sac_bunts", "intentional_walks", "Substitutions_PH/G",
              "Substitutions_PR/G", "Substitutions_P/G"]
    # newline='' lets the csv module control line endings itself.
    with open(path, mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(data)

In [51]:
# Scrape every manager page and pool the rows into one flat list.
total_data = []
# The loop variable is deliberately not called `url`: that name holds the
# index-page address defined in an earlier cell, and overwriting it here
# would make re-running the get_manager_urls cell fetch a manager page.
for name, manager_url in managers_url.items():
    print(f"Fetching {name}'s data from: {manager_url}")
    total_data.extend(get_managerial_data(name, manager_url))

    # Pause between requests (presumably to stay polite to the site).
    time.sleep(4)


Fetching Manny Acta's data from: https://www.baseball-reference.com/managers/actama99.shtml
Fetching Sandy Alomar's data from: https://www.baseball-reference.com/managers/alomasa02.shtml
Fetching Brad Ausmus's data from: https://www.baseball-reference.com/managers/ausmubr01.shtml
Fetching Dusty Baker's data from: https://www.baseball-reference.com/managers/bakerdu01.shtml
Fetching Rocco Baldelli's data from: https://www.baseball-reference.com/managers/baldero01.shtml
Fetching Jeff Banister's data from: https://www.baseball-reference.com/managers/banisje01.shtml
Fetching Rod Barajas's data from: https://www.baseball-reference.com/managers/barajro01.shtml
Fetching Tony Beasley's data from: https://www.baseball-reference.com/managers/beaslto99.shtml
Fetching Buddy Bell's data from: https://www.baseball-reference.com/managers/bellbu01.shtml
Fetching David Bell's data from: https://www.baseball-reference.com/managers/bellda01.shtml
Fetching Bud Black's data from: https://www.baseball-refere

In [52]:
# Order all rows by their first column (Year). sorted() is stable, so rows
# of the same year keep the order in which they were scraped.
sorted_data = sorted(total_data, key=lambda x: x[0])
# Keep only rows whose year is 2007 or later, then write them to the CSV.
data = [x for x in sorted_data if x[0] >= 2007]
write_to_csv(data)