In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd

In [2]:
url = "https://www.baseball-reference.com/managers/"
response = requests.get(url)
print(response)
print(response.headers.get("Retry-After"))

<Response [200]>
None


In [3]:
headers = ["Year", "Team", "Manager_Fired", "First_Manager", "Second_Manager", "Third_Manager"]

In [4]:
team_name_mapping = {
    'Anaheim Angels': 'Los Angeles Angels',
    'Los Angeles Angels of Anaheim': 'Los Angeles Angels',
    'Florida Marlins': 'Miami Marlins',
    'Tampa Bay Devil Rays': 'Tampa Bay Rays',
    'Montreal Expos': 'Washington Nationals',
}

urls = ["https://www.baseball-reference.com/leagues/majors/" + str(i) + ".shtml" for i in range(2007, 2024)]
years = list(range(2007, 2024))
year_urls = {k: v for k, v in zip(years, urls)}
def normalize_team_name(team_name):
    return team_name_mapping.get(team_name, team_name)

In [5]:
def get_team_urls(url) -> dict:
    """Gets the urls of all the managers given the team url."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table_info = soup.find('div', 'table_container').find('table', 'sortable').find('tbody').find_all('tr')
    del table_info[-1]
    for row in table_info:
        team = row.find('th', {'data-stat': 'team_name'}).find('a')
        url_segment = team['href']
        if url_segment:
            yield (team.text.strip(), "https://www.baseball-reference.com" + url_segment)
    


In [6]:
all_teams = {year: get_team_urls(year_urls[year]) for year in years}

In [7]:
def get_data(year, team, url) -> list:
    """Takes in one url, outputs a list of the data for that year."""
    response = requests.get(url)
    print(team)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    ps = soup.find('div', {'data-template': 'Partials/Teams/Summary'}).find_all('p')
    for i, j in enumerate(ps):
        
        if "Manager:" in j.text.strip():
    
            parts = j.text.strip().split()
            if "Manager" in parts[0]:
                stop_point = 0
                for index, thing in enumerate(parts):
                    if "General" in thing:
                        stop_point = index
                        break
                managers = parts[1:stop_point]

                
                if 'and' in managers:
                    index = managers.index('and')
                    if index > 3:
                        
                        half = managers[:index]
                        first = ' '.join(half[:3])
                        second = ' '.join(half[-3:])
                        
                        res = [year, normalize_team_name(team), True, first, second, ' '.join(managers[index + 1:index + 4])]
                    else:
                        res =  [year, normalize_team_name(team), True, ' '.join(managers[:3]), ' '.join(managers[4:7]), ' ']
                else:
                    res =  [year, normalize_team_name(team), False, ' '.join(managers[0:3]), ' ', ' ']
    print(res)           
    return res

In [8]:
def generate_data(year, gen) -> list:
    """Takes in a generator for each years' teams data, outputs the header."""
    res = []
    print(year)
    try:
        while True:
            time.sleep(4)
            value = next(gen)
            res.append(get_data(year, value[0], value[1]))
    except StopIteration:
        pass
    return res

In [9]:
total = []
for year, value in all_teams.items():
    temp = generate_data(year, value)
    total.extend(temp)
put_into_csv(total)


2007
Arizona Diamondbacks
[2007, 'Arizona Diamondbacks', False, 'Bob Melvin (90-72)', ' ', ' ']
Atlanta Braves
[2007, 'Atlanta Braves', False, 'Bobby Cox (84-78)', ' ', ' ']
Baltimore Orioles
[2007, 'Baltimore Orioles', True, 'Sam Perlozzo (29-40)', 'Dave Trembley (40-53)', ' ']
Boston Red Sox
[2007, 'Boston Red Sox', False, 'Terry Francona (96-66)', ' ', ' ']
Chicago Cubs
[2007, 'Chicago Cubs', False, 'Lou Piniella (85-77)', ' ', ' ']
Chicago White Sox
[2007, 'Chicago White Sox', False, 'Ozzie Guillén (72-90)', ' ', ' ']
Cincinnati Reds
[2007, 'Cincinnati Reds', True, 'Jerry Narron (31-51)', 'Pete Mackanin (41-39)', ' ']
Cleveland Indians
[2007, 'Cleveland Indians', False, 'Eric Wedge (96-66)', ' ', ' ']
Colorado Rockies
[2007, 'Colorado Rockies', False, 'Clint Hurdle (90-73)', ' ', ' ']
Detroit Tigers
[2007, 'Detroit Tigers', False, 'Jim Leyland (88-74)', ' ', ' ']
Florida Marlins
[2007, 'Miami Marlins', False, 'Fredi González (71-91)', ' ', ' ']
Houston Astros
[2007, 'Houston Astros

NameError: name 'put_into_csv' is not defined

In [13]:
put_into_csv(total)

In [12]:
def put_into_csv(total: list):
    """Takes in all the data as a list, turns into dataframe and puts into csv."""
    df = pd.DataFrame(total, columns=headers)
    df_sorted = df.sort_values(by=["Team", "Year"])
    with open("L:/RA_work/JAY/datasets/final/dismissal_data.csv", mode='w', newline='') as f:
        df_sorted.to_csv("L:/RA_work/JAY/datasets/final/dismissal_data.csv", index=False)