In [10]:
import re
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [36]:
class ScrapPlayer():
    """Scrape per-game MLB game logs for players from ESPN player pages.

    For every player and every category listed in the player's ``is_a``
    entry, the matching game-log page is downloaded, each table row under
    ``.mb4 .Table__TBODY`` is read, and the row is flattened into a dict of
    strings keyed by stat abbreviation.
    """

    # Stat names in the order their cells are read from a row, starting at
    # cell index 3 (cells 0-2 hold date, opponent and result).
    _BATTER_STATS = (
        'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
        'HBP', 'SO', 'SB', 'CS', 'AVG', 'OBP', 'SLG', 'OPS',
    )
    _PITCHER_STATS = (
        'IP', 'H', 'R', 'ER', 'HR', 'BB', 'K', 'GB',
        'FB', 'P', 'TBF', 'GSC', 'DEC', 'REL', 'ERA',
    )

    def get_url(self, player_id, category_type, year=2022):
        """Build the ESPN game-log URL for one player.

        Args:
            player_id: Player id inserted into the URL path.
            category_type: ``'BATTER'`` selects the batting log; any other
                value selects the pitching log.
            year: Season inserted into the URL path (defaults to 2022).

        Returns:
            The game-log URL as a string.
        """
        category = 'batting' if category_type == 'BATTER' else 'pitching'
        return f'https://www.espn.com/mlb/player/gamelog/_/id/{player_id}/year/{year}/category/{category}'

    def transform(self, player, cols, category_type):
        """Convert the cells of one game-log row into a flat record.

        Args:
            player: Mapping with at least the keys ``'id'`` and ``'name'``.
            cols: Sequence of cell objects exposing a ``.text`` attribute,
                in page order (19 cells for batters, 18 for pitchers).
            category_type: ``'BATTER'`` for the batting layout; any other
                value uses the pitching layout.

        Returns:
            A dict with ``ID``, ``NAME``, ``DATE``, ``OPP``, ``RESULT``
            followed by the category's stat columns; all cell values are
            strings.

        Raises:
            KeyError: If ``player`` lacks ``'id'`` or ``'name'``.
            IndexError: If ``cols`` has fewer cells than the layout needs.
        """
        stat_names = self._BATTER_STATS if category_type == 'BATTER' else self._PITCHER_STATS
        record = {
            'ID': player['id'],
            'NAME': player['name'],
            # Drop whatever precedes the digits/slashes, e.g. 'Tue 4/19' -> '4/19'.
            'DATE': re.sub(r'.+?([\d/]+)', r'\g<1>', cols[0].text),
            # Insert a space after the home/away marker, e.g. 'vsPIT' -> 'vs PIT'.
            'OPP': re.sub(r'(@|vs)(.+)', r'\g<1> \g<2>', cols[1].text),
            # Insert a space after the win/loss letter, e.g. 'W5-2' -> 'W 5-2'.
            'RESULT': re.sub(r'(W|L)(.+)', r'\g<1> \g<2>', cols[2].text),
        }
        for index, stat_name in enumerate(stat_names, start=3):
            record[stat_name] = cols[index].text
        return record

    def run(self, players, sleep_time_in_seconds = 5, year=2022, timeout=30):
        """Download and parse the game logs of every given player.

        A page that cannot be fetched or parsed is reported with ``print``
        and skipped, so one bad page does not discard the records already
        collected.

        Args:
            players: Iterable of mappings with the keys ``'id'``, ``'name'``
                and ``'is_a'`` (an iterable of category strings).
            sleep_time_in_seconds: Pause after each page request.
            year: Season to scrape (defaults to 2022).
            timeout: Seconds to wait for the HTTP response before giving up.

        Returns:
            A ``(pitchers, batters)`` tuple of record lists, in that order.
        """
        pitchers, batters = [], []
        for player in players:
            player_id = player['id']
            for is_a in player['is_a']:
                url = self.get_url(player_id, is_a, year)

                try:
                    # A timeout keeps a stalled connection from hanging the run.
                    response = requests.get(url, timeout=timeout)
                    response.raise_for_status()
                    bs = BeautifulSoup(response.content, features='html.parser')
                    rows = bs.select('.mb4 .Table__TBODY tr')
                    stats = [
                        self.transform(player, row.find_all('td'), is_a)
                        for row in rows
                        # get('class') is None for rows without a class attribute.
                        if 'totals_row' not in (row.get('class') or [])
                    ]
                except Exception as error:
                    # Best effort: report the page and its failure, then move on.
                    print(f'error: {url} ({type(error).__name__}: {error})')
                else:
                    (batters if is_a == 'BATTER' else pitchers).extend(stats)

                time.sleep(sleep_time_in_seconds)

        return pitchers, batters

In [23]:
import json

# Exported league rosters and the fantasy team whose players get scraped.
ROSTER_PATH = '../../data/mlb/roster-853208662.json'
TEAM_NAME = 'Snoring Eeyores'

# Index every team's roster record by its team name. The encoding is given
# explicitly so player names decode the same way on every platform instead
# of depending on the locale default.
with open(ROSTER_PATH, 'r', encoding='utf-8') as roster_json:
    rosters = {
        team_roster['name']: team_roster
        for team_roster in json.load(roster_json)
    }

# Copy of the selected team's player entries.
# NOTE: `all` shadows the built-in of the same name; the name is kept because
# the scraping cell further down reads it. Rename both together.
all = list(rosters[TEAM_NAME]['roster'])

In [37]:
# Trial run: scrape only the first roster entry, then display the pitching
# records as the cell's output.
first_player_only = all[:1]
pitchers, batters = ScrapPlayer().run(first_player_only)
pitchers

[{'ID': 39878,
  'NAME': 'Corbin Burnes',
  'DATE': '4/19',
  'OPP': 'vs PIT',
  'RESULT': 'W 5-2',
  'IP': '7.0',
  'H': '4',
  'R': '2',
  'ER': '2',
  'HR': '2',
  'BB': '0',
  'K': '10',
  'GB': '11',
  'FB': '4',
  'P': '107',
  'TBF': '25',
  'GSC': '71.0',
  'DEC': 'W(1-0)',
  'REL': '-',
  'ERA': '2.37'},
 {'ID': 39878,
  'NAME': 'Corbin Burnes',
  'DATE': '4/13',
  'OPP': '@ BAL',
  'RESULT': 'W 4-2',
  'IP': '7.0',
  'H': '3',
  'R': '0',
  'ER': '0',
  'HR': '0',
  'BB': '1',
  'K': '8',
  'GB': '5',
  'FB': '10',
  'P': '97',
  'TBF': '25',
  'GSC': '78.0',
  'DEC': '-',
  'REL': '-',
  'ERA': '2.25'},
 {'ID': 39878,
  'NAME': 'Corbin Burnes',
  'DATE': '4/7',
  'OPP': '@ CHC',
  'RESULT': 'L 5-4',
  'IP': '5.0',
  'H': '4',
  'R': '3',
  'ER': '3',
  'HR': '1',
  'BB': '3',
  'K': '4',
  'GB': '3',
  'FB': '11',
  'P': '83',
  'TBF': '21',
  'GSC': '48.0',
  'DEC': '-',
  'REL': '-',
  'ERA': '5.40'}]