In [1]:
import re
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

from abc import ABC, abstractmethod

class ScrapPlayer(ABC):
    """Base scraper for the game-log table on an ESPN MLB player page.

    ``run`` downloads each player's page, selects the rows of the
    ``.gamelogWidget--baseball`` table and hands every row's ``<td>`` cells to
    ``transform``, which subclasses implement to build one record per row.
    """

    def __init__(self, debug=False):
        """
        Args:
            debug: When true, progress messages are printed to stdout.
        """
        self._debug = debug

    def run(self, players, sleep_time_in_seconds = 5, timeout_in_seconds = 30):
        """Scrape every player and collect the transformed rows.

        Args:
            players: Iterable of ``(id, name)`` pairs; ``id`` is interpolated
                into the ESPN player URL.
            sleep_time_in_seconds: Pause between two consecutive requests.
                No pause is made after the last player.
            timeout_in_seconds: Timeout handed to ``requests.get``; ``None``
                waits indefinitely.

        Returns:
            A ``pandas.DataFrame`` with one row per scraped table row.

        Raises:
            requests.HTTPError: If a page answers with a status other than 200.
        """
        records = []
        for index, player in enumerate(players):
            # Pause *between* requests only, so the run does not end with a
            # pointless sleep after the final player.
            if index > 0:
                self.print()
                self.print(f'--- sleeping {sleep_time_in_seconds} seconds ---')
                self.print()

                time.sleep(sleep_time_in_seconds)

            player_id, name = player

            self.print(f'* fetching recent stats for {name}.')

            rows = self._fetch_rows(player_id, timeout_in_seconds)
            self.print(f'* found #{len(rows)} records')

            records.extend(
                self.transform(player, row.find_all('td'))
                for row
                in rows
            )

        self.print(f'--- complete w/ #{len(records)} ---')
        self.print()

        return pd.DataFrame(records)

    def _fetch_rows(self, player_id, timeout_in_seconds):
        """Download one player page and return its game-log ``<tr>`` elements."""
        response = requests.get(
            f'https://www.espn.com/mlb/player/_/id/{player_id}',
            timeout=timeout_in_seconds
        )

        # Explicit check instead of `assert`: assertions are removed when
        # Python runs with -O, which would let an error page be parsed silently.
        if response.status_code != 200:
            raise requests.HTTPError(
                f'unexpected HTTP status {response.status_code} '
                f'for player id {player_id}',
                response=response
            )

        bs = BeautifulSoup(
            response.content,
            features='html.parser'
        )

        return bs.select('.gamelogWidget--baseball table tbody tr')

    @abstractmethod
    def transform(self, player, cols):
        """Build one record from a table row.

        Args:
            player: The ``(id, name)`` pair the row belongs to.
            cols: The row's ``<td>`` elements, in document order.

        Returns:
            A dict that becomes one row of the resulting DataFrame.
        """
        pass

    def print(self, message = ''):
        """Print ``message`` only when the scraper was created with ``debug=True``."""
        if self._debug:
            print(message)

class Batter(ScrapPlayer):
    """Scraper that maps each game-log row onto a batting record."""

    def transform(self, player, cols):
        """Turn one game-log row into a batting record.

        Args:
            player: ``(id, name)`` pair of the player being scraped.
            cols: The row's ``<td>`` elements; indexes 0-11 are read.

        Returns:
            Dict with ID, NAME, DATE, OPP, RESULT and the batting columns,
            every scraped value kept as the cell's text.
        """
        player_id, name = player
        # The replacement templates are raw strings: '\g' inside a normal
        # string literal is an invalid escape sequence (warning today, error in
        # a future Python). The runtime value is unchanged.
        return {
            'ID': player_id,
            'NAME': name,
            # Keep only the digits/slashes part, dropping what precedes it
            # (presumably a weekday prefix such as 'Sun 5/23' — confirm).
            'DATE': re.sub(r'.+?([\d/]+)', r'\g<1>', cols[0].text),
            # Insert a space after the '@' / 'vs' marker.
            'OPP': re.sub(r'(@|vs)(.+)', r'\g<1> \g<2>', cols[1].text),
            # Insert a space after the 'W' / 'L' marker.
            'RESULT': re.sub(r'(W|L)(.+)', r'\g<1> \g<2>', cols[2].text),
            'AB': cols[3].text,
            'R': cols[4].text,
            'H': cols[5].text,
            '2B': cols[6].text,
            '3B': cols[7].text,
            'HR': cols[8].text,
            'RBI': cols[9].text,
            'BB': cols[10].text,
            'SO': cols[11].text,
        }

class Pitcher(ScrapPlayer):
    """Scraper that maps each game-log row onto a pitching record."""

    def transform(self, player, cols):
        """Turn one game-log row into a pitching record.

        Args:
            player: ``(id, name)`` pair of the player being scraped.
            cols: The row's ``<td>`` elements; indexes 0-12 are read.

        Returns:
            Dict with ID, NAME, DATE, OPP, RESULT and the pitching columns,
            every scraped value kept as the cell's text.
        """
        player_id, name = player
        # The replacement templates are raw strings: '\g' inside a normal
        # string literal is an invalid escape sequence (warning today, error in
        # a future Python). The runtime value is unchanged.
        return {
            'ID': player_id,
            'NAME': name,
            # Keep only the digits/slashes part, dropping what precedes it
            # (presumably a weekday prefix such as 'Sun 5/23' — confirm).
            'DATE': re.sub(r'.+?([\d/]+)', r'\g<1>', cols[0].text),
            # Insert a space after the '@' / 'vs' marker.
            'OPP': re.sub(r'(@|vs)(.+)', r'\g<1> \g<2>', cols[1].text),
            # Insert a space after the 'W' / 'L' marker.
            'RESULT': re.sub(r'(W|L)(.+)', r'\g<1> \g<2>', cols[2].text),
            'GS': cols[3].text,
            'CG': cols[4].text,
            'SHO': cols[5].text,
            # NOTE(review): '1P' looks like a typo for 'IP' (innings pitched).
            # Left unchanged because the key becomes a column name in CSVs
            # that may already exist — confirm and migrate the data together.
            '1P': cols[6].text,
            'H': cols[7].text,
            'R': cols[8].text,
            'ER': cols[9].text,
            'HR': cols[10].text,
            'BB': cols[11].text,
            'K': cols[12].text,
        }

In [2]:
# Players to scrape, grouped by role. Every entry is an (id, name) pair; the
# id is kept as a string and is interpolated into the ESPN player URL.
roster = dict(
    batters=[
        ('30950', 'Yasmani Grandal'),
        ('32767', 'Matt Olson'),
        ('31037', 'Jean Segura'),
        ('34886', 'Alex Bregman'),
        ('33675', 'Willy Adames'),
        ('35156', 'David Fletcher'),
        ('32098', 'Anthony Rendon'),
        ('38980', 'Bryan Reynolds'),
        ('33711', 'Michael Conforto'),
        ('34986', 'Andrew Benintendi'),
        ('32818', 'Joey Gallo'),
        ('31399', 'Randal Grichuk'),
        ('32177', 'J.T. REALMUTO'),
    ],
    pitchers=[
        ('39878', 'Corbin Burnes'),
        ('29155', 'Charlie Morton'),
        ('34973', 'Tyler Mahle'),
        ('36050', 'Giovanny Gallegos'),
        ('41221', 'Logan Gilbert'),
        ('31313', 'Patrick Corbin'),
        ('5883', 'Zack Greinke'),
        ('41261', 'Cristian Javier'),
        ('38303', 'David Bednar'),
        ('39251', 'Walker Buehler'),
        ('33249', 'Frankie Montas'),
        ('31053', 'Kyle Gibson'),
        ('32764', 'Lance McCullers Jr.'),
        ('35124', 'Luis Castillo'),
        ('32796', 'Jacob deGrom'),
        ('30373', 'Stephen Strasburg'),
    ],
)

In [3]:
# CSV cache that accumulates batter game logs across notebook runs.
batters_file_path = '../../data/mlb/batters.csv'
# Scrape the game log of every batter on the roster (network access).
df_batters = Batter().run(players=roster['batters'])

In [4]:
# CSV cache that accumulates pitcher game logs across notebook runs.
pitchers_file_path = '../../data/mlb/pitchers.csv'
# Scrape the game log of every pitcher on the roster (network access).
df_pitchers = Pitcher().run(players=roster['pitchers'])

In [5]:
def save(df, cache_file_path):
    """Append newly scraped rows to the CSV cache at ``cache_file_path``.

    A scraped row counts as new when its ``(ID, DATE)`` pair is not yet in the
    cache. When the cache file does not exist it is created from ``df``.

    Args:
        df: Freshly scraped records; must contain ``ID`` and ``DATE`` columns.
        cache_file_path: Path of the CSV file used as the cache.

    Returns:
        None. The cache file is written as a side effect.
    """
    key_columns = ['ID', 'DATE']

    def merge(df_gospel, df_scrapped):
        """Return the cached rows followed by the scraped rows not yet cached."""
        # Compare keys as strings on both sides: a default CSV read turns ID
        # into int64 while scraped IDs are strings, and pandas refuses to merge
        # object with int64 keys.
        df_gospel_slim = df_gospel[key_columns].astype(str).drop_duplicates()
        df_scrapped_new = df_scrapped \
            .astype({column: str for column in key_columns}) \
            .merge(df_gospel_slim, on=key_columns, indicator='i', how='left') \
            .query('i == "left_only"') \
            .drop(['i'], axis=1)

        return pd.concat([df_gospel, df_scrapped_new], axis=0)

    # Nothing scraped: leave the cache alone. Writing an empty frame would
    # create a CSV that cannot be read back, and merging a frame without
    # columns fails.
    if df.empty:
        return

    if not os.path.exists(cache_file_path):
        df.to_csv(cache_file_path, index=False)
        return

    # Read every column as text so cached values keep the form they were
    # scraped in (IDs stay strings, '6.0' stays '6.0').
    df_gospel = pd.read_csv(cache_file_path, dtype=str)
    merge(df_gospel, df).to_csv(cache_file_path, index=False)

# Persist both scraped frames into their respective CSV caches.
save(df=df_batters, cache_file_path=batters_file_path)
save(df=df_pitchers, cache_file_path=pitchers_file_path)