In [None]:
import hashlib
import sqlite3
import sys
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html

In [None]:
# Fetch the Baseball-Reference page for team=PHI, t=b, year=2020 (the batting
# game log parsed by the following cells).
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting later cells try to parse an error page.
response = requests.get(
    'https://www.baseball-reference.com/teams/tgl.cgi?team=PHI&t=b&year=2020',
    timeout=30,
)
response.raise_for_status()

In [None]:
# Parse the downloaded bytes into a BeautifulSoup tree using the lxml parser.
soup = BeautifulSoup(markup=response.content, features="lxml")

In [None]:
# Take the first table element in the document. find() returns None when the
# page has no table, so fail here with a clear message rather than later
# inside the HTML-to-DataFrame conversion.
table = soup.find('table')
if table is None:
    raise ValueError('No table element found in the downloaded page')

In [None]:
# Limit DataFrame previews to 15 rows.
pd.options.display.max_rows = 15


In [None]:
# Build the cleaned per-game batting frame for one team.
team_code = 'PHI'

# read_html expects a path, URL or file-like object; passing literal HTML text
# is deprecated, so the markup is wrapped in StringIO.
data = pd.read_html(StringIO(str(table)))[0]
data = data.rename(columns={'Unnamed: 3': 'Home/Away', 'Thr': 'Opp Starter Thr', '#': 'Players Used'})

# Rows whose OBP cell is the text 'OBP' are header rows repeated inside the
# table body; keep everything else.
data = data[data['OBP'] != 'OBP'].copy()

# '@' marks an away game; a missing marker is treated as a home game.
# Assigning the result (instead of a chained inplace fillna) guarantees the
# frame itself is updated.
data['Home/Away'] = data['Home/Away'].replace({'@': 'A'}).fillna('H')

# 'Rslt' holds '<result>,<score>-<score>'; the number after '-' is kept as runs against.
data[['Result', 'Runs Against']] = data['Rslt'].str.split(',', expand=True)
data['Runs Against'] = data['Runs Against'].str.split('-').str[1]
data['Date'] = data['Date'].str.slice(stop=6)
data = data.drop(columns=['Rslt', 'Rk', 'Gtm', 'Opp. Starter (GmeSc)'])

# Game key: date + "<opponent><team>" for home games / "<team><opponent>" for
# away games + runs scored + runs against.
game_keys = [
    f"{date}{opp}{team_code}{runs_for}{runs_against}" if side == 'H'
    else f"{date}{team_code}{opp}{runs_for}{runs_against}"
    for date, side, opp, runs_for, runs_against in zip(
        data['Date'], data['Home/Away'], data['Opp'], data['R'], data['Runs Against']
    )
]
# The built-in hash() of a str is salted per interpreter process, so it yields
# different ids after every kernel restart. A SHA-256 digest is stable across
# runs; the modulo 2**31 - 1 keeps the id within the positive 32-bit range.
game_ids = [
    int.from_bytes(hashlib.sha256(key.encode('utf-8')).digest()[:8], 'big') % 2147483647
    for key in game_keys
]
data['game_id'] = game_ids

# team_id is the sum of the character codes of the team abbreviation.
team_ids = [sum(ord(char) for char in team_code)] * len(data.index)
data['team_id'] = team_ids

data = data.set_index('game_id')
data = data[['team_id', 'Date', 'Home/Away', 'Opp', 'Result', 'Runs Against', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF',
             'ROE', 'GDP', 'SB', 'CS', 'LOB', 'Players Used', 'BA', 'OBP', 'SLG', 'OPS', 'Opp Starter Thr']].copy()

# Convert every stat column from 'Runs Against' through 'OPS' to a numeric
# dtype. Assigning column by column replaces each column; writing through
# data.loc[:, ...] = ... sets values into the existing object columns and can
# leave their dtype as object.
for stat_col in data.loc[:, 'Runs Against':'OPS'].columns:
    data[stat_col] = pd.to_numeric(data[stat_col])

# Last expression of the cell: show the frame ordered by game_id.
data.sort_values(by=['game_id'])

In [None]:
# Parse the same response body with lxml so the team summary block can be
# queried with XPath.
tree = html.fromstring(response.content)
# Alternative lookup via the paragraph whose <strong> contains 'Record', kept for reference:
# tree.xpath("//div/div/div/div[contains(@data-template, 'Partials/Teams/Summary')]/p[strong[contains(text(), 'Record')]]/a/text()")[0]
# Text of the <span> elements in the summary heading. This is evaluated but not
# shown, because only the last expression of a cell is displayed.
tree.xpath("//div/div/div/div[contains(@data-template, 'Partials/Teams/Summary')]/h1/span/text()")
# First summary paragraph text containing '-': take its first
# whitespace-separated token and split that token on '-'.
(
    tree.xpath("//div/div/div/div[contains(@data-template, 'Partials/Teams/Summary')]/p/text()[contains(.,'-')]")[0]
    .split()[0]
    .split('-')
)

In [None]:
def insert_batting(conn, data, game_id):
    """Insert one BattingGame row per team (home, then away) for a single game.

    Args:
        conn: open sqlite3 connection; the rows are committed on success.
        data: mapping with 'home' and 'away' entries, each providing
            ['team']['id'] and ['teamStats']['batting'] (a dict of stats).
            The mapping is left unmodified, so the same payload can be
            passed again safely.
        game_id: value stored in the game_id column of both rows.

    A sqlite3.IntegrityError is reported by printing 'Game already added';
    any other exception raised while inserting is handed to
    db_error_cleanup (defined elsewhere).
    """
    columns = (
        'game_id', 'team_id', 'fly_outs', 'ground_outs', 'runs', 'doubles',
        'triples', 'home_runs', 'strike_outs', 'walks', 'intentional_walks',
        'hits', 'hit_by_pitch', 'BA', 'AB', 'OBP', 'SLG', 'OPS',
        'caught_stealing', 'bases_stolen', 'stolen_base_percentage',
        'ground_into_double_play', 'ground_into_triple_play',
        'plate_appearances', 'total_bases', 'RBI', 'LOB', 'sac_bunts',
        'sac_flies', 'catchers_interference', 'pickoffs', 'singles',
    )
    float_stats = ('avg', 'obp', 'slg', 'ops', 'stolenBasePercentage')

    batting_query = []
    for team in ('home', 'away'):
        # Work on a copy: converting values in the caller's dict would make a
        # second call on the same payload fail on float(None).
        batting_data = dict(data[team]['teamStats']['batting'])
        singles = batting_data['hits'] - (
            batting_data['doubles'] + batting_data['triples'] + batting_data['homeRuns']
        )
        for stat in float_stats:
            # A value of '.---' is stored as NULL (None) instead of being parsed.
            batting_data[stat] = float(batting_data[stat]) if batting_data[stat] != '.---' else None
        # Values are bound positionally in the payload's key order; the final
        # entry has no column in BattingGame and is dropped.
        # NOTE(review): this relies on the payload's key order matching
        # `columns` -- confirm against the data source.
        stat_values = list(batting_data.values())[:-1]
        batting_query.append((game_id, data[team]['team']['id'], *stat_values, singles))

    # Insert into Batting table for game. The placeholder list is derived from
    # the column list so the two cannot get out of sync.
    batting_box_score = 'INSERT INTO BattingGame ({}) VALUES ({})'.format(
        ', '.join(columns), ','.join('?' * len(columns))
    )
    try:
        conn.cursor().executemany(batting_box_score, batting_query)
        conn.commit()
    except sqlite3.IntegrityError:
        print('Game already added')
    except Exception as e:
        db_error_cleanup(conn, e)


def insert_pitching(conn, data, game_id):
    """Insert one PitchingGame row per team (home, then away) for a single game.

    Args:
        conn: open sqlite3 connection; the rows are committed on success.
        data: mapping with 'home' and 'away' entries, each providing
            ['team']['id'] and ['teamStats']['pitching'] (a dict of stats).
            The mapping is left unmodified, so the same payload can be
            passed again safely.
        game_id: value stored in the game_id column of both rows.

    A sqlite3.IntegrityError is reported by printing 'Game already added';
    any other exception raised while inserting is handed to
    db_error_cleanup (defined elsewhere).
    """
    columns = (
        'game_id', 'team_id', 'ground_outs', 'air_outs', 'runs', 'doubles',
        'triples', 'home_runs', 'strike_outs', 'walks', 'intentional_walks',
        'hits', 'hit_by_pitch', 'AB', 'OBP', 'caught_stealing', 'stolen_bases',
        'stolen_base_percentage', 'ERA', 'IP', 'save_oppurtunities',
        'earned_runs', 'WHIP', 'batter_faced', 'outs', 'complete_games',
        'shutouts', 'balks', 'wild_pitches', 'pickoffs', 'RBI',
        'inherited_runners', 'inherited_runners_scored',
        'catchers_interference', 'sac_bunts', 'sac_flies', 'singles',
    )
    float_stats = ('obp', 'era', 'inningsPitched', 'stolenBasePercentage', 'whip')
    # Stats present in the payload that have no column in PitchingGame.
    unused_stats = ('hitBatsmen', 'groundOutsToAirouts', 'runsScoredPer9', 'homeRunsPer9')

    pitching_query = []
    for team in ('home', 'away'):
        # Work on a copy: converting and popping entries in the caller's dict
        # would make a second call on the same payload fail.
        pitching_data = dict(data[team]['teamStats']['pitching'])
        singles = pitching_data['hits'] - (
            pitching_data['doubles'] + pitching_data['triples'] + pitching_data['homeRuns']
        )
        for stat in float_stats:
            # A value of '.---' is stored as NULL (None) instead of being parsed.
            pitching_data[stat] = float(pitching_data[stat]) if pitching_data[stat] != '.---' else None
        for stat in unused_stats:
            pitching_data.pop(stat)
        # Remaining values are bound positionally in the payload's key order.
        # NOTE(review): this relies on the payload's key order matching
        # `columns` -- confirm against the data source.
        stat_values = list(pitching_data.values())
        pitching_query.append((game_id, data[team]['team']['id'], *stat_values, singles))

    # Insert into Pitching table for game. The placeholder list is derived
    # from the column list so the two cannot get out of sync.
    pitching_box_score = 'INSERT INTO PitchingGame ({}) VALUES ({})'.format(
        ', '.join(columns), ','.join('?' * len(columns))
    )
    try:
        conn.cursor().executemany(pitching_box_score, pitching_query)
        conn.commit()
    except sqlite3.IntegrityError:
        print('Game already added')
    except Exception as e:
        db_error_cleanup(conn, e)