In [None]:
!pip install duckdb pandas

In [1]:
# Duckdb is a SQL engine that allows us to execute powerful, analytics-friendly
# queries against local or remote databases and flat files.
import duckdb
import pandas as pd

# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Create a database file on disk
conn = duckdb.connect('example.db')
# Enable remote access
conn.sql("INSTALL httpfs")
conn.sql("LOAD httpfs")
# This database file points to files totaling multiple GBs,
# but it's only about 300KB itself. The `ATTACH` command
# gives us access to views that sit on top of remote Parquet files.
try:
  conn.sql("ATTACH 'https://data.baseball.computer/dbt/bc_remote.db' (READ_ONLY)")
except duckdb.BinderException:
  # This command will fail if you run it more than once because it already exists,
  # in which case we don't need to do anything
  pass

conn.sql("USE bc_remote")
conn.sql("USE main_models")

Potential A/B Testing ideas: Offensive and pitching stats several years prior and post expansion
    Offensive and Defensive stats post DH expansion
    Offensive stats post pitch clock
    
Explore active player's career homerun totals and what percentage of them were on their current teams. 

In [None]:
df: pd.DataFrame = conn.sql("""WITH players AS 
                            (SELECT DISTINCT
                                r.player_id, 
                                r.last_name, 
                                r.first_name 
                            FROM misc.roster r  
                            WHERE r.year = 2023 
                            ), 
                            offStats AS ( 
                            SELECT DISTINCT
                                p.player_id, 
                                p.last_name, 
                                p.first_name, 
                                --pgo.game_id,
                                sum(ebs.hits) AS hits,
                                sum(ebs.home_runs) AS HRs
                            FROM event_batting_stats ebs 
                            left join players p on ebs.batter_id = p.player_id 
                            left join stg_gamelog gl on ebs.game_id = gl.game_id
                            where gl.season = 2023
                            GROUP BY p.player_id, p.last_name, p.first_name
                            )
                            SELECT * FROM offStats;""").df()
    
df.sort_values('hits', ascending = False)

In [None]:
df.groupby('player_id').sum('home_runs')

In [None]:
df: pd.DataFrame = conn.sql("""SELECT 
                                pgo.*
                                --(pgo.hits) AS totalHits, 
                                --sum(pgo.home_runs) AS totalHRs 
                            FROM event_batting_stats pgo 
                            left join misc.roster r on pgo.batter_id = r.player_id 
                            left join stg_gamelog gl on pgo.game_id = gl.game_id
                            where pgo.batter_id = 'sancg002' AND gl.season = 2023 AND pgo.game_id = 'SDN202308180';""").df()

df

In [None]:
df: pd.DataFrame = conn.sql("""SELECT DISTINCT player_id, game_id, COUNT(*)
                                FROM player_game_offense_stats
                                GROUP BY player_id, game_id
                                HAVING COUNT(*) > 1""").df()

df

In [None]:
df: pd.DataFrame = conn.sql("""SELECT DISTINCT player_id, first_name, last_name, COUNT(*)
                                FROM misc.roster
                                where year = 2023 
                                GROUP BY player_id, first_name, last_name
                                HAVING COUNT(*) > 1
                                """).df()
df

In [None]:
df: pd.DataFrame = conn.sql("""SELECT *
                                FROM misc.roster
                                where year = 2023 and last_name = 'Alonso'
                                """).df()
df

For code below
1. SUM(CASE WHEN sequence_item = 'Ball' THEN 1 ELSE 0 END) OVER (...):

    This is the window function that calculates the cumulative sum of occurrences of 'Ball' for each sequence_id.
    It does this within the partition of game_id and event_key and orders the data by sequence_id.
    The result is an incremental count of 'Ball' values as it processes each row, creating the behavior you described (e.g., 1, 2, 3, 3, 4).

2. PARTITION BY game_id, event_key: Ensures that the counting restarts for each unique combination of game_id and event_key.

3. ORDER BY sequence_id: Ensures that the Ball_count increments in the correct order based on sequence_id.

4. GROUP BY: We use GROUP BY to include other counts (for SwingStrike and CalledStrike) while still showing the incremental Ball_count for each sequence_id.

In [117]:
df: pd.DataFrame = conn.sql("""WITH PitchCount AS (select
                                                        eps.game_id,
                                                        eps.event_key,
                                                        eps.sequence_id,
                                                        eps.sequence_item,
                                                        SUM(CASE WHEN sequence_item IN ('AutomaticBall',
                                                                                        'Ball',
                                                                                        'Pitchout',
                                                                                        'IntentionalBall') THEN 1 ELSE 0 END)
                                                            OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id) AS BallCount,
                                                        SUM(CASE WHEN sequence_item IN ('CalledStrike', 
                                                                                        'MissedBunt', 
                                                                                        'StrikeUnknownType', 
                                                                                        'SwingingOnPitchout', 
                                                                                        'SwingingStrike') THEN 1 
                                                            ELSE 0 
                                                        END)
                                                            OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id) AS StrikeCount
                                                    FROM event.event_pitch_sequences eps
                                                    where 
                                                        eps.game_id = 'SDN202308180'
                                                        --AND event_key = '654600812'
                                                        
                                    )
                                    select pc.game_id, 
                                        ev.batter_id,
                                        ev.pitcher_id,
                                        ev.batting_side,
                                        ev.batting_team_id,
                                        ev.fielding_team_id,
                                        pc.event_key, 
                                        pc.sequence_id, 
                                        pc.sequence_item,
                                        pc.BallCount,
                                        CASE 
                                            -- For strikes, keep the same StrikeCount
                                            WHEN sequence_item IN ('CalledStrike', 'SwingingStrike', 'StrikeUnknownType') THEN 
                                                -- This ensures that StrikeCount remains the same for strikes
                                                LAST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id)
                                            
                                            -- For balls, keep StrikeCount unchanged
                                            WHEN sequence_item IN ('Ball', 'Pitchout', 'IntentionalBall') THEN 
                                                FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id)
                                            
                                            -- For fouls, increment StrikeCount if it's less than 2
                                            WHEN sequence_item IN ('Foul', 'FoulBunt', 'FoulOnPitchout', 'FoulTip', 'FoulTipBunt') 
                                                 AND FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) < 2 THEN 
                                                FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) + 1
                                    
                                            -- Once StrikeCount is 2, don't increment it for foul balls that are not tipped
                                            WHEN sequence_item IN ('Foul', 'FoulOnPitchout') 
                                                 AND FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) >= 2 THEN 
                                                FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id)

                                            -- Once StrikeCount is 2, increment it for foul balls that are tipped
                                            WHEN sequence_item IN ('FoulBunt', 'FoulTip', 'FoulTipBunt') 
                                                 AND FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) >= 2 THEN 
                                                FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) + 1
                                    
                                            -- Default case, just keep the same StrikeCount
                                            ELSE FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id)
                                        END AS StrikeCount,
                                        COUNT(CASE WHEN sequence_item IN ('AutomaticBall',
                                                                            'Ball',
                                                                            'Pitchout',
                                                                            'IntentionalBall') THEN 1 END) AS Ball, 
                                        count(CASE WHEN sequence_item = 'SwingStrike' THEN 1 END) AS swingStrike, 
                                        count(CASE WHEN sequence_item = 'CalledStrike' THEN 1 END) AS CalledStrike,
                                        count(CASE WHEN sequence_item IN ('CalledStrike', 
                                                                            'MissedBunt', 
                                                                            'StrikeUnknownType', 
                                                                            'SwingingOnPitchout', 
                                                                            'SwingingStrike',
                                                                            'Foul',
                                                                            'FoulBunt', 
                                                                            'FoulOnPitchout', 
                                                                            'FoulTip', 
                                                                            'FoulTipBunt') 
                                                          THEN 1 END) AS Strike,
                                        ev.outs,
                                        ev.base_state,
                                        ev.outs_on_play,
                                        ev.runs_on_play,
                                        ev.plate_appearance_result,
                                        ev.batted_trajectory,
                    from PitchCount pc
                    LEFT JOIN event.EVENTS EV ON pc.game_id = ev.game_id 
                        AND pc.event_key = ev.event_key
                    where pc.game_id = 'SDN202308180' AND StrikeCount >= 2
                    GROUP BY 
                        pc.game_id, pc.event_key, pc.sequence_id, pc.sequence_item, pc.BallCount, StrikeCount, ev.batter_id, ev.pitcher_id, ev.batting_side,
                        ev.batting_team_id, ev.fielding_team_id, ev.outs, ev.base_state, ev.outs_on_play, ev.runs_on_play, ev.plate_appearance_result, ev.batted_trajectory,
                    order by pc.event_key, pc.sequence_id
                            """).df()
df


# Pitch Sequence Values not used yet
#'HitBatter', 
#'InPlay', 
#'InPlayOnPitchout', 

#'NoPitch', 
#'PickoffAttemptFirst', 
#'PickoffAttemptSecond', 
#'PickoffAttemptThird', 




Unnamed: 0,game_id,batter_id,pitcher_id,batting_side,batting_team_id,fielding_team_id,event_key,sequence_id,sequence_item,BallCount,StrikeCount,Ball,swingStrike,CalledStrike,Strike,outs,base_state,outs_on_play,runs_on_play,plate_appearance_result,batted_trajectory
0,SDN202308180,phamt001,lugos001,Away,ARI,SDN,654600813,3,SwingingStrike,1.0,2.0,0,0,0,1,1,1,1,0,StrikeOut,
1,SDN202308180,phamt001,lugos001,Away,ARI,SDN,654600813,4,Ball,2.0,2.0,1,0,0,0,1,1,1,0,StrikeOut,
2,SDN202308180,phamt001,lugos001,Away,ARI,SDN,654600813,5,Ball,3.0,2.0,1,0,0,0,1,1,1,0,StrikeOut,
3,SDN202308180,phamt001,lugos001,Away,ARI,SDN,654600813,6,CalledStrike,3.0,3.0,0,0,1,1,1,1,1,0,StrikeOut,
4,SDN202308180,tatif002,pfaab001,Home,SDN,ARI,654600816,7,SwingingStrike,3.0,2.0,0,0,0,1,0,1,2,0,StrikeOut,
5,SDN202308180,sotoj001,pfaab001,Home,SDN,ARI,654600817,4,CalledStrike,2.0,2.0,0,0,1,1,2,0,1,0,InPlayOut,Fly
6,SDN202308180,sotoj001,pfaab001,Home,SDN,ARI,654600817,5,Foul,2.0,2.0,0,0,0,1,2,0,1,0,InPlayOut,Fly
7,SDN202308180,sotoj001,pfaab001,Home,SDN,ARI,654600817,6,Foul,2.0,2.0,0,0,0,1,2,0,1,0,InPlayOut,Fly
8,SDN202308180,sotoj001,pfaab001,Home,SDN,ARI,654600817,7,Foul,2.0,2.0,0,0,0,1,2,0,1,0,InPlayOut,Fly
9,SDN202308180,sotoj001,pfaab001,Home,SDN,ARI,654600817,8,Ball,3.0,2.0,1,0,0,0,2,0,1,0,InPlayOut,Fly


In [67]:
df: pd.DataFrame = conn.sql("""SELECT * FROM event.event_pitch_sequences where event_key = '654600842' """).df()

df

Unnamed: 0,game_id,event_id,event_key,sequence_id,sequence_item,runners_going_flag,blocked_by_catcher_flag,catcher_pickoff_attempt_at_base
0,SDN202308180,32,654600842,1,CalledStrike,False,False,
1,SDN202308180,32,654600842,2,CalledStrike,False,False,
2,SDN202308180,32,654600842,3,InPlay,False,False,


In [None]:
conn.close()