In [None]:
!pip install duckdb pandas

In [1]:
# Duckdb is a SQL engine that allows us to execute powerful, analytics-friendly
# queries against local or remote databases and flat files.
import duckdb
import pandas as pd

# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Create a database file on disk
conn = duckdb.connect('example.db')
# Enable remote access
conn.sql("INSTALL httpfs")
conn.sql("LOAD httpfs")
# This database file points to files totaling multiple GBs,
# but it's only about 300KB itself. The `ATTACH` command
# gives us access to views that sit on top of remote Parquet files.
try:
  conn.sql("ATTACH 'https://data.baseball.computer/dbt/bc_remote.db' (READ_ONLY)")
except duckdb.BinderException:
  # This command will fail if you run it more than once because it already exists,
  # in which case we don't need to do anything
  pass

conn.sql("USE bc_remote")
conn.sql("USE main_models")

Potential A/B Testing ideas: Offensive and pitching stats several years prior and post expansion
    Offensive and Defensive stats post DH expansion
    Offensive stats post pitch clock
    
Explore active player's career homerun totals and what percentage of them were on their current teams. 

In [5]:
df: pd.DataFrame = conn.sql("""WITH players AS 
                            (SELECT DISTINCT
                                r.player_id, 
                                r.last_name, 
                                r.first_name 
                            FROM misc.roster r  
                            WHERE r.year = 2023 
                            ), 
                            offStats AS ( 
                            SELECT DISTINCT
                                p.player_id, 
                                p.last_name, 
                                p.first_name, 
                                --pgo.game_id,
                                sum(ebs.hits) AS hits,
                                sum(ebs.home_runs) AS HRs
                            FROM event_batting_stats ebs 
                            left join players p on ebs.batter_id = p.player_id 
                            left join stg_gamelog gl on ebs.game_id = gl.game_id
                            where gl.season = 2023
                            GROUP BY p.player_id, p.last_name, p.first_name
                            )
                            SELECT * FROM offStats;""").df()
    
df.sort_values('hits', ascending = False)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,player_id,last_name,first_name,hits,HRs
116,acunr001,Acuna,Ronald,219.0,41.0
51,freef001,Freeman,Freddie,212.0,29.0
27,arral001,Arraez,Luis,206.0,10.0
222,semim001,Semien,Marcus,202.0,31.0
28,turnt001,Turner,Trea,187.0,29.0
...,...,...,...,...,...
307,crowp001,Crow-Armstrong,Pete,0.0,0.0
624,mills001,Miller,Shelby,0.0,0.0
232,fulmm001,Fulmer,Michael,0.0,0.0
220,naqut001,Naquin,Tyler,0.0,0.0


In [None]:
df.groupby('player_id').sum('home_runs')

In [None]:
df: pd.DataFrame = conn.sql("""SELECT 
                                pgo.*
                                --(pgo.hits) AS totalHits, 
                                --sum(pgo.home_runs) AS totalHRs 
                            FROM event_batting_stats pgo 
                            left join misc.roster r on pgo.batter_id = r.player_id 
                            left join stg_gamelog gl on pgo.game_id = gl.game_id
                            where pgo.batter_id = 'sancg002' AND gl.season = 2023 AND pgo.game_id = 'SDN202308180';""").df()

df

In [None]:
df: pd.DataFrame = conn.sql("""SELECT DISTINCT player_id, game_id, COUNT(*)
                                FROM player_game_offense_stats
                                GROUP BY player_id, game_id
                                HAVING COUNT(*) > 1""").df()

df

In [None]:
df: pd.DataFrame = conn.sql("""SELECT DISTINCT player_id, first_name, last_name, COUNT(*)
                                FROM misc.roster
                                where year = 2023 
                                GROUP BY player_id, first_name, last_name
                                HAVING COUNT(*) > 1
                                """).df()
df

In [None]:
df: pd.DataFrame = conn.sql("""SELECT *
                                FROM misc.roster
                                where year = 2023 and last_name = 'Alonso'
                                """).df()
df

For code below
1. SUM(CASE WHEN sequence_item = 'Ball' THEN 1 ELSE 0 END) OVER (...):

    This is the window function that calculates the cumulative sum of occurrences of 'Ball' for each sequence_id.
    It does this within the partition of game_id and event_key and orders the data by sequence_id.
    The result is an incremental count of 'Ball' values as it processes each row, creating the behavior you described (e.g., 1, 2, 3, 3, 4).

2. PARTITION BY game_id, event_key: Ensures that the counting restarts for each unique combination of game_id and event_key.

3. ORDER BY sequence_id: Ensures that the Ball_count increments in the correct order based on sequence_id.

4. GROUP BY: We use GROUP BY to include other counts (for SwingStrike and CalledStrike) while still showing the incremental Ball_count for each sequence_id.

In [7]:
df: pd.DataFrame = conn.sql("""WITH PitchCount AS (SELECT
                                    eps.game_id,
                                    eps.event_key,
                                    eps.sequence_id,
                                    eps.sequence_item,
                                    SUM(CASE WHEN sequence_item IN ('AutomaticBall',
                                                                    'Ball',
                                                                    'Pitchout',
                                                                    'IntentionalBall') THEN 1 ELSE 0 
                                        END) OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id
                                                    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS BallCount,
                                    SUM(CASE WHEN sequence_item IN ('CalledStrike', 
                                                                    'MissedBunt', 
                                                                    'StrikeUnknownType', 
                                                                    'SwingingOnPitchout', 
                                                                    'SwingingStrike') THEN 1 ELSE 0 
                                    END) OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id) AS StrikeCount,
                                    SUM(CASE WHEN sequence_item IN ('CalledStrike', 
                                                                    'MissedBunt', 
                                                                    'StrikeUnknownType', 
                                                                    'SwingingOnPitchout', 
                                                                    'SwingingStrike') THEN 1 ELSE 0 
                                    END) OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id
                                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS StrikeCount2
                                    FROM event.event_pitch_sequences eps
                                    --where 
                                        --eps.game_id = 'SDN202308180'
                                        --AND event_key = '654600812'   
                                    ),
                                    ComputedStike AS (SELECT
                                        pc.game_id,
                                        pc.event_key,
                                        pc.sequence_id,
                                        pc.sequence_item,
                                        pc.BallCount,
                                        pc.StrikeCount,
                                        pc.StrikeCount as StrikeCount2,
                                        FROM PitchCount pc
                                    ),
                                    PlateAppearanceCountResults AS (SELECT 
                                        gl.season,
                                        gl.date,
                                        gl.doubleheader_status,
                                        gl.game_type, 
                                        pcr.game_id, 
                                        ev.batter_id,
                                        ev.pitcher_id,
                                        ev.batting_side,
                                        gl.away_team_id,
                                        gl.home_team_id,
                                        pcr.event_key, 
                                        pcr.sequence_id, 
                                        pcr.sequence_item,
                                        pcr.BallCount,
                                        pcr.StrikeCount,
                                        pcr.StrikeCount2,
                                        COUNT(CASE WHEN pcr.sequence_item IN ('AutomaticBall',
                                                                            'Ball',
                                                                            'Pitchout',
                                                                            'IntentionalBall') THEN 1 END) AS Ball, 
                                        count(CASE WHEN pcr.sequence_item = 'SwingStrike' THEN 1 END) AS swingStrike, 
                                        count(CASE WHEN pcr.sequence_item = 'CalledStrike' THEN 1 END) AS CalledStrike,
                                        count(CASE WHEN pcr.sequence_item IN ('CalledStrike', 
                                                                            'MissedBunt', 
                                                                            'StrikeUnknownType', 
                                                                            'SwingingOnPitchout', 
                                                                            'SwingingStrike',
                                                                            'Foul',
                                                                            'FoulBunt', 
                                                                            'FoulOnPitchout', 
                                                                            'FoulTip', 
                                                                            'FoulTipBunt') 
                                                          THEN 1 END) AS Strike,
                                        ev.outs,
                                        ev.base_state,
                                        ev.outs_on_play,
                                        ev.runs_on_play,
                                        ev.plate_appearance_result,
                                        ev.batted_trajectory
                                        FROM ComputedStike pcr
                                        LEFT JOIN event.EVENTS EV ON pcr.game_id = ev.game_id 
                                            AND pcr.event_key = ev.event_key
                                        LEFT JOIN stg_gamelog GL on pcr.game_id = gl.game_id 
                                        --where pcr.StrikeCount >= 2 AND pcr.game_id = 'SDN202308180' 
                                        GROUP BY 
                                            gl.season, 
                                            gl.date, 
                                            gl.doubleheader_status, 
                                            gl.date, 
                                            gl.game_type,
                                            pcr.game_id, 
                                            pcr.event_key, 
                                            pcr.sequence_id, 
                                            pcr.sequence_item, 
                                            pcr.BallCount, 
                                            StrikeCount,
                                            StrikeCount2,
                                            ev.batter_id, 
                                            ev.pitcher_id, 
                                            ev.batting_side,
                                            gl.away_team_id, 
                                            gl.home_team_id, 
                                            ev.outs, 
                                            ev.base_state, 
                                            ev.outs_on_play, 
                                            ev.runs_on_play, 
                                            ev.plate_appearance_result, 
                                            ev.batted_trajectory  
                                    )
                                    SELECT * from PlateAppearanceCountResults pcr
                                    where season = 2023 AND game_id = 'SDN202308180' --AND event_key = '654600812'
                                    ORDER BY event_key, sequence_id
                                    """).df()
df

Unnamed: 0,season,date,doubleheader_status,game_type,game_id,batter_id,pitcher_id,batting_side,away_team_id,home_team_id,event_key,sequence_id,sequence_item,BallCount,StrikeCount,StrikeCount2,Ball,swingStrike,CalledStrike,Strike,outs,base_state,outs_on_play,runs_on_play,plate_appearance_result,batted_trajectory
0,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,carrc005,lugos001,Away,ARI,SDN,654600811,1,Foul,0.0,0.0,0.0,0,0,0,1,0,0,1,0,InPlayOut,Fly
1,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,carrc005,lugos001,Away,ARI,SDN,654600811,2,InPlay,0.0,0.0,0.0,0,0,0,0,0,0,1,0,InPlayOut,Fly
2,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,martk001,lugos001,Away,ARI,SDN,654600812,1,Ball,1.0,0.0,0.0,1,0,0,0,1,0,0,0,Walk,
3,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,martk001,lugos001,Away,ARI,SDN,654600812,2,Ball,2.0,0.0,0.0,1,0,0,0,1,0,0,0,Walk,
4,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,martk001,lugos001,Away,ARI,SDN,654600812,3,Ball,3.0,0.0,0.0,1,0,0,0,1,0,0,0,Walk,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,kennb002,hadej001,Away,ARI,SDN,654600887,4,Ball,2.0,1.0,1.0,1,0,0,0,1,0,1,0,StrikeOut,
252,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,kennb002,hadej001,Away,ARI,SDN,654600887,5,SwingingStrike,2.0,2.0,2.0,0,0,0,1,1,0,1,0,StrikeOut,
253,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,lewik001,hadej001,Away,ARI,SDN,654600889,1,CalledStrike,0.0,1.0,1.0,0,0,1,1,2,0,1,0,StrikeOut,
254,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,lewik001,hadej001,Away,ARI,SDN,654600889,2,SwingingStrike,0.0,2.0,2.0,0,0,0,1,2,0,1,0,StrikeOut,


In [39]:
import numpy as np

foulOut = ['FoulBunt', 'FoulTip', 'FoulTipBunt']
foul = ['Foul', 'FoulBunt', 'FoulOnPitchout', 'FoulTip', 'FoulTipBunt']
nonStrike = ['Ball', 'Pitchout', 'IntentionalBall', 'InPlay', 'InPlayOnPitchout', 'HitByPitch','NoPitch', 'PickoffAttemptFirst', 'PickoffAttemptSecond', 'PickoffAttemptThird']

df['StrikeCount2'] = df.groupby(['game_id', 'event_key'])['StrikeCount2'].transform(
   lambda x: np.where((x < 2) & (df.loc[x.index, 'sequence_item'].isin(foul)), x + 1,
          np.where((x == 2) & (df.loc[x.index, 'sequence_item'].isin(foulOut)), 3.0,
                   np.where((x.isin([1,2])) & (df.loc[x.index, 'sequence_item'].isin(nonStrike)), x.shift(1),
                            np.where((x == 0) & (df.loc[x.index, 'sequence_item'].isin(nonStrike)), 0, x)))))

df

Unnamed: 0,season,date,doubleheader_status,game_type,game_id,batter_id,pitcher_id,batting_side,away_team_id,home_team_id,event_key,sequence_id,sequence_item,BallCount,StrikeCount,StrikeCount2,Ball,swingStrike,CalledStrike,Strike,outs,base_state,outs_on_play,runs_on_play,plate_appearance_result,batted_trajectory
0,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,carrc005,lugos001,Away,ARI,SDN,654600811,1,Foul,0.0,0.0,2.0,0,0,0,1,0,0,1,0,InPlayOut,Fly
1,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,carrc005,lugos001,Away,ARI,SDN,654600811,2,InPlay,0.0,0.0,-1.0,0,0,0,0,0,0,1,0,InPlayOut,Fly
2,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,martk001,lugos001,Away,ARI,SDN,654600812,1,Ball,1.0,0.0,-1.0,1,0,0,0,1,0,0,0,Walk,
3,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,martk001,lugos001,Away,ARI,SDN,654600812,2,Ball,2.0,0.0,-1.0,1,0,0,0,1,0,0,0,Walk,
4,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,martk001,lugos001,Away,ARI,SDN,654600812,3,Ball,3.0,0.0,-1.0,1,0,0,0,1,0,0,0,Walk,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,kennb002,hadej001,Away,ARI,SDN,654600887,4,Ball,2.0,1.0,-1.0,1,0,0,0,1,0,1,0,StrikeOut,
252,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,kennb002,hadej001,Away,ARI,SDN,654600887,5,SwingingStrike,2.0,2.0,2.0,0,0,0,1,1,0,1,0,StrikeOut,
253,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,lewik001,hadej001,Away,ARI,SDN,654600889,1,CalledStrike,0.0,1.0,1.0,0,0,1,1,2,0,1,0,StrikeOut,
254,2023,2023-08-18,SingleGame,RegularSeason,SDN202308180,lewik001,hadej001,Away,ARI,SDN,654600889,2,SwingingStrike,0.0,2.0,2.0,0,0,0,1,2,0,1,0,StrikeOut,


In [29]:
grouped = df.groupby(['game_id', 'event_key'])['StrikeCount2']
for name, group in grouped:
    print(name)
    print(group)

('SDN202308180', 654600811)
0    2.0
1    1.0
Name: StrikeCount2, dtype: float64
('SDN202308180', 654600812)
2    0.0
3    0.0
4    0.0
5    1.0
6    1.0
Name: StrikeCount2, dtype: float64
('SDN202308180', 654600813)
7     1.0
8     1.0
9     2.0
10    2.0
11    2.0
12    3.0
Name: StrikeCount2, dtype: float64
('SDN202308180', 654600814)
13    3.0
14    1.0
15    1.0
16    1.0
Name: StrikeCount2, dtype: float64
('SDN202308180', 654600815)
17    2.0
18    1.0
Name: StrikeCount2, dtype: float64
('SDN202308180', 654600816)
19    1.0
20    1.0
21    2.0
22    2.0
23    2.0
24    2.0
25    2.0
Name: StrikeCount2, dtype: float64
('SDN202308180', 654600817)
26    2.0
27    2.0
28    1.0
29    2.0
30    2.0
31    2.0
32    2.0
33    2.0
34    2.0
Name: StrikeCount2, dtype: float64
('SDN202308180', 654600818)
35    1.0
36    1.0
37    2.0
38    2.0
39    2.0
40    2.0
Name: StrikeCount2, dtype: float64
('SDN202308180', 654600819)
41    1.0
42    1.0
43    1.0
44    1.0
45    2.0
46    2.0
Name:

These are previously used methodologies that I shifted from because 1) df2 is not counting StrikeCount2 correctly using SQL and I couldn't get the code to do
what I wanted. 2) df3 was not seeming counting strikes as a whole correctly. Due to these issues, I shifted to a SQL query with python used as a manipulation 
tool, which would be typical of my usual workflow anyway. 

In [None]:
df2: pd.DataFrame = conn.sql("""WITH PitchCount AS (SELECT
                                    eps.game_id,
                                    eps.event_key,
                                    eps.sequence_id,
                                    eps.sequence_item,
                                    SUM(CASE WHEN sequence_item IN ('AutomaticBall',
                                                                    'Ball',
                                                                    'Pitchout',
                                                                    'IntentionalBall') THEN 1 ELSE 0 
                                        END) OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id
                                                    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS BallCount,
                                    SUM(CASE WHEN sequence_item IN ('CalledStrike', 
                                                                    'MissedBunt', 
                                                                    'StrikeUnknownType', 
                                                                    'SwingingOnPitchout', 
                                                                    'SwingingStrike') THEN 1 ELSE 0 
                                    END) OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id) AS StrikeCount,
                                    SUM(CASE WHEN sequence_item IN ('CalledStrike', 
                                                                    'MissedBunt', 
                                                                    'StrikeUnknownType', 
                                                                    'SwingingOnPitchout', 
                                                                    'SwingingStrike') THEN 1 ELSE 0 
                                    END) OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id
                                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS StrikeCount2
                                    FROM event.event_pitch_sequences eps
                                    --where 
                                        --eps.game_id = 'SDN202308180'
                                        --AND event_key = '654600812'   
                                    ),
                                    ComputedStike AS (SELECT
                                        pc.game_id,
                                        pc.event_key,
                                        pc.sequence_id,
                                        pc.sequence_item,
                                        pc.BallCount,
                                        pc.StrikeCount,
                                        CASE 
                                        -- For balls or put in play, keep StrikeCount unchanged
                                        WHEN pc.sequence_item IN ('Ball', 'Pitchout', 'IntentionalBall', 'InPlay', 'InPlayOnPitchout', 'HitByPitch'
                                                                    ,'NoPitch', 'PickoffAttemptFirst', 'PickoffAttemptSecond', 'PickoffAttemptThird') THEN 
                                            LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id
                                                                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
                                        
                                        -- For fouls, increment StrikeCount if it's less than 2
                                        WHEN pc.sequence_item IN ('Foul', 'FoulBunt', 'FoulOnPitchout', 'FoulTip', 'FoulTipBunt') 
                                             AND LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) < 2 THEN 
                                            LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id
                                                                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) + 1
                                
                                        -- Once StrikeCount = 2, don't increment it for foul balls that are not tipped
                                        WHEN pc.sequence_item IN ('Foul', 'FoulOnPitchout') 
                                             AND LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) >= 2 THEN 
                                            LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id
                                                                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)

                                        -- Once StrikeCount is 2, increment it for foul balls that are tipped
                                        WHEN pc.sequence_item IN ('FoulBunt', 'FoulTip', 'FoulTipBunt') 
                                             AND LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) >= 2 THEN 
                                            LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id
                                                                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) + 1

                                        -- For strikes, keep the same StrikeCount
                                        WHEN pc.sequence_item IN ('CalledStrike', 'SwingingStrike', 'StrikeUnknownType') THEN 
                                            -- This ensures that StrikeCount2 remains the same for strikes
                                            LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id
                                                                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) 
                                
                                        -- Default case, just keep the same StrikeCount
                                        ELSE LAST_VALUE(StrikeCount2) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id
                                                                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
                                        END AS StrikeCount2,
                                        FROM PitchCount pc
                                    ),
                                    PlateAppearanceCountResults AS (SELECT 
                                        gl.season,
                                        gl.date,
                                        gl.doubleheader_status,
                                        gl.game_type, 
                                        pcr.game_id, 
                                        ev.batter_id,
                                        ev.pitcher_id,
                                        ev.batting_side,
                                        gl.away_team_id,
                                        gl.home_team_id,
                                        pcr.event_key, 
                                        pcr.sequence_id, 
                                        pcr.sequence_item,
                                        pcr.BallCount,
                                        pcr.StrikeCount,
                                        pcr.StrikeCount2,
                                        COUNT(CASE WHEN pcr.sequence_item IN ('AutomaticBall',
                                                                            'Ball',
                                                                            'Pitchout',
                                                                            'IntentionalBall') THEN 1 END) AS Ball, 
                                        count(CASE WHEN pcr.sequence_item = 'SwingStrike' THEN 1 END) AS swingStrike, 
                                        count(CASE WHEN pcr.sequence_item = 'CalledStrike' THEN 1 END) AS CalledStrike,
                                        count(CASE WHEN pcr.sequence_item IN ('CalledStrike', 
                                                                            'MissedBunt', 
                                                                            'StrikeUnknownType', 
                                                                            'SwingingOnPitchout', 
                                                                            'SwingingStrike',
                                                                            'Foul',
                                                                            'FoulBunt', 
                                                                            'FoulOnPitchout', 
                                                                            'FoulTip', 
                                                                            'FoulTipBunt') 
                                                          THEN 1 END) AS Strike,
                                        ev.outs,
                                        ev.base_state,
                                        ev.outs_on_play,
                                        ev.runs_on_play,
                                        ev.plate_appearance_result,
                                        ev.batted_trajectory
                                        FROM ComputedStike pcr
                                        LEFT JOIN event.EVENTS EV ON pcr.game_id = ev.game_id 
                                            AND pcr.event_key = ev.event_key
                                        LEFT JOIN stg_gamelog GL on pcr.game_id = gl.game_id 
                                        --where pcr.StrikeCount >= 2 AND pcr.game_id = 'SDN202308180' 
                                        GROUP BY 
                                            gl.season, 
                                            gl.date, 
                                            gl.doubleheader_status, 
                                            gl.date, 
                                            gl.game_type,
                                            pcr.game_id, 
                                            pcr.event_key, 
                                            pcr.sequence_id, 
                                            pcr.sequence_item, 
                                            pcr.BallCount, 
                                            StrikeCount,
                                            StrikeCount2,
                                            ev.batter_id, 
                                            ev.pitcher_id, 
                                            ev.batting_side,
                                            gl.away_team_id, 
                                            gl.home_team_id, 
                                            ev.outs, 
                                            ev.base_state, 
                                            ev.outs_on_play, 
                                            ev.runs_on_play, 
                                            ev.plate_appearance_result, 
                                            ev.batted_trajectory  
                                    )
                                    SELECT * from PlateAppearanceCountResults pcr
                                    where season = 2023 AND game_id = 'SDN202308180' --AND event_key = '654600812'
                                    ORDER BY event_key, sequence_id
                                    """).df()
df2

In [None]:
df3: pd.DataFrame = conn.sql("""WITH PitchCount AS (select
                                                        eps.game_id,
                                                        eps.event_key,
                                                        eps.sequence_id,
                                                        eps.sequence_item,
                                                        SUM(CASE WHEN sequence_item IN ('AutomaticBall',
                                                                                        'Ball',
                                                                                        'Pitchout',
                                                                                        'IntentionalBall') THEN 1 ELSE 0 END)
                                                            OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id) AS BallCount,
                                                        SUM(CASE WHEN sequence_item IN ('CalledStrike', 
                                                                                        'MissedBunt', 
                                                                                        'StrikeUnknownType', 
                                                                                        'SwingingOnPitchout', 
                                                                                        'SwingingStrike') THEN 1 
                                                            ELSE 0 
                                                        END)
                                                            OVER(PARTITION BY eps.game_id, eps.event_key order by eps.sequence_id) AS StrikeCount
                                                    FROM event.event_pitch_sequences eps
                                                    where 
                                                        eps.game_id = 'SDN202308180'
                                                        --AND event_key = '654600812'
                                                        
                                    )
                                    select gl.season,
                                        gl.date,
                                        gl.doubleheader_status,
                                        gl.game_type, 
                                        pc.game_id, 
                                        ev.batter_id,
                                        ev.pitcher_id,
                                        ev.batting_side,
                                        gl.away_team_id,
                                        gl.home_team_id,
                                        pc.event_key, 
                                        pc.sequence_id, 
                                        pc.sequence_item,
                                        pc.BallCount,
                                        CASE 
                                            -- For strikes, keep the same StrikeCount
                                            WHEN sequence_item IN ('CalledStrike', 'SwingingStrike', 'StrikeUnknownType') THEN 
                                                -- This ensures that StrikeCount remains the same for strikes
                                                LAST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id)
                                            
                                            -- For balls, keep StrikeCount unchanged
                                            WHEN sequence_item IN ('Ball', 'Pitchout', 'IntentionalBall') THEN 
                                                FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id)
                                            
                                            -- For fouls, increment StrikeCount if it's less than 2
                                            WHEN sequence_item IN ('Foul', 'FoulBunt', 'FoulOnPitchout', 'FoulTip', 'FoulTipBunt') 
                                                 AND FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) < 2 THEN 
                                                FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) + 1
                                    
                                            -- Once StrikeCount is 2, don't increment it for foul balls that are not tipped
                                            WHEN sequence_item IN ('Foul', 'FoulOnPitchout') 
                                                 AND FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) >= 2 THEN 
                                                FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id)

                                            -- Once StrikeCount is 2, increment it for foul balls that are tipped
                                            WHEN sequence_item IN ('FoulBunt', 'FoulTip', 'FoulTipBunt') 
                                                 AND FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) >= 2 THEN 
                                                FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id) + 1
                                    
                                            -- Default case, just keep the same StrikeCount
                                            ELSE FIRST_VALUE(pc.StrikeCount) OVER (PARTITION BY pc.game_id, pc.event_key ORDER BY pc.sequence_id)
                                        END AS StrikeCount,
                                        COUNT(CASE WHEN sequence_item IN ('AutomaticBall',
                                                                            'Ball',
                                                                            'Pitchout',
                                                                            'IntentionalBall') THEN 1 END) AS Ball, 
                                        count(CASE WHEN sequence_item = 'SwingStrike' THEN 1 END) AS swingStrike, 
                                        count(CASE WHEN sequence_item = 'CalledStrike' THEN 1 END) AS CalledStrike,
                                        count(CASE WHEN sequence_item IN ('CalledStrike', 
                                                                            'MissedBunt', 
                                                                            'StrikeUnknownType', 
                                                                            'SwingingOnPitchout', 
                                                                            'SwingingStrike',
                                                                            'Foul',
                                                                            'FoulBunt', 
                                                                            'FoulOnPitchout', 
                                                                            'FoulTip', 
                                                                            'FoulTipBunt') 
                                                          THEN 1 END) AS Strike,
                                        ev.outs,
                                        ev.base_state,
                                        ev.outs_on_play,
                                        ev.runs_on_play,
                                        ev.plate_appearance_result,
                                        ev.batted_trajectory,
                    from PitchCount pc
                    LEFT JOIN event.EVENTS EV ON pc.game_id = ev.game_id 
                        AND pc.event_key = ev.event_key
                    LEFT JOIN stg_gamelog GL on pc.game_id = gl.game_id 
                    where StrikeCount >= 2 AND pc.game_id = 'SDN202308180' 
                    GROUP BY 
                        gl.season, gl.date, gl.doubleheader_status, gl.date, gl.game_type,
                        pc.game_id, pc.event_key, pc.sequence_id, pc.sequence_item, pc.BallCount, StrikeCount, ev.batter_id, ev.pitcher_id, ev.batting_side,
                        gl.away_team_id, gl.home_team_id, ev.outs, ev.base_state, ev.outs_on_play, ev.runs_on_play, ev.plate_appearance_result, ev.batted_trajectory,
                    order by pc.event_key, pc.sequence_id """).df()

df3

In [None]:
# Identify what values are in TableB and not in TableA
key_diff = set(df.event_key).difference(df1.event_key)
where_diff = df.event_key.isin(key_diff)

where_diff

In [None]:
conn.close()