In [1]:
import pandas as pd
import pyarrow.dataset as pads
import pyarrow as pa
import os
import duckdb as db

# Display configuration: lift pandas' column limit so wide query results are
# rendered with every column visible.
pd.options.display.max_columns = None

In [2]:
# Open the DuckDB database file (smt_2025.db) that the rest of the notebook queries.
# Set the SMT_DB_PATH environment variable to point at your own copy of the file;
# the original author's location is kept as the fallback so existing setups
# continue to work unchanged.
con = db.connect(
    os.environ.get("SMT_DB_PATH", "/Users/buttz/Desktop/SMTChallenge2025/smt_2025.db")
)

In [3]:
# Lookup table: numeric event_code -> human-readable event description.
# Only the codes listed here are known; there are no entries for 12-15.
event_code_dict = dict([
    (1, 'pitch'),
    (2, 'ball acquired'),
    (3, 'throw (ball-in-play)'),
    (4, 'ball hit into play'),
    (5, 'end of play'),
    (6, 'pickoff throw'),
    (7, 'ball acquired - unknown field position'),
    (8, 'throw (ball-in-play) - unknown field position'),
    (9, 'ball deflection'),
    (10, 'ball deflection off of wall'),
    (11, 'home run'),
    (16, 'ball bounce'),
])

# Lookup table: numeric player_position -> role label.
# 1-9 are fielders, 10-13 the batter and runners, 14-19 umpires and coaches;
# 255 marks a ball event that has no player attached.
player_position_dict = dict([
    (1, 'pitcher'),
    (2, 'catcher'),
    (3, 'first basemen'),
    (4, 'second basemen'),
    (5, 'third basemen'),
    (6, 'shortstop'),
    (7, 'left field'),
    (8, 'center field'),
    (9, 'right field'),
    (10, 'batter'),
    (11, 'runner on first base'),
    (12, 'runner on second base'),
    (13, 'runner on third base'),
    (255, 'ball event with no player (e.g., ball bounce)'),
    (14, 'home plate umpire'),
    (15, 'field umpire #1'),
    (16, 'field umpire #2'),
    (17, 'field umpire #3'),
    (18, 'first base coach'),
    (19, 'third base coach'),
])

In [27]:
# Load every player-position row for one game and attach a readable role label.
# Position codes that are absent from player_position_dict get a missing label.
player_pos = (
    con.execute("SELECT * FROM player_pos WHERE game_str = 'y1_d069_ACN_QEA'")
    .df()
    .copy()
    .assign(
        player_position_name=lambda positions: positions['player_position'].map(player_position_dict)
    )
)
display(player_pos.shape)
player_pos

(217316, 11)

Unnamed: 0,game_str,play_id,timestamp,player_position,field_x,field_y,home_team,away_team,year,day,player_position_name
0,y1_d069_ACN_QEA,1,14853,1,0.6102,58.6314,QEA,ACN,year_1,day_069,pitcher
1,y1_d069_ACN_QEA,1,14853,2,1.7643,-6.4374,QEA,ACN,year_1,day_069,catcher
2,y1_d069_ACN_QEA,1,14853,3,69.9423,98.1246,QEA,ACN,year_1,day_069,first basemen
3,y1_d069_ACN_QEA,1,14853,4,37.2924,146.9091,QEA,ACN,year_1,day_069,second basemen
4,y1_d069_ACN_QEA,1,14853,5,-66.9441,86.1147,QEA,ACN,year_1,day_069,third basemen
...,...,...,...,...,...,...,...,...,...,...,...
217311,y1_d069_ACN_QEA,155,6167694,8,-16.2468,299.7699,QEA,ACN,year_1,day_069,center field
217312,y1_d069_ACN_QEA,155,6167694,9,106.2663,267.1890,QEA,ACN,year_1,day_069,right field
217313,y1_d069_ACN_QEA,155,6167694,10,50.2662,40.9512,QEA,ACN,year_1,day_069,batter
217314,y1_d069_ACN_QEA,155,6167694,15,65.6535,81.0207,QEA,ACN,year_1,day_069,field umpire #1


In [28]:
# Load the complete rosters table (no game filter) and preview the first rows.
rosters = (
    con.execute("SELECT * FROM rosters")
    .df()
    .copy()
)
display(rosters.shape)
rosters.head()

(143, 5)

Unnamed: 0,DYE,OXG,QEA,RZQ,YJD
0,DYE-0009,OXG-0018,QEA-0008,RZQ-0003,YJD-0001
1,DYE-0012,OXG-0029,QEA-0010,RZQ-0004,YJD-0002
2,DYE-0015,OXG-0052,QEA-0013,RZQ-0005,YJD-0007
3,DYE-0022,OXG-0054,QEA-0027,RZQ-0006,YJD-0011
4,DYE-0023,OXG-0061,QEA-0039,RZQ-0014,YJD-0016


In [29]:
# Load the event log for one game and decode both numeric code columns.
# Codes with no dictionary entry are left missing; in the rendered output
# player_position 0 on the 'end of play' row has no label.
game_events = (
    con.execute("SELECT * FROM game_events WHERE game_str = 'y1_d069_ACN_QEA'")
    .df()
    .copy()
    .assign(
        event_code_name=lambda events: events['event_code'].map(event_code_dict),
        player_position_name=lambda events: events['player_position'].map(player_position_dict),
    )
)
display(game_events.shape)
game_events.head()

(633, 13)

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,year,day,event_code_name,player_position_name
0,y1_d069_ACN_QEA,1,1,1,14853,1,1,QEA,ACN,year_1,day_069,pitch,pitcher
1,y1_d069_ACN_QEA,1,1,1,15303,2,2,QEA,ACN,year_1,day_069,ball acquired,catcher
2,y1_d069_ACN_QEA,1,1,1,15303,0,5,QEA,ACN,year_1,day_069,end of play,
3,y1_d069_ACN_QEA,2,1,2,27753,1,1,QEA,ACN,year_1,day_069,pitch,pitcher
4,y1_d069_ACN_QEA,2,1,2,28253,2,2,QEA,ACN,year_1,day_069,ball acquired,catcher


In [30]:
# Distinct leading tokens of game_str (the text before the first underscore,
# e.g. 'y1'). The vectorised .str accessor replaces the row-wise apply, whose
# "".join(...) over a single string was a no-op; missing values are passed
# through instead of raising. n=1 stops splitting after the first underscore.
game_events['game_str'].str.split('_', n=1).str[0].unique()

array(['y1'], dtype=object)

In [31]:
# Load the ball-position rows for one game and preview the first rows.
ball_pos = (
    con.execute("SELECT * FROM ball_pos WHERE game_str = 'y1_d069_ACN_QEA'")
    .df()
    .copy()
)
display(ball_pos.shape)
ball_pos.head()

(5460, 10)

Unnamed: 0,game_str,play_id,timestamp,ball_position_x,ball_position_y,ball_position_z,home_team,away_team,year,day
0,y1_d069_ACN_QEA,1,14853,3.50046,52.0443,5.71419,QEA,ACN,year_1,day_069
1,y1_d069_ACN_QEA,1,14903,3.21072,45.6972,5.54322,QEA,ACN,year_1,day_069
2,y1_d069_ACN_QEA,1,14953,2.92995,39.4179,5.31909,QEA,ACN,year_1,day_069
3,y1_d069_ACN_QEA,1,15003,2.658168,33.2067,5.0418,QEA,ACN,year_1,day_069
4,y1_d069_ACN_QEA,1,15053,2.395365,27.06366,4.71132,QEA,ACN,year_1,day_069


In [33]:
# Load the game_info rows for one game; the full frame is shown as the
# cell output rather than a head() preview.
game_info = (
    con.execute("SELECT * FROM game_info WHERE game_str = 'y1_d069_ACN_QEA'")
    .df()
    .copy()
)
display(game_info.shape)
game_info


(26, 21)

Unnamed: 0,game_str,home_team,away_team,at_bat,play_per_game,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,year,day
0,y1_d069_ACN_QEA,QEA,ACN,1.0,1.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,year_1,day_069
1,y1_d069_ACN_QEA,QEA,ACN,1.0,2.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,year_1,day_069
2,y1_d069_ACN_QEA,QEA,ACN,1.0,3.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,year_1,day_069
3,y1_d069_ACN_QEA,QEA,ACN,2.0,4.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1416,,,,year_1,day_069
4,y1_d069_ACN_QEA,QEA,ACN,69.0,134.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1147,,,,year_1,day_069
5,y1_d069_ACN_QEA,QEA,ACN,69.0,135.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1147,,,,year_1,day_069
6,y1_d069_ACN_QEA,QEA,ACN,70.0,136.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1147,,,,year_1,day_069
7,y1_d069_ACN_QEA,QEA,ACN,70.0,137.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1147,,,,year_1,day_069
8,y1_d069_ACN_QEA,QEA,ACN,70.0,138.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1147,,,,year_1,day_069
9,y1_d069_ACN_QEA,QEA,ACN,70.0,139.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1924,,,,year_1,day_069
