### mining 모듈이 Import 되지 않을 때 실행
---

In [1]:
import sys
import os

# Move into the repository checkout so that the `mining` package can be imported.
# The isdir guard keeps this cell safe to re-run: after the first run the working
# directory is already the repository, the relative path no longer resolves, and an
# unconditional os.chdir would raise FileNotFoundError.
repo_dir = "./Process-mining-research-on-MLB-pitchers-pitch-types"
if os.path.isdir(repo_dir):
    os.chdir(repo_dir)

# Put the (possibly new) working directory at the front of the import search path,
# but only once, so repeated runs do not pile up duplicate entries.
current_dir = os.path.abspath(os.getcwd())
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

### 1. One Step EDA
---

In [2]:
from mining import one_step_EDA_from_bigquery 
import warnings
# Silence every warning from here on so the cell outputs stay readable.
warnings.filterwarnings(action='ignore')

# Arguments for the one-call EDA entry point, gathered in one place.
eda_options = {
    'path': "key.json",
    'limit': None,
    'start_name': 'start',
    'end_name': 'end',
    'case_type': None,
}
eda = one_step_EDA_from_bigquery(**eda_options)

In [3]:
# Show the transition frequency table; the output below lists one row per
# (Source, Target) pair with its value in the `Variable` column.
eda.Transition.Frequency.all_cnts

Unnamed: 0,Source,Target,Variable
0,start,SL,117
1,start,SI,333
2,start,CH,4
3,SL,SI,280
4,SL,SL,113
6,SL,CH,11
7,SI,SI,960
9,SI,SL,306
10,SI,CH,28
11,CH,SI,20


In [4]:
# Render the transition-probability visualization with layered=True, grouped=False.
# NOTE(review): the rendered output is not captured in this export, and the exact
# effect of the two flags is defined inside `mining` — confirm there.
eda.Transition.Probability.visualizer(layered=True, grouped=False)

### 2. Step-by-Step EDA
---

In [None]:
from mining.utils import load_data_from_bigquery
from mining.preprocessing import define_at_bat_cases
from mining.preprocessing import add_node_and_preprocess
from mining.preprocessing import one_way_filter
from mining.probability import BasedTraces
from mining.exploratory import ProcessEDA

In [6]:
# Load the raw data using the loader's default arguments.
df = load_data_from_bigquery()

# Preview the first ten rows, restricted to a handful of columns.
preview_cols = ['game_date', 'batter', 'pitch_type', 'events']
df.head(10)[preview_cols]

Unnamed: 0,game_date,batter,pitch_type,events
0,2019-10-01,665742,SI,single
1,2019-10-01,665742,SL,
2,2019-10-01,665742,SI,
3,2019-10-01,543685,SI,walk
4,2019-10-01,543685,SI,
5,2019-10-01,543685,SI,
6,2019-10-01,543685,SI,
7,2019-10-01,543685,SI,
8,2019-10-01,543685,SI,
9,2019-10-01,475582,SI,single


In [7]:
# Derive at-bat cases from the raw rows; the result carries the `processID`
# and `pitchOrder` columns shown in the preview.
df_grouped = define_at_bat_cases(df)

case_preview_cols = ['game_date', 'batter', 'pitch_type', 'events', 'processID', 'pitchOrder']
df_grouped[case_preview_cols].head(10)

Unnamed: 0,game_date,batter,pitch_type,events,processID,pitchOrder
0,2019-10-01,665742,SI,single,0,2
1,2019-10-01,665742,SL,,0,1
2,2019-10-01,665742,SI,,0,0
3,2019-10-01,543685,SI,walk,1,5
4,2019-10-01,543685,SI,,1,4
5,2019-10-01,543685,SI,,1,3
6,2019-10-01,543685,SI,,1,2
7,2019-10-01,543685,SI,,1,1
8,2019-10-01,543685,SI,,1,0
9,2019-10-01,475582,SI,single,2,3


In [8]:
# Collect the processID of every case containing at least one missing pitch_type.
pitch_type_missing = df_grouped['pitch_type'].isna()
missing_index = set(df_grouped.loc[pitch_type_missing, 'processID'])

# Boolean mask: True for rows whose case has no missing pitch_type at all.
valid_index = ~df_grouped['processID'].isin(missing_index)

# Drop whole cases (not just the individual rows with the missing value).
df_valid = df_grouped.loc[valid_index]

valid_preview_cols = ['game_date', 'batter', 'pitch_type', 'events', 'processID', 'pitchOrder']
df_valid[valid_preview_cols].head(10)

Unnamed: 0,game_date,batter,pitch_type,events,processID,pitchOrder
0,2019-10-01,665742,SI,single,0,2
1,2019-10-01,665742,SL,,0,1
2,2019-10-01,665742,SI,,0,0
3,2019-10-01,543685,SI,walk,1,5
4,2019-10-01,543685,SI,,1,4
5,2019-10-01,543685,SI,,1,3
6,2019-10-01,543685,SI,,1,2
7,2019-10-01,543685,SI,,1,1
8,2019-10-01,543685,SI,,1,0
9,2019-10-01,475582,SI,single,2,3


In [19]:
# Add the start and end nodes to the valid cases.
df_added = add_node_and_preprocess(df_valid, start_name='start', end_name='end', case_type=None)

added_preview_cols = ['case:concept:name', 'time:timestamp', 'batter', 'pitch_type', 'events', 'pitchOrder']
display(df_added[added_preview_cols].head(10))

# Print a titled value-count table for the pitch types, then for the events.
for table_title, column_name in (
    ('\n === 구종 테이블 ===', 'pitch_type'),
    ('\n === Events 테이블 ===', 'events'),
):
    print(table_title)
    display(df_added[column_name].value_counts().to_frame())


Unnamed: 0,case:concept:name,time:timestamp,batter,pitch_type,events,pitchOrder
0,0,2019-09-30 23:59:59,665742,start,,-1
1,0,2019-10-01 00:00:00,665742,SI,,0
2,0,2019-10-01 00:00:01,665742,SL,,1
3,0,2019-10-01 00:00:02,665742,SI,single,2
4,0,2019-10-01 00:00:03,665742,end,single,3
5,1,2019-09-30 23:59:59,543685,start,,-1
6,1,2019-10-01 00:00:00,543685,SI,,0
7,1,2019-10-01 00:00:01,543685,SI,,1
8,1,2019-10-01 00:00:02,543685,SI,,2
9,1,2019-10-01 00:00:03,543685,SI,,3



 === 구종 테이블 ===


Unnamed: 0_level_0,count
pitch_type,Unnamed: 1_level_1
SI,3394
SL,1121
start,1081
end,1081
CH,129
FF,4



 === Events 테이블 ===


Unnamed: 0_level_0,count
events,Unnamed: 1_level_1
strikeout,744
field_out,593
single,180
walk,159
home_run,66
double,46
hit_by_pitch,31
force_out,24
grounded_into_double_play,13
sac_fly,10


In [20]:
# Keep only the strikeout cases.
df_filtered = one_way_filter(df_added, colName='events', posCondition=['strikeout'])

filtered_preview_cols = ['case:concept:name', 'time:timestamp', 'batter', 'pitch_type', 'events', 'pitchOrder']
display(df_filtered[filtered_preview_cols].head(10))

# Print a titled value-count table for the pitch types, then for the events.
for table_title, column_name in (
    ('\n === 구종 테이블 ===', 'pitch_type'),
    ('\n === Events 테이블 ===', 'events'),
):
    print(table_title)
    display(df_filtered[column_name].value_counts().to_frame())


Unnamed: 0,case:concept:name,time:timestamp,batter,pitch_type,events,pitchOrder
19,3,2019-10-01 00:00:03,607208,start,strikeout,-1
20,3,2019-10-01 00:00:00,607208,SL,,0
21,3,2019-10-01 00:00:01,607208,SI,,1
22,3,2019-10-01 00:00:02,607208,SI,,2
23,3,2019-10-01 00:00:03,607208,SI,,3
24,3,2019-10-01 00:00:04,607208,SI,strikeout,4
25,3,2019-10-01 00:00:01,607208,end,,5
34,5,2019-09-30 23:59:59,645302,start,,-1
35,5,2019-10-01 00:00:00,645302,SI,,0
36,5,2019-10-01 00:00:01,645302,SL,,1



 === 구종 테이블 ===


Unnamed: 0_level_0,count
pitch_type,Unnamed: 1_level_1
SI,1593
SL,543
start,454
end,454
CH,46



 === Events 테이블 ===


Unnamed: 0_level_0,count
events,Unnamed: 1_level_1
strikeout,744


In [None]:
# Compute probabilities from the event-log data.
# BasedTraces is instantiated with the filtered log and then called with no
# arguments; the value returned by that call is what ProcessEDA consumes.
calc_eventlog = BasedTraces(df_filtered) 
final_result = calc_eventlog()

# Process EDA
# Rebinds `eda`, replacing the object created by the one-step EDA cell.
eda = ProcessEDA(final_result)

In [24]:
# Show the transition frequency table for the step-by-step pipeline; the output
# below lists one row per (Source, Target) pair with its value in `Variable`.
eda.Transition.Frequency.all_cnts

Unnamed: 0,Source,Target,Variable
0,start,SL,117
1,start,SI,333
2,start,CH,4
3,SL,SI,280
4,SL,SL,113
6,SL,CH,11
7,SI,SI,960
9,SI,SL,306
10,SI,CH,28
11,CH,SI,20


In [25]:
# Render the transition-probability visualization with layered=True, grouped=False.
# NOTE(review): the rendered output is not captured in this export, and the exact
# effect of the two flags is defined inside `mining` — confirm there.
eda.Transition.Probability.visualizer(layered=True, grouped=False)