In [56]:
%reset -fs

In [57]:
#!conda install -y -c conda-forge xgboost

In [58]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [59]:
# Column names the original placeholder frame was built with; presumably the
# layout of the CSV files (not enforced here). Kept so later cells can refer to it.
columns = ['date', 't1', 'pitcher', 't2', 'batter', 'inning', 'result', 'pitch_type',
           'mph', 'rpm', 'vbreak', 'up_down', 'hbreak', 'left_right', 'count']

# Read every CSV found in the month folders '4' .. '10' and stack them into a
# single frame. The frames are collected in a list and concatenated once, so
# the rows already loaded are not re-copied for every additional month.
loaded_frames = []
for month in range(4, 11):
    print(month)
    month_dir = str(month)
    # sorted() makes the row order independent of the OS directory-listing order.
    csv_names = sorted(name for name in os.listdir(month_dir) if name.endswith('.csv'))
    if not csv_names:
        # Same exception type pd.concat would raise on an empty list, with a clearer message.
        raise ValueError(f'No CSV files found in directory {month_dir!r}')
    for name in csv_names:
        # The first CSV column is used as the index and discarded by ignore_index below.
        loaded_frames.append(pd.read_csv(os.path.join(month_dir, name), index_col = 0))

df = pd.concat(loaded_frames, ignore_index = True)
    
            

4
5
6
7
8
9
10


Pitch types:
- 4-Seam Fastball
- Slider
- Sinker
- Changeup
- Curveball
- Cutter
- Knuckle Curve
- Splitter
- Fastball
- Slow Curve
- Eephus
- Knuckle Ball

Groups:
- 4-Seam/Fastball
- Slider
- Sinker
- Changeup
- Curve/Knuckle Curve/Slow Curve
- Knuckle Ball/ Splitter
- Cutter

Result types:
- Ball
- Foul
- Called Strike
- Swinging Strike
- In play, out(s)
- In play, no out
- Ball In Dirt
- In play, run(s)
- Foul Tip
- Hit By Pitch
- Foul Bunt
- Missed Bunt
- Pitchout

Groups:
- Strikes
- Balls - Filter out
- Contact
- Fouls - Filter out

In [60]:
# Cast the rpm, vbreak and hbreak columns to 64-bit integers. astype is the
# vectorised equivalent of calling int() on each element and yields the same
# int64 dtype; like int(), it raises if a value is missing or not integer-convertible.
for int_col in ('rpm', 'vbreak', 'hbreak'):
    df[int_col] = df[int_col].astype('int64')

In [61]:
# Count how often each distinct value occurs in the left_right column.
df['left_right'].value_counts()

←    327578
→    260159
Name: left_right, dtype: int64

In [62]:
# Turn the arrow marker into a boolean flag: True where the value is '←', False otherwise.
breaks_left = df['left_right'] == '←'
df['left_right'] = breaks_left

In [65]:
# Lookup table from each raw pitch outcome to its coarse class
# ('Ball', 'Foul', 'Strike' or 'Contact'), built by inverting a
# class -> outcomes listing.
result_dict = {
    outcome: label
    for label, outcomes in (
        ('Ball', ('Ball', 'Ball In Dirt', 'Hit By Pitch', 'Pitchout')),
        ('Foul', ('Foul',)),
        ('Strike', ('Called Strike', 'Swinging Strike', 'Foul Tip',
                    'Foul Bunt', 'Missed Bunt')),
        ('Contact', ('In play, out(s)', 'In play, no out', 'In play, run(s)')),
    )
    for outcome in outcomes
}

In [66]:
def group_pitches(x):
    """Collapse a raw pitch-type label into its modelling group.

    Rules are checked in order, so a label containing both 'Curve' and
    'Knuc' (e.g. 'Knuckle Curve') is grouped as a curveball.

    Parameters
    ----------
    x : str or any
        Raw pitch-type label. Non-string values (e.g. NaN or None for a
        missing label) are returned unchanged instead of raising TypeError,
        so missing entries can be handled by a later dropna.

    Returns
    -------
    'Fastball' if the label contains 'Fastball'; 'Curveball' if it contains
    'Curve'; 'Splitter' if it contains 'Knuc' or equals 'Splitter';
    otherwise the input itself.
    """
    # Substring tests below are only meaningful for strings.
    if not isinstance(x, str):
        return x
    if 'Fastball' in x:
        return 'Fastball'
    if 'Curve' in x:
        return 'Curveball'
    if 'Knuc' in x or x == 'Splitter':
        return 'Splitter'
    return x


In [67]:
# Count how often each distinct value occurs in the result column.
df['result'].value_counts()

Ball               201182
Foul               107570
Called Strike       99187
Swinging Strike     68763
In play, out(s)     67077
In play, no out     23025
Ball In Dirt        14730
In play, run(s)     13612
Foul Tip             5782
Hit By Pitch         1784
Foul Bunt            1302
Missed Bunt           311
Pitchout               25
Foul Pitchout           1
Name: result, dtype: int64

In [68]:
# Replace raw labels with their grouped versions. Outcomes absent from
# result_dict become NaN under .map; pitch types go through group_pitches.
grouped_pitch_types = df['pitch_type'].apply(group_pitches)
grouped_results = df['result'].map(result_dict)
df['result'] = grouped_results
df['pitch_type'] = grouped_pitch_types

In [69]:
# Count how often each distinct value currently occurs in the result column.
df['result'].value_counts()

Ball       202991
Strike     175345
Foul       107570
Contact    103714
Name: result, dtype: int64

In [70]:
# Drop every row that has a missing value in any column (this includes rows
# whose result label is NaN).
df = df.dropna()
# Keep only the two classes being modelled. .copy() gives an independent frame
# so the column assignment below does not write onto a slice of the old one.
df = df.loc[df['result'].isin(['Strike', 'Contact'])].copy()
# Encode the labels with a fixed order (Strike -> 0, Contact -> 1) so the
# integer codes do not depend on which class happens to appear first in the data.
uniques = pd.Index(['Strike', 'Contact'])
codes = uniques.get_indexer(df['result'])
df['result'] = codes

In [72]:
# Display `uniques`: the label at position i is the one encoded as integer i in df['result'].
uniques

Index(['Strike', 'Contact'], dtype='object')

In [73]:

# Pitch groups to model, one classifier per entry; the names are the labels
# that group_pitches leaves in df['pitch_type'].
pitch_types = ['Fastball', 'Slider', 'Sinker', 'Changeup', 'Curveball', 'Splitter', 'Cutter']



Features:
- MPH
- RPM
- VBreak
- HBreak
- Break is Left

In [74]:
from joblib import dump

In [76]:
# Hyper-parameters shared by every per-pitch model (hoisted out of the loop).
estimators = 300
max_depth = 5

# Train one random forest per pitch group, report its validation accuracy and
# save the fitted model to '<pitch_type>.rf'.
for pitch_type in pitch_types:
    df_filter = df.loc[df['pitch_type'] == pitch_type]
    X = df_filter[['mph', 'rpm', 'vbreak', 'hbreak', 'left_right']]
    y = df_filter['result']
    # 60/20/20 split: hold out 20% as test, then 25% of the remaining 80% as validation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=22)
    # random_state pins the forest's bootstrap/feature sampling so the reported
    # scores and the saved models are reproducible across runs.
    rf = RandomForestClassifier(n_estimators = estimators, max_depth = max_depth,
                                criterion = 'entropy', random_state = 22)
    rf.fit(X_train, y_train)
    print(f'Random Forest val score for {pitch_type}: {rf.score(X_val, y_val):.3f}')
    dump(rf, pitch_type + '.rf')

Random Forest val score for Fastball: 0.633
Random Forest val score for Slider: 0.672
Random Forest val score for Sinker: 0.579
Random Forest val score for Changeup: 0.568
Random Forest val score for Curveball: 0.683
Random Forest val score for Splitter: 0.623
Random Forest val score for Cutter: 0.613


Model Scores:
- Fastball: 0.633
- Slider: 0.672
- Sinker: 0.579
- Changeup: 0.568
- Curve/Knuckle Curve/Slow Curve: 0.683
- Knuckle Ball/Splitter: 0.623
- Cutter: 0.613