In [1]:
import pandas as pd
import os 
import numpy as np
from collections import Counter

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

os.chdir('C:/Users/George/Documents/Baseball/2008-2017_pitch_data')

In [2]:
rawData = pd.read_csv('baseballdata1517.csv')
rawData = rawData.drop(['Unnamed: 0'], axis=1)

In [263]:
randomPitchers = list(pd.Series(rawData['pitcher'].unique()).sample(10))

In [264]:
data = rawData[rawData['pitcher'].isin(randomPitchers)]

In [265]:
data.player_name.unique()

array(['Neal Cotts', 'Kevin Jepsen', 'Samuel Deduno',
       'Odrisamer Despaigne', 'Liam Hendriks', 'Rafael Martin',
       'Taylor Jungmann', 'Rich Hill', 'Kelvin Marte', 'Daniel Descalso'],
      dtype=object)

In [266]:
#Binary on base not on a base
data[['on_3b', 'on_2b', 'on_1b']] = data[['on_3b', 'on_2b', 'on_1b']].isna() == False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [267]:
#adding indices 
data['game_date_index'] = data['game_date'].apply(lambda a: a.replace('-', ''))
data['game_index'] = data['game_date_index'] + '_' + data.away_team + '_' + data.home_team 
data['pitcher_apperance_index'] = data['game_date_index'] + '_' + data['pitcher'].astype(str)    
data['atbat_id'] = data['pitcher_apperance_index'] + "_" + data['at_bat_number'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [268]:
hit_list = ['double', 'single', 'home_run', 'triple']
data2 = pd.DataFrame()
for i in data['pitcher_apperance_index'].unique():
    
    #selecing single game
    apperance = data[data['pitcher_apperance_index'] == i] 
    apperance = apperance.sort_values(['at_bat_number','pitch_number']) #sorting at-bats and pitches
    
    #pitch count
    apperance['pitch_count_game'] = np.arange(len(apperance)) 
    
    #Number of walks
    apperance['number_of_walks'] = (apperance['events'] == 'walk').cumsum().shift(1).fillna(0)
    
    #Number of hits
    apperance['number_of_hits'] = apperance['events'].isin(hit_list).cumsum().shift(1).fillna(0)

    #Prior Pitch
    apperance['last_pitch'] = apperance['pitch_type'].shift(1).fillna('NewAB')

    data2 = data2.append(apperance)

In [269]:
data2['last_pitch'] = data2['pitch_type'].shift(1).fillna('NewAB')

In [270]:
conditions = [
    (data2['on_1b'] == True) & (data2['on_2b'] == True) & (data2['on_3b'] == True),
    (data2['on_2b'] == True) | (data2['on_3b'] == True),
    (data2['on_1b'] == True)
]

status = ['BL', 'RISP', '1B']

data2['Runners_On'] = np.select(conditions, status, default='BasesEmpty')

In [271]:
data2['Fav_Pitcher_Matchup'] = data2['p_throws'] == data2['stand']

In [272]:
toOneHot = np.array(data2[['Runners_On', 'last_pitch']])
onehotencoder = OneHotEncoder(sparse=False) 
oneHotData = onehotencoder.fit_transform(toOneHot)
oneHotData = pd.DataFrame(oneHotData, columns=onehotencoder.get_feature_names(['Runners_On', 'last_pitch']))
data2 = data2.reset_index(drop=True)
data2 = pd.concat([data2, oneHotData], axis=1)

In [273]:
data2['pitch_type'].fillna('FF', inplace = True)

In [274]:
# Label encoding Pitch Type so can be fed into models
labelencoder = LabelEncoder()
PitchTypeEncoded = labelencoder.fit_transform(data2['pitch_type'])

data2['pitch_type_le'] = PitchTypeEncoded

### Testing on different pitchers

In [344]:
selected_pitcher = randomPitchers[8]
single_pitcher = data2[data2['pitcher'] == selected_pitcher]
print(single_pitcher['player_name'].unique())
single_pitcher.pitch_type.value_counts()

['Kevin Jepsen']


FF    1258
CU     506
CH     176
FT      96
IN       6
Name: pitch_type, dtype: int64

In [345]:
pitch_arsenal = pd.DataFrame(single_pitcher.pitch_type.value_counts() / len(single_pitcher)).reset_index()
pitch_arsenal.columns = ['pitch', 'pitch_fequency']
pitchesToDrop = pitch_arsenal.loc[pitch_arsenal['pitch_fequency'] < 0.05, 'pitch'].values

In [346]:
pitch_columns = lambda a : "last_pitch_{}".format(a)

last_pitch_columns = pd.Series(single_pitcher['pitch_type'].unique()).apply(pitch_columns)
last_pitch_columns = list(last_pitch_columns)

In [347]:
StandardPredVars = ['strikes', 'balls','pitch_count_game','Runners_On_RISP', 'Fav_Pitcher_Matchup']
PredVars = StandardPredVars + last_pitch_columns
#PredVars.remove('last_pitch_nan')

In [348]:
single_pitcher.drop(single_pitcher[single_pitcher['pitch_type'].isin(pitchesToDrop)].index , inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [349]:
X = single_pitcher[PredVars]
y = single_pitcher['pitch_type_le'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [350]:
dt_clf = DecisionTreeClassifier(min_samples_split=2,
                                max_depth=4,
                                max_features=5,
                                random_state=23)
dt_clf.fit(X_train, y_train)
preds = dt_clf.predict(X_test)
acc = (preds == y_test).mean()
print('Accuracy is {}%'.format(round(acc*100,1)))

Accuracy is 66.2%


In [351]:
print(classification_report_imbalanced(y_test, preds))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00        58
          1       0.56      0.33      0.92      0.41      0.55      0.28       116
          4       0.68      0.91      0.23      0.78      0.46      0.22       311

avg / total       0.57      0.66      0.49      0.60      0.42      0.21       485



  'precision', 'predicted', average, warn_for)


### Trying out Imbalance Learn package

In [352]:
print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

Training target statistics: Counter({4: 947, 1: 390, 0: 118})
Testing target statistics: Counter({4: 311, 1: 116, 0: 58})


### Random Oversampled SVC 

In [353]:
pipe = make_pipeline(RandomOverSampler(random_state=23), SVC())
pipe.fit(X_train, y_train)
print(classification_report_imbalanced(y_test, pipe.predict(X_test)))



                   pre       rec       spe        f1       geo       iba       sup

          0       0.29      0.69      0.77      0.41      0.73      0.53        58
          1       0.42      0.61      0.73      0.50      0.67      0.44       116
          4       0.83      0.48      0.83      0.61      0.63      0.38       311

avg / total       0.67      0.53      0.80      0.56      0.65      0.41       485



### SVM SMOTE Oversampling

In [354]:
from imblearn.over_sampling import SVMSMOTE

In [355]:
sm = SVMSMOTE(random_state = 2) 
X_train_res, y_train_res = sm.fit_sample(np.array(X_train), y_train.ravel()) 

In [356]:
smote_clf = SVC(random_state=23)

smote_clf.fit(X_train_res, y_train_res)
preds = smote_clf.predict(X_test)



In [357]:
print(classification_report_imbalanced(y_test, preds))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.33      0.52      0.86      0.41      0.67      0.43        58
          1       0.39      0.57      0.72      0.46      0.64      0.41       116
          4       0.77      0.56      0.70      0.65      0.62      0.38       311

avg / total       0.63      0.56      0.72      0.57      0.63      0.39       485



In [358]:
(preds == y_test).mean()

0.5567010309278351

In [359]:
pd.Series(preds).value_counts() / len(preds)

4    0.468041
1    0.346392
0    0.185567
dtype: float64

In [360]:
pd.Series(y_test).value_counts() / len(y_test)

4    0.641237
1    0.239175
0    0.119588
Name: pitch_type_le, dtype: float64