In [317]:
import pandas as pd
import os 
import numpy as np
from collections import Counter

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

os.chdir('C:/Users/George/Documents/Baseball/2008-2017_pitch_data')

In [2]:
rawData = pd.read_csv('baseballdata1517.csv')
rawData = rawData.drop(['Unnamed: 0'], axis=1)

In [3]:
randomPitchers = list(pd.Series(rawData['pitcher'].unique()).sample(10))

In [4]:
data = rawData[rawData['pitcher'].isin(randomPitchers)]

In [5]:
#Binary on base not on a base
data[['on_3b', 'on_2b', 'on_1b']] = data[['on_3b', 'on_2b', 'on_1b']].isna() == False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [6]:
#adding indices 
data['game_date_index'] = data['game_date'].apply(lambda a: a.replace('-', ''))
data['game_index'] = data['game_date_index'] + '_' + data.away_team + '_' + data.home_team 
data['pitcher_apperance_index'] = data['game_date_index'] + '_' + data['pitcher'].astype(str)    
data['atbat_id'] = data['pitcher_apperance_index'] + "_" + data['at_bat_number'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [7]:
hit_list = ['double', 'single', 'home_run', 'triple']
data2 = pd.DataFrame()
for i in data['pitcher_apperance_index'].unique():
    
    #selecing single game
    apperance = data[data['pitcher_apperance_index'] == i] 
    apperance = apperance.sort_values(['at_bat_number','pitch_number']) #sorting at-bats and pitches
    
    #pitch count
    apperance['pitch_count_game'] = np.arange(len(apperance)) 
    
    #Number of walks
    apperance['number_of_walks'] = (apperance['events'] == 'walk').cumsum().shift(1).fillna(0)
    
    #Number of hits
    apperance['number_of_hits'] = apperance['events'].isin(hit_list).cumsum().shift(1).fillna(0)
    
    data2 = data2.append(apperance)

In [8]:
data3 = pd.DataFrame()
for i in data2['atbat_id'].unique():
    atbat = data2[data2['atbat_id'] == i]
    atbat = atbat.sort_values('pitch_number')

    
    atbat['last_pitch'] = atbat['pitch_type'].shift(1).fillna('NewAB')

    atbat[['last_plate_x', 'last_plate_z']] = atbat[['plate_x', 'plate_z']].shift(1)
    atbat[['last_2_plate_x', 'last_2_plate_z']] = atbat[['plate_x', 'plate_z']].shift(1).rolling(2).mean()
    
    atbat['last_pitch_speed'] = atbat['effective_speed'].shift(1)
    atbat['last_2_pitch_speed'] = atbat['effective_speed'].shift(1).rolling(2).mean()
    
    data3 = data3.append(atbat)    

In [None]:
conditions = [
    (data3['on_1b'] == True) & (data3['on_2b'] == True) & (data3['on_3b'] == True),
    (data3['on_2b'] == True) | (data3['on_3b'] == True),
    (data3['on_1b'] == True)
]

status = ['BL', 'RISP', '1B']

data3['Runners_On'] = np.select(conditions, status, default='BasesEmpty')

In [10]:
data3['Fav_Pitcher_Matchup'] = data3['p_throws'] == data3['stand']

In [14]:
toOneHot = np.array(data3[['Runners_On', 'last_pitch']])
onehotencoder = OneHotEncoder(sparse=False) 
oneHotData = onehotencoder.fit_transform(toOneHot)
oneHotData = pd.DataFrame(oneHotData, columns=onehotencoder.get_feature_names(['Runners_On', 'last_pitch']))
data3 = data3.reset_index(drop=True)
data3 = pd.concat([data3, oneHotData], axis=1)

In [98]:
data3['pitch_type'].fillna('FF', inplace = True)

In [99]:
# Label encoding Pitch Type so can be fed into models
labelencoder = LabelEncoder()
PitchTypeEncoded = labelencoder.fit_transform(data3['pitch_type'])

data3['pitch_type_le'] = PitchTypeEncoded

### Testing on different pitchers

In [528]:
selected_pitcher = randomPitchers[1]
single_pitcher = data3[data3['pitcher'] == selected_pitcher]
single_pitcher.pitch_type.value_counts()

FF    3951
CH    1153
SL    1095
KC     929
FT     293
FC     248
IN       5
PO       1
Name: pitch_type, dtype: int64

In [529]:
pitch_columns = lambda a : "last_pitch_{}".format(a)

last_pitch_columns = pd.Series(single_pitcher['pitch_type'].unique()).apply(pitch_columns)
last_pitch_columns = list(last_pitch_columns)

In [530]:
StandardPredVars = ['strikes', 'balls','pitch_count_game','Runners_On_RISP', 'Fav_Pitcher_Matchup']
PredVars = StandardPredVars + last_pitch_columns
#PredVars.remove('last_pitch_nan')

In [531]:
single_pitcher.drop(single_pitcher[single_pitcher['pitch_type'].isin(['PO' ,'IN'])].index , inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [532]:
X = single_pitcher[PredVars]
y = single_pitcher['pitch_type_le'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [541]:
dt_clf = DecisionTreeClassifier(min_samples_split=2,
                                max_depth=4,
                                max_features=5,
                                random_state=23)
dt_clf.fit(X_train, y_train)
preds = dt_clf.predict(X_test)
acc = (preds == y_test).mean()
print('Accuracy is {}%'.format(round(acc*100,1)))

Accuracy is 49.4%


In [542]:
print(classification_report_imbalanced(y_test, preds))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       296
          2       0.00      0.00      1.00      0.00      0.00      0.00        75
          3       0.49      1.00      0.01      0.66      0.12      0.02       943
          4       0.00      0.00      1.00      0.00      0.00      0.00        73
          6       0.00      0.00      1.00      0.00      0.00      0.00       254
          9       0.44      0.03      0.99      0.05      0.16      0.02       277

avg / total       0.31      0.49      0.51      0.33      0.08      0.01      1918



  'precision', 'predicted', average, warn_for)


### Trying out Imbalance Learn package

In [543]:
print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

Training target statistics: Counter({3: 3008, 0: 857, 9: 818, 6: 675, 4: 220, 2: 173})
Testing target statistics: Counter({3: 943, 0: 296, 9: 277, 6: 254, 2: 75, 4: 73})


### Stock SVC

In [546]:
pipeline = make_pipeline(SVC(random_state=23))
pipeline.fit(X_train, y_train)
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))



                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       296
          2       0.00      0.00      1.00      0.00      0.00      0.00        75
          3       0.49      1.00      0.00      0.66      0.00      0.00       943
          4       0.00      0.00      1.00      0.00      0.00      0.00        73
          6       0.00      0.00      1.00      0.00      0.00      0.00       254
          9       0.00      0.00      1.00      0.00      0.00      0.00       277

avg / total       0.24      0.49      0.51      0.32      0.00      0.00      1918



  'precision', 'predicted', average, warn_for)


### Random Oversampled SVC 

In [547]:
pipe = make_pipeline(RandomOverSampler(random_state=23), SVC())
pipe.fit(X_train, y_train)
print(classification_report_imbalanced(y_test, pipe.predict(X_test)))



                   pre       rec       spe        f1       geo       iba       sup

          0       0.21      0.32      0.78      0.25      0.50      0.24       296
          2       0.04      0.17      0.84      0.07      0.38      0.14        75
          3       0.54      0.14      0.89      0.22      0.35      0.11       943
          4       0.10      0.29      0.89      0.15      0.51      0.24        73
          6       0.22      0.38      0.80      0.28      0.55      0.29       254
          9       0.16      0.16      0.85      0.16      0.37      0.13       277

avg / total       0.36      0.21      0.85      0.21      0.41      0.16      1918



### SVM SMOTE Oversampling

In [548]:
from imblearn.over_sampling import SVMSMOTE

In [549]:
sm = SVMSMOTE(random_state = 2) 
X_train_res, y_train_res = sm.fit_sample(np.array(X_train), y_train.ravel()) 

In [550]:
smote_clf = SVC(random_state=23)

smote_clf.fit(X_train_res, y_train_res)
preds = smote_clf.predict(X_test)



In [551]:
print(classification_report_imbalanced(y_test, preds))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.20      0.24      0.82      0.22      0.45      0.19       296
          2       0.06      0.08      0.94      0.07      0.27      0.07        75
          3       0.51      0.37      0.65      0.43      0.49      0.23       943
          4       0.10      0.22      0.93      0.14      0.45      0.19        73
          6       0.18      0.29      0.80      0.22      0.48      0.22       254
          9       0.17      0.13      0.90      0.15      0.34      0.11       277

avg / total       0.34      0.29      0.76      0.30      0.45      0.20      1918

