# Hall of Fame classifier

Estimate probabilities of being elected to the hall of fame for current players

In [1]:
import re
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

###  Read the Master file to get names

In [2]:
master = pd.read_csv("../../baseballdatabank-2017.1/core/Master.csv")
print(master.shape)
print(master.columns)

(19105, 24)
Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID'],
      dtype='object')


###  Trim down to only variables we want

In [3]:
names = master[['playerID','nameFirst','nameLast']]
names.head()


Unnamed: 0,playerID,nameFirst,nameLast
0,aardsda01,David,Aardsma
1,aaronha01,Hank,Aaron
2,aaronto01,Tommie,Aaron
3,aasedo01,Don,Aase
4,abadan01,Andy,Abad


###  Get the Pitching data

In [4]:
pitching = pd.read_csv("../../baseballdatabank-2017.1/core/Pitching.csv")
print(pitching.shape)
print(pitching.columns)
pitching.head()

(44963, 30)
Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'W', 'L', 'G', 'GS',
       'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp',
       'ERA', 'IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP'],
      dtype='object')


Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,,,,0,,,42,,,
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,,,,0,,,292,,,
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,,,,0,,,9,,,
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,,,,0,,,257,,,
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,,,,0,,,21,,,


###  Summarize by playerID

In [5]:
pitchers = pitching.groupby('playerID').sum()
print(pitchers.columns)
pitchers.head()

Index(['yearID', 'stint', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 'IPouts',
       'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp', 'ERA', 'IBB', 'WP', 'HBP', 'BK',
       'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP'],
      dtype='object')


Unnamed: 0_level_0,yearID,stint,W,L,G,GS,CG,SHO,SV,IPouts,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aardsda01,18084,9,16,18,331,0,0,0,69,1011,...,22.0,12.0,16.0,1,1475.0,141.0,169,17.0,11.0,21.0
aasedo01,25786,13,66,60,448,91,22,5,82,3328,...,45.0,21.0,7.0,3,4730.0,234.0,503,,,
abadfe01,16107,9,6,26,315,6,0,0,1,822,...,9.0,9.0,11.0,1,1168.0,81.0,119,7.0,10.0,18.0
abbeybe01,11365,7,22,40,79,66,52,0,1,1704,...,,18.0,0.0,0,0.0,12.0,442,,,
abbeych01,1896,1,0,0,1,0,0,0,0,6,...,,1.0,0.0,0,0.0,1.0,3,,,


###  Note that playerID is no longer a column, it's an index  

We can turn it back into a column by resetting the index

In [6]:
pitchers = pitchers.reset_index()
print(pitchers.columns)
pitchers.head()

Index(['playerID', 'yearID', 'stint', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV',
       'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp', 'ERA', 'IBB', 'WP',
       'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP'],
      dtype='object')


Unnamed: 0,playerID,yearID,stint,W,L,G,GS,CG,SHO,SV,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
0,aardsda01,18084,9,16,18,331,0,0,0,69,...,22.0,12.0,16.0,1,1475.0,141.0,169,17.0,11.0,21.0
1,aasedo01,25786,13,66,60,448,91,22,5,82,...,45.0,21.0,7.0,3,4730.0,234.0,503,,,
2,abadfe01,16107,9,6,26,315,6,0,0,1,...,9.0,9.0,11.0,1,1168.0,81.0,119,7.0,10.0,18.0
3,abbeybe01,11365,7,22,40,79,66,52,0,1,...,,18.0,0.0,0,0.0,12.0,442,,,
4,abbeych01,1896,1,0,0,1,0,0,0,0,...,,1.0,0.0,0,0.0,1.0,3,,,


###  Throw out non-numeric and non-summable columns

In [7]:
pitchers2 = pitchers.drop(['yearID','stint','ERA'],axis=1)
print(pitchers2.columns)

Index(['playerID', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER',
       'HR', 'BB', 'SO', 'BAOpp', 'IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R',
       'SH', 'SF', 'GIDP'],
      dtype='object')


###  Read the HallOfFame data

In [8]:
hall = pd.read_csv("../../baseballdatabank-2017.1/core/HallOfFame.csv")
print(hall.shape)
print(hall.columns)
hall.head()

(4156, 9)
Index(['playerID', 'yearid', 'votedBy', 'ballots', 'needed', 'votes',
       'inducted', 'category', 'needed_note'],
      dtype='object')


Unnamed: 0,playerID,yearid,votedBy,ballots,needed,votes,inducted,category,needed_note
0,cobbty01,1936,BBWAA,226.0,170.0,222.0,Y,Player,
1,ruthba01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
2,wagneho01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
3,mathech01,1936,BBWAA,226.0,170.0,205.0,Y,Player,
4,johnswa01,1936,BBWAA,226.0,170.0,189.0,Y,Player,


###  We're only interested in those who were inducted

In [9]:
in_hall = hall.loc[hall['inducted']=='Y'][['playerID','inducted']]
in_hall.head()

Unnamed: 0,playerID,inducted
0,cobbty01,Y
1,ruthba01,Y
2,wagneho01,Y
3,mathech01,Y
4,johnswa01,Y


###  Left join HallOfFame data with pitcher data

Only pitchers inducted get 'Y', others get NaN  (missing value)

In [10]:
pitchers3 = pitchers2.merge(in_hall,how='left',on='playerID')
print(pitchers3.columns)
print(pitchers3.shape)
pitchers3.head()

Index(['playerID', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER',
       'HR', 'BB', 'SO', 'BAOpp', 'IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R',
       'SH', 'SF', 'GIDP', 'inducted'],
      dtype='object')
(9302, 26)


Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,IPouts,H,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,inducted
0,aardsda01,16,18,331,0,0,0,69,1011,296,...,12.0,16.0,1,1475.0,141.0,169,17.0,11.0,21.0,
1,aasedo01,66,60,448,91,22,5,82,3328,1085,...,21.0,7.0,3,4730.0,234.0,503,,,,
2,abadfe01,6,26,315,6,0,0,1,822,260,...,9.0,11.0,1,1168.0,81.0,119,7.0,10.0,18.0,
3,abbeybe01,22,40,79,66,52,0,1,1704,686,...,18.0,0.0,0,0.0,12.0,442,,,,
4,abbeych01,0,0,1,0,0,0,0,6,6,...,1.0,0.0,0,0.0,1.0,3,,,,


### See if we can pick out the number of rows from the shape

In [11]:
pitchers3.shape[0]

9302

###  Set codes 1=inducted 0=not inducted in a Pandas Series

In [12]:
count=0
hof1 = np.zeros(pitchers3.shape[0])
for row in pitchers3.iterrows():
    i = row[0]
    if (pd.notnull(row[1][25])):
        hof1[i]=1


### Add the series as a column to the pitchers data

In [13]:
pitchers3['hof'] = pd.Series(hof1)
print(pitchers3.columns)

Index(['playerID', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER',
       'HR', 'BB', 'SO', 'BAOpp', 'IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R',
       'SH', 'SF', 'GIDP', 'inducted', 'hof'],
      dtype='object')


### Summarize the pitchers data

In [14]:
pitchers3.describe()

Unnamed: 0,W,L,G,GS,CG,SHO,SV,IPouts,H,ER,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,hof
count,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,...,9259.0,9132.0,9302.0,9219.0,9259.0,9302.0,2764.0,2764.0,2764.0,9302.0
mean,22.794453,22.794345,114.470329,45.849925,15.231133,2.168351,7.273705,1227.139647,410.44055,174.514943,...,12.242791,11.019054,1.457859,1665.505369,30.722324,207.953988,9.348046,8.197902,23.22576,0.010858
std,43.93138,37.717317,164.183047,93.20372,45.442337,6.346569,29.358414,2116.168285,694.073851,277.869566,...,21.249325,19.072953,3.451431,2866.455077,66.329206,338.650194,15.734257,11.998714,38.406587,0.103639
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,7.0,0.0,0.0,0.0,0.0,50.0,19.0,11.0,...,0.5,0.0,0.0,61.0,2.0,13.0,0.0,1.0,2.0,0.0
50%,4.0,5.0,38.0,4.0,0.0,0.0,0.0,290.0,104.0,50.0,...,4.0,3.0,0.0,393.0,8.0,60.0,3.0,3.0,8.0,0.0
75%,24.0,28.0,160.0,41.0,6.0,1.0,3.0,1446.75,483.0,217.0,...,15.0,13.0,1.0,1985.0,30.0,252.0,11.0,11.0,28.0,0.0
max,511.0,316.0,1252.0,815.0,749.0,110.0,652.0,22064.0,7092.0,2147.0,...,277.0,203.0,90.0,30058.0,952.0,3497.0,155.0,100.0,362.0,1.0


### Drop observations with missing values in any column

In [15]:
pitchers4 = pitchers3.dropna(how='any',axis=1)
print(pitchers4.shape)

(9302, 17)


###  Describe the non-missing data

In [16]:
pitchers4.describe()

Unnamed: 0,W,L,G,GS,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,BK,R,hof
count,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0,9302.0
mean,22.794453,22.794345,114.470329,45.849925,15.231133,2.168351,7.273705,1227.139647,410.44055,174.514943,31.100946,144.552999,224.770802,1.457859,207.953988,0.010858
std,43.93138,37.717317,164.183047,93.20372,45.442337,6.346569,29.358414,2116.168285,694.073851,277.869566,55.0541,228.423698,406.523467,3.451431,338.650194,0.103639
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,7.0,0.0,0.0,0.0,0.0,50.0,19.0,11.0,1.0,9.0,7.0,0.0,13.0,0.0
50%,4.0,5.0,38.0,4.0,0.0,0.0,0.0,290.0,104.0,50.0,8.0,42.0,49.0,0.0,60.0,0.0
75%,24.0,28.0,160.0,41.0,6.0,1.0,3.0,1446.75,483.0,217.0,35.0,183.0,258.0,1.0,252.0,0.0
max,511.0,316.0,1252.0,815.0,749.0,110.0,652.0,22064.0,7092.0,2147.0,522.0,2795.0,5714.0,90.0,3497.0,1.0


###  Create a dataframe with a sample of the non-inducted pitchers

In [18]:
sampsize = 599
nonhof = pitchers4.loc[pitchers4['hof']==0].sample(sampsize)
print(type(nonhof))
print(nonhof.shape)
print(nonhof.columns)

<class 'pandas.core.frame.DataFrame'>
(599, 17)
Index(['playerID', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER',
       'HR', 'BB', 'SO', 'BK', 'R', 'hof'],
      dtype='object')


### Create a dataframe with all inducted pitchers

In [19]:
hof = pitchers4.loc[pitchers4.hof==1]
print(hof.shape)
print(hof.columns)

(101, 17)
Index(['playerID', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER',
       'HR', 'BB', 'SO', 'BK', 'R', 'hof'],
      dtype='object')


### Concatenate these for the KNN classification analysis

In [23]:
hof3 = pd.concat([nonhof,hof])
print(hof3.shape)

(700, 17)


###  Drop columns we will not use

In [24]:
hofx = hof3.drop(['playerID','hof'],axis=1)
print(hofx.shape)
hofx.describe()

(700, 15)


Unnamed: 0,W,L,G,GS,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,BK,R
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,44.585714,37.365714,156.128571,86.534286,37.365714,5.744286,7.854286,2191.747143,706.068571,278.057143,47.827143,228.212857,407.957143,2.307143,339.221429
std,83.543722,61.020487,213.052313,162.885665,94.030665,13.97695,30.40036,3791.046896,1190.29004,440.269237,85.962763,366.634739,758.480735,5.941894,551.256103
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,7.75,0.0,0.0,0.0,0.0,52.0,20.75,11.75,1.0,10.0,8.75,0.0,14.0
50%,5.0,7.0,47.0,7.0,1.0,0.0,0.0,394.0,141.5,68.5,11.0,56.5,68.0,0.0,77.5
75%,38.0,44.5,228.0,77.0,16.0,2.0,4.0,2074.5,705.0,316.25,51.25,261.25,404.25,2.0,369.25
max,511.0,316.0,1071.0,815.0,749.0,110.0,390.0,22064.0,7092.0,2147.0,505.0,2795.0,5714.0,90.0,3355.0


###  Select the y vector for the KNN classification analysis as a Pandas Series

In [26]:
hofy = hof3[['hof']]
print(hofy.shape)

(700, 1)


### Convert the x array from a dataframe to a numpy array

In [27]:
hofxnp = hofx.values

### Convert the y array for a series to a numpy array

In [28]:
hofynp = hofy.values

## Documentation for KNeighborsClassifier

http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier.predict_proba

In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

### Split the data into training and test subsets

In [31]:
X_train, X_test, y_train, y_test = train_test_split(hofxnp, hofynp, test_size=0.33) #, random_state=42)

### Train the KNN classification algorithm with the training data

In [38]:
knn = KNeighborsClassifier(n_neighbors=5,weights='uniform',algorithm='auto')
knn.fit(X_train, y_train) 
print(type(knn))

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>


  


### Introspection for the output object from KNN classifier

In [39]:
dir(knn)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_estimator_type',
 '_fit',
 '_fit_X',
 '_fit_method',
 '_get_param_names',
 '_init_params',
 '_pairwise',
 '_tree',
 '_y',
 'algorithm',
 'classes_',
 'effective_metric_',
 'effective_metric_params_',
 'fit',
 'get_params',
 'kneighbors',
 'kneighbors_graph',
 'leaf_size',
 'metric',
 'metric_params',
 'n_jobs',
 'n_neighbors',
 'outputs_2d_',
 'p',
 'predict',
 'predict_proba',
 'radius',
 'score',
 'set_params',
 'weights']

### Sum Hall of Fame counts for X_test, y_test, and y_train

In [40]:
print(sum(knn.predict(X_test)))
print(sum(y_test))
print(sum(y_train))

24.0
[ 39.]
[ 62.]


### Compute the percent correct score for the classification

In [41]:
print(knn.score(X_test,y_test))

0.909090909091


### Show the predicted values for the X_test data

In [42]:
knn.predict(X_test)


array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1

### Show the classification probabilities for X_test

Can you explain why only certain values appear?

In [43]:
knn.predict_proba(X_test)


array([[ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 0.8,  0.2],
       [ 0.6,  0.4],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 0. ,  1. ],
       [ 0.8,  0.2],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 0. ,  1. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 0. ,  1. ],
       [ 0.8,  0.2],
       [ 0. ,  1. ],
       [ 1. ,  0. ],
       [ 0.8,  0.2],
       [ 0.8,  0.2],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,