# Hall of Fame classifier

Estimate the probability of being elected to the Hall of Fame for current players. This notebook builds a k-nearest-neighbors (KNN) classifier from career pitching totals.

In [None]:
import re
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

###  Read the Master file to get names

In [None]:
# Load the player master table (one row per person) and report its
# dimensions and column names.
master = pd.read_csv("../../baseballdatabank-2017.1/core/Master.csv")
print(master.shape, master.columns, sep="\n")

###  Trim down to only variables we want

In [None]:
# Keep only the identifier and the two name columns for later lookups.
name_columns = ['playerID', 'nameFirst', 'nameLast']
names = master[name_columns]
names.head()


###  Get the Pitching data

In [None]:
# Load the pitching table, then show its dimensions, column names and
# first few rows.
pitching = pd.read_csv("../../baseballdatabank-2017.1/core/Pitching.csv")
print(pitching.shape, pitching.columns, sep="\n")
pitching.head()

###  Summarize by playerID

In [None]:
# Career totals: add up every numeric statistic within each playerID.
# numeric_only=True keeps string columns out of the aggregation. On
# pandas >= 2.0 a plain .sum() concatenates strings instead of dropping
# them, which would leave non-numeric columns in the feature matrix and
# shift the column positions that a later cell relies on.
# The result is indexed by playerID (playerID is no longer a column).
pitchers = pitching.groupby('playerID').sum(numeric_only=True)
print(pitchers.columns)
pitchers.head()

###  Note that playerID is no longer a column, it's an index  

We can turn it back into a column by resetting the index

In [None]:
# Move playerID from the index back into an ordinary column.
# Note: this rebinds `pitchers`, so re-running this cell on its own calls
# reset_index() a second time and adds an extra `index` column. Re-run the
# groupby cell first if you need to repeat it.
pitchers = pitchers.reset_index()
print(pitchers.columns)
pitchers.head()

###  Throw out non-numeric and non-summable columns

In [None]:
# Summed yearID, stint and ERA values are not meaningful career totals,
# so remove them from the feature set.
# NOTE(review): a later cell looks up the merged frame by position (25),
# so changing the set of columns dropped here would shift that position.
pitchers2 = pitchers.drop(columns=['yearID', 'stint', 'ERA'])
print(pitchers2.columns)

###  Read the HallOfFame data

In [None]:
# Load the Hall of Fame voting table, then show its dimensions, column
# names and first few rows.
hall = pd.read_csv("../../baseballdatabank-2017.1/core/HallOfFame.csv")
print(hall.shape, hall.columns, sep="\n")
hall.head()

###  We're only interested in those who were inducted

In [None]:
# Rows flagged inducted == 'Y', reduced to the join key and the flag.
was_inducted = hall['inducted'] == 'Y'
in_hall = hall.loc[was_inducted, ['playerID', 'inducted']]
in_hall.head()

###  Left join HallOfFame data with pitcher data

Only pitchers inducted get 'Y', others get NaN  (missing value)

In [None]:
# Left join keeps every pitcher; `inducted` is 'Y' where a match exists
# in in_hall and NaN everywhere else.
pitchers3 = pd.merge(pitchers2, in_hall, how='left', on='playerID')
print(pitchers3.columns, pitchers3.shape, sep="\n")
pitchers3.head()

### See if we can pick out the number of rows from the shape

In [None]:
# shape is (rows, columns); element 0 is the number of rows.
pitchers3.shape[0]

###  Set codes 1=inducted, 0=not inducted in a NumPy array

In [None]:
# Encode the label as a float array: 1.0 where the left join found an
# induction ('Y' in `inducted`), 0.0 where it left NaN.
# The column is looked up by name rather than by position, so the result
# does not depend on how many statistic columns precede it, and the
# vectorised null test replaces the row-by-row Python loop.
hof1 = pitchers3['inducted'].notnull().astype(float).values


### Add the series as a column to the pitchers data

In [None]:
# Attach the 0/1 label as a new `hof` column. This modifies pitchers3 in
# place; the Series is aligned to pitchers3 on its integer index.
pitchers3['hof'] = pd.Series(hof1)
print(pitchers3.columns)

### Summarize the pitchers data

In [None]:
# Summary statistics for the numeric columns, including the new `hof` label.
pitchers3.describe()

### Drop columns that contain any missing values (`axis=1` drops columns, not rows)

In [None]:
# Remove every COLUMN that has at least one missing value; all rows are kept.
# After the left join, `inducted` is NaN for non-inducted pitchers, so that
# column is removed here, and the numeric `hof` column carries the label.
pitchers4 = pitchers3.dropna(axis='columns', how='any')
print(pitchers4.shape)

###  Describe the non-missing data

In [None]:
# Summary statistics for the columns that remain after dropna.
pitchers4.describe()

###  Create a dataframe with a sample of the non-inducted pitchers

In [None]:
# Draw a fixed-size random sample of the non-inducted pitchers (hof == 0)
# as the negative class. random_state pins the draw so that Restart & Run
# All reproduces the same sample, and therefore the same downstream results.
# sample() raises ValueError if fewer than `sampsize` rows are available.
sampsize = 599
nonhof = pitchers4.loc[pitchers4['hof']==0].sample(sampsize, random_state=42)
print(type(nonhof))
print(nonhof.shape)
print(nonhof.columns)

### Create a dataframe with all inducted pitchers

In [None]:
# Positive class: every pitcher whose label is 1.
is_inducted = pitchers4['hof'] == 1
hof = pitchers4.loc[is_inducted]
print(hof.shape, hof.columns, sep="\n")

### Concatenate these for the KNN classification analysis

In [None]:
# Stack the sampled negatives on top of the positives, row-wise.
# The original row labels of both frames are kept.
hof3 = pd.concat(objs=[nonhof, hof], axis=0)
print(hof3.shape)

###  Drop columns we will not use

In [None]:
# Feature matrix: everything except the identifier and the label itself.
hofx = hof3.drop(columns=['playerID', 'hof'])
print(hofx.shape)
hofx.describe()

###  Select the y vector for the KNN classification analysis as a Pandas Series

In [None]:
# Select the label as a 1-D Series (single brackets). Double brackets
# return a one-column DataFrame whose .values has shape (n, 1);
# scikit-learn classifiers expect a 1-D y and warn about a column vector,
# and sum() over a 2-D y prints an array instead of a count.
hofy = hof3['hof']
print(hofy.shape)

### Convert the x array from a dataframe to a numpy array

In [None]:
# NumPy view of the feature columns, in the same row order as hofx.
hofxnp = hofx.values

### Convert the y array from a series to a numpy array

In [None]:
# NumPy array of the labels, in the same row order as hofxnp.
hofynp = hofy.values

## Documentation for KNeighborsClassifier

http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier.predict_proba

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

### Split the data into training and test subsets

In [None]:
# Hold out 33% of the rows for testing. random_state fixes the shuffle so
# the split, and every score computed from it, is the same on each re-run.
X_train, X_test, y_train, y_test = train_test_split(
    hofxnp, hofynp, test_size=0.33, random_state=42
)

### Train the KNN classification algorithm with the training data

In [None]:
# Build a 5-neighbour, uniformly weighted KNN classifier and fit it to the
# training split. fit() returns the estimator itself, so the call is chained.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
).fit(X_train, y_train)
print(type(knn))

### Introspection for the output object from KNN classifier

In [None]:
# List the attributes and methods available on the fitted classifier.
dir(knn)

### Sum Hall of Fame counts for the predictions on X_test, and for y_test and y_train

In [None]:
# Labels are 0/1, so each sum is a count of Hall of Famers:
# predicted on the test set, actual in the test set, actual in the training set.
print(
    sum(knn.predict(X_test)),
    sum(y_test),
    sum(y_train),
    sep="\n",
)

### Compute the accuracy (fraction correct) for the classification

In [None]:
# Mean accuracy on the held-out test split (a fraction between 0 and 1).
print(knn.score(X_test,y_test))

### Show the predicted values for the X_test data

In [None]:
# Predicted class label for each row of the test split.
knn.predict(X_test)


### Show the classification probabilities for X_test

Can you explain why only certain values appear?

In [None]:
# Estimated class probabilities for each test row, one column per class
# (column order follows knn.classes_).
knn.predict_proba(X_test)
