# Hall of Fame classifier - SVM version

Estimate probabilities of being elected to the hall of fame for current players

In [None]:
import re
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

###  Read the Master file to get names

In [None]:
# Load Master.csv from the baseballdatabank; it is read here to obtain player names.
master = pd.read_csv(
    "../../baseballdatabank-2017.1/core/Master.csv",
)
# Show the (rows, columns) shape followed by the column labels, one per line.
print(master.shape, master.columns, sep="\n")

###  Trim down to only variables we want

In [None]:
# Keep only the player identifier and the first/last name columns.
names = master.loc[:, ['playerID', 'nameFirst', 'nameLast']]
# Preview the first few rows.
names.head()


###  Get the batting data

In [None]:
# Load Batting.csv from the baseballdatabank.
batting = pd.read_csv(
    "../../baseballdatabank-2017.1/core/Batting.csv",
)
# Show the (rows, columns) shape followed by the column labels, one per line.
print(batting.shape, batting.columns, sep="\n")
# Preview the first few rows.
batting.head()

###  Summarize by playerID

In [None]:
# Collapse the batting rows into one row of totals per playerID.
# numeric_only=True is passed explicitly so that only numeric columns are
# summed. Older pandas dropped non-numeric columns from the result silently;
# newer releases no longer do, which would change the column layout and put
# non-numeric data into the feature matrix built further down.
batters = batting.groupby('playerID').sum(numeric_only=True)
print(batters.columns)
# Preview the first few aggregated rows.
batters.head()

###  Note that playerID is no longer a column, it's an index  

We can turn it back into a column by resetting the index

In [None]:
# Move the playerID index back into an ordinary column (drop=False keeps it).
batters = batters.reset_index(drop=False)
print(batters.columns)
# Preview the first few rows to confirm playerID is a column again.
batters.head()

###  Throw out columns whose sums are not meaningful (yearID, stint)

In [None]:
# Remove the summed yearID and stint columns; the remaining columns are kept.
batters2 = batters.drop(columns=['yearID', 'stint'])
print(batters2.columns)

###  Read the HallOfFame data

In [None]:
# Load HallOfFame.csv from the baseballdatabank.
hall = pd.read_csv(
    "../../baseballdatabank-2017.1/core/HallOfFame.csv",
)
# Show the (rows, columns) shape followed by the column labels, one per line.
print(hall.shape, hall.columns, sep="\n")
# Preview the first few rows.
hall.head()

###  We're only interested in those who were inducted

In [None]:
# Select rows whose 'inducted' value is 'Y', keeping only the player
# identifier and the induction flag, in a single .loc call.
in_hall = hall.loc[hall['inducted'] == 'Y', ['playerID', 'inducted']]
# Preview the first few inducted entries.
in_hall.head()

###  Left join HallOfFame data with batter data

Only batters inducted get 'Y', others get NaN  (missing value)

In [None]:
# Left join keeps every batter; 'inducted' is filled only where a matching
# playerID exists in in_hall and is NaN otherwise.
batters3 = pd.merge(batters2, in_hall, how='left', on='playerID')
# Show the column labels followed by the (rows, columns) shape.
print(batters3.columns, batters3.shape, sep="\n")
# Preview the first few merged rows.
batters3.head()

### See if we can pick out the number of rows from the shape

In [None]:
# shape is a (rows, columns) tuple, so element 0 is the number of rows.
batters3.shape[0]

###  Set codes 1=inducted 0=not inducted in a Pandas Series

In [None]:
# Unused in this cell; retained only in case a cell outside this view reads it.
count = 0
# Build the 0/1 induction code as a float array: 1.0 where the left join
# filled in 'inducted', 0.0 where it is missing. The column is addressed by
# name rather than by position, so the result does not depend on how many
# statistic columns precede it, and no row-by-row loop is needed.
hof1 = batters3['inducted'].notnull().values.astype(float)


### Add the series as a column to the batters data

In [None]:
# Attach the 0/1 codes as a new 'hof' column; the Series is aligned to the
# frame on its index.
batters3 = batters3.assign(hof=pd.Series(hof1))
print(batters3.columns)

### Summarize the batters data

In [None]:
# Display summary statistics for the merged batters data.
batters3.describe()

### Drop columns that contain any missing values (e.g. 'inducted', which is NaN for non-inductees)

In [None]:
# axis='columns' means whole columns (not rows) are removed whenever they
# contain at least one missing value.
batters4 = batters3.dropna(axis='columns', how='any')
print(batters4.shape)

###  Describe the non-missing data

In [None]:
# Display summary statistics for the data left after dropna.
batters4.describe()

###  Create a dataframe with a sample of the non-inducted batters

In [None]:
# Number of non-inducted batters to draw.
sampsize = 799
# Randomly sample that many rows from the batters coded hof == 0.
# No seed is supplied, so the sample differs from run to run.
nonhof = batters4[batters4['hof'] == 0].sample(n=sampsize)
# Show the object type, its shape and its column labels, one per line.
print(type(nonhof), nonhof.shape, nonhof.columns, sep="\n")

### Create a dataframe with all inducted batters

In [None]:
# Keep every batter coded hof == 1 (no sampling for this class).
hof = batters4[batters4['hof'] == 1]
# Show the shape followed by the column labels.
print(hof.shape, hof.columns, sep="\n")

### Concatenate these for the SVM classification analysis

In [None]:
# Stack the sampled non-inducted rows on top of the inducted rows.
hof3 = pd.concat([nonhof, hof], axis=0)
print(hof3.shape)

###  Drop columns we will not use

In [None]:
# Feature table: everything except the player identifier and the target code.
hofx = hof3.drop(columns=['playerID', 'hof'])
print(hofx.shape)
# Display summary statistics for the feature columns.
hofx.describe()

###  Select the y vector for the SVM classification analysis as a Pandas Series

In [None]:
# Select the target as a one-dimensional Series (single brackets). Double
# brackets would produce a one-column DataFrame, whose .values is an (n, 1)
# column vector rather than the 1-D label array the classifiers expect.
hofy = hof3['hof']
# Shape is now (n,) rather than (n, 1).
print(hofy.shape)

### Convert the x array from a dataframe to a numpy array

In [None]:
# .values exposes the feature table's data as a numpy array.
hofxnp = hofx.values

### Convert the y array from a Series to a numpy array

In [None]:
# .values exposes the target data as a numpy array.
hofynp = hofy.values

## Documentation for scikit-learn support vector machines

http://scikit-learn.org/stable/modules/svm.html

In [None]:
from sklearn import svm

In [None]:
from sklearn.model_selection import train_test_split

### Split the data into training and test subsets

In [None]:
# Hold out 33% of the rows for testing. No random_state is passed, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    hofxnp,
    hofynp,
    test_size=0.33,
)

### Create SVM classifier type SVC

In [None]:
# Create an SVC classifier configured with the 'rbf' kernel.
clf = svm.SVC(kernel='rbf')

In [None]:
# Display the class of the classifier object.
type(clf)

### Train the SVM classification algorithm with the training data

In [None]:
# Fit the SVC classifier on the training features and training labels.
clf.fit(X_train,y_train)

### Introspection for the output object from SVM classifier

In [None]:
# List the attribute and method names available on the fitted classifier.
dir(clf)

### Sum Hall of Fame counts for X_test, y_test, and y_train

In [None]:
# Print, one per line: the total of the SVC predictions on the test features,
# then the totals of the test labels and of the training labels.
print(sum(clf.predict(X_test)), sum(y_test), sum(y_train), sep="\n")

### Compute the percent correct score for the classification

In [None]:
# Print the SVC classifier's score on the held-out test features and labels.
print(clf.score(X_test,y_test))

### Create SVM classifier type NuSVC

In [None]:
# Create a NuSVC classifier configured with the 'rbf' kernel.
nuclf = svm.NuSVC(kernel='rbf')

In [None]:
# Display the class of the NuSVC classifier object.
type(nuclf)

### Train the SVM classification algorithm with the training data

In [None]:
# Fit the NuSVC classifier on the training features and training labels.
nuclf.fit(X_train,y_train)

### Sum Hall of Fame counts for X_test, y_test, and y_train

In [None]:
# Print, one per line: the total of the NuSVC predictions on the test
# features, then the totals of the test labels and of the training labels.
print(sum(nuclf.predict(X_test)), sum(y_test), sum(y_train), sep="\n")

### Compute the percent correct score for the classification

In [None]:
# Print the NuSVC classifier's score on the held-out test features and labels.
print(nuclf.score(X_test,y_test))

### Create SVM classifier type LinearSVC

In [None]:
# Create a LinearSVC classifier with its default settings.
linclf = svm.LinearSVC()

In [None]:
# Display the class of the LinearSVC classifier object.
type(linclf)

### Train the SVM classification algorithm with the training data

In [None]:
# Fit the LinearSVC classifier on the training features and training labels.
linclf.fit(X_train,y_train)

### Sum Hall of Fame counts for X_test, y_test, and y_train

In [None]:
# Print, one per line: the total of the LinearSVC predictions on the test
# features, then the totals of the test labels and of the training labels.
print(sum(linclf.predict(X_test)), sum(y_test), sum(y_train), sep="\n")

### Compute the percent correct score for the classification

In [None]:
# Print the LinearSVC classifier's score on the held-out test features and labels.
print(linclf.score(X_test,y_test))