### Machine Learning - K nearest neighbors

In [1]:
# Load the baseball team table from the CSV file and preview its last rows.
import pandas as pd

df = pd.read_csv('baseball.csv')
df.tail()

Unnamed: 0,Team,League,Year,RunsScored,RunsAllowed,Wins,OnBasePercentage,SluggingPercentage,BattingAverage,Playoffs,RankSeason,RankPlayoffs,GamesPlayed,OpponentOnBasePercentage,OpponentSluggingPercentage
1227,PHI,NL,1962,705,759,81,0.33,0.39,0.26,0,,,161,,
1228,PIT,NL,1962,706,626,93,0.321,0.394,0.268,0,,,161,,
1229,SFG,NL,1962,878,690,103,0.341,0.441,0.278,1,1.0,2.0,165,,
1230,STL,NL,1962,774,664,84,0.335,0.394,0.271,0,,,163,,
1231,WSA,AL,1962,599,716,60,0.308,0.373,0.25,0,,,162,,


In [2]:
# Count the rows in each `Playoffs` class (0 / 1) to see how balanced the labels are.
df['Playoffs'].value_counts()

0    988
1    244
Name: Playoffs, dtype: int64

In [3]:
# Keep only the rows that have both opponent statistics; rows missing either
# one are dropped. `df` is rebound to the filtered frame, so every later cell
# works on this subset.
opponent_stat_columns = ['OpponentOnBasePercentage', 'OpponentSluggingPercentage']
df = df.dropna(subset=opponent_stat_columns)
df.tail()

Unnamed: 0,Team,League,Year,RunsScored,RunsAllowed,Wins,OnBasePercentage,SluggingPercentage,BattingAverage,Playoffs,RankSeason,RankPlayoffs,GamesPlayed,OpponentOnBasePercentage,OpponentSluggingPercentage
415,SFG,NL,1999,872,831,86,0.356,0.434,0.271,0,,,162,0.345,0.423
416,STL,NL,1999,809,838,75,0.338,0.426,0.262,0,,,161,0.355,0.427
417,TBD,AL,1999,772,913,69,0.343,0.411,0.274,0,,,162,0.371,0.448
418,TEX,AL,1999,945,859,95,0.361,0.479,0.293,1,5.0,4.0,162,0.346,0.459
419,TOR,AL,1999,883,862,84,0.352,0.457,0.28,0,,,162,0.353,0.456


In [4]:
# Build the model inputs: `X` (the data) holds the five rate statistics used
# as features, `y` (the labels) holds the `Playoffs` column to predict.
feature_columns = [
    'OnBasePercentage',
    'SluggingPercentage',
    'BattingAverage',
    'OpponentOnBasePercentage',
    'OpponentSluggingPercentage',
]
X = df[feature_columns]
y = df['Playoffs']

In [14]:
# Using `train_test_split`, split `X` and `y` into training and test sets
# (`X_train`, `X_test`, `y_train`, and `y_test`).
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the split -- and every score computed from it -- is
# the same on each Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fraction of the rows that ended up in the training set.
len(X_train)/len(X)

0.75

In [15]:
# Fit a k-nearest neighbors (knn) classifier on the training split
# (X_train, y_train) using KNeighborsClassifier.
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours that vote on each prediction.
N_NEIGHBORS = 3

knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [16]:
# Using knn classifier, predict the class labels for the test set X_test.
# The result is one predicted `Playoffs` label per row of X_test.
knn.predict(X_test)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0])

In [17]:
# Find the score (mean accuracy) of your knn classifier using X_test and y_test.
# This is one overall figure across all test rows; a later cell breaks the
# accuracy down by true class.
knn.score(X_test,y_test)

0.8285714285714286

In [18]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook

# Find the training and testing accuracies by target value (i.e. 1, 0)
no_train_X = X_train[y_train==0]
yes_train_X = X_train[y_train==1]
no_train_y = y_train[y_train==0]
yes_train_y = y_train[y_train==1]

no_test_X = X_test[y_test==0]
yes_test_X = X_test[y_test==1]
no_test_y = y_test[y_test==0]
yes_test_y = y_test[y_test==1]

scores = [knn.score(no_train_X, no_train_y), knn.score(yes_train_X, yes_train_y), 
          knn.score(no_test_X, no_test_y), knn.score(yes_test_X, yes_test_y)]


plt.figure()

# Plot the scores as a bar chart
bars = plt.bar(np.arange(4), scores, color=['#4c72b0','#4c72b0','#55a868','#55a868'])

# directly label the score onto the bars
for bar in bars:
    height = bar.get_height()
    plt.gca().text(bar.get_x() + bar.get_width()/2, height*.90, '{0:.{1}f}'.format(height, 2), 
                 ha='center', color='w', fontsize=11)

# remove all the ticks (both axes), and tick labels on the Y axis
plt.tick_params(top='off', bottom='off', left='off', right='off', labelleft='off', labelbottom='on')

# remove the frame of the chart
for spine in plt.gca().spines.values():
    spine.set_visible(False)

plt.xticks([0,1,2,3], ['Non-Playoff\nTraining', 'Playoff\nTraining', 'Non-Playoff\nTest', 'Playoff\nTest'], alpha=0.8);
plt.title('Training and Test Accuracies for Non-Playoff and Playoff Teams', alpha=0.8)

<IPython.core.display.Javascript object>



Text(0.5,1,'Training and Test Accuracies for Non-Playoff and Playoff Teams')