In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
from mpl_toolkits.mplot3d import Axes3D
plots.style.use('fivethirtyeight')
%matplotlib inline

In [None]:
# Load the CKD table from the course site and shorten the
# 'Blood Glucose Random' label to 'Glucose' for use in the plots that follow.
ckd = Table.read_table('https://www.inferentialthinking.com/data/ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
# Display only the first three rows.
ckd.show(3)

In [None]:
# Count how many rows fall under each distinct value of 'Class'.
ckd.group('Class')

In [None]:
# Glucose (y) against White Blood Cell Count (x), one colour per Class.
# NOTE(review): some datascience releases flag `colors=` as deprecated in
# favour of `group=` - confirm against the installed version.
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

In [None]:
# Glucose (y) against Hemoglobin (x), one colour per Class.
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')

In [None]:
# Load the banknote table from the course site.
banknotes = Table.read_table('https://www.inferentialthinking.com/data/banknote.csv')
# Bare expression: the notebook renders a preview of the table.
banknotes

In [None]:
# WaveletCurt (y) against WaveletVar (x), one colour per Class.
banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')

In [None]:
# Entropy (y) against WaveletSkew (x), one colour per Class.
banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')

In [None]:
# 3-D scatter of three banknote attributes, coloured by Class.
fig = plots.figure(figsize=(8, 8))
# Create the 3-D axes through the figure itself. On recent matplotlib
# releases a bare Axes3D(fig) is no longer attached to the figure
# automatically, which leaves the rendered plot empty. The Axes3D import
# at the top of the notebook still registers the '3d' projection on older
# releases.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    banknotes.column('WaveletSkew'),
    banknotes.column('WaveletVar'),
    banknotes.column('WaveletCurt'),
    c=banknotes.column('Class'),  # colour each point by its Class value
    cmap='viridis',
    s=50,
);  # trailing ';' suppresses the collection repr in the cell output

## Defining a classifier

In [None]:
# Load the breast-cancer table from the course site and discard the 'ID'
# column so that only the attribute columns and 'Class' remain.
patients = Table.read_table('https://www.inferentialthinking.com/data/breast-cancer.csv').drop('ID')
# Display only the first five rows.
patients.show(5)

In [None]:
# Single Epithelial Cell Size (y) against Bland Chromatin (x), one colour per Class.
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', colors='Class')

In [None]:
def randomize_column(a, sd=0.09):
    """Return ``a`` with independent Gaussian noise added to every entry.

    Parameters
    ----------
    a : array
        One-dimensional numeric array; it is not modified.
    sd : float, optional
        Standard deviation of the zero-mean normal noise. Defaults to 0.09,
        the value this notebook has always used; ``sd=0`` returns the
        values unchanged.

    Returns
    -------
    array
        A new array of the same length as ``a``.
    """
    # One noise draw per element, so len(a) values are generated.
    return a + np.random.normal(0.0, sd, size=len(a))

# Build a three-column table: both attributes with noise added (so that
# overlapping points separate on the plot) plus the untouched Class labels.
jittered = Table().with_columns(
    'Bland Chromatin (jittered)',
    randomize_column(patients.column('Bland Chromatin')),
    'Single Epithelial Cell Size (jittered)',
    randomize_column(patients.column('Single Epithelial Cell Size')),
    'Class',
    patients.column('Class'),
)

# Plot column 0 (x) against column 1 (y), one colour per Class.
jittered.scatter(0, 1, colors='Class')

## Distance

In [None]:
def distance(pt1, pt2):
    """The Euclidean distance between two points, represented as arrays.

    Parameters
    ----------
    pt1, pt2 : array-like
        Equal-length one-dimensional numeric sequences (NumPy arrays,
        lists or tuples), one coordinate per entry.

    Returns
    -------
    float
        The square root of the sum of squared coordinate differences.
    """
    # Convert to float first: this accepts plain sequences and prevents
    # unsigned-integer arrays from wrapping around on subtraction/squaring.
    diff = np.asarray(pt1, dtype=float) - np.asarray(pt2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

def row_distance(row1, row2):
    """Distance between two table rows, treating each row as a point.

    Both rows are converted to arrays and handed to ``distance``.
    """
    first_point = np.array(row1)
    second_point = np.array(row2)
    return distance(first_point, second_point)

In [None]:
# Everything except the 'Class' label: the columns used to measure distance.
attributes = patients.drop('Class')
# Display only the first three rows.
attributes.show(3)

In [None]:
# Distance between the first and second rows of attributes.
row_distance(attributes.row(0), attributes.row(1))

In [None]:
# Sanity check: a row compared with itself has every difference equal to 0,
# so the distance is 0.
row_distance(attributes.row(0), attributes.row(0))

In [None]:
# Distance between the first and third rows of attributes.
row_distance(attributes.row(0), attributes.row(2))

## Classification Procedure

In [None]:
def distances(training, example):
    """Return ``training`` with an extra 'Distance' column.

    The 'Class' column is ignored when measuring; each remaining row is
    compared against ``example`` with ``row_distance``.
    """
    features = training.drop('Class')
    dist_to_example = [row_distance(r, example) for r in features.rows]
    return training.with_column('Distance', dist_to_example)

def closest(training, example, k):
    """Return a table holding the k rows of ``training`` nearest to ``example``."""
    with_distance = distances(training, example)
    nearest_first = with_distance.sort('Distance')
    # Keep the first k rows of the ascending sort.
    return nearest_first.take(np.arange(k))

In [None]:
# Row 21 of patients, taken with every column (including 'Class').
# NOTE(review): `example` is not read by any later cell shown here; the
# following cells use `example_21` instead.
example = patients.take(21)

In [None]:
# The same patient as a single row of attributes only (no 'Class' value).
example_21 = attributes.row(21)
example_21

In [None]:
# Five nearest neighbours of patient 21. Row 21 is excluded from the
# training table so the patient cannot be returned as its own neighbour.
closest(patients.exclude(21), example_21, 5)

In [None]:
def majority_class(top_k):
    """Return the 'Class' value that occurs most often in ``top_k``."""
    class_counts = top_k.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 holds the class labels; the first entry is the most frequent.
    return most_common_first.column(0).item(0)

def classify(training, example, k):
    """Predict the class of ``example`` by majority vote of its k nearest rows in ``training``."""
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

In [None]:
# Predicted class for patient 21 using the 5 nearest of the other patients.
classify(patients.exclude(21), example_21, 5)

In [None]:
# The full record for patient 21, to compare its 'Class' with the prediction.
patients.take(21)

In [None]:
# Repeat the procedure for patient 12: take its attributes, then classify it
# against every other patient with k = 5.
example_12 = attributes.row(12)
classify(patients.exclude(12), example_12, 5)

In [None]:
# The full record for patient 12, to compare its 'Class' with the prediction.
patients.take(12)

## Review of the Steps ##

- `distance(pt1, pt2)`: Returns the distance between the arrays `pt1` and `pt2`
- `row_distance(row1, row2)`: Returns the distance between the rows `row1` and `row2`
- `distances(training, example)`: Returns a table that is `training` with an additional column `'Distance'` that contains the distance between `example` and each row of `training`
- `closest(training, example, k)`: Returns a table of the `k` rows of `training` with the smallest distances to `example`
- `majority_class(top_k)`: Returns the most common class in the `'Class'` column of `top_k`, the table returned by `closest`
- `classify(training, example, k)`: Returns the predicted class of `example` based on a `k` nearest neighbors classifier using the historical sample `training`

## Evaluation

In [None]:
# Total number of rows in patients; the train/test split is sized from it.
patients.num_rows

In [None]:
# Randomly permute the rows, then split them into a training set and a
# held-out test set. No random seed is set in this cell, so the split
# changes from run to run.
shuffled = patients.sample(with_replacement=False)
num_train = 342  # rows reserved for training; every remaining row is used for testing
trainset = shuffled.take(range(num_train))
# Derive the upper bound from the table instead of hard-coding the row count,
# so the test set is always "everything after the training rows".
testset  = shuffled.take(range(num_train, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of rows in ``test`` that are classified correctly.

    Each row of ``test`` (with its 'Class' column removed) is classified by
    ``classify`` against ``training`` using ``k`` neighbours, and the
    prediction is compared with the row's recorded 'Class'.

    Parameters
    ----------
    training : Table
        Labelled rows used as the neighbours; must contain a 'Class' column.
    test : Table
        Labelled rows to score; must contain a 'Class' column.
    k : int
        Number of nearest neighbours that vote on each prediction.

    Returns
    -------
    float
        Number of correct predictions divided by ``test.num_rows``.

    Raises
    ------
    ZeroDivisionError
        If ``test`` has no rows.
    """
    test_attributes = test.drop('Class')
    # Extract the true labels once rather than on every loop iteration.
    actual_classes = test.column('Class')
    numcorrect = 0
    for i in range(test.num_rows):
        # Run the classifier on the ith patient in the test set
        predicted = classify(training, test_attributes.row(i), k)
        # Was the classifier's prediction correct?
        if predicted == actual_classes.item(i):
            numcorrect = numcorrect + 1
    return numcorrect / test.num_rows

In [None]:
# Accuracy on the held-out test set using the 5 nearest neighbours.
evaluate_accuracy(trainset, testset, 5)

In [None]:
# Accuracy on the same test set using only the single nearest neighbour.
evaluate_accuracy(trainset, testset, 1)