In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
from mpl_toolkits.mplot3d import Axes3D
plots.style.use('fivethirtyeight')
%matplotlib inline

## Classification examples

In [None]:
# Load ckd.csv, shortening the 'Blood Glucose Random' label to 'Glucose',
# then preview the first 3 rows.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd.show(3)

In [None]:
# Number of rows for each distinct value of 'Class'.
ckd.group('Class')

In [None]:
# 'White Blood Cell Count' (x) against 'Glucose' (y), points colored by 'Class'.
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

In [None]:
# Same plot with 'Hemoglobin' on the x-axis, to compare against the previous pair of attributes.
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')

In [None]:
# Load banknote.csv; the bare name on the last line displays the table.
banknotes = Table.read_table('banknote.csv')
banknotes

In [None]:
# 'WaveletVar' (x) against 'WaveletCurt' (y), points colored by 'Class'.
banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')

In [None]:
# A different attribute pair: 'WaveletSkew' (x) against 'Entropy' (y), colored by 'Class'.
banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')

In [None]:
# 3D scatter of three banknote attributes, each point colored by its 'Class'.
# The axes are created through the figure (add_subplot with the '3d'
# projection) so that they are attached to it: constructing Axes3D(fig)
# directly no longer adds the axes to the figure in recent matplotlib
# releases, which leaves the rendered plot blank. The Axes3D import at the
# top of the notebook is still what registers the '3d' projection on older
# matplotlib versions.
fig = plots.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(banknotes.column('WaveletSkew'),
           banknotes.column('WaveletVar'),
           banknotes.column('WaveletCurt'),
           c=banknotes.column('Class'),  # color values come from the class labels
           cmap='viridis',
           s=50);  # trailing semicolon suppresses the returned object's repr

## Defining a classifier

In [None]:
# Load breast-cancer.csv without its 'ID' column, then preview the first 5 rows.
patients = Table.read_table('breast-cancer.csv').drop('ID')
patients.show(5)

In [None]:
# 'Bland Chromatin' (x) against 'Single Epithelial Cell Size' (y), colored by 'Class'.
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', colors='Class')

In [None]:
def randomize_column(a, spread=0.09):
    """Return ``a`` plus independent zero-mean Gaussian noise on every entry.

    The input is not modified; a new array is produced by the addition.
    The noise is drawn from numpy's global random state, so results differ
    from call to call unless that state is seeded.

    Parameters
    ----------
    a : array
        Numeric values to perturb. Must support ``len`` and element-wise
        addition with a numpy array of the same length.
    spread : float, optional
        Standard deviation of the added noise. Defaults to 0.09, the value
        this function previously hard-coded; 0 leaves the values unchanged.

    Returns
    -------
    array
        ``a`` with one noise draw added to each entry.
    """
    return a + np.random.normal(0.0, spread, size=len(a))

# Build a temporary table from jittered copies of the two attributes plus the
# untouched 'Class' column, then scatter column 0 (x) against column 1 (y),
# colored by 'Class'. The list alternates column label, column values.
Table().with_columns([
        'Bland Chromatin (jittered)', 
        randomize_column(patients.column('Bland Chromatin')),
        'Single Epithelial Cell Size (jittered)', 
        randomize_column(patients.column('Single Epithelial Cell Size')),
        'Class',
        patients.column('Class')
    ]).scatter(0, 1, colors='Class')

## Distance

In [None]:
def distance(row1, row2):
    """Euclidean distance between two rows of equal length.

    Both arguments must support ``len`` and ``.item(i)`` for integer
    positions. Every position contributes to the result, so any column
    that should not count must be removed by the caller beforehand.

    Raises AssertionError if the two rows differ in length.
    """
    assert len(row1) == len(row2), 'Both rows must have the same length'
    # Squared gap at each position, summed left to right.
    squared_gaps = [(row1.item(pos) - row2.item(pos))**2
                    for pos in np.arange(len(row1))]
    return np.sqrt(sum(squared_gaps))

In [None]:
# Preview the first 3 rows, which the distance examples below compare.
patients.show(3)

In [None]:
# Distance between the first two rows. No column is dropped here, so the
# 'Class' value takes part in the distance as well.
distance(patients.row(0), patients.row(1))

In [None]:
# Sanity check: a row compared with itself has all-zero differences, so the distance is 0.
distance(patients.row(0), patients.row(0))

In [None]:
# Distance between the first and third rows, for comparison with the first-vs-second result.
distance(patients.row(0), patients.row(2))

## Classification procedure

In [None]:
def distances(training, example):
    """Attach to training a 'Distance' column measuring each row against example.

    The 'Class' column is left out of the distance computation, so example
    must hold attribute values only, in the same order as the remaining
    columns of training. The returned table keeps every column of training
    (including 'Class') and adds 'Distance'.
    """
    features = training.drop('Class')
    per_row = [distance(features.row(idx), example)
               for idx in np.arange(training.num_rows)]
    return training.with_column('Distance', per_row)

def closest(training, example, k):
    """Return the k rows of training nearest to example, nearest first.

    The result carries the 'Distance' column added by distances().
    """
    ranked = distances(training, example).sort('Distance')
    return ranked.take(np.arange(k))

def majority_class(top_k):
    """Return the 'Class' value that appears most often in top_k.

    When several classes share the highest count, the winner is whichever
    one the descending sort on 'count' happens to place first.
    """
    class_counts = top_k.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 of the grouped table holds the class values.
    return most_common_first.column(0).item(0)

def classify(training, example, k):
    """Predict the class of example by majority vote of its k nearest rows in training."""
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

In [None]:
# Show the row at index 12, the patient used as the example to classify below.
patients.take(12)

In [None]:
# The example's attributes only: row 12 with the 'Class' label removed.
example = patients.drop('Class').row(12)
example

In [None]:
# The 5 rows nearest to the example. Row 12 is excluded from the training
# table so the example cannot be matched with itself.
closest(patients.exclude(12), example, 5)

In [None]:
# Predicted class for the example: majority vote among its 5 nearest neighbors.
classify(patients.exclude(12), example, 5)

## Evaluation

In [None]:
# Total number of rows available to divide between training and test sets.
patients.num_rows

In [None]:
# Randomly permute the rows, then split them into a training half and a test
# half. The split point is derived from the table size instead of being
# hard-coded, so the cell keeps working (and keeps using every row) if the
# data file gains or loses rows. With an odd row count the training set gets
# the extra row, e.g. 342 training and 341 test rows for a 683-row table.
shuffled = patients.sample(with_replacement=False) # Randomly permute the rows
num_train = (shuffled.num_rows + 1) // 2
trainset = shuffled.take(range(num_train))
testset  = shuffled.take(range(num_train, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the fraction of rows in test that the k-nearest-neighbor classifier labels correctly.

    Each test row's attributes (everything except 'Class') are classified
    against training, and the prediction is compared with that row's
    actual 'Class' value.

    Parameters
    ----------
    training : table with a 'Class' column, used as the neighbors to vote.
    test : table with a 'Class' column and the same attribute columns.
    k : number of nearest neighbors that vote on each prediction.

    Returns
    -------
    float
        Number of correct predictions divided by the number of test rows.

    Raises
    ------
    ZeroDivisionError
        If test has no rows.
    """
    test_attributes = test.drop('Class')
    # Fetch the true labels once; the lookup does not change between iterations.
    true_labels = test.column('Class')
    numcorrect = 0
    for i in range(test.num_rows):
        # Run the classifier on the ith patient in the test set
        c = classify(training, test_attributes.row(i), k)
        # Was the classifier's prediction correct?
        if c == true_labels.item(i):
            numcorrect += 1
    return numcorrect / test.num_rows

In [None]:
# Fraction of test-set rows classified correctly when 5 neighbors vote.
evaluate_accuracy(trainset, testset, 5)