In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
from mpl_toolkits.mplot3d import Axes3D
plots.style.use('fivethirtyeight')
%matplotlib inline

## Defining a classifier

In [None]:
# Load the breast-cancer table from the URL and discard its 'ID' column,
# leaving the remaining columns (including 'Class') for the cells below.
patients = Table.read_table('https://www.inferentialthinking.com/data/breast-cancer.csv').drop('ID')
# Preview 5 rows of the table.
patients.show(5)

In [None]:
# Plot 'Bland Chromatin' against 'Single Epithelial Cell Size', coloring each point by its 'Class'.
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', colors='Class')

In [None]:
def randomize_column(a):
    """Return `a` with independent Gaussian noise added to every entry.

    The noise has mean 0.0 and standard deviation 0.09 and is drawn from
    numpy's global random state, so results differ from call to call unless
    that state is seeded by the caller.
    """
    noise = np.random.normal(loc=0.0, scale=0.09, size=len(a))
    return a + noise

# Build a three-column table: the two plotted attributes with random noise
# added by randomize_column, followed by the unmodified 'Class' labels.
jittered = Table().with_columns(
    'Bland Chromatin (jittered)',
    randomize_column(patients.column('Bland Chromatin')),
    'Single Epithelial Cell Size (jittered)',
    randomize_column(patients.column('Single Epithelial Cell Size')),
    'Class',
    patients.column('Class'),
)

# Plot column 0 against column 1, colored by 'Class'.
jittered.scatter(0, 1, colors='Class')

## Distance

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points, represented as arrays.

    pt1, pt2: numeric arrays (or array-likes) with the same number of entries.
    Returns the square root of the sum of squared coordinate differences.
    """
    # Coerce to float arrays so tuples/rows and integer arrays behave alike.
    first = np.asarray(pt1, dtype=float)
    second = np.asarray(pt2, dtype=float)
    return np.sqrt(np.sum((first - second) ** 2))

def row_distance(row1, row2):
    """Return the Euclidean distance between two rows of a table.

    row1, row2: rows (or any sequences) holding only numeric attribute values,
    in the same column order.
    """
    return distance(np.array(row1), np.array(row2))

In [None]:
# Attribute columns only: everything in patients except the 'Class' label.
attributes = patients.drop('Class')
# Preview 3 rows.
attributes.show(3)

In [None]:
# Distance between the attribute rows at index 0 and index 1.
row_distance(attributes.row(0), attributes.row(1))

In [None]:
# Sanity check: compare row 0 with itself (a proper distance measure gives 0 here).
row_distance(attributes.row(0), attributes.row(0))

In [None]:
# Distance between the attribute rows at index 0 and index 2.
row_distance(attributes.row(0), attributes.row(2))

## Classification Procedure

In [None]:
def closest(training, example, k):
    """Return a table of the k closest neighbors to example.

    training: table whose columns are numeric attributes plus a 'Class' column.
    example: row or array of attribute values, in the same order as the
        attribute columns of training (i.e. training without 'Class').
    k: number of neighbors to keep; capped at the number of training rows.

    Returns the k rows of training with the smallest Euclidean distance to
    example, nearest first. All original columns are kept in their original
    positions and a 'Distance' column is appended at the end.
    """
    attributes = training.drop('Class')
    target = np.array(example, dtype=float)
    # One distance per training row, computed on the attribute columns only
    # so that the class label never influences which neighbors are chosen.
    distances = np.array([
        np.sqrt(np.sum((np.array(row, dtype=float) - target) ** 2))
        for row in attributes.rows
    ])
    ranked = training.with_column('Distance', distances).sort('Distance')
    # Never ask for more rows than exist; take() would raise otherwise.
    return ranked.take(np.arange(min(k, training.num_rows)))

In [None]:
# Show the row at index 12 of patients, with all of its columns (including 'Class').
patients.take(12)

In [None]:
# Use the attributes of row 12 (with 'Class' removed) as the example to classify.
example = patients.drop('Class').row(12)
example

In [None]:
# Row 12 is excluded from the training table so the example cannot be returned as its own neighbor.
closest(patients.exclude(12), example, 5)

In [None]:
def majority_class(top_k):
    """Return the class with the highest count.

    top_k: table with a 'Class' column (for example the result of closest).

    Returns the value that appears most often in the 'Class' column, as a
    plain Python value. When several classes tie for the highest count, the
    smallest class value is returned so the result is deterministic.

    Raises ValueError if top_k has no rows.
    """
    labels = top_k.column('Class')
    if len(labels) == 0:
        raise ValueError('majority_class needs at least one row')
    classes, counts = np.unique(labels, return_counts=True)
    # np.unique returns classes in sorted order and argmax picks the first
    # maximum, which is what makes ties resolve to the smallest class value.
    return classes.tolist()[int(np.argmax(counts))]

def classify(training, example, k):
    """Return the majority class among the k nearest neighbors.

    training: table of labeled examples, passed through to closest.
    example: attribute values of the point to classify, passed through to closest.
    k: number of neighbors that vote.
    """
    return majority_class(closest(training, example, k))

In [None]:
# Classify the example from its 5 nearest neighbors, with row 12 removed from the training table.
classify(patients.exclude(12), example, 5)

## Evaluation

In [None]:
# Number of rows in patients.
patients.num_rows

In [None]:
# Randomly permute the rows, then split them into two disjoint parts.
shuffled = patients.sample(with_replacement=False)
# Derive the split point from the table instead of hard-coding row counts:
# fixed indices raise on a smaller table and silently drop rows on a larger one.
# For a 683-row table this reproduces the 342 training / 341 test split.
num_train = (shuffled.num_rows + 1) // 2
trainset = shuffled.take(np.arange(num_train))
testset = shuffled.take(np.arange(num_train, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of rows in test that the classifier labels correctly.

    Each row of test (with its 'Class' column removed) is passed to classify
    together with training and k; the prediction is compared with that row's
    actual 'Class' value.
    """
    actual_classes = test.column('Class')
    candidates = test.drop('Class')
    # Count one hit for every test row whose prediction matches its label.
    hits = sum(
        1
        for index in range(test.num_rows)
        if classify(training, candidates.row(index), k) == actual_classes.item(index)
    )
    return hits / test.num_rows

In [None]:
# Accuracy on the held-out test set with k=5.
evaluate_accuracy(trainset, testset, 5)

In [None]:
# Accuracy on the held-out test set with k=1.
evaluate_accuracy(trainset, testset, 1)

In [None]:
# Training and test table are the same here, so every evaluated row is also one of the candidate neighbors.
evaluate_accuracy(trainset, trainset, 5)

In [None]:
# Same table for training and test with k=1: each evaluated row is itself a candidate neighbor.
evaluate_accuracy(trainset, trainset, 1)

## Decision boundaries

In [None]:
# Load the CKD table from the URL and shorten the 'Blood Glucose Random' label to 'Glucose'.
ckd = Table.read_table('https://www.inferentialthinking.com/data/ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
# Preview 3 rows.
ckd.show(3)

In [None]:
# Keep two attributes plus the label: column 0 is 'Hemoglobin', column 1 is 'Glucose', column 2 is 'Class'.
kidney = ckd.select('Hemoglobin', 'Glucose', 'Class')
# Plot column 0 against column 1, colored by column 2.
kidney.scatter(0, 1, colors=2)
# Overlay a red marker at the point (13, 250).
plots.scatter(13, 250, color='red', s=30)

In [None]:
def show_closest(t, point):
    """Show closest training example to a point.

    t: table whose first two columns are plotted and which has a 'Class' column.
    point: array holding the two coordinates of the query point.

    Draws the scatter plot of t colored by 'Class', marks point in red, and
    connects it with a black line to the single nearest row found by closest.
    """
    nearest = closest(t, point, 1).row(0)
    point_x, point_y = point.item(0), point.item(1)
    nearest_x, nearest_y = nearest.item(0), nearest.item(1)
    t.scatter(0, 1, colors='Class')
    plots.scatter(point_x, point_y, color='red', s=30)
    plots.plot([point_x, nearest_x], [point_y, nearest_y], color='k', lw=2)

# Nearest neighbor of the point (13, 250) in the original (unstandardized) units of kidney.
show_closest(kidney, make_array(13, 250))

In [None]:
def standard_units(any_numbers):
    """Convert any array of numbers to standard units.

    Each value has the array's mean subtracted and is then divided by the
    array's standard deviation (np.std, i.e. the population SD).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def standardize(t):
    """Return a table in which all columns of t are converted to standard units.

    The result has one column per column of t, in the same order, each
    relabeled with ' (su)' appended to the original label.
    """
    standardized = Table()
    for name in t.labels:
        su_label = name + ' (su)'
        su_values = standard_units(t.column(name))
        standardized = standardized.with_column(su_label, su_values)
    return standardized

In [None]:
# Convert the attribute columns to standard units, then re-attach the unchanged 'Class' column.
kidney_su = standardize(kidney.drop('Class')).with_column('Class', kidney.column('Class'))
# The query point (-0.2, 1.8) is expressed in standard units to match kidney_su.
show_closest(kidney_su, make_array(-0.2, 1.8))

In [None]:
# Same x-coordinate, query point lowered to y = 1.3 (standard units).
show_closest(kidney_su, make_array(-0.2, 1.3))

In [None]:
# Same x-coordinate, query point lowered to y = 1 (standard units).
show_closest(kidney_su, make_array(-0.2, 1))

In [None]:
# Same x-coordinate, query point lowered to y = 0.9 (standard units).
show_closest(kidney_su, make_array(-0.2, 0.9))

In [None]:
def decision_boundary(t, k):
    """Decision boundary of a two-column + Class table.

    t: table with two attribute columns and a 'Class' column.
    k: number of neighbors passed to classify.

    The attributes are converted to standard units, every point of a grid
    built from np.arange(-2, 2.1, 0.1) in both directions is classified, and
    the predictions are drawn as a translucent scatter plot. The standardized
    training points are then overlaid: class 0 in dark blue, class 1 in gold.
    """
    standardized = standardize(t.drop('Class')).with_column('Class', t.column('Class'))

    # Classify every grid point; the same coordinates are used on both axes.
    grid = np.arange(-2, 2.1, 0.1)
    decisions = Table(standardized.labels)
    for x in grid:
        for y in grid:
            decisions.append([x, y, classify(standardized, make_array(x, y), k)])

    decisions.scatter(0, 1, colors='Class', alpha=0.4)
    plots.xlim(-2, 2)
    plots.ylim(-2, 2)

    # Overlay the actual training points, one class at a time.
    for class_value, fill in ((0, 'darkblue'), (1, 'gold')):
        members = standardized.where('Class', class_value)
        plots.scatter(members.column(0), members.column(1), c=fill, edgecolor='k')
    
# Decision boundary for the kidney table using only the single nearest neighbor.
decision_boundary(kidney, 1)

In [None]:
# Decision boundary for the kidney table using the 5 nearest neighbors.
decision_boundary(kidney, 5)

In [None]:
# Decision boundary for the jittered breast-cancer attributes using only the single nearest neighbor.
decision_boundary(jittered, 1)

In [None]:
# Decision boundary for the jittered breast-cancer attributes using the 5 nearest neighbors.
decision_boundary(jittered, 5)