In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Load the patient records from CSV and discard the 'ID' column, leaving the
# attribute columns plus the 'Class' label.
patients = Table.read_table('breast-cancer.csv').drop('ID')
# Preview 5 rows of the table.
patients.show(5)

In [None]:
def randomize_column(a):
    """Return ``a`` with independent Gaussian noise added to every entry.

    One noise value (mean 0, standard deviation 0.09) is drawn per element of
    ``a`` from NumPy's global random state.
    """
    noise = np.random.normal(loc=0.0, scale=0.09, size=len(a))
    return a + noise

# Build a three-column table for plotting: noisy copies of two attributes
# (see randomize_column) alongside the untouched 'Class' label.
jittered = Table().with_columns(
    'Bland Chromatin (jittered)',
    randomize_column(patients.column('Bland Chromatin')),
    'Single Epithelial Cell Size (jittered)',
    randomize_column(patients.column('Single Epithelial Cell Size')),
    'Class',
    patients.column('Class'),
)

# Scatter the first column against the second, colored by 'Class'.
jittered.scatter(0, 1, colors='Class')

## Distance ##

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    pt1, pt2 -- the coordinates of each point, as arrays or any numeric
                sequences (lists, tuples) of the same length

    The result is the square root of the summed squared coordinate
    differences; for one-dimensional points it is a single float.
    """
    # Convert to float arrays up front: this lets plain lists/tuples through
    # and stops integer inputs (unsigned or narrow dtypes in particular) from
    # wrapping around when subtracted or squared.
    diff = np.asarray(pt1, dtype=float) - np.asarray(pt2, dtype=float)
    # axis=0 reduces along the same axis the built-in sum() iterated over.
    return np.sqrt(np.sum(diff ** 2, axis=0))

def row_distance(row1, row2):
    """Return the distance between two numerical table rows.

    Each row is converted to a NumPy array and the pair is handed to
    distance().
    """
    first = np.array(row1)
    second = np.array(row2)
    return distance(first, second)

In [None]:
# Keep only the attribute columns: everything in patients except 'Class'.
attributes = patients.drop('Class')
# Preview 3 rows of the attribute table.
attributes.show(3)

In [None]:
# Distance between the attribute rows at index 0 and index 1.
row_distance(attributes.row(0), attributes.row(1))

In [None]:
# Distance between the attribute rows at index 0 and index 2.
row_distance(attributes.row(0), attributes.row(2))

In [None]:
# Sanity check: a row compared with itself, so every coordinate difference
# is 0 and the distance should be 0.
row_distance(attributes.row(0), attributes.row(0))

## Classification Procedure ##

In [None]:
# Compute distance between example and every row in training. 
# Return training augmented with Distance column

def distances(training, example):
    """Compute distance between example and every row in training.

    training -- table with a 'Class' column; every other column is treated
                as a numerical attribute
    example  -- numerical row with one value per attribute column, in the
                same column order as training without 'Class'

    Return training augmented with a 'Distance' column.
    """
    attributes = training.drop('Class')
    # Gather the distances in a list and convert once at the end: np.append
    # inside the loop would copy the whole array again for every row.
    row_distances = [row_distance(row, example) for row in attributes.rows]
    return training.with_column('Distance', np.array(row_distances, dtype=float))

In [None]:
# Full record at index 15, including its 'Class' label.
patients.row(15)

In [None]:
# The same record without its 'Class' label; this is the example to classify.
example = attributes.row(15)
example

In [None]:
# Distances from the example to every other patient (row 15 itself is left
# out of the table), sorted by the 'Distance' column.
distances(patients.exclude(15), example).sort('Distance')

In [None]:
def closest(training, example, k):
    """Return the first k rows of training when sorted by distance to example.

    The returned table has every column of training plus the 'Distance'
    column added by distances().
    """
    by_distance = distances(training, example).sort('Distance')
    first_k = np.arange(k)
    return by_distance.take(first_k)

In [None]:
# Tally the classes of the 5 nearest neighbors, largest count first.
closest(patients.exclude(15), example, 5).group('Class').sort('count', descending=True)

In [None]:
def majority_class(topk):
    """Return the 'Class' value that occurs most often in the table topk.

    Rows are grouped by 'Class' and ordered by descending count; the class in
    the first row of that ordering is returned.
    """
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    return most_common_first.column(0).item(0)

def classify(training, example, k):
    """Predict the class of example by majority vote of its k nearest rows in training."""
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

In [None]:
# Classify the example from its 5 nearest neighbors, with row 15 (the example
# itself) removed from the training data.
classify(patients.exclude(15), example, 5)

In [None]:
# Same example, this time voting among the 7 nearest neighbors.
classify(patients.exclude(15), example, 7)

In [None]:
# A second example: the attributes (no 'Class') of the row at index 10.
my_example = attributes.row(10)
my_example

In [None]:
# Classify the second example from its 5 nearest neighbors, with row 10
# removed from the training data.
classify(patients.exclude(10), my_example, 5)

In [None]:
# The full row at index 10, including its 'Class', for comparison with the
# predicted class.
patients.take(10)

## Evaluation ##

In [None]:
# Total number of patient rows available for training and testing.
patients.num_rows

In [None]:
# Draw the rows without replacement (a random reordering of the table), then
# split them: the first half becomes the training set and the rest the test set.
shuffled = patients.sample(with_replacement=False)
# Derive the split point from the table instead of hard-coding row counts.
# For a 683-row table this is 341, i.e. 341 training rows and 342 test rows.
num_training = shuffled.num_rows // 2
training_set = shuffled.take(np.arange(num_training))
test_set = shuffled.take(np.arange(num_training, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the fraction of rows in test whose 'Class' is predicted correctly.

    Each test row is classified from its non-'Class' columns by classify(),
    using the k nearest neighbors found in training, and the prediction is
    compared with that row's actual 'Class' value.
    """
    features = test.drop('Class')
    labels = test.column('Class')
    # One True/False per test row: did the prediction match the real label?
    matches = (
        classify(training, features.row(position), k) == labels.item(position)
        for position in np.arange(features.num_rows)
    )
    return sum(matches) / test.num_rows

In [None]:
# Accuracy on the held-out test set using 5 nearest neighbors.
evaluate_accuracy(training_set, test_set, 5)

In [None]:
# Accuracy on the held-out test set using 7 nearest neighbors.
evaluate_accuracy(training_set, test_set, 7)

In [None]:
# Accuracy on the held-out test set using 11 nearest neighbors.
evaluate_accuracy(training_set, test_set, 11)

In [None]:
# Evaluate on the training set itself with k=1. Every row being classified is
# also present in the training data at distance 0, so this score is an
# optimistic figure rather than an estimate of accuracy on unseen rows.
evaluate_accuracy(training_set, training_set, 1)