# Lecture 24: Nearest Neighbor

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
from mpl_toolkits.mplot3d import Axes3D
plots.style.use('fivethirtyeight')
%matplotlib inline
np.set_printoptions(legacy='1.13')

#NUM_REPETITIONS = 5

## Recap previous lecture

In [None]:
# Read the CKD data and shorten the 'Blood Glucose Random' label to 'Glucose'.
raw_ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
# Preview the first 3 rows.
raw_ckd.show(3)

In [None]:
def ckd_label(number):
    """Translate a numeric class code into its text label:
    1 becomes "ckd", 0 becomes "notckd", and anything else
    becomes "unknown"."""
    if number == 1:
        return "ckd"
    if number == 0:
        return "notckd"
    return "unknown"
  
# Replace the numeric 'Class' codes with their text labels.
ckd = raw_ckd.with_column('Class', raw_ckd.apply(ckd_label, 'Class'))
ckd.show(3)

In [None]:
# Keep only the two attributes used for classification, plus the label.
hgb_glc = ckd.select('Hemoglobin', 'Glucose', 'Class')
# Plot Glucose against Hemoglobin, colored by 'Class'.
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')

In [None]:
def distance(a, b):
    """Compute the straight-line (Euclidean) distance between
    two points a and b, each given as an array of coordinates."""
    difference = a - b
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

In [None]:
new_point = make_array(13, 150)

def distance_from_new(row):
    """Measure how far row is from new_point.

    row is supplied by the caller and may differ on every call;
    new_point is not an argument, it is read from the notebook's
    global namespace."""
    row_as_array = np.array(row)
    return distance(row_as_array, new_point)

In [None]:
# Take the first row of hgb_glc; it still includes the 'Class' entry.
r0 = hgb_glc.row(0)
r0

In [None]:
# Converting the whole row to an array also pulls in the 'Class' entry.
np.array(r0)
# Left commented out on purpose: this call raises an error, presumably
# because r0 still carries the non-numeric 'Class' label.
#distance_from_new(r0) #ERROR

In [None]:
# Drop 'Class' first so the row holds only Hemoglobin and Glucose.
distance_from_new(hgb_glc.drop('Class').row(0))

In [None]:
# Attach each row's distance to new_point as a 'Distance' column, then
# sort on that column.
distances = hgb_glc.with_column(
    'Distance', hgb_glc.drop('Class').apply(distance_from_new)
).sort('Distance')

distances

**Q:** In your own words, describe what `apply` does.

In [None]:
# Overlay two points in red on the class scatter plot: (13, 150) and (5.6, 157).
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');

In [None]:
# Same plot, but with both axes spanning 0 to 500 so they share one scale.
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');
plots.xlim(0,500);
plots.ylim(0,500);

In [None]:
# Same plot again, with x limited to 0-50 and y limited to 125-175.
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');
plots.xlim(0,50);
plots.ylim(125,175);

**Accounting for scale.**

In [None]:
def standard_units(array):
    """Express every entry of array in standard units: subtract
    the array's mean, then divide by its standard deviation."""
    center = array.mean()
    spread = array.std()
    return (array - center) / spread
    
def standardize(t):
    """Build a new table whose columns are the columns of t
    converted to standard units, each relabeled with ' (su)'
    appended to its original label."""
    converted = Table()
    for name in t.labels:
        values_su = standard_units(t.column(name))
        converted = converted.with_column(name + ' (su)', values_su)
    return converted

def in_su(value, array):
    """Convert value to standard units, measured against the
    mean and standard deviation of array."""
    offset = value - array.mean()
    return offset / array.std()

In [None]:
# Standardize only the attribute columns, then re-attach the original
# 'Class' labels.
hgb_glc_su = standardize(hgb_glc.drop('Class'))\
  .with_column('Class', hgb_glc.column('Class'))
hgb_glc_su

In [None]:
# Plot the standardized data, then convert the point (13, 150) to standard
# units using the original columns and overlay it in red.
hgb_glc_su.scatter('Hemoglobin (su)', 'Glucose (su)', colors='Class')
x_su = in_su(13, hgb_glc.column('Hemoglobin'))
y_su = in_su(150, hgb_glc.column('Glucose'))
plots.scatter(x_su, y_su, color='red');

In [None]:
# From here on, new_point holds the point in standard units.
new_point = make_array(x_su, y_su)

def distance_from_new(row):
    """Measure how far row is from the global new_point."""
    coordinates = np.array(row)
    return distance(coordinates, new_point)
       
# Attach each standardized row's distance to new_point as a 'Distance'
# column, then sort on that column.
distances = hgb_glc_su.with_column(
    'Distance', hgb_glc_su.drop('Class').apply(distance_from_new)
).sort('Distance')

distances

In [None]:
# Highlight in red the new point and the first row of the sorted
# distances table.
hgb_glc_su.scatter('Hemoglobin (su)', 'Glucose (su)', colors='Class')
close_x = distances.column('Hemoglobin (su)').item(0)
close_y = distances.column('Glucose (su)').item(0)
plots.scatter([x_su, close_x], [y_su, close_y], color='red');

**Nearest Neighbor Classifier.**

In [None]:
def hgb_glc_nn_classifier_su(new_point_su):
    """Classify new_point_su, a point already expressed in standard
    units, by returning the 'Class' of the row of hgb_glc_su that
    sorts first by distance to it."""
    attributes_su = hgb_glc_su.drop('Class')

    def distance_to_point(row):
        return distance(np.array(row), new_point_su)

    with_distance = hgb_glc_su.with_column(
        'Distance', attributes_su.apply(distance_to_point)
    )
    nearest_first = with_distance.sort('Distance')
    return nearest_first.column('Class').item(0)

def hgb_glc_nn_classifier_ou(new_point):
    """Classify new_point, given in original units as
    (hemoglobin, glucose): convert each coordinate to standard
    units against the matching hgb_glc column, then delegate to
    hgb_glc_nn_classifier_su."""
    hgb_value, glc_value = new_point.item(0), new_point.item(1)
    point_su = make_array(
        in_su(hgb_value, hgb_glc.column('Hemoglobin')),
        in_su(glc_value, hgb_glc.column('Glucose'))
    )
    return hgb_glc_nn_classifier_su(point_su)

In [None]:
# Classify the point (13, 150), given in original units.
hgb_glc_nn_classifier_ou(make_array(13, 150))

In [None]:
# Overlay three candidate points in red, all at Hemoglobin 13:
# Glucose 150, 200 and 250.
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13,13,13], [150,200,250], color='red');

**Q:** What should the classification of (13, 200) be?

A.  ckd  
B.  notckd  

In [None]:
# Classify the point (13, 200), given in original units.
hgb_glc_nn_classifier_ou(make_array(13, 200))

In [None]:
# Show where (13, 200) lands once each coordinate is in standard units.
x_su = in_su(13, hgb_glc.column('Hemoglobin'))
y_su = in_su(200, hgb_glc.column('Glucose'))
make_array(x_su,y_su)

In [None]:
# Classify every point on a grid of (hemoglobin, glucose) values and
# collect the predictions in a table with the same labels as hgb_glc.
decisions = Table(hgb_glc.labels)
for hgb in np.arange(10, 19, .125):
    for glc in np.arange(50, 250, 5):
        predicted = hgb_glc_nn_classifier_ou(make_array(hgb, glc))
        decisions.append([hgb, glc, predicted])
# Draw the predicted grid semi-transparently, colored by predicted class.
decisions.scatter(0, 1, colors='Class', alpha=0.4)
plots.xlim(10,19)
plots.ylim(40,250)
# Overlay the known points on top, one color per class, with black edges.
notckd_pts = hgb_glc.where('Class', 'notckd')
ckd_pts = hgb_glc.where('Class', 'ckd')
plots.scatter(notckd_pts.column(0), notckd_pts.column(1), c='gold', edgecolor='k');
plots.scatter(ckd_pts.column(0), ckd_pts.column(1), c='darkblue', edgecolor='k');

## Training and Testing

In [None]:
# Shuffle the rows by sampling without replacement, then split them:
# the first half is the training set, the remaining rows the test set.
# No random seed is set, so the split differs on every run.
shuffled_hgb_glc = hgb_glc.sample(with_replacement=False)
half = int(hgb_glc.num_rows / 2)
train_hgb_glc = shuffled_hgb_glc.take(np.arange(0, half))
test_hgb_glc  = shuffled_hgb_glc.take(np.arange(half, hgb_glc.num_rows))

In [None]:
# Plot only the training half of the data.
train_hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')

In [None]:
def train_nn_su_classifier(train):
    """Create a nearest-neighbor classifier.  The last column 
    of the training table should be the labels, and the preceding
    columns should be the attributes.  The names of the columns do not
    matter.  The classifier will work in standard units; but,
    the training table should be in original units.
    
    Everything that depends only on the training table (the
    standardized attributes, the labels, and each attribute's mean
    and SD) is computed once here, not on every classification.
    
    Returns a function.  That function takes as input a 
    new point in original units, with one item per attribute in
    column order. The function returns the classification of the 
    new point as output."""
    
    last_column = train.num_columns - 1
    
    # Attributes in standard units, and the labels that go with each row.
    attributes = train.drop(last_column)
    attributes_su = standardize(attributes)
    labels = train.column(last_column)
    
    # Per-attribute statistics used to convert a new point to standard
    # units; the same quantities in_su would compute from each column.
    attribute_means = [train.column(i).mean() for i in range(last_column)]
    attribute_sds = [train.column(i).std() for i in range(last_column)]
    
    def classify_new_point(new_point):
        """Return the classification of new_point, which
        should still be in original units."""
        
        # (value - mean) / SD for each coordinate, using training statistics.
        new_point_su = np.array(
            [(new_point.item(i) - attribute_means[i]) / attribute_sds[i]
             for i in range(len(new_point))],
            dtype=float
        )
        
        def distance_from_new(row):
            """Return the distance between row, which
            should be in standard units, and new_point_su."""
            return distance(np.array(row), new_point_su)
    
        distances = Table().with_columns(
            'Distance', attributes_su.apply(distance_from_new),
            'Class', labels
        ).sort('Distance')
    
        # After sorting by distance, the first label is the nearest neighbor's.
        return distances.column('Class').item(0)
    
    return classify_new_point

In [None]:
# Train on the training half only; the result is itself a function.
hgb_glc_scaled_classifier = train_nn_su_classifier(train_hgb_glc)

In [None]:
# Call the trained classifier on the point (13, 150), in original units.
hgb_glc_scaled_classifier(make_array(13, 150))

**Note:** Returning a function from another function, as we just did, is an incredibly important idea to understand.

In [None]:
def visualize_boundary(known_points, classifier, x_range, y_range):
    """Visualize the boundary region of a classifier.
    The visualization is in original units.
    
    known_points is a table with three columns: the x attribute,
    the y attribute, and a 'Class' column whose values include
    'notckd' and 'ckd'.  classifier is a function that takes a
    two-item array (x, y) and returns a class label.  x_range and
    y_range are arrays of the x and y values that form the grid of
    points to classify; their min and max set the axis limits."""
    # Classify every grid point, collecting the results in plain lists.
    # The table is built once at the end, which avoids re-copying every
    # column on each row-by-row Table.append.
    xs, ys, predictions = [], [], []
    for x in x_range:
        for y in y_range:
            xs.append(x)
            ys.append(y)
            predictions.append(classifier(make_array(x, y)))
    x_label, y_label, class_label = known_points.labels
    decisions = Table().with_columns(
        x_label, xs,
        y_label, ys,
        class_label, predictions
    )
    # Draw the predicted grid semi-transparently, colored by predicted class.
    decisions.scatter(0, 1, colors='Class', alpha=0.4)
    plots.xlim(x_range.min(), x_range.max())
    plots.ylim(y_range.min(), y_range.max())
    # Overlay the known points on top, one color per class, with black edges.
    notckd_pts = known_points.where('Class', 'notckd')
    ckd_pts = known_points.where('Class', 'ckd')
    plots.scatter(notckd_pts.column(0), notckd_pts.column(1), c='gold', edgecolor='k');
    plots.scatter(ckd_pts.column(0), ckd_pts.column(1), c='darkblue', edgecolor='k');

In [None]:
# Decision regions of the trained classifier, with the training points on top.
visualize_boundary(train_hgb_glc, hgb_glc_scaled_classifier,
                  np.arange(10, 20, .25), np.arange(50, 250, 5))

In [None]:
# Same decision regions, now with the held-out test points on top.
visualize_boundary(test_hgb_glc, hgb_glc_scaled_classifier,
                  np.arange(10, 20, .25), np.arange(50, 250, 5))

In [None]:
def classifier_accuracy(test, classifier):
    """Measure how often classifier agrees with the known labels.
    
    The last column of the test table should hold the labels; the
    columns before it are passed, row by row, to classifier.  The
    names of the columns are irrelevant.  Returns the proportion of
    rows whose predicted label equals the actual label."""
    label_index = test.num_columns - 1
    predictions = test.drop(label_index).apply(classifier)
    truths = test.column(label_index)
    matches = truths == predictions
    return np.mean(matches)

In [None]:
# Proportion of held-out test rows the trained classifier labels correctly.
classifier_accuracy(test_hgb_glc, hgb_glc_scaled_classifier)

## Inference on classifier accuracy

In [None]:
NUM_REPETITIONS = 25
def histogram_nn_classifier_accuracy(data, classifier_trainer, num_repetitions=NUM_REPETITIONS):
    """Plot the empirical distribution of a classifier's accuracy.
    
    On each of num_repetitions iterations, data is reshuffled and
    split in half into a training table and a test table.
    classifier_trainer is a function that takes a training table
    and returns a classifier function; that classifier function
    takes a new point and returns its classification.  The
    accuracy measured on each test table is collected, a histogram
    of the accuracies is drawn, and a yellow bar marks the span from
    their 2.5th to their 97.5th percentile."""

    scores = []
    
    for _ in np.arange(num_repetitions):
        midpoint = int(data.num_rows / 2)
        reshuffled = data.sample(with_replacement=False)
        training_half = reshuffled.take(np.arange(0, midpoint))
        testing_half = reshuffled.take(np.arange(midpoint, data.num_rows))
        
        fitted = classifier_trainer(training_half)
        scores.append(classifier_accuracy(testing_half, fitted))
        
    accuracies = np.array(scores, dtype=float)
    interval = [percentile(2.5, accuracies), percentile(97.5, accuracies)]
    
    Table().with_column('Accuracy', accuracies).hist()
    plots.plot(interval, [0, 0], color='yellow', lw=8)

In [None]:
# Accuracy distribution for the classifier that works in standard units.
histogram_nn_classifier_accuracy(hgb_glc, train_nn_su_classifier)

**Was scaling helpful?**

In [None]:
def train_nn_classifier(train):
    """Create a nearest-neighbor classifier that measures distance
    directly in the units of the training table (no conversion to
    standard units).  The last column of the training table should
    be the labels, and the preceding columns should be the
    attributes.  The names of the columns do not matter.
    
    Returns a function.  That function takes as input a
    new point, an array with one item per attribute in column
    order (for two attributes, the x and y value of the point),
    and returns the classification of the new point as output."""
    
    def classify_new_point(new_point):
        label_index = train.num_columns - 1
        attributes = train.drop(label_index)
        
        def gap_to_new_point(row):
            return distance(np.array(row), new_point)
        
        nearest_first = Table().with_columns(
            'Distance', attributes.apply(gap_to_new_point),
            'Class', train.column(label_index)
        ).sort('Distance')
        
        labels_by_distance = nearest_first.column('Class')
        return labels_by_distance.item(0)
    
    return classify_new_point

In [None]:
# Accuracy distribution for the classifier that skips standard units.
histogram_nn_classifier_accuracy(hgb_glc, train_nn_classifier)

Let's look at more than 5 repetitions...

**Q:** Was scaling helpful?

A.  Yes  
B.  No  

You do **not** need to worry about scaling and standard units in your proj3.