In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def distance(point1, point2):
    """Euclidean distance between two arrays of numbers.

    Subtracts the arrays element-wise, squares the differences, sums them
    and returns the square root of that sum.
    """
    offsets = point1 - point2
    squared_total = np.sum(offsets ** 2)
    return np.sqrt(squared_total)

def all_distances(training, point):
    """Distances from `point` to each row of the `training` table.

    The 'Class' column is dropped first, so only the remaining attribute
    columns take part in the distance. Each remaining row is converted to
    an array and compared with `point` via `distance`; the result is
    whatever `Table.apply` returns for that per-row function.
    """
    features = training.drop('Class')
    return features.apply(lambda row: distance(point, np.array(row)))

def table_with_distances(training, point):
    """The training table plus a 'Distance' column.

    'Distance' holds the distance from each training row to `point`,
    as computed by `all_distances`.
    """
    distances = all_distances(training, point)
    return training.with_column('Distance', distances)

def closest(training, point, k):
    """The first k rows of the training table once sorted by distance to `point`.

    The returned table keeps every training column and adds the 'Distance'
    column used for the sort.
    """
    nearest_first = table_with_distances(training, point).sort('Distance')
    return nearest_first.take(np.arange(k))

def majority(topkclasses):
    """Majority vote over the 'Class' column of `topkclasses`.

    Returns 1 when strictly more rows have Class equal to 1 than Class
    equal to 0; returns 0 otherwise (including when the counts are tied).
    """
    def count_of(label):
        # Number of rows whose 'Class' equals the given label.
        return topkclasses.where('Class', are.equal_to(label)).num_rows

    ones = count_of(1)
    zeros = count_of(0)
    return 1 if ones > zeros else 0

def classify(training, p, k):
    """Predict the class (0 or 1) of the attribute array `p`.

    Finds the k rows of `training` nearest to `p` and returns the majority
    value of their 'Class' column.
    """
    neighbors = closest(training, p, k)
    return majority(neighbors.select('Class'))

# Lecture 35

## Classifying Patients: Disease, or No Disease?

In [None]:
# Read the patient data and relabel 'Blood Glucose Random' as 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd.show(3)  # preview 3 rows

In [None]:
# Group the patients by their 'Class' label to see how the two classes are represented.
ckd.group('Class')

In [None]:
# Scatter 'White Blood Cell Count' against 'Glucose', with points grouped by 'Class'.
ckd.scatter('White Blood Cell Count', 'Glucose', group='Class')

In [None]:
# Scatter 'Hemoglobin' against 'Glucose', with points grouped by 'Class'.
ckd.scatter('Hemoglobin', 'Glucose', group='Class')

## Classifying Banknotes: Fraudulent, or Real?

In [None]:
# Read the banknote data; the bare name on the last line displays the table.
banknotes = Table.read_table('banknote.csv')
banknotes

In [None]:
# Group the banknotes by their 'Class' label.
banknotes.group('Class')

In [None]:
# Scatter 'WaveletVar' against 'WaveletCurt', with points grouped by 'Class'.
banknotes.scatter('WaveletVar', 'WaveletCurt', group='Class')

In [None]:
# Scatter 'WaveletSkew' against 'Entropy', with points grouped by 'Class'.
banknotes.scatter('WaveletSkew', 'Entropy', group='Class')

In [None]:
# 3-D scatter of three banknote attributes, colored by 'Class'.
# The axes are requested from the figure itself: constructing Axes3D(fig)
# directly no longer attaches the axes to the figure on recent Matplotlib
# releases, which leaves the plot empty.
fig = plots.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(banknotes.column('WaveletSkew'),
           banknotes.column('WaveletVar'),
           banknotes.column('WaveletCurt'),
           c=banknotes.column('Class'),
           cmap='viridis',
           s=50);

## Nearest Neighbor Classification

In [None]:
# Preview 3 rows of the patient table before its columns are standardized.
ckd.show(3)

In [None]:
def standard_units(x):
    """Convert the array `x` to standard units.

    Each value has the average of `x` subtracted and is then divided by
    the standard deviation of `x` (as computed by `np.std`).
    """
    centered = x - np.average(x)
    spread = np.std(x)
    return centered / spread

# Rebind `ckd` to a new table holding the three attributes in standard units
# plus the unchanged 'Class' column; any other columns of the original table
# are not carried over.
ckd = Table().with_columns(
    'Hemoglobin', standard_units(ckd.column('Hemoglobin')),
    'Glucose', standard_units(ckd.column('Glucose')),
    'White Blood Cell Count', standard_units(ckd.column('White Blood Cell Count')),
    'Class', ckd.column('Class')
)

ckd.show(3)

In [None]:
# Same Hemoglobin/Glucose scatter as before, now drawn from the standardized table.
ckd.scatter('Hemoglobin', 'Glucose', group='Class')

In [None]:
# Map each class label to the color used when plotting it.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('gold', 'darkblue')
)
# Attach a 'Color' column to every patient. The four base columns are selected
# first so that re-running this cell does not join a second color column onto
# a table that already has one.
ckd = ckd.select(
    'Hemoglobin', 'Glucose', 'White Blood Cell Count', 'Class'
).join('Class', color_table)
def show_closest(point):
    """Plot the patients and connect `point` to its nearest neighbor.

    point = array([x,y]) gives the (Hemoglobin, Glucose) coordinates of a
    new point. The patients in `ckd` are drawn in their 'Color', the new
    point is drawn in red, and a black segment joins it to the closest
    patient as found by `closest` with k = 1.
    """
    HemoGl = ckd.drop('White Blood Cell Count', 'Color')
    nearest = closest(HemoGl, point, 1)
    # Read the neighbor's coordinates by column label rather than by position,
    # so the result does not depend on where the join placed the 'Class' column.
    x_closest = nearest.column('Hemoglobin').item(0)
    y_closest = nearest.column('Glucose').item(0)
    ckd.scatter('Hemoglobin', 'Glucose', group='Color')
    # An empty legend replaces the one generated for the color groups.
    plots.legend([],[],bbox_to_anchor=(0, 1))
    plots.scatter(point.item(0), point.item(1), color='red', s=30)
    plots.plot(make_array(point.item(0), x_closest), make_array(point.item(1), y_closest), color='k', lw=2);

## Nearest Neighbor Approach

To make a prediction for a new patient:

- Plot the new patient's data on the scatter plot.
- Find the sample point that is closest to the new patient's point.
- The prediction is the color of the closest sample point.

In [None]:
# In this example, the new patient's Hemoglobin is 0 and her Glucose is 1.8,
# both expressed in standard units like the columns of `ckd`.
new_patient = make_array(0, 1.8)
show_closest(new_patient)

In [None]:
# A second new patient: Hemoglobin 0 and Glucose 0.5 (standard units).
new_patient = make_array(0, 0.5)
show_closest(new_patient)

In [None]:
# Build a square lattice of test points: every combination of a Hemoglobin
# value and a Glucose value taken from the same set of ticks.
# np.repeat / np.tile give the same values in the same order as a nested loop
# (x varies slowest, y fastest) without re-copying the arrays on every step.
grid_ticks = np.arange(-2, 2.1, 0.1)
x_array = np.repeat(grid_ticks, len(grid_ticks))
y_array = np.tile(grid_ticks, len(grid_ticks))

test_grid = Table().with_columns(
    'Hemoglobin', x_array,
    'Glucose', y_array
)

# Draw the lattice in red, then overlay the patients in their class colors.
test_grid.scatter('Hemoglobin', 'Glucose', color='red', alpha=0.4, s=30)

plots.scatter(ckd.column('Hemoglobin'), ckd.column('Glucose'), c=ckd.column('Color'), edgecolor='k')

plots.xlim(-2, 2)
plots.ylim(-2, 2);

## Visualizing the Classifier

**Classifier:** A function that takes a new point and returns a prediction.

For each new (red) point above, it predicts either gold or blue according to the color of the closest point in the sample.

In [None]:
def classify_grid(training, test, k):
    """Classify every row of `test` with k-nearest neighbors on `training`.

    training: table of attribute columns plus a 'Class' column.
    test: table whose rows are the points to classify; its columns must line
        up with the attribute columns of `training`.
    k: number of neighbors used for each prediction.

    Returns a float array with one prediction (the value returned by
    `classify`) per row of `test`, in row order.
    """
    # Each row is passed as a 1-D array so it has the same shape as the
    # training rows it is compared against.
    predictions = [
        classify(training, np.array(test.row(i)), k)
        for i in range(test.num_rows)
    ]
    return np.array(predictions, dtype=float)

# Classify every grid point with its single nearest neighbor on (Hemoglobin, Glucose).
# Only the two coordinate columns are passed to the classifier, so re-running
# this cell still works after `test_grid` has gained 'Class' and 'Color'.
grid_points = test_grid.select('Hemoglobin', 'Glucose')
c = classify_grid(ckd.drop('White Blood Cell Count', 'Color'), grid_points, 1)

# Attach each prediction and its plotting color to the grid.
test_grid = grid_points.with_column('Class', c).join('Class', color_table)
test_grid.scatter('Hemoglobin', 'Glucose', group='Color', alpha=0.4, s=30)

# Overlay the patients on top of the colored grid.
plots.scatter(ckd.column('Hemoglobin'), ckd.column('Glucose'), c=ckd.column('Color'), edgecolor='k')
plots.legend([],[],bbox_to_anchor=(0, 1))

plots.xlim(-2, 2)
plots.ylim(-2, 2);