In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
from mpl_toolkits.mplot3d import Axes3D
plots.style.use('fivethirtyeight')
%matplotlib inline

## Classification Examples: Medicine

Can we predict kidney disease based on a set of attributes?

In [None]:
# Load the kidney-disease data from ckd.csv and shorten the
# 'Blood Glucose Random' column label to 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
# Display the first 3 rows.
ckd.show(3)

In [None]:
# Group the rows by their 'Class' value to see how many fall in each class.
ckd.group('Class')

In [None]:
# Lookup table pairing each class label with a color name: 1 -> darkblue, 0 -> gold.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
# Attach the matching 'Color' to each row by joining on 'Class'.
# NOTE(review): this rebinds `ckd`, so re-running the cell joins a table that
# already carries a 'Color' column.
ckd = ckd.join('Class', color_table)

In [None]:
# Plot Glucose against White Blood Cell Count, with points grouped by the 'Color' column.
ckd.scatter('White Blood Cell Count', 'Glucose', group='Color')

In [None]:
# Plot Glucose against Hemoglobin, with points grouped by the 'Color' column.
ckd.scatter('Hemoglobin', 'Glucose', group='Color')

## Classification Examples: Banking

In [None]:
# Load the banknote data from banknote.csv and display it.
banknotes = Table.read_table('banknote.csv')
banknotes

In [None]:
# Lookup table pairing each class label with a color name: 1 -> darkblue, 0 -> gold.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
# Attach the matching 'Color' to each banknote by joining on 'Class'.
# NOTE(review): this rebinds `banknotes`, so re-running the cell joins a table
# that already carries a 'Color' column.
banknotes = banknotes.join('Class', color_table)

In [None]:
# Plot WaveletCurt against WaveletVar, with points grouped by the 'Color' column.
banknotes.scatter('WaveletVar', 'WaveletCurt', group = 'Color')

In [None]:
# Plot Entropy against WaveletSkew, with points grouped by the 'Color' column.
banknotes.scatter('WaveletSkew', 'Entropy', group = 'Color')

In [None]:
# 3D scatter of three banknote attributes, with each point colored by its class.
fig = plots.figure(figsize=(8,8))
# Create the 3D axes through the figure so they are registered with it.
# Constructing Axes3D(fig) directly no longer attaches the axes to the figure
# in newer Matplotlib releases, which can leave the rendered plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(banknotes.column('WaveletSkew'),
           banknotes.column('WaveletVar'),
           banknotes.column('WaveletCurt'),
           c=banknotes.column('Class'),
           cmap='viridis',
           s=50);

## Defining a Classifier

In [None]:
# Load breast-cancer.csv and discard its 'ID' column.
patients = Table.read_table('breast-cancer.csv').drop('ID')
# Display the first 5 rows.
patients.show(5)

In [None]:
# Lookup table pairing each class label with a color name: 1 -> darkblue, 0 -> gold.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
# Join on 'Class' into a new table, leaving `patients` itself unchanged.
patients_with_color = patients.join('Class', color_table)

In [None]:
# Plot Single Epithelial Cell Size against Bland Chromatin, with points grouped by 'Color'.
patients_with_color.scatter('Bland Chromatin', 'Single Epithelial Cell Size', group = 'Color')

In [None]:
def randomize_column(a):
    """Return `a` with independent Gaussian noise added to every element.

    The noise is drawn from a normal distribution with mean 0.0 and standard
    deviation 0.09, one draw per element of `a`.
    """
    noise = np.random.normal(0.0, 0.09, size=len(a))
    return a + noise

# Build a table holding the two jittered attributes alongside the unchanged 'Class' labels.
jittered = Table().with_columns([
        'Bland Chromatin (jittered)', 
        randomize_column(patients_with_color.column('Bland Chromatin')),
        'Single Epithelial Cell Size (jittered)', 
        randomize_column(patients_with_color.column('Single Epithelial Cell Size')),
        'Class',
        patients_with_color.column('Class')
    ])

# Lookup table pairing each class label with a color name: 1 -> darkblue, 0 -> gold.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
# Attach a 'Color' column by joining on 'Class'.
jittered = jittered.join('Class', color_table)

# Plot the jittered attributes, with points grouped by 'Color'.
jittered.scatter('Bland Chromatin (jittered)', 'Single Epithelial Cell Size (jittered)', group = 'Color')

## Distance

In [None]:
# Plot three example points: (0, 0), (2, 2) and (3, 4).
Table().with_columns(['X', [0, 2, 3], 'Y', [0, 2, 4]]).scatter('X', 'Y')

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    pt1, pt2: equal-length sequences of numbers (numpy arrays, lists or
        tuples), one entry per coordinate.

    Returns the square root of the summed squared coordinate differences.
    """
    # asarray lets plain lists/tuples be subtracted element-wise as well.
    diff = np.asarray(pt1) - np.asarray(pt2)
    # np.sum reduces inside numpy instead of looping over elements in Python.
    return np.sqrt(np.sum(diff ** 2))

def row_distance(row1, row2):
    """Return the distance between two numerical table rows.

    Each row is converted to a numpy array and the pair is handed to
    `distance`.
    """
    first_point = np.array(row1)
    second_point = np.array(row2)
    return distance(first_point, second_point)

In [None]:
# Keep only the attribute columns. The 'Color' column is added on
# `patients_with_color`, not on `patients`, so only 'Class' needs dropping here
# (matching the other `patients.drop('Class')` calls in this notebook).
attributes = patients.drop('Class')
# Display the first 3 rows.
attributes.show(3)

In [None]:
attributes.row(0)

In [None]:
row_distance(attributes.row(0), attributes.row(1))

In [None]:
row_distance(attributes.row(0), attributes.row(2))

In [None]:
row_distance(attributes.row(0), attributes.row(0))

In [None]:
patients.drop('Class')

## Classification Procedure

In [None]:
def distances(training, example):
    """Return `training` with an added 'Distance' column.

    training: table of training rows; every column other than 'Class' is
        treated as an attribute and passed to `row_distance`.
    example: row (or array) of attribute values to measure against.

    Returns a table equal to `training` plus a 'Distance' column holding the
    distance from each training row to `example`.
    """
    features = training.drop('Class')
    # Build the distance array in one pass rather than growing it with
    # np.append, which copies the whole array on every iteration.
    dists = np.array(
        [row_distance(row, example) for row in features.rows],
        dtype=float,
    )
    return training.with_column('Distance', dists)
    


In [None]:
#Let's try our function
distances(patients,patients.drop('Class').row(12))

Now we decide who the closest k neighbors are:

In [None]:
def closest(training, example, k):
    """Return the k rows of `training` that lie nearest to `example`.

    The result is the output of `distances` sorted by its 'Distance' column
    and cut down to the first k rows.
    """
    with_distances = distances(training, example)
    nearest_first = with_distances.sort('Distance')
    first_k = np.arange(k)
    return nearest_first.take(first_k)

In [None]:
# Let's try this out for patient on row 12


In [None]:
example_row12 = patients.drop('Class').row(12)
example_row12

In [None]:
# Call our function closest using 5 nearest neighbors
closest(patients.exclude(12), example_row12, 5)

In [None]:
# Find the single nearest neighbor of the row-12 patient. Row 12 is excluded
# from the training table so the example cannot be matched with itself.
t1 = closest(patients.exclude(12), example_row12, 1)
t1

In [None]:
patients.row(12)

In [None]:
# Let's use group to see what class comes up most
t1.group('Class').sort('count', descending = True).column('Class').item(0)

In [None]:
# Let's put all of the steps in a function
def majority_class(neighbors):
    """Return the 'Class' value that occurs most often in `neighbors`."""
    class_counts = neighbors.group('Class')
    # Largest count first, so the winning class ends up in the top row.
    most_common_first = class_counts.sort('count', descending = True)
    return most_common_first.column('Class').item(0)

#Let's try the function out
majority_class(t1)

In [None]:
def classify(training, example, k):
    """Predict the class of `example` by majority vote of its k nearest rows in `training`."""
    return majority_class(closest(training, example, k))

In [None]:
# TODO: Let's try it out and compare with the actual class
classify(patients.exclude(12), example_row12,7)

In [None]:
patients.row(12)

## Review of the Steps 
    
- `distance(pt1, pt2)`: Returns the distance between the arrays `pt1` and `pt2`
- `row_distance(row1, row2)`: Returns the distance between the rows `row1` and `row2`
- `distances(training, example)`: Returns a table that is `training` with an additional column `'Distance'` that contains the distance between `example` and each row of `training`
- `closest(training, example, k)`: Returns a table of the `k` rows of `training` with the smallest distances to `example`
- `majority_class(neighbors)`: Returns the majority class in the `'Class'` column
- `classify(training, example, k)`: Returns the predicted class of `example` using a `k`-nearest-neighbors classifier based on the historical sample `training`

## Evaluation

In [None]:
patients.num_rows

In [None]:
shuffled = patients.sample(with_replacement = False) # Randomly permute the rows
training_set = shuffled.take(np.arange(400)) # Take the first 400 rows for training
# Take every remaining row for testing. The upper bound comes from the table
# itself rather than a hard-coded row count.
test_set  = shuffled.take(np.arange(400, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of `test` rows that the classifier gets right.

    training: table passed to `classify` as the training set.
    test: table with a 'Class' column holding the true labels; its other
        columns are used as the attributes of each example.
    k: number of nearest neighbors used for each prediction.

    Raises ZeroDivisionError if `test` has no rows.
    """
    test_attributes = test.drop('Class')
    # Fetch the true labels once instead of on every loop iteration.
    actual_classes = test.column('Class')
    num_correct = 0
    for i in np.arange(test.num_rows):
        # Classify the ith test row using only its attributes.
        predicted = classify(training, test_attributes.row(i), k)
        if predicted == actual_classes.item(i):
            num_correct += 1

    return num_correct / test.num_rows

In [None]:
#TODO: try evaluating the accuracy of the 5-nearest neighbor classifier
evaluate_accuracy(training_set, test_set, 5)*100

In [None]:
#TODO: let's pick another value of k?
...

In [None]:
#Why is this not a good idea?
...

## Decision Boundaries

In [None]:
# Reload ckd.csv from scratch, again relabeling 'Blood Glucose Random' to 'Glucose'.
# This replaces the `ckd` table that an earlier cell joined with the color table.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
# Display the first 3 rows.
ckd.show(3)

In [None]:
# Lookup table pairing each class label with a color name: 1 -> darkblue, 0 -> gold.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
# Keep only the two attributes being compared plus the class label.
ckd = ckd.select('Hemoglobin', 'Glucose','Class')
kidney = ckd.join('Class', color_table)
# Re-append 'Class' and 'Color' as the last columns, leaving the two attributes first.
kidney = kidney.drop('Class','Color').with_columns(
        'Class', kidney.column('Class'), 'Color', kidney.column('Color'))
kidney.scatter('Hemoglobin', 'Glucose', group = 'Color')
# Mark the point (13, 250) in red.
plots.scatter(13, 250, color='red', s=30);

In [None]:
def show_closest(t, point):
    """Plot `t`, mark `point` in red, and join it to its nearest row of `t`.

    t: table whose columns 0 and 1 are the plotted attributes and which also
        has 'Class' and 'Color' columns.
    point: array holding the two coordinates of the point to look up.
    """
    features = t.drop('Class', 'Color')
    nearest = closest(features, point, 1).row(0)
    point_x = point.item(0)
    point_y = point.item(1)

    t.scatter(0, 1, group = 'Color')
    plots.scatter(point_x, point_y, color='red', s=30)
    # Black segment from the point to its nearest neighbor.
    plots.plot([point_x, nearest.item(0)], [point_y, nearest.item(1)], color='k', lw=2)
    
show_closest(kidney, make_array(13, 250))

In [None]:
def standard_units(any_numbers):
    """Rescale an array of numbers to standard units.

    Each value has the array's mean subtracted and is then divided by the
    array's standard deviation (as computed by np.std).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def standardize(t):
    """Return a new table with every column of `t` converted to standard units.

    Each output column keeps its source label with ' (su)' appended.
    """
    standardized = Table()
    for name in t.labels:
        values_su = standard_units(t.column(name))
        standardized = standardized.with_column(name + ' (su)', values_su)
    return standardized

In [None]:
kidney

In [None]:
# Standardize the two attribute columns, then re-attach the original 'Class' and 'Color' columns.
kidney_su = standardize(kidney.drop('Class','Color')).with_columns('Class', kidney.column('Class'), 'Color', kidney.column('Color'))
# Show the row of kidney_su nearest to the point (-0.2, 1.8).
show_closest(kidney_su, make_array(-0.2, 1.8))

In [None]:
show_closest(kidney_su, make_array(-0.2, 1))

In [None]:
show_closest(kidney_su, make_array(-0.2, 0.9))

In [None]:
def decision_boundary(t, k):
    """Plot how a k-nearest-neighbor classifier labels a grid of points.

    t: table with two attribute columns plus 'Class' and 'Color' columns.
    k: number of nearest neighbors used for each prediction.

    The attributes are converted to standard units, every point on a grid
    running from -2 in steps of 0.1 along both axes is classified against
    the standardized rows, and the grid is drawn semi-transparently with the
    standardized training points outlined on top.
    """
    attributes_su = standardize(t.drop('Class', 'Color'))
    t_su = attributes_su.with_column('Class', t.column('Class'))

    # Classify every grid point; rows are appended with x as the outer loop.
    ticks = np.arange(-2, 2.1, 0.1)
    decisions = Table(t_su.labels)
    for x in ticks:
        for y in ticks:
            decisions.append([x, y, classify(t_su, make_array(x, y), k)])

    # Look up a color for each predicted class, then re-append 'Class' and
    # 'Color' as the last columns so scatter(0, 1) uses the grid coordinates.
    color_table = Table().with_columns(
        'Class', make_array(1, 0),
        'Color', make_array('darkblue', 'gold')
    )
    colored = decisions.join('Class', color_table)
    decisions = colored.drop('Class', 'Color').with_columns(
        'Class', colored.column('Class'), 'Color', colored.column('Color'))

    decisions.scatter(0, 1, group = 'Color', alpha=0.4)
    plots.xlim(-2, 2)
    plots.ylim(-2, 2)

    # Overlay the standardized training points, class 0 first and then class 1.
    for class_label, point_color in ((0, 'gold'), (1, 'darkblue')):
        members = t_su.where('Class', class_label)
        plots.scatter(members.column(0), members.column(1), c=point_color, edgecolor='k')
    
decision_boundary(kidney, 1)

In [None]:
#TODO: try a couple of other values for k
decision_boundary(kidney, 2)

In [None]:
#remember the data set about breast cancer diagnoses?
# TODO: Let's look at the different decision boundaries based on different k values (use dataset called jittered)
decision_boundary(jittered, 4)