# Lecture 25: Classification

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline
np.set_printoptions(legacy='1.13')

## Chronic kidney disease

In [None]:
# Source: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
    
def ckd_label(number):
    """Translate a numeric class code into its text label.

    0 becomes "notckd", 1 becomes "ckd", and every other value
    becomes "unknown"."""
    if number == 0:
        return "notckd"
    if number == 1:
        return "ckd"
    return "unknown"
  
# Overwrite the 'Class' column with the text labels produced by
# applying ckd_label to each of its values, then display a short
# preview of the table.
ckd = ckd.with_column(
    'Class', ckd.apply(ckd_label, 'Class')
)    
ckd.show(3)

In [None]:
ckd.group('Class')

In [None]:
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

**Q:** As a human, how would you classify based on Glucose and White Blood Cell Count? 

<br/><br/><br/><br/><br/>

In [None]:
def my_classifier(wbc, glc):
    """Hand-written rule on white blood cell count and glucose.

    Predicts 'notckd' only when wbc is at most 12000 and glc is at
    most 140; every other case is predicted as 'ckd'."""
    looks_healthy = wbc <= 12000 and glc <= 140
    return 'notckd' if looks_healthy else 'ckd'

In [None]:
my_classifier(8000, 100)

In [None]:
ckd_classified = ckd.with_column(
    'My Class', ckd.apply(my_classifier, 
                         'White Blood Cell Count',
                         'Glucose' )
)
ckd_classified.show(3)

**Q:** How would you assess whether `my_classifier` is doing a good job?

<br/><br/><br/><br/><br/>

In [None]:
correct = ckd_classified.column('Class')\
  == ckd_classified.column('My Class')
correct

In [None]:
np.sum(correct) / ckd_classified.num_rows

In [None]:
np.mean(correct)

**Problem:** We are misclassifying.  What if we try another pair of variables?

In [None]:
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')

In [None]:
def another_classifier(hgb, glc):
    """Hand-written rule on hemoglobin and glucose.

    Predicts 'notckd' only when glc is at most 140 and hgb is at
    least 13; every other case is predicted as 'ckd'."""
    looks_healthy = glc <= 140 and hgb >= 13
    return 'notckd' if looks_healthy else 'ckd'

ckd_reclassified = ckd.with_column(
    'My Class', ckd.apply(another_classifier, 
                          'Hemoglobin', 
                          'Glucose')
)

np.mean(ckd_reclassified.column('Class') 
        == ckd_reclassified.column('My Class'))


**Problem:** What if we get a new individual to classify?

In [None]:
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13,13,13], [150,200,250], color='red');

**Q:** How would you label the bottom red point?

A. ckd  
B. notckd  


**Q:** How would you label the middle red point?

A. ckd  
B. notckd  


## Nearest Neighbor

**Distance between points.**

In [None]:
plots.scatter([0,4], [0,3], color='red', s=80);
plots.plot([0,4], [0,3], linestyle='dashed');

In [None]:
def distance(a, b):
    """Return the Euclidean distance between points a and b.

    a, b: array-like sequences of numbers with matching (or
    broadcastable) shapes -- NumPy arrays, lists, or tuples.

    Both inputs are converted to float arrays before subtracting so
    that (1) plain sequences such as tuples and lists are accepted,
    and (2) the subtraction and squaring cannot wrap around or
    overflow when the inputs have unsigned or narrow integer dtypes.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))

In [None]:
distance(make_array(0,0), make_array(4,3))

**Classifying a new individual.**

In [None]:
hgb_glc = ckd.select('Hemoglobin', 'Glucose', 'Class')
hgb_glc    

In [None]:
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13], [150], color='red');

**Aside: Rows.**

In [None]:
hgb_glc.row(0)

In [None]:
np.array(hgb_glc.row(0))

In [None]:
np.array(hgb_glc.drop('Class').row(0))

In [None]:
hgb_glc.drop('Class').apply(np.array)

In [None]:
def mean_of_row(row):
    """Return the mean of the values in row, after converting the
    row to a NumPy array."""
    return np.mean(np.array(row))

hgb_glc.with_column(
    'Mean of row', hgb_glc.drop('Class').apply(mean_of_row)
)

**End Aside, back to classifying**

In [None]:
new_point = make_array(13, 150)

def distance_from_new(row):
    """Distance from a single table row to new_point.

    row is the argument and may differ on every call; new_point is
    read from the enclosing (notebook-level) scope and is the same
    for every call."""
    row_as_array = np.array(row)
    return distance(row_as_array, new_point)
    
# Add a 'Distance' column holding each row's distance from new_point.
# The distance is computed on the attribute columns only ('Class' is
# dropped before applying distance_from_new); the result is then
# sorted by that distance.
distances = hgb_glc.with_column(
    'Distance', hgb_glc.drop('Class').apply(distance_from_new)
).sort('Distance')

distances

**Note:** What we just did with `distance_from_new` is incredibly important to understand.

In [None]:
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');

**Q:** What's wrong with my classifier?

A.  Nothing.  
B.  I must have coded the `distance` function wrong.  
C.  Something else is wrong.

In [None]:
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');
plots.xlim(0,50);
plots.ylim(125,175);

In [None]:
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');
plots.xlim(0,500);
plots.ylim(0,500);

**Accounting for scale.**

In [None]:
def standard_units(array):
    """Return array converted to standard units: each value's
    deviation from the array's mean, divided by the array's
    standard deviation (as computed by array.std())."""
    deviations = array - array.mean()
    return deviations / array.std()
    
def standardize(t):
    """Convert table t to standard units.

    Returns a new table with one column per column of t, in the same
    order.  Each new column is labeled with the original label
    followed by ' (su)' and holds that column's values in standard
    units."""
    result = Table()
    for name in t.labels:
        column_su = standard_units(t.column(name))
        result = result.with_column(name + ' (su)', column_su)
    return result

def in_su(value, array):
    """Express value in standard units relative to array: subtract
    the mean of array and divide by the standard deviation of array."""
    center = array.mean()
    spread = array.std()
    return (value - center) / spread

In [None]:
hgb_glc_su = standardize(hgb_glc.drop('Class'))\
  .with_column('Class', hgb_glc.column('Class'))
hgb_glc_su

In [None]:
hgb_glc_su.scatter('Hemoglobin (su)', 'Glucose (su)', colors='Class')
x_su = in_su(13, hgb_glc.column('Hemoglobin'))
y_su = in_su(150, hgb_glc.column('Glucose'))
plots.scatter(x_su, y_su, color='red');

In [None]:
new_point = make_array(x_su, y_su)

def distance_from_new(row):
    """Distance from a single table row to new_point, where
    new_point is read from the enclosing (notebook-level) scope."""
    row_as_array = np.array(row)
    return distance(row_as_array, new_point)
       
# Same distance table as before, but built from the standard-unit
# table: the distance of every row (with 'Class' dropped) from
# new_point, sorted by that distance.
distances = hgb_glc_su.with_column(
    'Distance', hgb_glc_su.drop('Class').apply(distance_from_new)
).sort('Distance')

distances

In [None]:
hgb_glc_su.scatter('Hemoglobin (su)', 'Glucose (su)', colors='Class')
close_x = distances.column('Hemoglobin (su)').item(0)
close_y = distances.column('Glucose (su)').item(0)
plots.scatter([x_su, close_x], [y_su, close_y], color='red');

**Nearest Neighbor Classifier.**

In [None]:
def hgb_glc_nn_classifier_su(new_point_su):
    """Classify new_point_su by its nearest neighbor in hgb_glc_su.

    new_point_su must already be in standard units.  Returns the
    'Class' value of the row of hgb_glc_su that sorts first by
    distance to new_point_su."""
    def distance_from_new(row):
        return distance(np.array(row), new_point_su)

    attributes_su = hgb_glc_su.drop('Class')
    with_distances = hgb_glc_su.with_column(
        'Distance', attributes_su.apply(distance_from_new)
    )
    nearest_first = with_distances.sort('Distance')
    return nearest_first.column('Class').item(0)

def hgb_glc_nn_classifier_ou(new_point):
    """Classify new_point, given in original units.

    Element 0 of new_point is converted to standard units using the
    'Hemoglobin' column of hgb_glc and element 1 using its 'Glucose'
    column; the converted point is then passed to
    hgb_glc_nn_classifier_su."""
    point_su = make_array(
        in_su(new_point.item(0), hgb_glc.column('Hemoglobin')),
        in_su(new_point.item(1), hgb_glc.column('Glucose')),
    )
    return hgb_glc_nn_classifier_su(point_su)

In [None]:
hgb_glc_nn_classifier_ou(make_array(13, 150))

In [None]:
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13,13,13], [150,200,250], color='red');

**Q:** What should the classification of (13, 200) be?

A.  ckd  
B.  notckd  

In [None]:
hgb_glc_nn_classifier_ou(make_array(13, 200))

In [None]:
x_su = in_su(13, hgb_glc.column('Hemoglobin'))
y_su = in_su(200, hgb_glc.column('Glucose'))
make_array(x_su,y_su)

In [None]:
# Classify every point on a grid of (hemoglobin, glucose) values and
# collect the predictions in a table that has the same column labels
# as hgb_glc.
decisions = Table(hgb_glc.labels)
for hgb in np.arange(10, 19, .125):
    for glc in np.arange(50, 250, 5):
        predicted = hgb_glc_nn_classifier_ou(make_array(hgb, glc))
        decisions.append([hgb, glc, predicted])
# Draw the grid predictions semi-transparently, colored by predicted class.
decisions.scatter(0, 1, colors='Class', alpha=0.4)
plots.xlim(10,19)
plots.ylim(40,250)
# Overlay the known points of each class on the current plot.
notckd_pts = hgb_glc.where('Class', 'notckd')
ckd_pts = hgb_glc.where('Class', 'ckd')
plots.scatter(notckd_pts.column(0), notckd_pts.column(1), c='gold', edgecolor='k');
plots.scatter(ckd_pts.column(0), ckd_pts.column(1), c='darkblue', edgecolor='k');

## Training and Testing

In [None]:
# Sample the rows of hgb_glc without replacement (presumably a full
# shuffle of the table -- confirm against the datascience docs), then
# use the first half of the rows for training and the rest for testing.
# NOTE(review): no random seed is set in the visible cells, so the
# split (and every result below that depends on it) can differ
# between runs.
shuffled_hgb_glc = hgb_glc.sample(with_replacement=False)
half = int(hgb_glc.num_rows / 2)
train_hgb_glc = shuffled_hgb_glc.take(np.arange(0, half))
test_hgb_glc  = shuffled_hgb_glc.take(np.arange(half, hgb_glc.num_rows))

In [None]:
train_hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')

In [None]:
def train_nn_su_classifier(train):
    """Build a nearest-neighbor classifier from a training table.

    The last column of train must hold the labels and all preceding
    columns the attributes; the column names do not matter.  train
    should be in original units -- the conversion to standard units
    happens inside this function.

    Returns a function that takes a new point in original units and
    returns the label of the closest training row."""

    label_index = train.num_columns - 1
    label_name = train.labels[label_index]

    # Standardize the attribute columns, then re-attach the labels.
    attributes_su = standardize(train.drop(label_index))
    train_su = attributes_su.with_column(label_name, train.column(label_index))

    def classify_new_point(new_point):
        """Return the classification of new_point, which should
        still be in original units."""

        # Convert each coordinate using the matching training column.
        new_point_su = np.array([
            in_su(new_point.item(i), train.column(i))
            for i in range(len(new_point))
        ])

        def distance_from_new(row):
            """Distance between row (in standard units) and new_point_su."""
            return distance(np.array(row), new_point_su)

        nearest_first = Table().with_columns(
            'Distance', train_su.drop(label_index).apply(distance_from_new),
            'Class', train_su.column(label_index)
        ).sort('Distance')

        return nearest_first.column('Class').item(0)

    return classify_new_point

In [None]:
hgb_glc_scaled_classifier = train_nn_su_classifier(train_hgb_glc)

In [None]:
hgb_glc_scaled_classifier(make_array(13, 150))

**Note:** What we just did, having `train_nn_su_classifier` return a function, is incredibly important to understand.

In [None]:
def visualize_boundary(known_points, classifier, x_range, y_range):
    """Visualize the boundary region of a classifier.
    The visualization is in original units.

    known_points: table whose first two columns are plotted as the x
        and y coordinates.  It must have a 'Class' column containing
        the values 'notckd' and 'ckd', because those names are
        hard-coded below.  Rows of [x, y, predicted] are appended to
        a table built from its labels, so it presumably has exactly
        three columns -- TODO confirm.
    classifier: function called with make_array(x, y) for every grid
        point; its return value is stored as the predicted class.
    x_range, y_range: arrays of grid coordinates; their .min() and
        .max() set the axis limits.
    """
    # Classify every grid point and record the prediction.
    decisions = Table(known_points.labels)
    for x in x_range:
        for y in y_range:
            predicted = classifier(make_array(x, y))
            decisions.append([x, y, predicted])
    # Draw the grid predictions semi-transparently, colored by class.
    decisions.scatter(0, 1, colors='Class', alpha=0.4)
    plots.xlim(x_range.min(), x_range.max())
    plots.ylim(y_range.min(), y_range.max())
    # Overlay the known points of each class on the current plot.
    notckd_pts = known_points.where('Class', 'notckd')
    ckd_pts = known_points.where('Class', 'ckd')
    plots.scatter(notckd_pts.column(0), notckd_pts.column(1), c='gold', edgecolor='k');
    plots.scatter(ckd_pts.column(0), ckd_pts.column(1), c='darkblue', edgecolor='k');

In [None]:
visualize_boundary(train_hgb_glc, hgb_glc_scaled_classifier,
                  np.arange(10, 20, .25), np.arange(50, 250, 5))

In [None]:
visualize_boundary(test_hgb_glc, hgb_glc_scaled_classifier,
                  np.arange(10, 20, .25), np.arange(50, 250, 5))

In [None]:
def classifier_accuracy(test, classifier):
    """Return the fraction of rows in test that classifier labels
    correctly.

    The last column of test must hold the true labels; the classifier
    is applied to each row of the remaining columns.  Column names
    are irrelevant."""
    label_index = test.num_columns - 1
    truth = test.column(label_index)
    attributes = test.drop(label_index)
    predictions = attributes.apply(classifier)
    return np.mean(truth == predictions)

In [None]:
classifier_accuracy(test_hgb_glc, hgb_glc_scaled_classifier)

## Majority Voting in a Neighborhood

Back to a pair of variables where it was harder to classify...

In [None]:
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

In [None]:
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')
plots.xlim(3500,10000);
plots.ylim(60,180);

In [None]:
# Keep only the two attributes of interest plus the label, split the
# rows into training and test tables (presumably a random split with
# `half` rows in the first table -- confirm against the datascience
# docs for Table.split), train a nearest-neighbor classifier on the
# training table, and draw its decision regions over the training
# points.
wbc_glc = ckd.select('White Blood Cell Count', 'Glucose', 'Class')
half = int(wbc_glc.num_rows / 2)
train_wbc_glc, test_wbc_glc = wbc_glc.split(half)
wbc_glc_scaled_classifier = train_nn_su_classifier(train_wbc_glc)
visualize_boundary(train_wbc_glc, wbc_glc_scaled_classifier,
                  np.arange(3500,11000,200), np.arange(60,200,5))

**Q:** How many points are being mis-classified here?

A. 0  
B. 1  
C. 2 or more  

In [None]:
classifier_accuracy(train_wbc_glc, wbc_glc_scaled_classifier)

In [None]:
visualize_boundary(test_wbc_glc, wbc_glc_scaled_classifier,
                  np.arange(3500,11000,200), np.arange(60,200,5))

**Q:** How many points are being mis-classified here?

A. 0  
B. 1  
C. 2 or more  

In [None]:
classifier_accuracy(test_wbc_glc, wbc_glc_scaled_classifier)

In [None]:
def most_frequent_value(array):
    """Return the value that occurs most often in array.

    The values are grouped and counted, the groups are sorted by
    count in descending order, and the value of the first group is
    returned."""
    tallies = Table().with_column('values', array).group('values')
    values_by_count = tallies.sort('count', descending=True).column('values')
    return values_by_count.item(0)

In [None]:
most_frequent_value(make_array(1,1,1,0,0,0,0,0))

In [None]:
def train_knn_su_classifier(train, k):
    """Build a k-nearest-neighbor classifier from a training table.

    train: table whose last column holds the labels and whose
        preceding columns hold the attributes; the column names do
        not matter.  It should be in original units -- the conversion
        to standard units happens inside this function.
    k: number of nearest training rows that vote on the label.

    Returns a function that takes a new point in original units and
    returns the most frequent label among its k nearest training
    rows."""

    label_index = train.num_columns - 1
    label_name = train.labels[label_index]

    # Standardize the attribute columns, then re-attach the labels.
    attributes_su = standardize(train.drop(label_index))
    train_su = attributes_su.with_column(label_name, train.column(label_index))

    def classify_new_point(new_point):
        """Return the classification of new_point, which should
        still be in original units."""

        # Convert each coordinate using the matching training column.
        new_point_su = np.array([
            in_su(new_point.item(i), train.column(i))
            for i in range(len(new_point))
        ])

        def distance_from_new(row):
            """Distance between row (in standard units) and new_point_su."""
            return distance(np.array(row), new_point_su)

        nearest_first = Table().with_columns(
            'Distance', train_su.drop(label_index).apply(distance_from_new),
            'Class', train_su.column(label_index)
        ).sort('Distance')

        # Majority vote among the labels of the first k rows.
        k_nearest_labels = nearest_first.take(np.arange(k)).column('Class')
        return most_frequent_value(k_nearest_labels)

    return classify_new_point

In [None]:
def train_and_visualize_wbc_glc(k):
    """Train a k-nearest-neighbor classifier on train_wbc_glc, draw
    its decision regions together with the test_wbc_glc points, and
    print its accuracy on test_wbc_glc."""
    knn = train_knn_su_classifier(train_wbc_glc, k)
    wbc_grid = np.arange(3500,11000,250)
    glc_grid = np.arange(60,200,10)
    visualize_boundary(test_wbc_glc, knn, wbc_grid, glc_grid)
    print('Accuracy:', classifier_accuracy(test_wbc_glc, knn))

In [None]:
train_and_visualize_wbc_glc(1)

In [None]:
train_and_visualize_wbc_glc(3)

In [None]:
train_and_visualize_wbc_glc(5)

## Distances using more attributes

In [None]:
distance(make_array(0,0), make_array(1,1))

In [None]:
distance(make_array(0,0,0), make_array(1,1,1))

In [None]:
distance(make_array(0,0,0,0), make_array(1,1,1,1))