# Lecture 23: Classification

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline
np.set_printoptions(legacy='1.13')

## Chronic kidney disease

In [None]:
# Source: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease
# Read the raw table first, then give the glucose column a shorter label.
ckd = Table.read_table('ckd.csv')
ckd = ckd.relabeled('Blood Glucose Random', 'Glucose')
    
def ckd_label(number):
    """Turn a numeric class code into a readable label.

    0 becomes "notckd", 1 becomes "ckd", and any other value
    becomes "unknown"."""
    if number == 0:
        return "notckd"
    if number == 1:
        return "ckd"
    return "unknown"
  
# Replace the numeric 'Class' codes with the text labels from ckd_label.
ckd = ckd.with_column('Class', ckd.apply(ckd_label, 'Class'))
ckd.show(3)

In [None]:
# Tally the rows for each distinct 'Class' label.
ckd.group('Class')

In [None]:
# Glucose (y) against White Blood Cell Count (x), one color per 'Class' label.
# NOTE(review): newer datascience releases appear to document `colors=` as a
# deprecated synonym for `group=` -- confirm against the installed version.
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

**Q:** As a human, how would you classify based on Glucose and White Blood Cell Count? 

<br/><br/><br/><br/><br/>

In [None]:
def my_classifier(wbc, glc, max_wbc=12000, max_glc=140):
    """Hand-made rule that labels one individual 'notckd' or 'ckd'.

    wbc and glc are the individual's White Blood Cell Count and
    Glucose.  The label is 'notckd' only when wbc is at most max_wbc
    AND glc is at most max_glc; in every other case it is 'ckd'.

    max_wbc and max_glc are the cutoffs.  They default to 12000 and
    140, so my_classifier(wbc, glc) gives the same answers as the
    original two-argument version, while other cutoffs can be tried
    without rewriting the function."""
    if wbc <= max_wbc and glc <= max_glc:
        return 'notckd'
    return 'ckd'

In [None]:
# Spot check: 8000 <= 12000 and 100 <= 140, so this should give 'notckd'.
my_classifier(8000, 100)

In [None]:
# Run my_classifier on every row and keep its answer in a new 'My Class'
# column next to the true 'Class' label.
ckd_classified = ckd.with_column(
    'My Class',
    ckd.apply(my_classifier, 'White Blood Cell Count', 'Glucose'),
)
ckd_classified.show(3)

**Q:** How would you assess whether `my_classifier` is doing a good job?

<br/><br/><br/><br/><br/>

In [None]:
# Element-wise comparison: True wherever the prediction matches the true label.
correct = (
    ckd_classified.column('Class') == ckd_classified.column('My Class')
)
correct

In [None]:
# Proportion classified correctly: count of True values divided by row count.
np.sum(correct) / ckd_classified.num_rows

In [None]:
# Same proportion in one step: the mean of a boolean array is the fraction of Trues.
np.mean(correct)

**Problem:** We are misclassifying some individuals.  (We'll come back to this next lecture.)  What if we try another pair of variables?

In [None]:
# Try a different pair of variables: Glucose (y) against Hemoglobin (x), colored by 'Class'.
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')

In [None]:
def another_classifier(hgb, glc, min_hgb=13, max_glc=140):
    """Hand-made rule based on Hemoglobin and Glucose.

    hgb and glc are the individual's Hemoglobin and Glucose.  The
    label is 'notckd' only when glc is at most max_glc AND hgb is at
    least min_hgb; in every other case it is 'ckd'.

    min_hgb and max_glc are the cutoffs.  They default to 13 and 140,
    so another_classifier(hgb, glc) gives the same answers as the
    original two-argument version, while other cutoffs can be tried
    without rewriting the function."""
    if glc <= max_glc and hgb >= min_hgb:
        return 'notckd'
    return 'ckd'

# Label every row with another_classifier, using Hemoglobin and Glucose.
ckd_reclassified = ckd.with_column(
    'My Class',
    ckd.apply(another_classifier, 'Hemoglobin', 'Glucose'),
)

# Proportion of rows where the prediction matches the true label.
np.mean(
    ckd_reclassified.column('Class') == ckd_reclassified.column('My Class')
)


**Problem:** What if we get a new individual to classify?

In [None]:
# Redraw the Hemoglobin/Glucose scatter, then overlay three unlabeled
# individuals in red at Hemoglobin 13 with Glucose 150, 200 and 250.
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13,13,13], [150,200,250], color='red');

**Q:** How would you label the bottom red point?

A. ckd  
B. notckd  


**Q:** How would you label the middle red point?

A. ckd  
B. notckd  


## Nearest Neighbor

**Distance between points.**

In [None]:
# Two red points at (0, 0) and (4, 3), joined by a dashed segment;
# the length of that segment is the distance between the points.
plots.scatter([0,4], [0,3], color='red', s=80);
plots.plot([0,4], [0,3], linestyle='dashed');

In [None]:
def distance(a, b):
    """Returns the straight-line distance between the points a and b.

    a and b are arrays of coordinates.  The result is the square
    root of the sum of the squared coordinate differences."""
    difference = a - b
    sum_of_squares = np.sum(difference ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# (0, 0) and (4, 3) are the ends of the hypotenuse of a 3-4-5 right triangle, so expect 5.
distance(make_array(0,0), make_array(4,3))

**Classifying a new individual.**

In [None]:
# Keep only the two features used for classification plus the true label.
hgb_glc = ckd.select('Hemoglobin', 'Glucose', 'Class')
hgb_glc    

In [None]:
# Show the labeled data and mark the individual to classify
# (Hemoglobin 13, Glucose 150) in red.
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13], [150], color='red');

**Aside: Rows.**

In [None]:
# A single row of the table (index 0 is the first row).
hgb_glc.row(0)

In [None]:
# Converting the full row to an array: the row still contains the 'Class'
# label alongside the two numbers, so the array is not purely numeric.
np.array(hgb_glc.row(0))

In [None]:
# Drop 'Class' first so the row holds only the Hemoglobin and Glucose values.
np.array(hgb_glc.drop('Class').row(0))

In [None]:
# apply with no column labels passes each whole row to the function,
# so here every row of the table (minus 'Class') is converted to an array.
hgb_glc.drop('Class').apply(np.array)

In [None]:
def mean_of_row(row):
    """Returns the mean of the values in row.

    row is converted to an array first so that np.mean can
    average its entries."""
    return np.mean(np.array(row))

# Average the numeric columns of each row ('Class' is dropped before
# averaging) and show the result as an extra column.
hgb_glc.with_column('Mean of row', hgb_glc.drop('Class').apply(mean_of_row))

**End Aside, back to classifying**

In [None]:
# The individual to classify, as (Hemoglobin, Glucose) -- the same column
# order as hgb_glc once 'Class' is dropped.
new_point = make_array(13, 150)

def distance_from_new(row):
    """Return how far row is from new_point.

    row is the argument, so it can be a different row on every
    call.  new_point is not an argument: it is looked up from the
    surrounding notebook, so every call measures against the same
    point."""
    row_point = np.array(row)
    return distance(row_point, new_point)
    
# Attach each row's distance to new_point ('Class' is dropped so only the
# two numeric features reach distance_from_new) ...
distances = hgb_glc.with_column(
    'Distance',
    hgb_glc.drop('Class').apply(distance_from_new),
)
# ... then order the rows by that distance.
distances = distances.sort('Distance')

distances

**Note:** What we just did with `distance_from_new` is incredibly important to understand.

In [None]:
# Mark two points in red: the new individual (13, 150) and the point (5.6, 157).
# NOTE(review): (5.6, 157) is presumably the top row of `distances` -- confirm
# against the table output above.
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');

**Q:** What's wrong with my classifier?

A.  Nothing.  
B.  I must have coded the `distance` function wrong.  
C.  Something else is wrong.

In [None]:
# Same plot, but with both axes forced to the same 0-500 range so that
# horizontal and vertical distances are drawn on the same scale.
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');
plots.xlim(0,500);
plots.ylim(0,500);

In [None]:
# Zoom in around the two red points; each axis still spans 50 units
# (x from 0 to 50, y from 125 to 175), so the scales stay comparable.
hgb_glc.scatter('Hemoglobin', 'Glucose', colors='Class')
plots.scatter([13, 5.6], [150, 157], color='red');
plots.xlim(0,50);
plots.ylim(125,175);