In [None]:
from datascience import *
import numpy as np
import matplotlib

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import warnings
warnings.simplefilter("ignore")

## Review

In [None]:
# Load the CKD data and shorten the 'Blood Glucose Random' label to 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
# Peek at three sampled rows.
ckd.sample(3).show(3)

In [None]:
# Narrow the table to the two attributes used below plus the 'Class' label.
# Note: this rebinds `ckd`, so later cells see only these three columns.
ckd = ckd.select('Hemoglobin', 'Glucose', 'Class')
ckd.show(3)

In [None]:
ckd.sample(3).show(3)

______

In [None]:
banknotes = Table.read_table('banknote.csv')
banknotes.sample(6).show()

## New material

- We will work with the `ckd` dataset
- Goal: predict the class of an out-of-sample patient, using the entire `ckd` dataset as the training set. Tomorrow, we will worry about splitting the data into training and test sets.


Here's our example point for today.

In [None]:
# The new point to classify: two coordinates, compared position-by-position
# with the attribute columns (Hemoglobin, Glucose) of each training row.
alice = make_array(0,1.1)
alice

### How can we find the *k*-nearest neighbors?

___

#### Detour: Rows and row objects

In [None]:
ckd.row(0)

In [None]:
type(ckd.row(0))

In [None]:
ckd.row(0).item(1)

In [None]:
make_array(ckd.row(0))

_____

First row in the training set:

In [None]:
patient = ckd.drop('Class').row(0)
patient

In [None]:
make_array(patient)

#### **Task**: Let's find the distance between `alice` and the first training point.

$$ \text{D} = \sqrt{(x_0-x_1)^2 + (y_0-y_1)^2} $$

In [None]:
# Sum of squared coordinate differences only; the square root in the
# formula is applied in a later cell.
(alice.item(0) - patient.item(0)) ** 2 + (alice.item(1) - patient.item(1)) ** 2 

In [None]:
#this is an array 
alice

In [None]:
#This is an array
make_array(patient)

**Discussion Question** [1 min] Is there a quicker way to do the part $(x_0-x_1)^2 + (y_0-y_1)^2$?

In [None]:
alice-make_array(patient) 

In [None]:
(alice-make_array(patient))**2

In [None]:
np.sum((alice-make_array(patient))**2)

In [None]:
np.sum((alice-make_array(patient))**2) ** 0.5

In [None]:
def distance(point1, point2):
    """Euclidean distance between two points.

    point1, point2: arrays holding the coordinates of each point
    (they are subtracted elementwise, so their shapes must be compatible).

    Returns the square root of the sum of the squared coordinate differences.
    """
    coordinate_gaps = point1 - point2
    sum_of_squares = np.sum(coordinate_gaps ** 2)
    return sum_of_squares ** 0.5

In [None]:
distance(alice, make_array(patient))

In [None]:
def distance_from_alice(row):
    """Distance between the notebook-level point `alice` and one table row.

    row: a row of attribute values; it is converted with make_array
    before being handed to distance().
    """
    row_as_array = make_array(row)
    return distance(alice, row_as_array)

In [None]:
distance_from_alice(ckd.drop('Class').row(0))                     

_______

#### **Task**: Calculate the Euclidean distance between `alice` and every point in the training set.

In [None]:
ckd.drop('Class').show(5)

**Discussion Question** [1 min]: What's a method we can use to help us?


In [None]:
# Run distance_from_alice on every row of the attribute columns (no 'Class').
distances = ckd.drop('Class').apply(distance_from_alice)

In [None]:
def all_distances(training, new_row):
    """Distance from new_row to every row of the training table.

    training: table of training points; its 'Class' column is dropped
        before any distance is computed.
    new_row: the attributes of the new point.

    Returns an array with one distance per row of the table.
    """
    def gap_to_new_point(training_row):
        # Both sides go through make_array so distance() receives arrays.
        return distance(make_array(new_row), make_array(training_row))

    features_only = training.drop('Class')
    return features_only.apply(gap_to_new_point)

In [None]:
# all_distances drops the 'Class' column itself, so pass the full training table.
distances = all_distances(ckd, alice)

______

#### **Task:** Augment the training data table with a column containing all the distances.

In [None]:
# Attach the distances as a 'Distance from Alice' column, binding the result to a new name.
ckd_with_distances = ckd.with_column('Distance from Alice', distances)

In [None]:
def table_with_distances(training, new_point):
    """Training table plus a 'Distance' column.

    The new column holds all_distances(training, new_point), i.e. the
    distance from new_point to each training row.
    """
    distance_column = all_distances(training, new_point)
    return training.with_column('Distance', distance_column)

In [None]:
table_with_distances(ckd, alice).show(3)

_______

#### **Task**: Find the 5 nearest neighbors.

**Discussion Question** [1 min]: Think of two table methods that will help us return a *table* with the five nearest neighbors. 

In [None]:
table_with_distances(ckd, alice).show(6)

In [None]:
ckd_with_distances.sort('Distance from Alice')

In [None]:
# Sort by distance and keep the first five rows: the 5 nearest neighbors.
ckd_with_distances.sort('Distance from Alice').take(np.arange(5))

In [None]:
def nearest(training, new_point, k):
    """The k rows of the distance-augmented training table closest to new_point.

    The table is sorted on its 'Distance' column and the first k rows
    (positions 0 .. k-1) are returned as a table.
    """
    first_k_positions = np.arange(k)
    by_distance = table_with_distances(training, new_point).sort('Distance')
    return by_distance.take(first_k_positions)

In [None]:
# The 5 training rows closest to alice, via the nearest() helper.
five_nearest_neighbors = nearest(ckd, alice, 5)
five_nearest_neighbors

### How do we classify the point?

#### **Task**: Find the majority class (programmatically).

##### Method 1

In [None]:
ones = five_nearest_neighbors.where('Class', are.equal_to(1)).num_rows

In [None]:
zeros = five_nearest_neighbors.where('Class', are.equal_to(0)).num_rows

In [None]:
# Majority vote between the two counts; a tie prints 0.
print(1 if ones > zeros else 0)

In [None]:
def majority(nearest_neighbors_table):
    """1 if the "Class" column holds more 1s than 0s, and 0 otherwise (ties give 0)."""
    def votes_for(label):
        # Number of neighbor rows whose 'Class' equals the given label.
        return nearest_neighbors_table.where('Class', are.equal_to(label)).num_rows

    return 1 if votes_for(1) > votes_for(0) else 0

In [None]:
majority(five_nearest_neighbors)

##### Method 2

**Discussion Question:** [2 min]

You are given the `five_nearest_neighbors` table. On a piece of paper (or in your notes app), write one or more lines of code that find the majority class and return it as a string or number, *regardless* of whether the classes are called 0 and 1 or something else entirely!

In [None]:
five_nearest_neighbors.group('Class')

In [None]:
five_nearest_neighbors.group('Class').sort('count', descending=True)

In [None]:
five_nearest_neighbors.group('Class').sort('count', descending=True).column('Class')

In [None]:
five_nearest_neighbors.group('Class').sort('count', 
                                           descending=True).column('Class').item(0)

In [None]:
def majority(nearest_neighbors_table, class_name):
    """Most frequent value in the class_name column of the neighbors table.

    Works for any class labels: rows are grouped by class_name, the groups
    are sorted by their 'count' from largest to smallest, and the label of
    the first group is returned.
    """
    class_counts = nearest_neighbors_table.group(class_name)
    largest_first = class_counts.sort('count', descending=True)
    return largest_first.column(class_name).item(0)

In [None]:
majority(five_nearest_neighbors, 'Class')

#### **Task:** Write a function to complete the entire algorithm!

In [None]:
five_nearest_neighbors = nearest(ckd, alice, 5)
majority(five_nearest_neighbors, 'Class')

In [None]:
def knn(training, class_name, new_point, k):
    """Classify new_point by majority vote among its k nearest training rows.

    training: table of training points.
    class_name: label of the column that holds the classes to vote on.
    new_point: attributes of the point to classify.
    k: number of neighbors that vote.
    """
    return majority(nearest(training, new_point, k), class_name)

In [None]:
knn(ckd, 'Class', alice, 5)

#### Here are all the functions, in order, that made up this algorithm.

In [None]:
def distance(point1, point2):
    """Returns the Euclidean distance between point1 and point2
    where each argument is an array
    consisting of the coordinates of the point"""
    return np.sum((point1-point2)**2) ** 0.5


def all_distances(training, new_row, class_name='Class'):
    """Returns an array of distances
    between each point in the training set
    and the new point (which is a row of attributes).

    class_name is the label column left out of the distance calculation;
    it defaults to 'Class' so existing two-argument calls keep working."""
    attributes = training.drop(class_name)
    # Convert the new point once, rather than once per training row.
    new_point = np.asarray(new_row, dtype=float)
    def distance_from_new(row):
        # Each row becomes a 1-D float array so it lines up with new_point.
        return distance(new_point, np.asarray(row, dtype=float))
    return attributes.apply(distance_from_new)


def table_with_distances(training, new_point, class_name='Class'):
    """Augments the training table
    with a 'Distance' column of distances from new_point.

    class_name is forwarded to all_distances so the label column
    is never treated as a coordinate."""
    return training.with_column('Distance', all_distances(training, new_point, class_name))


def nearest(training, new_point, k, class_name='Class'):
    """Returns a table of the k rows of the augmented table
    corresponding to the k smallest distances.

    class_name names the label column to exclude from the distances."""
    with_dists = table_with_distances(training, new_point, class_name)
    sorted_by_distance = with_dists.sort('Distance')
    nearest_neighbors_table = sorted_by_distance.take(np.arange(k))
    return nearest_neighbors_table


def majority(nearest_neighbors_table, class_name):
    """Returns the most frequent value in the class_name column,
    whatever the class labels happen to be called."""
    class_counts = nearest_neighbors_table.group(class_name)
    # Largest group first, so position 0 holds the winning class.
    return class_counts.sort('count', descending=True).column(class_name).item(0)


def knn(training, class_name, new_point, k):
    """Classifies new_point by majority vote among its k nearest rows
    of the training table; class_name is the column holding the classes."""
    # Pass class_name down so the label column is dropped before measuring
    # distances, even when it is not called 'Class'.
    nearest_neighbors_table = nearest(training, new_point, k, class_name)
    return majority(nearest_neighbors_table, class_name)

____

**Challenge Task** [1 min]: Perform $k$-NN using the entire `banknotes` dataset as the training set! Make up a test point, like we did with `alice` for the `ckd` dataset. What will the Euclidean distance calculation look like on paper if you use all of the variables (except `"Class"`) as predictors?