In [None]:
import matplotlib
#matplotlib.use('Agg')
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import math
import scipy.stats as stats
plt.style.use('fivethirtyeight')

# Classifying Iris colors

We'll be classifying iris (the colored part of the eye) based on features of the iris sepal and petal. (We don't know what these are exactly either - feel free to do your own research!)

## Don't forget to _run_ all code cells!

In [None]:
# read iris.csv into a table
iris = Table.read_table('iris.csv')
# lookup table pairing each species name with a color name
color_table = Table().with_columns(
    'species', make_array('setosa','versicolor','virginica'),
    'Color', make_array('darkblue', 'gold', 'lightblue')
)

# add color labels for graphing purposes by joining on 'species'
# NOTE(review): drop(6,7) removes columns by index - confirm which columns of iris.csv these are
iris = iris.join('species', color_table).drop(6,7)

# keep only the setosa and versicolor rows
blue = iris.where('species', are.equal_to('setosa'))
gold = iris.where('species', are.equal_to('versicolor'))

# setosa rows first, versicolor rows after them
iris = blue.append(gold)
iris.show(10)

#### Question 1 

List all the features and classes in the table above (you might use code to help you find all classes, but you can also just look through the rows and columns).

*Write your answer here, replacing this text.*

### Here we've plotted two scatter plots of the sepal and petal features for each species. Note that either of these sets of features will likely make for a good classifier, since there are clear decision boundaries on each plot.

In [None]:
# scatter plot of sepal_width against sepal_length, colored by the 'species' column
iris.scatter('sepal_length','sepal_width',colors ='species')

In [None]:
# scatter plot of petal_width against petal_length, colored by the 'species' column
iris.scatter('petal_length','petal_width',colors ='species')

#### Question 2

Now we have an unknown iris with sepal_length = 5.1 and sepal_width = 4.2. Can you classify it? Also, what about an unknown iris with petal_length = 3 and petal_width = 0.75?

*Write your answer here, replacing this text.*

### Let's examine the distances between the species attributes.
To do this, we will use the distance formula, and drop the columns with the iris labels from the table. Recall that setosa irises are at the top of the table and versicolor are at the bottom. 

We must drop the columns containing strings in order to compute numerical distances.

In [None]:
# start from the iris table without its 'Color' column
attributes = iris.drop('Color')
# Turns a species name into a binary class label

def is_one(x):
    """Returns 1 when x is 'setosa' and 0 for anything else"""
    return 1 if x == 'setosa' else 0

# we drop the sepal_length and sepal_width columns
# try going through the classification process again using all 4 features, or the other two!
# 'Class' is built by applying is_one to column 0 of iris; the 'species' column is removed afterwards
attributes = attributes.with_column('Class', iris.apply(is_one, 0)).drop('species').drop("sepal_length", "sepal_width")
attributes.show()


In [None]:
# the distance formula - no different than what you probably learned in middle school
def distance(point1, point2):
    """Returns the Euclidean distance between point1 and point2.

    Each argument holds the coordinates of one point. It may be a
    NumPy array or any other numeric sequence (list, tuple, table row);
    both are converted to float arrays before the subtraction so that
    plain Python sequences work as well as arrays.
    """
    coords1 = np.asarray(point1, dtype=float)
    coords2 = np.asarray(point2, dtype=float)
    # square root of the summed squared coordinate differences
    return np.sqrt(np.sum((coords1 - coords2) ** 2))

In [None]:
# same species
# compare feature columns only: the 0/1 'Class' label is not a coordinate
distance(np.array(attributes.drop('Class').row(0)), np.array(attributes.drop('Class').row(1)))

In [None]:
# different species
# compare feature columns only: including 'Class' would add the label difference to the distance
distance(np.array(attributes.drop('Class').row(0)), np.array(attributes.drop('Class').row(90)))

Notice that the distance between a setosa row and a versicolor row is much larger than that between two setosa rows.

## Implementing the Classifier

Take a look at the function implementations below. They will be similar to what you will use in Project 3, so make sure you understand the implementations!

There are no new table manipulations here.

In [None]:
def all_distances(training, new_point):
    """Computes the distance from new_point to every row of training.

    training is a table that includes a 'Class' column, which is left
    out of the computation; new_point is a row of attribute values.
    Returns an array of distances, one per training row."""
    target = np.array(new_point)
    features_only = training.drop('Class')
    # apply hands each row to the function, so rows are converted to arrays first
    return features_only.apply(lambda row: distance(target, np.array(row)))

def table_with_distances(training, new_point):
    """Returns the training table with an added 'Distance'
    column holding each row's distance from new_point"""
    # used by closest; the distances themselves come from all_distances
    dists = all_distances(training, new_point)
    return training.with_column('Distance', dists)

def closest(training, new_point, k):
    """Returns a table holding the k rows of training that have
    the smallest distances to new_point (with a 'Distance' column)"""
    # used by classify; builds on table_with_distances
    nearest_first = table_with_distances(training, new_point).sort('Distance')
    # the first k rows after sorting are the k nearest neighbors
    return nearest_first.take(np.arange(k))

def majority(topkclasses):
    """ Input: a table with a 'Class' column of 0/1 labels
    Returns 1 if rows labeled 1 outnumber rows labeled 0, otherwise 0
    """
    num_ones = topkclasses.where('Class', are.equal_to(1)).num_rows
    num_zeros = topkclasses.where('Class', are.equal_to(0)).num_rows
    # a tie falls through to 0
    return 1 if num_ones > num_zeros else 0

def classify(training, new_point, k):
    """Classifies new_point by majority vote among
    its k nearest neighbors in the training table"""
    # closest finds the neighbors; majority votes on their 'Class' labels
    neighbors = closest(training, new_point, k)
    return majority(neighbors.select('Class'))

Let's play around with these functions a little.

To see how this works, we can find the five nearest neighbors of the first iris in our data. Since this iris is part of the dataset, it is its own nearest neighbor. So we should expect to see it at the top of the list, followed by four others.

In [None]:
# run this cell
# attributes no longer has 'species' or 'Color' columns, so only 'Class'
# needs to be removed to get the feature values of the first iris
eyeball = attributes.drop('Class').row(0)
eyeball

In [None]:
# note that we are using the attributes table, rather than iris, and that the closest points are also 1's
# recall that 1 = setosa
closest(attributes, eyeball, 5)

### Use the train-test split code we wrote in the worksheet to split the `attributes` table (built from `iris`) into a training and test set.

In [None]:
shuffled_data = attributes.sample(with_replacement=False) #randomly shuffle
# NOTE(review): only 20% of the rows go to the training set - confirm this proportion is intended
training_size_proportion = .2
# count the rows of the table that is actually being split
dataset_size = shuffled_data.num_rows
x = int(np.round(training_size_proportion * dataset_size))   # don't worry about the int(np.round) part!
# first x shuffled rows are used for training, the remaining rows for testing
training_set = shuffled_data.take(np.arange(x))
testing_set = shuffled_data.take(np.arange(x,dataset_size))

In [None]:
# display the training set; note the shuffling of classes (the 'Class' labels are no longer grouped)
training_set

## Running the Classifier

We want to verify the accuracy of our classifier, so we define a series of functions that will look at where two arrays are the same and will tell us how many predictions we got correct. 

`count_zero` and `count_equal` work together to show the number of instances in 2 arrays where they have the same corresponding elements. If `array1` and `array2` have the same element at index 0, then `(array1 - array2).item(0)` will be 0. 
This is how we check if our classifier predicted correctly. 

In [None]:
def count_zero(array):
    """Returns how many entries of array are equal to 0"""
    nonzero_entries = np.count_nonzero(array)
    # whatever is not nonzero must be zero
    return len(array) - nonzero_entries

def count_equal(array1, array2):
    """Given two numerical arrays of equal length, returns
    the number of positions at which their entries match"""
    # matching entries subtract to 0, so counting zeros counts matches
    differences = array1 - array2
    return count_zero(differences)

def evaluate_accuracy(training, test, k):
    """Checks the test set's labels against the predictions
    of a k-nearest-neighbor classifier that uses the training table.
    Each test row is classified with its 'Class' label removed,
    and the proportion of correct predictions is returned"""
    actual = test.column('Class')
    unlabeled = test.drop('Class')  # test rows without their labels
    # classify every unlabeled test row against the training table
    predicted = unlabeled.apply(lambda row: classify(training, row, k))
    return count_equal(predicted, actual) / test.num_rows

In [None]:
# proportion of testing_set rows classified correctly, using k = 5 neighbors from training_set
evaluate_accuracy(training_set, testing_set, 5)

Wow! Looks like our classifier was really accurate. Why is this?

*Write your answer here, replacing this text.*

Remember that we only used `petal_length` and `petal_width` in this classifier. Try running it again using all 4 features or just the `sepal_length` and `sepal_width`.

*Hint*: to do this, you only need to change 1 line of code in the setup of the `attributes` table.