In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# np.array(list) converts a list to an array; the elements should all be
# of the same type (here, strings).

n = 100
second = round(n * 0.6)
third = round(n * 0.4)

year = np.array(['Second'] * second + ['Third'] * third)

# Within each year the Undeclared count is "whatever is left after the
# Declared students", so the two counts always add up to the size of the year.
# Rounding both shares independently can overshoot (for a year of 3 students,
# round(1.5) + round(1.5) == 4), which would make major longer than year.
major = np.array(['Declared'] * round(second * 0.5) +
                 ['Undeclared'] * (second - round(second * 0.5)) +
                 ['Declared'] * round(third * 0.8) +
                 ['Undeclared'] * (third - round(third * 0.8)))

# One row per student: the student's year and whether a major is declared.
students = Table().with_columns(
    'Year', year,
    'Major', major
)

In [None]:
# Load ckd.csv, shorten the 'Blood Glucose Random' label to 'Glucose', and keep
# only the three attribute columns used below together with the 'Class' label.
ckd = (
    Table.read_table('ckd.csv')
    .relabeled('Blood Glucose Random', 'Glucose')
    .select('Glucose', 'Hemoglobin', 'White Blood Cell Count', 'Class')
)

## Classification ##

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    pt1, pt2 : array-like of numbers
        Coordinates of the two points, of matching shape. NumPy arrays,
        lists and tuples are all accepted.

    Returns
    -------
    numpy.float64
        Square root of the sum of the squared coordinate differences
        (for one-dimensional points).
    """
    # np.asarray lets plain sequences work too: list - list is a TypeError.
    diff = np.asarray(pt1) - np.asarray(pt2)
    # Vectorized reduction instead of the builtin sum's Python-level loop;
    # axis=0 keeps the builtin's "add up along the first axis" behaviour.
    return np.sqrt(np.sum(diff ** 2, axis=0))

def row_distance(row1, row2):
    """Distance between two table rows whose entries are all numerical.

    Both rows are converted to arrays and handed to distance().
    """
    first_point = np.array(row1)
    second_point = np.array(row2)
    return distance(first_point, second_point)

def distances(training, example):
    """Compute the distance between example and every row in training.

    Parameters
    ----------
    training : Table
        Table with a 'Class' column; all remaining columns are treated as
        the attributes that are compared with example.
    example : row or array-like of numbers
        Attribute values of the point to compare against; expected to line
        up with training's non-'Class' columns.

    Returns
    -------
    Table
        training augmented with a 'Distance' column holding, for each row,
        the row_distance between that row's attributes and example.
    """
    attributes = training.drop('Class')
    # Collect all distances first and build the array once: calling
    # np.append inside the loop copies the whole array on every iteration.
    row_distances = np.array(
        [row_distance(row, example) for row in attributes.rows]
    )
    return training.with_column('Distance', row_distances)

def closest(training, example, k):
    """Return a table holding the k rows of training nearest to example."""
    with_distances = distances(training, example)
    nearest_first = with_distances.sort('Distance')
    first_k = np.arange(k)
    return nearest_first.take(first_k)

def majority_class(topk):
    """Return the 'Class' value that occurs most often in topk."""
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 of the grouped table holds the class labels.
    labels = most_common_first.column(0)
    return labels.item(0)

def classify(training, example, k):
    """Predict the class of example by a vote of its k nearest neighbors.

    The neighbors are the k rows of training closest to example; the class
    that appears most often among them is returned.
    """
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

## Evaluating the Classifier ##

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the fraction of rows in test that the classifier gets right.

    Each test row's attributes (everything except 'Class') are classified
    against training using k nearest neighbors, and the prediction is
    compared with that row's actual 'Class' value.
    """
    features = test.drop('Class')
    hits = 0
    for idx in np.arange(test.num_rows):
        predicted = classify(training, features.row(idx), k)
        actual = test.column('Class').item(idx)
        # A correct prediction adds 1 to the tally, an incorrect one adds 0.
        hits += (predicted == actual)
    return hits / test.num_rows

## Banknotes: Fraudulent or Real? ##

In [None]:
# Read banknote.csv into a table and discard its 'Entropy' column.
banknotes = Table.read_table('banknote.csv').drop('Entropy')

In [None]:
# Display the banknotes table.
banknotes

In [None]:
# Every column of banknotes except 'Class'; previewed with show(3).
attributes = banknotes.drop('Class')
attributes.show(3)

In [None]:
# Pull out row 123 of the attributes to use as a single example, and display it.
example123 = attributes.row(123)
example123

In [None]:
# Classify example123 with k = 5, training on banknotes.exclude(123)
# (presumably the table without row 123, so that the example cannot be
# one of its own neighbors — confirm against the datascience docs).
classify(banknotes.exclude(123), example123, 5)

In [None]:
# Look at row 123 of the full table, which still includes its 'Class' value.
banknotes.take(123)

In [None]:
# Number of rows in banknotes.
banknotes.num_rows

In [None]:
# Shuffle the rows, then split them: the first 686 rows become the training
# set and rows 686 through 1371 become the test set.
# NOTE(review): sample(with_replacement=False) with no size presumably returns
# every row in random order, and 1372 is presumably banknotes.num_rows — confirm.
# No random seed is set in the visible cells, so the split presumably differs
# between runs.
shuffled = banknotes.sample(with_replacement=False) 
training_set = shuffled.take(np.arange(686))
test_set  = shuffled.take(np.arange(686, 1372))

In [None]:
# Proportion of test_set rows classified correctly using k = 1.
evaluate_accuracy(training_set, test_set, 1)

## Chronic Kidney Disease, or Not? ##

In [None]:
# Display the ckd table.
ckd

In [None]:
def standard_units(x):
    """Convert the values in x to standard units.

    Each value is expressed as its deviation from np.average(x), measured
    in multiples of np.std(x).
    """
    center = np.average(x)
    spread = np.std(x)
    return (x - center) / spread

In [None]:
# Build a table holding the class label plus each attribute converted to
# standard units. with_columns (plural) is the Table method that takes
# several label/values pairs in one call; with_column adds a single column.
ckd_new = ckd.select('Class').with_columns(
    'Glucose_su', standard_units(ckd.column('Glucose')),
    'Hemoglobin_su', standard_units(ckd.column('Hemoglobin')),
    'WBC_su', standard_units(ckd.column('White Blood Cell Count'))
)

In [None]:
# Display the ckd_new table.
ckd_new

In [None]:
# Shuffle ckd_new, then split it: the first 74 rows become the training set
# and rows 74 through 147 become the test set.
# NOTE(review): rows at index 148 and beyond, if ckd_new has any, end up in
# neither set — confirm against ckd_new.num_rows.
shuffled = ckd_new.sample(with_replacement=False) 
training_set = shuffled.take(np.arange(74))
test_set  = shuffled.take(np.arange(74, 148))

In [None]:
# Proportion of test_set rows classified correctly using k = 3.
evaluate_accuracy(training_set, test_set, 3)

In [None]:
# Make the same 74 / 74 split, this time from ckd itself, whose attribute
# columns have not been converted to standard units.
shuffled = ckd.sample(with_replacement=False) 
training_set = shuffled.take(np.arange(74))
test_set  = shuffled.take(np.arange(74, 148))

In [None]:
# Accuracy with k = 3 on the current training_set / test_set split.
evaluate_accuracy(training_set, test_set, 3)

## More Likely Than Not ##

In [None]:
# Preview the students table with show(3).
students.show(3)

In [None]:
# Cross-tabulate the students by 'Major' and 'Year'
# (presumably a count for every Year / Major combination — confirm).
students.pivot('Major', 'Year')

In [None]:
# The same 'Major' by 'Year' pivot again, shown next to the conditional
# probability exercises that follow.
students.pivot('Major', 'Year')

In [None]:
# Chance of third year, given that they have declared
# P(third year | declared)



In [None]:
# P(second year | declared)



In [None]:
# P(second year | declared), from tree diagram
