In [None]:
from datascience import * # from this library, import all functions
from math import *
import numpy as np
import pandas as pd

import matplotlib
matplotlib.use('Agg', warn=False)
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Coding review: pandas

In [None]:
## Import the data
ces = pd.read_csv("calenviroscreen.csv")

In [None]:
## Q1: Inspect the data
# head, tail, describe, shape
...

In [None]:
# Tracts with a pollution score of 75-100 represent "disadvantaged communities" under SB-535
# Higher scores = greater pollution burden, population vulnerability
# Q2: Visualize the data
...

In [None]:
# Q3: Using a for loop, create an array to label all disadvantaged communities 
# If the score is between 75-100, give it the label "At Risk"
# Otherwise, "Not at Risk"
scores = ces.ces_pollution_score # access the data as a series

...

for ...:
    ...

In [None]:
## Q4: Add the labels back onto the ces table as the column name "class"
# Try using bracket notation 
# or using df.insert(loc = ..., column = ..., value = ...)
...

In [None]:
# Q5: how many census tracts are at risk? 
# use df.groupby("Group")["Column"].func()
# (placeholder: replace the Ellipsis below with your answer)
...

In [None]:
## Note: although this is informative, it's not great for a K-NN classifier 
# because we usually want decent representation of all classes.
# For purposes of today's exercise, let's say that "At risk" refers to a score of
# 50-100 just so we have good representation of both groups

ces = pd.read_csv("calenviroscreen.csv")

def add_labels(df):
    """Label every row of ``df`` by its ``ces_pollution_score`` and return ``df``.

    A score in the closed range [50, 100] is labelled "At risk"; anything else
    is labelled "Not at risk". The labels are inserted in place as a new first
    column named "Class", and the same (mutated) DataFrame is returned.
    """
    labels = []
    for score in df["ces_pollution_score"]:
        in_range = 50 <= score <= 100
        labels.append("At risk" if in_range else "Not at risk")
    df.insert(0, "Class", labels)
    return df

new_ces = add_labels(ces)
new_ces.groupby("Class")["census_tract"].count()

In [None]:
## Much better! Q6: let's export this to work with in datascience
# remember to remove the indices!
...

## Building a K-NN classifier

In [None]:
# Table.read_table is a classmethod, so call it on the class itself instead of
# building a throwaway empty Table() first.
ces_labeled = Table.read_table("ces_labeled.csv").drop("ces_pollution_score") # dropping the aggregate
ces_labeled.show(5)
ces_labeled.num_rows

## Step 1: Splitting the dataset

In [None]:
## It looks like our data is ordered by pollution score. We want to make sure
# we get representative samples for both the test set and training set 
# so they both properly represent the population.
# Q1: Permute the dataset and then create a separate training and test set by an 80/20 ratio. 
num_train_rows = int(ces_labeled.num_rows * 0.8)
num_test_rows = int(ces_labeled.num_rows - num_train_rows)

ces_shuffled = ...
ces_train = ...
ces_test = ...

In [None]:
## Let's check if the ratios are relatively the same
# Group each split by its first column (the class label), then show each
# class's share of the rows in that split.
grouped_train = ces_train.group(0)
train_counts = grouped_train.column("count")
grouped_train.with_column("prop", train_counts / sum(train_counts)).show()

grouped_test = ces_test.group(0)
test_counts = grouped_test.column("count")
grouped_test.with_column("prop", test_counts / sum(test_counts)).show()

## Step 2: Choosing the features/attributes

In [None]:
## The list of our attributes is below:
ces_train.labels[1:]

In [None]:
## In general, we want traits that differentiate the 2 groups relatively well
def create_scatter_group(attrib1, attrib2):
    """Scatter-plot ``attrib1`` against ``attrib2`` for 250 sampled training rows.

    Points are grouped by the first column of ``ces_train`` (``group = 0``).
    """
    sampled_rows = ces_train.sample(250)
    sampled_rows.scatter(attrib1, attrib2, group = 0)

create_scatter_group("poverty", "ozone")

In [None]:
# Another fun thing - interactive widgets!
# Choose traits you want to compare
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

interact(create_scatter_group, attrib1 = list(ces_train.labels[2:]), attrib2 = list(ces_train.labels[2:]));

In [None]:
## One issue you may have noticed is the wide range of values for all of the variables used
# This isn't good! In K-NN, we want to make sure we aren't weighting one attribute higher than another.
# Let's fix this by converting to standard units. 
# Q: Complete the function:

def standard_units(arr):
    """Convert the array ``arr`` to standard units.

    Exercise placeholder: replace the Ellipsis below with your implementation.
    ``norm_tbl`` calls this once per quantitative column of a table.
    """
    return ...

In [None]:
## This function will use your std units function to normalize all quant. vars.
def norm_tbl(tbl):
    """Return a copy of ``tbl`` with every column from index 2 onward in standard units.

    The first two columns are carried over unchanged. Each later column is
    passed through ``standard_units`` and stored under its original label.
    """
    normalized = tbl.select(0, 1, 2)
    quantitative_labels = tbl.labels[2:]
    for label in quantitative_labels:
        in_standard_units = standard_units(tbl.column(label))
        normalized = normalized.with_column(label, in_standard_units)
    return normalized

norm_train = norm_tbl(ces_train)
norm_test = norm_tbl(ces_test)

norm_train.show(5)

In [None]:
# Notice the data looks relatively the same, but now on different scales
norm_train.sample(200).scatter("poverty", "ozone", group = 0)

In [None]:
# I think that it makes the most sense to use the environmental/population features instead of demographic
# (although it looks like race is correlated with class)
our_attributes = list(ces_train.labels[9:])

# Take the features from the *normalized* tables: the distance computation should not be
# dominated by whichever attribute happens to have the largest raw scale.
# norm_tbl keeps rows in the same order as its input, so class labels can still be
# looked up by position in ces_train / ces_test.
train_atts = norm_train.select(our_attributes)
test_atts = norm_test.select(our_attributes)
train_atts

## Step 3: Calculating distance

In [None]:
# Normally, we would take the Euclidean distance between your unknown and every row in the training set
# i.e. np.sqrt((x1 - x2)**2 + (y1 - y2)**2 +(z1 - z2)**2)
# but that's very computationally expensive.
# We've written a function for you below, which uses row objects
test_atts.row(0)

In [None]:
def fast_distances(test_row, train_table):
    """Return an array of the distances between test_row and each row in train_table.

    Takes 2 arguments:
      test_row: A row of a table containing the features of one
        test example (e.g., test_atts.row(0)). Any sequence of numbers with
        one value per column of train_table works.
      train_table: A table of features (for example, the whole
        table train_atts). Only its ``num_columns`` and ``columns``
        attributes are used.

    Returns a 1-D numpy array with one Euclidean distance per training row.
    A tiny deterministic noise term (< 1e-10) is added to each distance so that
    exact ties are broken the same way on every call.
    """
    assert train_table.num_columns < 50, "Make sure you're not using all the features of the table."
    # Stack the columns and transpose so each row of the matrix is one training example.
    train_matrix = np.asarray(train_table.columns).transpose()
    # Broadcasting subtracts the test row from every training row at once.
    diff = np.asarray(list(test_row)) - train_matrix
    distances = np.sqrt(np.square(diff).sum(axis=1))
    # Use a private generator seeded with 0 rather than np.random.seed(0): this yields
    # the same tie-breaking noise without resetting the global random state (which would
    # silently make every later random sample/shuffle in the notebook deterministic).
    rng = np.random.RandomState(0)
    eps = rng.uniform(size=distances.shape) * 1e-10 # Noise for tie break
    return distances + eps

In [None]:
practice_unknown = test_atts.row(0)
print("The class of this row in the test set is " + ces_test.row(0)[0])

In [None]:
# We can take that array of distances and add it back onto the training table
# and then sort to find the "nearest" neighbors
# Q: Find the distances between the practice and all rows in the training set (train_atts) 
# Then, add those distances to the table with the labels: ces_train 
# (placeholders: replace each Ellipsis below with your answer)
practice_distances = ...
train_with_dist = ...
train_with_dist.show(5)

## Step 4: classifying individuals

In [None]:
## Now that we have the table with distances, we can classify an individual 
# by taking a majority vote from the K-nearest neighbors
# let's say k = 15

# Q: Classify the practice row using k=15. What is the "majority" vote?



## Step 5: testing the accuracy of our classifier

In [None]:
# To get a general idea of the accuracy of the classifier, we can do Steps 3-4 for all rows in the test set
# and see how many are correct.

def classify_one(row, k=15):
    """Classify one row of features by majority vote among its k nearest training rows.

    Distances from ``row`` to every row of ``train_atts`` are attached to
    ``ces_train`` as a "Distance" column. The k closest rows are grouped by
    their first column, and the value with the highest count is returned.
    """
    distances = fast_distances(row, train_atts)
    ranked = ces_train.with_column("Distance", distances).sort("Distance", descending = False)
    nearest = ranked.take(np.arange(k))
    votes = nearest.group(0).sort(1, descending = True)
    return votes.column(0).item(0)

classify_one(test_atts.row(0))

In [None]:
our_guesses = test_atts.apply(classify_one)
actual_classes = ces_test.column(0)

In [1]:
## Q: Calculate the proportion of "correct" guesses


In [None]:
# Is this more accurate than just blindly guessing "Not at risk"?
