# Problem 1: Data with R

In [2]:
# Load the rpy2 IPython extension so that the %%R cells below can run.
# The rpy2 package must be installed in the kernel's environment; without it
# this line fails and the %%R cell magic is not available.
%load_ext rpy2.ipython

ModuleNotFoundError: No module named 'rpy2'

In [None]:
%%R

# Attach the tidyverse; the read_csv and ggplot calls in the following cells rely on it.
library(tidyverse)

In [1]:
%%R
# Place classification_data.csv in the same folder as this notebook file;
# the file is referenced by a bare relative name, so it must be findable from there.


# Read the CSV into `dat` for the plotting cell that follows.
dat <- read_csv('classification_data.csv')

UsageError: Cell magic `%%R` not found.


In [None]:
%%R

# Your data consists of an "X" column and a "label" column
# Use ggplot to plot a kernel density estimate of each label (overlaid on the same plot)
# hint: Don't forget to make sure the labels are factors!
# hint hint: Read the errors carefully! 
# Read the documentation for parse_factor (readr) if confused!

# Convert label to a factor so ggplot treats it as a discrete grouping
# variable rather than a continuous number.
dat$label <- factor(dat$label)

# One semi-transparent density curve per label, drawn on shared axes.
# Title, axis and legend labels are set so the figure can be read on its own.
ggplot(dat, aes(x = X, fill = label, group = label)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Kernel density estimate of X by label",
    x = "X",
    y = "Density",
    fill = "Label"
  )

# Problem 2: Using Classes in Python

In this problem, you will use the classes in the following cell. **You do not need to modify the classes or touch the code in the following cell in any way!**

In [3]:
from abc import ABC, abstractmethod
from math import sqrt

def smart_zip(a, b):
    """Zip ``a`` with ``b``, falling back to a single pair for non-iterables.

    When ``zip(a, b)`` raises ``TypeError`` (e.g. both arguments are plain
    numbers), each argument is wrapped in a one-element list first, so the
    result yields exactly one ``(a, b)`` pair.
    """
    try:
        pairs = zip(a, b)
    except TypeError:
        # At least one argument cannot be iterated: treat both as scalars.
        pairs = zip([a], [b])
    return pairs

class AbstractKNeighbors(ABC):
    """Base class for predictors that rank stored training points by distance.

    Subclasses implement ``_make_prediction`` to turn the distance-ordered
    training labels into a single prediction.
    """

    def __init__(self, K):
        """Store ``K`` on the instance for use by ``_make_prediction``."""
        self.K = K

    def fit(self, X, y):
        """Store the training data and return the model itself.

        X should be a list of data points
        y should be a list of labels
        """
        self.X, self.y = X, y
        return self

    @abstractmethod
    def _make_prediction(self, labels):
        """Reduce ``labels`` (ordered nearest to furthest) to a prediction."""

    def predict(self, new_x):
        """Rank the training labels by distance to ``new_x`` and predict.

        new_x should be a single data point
        """
        # Euclidean distance from new_x to every stored training point.
        distances = []
        for point in self.X:
            squared_diffs = [(p - q) ** 2 for p, q in smart_zip(point, new_x)]
            distances.append(sqrt(sum(squared_diffs)))

        # Training indices by increasing distance; sorting is stable, so
        # equally distant points keep their original relative order.
        ranked = sorted(range(len(distances)), key=distances.__getitem__)
        ordered_labels = [self.y[idx] for idx in ranked]
        return self._make_prediction(ordered_labels)


class KNearestNeighbors(AbstractKNeighbors):
    """Predicts from the labels of the ``K`` closest training points."""

    def _make_prediction(self, labels):
        """Return the rounded mean of the first ``self.K`` labels.

        ``labels`` arrives ordered nearest-first from ``predict``.
        """
        nearest = labels[:self.K]
        return round(sum(nearest) / self.K)

class KFurthestNeighbors(AbstractKNeighbors):
    """Predicts from the labels of the ``K`` most distant training points."""

    def _make_prediction(self, labels):
        """Return ``round(1 - mean)`` over the last ``self.K`` labels.

        ``labels`` arrives ordered nearest-first from ``predict``, so the
        tail holds the furthest points. Presumably labels are 0/1, making
        ``1 - mean`` the inverted vote -- TODO confirm.
        """
        furthest = labels[-self.K:]
        mean_label = sum(furthest) / self.K
        return round(1 - mean_label)

In [74]:
from csv import reader

# newline='' is what the csv module documentation asks for when opening a
# file for a reader, so line endings inside quoted fields are handled correctly.
with open('classification_data.csv', 'r', newline='') as f:
    rows = reader(f)
    next(rows, None)  # drop the header row (no error if the file is empty)
    # Parse each record as [X as float, label as int]. csv.reader yields []
    # for blank lines, so those are filtered out instead of breaking the
    # two-value unpacking.
    dat = [[float(x), int(label)] for x, label in filter(None, rows)]

## Problem 2.1: Shuffling!

In [75]:
# In your data, "X" is a data point that is nothing more than
# a single number. 
# Shuffle your data into a random order (use random.shuffle!)

import random

# Seed the generator so that a fresh "Restart & Run All" always produces the
# same ordering, and therefore the same train/test split and accuracies.
random.seed(42)
random.shuffle(dat)


In [76]:
# If you shuffled your data, this test should pass
# (i.e. not throw an error)

# Add up the labels of the first 50 rows; the check fails when that total is 0
# (presumably meaning no positive labels near the top -- assumes 0/1 labels).
first_fifty_label_total = sum(label for _, label in dat[:50])
assert first_fifty_label_total != 0

## Problem 2.2: Splitting!

In [77]:
# Split your data, which is now a list, into 2 sublists:
# "train" and "test"
# The "train" group should have 700 elements
# The test group should have 300 elements
# Each group should have the same format as the original data

TRAIN_SIZE = 700
TEST_SIZE = 300

# The first TRAIN_SIZE shuffled rows train the models; the rest are held out.
train = dat[:TRAIN_SIZE]
test = dat[TRAIN_SIZE:]

# Fail loudly if the data file does not have the expected number of rows.
assert len(train) == TRAIN_SIZE, len(train)
assert len(test) == TEST_SIZE, len(test)

In [47]:
# Now you will need to make another split, within the groups!
# For each group ("train" and "test") split the X's from the labels.

# Each row is [x, label]: column 0 is the data point, column 1 its label.
# Re-run this cell whenever `train` / `test` are rebuilt, otherwise these
# lists keep describing the previous split.
train_x, train_labels = [], []
for row in train:
    train_x.append(row[0])
    train_labels.append(row[1])

test_x, test_labels = [], []
for row in test:
    test_x.append(row[0])
    test_labels.append(row[1])

## Problem 2.3: Testing models!

In [78]:
# For each model: 
# 1. Create an instance of the class, with constructor parameters: K=5
# 2. Train the instance on the "train" groups X's and labels (y's)
# 3. Test how well the instance does: 
#    A. Use the trained instance to predict the label of each "X" in the "test" group
#    B. Use your "test" labels to see if the predicted label is equal to the true label

N_NEIGHBORS = 5

# 1-2. Build one model of each kind and train both on the same training split.
# fit() returns the model itself, so construction and training can be chained.
near = KNearestNeighbors(N_NEIGHBORS).fit(train_x, train_labels)
fur = KFurthestNeighbors(N_NEIGHBORS).fit(train_x, train_labels)

# 3A. Predict a label for every held-out data point.
predict_near = [near.predict(x) for x in test_x]
predict_fur = [fur.predict(x) for x in test_x]

print(len(predict_near))
print(len(predict_fur))
print(len(test_labels))

# 3B. One boolean per test point: True where the prediction equals the true
# label. A length mismatch would mean test_x and test_labels are out of sync.
assert len(predict_near) == len(predict_fur) == len(test_labels)
results_near = [pred == truth for pred, truth in zip(predict_near, test_labels)]
results_fur = [pred == truth for pred, truth in zip(predict_fur, test_labels)]

print(len(results_near))
print(len(results_fur))


300
300
300
300
300


In [79]:
# Compare the two classes of models!

# Accuracy is the share of test points predicted correctly; each result is a
# boolean, and True counts as 1 when summed.
# NOTE(review): the recorded output shows the same accuracy for both models,
# which looks surprising -- confirm the inputs are from the current split.
accuracy_near = sum(results_near) / len(results_near)
print(accuracy_near)
accuracy_fur = sum(results_fur) / len(results_fur)
print(accuracy_fur)

0.4766666666666667
0.4766666666666667
