In [1]:
# Copyright Daniel Reznikov, Yunfan Yang all rights reserved
# Final Project COGS 118a, Winter 2016
# Notebook Implements K-Nearest-Neighbors Classifier on breast_cancer dataset

In [2]:
import csv
import numpy as np
from numpy import genfromtxt
import theano
import lasagne
import nolearn
from nolearn.lasagne import NeuralNet
import math
import random
import scipy.io as sio
from sklearn.grid_search import GridSearchCV

In [3]:
# Import data
data = genfromtxt('breast_cancer.csv', delimiter=',')
data = data.astype('int32')

# Randomly select 80% of the dataset as your training set and the rest 20% as your testing set.
xTrain = []
xTest = []
yTrain = []
yTest = []

sampleSize = int(.8*len(data))
xTrain = data[0:sampleSize, 0:10]
yTrain = data[0:sampleSize, -1]
xTest = data[sampleSize+1:len(data), 0:10]
yTest = data[sampleSize+1:len(data), -1]

for i in range(len(yTest)):
    if yTest[i] == 2.0:
        yTest[i] = -1
    elif yTest[i] == 4.0:
        yTest[i] = 1
    else:
        print 'WTF'
        
for i in range(len(yTrain)):
    if yTrain[i] == 2.0:
        yTrain[i] = -1
    elif yTrain[i] == 4.0:
        yTrain[i] = 1
    else:
        print 'WTF'
    
xTest = tuple(map(tuple, xTest))
yTest = tuple(yTest)
xTrain = tuple(map(tuple, xTrain))
yTrain = tuple(yTrain)

print sampleSize

print len(xTest)
print len(yTest)
print len(xTrain)
print len(yTrain)

559
139
139
559
559


In [4]:
#Lets do some KNN

# Helper method, returning the Euclidean distance between two points
# L2 NORM
def dist(testPoint, trainPoint):
    sum = 0
    for index in range(len(testPoint)):
        sum += math.pow( trainingPoint[index] - testPoint[index], 2 )
    return math.sqrt(sum)

In [5]:
# neighborDistance is a matrix. Rows test points, colums are distance to each training point
neighborDistance = [[0 for x in range(len(xTrain))] for x in range(len(xTest))]

# Compute Euclidean Distance from each test point, to each training point
# neighbor distance is matrix of testing points by training points
# where i,j corresponds to distance between ith test pt and jth training pt 
testCntr = 0
for testPoint in xTest:
    trainingCntr = 0
    for trainingPoint in xTrain:
        neighborDistance[testCntr][trainingCntr] = (dist(testPoint, trainingPoint), yTrain[trainingCntr])
        trainingCntr += 1
    neighborDistance[testCntr].sort(key=lambda distance: distance[0])
    testCntr += 1
    
for nD in neighborDistance:
    del nD[0]
 
print len(neighborDistance)


139


In [6]:
#  Decide Label, Compute Error, Score
testScore = [0,0]
for k in range(50):
    testCntr = 0
    testErrorCntr = 0
    for testPoint in xTest:
        labelSum = 0 
        for i in range(0, k):
            labelSum += neighborDistance[testCntr][i][1] ##calls the label
        label = np.sign(labelSum)
        if (label != yTest[testCntr]):
            testErrorCntr += 1
        testCntr += 1
    currTestScore = (1- float(testErrorCntr)/float(testCntr))
    if (currTestScore > testScore[0]):
        testScore[0] = currTestScore
        testScore[1] = int(k)
    print 'For K: ' + str(k) + " score is: " + str(1- float(testErrorCntr)/float(testCntr))
        
print ("best K: " + str(testScore[1]))
print ("Testing Score" , str(testScore[0]))

For K: 0 score is: 0.0
For K: 1 score is: 0.618705035971
For K: 2 score is: 0.510791366906
For K: 3 score is: 0.669064748201
For K: 4 score is: 0.58273381295
For K: 5 score is: 0.690647482014
For K: 6 score is: 0.618705035971
For K: 7 score is: 0.697841726619
For K: 8 score is: 0.654676258993
For K: 9 score is: 0.726618705036
For K: 10 score is: 0.697841726619
For K: 11 score is: 0.755395683453
For K: 12 score is: 0.726618705036
For K: 13 score is: 0.726618705036
For K: 14 score is: 0.712230215827
For K: 15 score is: 0.741007194245
For K: 16 score is: 0.726618705036
For K: 17 score is: 0.755395683453
For K: 18 score is: 0.705035971223
For K: 19 score is: 0.748201438849
For K: 20 score is: 0.726618705036
For K: 21 score is: 0.784172661871
For K: 22 score is: 0.726618705036
For K: 23 score is: 0.748201438849
For K: 24 score is: 0.719424460432
For K: 25 score is: 0.741007194245
For K: 26 score is: 0.73381294964
For K: 27 score is: 0.741007194245
For K: 28 score is: 0.726618705036
For K: 2