In [4]:
Pkg.add("Images")
Pkg.add("DataFrames")
using Images
using DataFrames

#typeData can be either "train" or "test".
#labelsInfo should contain the IDs of each image to be read.
#The images in the trainResized and testResized data files
#are 20x20 pixels, so imageSize is set to 400.
#path should be set to the location of the data files.

function read_data(typeData, labelsInfo, imageSize, path)
 #Intialize x matrix
 x = zeros(size(labelsInfo, 1), imageSize)

 for (index, idImage) in enumerate(labelsInfo["ID"]) 
  #Read image file 
  nameFile = "$(path)/$(typeData)Resized/$(idImage).Bmp"
  img = imread(nameFile)

  #Convert img to float values 
        temp = float32sc(img)

  #Convert color images to gray images
  #by taking the average of the color scales. 
  if ndims(temp) == 3
   temp = mean(temp.data, 1)
  end
    
  #Transform image matrix to a vector and store 
  #it in data matrix 
  x[index, :] = reshape(temp, 1, imageSize)
 end 
 return x
end

#Number of values stored per image; passed to read_data as the row length.
imageSize = 400 # 20 x 20 pixel

#Set location of data files, folders
#NOTE(review): this value ends with "/" and every use below appends
#"/<file>" again, so the resulting names contain "//" (see the path
#printed in the error output of this cell).
path = "/Users/ejaramos/Dropbox/_Kaggle WIP/Ramos/source-code-files/ipython-iJulia/"


#Read information about training data , IDs.
#The "ID" column is used by read_data and the "Class" column for the labels.
labelsInfoTrain = readtable("$(path)/trainLabels.csv")

#Read training matrix (one row per training image at this point)
xTrain = read_data("train", labelsInfoTrain, imageSize, path)

#Read information about test data ( IDs ).
#The same table is later filled with predictions and written out.
labelsInfoTest = readtable("$(path)/sampleSubmission.csv")

#Read test matrix (one row per test image at this point)
xTest = read_data("test", labelsInfoTest, imageSize, path)

#Get only first character of string (convert from string to character).
#Apply the function to each element of the column "Class"
yTrain = map(x -> x[1], labelsInfoTrain["Class"])

#Convert from character to integer
#(the voting code counts labels in a Dict{Int, Int})
yTrain = int(yTrain)

#Transpose both matrices so that each image becomes a column:
#the kNN functions read a data point as x[index, i] with i the column.
xTrain = xTrain'
xTest = xTest'

INFO: Nothing to be done
INFO: Nothing to be done


LoadError: LoadError: SystemError: opening file /Users/ejaramos/Dropbox/_Kaggle WIP/Ramos/source-code-files/ipython-iJulia//trainLabels.csv: No such file or directory
while loading In[4], in expression starting on line 44

In [None]:
# Defining main functions

In [8]:
#Returns the sum of squared element-wise differences between a and b.
#No square root is taken, so the value is the squared Euclidean distance;
#the ordering of distances is the same either way.
#Only positions 1 to size(a, 1) are visited.
function euclidean_distance(a, b)
    total = 0.0
    nElements = size(a, 1)
    for position = 1:nElements
        delta = a[position] - b[position]
        total += delta * delta
    end
    return total
end

#This function finds the k nearest neighbors of the ith data point.
#x stores one data point per column (x[index, i] is element index of point i),
#i is the column of the query point and k the number of neighbors wanted.
#Returns the column indices of the k points closest to point i, nearest
#first. Point i itself is never part of the result. Asking for more
#neighbors than there are other points raises a BoundsError.
function get_k_nearest_neighbors(x, i, k)

    nRows, nCols = size(x)

    #Copy the ith data point into a Float32 vector once, so the main
    #X matrix is not indexed again for it inside the loop below.
    #zeros gives an initialized buffer instead of arbitrary memory.
    imageI = zeros(Float32, nRows)
    for index in 1:nRows
        imageI[index] = x[index, i]
    end

    #Scratch vector that receives the jth data point on every iteration
    imageJ = zeros(Float32, nRows)

    #distances[j] is the distance between the ith and the jth data point
    distances = zeros(Float32, nCols)

    for j in 1:nCols
        for index in 1:nRows
            imageJ[index] = x[index, j]
        end
        distances[j] = euclidean_distance(imageI, imageJ)
    end

    #Column indices ordered from the closest to the farthest point
    sortedNeighbors = sortperm(distances)

    #Leave the ith point out explicitly. Assuming it is always the first
    #entry of sortedNeighbors is wrong when another point is identical to
    #it (distance 0 as well): sortperm orders ties by index, so a duplicate
    #stored in an earlier column would come first and the point itself
    #would be returned as its own neighbor.
    otherNeighbors = filter(j -> j != i, sortedNeighbors)
    return otherNeighbors[1:k]
end

#This function assigns a label to the ith point according to
#the labels of the k nearest neighbors. The training
#data is stored in the X matrix, and its labels are stored in y.

#Arguments: x has one data point per column, y[n] is the label of column n,
#k is the number of neighbors that vote and i is the column being labelled.
#Returns the label that first reaches the highest vote count while the
#neighbors are visited in the order get_k_nearest_neighbors returns them
#(0 when there are no neighbors).
function assign_label(x, y, k, i)
    neighborIndices = get_k_nearest_neighbors(x, i, k)

    #Vote tally: label => number of neighbors seen so far with that label.
    #Int, Int tells the dictionary to expect integer keys and values.
    votes = Dict{Int, Int}()

    #Current winner and its number of votes
    winningLabel = 0
    winningVotes = 0

    for neighbor in neighborIndices
        label = y[neighbor]

        #A label not seen before starts from 0 votes
        tally = get(votes, label, 0) + 1
        votes[label] = tally

        #Strict comparison: on a tie the label that got there first is kept
        if tally > winningVotes
            winningVotes = tally
            winningLabel = label
        end
    end
    return winningLabel
end

assign_label (generic function with 1 method)

# Running LOOF-CV with 1NN sequentially

In [6]:
#Time the whole sequential run; toc() reports the elapsed time.
tic()
#Number of neighbors used for the prediction
k=1 
#Predict the label of every training column i from the other columns
yPredictions = [assign_label(xTrain, yTrain, k, i) for i in 1:size(xTrain, 2)]
#Fraction of predictions that match the true labels
loofCvAccuracy = mean(yPredictions .== yTrain) 
println("The LOOF-CV accuracy of 1NN is $(loofCvAccuracy)")
toc()

LoadError: LoadError: UndefVarError: xTrain not defined
while loading In[6], in expression starting on line 3

# Preparing Julia to run in parallel

In [7]:
#Start 3 additional worker processes. The functions below are defined
#with @everywhere so that each process has its own copy of them.
addprocs(3) 

#Returns the sum of squared element-wise differences between a and b.
#No square root is taken, so the value is the squared Euclidean distance;
#the ordering of distances is the same either way.
#Only positions 1 to size(a, 1) are visited.
@everywhere function euclidean_distance(a, b)
    total = 0.0
    nElements = size(a, 1)
    for position = 1:nElements
        delta = a[position] - b[position]
        total += delta * delta
    end
    return total
end

#x stores one data point per column (x[index, i] is element index of point i),
#i is the column of the query point and k the number of neighbors wanted.
#Returns the column indices of the k points closest to point i, nearest
#first. Point i itself is never part of the result. Asking for more
#neighbors than there are other points raises a BoundsError.
@everywhere function get_k_nearest_neighbors(x, i, k)
    nRows, nCols = size(x)

    #Copy the ith data point into a Float32 vector once.
    #zeros gives an initialized buffer instead of arbitrary memory.
    imageI = zeros(Float32, nRows)
    for index in 1:nRows
        imageI[index] = x[index, i]
    end

    #Scratch vector for the jth data point and one distance per column
    imageJ = zeros(Float32, nRows)
    distances = zeros(Float32, nCols)
    for j in 1:nCols
        for index in 1:nRows
            imageJ[index] = x[index, j]
        end
        distances[j] = euclidean_distance(imageI, imageJ)
    end

    #Column indices ordered from the closest to the farthest point
    sortedNeighbors = sortperm(distances)

    #Leave the ith point out explicitly. Assuming it is always the first
    #entry of sortedNeighbors is wrong when another point is identical to
    #it (distance 0 as well): sortperm orders ties by index, so a duplicate
    #stored in an earlier column would come first and the point itself
    #would be returned as its own neighbor.
    otherNeighbors = filter(j -> j != i, sortedNeighbors)
    return otherNeighbors[1:k]
end 

#Arguments: x has one data point per column, y[n] is the label of column n,
#k is the number of neighbors that vote and i is the column being labelled.
#Returns the label that first reaches the highest vote count while the
#neighbors are visited in the order get_k_nearest_neighbors returns them
#(0 when there are no neighbors).
@everywhere function assign_label(x, y, k, i)
    neighborIndices = get_k_nearest_neighbors(x, i, k)

    #Vote tally: label => number of neighbors seen so far with that label
    votes = Dict{Int, Int}()
    winningLabel = 0
    winningVotes = 0

    for neighbor in neighborIndices
        label = y[neighbor]

        #A label not seen before starts from 0 votes
        tally = get(votes, label, 0) + 1
        votes[label] = tally

        #Strict comparison: on a tie the label that got there first is kept
        if tally > winningVotes
            winningVotes = tally
            winningLabel = label
        end
    end
    return winningLabel
end

# Running LOOF-CV with 1NN in parallel

In [5]:
#Time the whole parallel run; toc() reports the elapsed time.
tic()
k = 1
#The iterations are distributed over the available processes. The value
#of each iteration (one predicted label) is combined with vcat, which
#yields one vector holding the prediction for every training column.
yPredictions = @parallel (vcat) for i in 1:size(xTrain, 2)
 assign_label(xTrain, yTrain, k, i)
end
#Fraction of predictions that match the true labels
loofCvAccuracy = mean(yPredictions .== yTrain) 
println("The LOOF-CV accuracy of 1NN is $(loofCvAccuracy)")
toc()

The LOOF-CV accuracy of 1NN is 0.42798026420499763
elapsed time: 45.727314347 seconds


45.727314347

In [6]:
#Same experiment as the previous cell, but the reduction adds up the
#results instead of collecting every prediction.
tic()
k = 1
#Each iteration yields true/false (prediction equals the true label);
#the (+) reduction sums them into the number of correct predictions.
sumValues = @parallel (+) for i in 1:size(xTrain, 2)
 assign_label(xTrain, yTrain, k, i) == yTrain[i, 1]
end
#Correct predictions divided by the number of training columns
loofCvAccuracy = sumValues / size(xTrain, 2)
println("The LOOF-CV accuracy of 1NN is $(loofCvAccuracy)")
toc()

elapsed time: 45.046256153 seconds


45.046256153

# Tuning the value for k

In [8]:
#Similar to function assign_label.
#Only changes are commented
#Arguments: x has one data point per column, y[n] is the label of column n,
#maxK is the largest number of neighbors to try and i is the column
#being labelled.
#Returns a 1 x maxK array whose kth entry is the winning label after the
#first k neighbors have voted.
@everywhere function assign_label_each_k(x, y, maxK, i)
    neighborIndices = get_k_nearest_neighbors(x, i, maxK)

    #One slot per value of k
    labelsPerK = zeros(Int, 1, maxK)

    #Vote tally: label => number of neighbors seen so far with that label
    votes = Dict{Int, Int}()
    winningLabel = 0
    winningVotes = 0

    #k counts how many neighbors have voted so far
    for (k, neighbor) in enumerate(neighborIndices)
        label = y[neighbor]

        #A label not seen before starts from 0 votes
        tally = get(votes, label, 0) + 1
        votes[label] = tally

        #Strict comparison: on a tie the label that got there first is kept
        if tally > winningVotes
            winningVotes = tally
            winningLabel = label
        end

        #Record the winner for this number of neighbors
        labelsPerK[k] = winningLabel
    end
    return labelsPerK
end



In [9]:
#Time the whole tuning run; toc() reports the elapsed time.
tic()
#Largest number of neighbors to evaluate.
#NOTE(review): each point needs maxK other points as neighbors, so maxK
#presumably has to stay below the number of training columns.
maxK = 20 #Any value can be chosen
#Each iteration yields a 1 x maxK row of labels (one per value of k);
#vcat stacks the rows, giving one row per training column.
yPredictionsK = @parallel (vcat) for i in 1:size(xTrain, 2)
 assign_label_each_k(xTrain, yTrain, maxK, i)
end
#Column k holds the predictions made with k neighbors
for k in 1:maxK
 accuracyK = mean(yTrain .== yPredictionsK[:, k])
 println("The LOOF-CV accuracy of $(k)-NN is $(accuracyK)")
end
toc()

The LOOF-CV accuracy of 1-NN is 0.42798026420499763
The LOOF-CV accuracy of 2-NN is 0.42798026420499763
The LOOF-CV accuracy of 3-NN is 0.4286169027534617
The LOOF-CV accuracy of 4-NN is 0.4211363998090084
The LOOF-CV accuracy of 5-NN is 0.41301925831609104
The LOOF-CV accuracy of 6-NN is 0.40713035174279805
The LOOF-CV accuracy of 7-NN is 0.3983765717014165
The LOOF-CV accuracy of 8-NN is 0.3961483367817921
The LOOF-CV accuracy of 9-NN is 0.3923285054910075
The LOOF-CV accuracy of 10-NN is 0.3886678338373388
The LOOF-CV accuracy of 11-NN is 0.3865987585548305
The LOOF-CV accuracy of 12-NN is 0.38309724653827787
The LOOF-CV accuracy of 13-NN is 0.378004138150565
The LOOF-CV accuracy of 14-NN is 0.37657170141652074
The LOOF-CV accuracy of 15-NN is 0.37593506286805667
The LOOF-CV accuracy of 16-NN is 0.3729110297628521
The LOOF-CV accuracy of 17-NN is 0.3695686773834156
The LOOF-CV accuracy of 18-NN is 0.3679770810122553
The LOOF-CV accuracy of 19-NN is 0.3692503581091835
The LOOF-CV acc

85.727564674

# Running kNN on the test data

In [10]:
#xTrain holds one training point per column; imageI is the query point
#given as a vector (it is not a column of xTrain here).
#Returns the column indices of the k training points closest to imageI,
#nearest first.
@everywhere function get_k_nearest_neighbors(xTrain, imageI, k)
    nPixels, nImages = size(xTrain)

    #Scratch vector reused for every training column
    candidate = zeros(Float32, nPixels)

    #scores[column] is the distance between imageI and that column
    scores = zeros(Float32, nImages)

    for column = 1:nImages
        for pixel = 1:nPixels
            candidate[pixel] = xTrain[pixel, column]
        end
        scores[column] = euclidean_distance(imageI, candidate)
    end

    #Order the columns by distance and keep the first k
    ranking = sortperm(scores)
    return ranking[1:k]
end 

#Arguments: xTrain has one training point per column, yTrain[n] is the
#label of column n, k is the number of neighbors that vote and imageI is
#the point to label, given as a vector.
#Returns the label that first reaches the highest vote count while the
#neighbors are visited in the order get_k_nearest_neighbors returns them
#(0 when there are no neighbors).
@everywhere function assign_label(xTrain, yTrain, k, imageI)
    neighborIndices = get_k_nearest_neighbors(xTrain, imageI, k)

    #Vote tally: label => number of neighbors seen so far with that label
    votes = Dict{Int, Int}()
    winningLabel = 0
    winningVotes = 0

    for neighbor in neighborIndices
        label = yTrain[neighbor]

        #A label not seen before starts from 0 votes
        tally = get(votes, label, 0) + 1
        votes[label] = tally

        #Strict comparison: on a tie the label that got there first is kept
        if tally > winningVotes
            winningVotes = tally
            winningLabel = label
        end
    end
    return winningLabel
end


In [11]:
#Time the prediction of the whole test set; toc() reports the elapsed time.
tic()
k = 3 # The CV accuracy shows this value to be the best.
#One predicted label per test column, combined into a vector with vcat
yPredictions = @parallel (vcat) for i in 1:size(xTest, 2)
 nRows = size(xTrain, 1)
 #Copy the ith test image into its own Float32 vector, because this
 #version of assign_label takes the image itself instead of an index
 imageI = Array(Float32, nRows)
 for index in 1:nRows
  imageI[index] = xTest[index, i]
 end
 #Label of the test image according to its k nearest training images
 assign_label(xTrain, yTrain, k, imageI)
end
toc()

elapsed time: 46.272848076 seconds


46.272848076

In [12]:
#Convert integer predictions to character
#(reverses the int() conversion applied to the training labels)
labelsInfoTest["Class"] = char(yPredictions)

#Save predictions
#The table is written as comma-separated values with a header row.
writetable("$(path)/juliaKNNSubmission.csv", labelsInfoTest, separator=',', header=true)
println("Submission file saved in $(path)/juliaKNNSubmission.csv")