In [1]:
# Euclidean (L2) distance between two numeric vectors.
#
# Arguments:
#   x, y - vectors with the same number of elements
#
# Returns the square root of the sum of squared element-wise differences.
#
# Throws DimensionMismatch when x and y have different lengths, so that a
# mismatch is reported instead of silently ignoring trailing elements.
function distance(x::AbstractVector, y::AbstractVector)
    if length(x) != length(y)
        throw(DimensionMismatch("x has length $(length(x)) but y has length $(length(y))"))
    end

    # Start from a zero of the common element type so that the accumulator
    # keeps one type for the whole loop.
    dist = zero(promote_type(eltype(x), eltype(y)))

    for (a, b) in zip(x, y)  # walk both vectors in lockstep
        dist += (a - b)^2
    end

    return sqrt(dist)
end

# Majority vote among the k nearest neighbours of one query point.
#
# Arguments:
#   distances - distance from the query point to every known point
#   labels    - label of every known point, in the same order as `distances`
#   k         - number of nearest neighbours that take part in the vote
#
# Returns a tuple (label, confidence): the label that collected the most
# votes among the k nearest neighbours, and the fraction of the k votes it
# received.
#
# Throws DimensionMismatch when `labels` and `distances` differ in length and
# ArgumentError when k is not within 1:length(distances).
#
# `distances` is left unmodified; the selection works on a private copy.
function classify(distances::Vector{Float64}, labels::Vector, k::Int)
    N = length(distances)
    if length(labels) != N
        throw(DimensionMismatch("got $N distances but $(length(labels)) labels"))
    end
    if k < 1 || k > N
        throw(ArgumentError("k must be between 1 and the number of known points ($N), got $k"))
    end

    class = unique(labels)        # the distinct classes
    nc = length(class)            # number of classes
    class_count = zeros(Int, nc)  # votes collected by each class

    # Select the k smallest distances one at a time.  Each selected entry is
    # replaced by the largest Float64 so that it is not picked again; this is
    # done on a copy to keep the caller's vector intact.
    remaining = copy(distances)
    M = typemax(Float64)
    indexes = zeros(Int, k)       # positions of the k nearest neighbours

    for i in 1:k
        indexes[i] = findmin(remaining)[2]
        remaining[indexes[i]] = M
    end

    # Count one vote per neighbour for the class its label belongs to.
    for label in labels[indexes]
        for c in 1:nc
            if label == class[c]
                class_count[c] += 1
                break             # classes are distinct, no other match possible
            end
        end
    end

    m, index = findmax(class_count)
    conf = m / k

    return class[index], conf
end

# k-nearest-neighbours classification of every row of Y.
#
# Arguments:
#   X - known data points, one point per row (numeric entries)
#   x - labels of the known points, one per row of X
#   Y - data points to classify, one point per row, same number of columns as X
#   k - number of nearest neighbours that vote on each prediction
#
# Returns a tuple (z, c): z[i] is the predicted label of row i of Y (same
# element type as x) and c[i] is the confidence of that prediction as reported
# by `classify`.
#
# Throws DimensionMismatch when X and Y have a different number of columns or
# when x does not hold exactly one label per row of X.
function apply_kNN(X::Matrix, x::Vector, Y::Matrix, k::Int)
    N = size(X, 1)  # number of known data points
    n = size(Y, 1)  # number of data points to classify

    if size(X, 2) != size(Y, 2)
        throw(DimensionMismatch("X has $(size(X, 2)) columns but Y has $(size(Y, 2))"))
    end
    if length(x) != N
        throw(DimensionMismatch("X has $N rows but x holds $(length(x)) labels"))
    end

    D = zeros(Float64, N)  # distances from the current query to every known point
    z = similar(x, n)      # predicted labels (output)
    c = zeros(Float64, n)  # confidence of each prediction (output)

    for i in 1:n
        # The query row does not depend on j, so extract it once per query.
        query = vec(Y[i, :])

        for j in 1:N
            D[j] = distance(vec(X[j, :]), query)
        end

        z[i], c[i] = classify(D, x, k)
    end

    return z, c
end

apply_kNN (generic function with 1 method)

In [2]:
# Read the whole CSV file into a 2-D array (one row per record).
data = readcsv("d:\\data\\Magic\\magic04.csv")

19020x11 Array{Any,2}:
  28.7967   16.0021  2.6449  0.3918  …   -8.2027  40.092    81.8828  "g"
  31.6036   11.7235  2.5185  0.5303      -9.9574   6.3609  205.261   "g"
 162.052   136.031   4.0612  0.0374     -45.216   76.96    256.788   "g"
  23.8172    9.5728  2.3385  0.6147      -7.1513  10.449   116.737   "g"
  75.1362   30.9205  3.1611  0.3168      21.8393   4.648   356.462   "g"
  51.624    21.1502  2.9085  0.242   …    9.8145   3.613   238.098   "g"
  48.2468   17.3565  3.0332  0.2529      10.5868   4.792   219.087   "g"
  26.7897   13.7595  2.5521  0.4236      -2.9292   0.812   237.134   "g"
  96.2327   46.5165  4.154   0.0779      43.1844   4.854   248.226   "g"
  46.7619   15.1993  2.5786  0.3377      -6.6812   7.875   102.251   "g"
  62.7766   29.9104  3.3331  0.2475  …   23.771    9.9144  323.094   "g"
  18.8562   16.46    2.4385  0.5282     -16.9327  11.461   162.848   "g"
  45.6321   22.71    3.0441  0.2213     -14.3164   0.3822  178.255   "g"
   ⋮                        

In [3]:
# Split the table column-wise: I keeps every column except the last one,
# O keeps the last column (used further down as the class labels).
I = data[:, 1:(end-1)]
O = data[:, end]

19020-element Array{Any,1}:
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 ⋮  
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"

In [4]:
# Convert every entry of I to Float64; the slice taken from `data` is an
# Array{Any,2}, which does not match apply_kNN's numeric matrix arguments.
I = map(Float64, I)

19020x10 Array{Float64,2}:
  28.7967   16.0021  2.6449  0.3918  0.1982  …   -8.2027  40.092    81.8828
  31.6036   11.7235  2.5185  0.5303  0.3773      -9.9574   6.3609  205.261 
 162.052   136.031   4.0612  0.0374  0.0187     -45.216   76.96    256.788 
  23.8172    9.5728  2.3385  0.6147  0.3922      -7.1513  10.449   116.737 
  75.1362   30.9205  3.1611  0.3168  0.1832      21.8393   4.648   356.462 
  51.624    21.1502  2.9085  0.242   0.134   …    9.8145   3.613   238.098 
  48.2468   17.3565  3.0332  0.2529  0.1515      10.5868   4.792   219.087 
  26.7897   13.7595  2.5521  0.4236  0.2174      -2.9292   0.812   237.134 
  96.2327   46.5165  4.154   0.0779  0.039       43.1844   4.854   248.226 
  46.7619   15.1993  2.5786  0.3377  0.1913      -6.6812   7.875   102.251 
  62.7766   29.9104  3.3331  0.2475  0.1261  …   23.771    9.9144  323.094 
  18.8562   16.46    2.4385  0.5282  0.2933     -16.9327  11.461   162.848 
  45.6321   22.71    3.0441  0.2213  0.1215     -14.3164   0.

In [6]:
# Total number of records (one label per record).
N = length(O)

19020

In [7]:
n = round(Int64, N/2) # size of the first part of the split; div(N, 2) would give the same value, differing by at most 1

9510

In [8]:
# Random permutation of the indices 1:N; slicing it below yields a shuffled split.
R = randperm(N)

19020-element Array{Int64,1}:
 12868
   965
 11969
  4098
  6251
  9276
 11185
 10835
 14838
  1137
 12989
 16117
 13853
     ⋮
 11855
  4157
  1148
  9630
  2300
 18423
   900
  3999
   155
  9008
 11375
  9513

In [9]:
# The first n shuffled indices select the known points (passed to apply_kNN as X / x).
indX = R[1:n]

9510-element Array{Int64,1}:
 12868
   965
 11969
  4098
  6251
  9276
 11185
 10835
 14838
  1137
 12989
 16117
 13853
     ⋮
  4122
  9451
  4427
  5439
  7225
 18092
 13852
 17413
  6603
 10236
  4994
  9809

In [10]:
# Feature rows of the known points.
X = I[indX,:] 

9510x10 Array{Float64,2}:
  15.5225   14.7969  2.5827  0.5068  0.2575  …   -7.7902  57.3491  246.631 
  22.7315   12.8694  2.2833  0.5677  0.2943      -8.357   10.6062  213.579 
  21.8056    0.0     2.1833  0.8066  0.4426       0.0     55.95     62.714 
  25.3859   19.355   2.7505  0.3979  0.262       15.2668   0.685   132.66  
  20.1382   19.4543  2.2636  0.4578  0.2425     -15.5547  17.612   123.647 
  34.737    23.5412  3.0175  0.292   0.1715  …    9.1531  39.615    84.4406
  78.9361   30.75    3.5039  0.2899  0.16        10.3801   1.3658  333.065 
  21.7375    6.6071  2.1746  0.7291  0.4515      -6.5268   8.734   100.004 
  26.7419   16.7192  2.5234  0.3751  0.2057     -10.9322  79.011   205.846 
  25.3811   15.8935  2.7616  0.4329  0.2519       8.6056  37.555    68.2   
  10.1402    6.862   2.2963  0.7633  0.4517  …   11.1053  34.6423  166.397 
  14.6541   13.5916  2.511   0.642   0.3613     -12.0198  49.6263  188.732 
 209.421   122.147   3.6151  0.1405  0.0715     125.301   29.7

In [11]:
# Labels of the known points, in the same order as the rows of X.
x = O[indX]

9510-element Array{Any,1}:
 "h"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "h"
 "g"
 "h"
 "h"
 "h"
 ⋮  
 "g"
 "g"
 "g"
 "g"
 "g"
 "h"
 "h"
 "h"
 "g"
 "g"
 "g"
 "g"

In [12]:
# The remaining shuffled indices select the points to classify.
indY = R[(n+1):end]

9510-element Array{Int64,1}:
 18656
  1390
 18281
  9473
  7770
  6008
 14617
  6790
 18126
  5864
  8572
  2037
  6339
     ⋮
 11855
  4157
  1148
  9630
  2300
 18423
   900
  3999
   155
  9008
 11375
  9513

In [13]:
# Feature rows of the points to classify.
Y = I[indY, :]

9510x10 Array{Float64,2}:
 23.502   13.6746  2.5653  0.4816  0.3116  …    9.4612  54.046    57.936 
 90.1763  13.9634  2.6021  0.5025  0.2988      11.1199   1.523   410.94  
 50.4864  37.1735  2.9919  0.2409  0.174       21.8089  39.7134  201.881 
 12.895   11.2392  2.1477  0.726   0.4377       6.6514  80.9568  150.922 
 50.2054  20.0516  2.8225  0.3341  0.2235     -14.8623  16.239   205.736 
 35.552   15.5602  2.7222  0.3848  0.2171  …   11.8772  85.3816   27.794 
 74.9872  19.1171  2.7662  0.3582  0.2283     -15.4878  65.8553  233.776 
 52.8537  15.8412  2.8603  0.3434  0.1786      11.4639   0.2426  253.247 
 24.0693   0.0     2.5321  0.7195  0.3715       0.0     26.629   231.863 
 35.6138   7.0964  2.3355  0.485   0.2748      -4.3745  13.759   171.148 
 80.2255  25.295   3.3416  0.189   0.0945  …   13.7018   2.792   222.572 
 66.6014  23.7963  3.0992  0.1528  0.0768     -13.546    2.4355  267.41  
 19.5686  10.9677  2.3222  0.6667  0.35         6.5899  10.9472  165.183 
  ⋮         

In [14]:
# True labels of the points to classify; compared against the predictions below.
y = O[indY]

9510-element Array{Any,1}:
 "h"
 "g"
 "h"
 "g"
 "g"
 "g"
 "h"
 "g"
 "h"
 "g"
 "g"
 "g"
 "g"
 ⋮  
 "g"
 "g"
 "g"
 "g"
 "g"
 "h"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"

In [15]:
# Classify every row of Y from its 5 nearest neighbours in X.
# z is the tuple returned by apply_kNN: (predicted labels, confidences).
z = apply_kNN(X, x, Y, 5) 

(Any["g","g","g","g","g","g","h","g","g","g"  …  "g","g","g","g","g","g","g","g","g","g"],[0.6,0.8,0.6,0.8,0.6,0.6,0.8,1.0,0.8,1.0  …  0.8,1.0,0.8,0.6,1.0,0.8,0.8,1.0,0.6,1.0])

In [16]:
# Accuracy: fraction of classified points whose predicted label equals the true
# label.  The denominator is the number of classified points, length(y); the
# size n of the other part of the split only coincides with it when N is even.
println( sum(y .== z[1]) / length(y) )
# First five predicted labels immediately followed by their confidences.
println(z[1][1:5], z[2][1:5])

0.805888538380652
Any["g","g","g","g","g"][0.6,0.8,0.6,0.8,0.6]
