# In [None]:
# Show the starting directory, switch to the study project, then show the
# directory again to confirm the switch.
pwd()

cd("C:/Dev/Julia/JuliaStudy")

pwd()

# Chapter 2 Setting up the Data Science Lab
# Load the comma-separated data file; the path is relative to the directory
# selected by `cd` above.
data = readcsv(".\\Data files-selected\\Magic\\magic04.csv")

data

# Using sample data: keep the first five and the last three rows (all columns)
# as a small working set.
data_g = data[1:5, 1:end]
data_h = data[(end - 2):end, 1:end]

# Stack the two slices vertically into one matrix.
sample_data = [data_g; data_h]

# Distance function
"""
    distance(x, y)

Return the Euclidean distance between the numeric vectors `x` and `y`.

The two vectors may have different element types (for example `Int` and
`Float64`); the result is a floating-point number. Two empty vectors are at
distance zero.

# Arguments
- `x`, `y`: vectors of numbers with the same length.

# Throws
- `DimensionMismatch` if `x` and `y` do not have the same length.
"""
function distance(x::AbstractVector{<:Number}, y::AbstractVector{<:Number})
    length(x) == length(y) || throw(DimensionMismatch(
        "x has length $(length(x)) but y has length $(length(y))"))

    # Start from a floating-point zero so the accumulator never changes type
    # inside the loop.
    total = zero(float(promote_type(eltype(x), eltype(y))))
    for (a, b) in zip(x, y)
        # Convert before squaring so that large integer differences cannot
        # overflow when squared.
        total += float(a - b)^2
    end
    return sqrt(total)
end

# `@test` comes from the test library, which has to be loaded before the macro
# is first used.
using Base.Test

# Sanity check: every coordinate differs by 1, so the distance is sqrt(3).
x = [1, 2, 3]
y = [2, 3, 4]
distance(x, y)
@test distance(x, y) ≈ 1.7320508075688772

# This one performs classification of a point based on its distances from the known points of the dataset
"""
    classify(distances, labels, k)

Return the majority label among the `k` known points with the smallest
`distances`.

`distances[i]` is the distance from the point being classified to the known
point whose label is `labels[i]`. When several labels receive the same number
of votes, the one that appears first in `labels` wins. Equal distances are
resolved in favour of the lower index. `distances` is not modified.

# Arguments
- `distances`: distance to each known point.
- `labels`: label of each known point, same length as `distances`.
- `k`: number of neighbours that vote; values larger than the number of known
  points use every point.

# Throws
- `DimensionMismatch` if `distances` and `labels` differ in length.
- `ArgumentError` if there are no known points or `k < 1`.
"""
function classify(distances::AbstractVector{<:Real}, labels::AbstractVector, k::Integer)
    npoints = length(distances)
    npoints == length(labels) || throw(DimensionMismatch(
        "got $npoints distances but $(length(labels)) labels"))
    npoints > 0 || throw(ArgumentError("at least one known point is required"))
    k >= 1 || throw(ArgumentError("k must be at least 1, got $k"))

    # `sortperm` keeps equal distances in index order, so ties between
    # neighbours are resolved the same way as repeatedly taking the first
    # minimum, without overwriting the caller's vector.
    nearest = sortperm(distances)[1:min(k, npoints)]

    # One vote per neighbour; every neighbour is counted, not just the first
    # one found for each class.
    classes = unique(labels)
    votes = [count(idx -> labels[idx] == c, nearest) for c in classes]

    # `findmax` reports the first maximum, i.e. the earliest class on a tie.
    return classes[findmax(votes)[2]]
end

# Testing classify() function
# Load the test library before the first `@test` in this section.
using Base.Test

println(sample_data[:, end])            # ["g", "g", "g", "g", "g", "h", "h", "h"]
sample_features = map(Float64, sample_data[:, 1:(end-1)])
sample_labels = sample_data[:, end]

# classify() takes the distances from one query point to every known point,
# not the feature matrix itself, so compute those distances first.
query = sample_features[1, :]
sample_distances = Float64[distance(sample_features[i, :], query)
                           for i in 1:size(sample_features, 1)]

# The query is the first sample itself (distance 0), so its single nearest
# neighbour must carry its own label.
@test classify(sample_distances, sample_labels, 1) == sample_labels[1]


# The main function(wrapper) of the implementation of the kNN algorithm
"""
    apply_kNN(X, x, Y, k)

Classify every row of `Y` with the k-nearest-neighbours rule and return the
vector of predicted labels.

For each row of `Y`, the distance to every row of `X` is computed with
`distance`, and `classify` picks the label from `x` using those distances.

# Arguments
- `X`: known data points, one per row.
- `x`: label of each row of `X`.
- `Y`: data points to classify, one per row, with the same number of columns
  as `X`.
- `k`: number of neighbours passed to `classify`.

# Returns
A vector with one label per row of `Y`; its element type is the type of the
first label in `x`.

# Throws
- `DimensionMismatch` if `x` does not have one label per row of `X`, or if `X`
  and `Y` have a different number of columns.
- `ArgumentError` if there are no known data points.
"""
function apply_kNN(X::Array{T1,2}, x::Array{T2,1}, Y::Array{T1,2},
                   k::Int) where {T1<:Number,T2<:Any}
    N = size(X, 1)                  # Number of known data points
    n = size(Y, 1)                  # Number of data points to classify

    length(x) == N || throw(DimensionMismatch(
        "X has $N rows but x has $(length(x)) labels"))
    size(X, 2) == size(Y, 2) || throw(DimensionMismatch(
        "X has $(size(X, 2)) columns but Y has $(size(Y, 2))"))
    N > 0 || throw(ArgumentError("at least one known data point is required"))

    D = zeros(Float64, N)           # Distance vector, refilled for every row of Y
    z = similar(x, typeof(x[1]), n) # Output labels, one per row of Y

    for i in 1:n
        # Slice the query row once instead of once per known point.
        query = Y[i, :]
        for j in 1:N
            D[j] = distance(X[j, :], query)
        end
        z[i] = classify(D, x, k)
    end
    return z
end

### Algorithm Testing ###



# Data preparation
I = map(Float64, sample_data[:, 1:(end-1)])            # Features
O = sample_data[:, end]                                # Labels

# Data selection: shuffle the row indices and split them in two halves.
N = length(O)
n = round(Int64, N/2)
R = randperm(N)

ind_train = R[1:n]              # Indices for the training set
X_train = I[ind_train, :]            # Features for training set
y_train = O[ind_train]               # Labels for training set
ind_test = R[(n+1):end]         # Indices for the test set
X_test = I[ind_test, :]             # Features for test set
y_test = O[ind_test]                # Labels for test set

# No more neighbours can vote than there are training points.
k = min(5, length(y_train))
z = apply_kNN(X_train, y_train, X_test, k)

# apply_kNN returns one predicted label per test row, so compare the two label
# vectors element by element and divide by the size of the test set.
println(sum(y_test .== z) / length(y_test))     # Accuracy on the test set
println(z[1:min(5, length(z))])                 # First few predicted labels

