In [1]:
using DataStructures
using DecisionTree
using Distances
using GLMNet # Ridge, Lasso, ElasticNet
using LIBSVM
using LinearAlgebra
using MLBase
using NearestNeighbors
using Plots
using Random
using RDatasets

In [2]:
iris = dataset("datasets", "iris")
first(iris, 5)

Unnamed: 0_level_0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Cat…
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [3]:
getaccuracy(preds, actual) = sum(preds .== actual) / length(actual)

getaccuracy (generic function with 1 method)

In [4]:
X = Matrix(iris[:, 1:4])
irislabels = iris[:, 5];

In [5]:
irislabelmap = labelmap(irislabels)
y = labelencode(irislabelmap, irislabels)
y[1:5], y[end - 5:end]

([1, 1, 1, 1, 1], [3, 3, 3, 3, 3, 3])

In [6]:
function perclasssplits(y, at)
    uids = unique(y)
    keepids = []
    for ui in uids
        curids = findall(y .=== ui)
        rowids = randsubseq(curids, at)
        push!(keepids, rowids...)
    end
    keepids
end

perclasssplits (generic function with 1 method)

In [7]:
trainids = perclasssplits(y, 0.7)
testids = setdiff(1:length(y), trainids);

In [8]:
assignclass(pred) = argmin(abs.(pred .- [1, 2, 3]))

assignclass (generic function with 1 method)

# Lasso

In [9]:
path = glmnet(X[trainids, :], y[trainids])
cv = glmnetcv(X[trainids, :], y[trainids])
mylambda = path.lambda[argmin(cv.meanloss)]
path = glmnet(X[trainids, :], y[trainids], lambda=[mylambda])

Least Squares GLMNet Solution Path (1 solutions for 4 predictors in 74 passes):
─────────────────────────────
     df   pct_dev           λ
─────────────────────────────
[1]   3  0.932377  0.00672235
─────────────────────────────

In [11]:
q = X[testids, :]
lassopreds = GLMNet.predict(path, q)
lassopreds[1:5]

5-element Array{Float64,1}:
 0.9807323983682769
 0.9960026965222337
 1.0648077876063164
 1.0200495932734668
 1.1099195758949985

In [12]:
lassopreds = assignclass.(lassopreds)
getaccuracy(lassopreds, y[testids])

0.9302325581395349

# Ridge

In [13]:
path = glmnet(X[trainids, :], y[trainids], alpha=0)
cv = glmnetcv(X[trainids, :], y[trainids], alpha=0)
mylambda = path.lambda[argmin(cv.meanloss)]

0.07729074360288947

In [15]:
path = glmnet(X[trainids, :], y[trainids], alpha=0, lambda=[mylambda])
q = X[testids, :]
ridgepreds = GLMNet.predict(path, q)
ridgepreds = assignclass.(ridgepreds)
getaccuracy(ridgepreds, y[testids])

0.9534883720930233

# Elastic Net

In [16]:
path = glmnet(X[trainids, :], y[trainids], alpha=0.5)
cv = glmnetcv(X[trainids, :], y[trainids], alpha=0.5)
mylambda = path.lambda[argmin(cv.meanloss)]

0.013444709432039972

In [18]:
path = glmnet(X[trainids, :], y[trainids], alpha=0.5, lambda=[mylambda])
q = X[testids, :]
enpreds = GLMNet.predict(path, q)
enpreds = assignclass.(enpreds)
getaccuracy(enpreds, y[testids])

0.9302325581395349

# Decision Trees

In [21]:
tree = DecisionTreeClassifier(max_depth=3)
DecisionTree.fit!(tree, X[trainids, :], y[trainids])

DecisionTreeClassifier
max_depth:                3
min_samples_leaf:         1
min_samples_split:        2
min_purity_increase:      0.0
pruning_purity_threshold: 1.0
n_subfeatures:            0
classes:                  [1, 2, 3]
root:                     Decision Tree
Leaves: 4
Depth:  3

In [22]:
q = X[testids, :]
treepreds = DecisionTree.predict(tree, q)
getaccuracy(treepreds, y[testids])

0.9302325581395349

# Random Forests

In [23]:
forest = RandomForestClassifier(n_trees=20)
DecisionTree.fit!(forest, X[trainids, :], y[trainids])

RandomForestClassifier
n_trees:             20
n_subfeatures:       -1
partial_sampling:    0.7
max_depth:           -1
min_samples_leaf:    1
min_samples_split:   2
min_purity_increase: 0.0
classes:             [1, 2, 3]
ensemble:            Ensemble of Decision Trees
Trees:      20
Avg Leaves: 5.65
Avg Depth:  3.95

In [24]:
forestpreds = DecisionTree.predict(forest, q)
getaccuracy(forestpreds, y[testids])

0.9302325581395349

# Nearest Neighbor

In [25]:
kdtree = KDTree(X[trainids, :]')

KDTree{StaticArrays.SArray{Tuple{4},Float64,1,4},Euclidean,Float64}
  Number of points: 107
  Dimensions: 4
  Metric: Euclidean(0.0)
  Reordered: true

In [26]:
idxs, dists = knn(kdtree, q', 5, true);

In [34]:
c = y[trainids][hcat(idxs...)]
possiblelabels = map(i -> counter(c[:, i]), 1:size(c, 2))
#nnpreds = map(i -> 
#    parse(
#        Int, 
#        string(argmax(DataFrame(possiblelabels[i])[1, :]))), 
#    1:size(c, 2))
#getaccuracy(nnpreds, y[testids])

43-element Array{Accumulator{Int64,Int64},1}:
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(1 => 5)
 Accumulator(2 => 5)
 ⋮
 Accumulator(3 => 5)
 Accumulator(3 => 5)
 Accumulator(3 => 5)
 Accumulator(3 => 5)
 Accumulator(2 => 2, 3 => 3)
 Accumulator(3 => 5)
 Accumulator(3 => 5)
 Accumulator(3 => 5)
 Accumulator(3 => 5)
 Accumulator(3 => 5)
 Accumulator(3 => 5)
 Accumulator(2 => 1, 3 => 4)

# SVM

In [35]:
mod = svmtrain(X[trainids, :]', y[trainids]);

In [36]:
svmpreds, decisionvals = svmpredict(mod, X[trainids, :]')
getaccuracy(svmpreds, y[trainids])

0.9906542056074766