# PCA + Standardizer + sklearn/SVM with MNIST

## Libraries

In [1]:
# libraries
#using CUDA

using Flux              # the julia ml library
using Images            # image processing and machine vision for julia

using MLJ               # make_blobs, rmse, confmat, f1score, coerce
#using MLJFlux           # NeuralNetworkClassifier, CUDALibs
using MLDataUtils       # label, nlabel, labelfreq, stratifiedobs
using MLDatasets        # mnist

#using LinearAlgebra     # pinv pseudo-inverse matrix
#using Metrics           # r2-score
using Random
#using StatsBase         # standardize (normalization)
#using Distributions

using Plots; gr()
using StatsPlots
using Printf

using CSV
using DataFrames


## Functions

In [2]:
"""
    image2Vector(M)

Convert every element of `M` to `Float32` and return the result flattened
into a vector. `Float32` is used because 32-bit floats are faster on GPU.
"""
function image2Vector(M)
    asFloat32 = broadcast(Float32, M)
    return vec(asFloat32)
end

"""
    batchImage2Vector(imagesArray3D)

Flatten every image of an image stack into its own `Float32` vector.

Images are the 2-D slices `imagesArray3D[:, :, i]` along the third dimension.
Returns a vector holding one flattened image (as produced by `image2Vector`)
per slice, in slice order.

An array with fewer than three dimensions is treated as having a third
dimension of size 1, i.e. as a single image.
"""
function batchImage2Vector(imagesArray3D)
    # `view` avoids copying each slice: `image2Vector` already allocates a fresh
    # array when it converts the elements to Float32.
    return [
        image2Vector(view(imagesArray3D, :, :, i)) for i in axes(imagesArray3D, 3)
    ]
end

"""
    batchImage2Matrix(imagesArray3D)

Arrange an image stack as a matrix with one row per image.

The flattened images returned by `batchImage2Vector` are concatenated as
columns, and the adjoint of that matrix is returned, so each row holds the
pixels of one image.
"""
function batchImage2Matrix(imagesArray3D)
    flattenedImages = batchImage2Vector(imagesArray3D)
    pixelsByImage = reduce(hcat, flattenedImages)   # one column per image
    return adjoint(pixelsByImage)                   # one row per image
end

"""
    batchImage2DF(imagesArray3D)

Build a `DataFrame` from an image stack, using the matrix produced by
`batchImage2Matrix`. Column names are generated automatically (`:auto`).
"""
function batchImage2DF(imagesArray3D)
    return DataFrame(batchImage2Matrix(imagesArray3D), :auto)
end


batchImage2DF (generic function with 1 method)

In [3]:
# metrics
"""
    printMetrics(ŷ, y)

Display the confusion matrix of predictions `ŷ` against ground truth `y`,
then print the accuracy and the multiclass F1-score, each rounded to three
digits. Returns `nothing`.
"""
function printMetrics(ŷ, y)
    confusion = confmat(ŷ, y)
    display(confusion)

    roundedAccuracy = round(accuracy(ŷ, y); digits=3)
    println("accuracy: ", roundedAccuracy)

    roundedF1 = round(multiclass_f1score(ŷ, y); digits=3)
    println("f1-score: ", roundedF1)

    return nothing
end


printMetrics (generic function with 1 method)

## Dataset

In [4]:
# Load MNIST from MLDatasets.
# Each call returns a pair that is destructured into images and labels.
# The result of `MNIST.traindata()` is split further in a later cell; the
# result of `MNIST.testdata()` is kept aside under the `validation*` names.
trainX_original,      trainY_original      = MNIST.traindata()
validationX_original, validationY_original = MNIST.testdata();


In [5]:
# Split the data into a train set, a test set and a validation set.
# Seed the global RNG before the split is drawn.
Random.seed!(1)
# Stratified split of the original training data with p = 0.7: the first part
# becomes the train set, the remainder the test set.
(trainX, trainY), (testX, testY) = stratifiedobs((trainX_original, trainY_original), p = 0.7)
# The validation set is an independent copy of the data loaded by MNIST.testdata().
validationX = copy(validationX_original); validationY = copy(validationY_original)

# Show the array sizes of the three image sets.
size(trainX), size(testX), size(validationX)

((28, 28, 42001), (28, 28, 17999), (28, 28, 10000))

## Preprocess


In [6]:
"""
    preprocess(X, y)

Prepare an image stack `X` and its labels `y` for MLJ.

Returns the tuple `(features, target)`: `features` is the table built by
`batchImage2DF(X)` and `target` is `y` coerced to the `OrderedFactor` scitype.
The feature table is not coerced; in this example all of its columns already
have scitype `Continuous`.
"""
function preprocess(X, y)
    features = batchImage2DF(X)
    target = coerce(y, OrderedFactor)
    return features, target
end

# Build the feature table X and the categorical target y from the train split.
X, y = preprocess(trainX, trainY);

In [7]:
# Inspect the scientific type of the feature table.
scitype(X)

Table{AbstractVector{Continuous}}

In [8]:
# Inspect the scientific type of the target vector.
scitype(y)

AbstractVector{OrderedFactor{10}} (alias for AbstractArray{OrderedFactor{10}, 1})

## Build the model pipeline

In [9]:
# Reduce the number of predictors with PCA.
# `pratio = 0.9` is presumably the fraction of total variance to preserve;
# confirm against the MultivariateStats PCA model documentation.
PCA = @load PCA pkg=MultivariateStats verbosity=0
reducer = PCA(pratio = 0.9)

# Standardize the predictors.
# NOTE(review): the global name `std` is also the name of the standard-deviation
# function; consider renaming it if that function is needed later in the session.
std = Standardizer()

# SVM classifier from ScikitLearn, created with its default hyperparameters.
SVMClass = @load SVMClassifier pkg="ScikitLearn" verbosity=0
svm = SVMClass()

# Chain the three models in this order: PCA, then Standardizer, then SVM.
# NOTE(review): standardization runs after PCA here; confirm this order is intended.
pipe = @pipeline reducer std svm


Pipeline291(
    pca = PCA(
            maxoutdim = 0,
            method = :auto,
            pratio = 0.9,
            mean = nothing),
    standardizer = Standardizer(
            features = Symbol[],
            ignore = false,
            ordered_factor = false,
            count = false),
    svm_classifier = SVMClassifier(
            C = 1.0,
            kernel = "rbf",
            degree = 3,
            gamma = "auto",
            coef0 = 0.0,
            shrinking = true,
            tol = 0.001,
            cache_size = 200,
            max_iter = -1,
            decision_function_shape = "ovr",
            random_state = nothing))

### Train

In [10]:
# Bind the pipeline to the training data (X, y) and fit it.
mach = MLJ.machine(pipe, X, y) |> fit!


┌ Info: Training Machine{Pipeline291,…}.
└ @ MLJBase /home/ciro/.julia/packages/MLJBase/pCiRR/src/machines.jl:464
┌ Info: Training Machine{PCA,…}.
└ @ MLJBase /home/ciro/.julia/packages/MLJBase/pCiRR/src/machines.jl:464
┌ Info: Training Machine{Standardizer,…}.
└ @ MLJBase /home/ciro/.julia/packages/MLJBase/pCiRR/src/machines.jl:464
┌ Info: Training Machine{SVMClassifier,…}.
└ @ MLJBase /home/ciro/.julia/packages/MLJBase/pCiRR/src/machines.jl:464


Machine{Pipeline291,…} trained 1 time; caches data
  model: Pipeline291
  args: 
    1:	Source @381 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @825 ⏎ `AbstractVector{OrderedFactor{10}}`


In [15]:
# Predict with the fitted machine, show the first five predictions, then
# print the metrics.
# NOTE(review): X and y are the data the machine was fitted on, so these are
# training-set metrics; the held-out testX/testY split is not evaluated here.
ŷ = MLJ.predict(mach, X);
display(ŷ[1:5])
printMetrics(ŷ, y)


5-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}:
 8
 2
 6
 2
 7

10×10 Matrix{Int64}:
 4141     1     2     0     0     2     2     1     0     2
    0  4696     3     2     4     0     0    12     5     0
    0    10  4155     6     1     1     0     4     4     0
    0     1     0  4259     0     7     0     0     1     2
    1     2     5     0  4063     1     2     6     2    14
    0     0     0     7     0  3775     0     0     4     1
    3     0     0     0     4     5  4137     0     0     0
    0     4     3     9     1     0     0  4351     2    11
    0     2     3     4     1     2     2     2  4075     1
    1     3     0     5    15     2     0    10     3  4133

accuracy: 0.995
f1-score: 0.995
