Flux with MNIST and GPU
* Pre-treatment with PCA

Libraries

In [1]:
using CUDA
using MLDatasets:          MNIST
using Flux                 # the julia ml library
using PreprocessingImages; pim = PreprocessingImages
using MLDataUtils:         stratifiedobs
using MLJ
using Random
using Plots; gr()

Plots.GRBackend()

In [2]:
include( expanduser("~/projects/pesquisa/libs/libml.jl") )
include( expanduser("~/projects/pesquisa/libs/misc.jl") )

cleanCUDA (generic function with 1 method)

MNIST

In [3]:
# load mnist
trX,   trY   = MNIST(:train)[:]
testX, testY = MNIST(:test)[:];

In [4]:
# split train, validation, test sets
Random.seed!(1)
(trainX, trainY), (validationX, validationY) = stratifiedobs((trX, trY), p = 0.7)
size(trainX), size(validationX), size(testX)

((28, 28, 42001), (28, 28, 17999), (28, 28, 10000))

Preprocessing functions

In [5]:
# MLJ works with dataframes
function preprocess1(X, y)
    Xs = permutedims(X, (2, 1, 3))   # needed to adjust visualization of digits
    Xs = pim.batchImage2DF(Xs)
    
    ys = Int32.(y)

    return (Xs, ys)
end

preprocess1 (generic function with 1 method)

In [6]:
# models("PCA")[2]

In [7]:
# reduce predictors
PCA = @load PCA pkg=MultivariateStats verbosity=0
reducer = PCA(variance_ratio = 0.9)   # mensagem de erro diz que pratio não existe !!!!! e sugere variance_ratio

# standardize predictors
std = Standardizer()

# machine
pipe = @pipeline reducer std

Pipeline285(
  pca = PCA(
        maxoutdim = 0, 
        method = :auto, 
        variance_ratio = 0.9, 
        mean = nothing), 
  standardizer = Standardizer(
        features = Symbol[], 
        ignore = false, 
        ordered_factor = false, 
        count = false))

Data preprocessing

In [8]:
# convert to dataframe
X_tr , y_tr  = preprocess1(trainX, trainY)
X_val, y_val = preprocess1(validationX, validationY)

typeof(X_tr), size(X_tr), size(y_tr)

(DataFrames.DataFrame, (42001, 784), (42001,))

In [9]:
# pca dimensionality reduction
mach = MLJ.machine(pipe, X_tr) |> fit!
X_tr  = MLJ.transform(mach, X_tr)  .|> Float32   # transform(unsupervised) vs predict(supervised)
X_val = MLJ.transform(mach, X_val) .|> Float32   # transform(unsupervised) vs predict(supervised)

typeof(X_tr), size(X_tr), size(y_tr)

┌ Info: Training machine(Pipeline285(pca = PCA(maxoutdim = 0, …), …), …).
└ @ MLJBase /home/ciro/.julia/packages/MLJBase/6ooqv/src/machines.jl:496
┌ Info: Training machine(PCA(maxoutdim = 0, …), …).
└ @ MLJBase /home/ciro/.julia/packages/MLJBase/6ooqv/src/machines.jl:496
┌ Info: Training machine(Standardizer(features = Symbol[], …), …).
└ @ MLJBase /home/ciro/.julia/packages/MLJBase/6ooqv/src/machines.jl:496


(DataFrames.DataFrame, (42001, 87), (42001,))

In [12]:
# Flux works with matrices and vectors
function preprocess2(df, y)
    N, d = size(df)
    Xs = pim.df2Matrix(df)            # convert dataframe to matrix
    Xs = Xs |> Flux.flatten       # convert n-dimensional array to matrix (adequacy to Flux); last dimension is preserved
    Xs = [Xs[i,:] for i in 1:N]   # individualize each data point on a vector of vectors
    Xs = Flux.batch(Xs)           # batch the dataset, with each data point individualized

    ys = Flux.onehotbatch( Float32.(y), 0:9 )
    return (Xs, ys)
end

preprocess2 (generic function with 1 method)

In [13]:
# convert to flux matrix
X_tr , y_tr  = preprocess2(X_tr, y_tr)
X_val, y_val = preprocess2(X_val, y_val)

d, N         = size(X_tr)
typeof(X_tr), size(X_tr), size(y_tr)

(Matrix{Float32}, (87, 42001), (10, 42001))

In [14]:
# data on gpu
X_tr_g  = X_tr |> gpu
y_tr_g  = y_tr |> gpu
X_val_g = X_val |> gpu;

┌ Info: The GPU function is being called but the GPU is not accessible. 
│ Defaulting back to the CPU. (No action is required if you want to run on the CPU).
└ @ Flux /home/ciro/.julia/packages/Flux/KkC79/src/functor.jl:192


Model

In [15]:
# model architecture
nInputs  = d
nOutputs = 10

# Flux.Dense(nInputs, 256, tanh)
model = Chain( Dense(nInputs => 16, relu),
               Dense(16      => nOutputs, relu),
               softmax)      |> gpu                # weights on gpu

Chain(
  Dense(87 => 16, relu),                [90m# 1_408 parameters[39m
  Dense(16 => 10, relu),                [90m# 170 parameters[39m
  NNlib.softmax,
) [90m                  # Total: 4 arrays, [39m1_578 parameters, 6.414 KiB.

In [16]:
# training parameters
η = 0.2            # learning rate (\eta<tab>)
lossFunction(X, y) = Flux.mse( model(X), y )
modelParameters    = Flux.params(model)
dataset            = Flux.DataLoader((X_tr_g, y_tr_g), batchsize=32) # batchsize=1 => SGD, batch size > 1 => batch gradient descent
modelOptimizer     = Flux.Descent(η);
# callBack           = Flux.throttle(() -> println("."), 10);        # print every 10s

Training

In [18]:
epochs = 5000

5000

In [19]:
function predictOutcome(X)
    ŷ = Flux.onecold( model(X), [0:9;] )
end

predictOutcome (generic function with 1 method)

In [20]:
# # https://fluxml.ai/Flux.jl/stable/training/training/

# # preferred for multiple epochs
# for epoch in 1:epochs
#     Flux.train!(lossFunction, modelParameters, dataset, modelOptimizer; cb=callBack)
# end

In [21]:
# https://fluxml.ai/Flux.jl/stable/training/training/

loss_tr           = Vector{Float64}()
acc_val           = Vector{Float64}()
maxLoss           = 1e-4   # hint: start at 1e-2, then go lower for more epochs
nearDeltaZeroLoss = maxLoss / 25
nearDeltaZeroAcc  = 1e-5

for epoch in 1:epochs
    # train
    loss = trainModel!( lossFunction, modelParameters, dataset, modelOptimizer )   # libml
    push!(loss_tr, loss)
    # print(".")   # "epoch completed" indicator

    # predict
    ŷ  = predictOutcome(X_val_g)
    f1 = MLJ.multiclass_f1score(ŷ, coerce(validationY, OrderedFactor))
    push!(acc_val, f1)

    # exit criteria
    if earlyStopLossCriteria(loss_tr, maxLoss, nearDeltaZeroLoss)   break   end
    if earlyStopAccuracyCriteria(acc_val, nearDeltaZeroAcc)         break   end
end

LoadError: InterruptException:



In [None]:
# plot training
s = (500, 500)
p1 = plotVector(loss_tr, s, :log10, "Loss function")
p2 = plotVector(acc_val, s, :none,  "f1-score")
display( plot(p1, p2, layout=(2,1), size=s) )

In [None]:
# print metrics
ŷ = predictOutcome(X_val_g)
accuracyValidation, _ = printMetrics( ŷ, coerce(validationY, OrderedFactor) );

In [None]:
# clean gpu memory
X_tr  = nothing
y_tr  = nothing
X_val = nothing
cleanCUDA()

Testing

In [None]:
# preprocessing
X_tst, y_tst = preprocess1(testX, testY)
X_tst        = MLJ.transform(mach, X_tst) .|> Float32   # transform(unsupervised) vs predict(supervised)
X_tst, y_tst = preprocess2(X_tst, y_tst)
X_tst_g      = X_tst |> gpu

# predict
ŷ = predictOutcome(X_tst_g)

# result
accuracyTest, _ = printMetrics( ŷ, coerce(testY, OrderedFactor) );

In [None]:
accuracyValidation > accuracyTest ? "fit ok" : "overfitting"

In [None]:
# clean gpu memory
X_tst = nothing
cleanCUDA()