In [1]:
using CUDA
using Flux              # the julia ml library


In [2]:
# fake data
N = 60000

X_true = randn((28, 28, N))
y_true = rand(1:10, N)

function preprocess(X, y)
    Xs = Float32.(X) |> Flux.flatten
    ys = Flux.onehotbatch( Float32.(y), 1:10 )
    
    return (Xs, ys)
end

h, v, N = size(X_true); d = h * v
X, y = preprocess(X_true, y_true);

In [3]:
d, N, size(X), size(y)

(784, 60000, (784, 60000), (10, 60000))

In [8]:
#CUDA.allowscalar(false)

X_d = X |> gpu   # data on gpu
y_d = y |> gpu   # data on gpu

# model configuration
nInputs  = d
nOutputs = 10
model = Flux.Dense(nInputs, nOutputs, relu) |> gpu   # weights on gpu

losses(X, y)    = Flux.crossentropy( model(X), y )
modelParameters = Flux.params(model)
dataLoader = @CUDA.allowscalar Flux.DataLoader((X_d, y_d), batchsize=10000)


Flux.Data.DataLoader{Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Flux.OneHotArray{UInt32, 10, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}}}, Random._GLOBAL_RNG}((Float32[0.19320646 1.8693867 … 0.10007166 -1.0638701; -0.26152492 -1.6822407 … 0.113506675 0.37112305; … ; -0.69257283 2.568404 … -1.6768019 0.2516955; -1.3831296 -0.9397052 … 0.3330867 0.4828851], Bool[0 0 … 0 0; 0 1 … 0 0; … ; 0 0 … 0 0; 0 0 … 0 0]), 10000, 60000, true, 60000, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10  …  59991, 59992, 59993, 59994, 59995, 59996, 59997, 59998, 59999, 60000], false, Random._GLOBAL_RNG())

In [20]:
# Flux.train!(loss, params, data, opt; cb)
@time Flux.train!(losses, modelParameters, dataLoader, Descent())

  0.061832 seconds (6.22 k allocations: 823.656 KiB)
