# ArrayFire

In [1]:
using ArrayFire
using BenchmarkTools

In [2]:
getAvailableBackends()

CPU and OpenCL


In [3]:
ArrayFire.AFInfo()

ArrayFire v3.3.2 (OpenCL, 64-bit Mac OSX, build f65dd97)
[0] APPLE   : AMD Radeon HD - FirePro D300 Compute Engine, 2048 MB
-1- APPLE   : AMD Radeon HD - FirePro D300 Compute Engine, 2048 MB


In [4]:
ArrayFire.AF_BACKEND_DEFAULT, ArrayFire.AF_BACKEND_CPU, ArrayFire.AF_BACKEND_OPENCL

(0x00000000,0x00000001,0x00000004)

#### Switch backends for computations during execution:  ```setBackend``` method

In [5]:
setBackend(AF_BACKEND_CPU) #Switch back to CPU backend
getActiveBackend()

CPU Backend


In [6]:
ArrayFire.AFInfo()

ArrayFire v3.3.2 (CPU, 64-bit Mac OSX, build f65dd97)
[0] Unknown: Unknown, 32768 MB, Max threads(1) 


In [7]:
setBackend(AF_BACKEND_OPENCL) #Switch to OPENCL backend
getActiveBackend()

OpenCL Backend


In [8]:
ArrayFire.AFInfo()

ArrayFire v3.3.2 (OpenCL, 64-bit Mac OSX, build f65dd97)
[0] APPLE   : AMD Radeon HD - FirePro D300 Compute Engine, 2048 MB
-1- APPLE   : AMD Radeon HD - FirePro D300 Compute Engine, 2048 MB


## Using AFArray Arrays

In [9]:
A = Array{Float32}(rand(1000,1000))

1000×1000 Array{Float32,2}:
 0.991526   0.905353   0.414237  …  0.00187583  0.487111   0.393964 
 0.686738   0.541147   0.902407     0.975978    0.0705317  0.840541 
 0.914216   0.507185   0.189814     0.705462    0.124437   0.254265 
 0.618367   0.933544   0.440262     0.135589    0.0250456  0.533915 
 0.485341   0.660695   0.10388      0.312015    0.0337241  0.116335 
 0.820546   0.511828   0.444231  …  0.644758    0.897743   0.788075 
 0.202777   0.744868   0.483767     0.470155    0.972139   0.450529 
 0.559522   0.364542   0.884492     0.118107    0.165076   0.499278 
 0.940412   0.466941   0.996051     0.162097    0.955392   0.63116  
 0.614052   0.548663   0.674687     0.414027    0.289061   0.961085 
 0.343558   0.400573   0.142486  …  0.0420587   0.0338375  0.713605 
 0.838507   0.254263   0.933234     0.584956    0.506654   0.929596 
 0.29988    0.555391   0.342497     0.130637    0.240383   0.790352 
 ⋮                               ⋱                                  
 0.708

In [10]:
Agpu = AFArray(A)

1000×1000 ArrayFire.AFArray{Float32,2}:
 0.991526   0.905353   0.414237  …  0.00187583  0.487111   0.393964 
 0.686738   0.541147   0.902407     0.975978    0.0705317  0.840541 
 0.914216   0.507185   0.189814     0.705462    0.124437   0.254265 
 0.618367   0.933544   0.440262     0.135589    0.0250456  0.533915 
 0.485341   0.660695   0.10388      0.312015    0.0337241  0.116335 
 0.820546   0.511828   0.444231  …  0.644758    0.897743   0.788075 
 0.202777   0.744868   0.483767     0.470155    0.972139   0.450529 
 0.559522   0.364542   0.884492     0.118107    0.165076   0.499278 
 0.940412   0.466941   0.996051     0.162097    0.955392   0.63116  
 0.614052   0.548663   0.674687     0.414027    0.289061   0.961085 
 0.343558   0.400573   0.142486  …  0.0420587   0.0338375  0.713605 
 0.838507   0.254263   0.933234     0.584956    0.506654   0.929596 
 0.29988    0.555391   0.342497     0.130637    0.240383   0.790352 
 ⋮                               ⋱                             

In [11]:
@time res = Array(Agpu*Agpu)

  0.034877 seconds (9.38 k allocations: 4.189 MB, 16.35% gc time)


1000×1000 Array{Float32,2}:
 247.688  242.083  244.167  243.93   …  245.74   239.54   239.448  245.992
 250.886  252.541  247.229  242.474     246.704  241.226  249.206  249.052
 250.833  254.152  253.98   244.952     250.773  245.19   248.663  254.309
 245.709  249.344  243.36   244.452     251.729  237.674  246.609  252.106
 258.94   256.044  253.564  250.503     258.55   250.522  256.52   262.778
 254.162  254.218  249.47   241.176  …  254.408  244.913  250.947  247.674
 254.587  253.329  254.237  248.94      251.282  242.236  252.369  255.994
 253.114  258.378  255.489  253.147     259.558  255.118  262.077  259.47 
 249.827  245.603  236.589  237.523     244.647  233.012  240.364  245.979
 248.366  251.231  245.83   239.223     249.691  238.103  247.95   254.121
 243.707  237.995  239.737  235.548  …  245.393  236.626  240.023  244.926
 250.404  252.036  249.821  245.982     257.575  240.942  252.942  257.076
 239.952  239.684  232.082  232.713     235.774  227.879  234.529  236.9

In [12]:
@time res =A *A;

  0.474895 seconds (378.52 k allocations: 16.890 MB, 1.24% gc time)


#### Bring computation from the GPU to the CPU: Just Array(AFArray)

In [13]:
host_to_device = AFArray(rand(100,100));
device_to_host = Array(host_to_device);

# Little Example with MNIST and a softmax


- More details about softmax http://cs231n.github.io/linear-classify/#softmax
- http://cs231n.github.io/neural-networks-case-study/

In [14]:
using MNIST

In [15]:
X_train, y_train = MNIST.traindata()
X_test, y_test = MNIST.testdata()
n_features = size(X_train)[1]

T = Float32
X_train = Array{T}( (X_train - minimum(X_train))/(maximum(X_train) - minimum(X_train)) )
y_train = Array{Int32}(y_train) + 1
X_test = Array{T}(X_test - minimum(X_test))/(maximum(X_test) - minimum(X_test)) 
y_test = Array{Int32}(y_test) + 1 ;

In [16]:
n_features

784

In [17]:
g_X_train = AFArray(X_train)
g_y_train = AFArray(y_train)
g_X_test = AFArray(X_test)
g_y_test = AFArray(y_test);

In [18]:
type Softmax
    W
    b
end

In [19]:
batch_size = 500
n_classes = 10

W = Array{T}(randn(n_classes, n_features)/10);
b =  Array{T}(zeros(n_classes))

g_W = AFArray(W)
g_b = AFArray(b);

s = Softmax(g_W,g_b)

Softmax(Float32[0.118046 -0.0414824 … -0.123881 0.0902521; -0.0550601 0.144616 … 0.0101098 0.105457; … ; -0.0110981 -0.0705758 … 0.00348977 -0.0848718; 0.160865 -0.0399483 … -0.0365819 0.0367972],Float32[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

In [20]:
function score(s::Softmax, minibatch )
    return s.W * minibatch .+ s.b
end

score (generic function with 1 method)

In [21]:
function probability(scores)
    return exp.(scores) ./ sum( exp.(scores), 2)
end

probability (generic function with 1 method)

In [22]:
batch_size = 25

25

In [23]:
X_minibatch = g_X_train[:,1:batch_size]
y_minibatch = g_y_train[1:batch_size];

In [24]:
sc = score(s, X_minibatch)

10×25 ArrayFire.AFArray{Float32,2}:
 -1.54075    -0.455529   0.0298911  …  -1.5243     -0.2878     -0.745876
  0.0564713  -0.635052   0.856473      -0.730749   -0.62796    -0.642135
  1.18439     1.13053    0.455516      -0.104623   -1.1595      0.754017
  0.176526    1.07941   -0.0772081      0.137638   -0.175537   -0.232986
  0.785931    0.513322  -0.982625      -0.55731    -0.247772    0.760794
  1.60452    -0.504355  -0.75444    …   0.638736   -0.609212    1.02341 
  0.377698    0.175181   0.367915       0.37969     0.899522    1.03421 
 -1.1851     -1.68758   -0.371822      -0.842797   -0.188103   -0.452986
 -0.327628    0.326841   0.591464      -0.414518    0.443976   -0.2453  
  0.305432   -0.464481  -0.714248      -0.0383462   0.0561763   0.872214

In [25]:
probs = probability(sc)

10×25 ArrayFire.AFArray{Float32,2}:
 0.0162     0.0479534  0.0779174  …  0.0164687  0.0567104   0.0358693
 0.0486159  0.0243475  0.108197      0.0221255  0.0245207   0.0241756
 0.0915028  0.0867048  0.0441455     0.0252128  0.00877998  0.059501 
 0.0236964  0.0584522  0.018386      0.0227926  0.0166641   0.0157338
 0.0851088  0.064801   0.0145178     0.0222133  0.0302721   0.0829961
 0.107603   0.0130602  0.0101705  …  0.0409628  0.0117601   0.06018  
 0.0361699  0.029539   0.0358178     0.0362421  0.0609499   0.0697377
 0.0186809  0.0113024  0.0421308     0.0263062  0.0506277   0.0388464
 0.0381182  0.073344   0.0955628     0.0349459  0.0824585   0.0413892
 0.0425714  0.0197128  0.0153559     0.0301868  0.0331793   0.0750357

We now have an array probs where each col now contains the class probabilities.

In particular, since we’ve normalized them every row now sums to one. We can now query for the log probabilities assigned to the correct classes in each example:

    [probs[i,j] for (i,j) in zip(minibatch_true_classes,minibatch_indicies)]

In [26]:
probs[:,1:4]

10×4 ArrayFire.AFArray{Float32,2}:
 0.0162     0.0479534  0.0779174  0.0486526 
 0.0486159  0.0243475  0.108197   0.0271717 
 0.0915028  0.0867048  0.0441455  0.00735118
 0.0236964  0.0584522  0.018386   0.0161423 
 0.0851088  0.064801   0.0145178  0.0254832 
 0.107603   0.0130602  0.0101705  0.0134755 
 0.0361699  0.029539   0.0358178  0.0468475 
 0.0186809  0.0113024  0.0421308  0.0643172 
 0.0381182  0.073344   0.0955628  0.0867942 
 0.0425714  0.0197128  0.0153559  0.0391515 

In [27]:
y_train[1:4]

4-element Array{Int32,1}:
 6
 1
 5
 2

In [28]:
[probs[i,j] for (i,j) in zip(y_train[1:4],1:4)]

4-element Array{Float32,1}:
 0.107603 
 0.0479534
 0.0145178
 0.0271717

- The array correct_logprobs is a Vector of just the probabilities assigned to the correct classes for each example.

- The full loss is then the average of these log probabilities and the regularization loss:



In [29]:
logprobs_correct_classes = [-log(probs[i,j]) for (i,j) in zip(y_minibatch, 1:length(y_minibatch))]
data_loss = sum(logprobs_correct_classes)/length(logprobs_correct_classes)

3.6681707f0

#### Gradient of the loss

In [30]:
n_classes = 10
indicies_to_modify = [ y + n_classes*(i-1) for (i,y) in enumerate(y_minibatch)];

In [31]:
indicies_to_modify'

1×25 Array{Int64,2}:
 6  11  25  32  50  53  62  74  82  95  …  187  200  205  211  230  232  242

In [32]:
dscores = zeros(probs)
dscores .= probs

10×25 Array{Float32,2}:
 0.0162     0.0479534  0.0779174  …  0.0164687  0.0567104   0.0358693
 0.0486159  0.0243475  0.108197      0.0221255  0.0245207   0.0241756
 0.0915028  0.0867048  0.0441455     0.0252128  0.00877998  0.059501 
 0.0236964  0.0584522  0.018386      0.0227926  0.0166641   0.0157338
 0.0851088  0.064801   0.0145178     0.0222133  0.0302721   0.0829961
 0.107603   0.0130602  0.0101705  …  0.0409628  0.0117601   0.06018  
 0.0361699  0.029539   0.0358178     0.0362421  0.0609499   0.0697377
 0.0186809  0.0113024  0.0421308     0.0263062  0.0506277   0.0388464
 0.0381182  0.073344   0.0955628     0.0349459  0.0824585   0.0413892
 0.0425714  0.0197128  0.0153559     0.0301868  0.0331793   0.0750357

In [33]:
dscores[24*10+1] # first position last example

0.03586931f0

In [34]:
dscores[10*25] # last position from last example in the array

0.07503572f0

Go to every position to be modified (the ones with the correct class) and substract 1

In [35]:
dscores[indicies_to_modify] -= 1;

In [36]:
dscores ./= length(y_minibatch)

10×25 Array{Float32,2}:
  0.000648     -0.0380819     0.0031167    …   0.00226842    0.00143477 
  0.00194464    0.000973898   0.00432787      -0.0390192    -0.039033   
  0.00366011    0.00346819    0.00176582       0.000351199   0.00238004 
  0.000947855   0.00233809    0.000735439      0.000666565   0.000629351
  0.00340435    0.00259204   -0.0394193        0.00121089    0.00331984 
 -0.0356959     0.00052241    0.000406818  …   0.000470405   0.0024072  
  0.0014468     0.00118156    0.00143271       0.002438      0.00278951 
  0.000747235   0.000452097   0.00168523       0.00202511    0.00155386 
  0.00152473    0.00293376    0.00382251       0.00329834    0.00165557 
  0.00170285    0.000788513   0.000614238      0.00132717    0.00300143 

In [37]:
size(X_minibatch), size(dscores'), size(X_minibatch*dscores'), size(s.W)

((784,25),(25,10),(784,10),(10,784))

In [38]:
size(dscores),size(X_minibatch')

((10,25),(25,784))

In [39]:
nabla_W = A_mul_Bt(dscores, X_minibatch);
nabla_b = vec(sum(dscores,2))

10-element Array{Float32,1}:
 -0.04      
 -0.2       
 -0.04      
 -0.08      
 -0.08      
 -0.04      
 -0.04      
  1.97906f-9
 -1.86265f-9
 -0.08      

In [40]:
size(nabla_W)

(10,784)

In [41]:
function gradient_softmax(s::Softmax, X_minibatch, y_minibatch::AbstractVector)
    n_classes = length(s.b)
    n_samples = length(y_minibatch)
    
    scores = score(s, X_minibatch)
    probs = probability(sc)

    indicies_to_modify = [y + n_classes*(i-1) for (i,y) in enumerate(y_minibatch)]
    dscores = probs
    dscores[indicies_to_modify] -= 1;
    dscores ./= length(y_minibatch)

    nabla_W = A_mul_Bt(dscores, X_minibatch)
    nabla_b = vec(sum(dscores,2))    
    return nabla_W, nabla_b, data_loss
end

gradient_softmax (generic function with 1 method)

In [42]:
@time  gradient_softmax(s, X_minibatch, y_minibatch);

  0.465763 seconds (340.78 k allocations: 14.867 MB, 1.15% gc time)


In [96]:
function gradient_softmax2(s::Softmax, X_minibatch::AbstractMatrix, Y_minibatch::AbstractMatrix)
    n_classes = length(s.b)
    n_samples = length(y_minibatch)
    
    probs = probability(score(s, X_minibatch))
    dscores = probs - Y_minibatch

    data_loss = -sum( Y_minibatch.*log(probs))/n_samples
    nabla_W = A_mul_Bt(dscores, X_minibatch)/n_samples
    nabla_b = vec(sum(dscores,2))/n_samples
    
    return nabla_W, nabla_b, data_loss
end



gradient_softmax2 (generic function with 1 method)

In [44]:
function one_hot_encoding(y_train)
    unique_classes = sort(unique(y_train))
    class_to_pos = Dict(class =>pos for (pos,class) in enumerate(unique_classes))    
    encoded_classes = zeros(length(unique_classes), length(y_train))
    for (i,y) in enumerate(y_train)
        encoded_classes[class_to_pos[y],i] = 1
    end
    return encoded_classes
end

one_hot_encoding (generic function with 1 method)

In [45]:
Y_train = Array{Float32}(one_hot_encoding(y_train))
Y_train = AFArray(Y_train);

In [48]:
@time  gradient_softmax(s, X_minibatch, Y_train[:,1:25]);

  0.002316 seconds (161 allocations: 4.000 KB)


### Learning with the gradient

In [89]:
batch_size = 500
n_classes = 10

W = Array{T}(randn(n_classes, n_features)/10);
b =  Array{T}(zeros(n_classes))

g_W = AFArray(W)
g_b = AFArray(b);

s = Softmax(g_W,g_b)

Softmax(Float32[-0.0736793 -0.114804 … -0.0415704 -0.0521807; -0.12795 -0.0562833 … 0.0377865 -0.00850028; … ; -0.00561452 -0.0683079 … -0.0322009 -0.0155293; 0.0947887 -0.036649 … 0.0755049 0.00797631],Float32[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

In [99]:
X_minibatch = X_train[:,1:100]
Y_minibatch = Y_train[:,1:100];

In [113]:
@time gradient_softmax2(s, X_minibatch, Y_minibatch)

  2.821758 seconds (6.03 M allocations: 109.619 MB, 0.55% gc time)


(
Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

Float32[-0.48,-0.52,-0.2,-0.4,-0.4,-0.16,-0.4,-0.36,-0.28,-0.4],18.843307f0)

In [91]:
print("\nnorm(s.W): ", norm(s.W))
nabla_W, nabla_b, data_loss = gradient_softmax(s, X_minibatch, Y_minibatch)
s.W .-= lr .* nabla_W
s.b .-= lr .* nabla_b
print("\nloss: ", data_loss)
print("\nnorm(s.W): ", norm(s.W))


norm(s.W): 8.846954
loss: 19.145744
norm(s.W): 8.846323

In [None]:
print("\nnorm(s.W): ", norm(s.W))
nabla_W, nabla_b, data_loss = gradient_softmax(s, X_minibatch, Y_minibatch)
s.W .-= lr .* nabla_W
s.b .-= lr .* nabla_b
print("\nloss: ", data_loss)
print("\nnorm(s.W): ", norm(s.W))

In [None]:
batch_size = 500
n_classes = 10

W = Array{T}(randn(n_classes, n_features)/10);
b =  Array{T}(zeros(n_classes))
g_W = AFArray(W)
g_b = AFArray(b);
s = Softmax(g_W,g_b)

lr = Float32(0.05)
print_every = 10
n_samples = size(X_minibatch)[2]

for i in 1:15
    nabla_W, nabla_b, data_loss = gradient_softmax(s, X_minibatch, Y_minibatch)
    s.W .-= lr .* nabla_W
    s.b .-= lr .* nabla_b
    print("\niter: ", i , "  loss: ", data_loss)
end

### ?? What's up with the cost?

In [125]:
@time gradient_softmax(s, X_minibatch, Y_minibatch)

  0.000827 seconds (154 allocations: 130.156 KB)


(
Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

Float32[-0.8,-1.0,-0.76,-0.8,-0.8,-0.48,-0.72,-0.8,-0.56,-0.88],39.98738f0)

In [123]:
batch_size = 200
n_classes = 10

W = Array{T}(randn(n_classes, n_features)/10);
b =  Array{T}(zeros(n_classes))
s = Softmax(W,b)

X_minibatch = X_train[:,1:batch_size]
Y_minibatch = Array{Float32}(one_hot_encoding(y_train))[:,1:batch_size];

lr = Float32(0.01)
print_every = 10 
n_samples = size(X_minibatch)[2]

for i in 1:15
    nabla_W, nabla_b, data_loss = gradient_softmax(s, X_minibatch, Y_minibatch)
    s.W .-= lr .* nabla_W
    s.b .-= lr .* nabla_b
    print("\niter: ", i , "  loss: ", data_loss)
end


iter: 1  loss: 45.00461
iter: 2  loss: 44.12252
iter: 3  loss: 43.31262
iter: 4  loss: 42.578476
iter: 5  loss: 41.923416
iter: 6  loss: 41.35017
iter: 7  loss: 40.860565
iter: 8  loss: 40.45524
iter: 9  loss: 40.133533
iter: 10  loss: 39.893642
iter: 11  loss: 39.73279
iter: 12  loss: 39.64762
iter: 13  loss: 39.634396
iter: 14  loss: 39.68924
iter: 15  loss: 39.808212

### Spiral data

In [None]:
N = 100 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
X = np.zeros((N*K,D)) # data matrix (each row = single example)
y = np.zeros(N*K, dtype='uint8') # class labels

num_examples = X.shape[0]


for j in range(K):
  ix = range(N*j,N*(j+1))
  r = np.linspace(0.0,1,N) # radius
  t = np.linspace(j*4,(j+1)*4,N) + np.random.randn(N)*0.2 # theta
  X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
  y[ix] = j
end
# lets visualize the data:
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)