# ArrayFire

In [2]:
using ArrayFire
using BenchmarkTools

In [3]:
getAvailableBackends()

CPU and OpenCL


In [4]:
ArrayFire.AFInfo()

ArrayFire v3.3.2 (OpenCL, 64-bit Mac OSX, build f65dd97)
[0] APPLE   : AMD Radeon HD - FirePro D300 Compute Engine, 2048 MB
-1- APPLE   : AMD Radeon HD - FirePro D300 Compute Engine, 2048 MB


In [5]:
ArrayFire.AF_BACKEND_DEFAULT, ArrayFire.AF_BACKEND_CPU, ArrayFire.AF_BACKEND_OPENCL

(0x00000000,0x00000001,0x00000004)

#### Switch backends for computations during execution:  ```setBackend``` method

In [6]:
setBackend(AF_BACKEND_CPU) #Switch back to CPU backend
getActiveBackend()

CPU Backend


In [7]:
ArrayFire.AFInfo()

ArrayFire v3.3.2 (CPU, 64-bit Mac OSX, build f65dd97)
[0] Unknown: Unknown, 32768 MB, Max threads(1) 


In [8]:
setBackend(AF_BACKEND_OPENCL) #Switch to OPENCL backend
getActiveBackend()

OpenCL Backend


In [9]:
ArrayFire.AFInfo()

ArrayFire v3.3.2 (OpenCL, 64-bit Mac OSX, build f65dd97)
[0] APPLE   : AMD Radeon HD - FirePro D300 Compute Engine, 2048 MB
-1- APPLE   : AMD Radeon HD - FirePro D300 Compute Engine, 2048 MB


## Using AFArray Arrays

In [10]:
A = Array{Float32}(rand(1000,1000))

1000×1000 Array{Float32,2}:
 0.571312  0.697011  0.879106   …  0.400494   0.811225    0.384668  
 0.183041  0.668708  0.0602257     0.0962854  0.240068    0.615246  
 0.419585  0.44442   0.121419      0.566939   0.0801314   0.574808  
 0.341526  0.387929  0.619143      0.599713   0.00970275  0.596939  
 0.35592   0.941023  0.301473      0.052276   0.670344    0.637076  
 0.945409  0.642618  0.161548   …  0.652432   0.842524    0.278593  
 0.200232  0.603488  0.390178      0.984458   0.0719123   0.0824463 
 0.780205  0.342922  0.325967      0.745449   0.776394    0.443762  
 0.171331  0.128021  0.889548      0.878686   0.60608     0.17696   
 0.797794  0.129769  0.1165        0.306394   0.614774    0.87428   
 0.585677  0.567981  0.541071   …  0.594755   0.354125    0.620547  
 0.844262  0.255532  0.0530294     0.0879742  0.531512    0.704695  
 0.103627  0.599612  0.983694      0.0529969  0.361663    0.978394  
 ⋮                              ⋱                                   
 0.560

In [11]:
Agpu = AFArray(A)

1000×1000 ArrayFire.AFArray{Float32,2}:
 0.571312  0.697011  0.879106   …  0.400494   0.811225    0.384668  
 0.183041  0.668708  0.0602257     0.0962854  0.240068    0.615246  
 0.419585  0.44442   0.121419      0.566939   0.0801314   0.574808  
 0.341526  0.387929  0.619143      0.599713   0.00970275  0.596939  
 0.35592   0.941023  0.301473      0.052276   0.670344    0.637076  
 0.945409  0.642618  0.161548   …  0.652432   0.842524    0.278593  
 0.200232  0.603488  0.390178      0.984458   0.0719123   0.0824463 
 0.780205  0.342922  0.325967      0.745449   0.776394    0.443762  
 0.171331  0.128021  0.889548      0.878686   0.60608     0.17696   
 0.797794  0.129769  0.1165        0.306394   0.614774    0.87428   
 0.585677  0.567981  0.541071   …  0.594755   0.354125    0.620547  
 0.844262  0.255532  0.0530294     0.0879742  0.531512    0.704695  
 0.103627  0.599612  0.983694      0.0529969  0.361663    0.978394  
 ⋮                              ⋱                              

In [12]:
@time res = Array(Agpu*Agpu)

  0.360405 seconds (9.40 k allocations: 4.190 MB)


1000×1000 Array{Float32,2}:
 252.973  255.343  262.824  254.598  …  254.989  251.326  249.514  258.124
 243.001  242.861  245.759  243.964     244.97   245.308  240.241  239.535
 253.616  248.869  252.994  254.093     247.616  241.224  241.607  248.999
 258.079  249.724  260.796  255.834     253.508  241.478  245.729  250.739
 251.993  248.074  255.303  252.834     247.095  246.556  239.484  250.107
 257.595  254.186  260.427  252.698  …  255.043  251.096  247.379  255.697
 248.132  252.39   253.137  250.689     241.508  239.179  241.867  247.722
 252.646  261.812  258.734  256.198     251.428  253.451  250.696  257.861
 251.683  248.79   251.288  248.509     252.75   251.854  240.841  255.1  
 248.797  237.538  250.509  246.736     242.254  240.938  244.611  241.611
 253.103  249.476  255.488  253.429  …  250.051  245.918  244.799  253.219
 251.404  247.981  249.552  249.5       250.045  244.938  244.55   244.835
 258.227  251.609  253.115  255.154     252.901  240.481  245.944  256.5

In [13]:
@time res =A *A;

  0.455368 seconds (378.12 k allocations: 16.868 MB, 1.26% gc time)


#### Bring computation from the GPU to the CPU: Just Array(AFArray)

In [14]:
host_to_device = AFArray(rand(100,100));
device_to_host = Array(host_to_device);

# Little Example with MNIST and a softmax


- More details about softmax http://cs231n.github.io/linear-classify/#softmax
- http://cs231n.github.io/neural-networks-case-study/

In [15]:
using MNIST

In [16]:
X_train, y_train = MNIST.traindata()
X_test, y_test = MNIST.testdata()
n_features = size(X_train)[1]

T = Float32
X_train = Array{T}( (X_train - minimum(X_train))/(maximum(X_train) - minimum(X_train)) )
y_train = Array{Int32}(y_train) + 1
X_test = Array{T}(X_test - minimum(X_test))/(maximum(X_test) - minimum(X_test)) 
y_test = Array{Int32}(y_test) + 1 ;

In [17]:
n_features

784

In [18]:
g_X_train = AFArray(X_train)
g_y_train = AFArray(y_train)
g_X_test = AFArray(X_test)
g_y_test = AFArray(y_test);

In [19]:
type Softmax
    W
    b
end

In [20]:
batch_size = 500
n_classes = 10

W = Array{T}(randn(n_classes, n_features)/10);
b =  Array{T}(zeros(n_classes))

g_W = AFArray(W)
g_b = AFArray(b);

s = Softmax(g_W,g_b)

Softmax(Float32[0.0622508 0.0666868 … 0.0387792 0.115878; -0.170409 -0.0260312 … -0.0186933 -0.0612406; … ; -0.11641 0.037459 … -0.0264259 -0.0925197; 0.263111 -0.0593193 … -0.167753 -0.107716],Float32[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

In [21]:
function score(s::Softmax, minibatch )
    return s.W * minibatch .+ s.b
end

score (generic function with 1 method)

In [22]:
function probability(scores)
    return exp.(scores) ./ sum( exp.(scores), 2)
end

probability (generic function with 1 method)

In [23]:
batch_size = 25

25

In [24]:
X_minibatch = g_X_train[:,1:batch_size]
y_minibatch = g_y_train[1:batch_size];

In [25]:
sc = score(s, X_minibatch)

10×25 ArrayFire.AFArray{Float32,2}:
  0.437946    1.89439   -0.545094   …   0.601925    0.16521    -0.00559358
 -0.998376   -2.25608    0.0664899      0.0650496  -0.944304   -1.8298    
 -1.17351    -1.56137    0.367401      -0.450058    0.219386    0.405007  
  0.0604159   0.117146  -2.21337        0.169649    0.0888343   0.499621  
  2.19321     2.9        0.549265       1.10683     0.609013    1.55207   
  0.464647   -0.163695   0.824697   …  -0.98627    -0.70309     0.204114  
 -0.305093   -0.346913  -0.0929131     -1.05043    -0.645083   -0.587201  
  1.42406     0.38135    0.468984       1.10203     0.725008    1.53904   
  1.30162     1.61071    0.719225      -0.209154    0.547341    0.024211  
  0.367061   -0.704601   0.583259       1.13642     0.0245456   0.747804  

In [26]:
probs = probability(sc)

10×25 ArrayFire.AFArray{Float32,2}:
 0.0335234  0.143838    0.0125435   …  0.0394969  0.0255211  0.021514 
 0.0321258  0.00913354  0.0931793      0.0930452  0.0339107  0.0139886
 0.0160636  0.0108992   0.0749983      0.0331157  0.0646799  0.0778724
 0.0449686  0.0475935   0.00462827     0.050159   0.0462649  0.0697676
 0.0664078  0.13464     0.0128311      0.0224085  0.0136211  0.0349766
 0.0656902  0.0350441   0.0941604   …  0.0153949  0.0204342  0.0506236
 0.0507877  0.0487075   0.0627924      0.0241024  0.0361495  0.0383037
 0.0549219  0.0193598   0.021133       0.0398005  0.0272993  0.0616139
 0.0504634  0.0687408   0.0281869      0.0111393  0.0237355  0.0140672
 0.0221116  0.00757188  0.0274482      0.0477252  0.0156989  0.0323575

We now have an array probs where each col now contains the class probabilities.

In particular, since we’ve normalized them every row now sums to one. We can now query for the log probabilities assigned to the correct classes in each example:

    [probs[i,j] for (i,j) in zip(minibatch_true_classes,minibatch_indicies)]

In [27]:
probs[:,1:4]

10×4 ArrayFire.AFArray{Float32,2}:
 0.0335234  0.143838    0.0125435   0.034166 
 0.0321258  0.00913354  0.0931793   0.032665 
 0.0160636  0.0108992   0.0749983   0.0546305
 0.0449686  0.0475935   0.00462827  0.0455214
 0.0664078  0.13464     0.0128311   0.0145251
 0.0656902  0.0350441   0.0941604   0.016326 
 0.0507877  0.0487075   0.0627924   0.0346881
 0.0549219  0.0193598   0.021133    0.020089 
 0.0504634  0.0687408   0.0281869   0.0203961
 0.0221116  0.00757188  0.0274482   0.0147299

In [28]:
y_train[1:4]

4-element Array{Int32,1}:
 6
 1
 5
 2

In [29]:
[probs[i,j] for (i,j) in zip(y_train[1:4],1:4)]

4-element Array{Float32,1}:
 0.0656902
 0.143838 
 0.0128311
 0.032665 

- The array correct_logprobs is a Vector of just the probabilities assigned to the correct classes for each example.

- The full loss is then the average of these log probabilities and the regularization loss:



In [30]:
logprobs_correct_classes = [-log(probs[i,j]) for (i,j) in zip(y_minibatch, 1:length(y_minibatch))]
data_loss = sum(logprobs_correct_classes)/length(logprobs_correct_classes)

3.3448548f0

#### Gradient of the loss

In [31]:
n_classes = 10
indicies_to_modify = [ y + n_classes*(i-1) for (i,y) in enumerate(y_minibatch)];

In [32]:
indicies_to_modify'

1×25 Array{Int64,2}:
 6  11  25  32  50  53  62  74  82  95  …  187  200  205  211  230  232  242

In [33]:
dscores = zeros(probs)
dscores .= probs

10×25 Array{Float32,2}:
 0.0335234  0.143838    0.0125435   …  0.0394969  0.0255211  0.021514 
 0.0321258  0.00913354  0.0931793      0.0930452  0.0339107  0.0139886
 0.0160636  0.0108992   0.0749983      0.0331157  0.0646799  0.0778724
 0.0449686  0.0475935   0.00462827     0.050159   0.0462649  0.0697676
 0.0664078  0.13464     0.0128311      0.0224085  0.0136211  0.0349766
 0.0656902  0.0350441   0.0941604   …  0.0153949  0.0204342  0.0506236
 0.0507877  0.0487075   0.0627924      0.0241024  0.0361495  0.0383037
 0.0549219  0.0193598   0.021133       0.0398005  0.0272993  0.0616139
 0.0504634  0.0687408   0.0281869      0.0111393  0.0237355  0.0140672
 0.0221116  0.00757188  0.0274482      0.0477252  0.0156989  0.0323575

In [34]:
dscores[24*10+1] # first position last example

0.021514006f0

In [35]:
dscores[10*25] # last position from last example in the array

0.03235753f0

Go to every position to be modified (the ones with the correct class) and substract 1

In [36]:
dscores[indicies_to_modify] -= 1;

In [37]:
dscores ./= length(y_minibatch)

10×25 Array{Float32,2}:
  0.00134094   -0.0342465     0.00050174   …   0.00102085    0.00086056 
  0.00128503    0.000365342   0.00372717      -0.0386436    -0.0394405  
  0.000642544   0.000435969   0.00299993       0.0025872     0.0031149  
  0.00179875    0.00190374    0.000185131      0.0018506     0.00279071 
  0.00265631    0.00538559   -0.0394868        0.000544843   0.00139906 
 -0.0373724     0.00140176    0.00376641   …   0.00081737    0.00202494 
  0.00203151    0.0019483     0.00251169       0.00144598    0.00153215 
  0.00219688    0.000774394   0.00084532       0.00109197    0.00246456 
  0.00201854    0.00274963    0.00112747       0.000949421   0.000562686
  0.000884465   0.000302875   0.00109793       0.000627954   0.0012943  

In [38]:
size(X_minibatch), size(dscores'), size(X_minibatch*dscores'), size(s.W)

((784,25),(25,10),(784,10),(10,784))

In [39]:
size(dscores),size(X_minibatch')

((10,25),(25,784))

In [40]:
nabla_W = A_mul_Bt(dscores, X_minibatch);
nabla_b = vec(sum(dscores,2))

10-element Array{Float32,1}:
 -0.04      
 -0.2       
 -0.04      
 -0.08      
 -0.08      
 -0.04      
 -0.04      
  2.56114f-9
  2.96859f-9
 -0.08      

In [41]:
size(nabla_W)

(10,784)

In [42]:
function gradient_softmax(s::Softmax, X_minibatch, y_minibatch::AbstractVector)
    n_classes = length(s.b)
    n_samples = length(y_minibatch)
    
    scores = score(s, X_minibatch)
    probs = probability(sc)

    indicies_to_modify = [y + n_classes*(i-1) for (i,y) in enumerate(y_minibatch)]
    dscores = probs
    dscores[indicies_to_modify] -= 1;
    dscores ./= length(y_minibatch)

    nabla_W = A_mul_Bt(dscores, X_minibatch)
    nabla_b = vec(sum(dscores,2))    
    return nabla_W, nabla_b, data_loss
end

gradient_softmax (generic function with 1 method)

In [43]:
@time  gradient_softmax(s, X_minibatch, y_minibatch);

  0.794697 seconds (341.44 k allocations: 14.944 MB, 0.62% gc time)


In [44]:
function gradient_softmax2(s::Softmax, X_minibatch::AbstractMatrix, Y_minibatch::AbstractMatrix)
    n_classes = length(s.b)
    n_samples = length(y_minibatch)
    
    probs = probability(score(s, X_minibatch))
    dscores = probs - Y_minibatch

    data_loss = -sum( Y_minibatch.*log(probs))/n_samples
    nabla_W = A_mul_Bt(dscores, X_minibatch)/n_samples
    nabla_b = vec(sum(dscores,2))/n_samples
    
    return nabla_W, nabla_b, data_loss
end

gradient_softmax2 (generic function with 1 method)

In [45]:
function one_hot_encoding(y_train)
    unique_classes = sort(unique(y_train))
    class_to_pos = Dict(class =>pos for (pos,class) in enumerate(unique_classes))    
    encoded_classes = zeros(length(unique_classes), length(y_train))
    for (i,y) in enumerate(y_train)
        encoded_classes[class_to_pos[y],i] = 1
    end
    return encoded_classes
end

one_hot_encoding (generic function with 1 method)

In [46]:
Y_train = Array{Float32}(one_hot_encoding(y_train))
Y_train = AFArray(Y_train);

In [47]:
@time  gradient_softmax(s, X_minibatch, Y_train[:,1:25]);

LoadError: MethodError: no method matching gradient_softmax(::Softmax, ::ArrayFire.AFArray{Float32,2}, ::ArrayFire.AFArray{Float32,2})[0m
Closest candidates are:
  gradient_softmax(::Softmax, ::Any, [1m[31m::AbstractArray{T,1}[0m) at In[42]:2[0m

### Learning with the gradient

In [48]:
batch_size = 500
n_classes = 10

W = Array{T}(randn(n_classes, n_features)/10);
b =  Array{T}(zeros(n_classes))

g_W = AFArray(W)
g_b = AFArray(b);

s = Softmax(g_W,g_b)

Softmax(Float32[0.0327769 -0.125568 … 0.204383 0.00964492; -0.0547601 -0.129532 … -0.0646045 -0.112268; … ; -0.121336 -0.253113 … 0.154665 0.352933; 0.114556 -0.200836 … -0.0107951 -0.0830136],Float32[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

In [49]:
X_minibatch = X_train[:,1:100]
Y_minibatch = Y_train[:,1:100];

In [50]:
@time gradient_softmax2(s, X_minibatch, Y_minibatch)

  2.954621 seconds (6.57 M allocations: 126.905 MB, 0.80% gc time)


(
Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

Float32[-0.48,-0.52,-0.2,-0.4,-0.4,-0.16,-0.4,-0.36,-0.28,-0.4],18.89552f0)

In [51]:
print("\nnorm(s.W): ", norm(s.W))
nabla_W, nabla_b, data_loss = gradient_softmax(s, X_minibatch, Y_minibatch)
s.W .-= lr .* nabla_W
s.b .-= lr .* nabla_b
print("\nloss: ", data_loss)
print("\nnorm(s.W): ", norm(s.W))


norm(s.W): 8.891143

LoadError: MethodError: no method matching gradient_softmax(::Softmax, ::Array{Float32,2}, ::ArrayFire.AFArray{Float32,2})[0m
Closest candidates are:
  gradient_softmax(::Softmax, ::Any, [1m[31m::AbstractArray{T,1}[0m) at In[42]:2[0m

In [52]:
print("\nnorm(s.W): ", norm(s.W))
nabla_W, nabla_b, data_loss = gradient_softmax(s, X_minibatch, Y_minibatch)
s.W .-= lr .* nabla_W
s.b .-= lr .* nabla_b
print("\nloss: ", data_loss)
print("\nnorm(s.W): ", norm(s.W))


norm(s.W): 8.891143

LoadError: MethodError: no method matching gradient_softmax(::Softmax, ::Array{Float32,2}, ::ArrayFire.AFArray{Float32,2})[0m
Closest candidates are:
  gradient_softmax(::Softmax, ::Any, [1m[31m::AbstractArray{T,1}[0m) at In[42]:2[0m

In [None]:
batch_size = 500
n_classes = 10

W = Array{T}(randn(n_classes, n_features)/10);
b =  Array{T}(zeros(n_classes))
g_W = AFArray(W)
g_b = AFArray(b);
s = Softmax(g_W,g_b)

lr = Float32(0.05)
print_every = 10
n_samples = size(X_minibatch)[2]

for i in 1:15
    nabla_W, nabla_b, data_loss = gradient_softmax(s, X_minibatch, Y_minibatch)
    s.W .-= lr .* nabla_W
    s.b .-= lr .* nabla_b
    print("\niter: ", i , "  loss: ", data_loss)
end

### ?? What's up with the cost?

In [125]:
@time gradient_softmax(s, X_minibatch, Y_minibatch)

  0.000827 seconds (154 allocations: 130.156 KB)


(
Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

Float32[-0.8,-1.0,-0.76,-0.8,-0.8,-0.48,-0.72,-0.8,-0.56,-0.88],39.98738f0)

In [53]:
batch_size = 200
n_classes = 10

W = Array{T}(randn(n_classes, n_features)/10);
b =  Array{T}(zeros(n_classes))
s = Softmax(W,b)

X_minibatch = X_train[:,1:batch_size]
Y_minibatch = Array{Float32}(one_hot_encoding(y_train))[:,1:batch_size];

lr = Float32(0.01)
print_every = 10 
n_samples = size(X_minibatch)[2]

for i in 1:15
    nabla_W, nabla_b, data_loss = gradient_softmax(s, X_minibatch, Y_minibatch)
    s.W .-= lr .* nabla_W
    s.b .-= lr .* nabla_b
    print("\niter: ", i , "  loss: ", data_loss)
end

LoadError: MethodError: no method matching gradient_softmax(::Softmax, ::Array{Float32,2}, ::Array{Float32,2})[0m
Closest candidates are:
  gradient_softmax(::Softmax, ::Any, [1m[31m::AbstractArray{T,1}[0m) at In[42]:2[0m

### Spiral data

In [None]:
N = 100 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
X = np.zeros((N*K,D)) # data matrix (each row = single example)
y = np.zeros(N*K, dtype='uint8') # class labels

num_examples = X.shape[0]


for j in range(K):
  ix = range(N*j,N*(j+1))
  r = np.linspace(0.0,1,N) # radius
  t = np.linspace(j*4,(j+1)*4,N) + np.random.randn(N)*0.2 # theta
  X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
  y[ix] = j
end
# lets visualize the data:
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)