In [1]:
using CUDA
using StaticArrays

In [2]:
# 360 evenly spaced angles covering [0, 2π): sample 361 points including both
# endpoints, then drop the last one so that 2π does not duplicate 0.
θ = range(0, 2π; length=361)[begin:end-1]
# Materialize the lazy range and upload it to the GPU.
θ_cu = θ |> collect |> cu

360-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 0.0
 0.017453292
 0.034906585
 0.05235988
 0.06981317
 0.08726646
 0.10471976
 0.12217305
 0.13962634
 0.15707964
 0.17453292
 0.19198622
 0.20943952
 ⋮
 6.0737457
 6.091199
 6.1086526
 6.126106
 6.143559
 6.161012
 6.1784654
 6.195919
 6.213372
 6.2308254
 6.2482786
 6.265732

In [3]:
"""
    r_smf32(t)

Return the 3×3 `Float32` static matrix that rotates by the angle `t` in the
x-y plane and leaves the third coordinate unchanged.
"""
function r_smf32(t)
    s, c = sin(t), cos(t)
    return @SMatrix Float32[
        c -s 0
        s  c 0
        0  0 1
    ]
end

"""
    f(t)

Apply `r_smf32(t)` to the `Float32` unit vector `[1, 0, 0]` and return the
resulting static vector.
"""
function f(t)
    e1 = @SVector Float32[1, 0, 0]
    return r_smf32(t) * e1
end

f (generic function with 1 method)

In [4]:
# Apply `f` elementwise to the GPU-resident angles; each element of the result
# is one rotated static vector.
broadcast(f, θ_cu)

360-element CuArray{SVector{3, Float32}, 1, CUDA.Mem.DeviceBuffer}:
 [1.0, 0.0, 0.0]
 [0.9998477, 0.017452406, 0.0]
 [0.99939084, 0.034899496, 0.0]
 [0.9986295, 0.05233596, 0.0]
 [0.9975641, 0.06975647, 0.0]
 [0.9961947, 0.08715574, 0.0]
 [0.9945219, 0.104528464, 0.0]
 [0.99254614, 0.12186935, 0.0]
 [0.99026805, 0.1391731, 0.0]
 [0.98768836, 0.15643448, 0.0]
 [0.9848077, 0.17364818, 0.0]
 [0.98162717, 0.190809, 0.0]
 [0.9781476, 0.2079117, 0.0]
 ⋮
 [0.97814757, -0.20791176, 0.0]
 [0.98162717, -0.19080916, 0.0]
 [0.9848078, -0.17364797, 0.0]
 [0.98768836, -0.15643436, 0.0]
 [0.99026805, -0.13917309, 0.0]
 [0.99254614, -0.12186943, 0.0]
 [0.99452186, -0.10452865, 0.0]
 [0.9961947, -0.08715556, 0.0]
 [0.9975641, -0.06975638, 0.0]
 [0.9986295, -0.052335963, 0.0]
 [0.99939084, -0.0348996, 0.0]
 [0.9998477, -0.017452609, 0.0]

In [5]:
using CUDA, NNlib, NNlibCUDA

In [6]:
?batched_vec

search: batched_vec batched_mul batched_mul! batched_adjoint batched_transpose



```
batched_vec(A::Array{T,3}, B::Matrix)
batched_vec(A::Array{T,3}, b::Vector)
```

Batched matrix-vector multiplication: the result has `C[:,k] == A[:,:,k] * B[:,k]` for all `k`, or else `C[:,k] == A[:,:,k] * b` for `b::Vector`.

With the same argument types, `batched_mul(A, B)` would regard `B` as a fixed matrix, not a batch of vectors. Both reshape and then call `batched_mul(::Array{T,3}, ::Array{T,3})`.

```jldoctest
julia> A, B, b = randn(16,8,32), randn(8,32), randn(8);

julia> batched_vec(A,B) |> size
(16, 32)

julia> batched_vec(A,b) |> size
(16, 32)
```


In [7]:
"""
    R(t)

Return the 3×3 matrix that rotates by the angle `t` in the x-y plane and
leaves the third coordinate unchanged.
"""
function R(t)
    s, c = sin(t), cos(t)
    return [
        c -s 0
        s  c 0
        0  0 1
    ]
end

# 360 evenly spaced angles covering [0, 2π), with the duplicate endpoint dropped.
θ = range(0, 2π; length=361)[begin:end-1]
# Build one rotation matrix per angle, lay them side by side as a 3×(3·360)
# matrix, then fold that into a 3×3×360 batch (slice k is R(θ[k])).
A = reshape(reduce(hcat, R.(θ)), 3, 3, :)
# Upload the batch to the GPU.
A_cu = A |> cu

3×3×360 CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}:
[:, :, 1] =
 1.0  -0.0  0.0
 0.0   1.0  0.0
 0.0   0.0  1.0

[:, :, 2] =
 0.999848   -0.0174524  0.0
 0.0174524   0.999848   0.0
 0.0         0.0        1.0

[:, :, 3] =
 0.999391   -0.0348995  0.0
 0.0348995   0.999391   0.0
 0.0         0.0        1.0

...

[:, :, 358] =
  0.99863   0.052336  0.0
 -0.052336  0.99863   0.0
  0.0       0.0       1.0

[:, :, 359] =
  0.999391   0.0348995  0.0
 -0.0348995  0.999391   0.0
  0.0        0.0        1.0

[:, :, 360] =
  0.999848   0.0174524  0.0
 -0.0174524  0.999848   0.0
  0.0        0.0        1.0

In [8]:
# Multiply every 3×3 slice of the batch by the same x-axis unit vector; the
# vector is uploaded once and shared across the whole batch.
let e1_cu = cu([1.0, 0.0, 0.0])
    batched_vec(A_cu, e1_cu)
end

3×360 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 1.0  0.999848   0.999391   0.99863   …   0.99863    0.999391    0.999848
 0.0  0.0174524  0.0348995  0.052336     -0.052336  -0.0348995  -0.0174524
 0.0  0.0        0.0        0.0           0.0        0.0         0.0

In [9]:
using CUDA, NNlib, NNlibCUDA

"""
    R(t)

Return the 3×3 matrix that rotates by the angle `t` in the x-y plane and
leaves the third coordinate unchanged.
"""
function R(t)
    sinθ = sin(t)
    cosθ = cos(t)
    return [cosθ -sinθ 0; sinθ cosθ 0; 0 0 1]
end

# 360 evenly spaced angles covering [0, 2π), with the duplicate endpoint dropped.
θ = range(0, 2π; length=361)[begin:end-1]
# Concatenate the per-angle rotation matrices side by side and fold the result
# into a 3×3×360 batch (slice k is R(θ[k])).
A = reshape(reduce(hcat, [R(t) for t in θ]), 3, 3, :)
# Upload both operands to the GPU and rotate the x-axis unit vector by every
# matrix in the batch.
let e1_cu = cu([1.0, 0.0, 0.0])
    batched_vec(cu(A), e1_cu)
end

3×360 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 1.0  0.999848   0.999391   0.99863   …   0.99863    0.999391    0.999848
 0.0  0.0174524  0.0348995  0.052336     -0.052336  -0.0348995  -0.0174524
 0.0  0.0        0.0        0.0           0.0        0.0         0.0

See also https://discourse.julialang.org/t/how-to-broadcast-or-batch-multiply-a-batch-of-matrices-with-another-matrix-on-the-gpu/67259