In [70]:
from sys.info import simdwidthof, simdbitwidth

In [71]:
print("host vector size", simdbitwidth())

print("simdwidth[AnyType] of various:")

print("f32", simdwidthof[DType.float32]())
print("f64", simdwidthof[DType.float64]())
print("int64", simdwidthof[DType.int64]())
print("si8", simdwidthof[DType.int8]())
print("bool", simdwidthof[DType.bool]())

host vector size 256
simdwidth[AnyType] of various:
f32 8
f64 4
int64 4
si8 32
bool 32


In [72]:
@register_passable("trivial")
struct Example[dtype: DType, dims: Int]:
    var data: SIMD[dtype, dims]

In [73]:
alias ExampleAlias = Example[DType.float32, 4]
let o = ExampleAlias{ data: SIMD[DType.float32, 4](42, 43, 44, 45) }
print(simdwidthof[ExampleAlias]())


2


In [74]:
alias ExampleAliasContd = Example[DType.float16, 16]

In [75]:
print(simdwidthof[ExampleAliasContd]())

1


In [1]:
    # fn __eq__(self, other: Self) -> Bool:
    #     """
    #     Vectorized equality check (approx 27X faster than procedural version in _sloweq)
    #     """
    #     let len = self.__len__()
    #     alias nelts = simdwidthof[Point[dtype, point_dims]]()
    #     let n = len * point_dims

    #     if len != other.__len__():
    #         return False

    #     for i in range(0, n, nelts):
    #         var self_vec = self.coords.simd_load[nelts](i)
    #         var other_vec = other.coords.simd_load[nelts](i)
    #         # the last batch of nelts may not fill the vector, so write zeros so equality check can still be valid
    #         if n - i < nelts:
    #             for j in range(n - i, n):
    #                 self_vec[j] = 0
    #                 other_vec[j] = 0
    #         let vectors_eq = self_vec == other_vec
    #         if not Bool(vectors_eq):
    #             return False

    #     return True


In [2]:
var x = SIMD[DType.float64, 4](1,2,3,4)
print(x)

[1.0, 2.0, 3.0, 4.0]


In [3]:
var x2 = SIMD[DType.float64, 2](5, 6)
print(x2)

[5.0, 6.0]


In [4]:
var y = SIMD[DType.float64, 4]()
print(y)

[0.0, 0.0, 0.0, 0.0]


In [5]:
y = x2

error: [0;1;31m[1mExpression [5]:20:9: [0m[1mcannot implicitly convert 'SIMD[f64, 2]' value to 'SIMD[f64, 4]' in assignment
[0m    y = x2
[0;1;32m        ^~
[0m[0m
expression failed to parse (no further compiler diagnostics)

In [1]:
# what's faster, simd[1] * 10, or simd[8] load?

from tensor import Tensor, TensorSpec, TensorShape
from utils.index import Index
from random import rand

def make_tensor() -> Tensor[DType.float32]:
    let height = 256
    let width = 256
    let channels = 3

    # Create the tensor of dimensions height, width, channels
    # and fill with random values.
    let image = rand[DType.float32](height, width, channels)

    # Declare the grayscale image.
    let spec = TensorSpec(DType.float32, height, width)
    var gray_scale_image = Tensor[DType.float32](spec)

    # Perform the RGB to grayscale transform.
    for y in range(height):
        for x in range(width):
            let r = image[y,x,0]
            let g = image[y,x,1]
            let b = image[y,x,2]
            gray_scale_image[Index(y,x)] = 0.299 * r + 0.587 * g + 0.114 * b

    return gray_scale_image

print(make_tensor().num_elements())

65536


In [17]:
from benchmark import Benchmark

def wrap_benchmark():
    let t = make_tensor()
    var effect1 = SIMD[DType.float32, 1](1)

    @parameter
    fn bench_load1() -> None:
        for n in range(0, 10 * 1024):
            let x = t.simd_load[1](0)
            effect1 = x

    let ns1 = Benchmark().run[bench_load1]()

    print("ignore", effect1)
 
    @parameter
    fn bench_load10():
        for n in range(0, 10):
            let x = t.simd_load[1024](0)
            effect1 = x[5]

    let ns2 = Benchmark().run[bench_load10]()
    print("ignore", effect1)

    print(ns1, "vs", ns2, ns1.__truediv__(ns2), "X speedup")

wrap_benchmark()



ignore 0.40199160575866699
ignore 0.25344225764274597
4466 vs 4 1116.5 X speedup
