### 非向量化和向量化耗时比较

In [2]:
:dep burn = {version = "0.12.1", features = ["ndarray", "wgpu", "candle"]}

In [3]:
:show_deps

burn = {version = "0.12.1", features = ["ndarray", "wgpu", "candle"]}


In [4]:
use burn::backend::{NdArray, Wgpu, Candle};
use burn::tensor::{Distribution, Tensor};
use burn::backend::candle::CandleDevice;
use std::time::{Duration, Instant};

In [19]:
// Backend used by every tensor in this notebook; switch the alias to benchmark another one.
type Backend = NdArray;
// type Backend = Wgpu;

// Device handle for the selected backend (type is inferred from the `random` calls below).
let device = Default::default();
// Number of elements in each input vector.
let m: usize = 1_000_000;
// Two rank-1 tensors of length `m`, filled from `Distribution::Default`.
let a = Tensor::<Backend, 1>::random([m], Distribution::Default, &device);
let b = Tensor::<Backend, 1>::random([m], Distribution::Default, &device);
println!("a ======: {}", a);

  data:
[0.630997, 0.55161667, 0.70124835, ..., 0.77008104, 0.0076946616, 0.9957493],
  shape:  [1000000],
  device:  Cpu,
  backend:  "ndarray",
  kind:  "Float",
  dtype:  "f32",
}


In [20]:
// Baseline ("non-vectorized") dot product: an explicit scalar loop over plain Vec<f32> data.
// Running sum of the element-wise products, accumulated in f32.
let mut c: f32 = 0.0;
// Extract the tensor contents into plain vectors before the timer starts, so the
// extraction is not part of the measurement. The tensors are cloned first so that
// `a` and `b` remain available to later cells (presumably `into_data` consumes
// its receiver — confirm against the burn API).
let a_vec:Vec<f32> = a.clone().into_data().value;
let b_vec:Vec<f32> = b.clone().into_data().value;

// Only the loop itself is timed.
let start = Instant::now();
for i in 0..m {
    // Indexed access; assumes both vectors hold at least `m` elements
    // (they were created with shape [m] in the setup cell).
    c += a_vec[i] * b_vec[i];
}
let duration = start.elapsed();
println!("for所需时间为: {:?}", duration);
// NOTE(review): the recorded value differs slightly from the tensor-based result,
// presumably because of f32 rounding with a different summation order — confirm.
println!("c: {:?}", c);

for所需时间为: 1.299016ms
c: 250419.81


In [21]:
// "Vectorized" dot product: element-wise multiply and reduce using the backend's tensor ops.
// Clone outside the timed region so the clones are not measured; the originals
// `a` and `b` stay usable afterwards.
let a_clone:Tensor<Backend, 1> = a.clone();
let b_clone:Tensor<Backend, 1> = b.clone();
let start = Instant::now();
// Multiply element-wise, sum all products, then read the result back as a scalar.
// The scalar read-back happens inside the timed region.
let d = a_clone.mul(b_clone).sum().into_scalar();
let duration = start.elapsed();
println!("向量化运算所需时间为: {:?}", duration);
println!("d: {:?}", d);

向量化运算所需时间为: 1.189895ms
d: 250454.72


### 存在几个疑问
1. Rust 中向量化与非向量化（for 循环）实现的耗时非常接近
2. 数据量为 100000 时，Rust 的 ndarray 耗时 152.317µs，numpy 耗时 0.43702125549316406ms（约 437µs），此时 Rust 的 ndarray 有明显优势
3. 数据量为 1000000 时，Rust 的 ndarray 耗时 1.258799ms，numpy 耗时 1.2631416320800781ms，此时并无明显差距