Improve the GPU stuff

gyroflow · Feb 7, 2022 · 6276bc4 · 6276bc4
1 parent 7e2818d
commit 6276bc4
Show file tree

Hide file tree

Showing 6 changed files with 242 additions and 126 deletions.
diff --git a/src/core/gpu/mod.rs b/src/core/gpu/mod.rs
@@ -3,4 +3,19 @@
 
 #[cfg(feature = "use-opencl")]
 pub mod opencl;
-pub mod wgpu;
+pub mod wgpu;
+
+/// Initializes the GPU backends in priority order and returns the name
+/// reported by the first one that succeeds.
+///
+/// OpenCL is attempted first when the `use-opencl` feature is enabled, then
+/// wgpu. A backend that fails to initialize is logged as an error. Returns
+/// `None` when no backend could be initialized.
+pub fn initialize_contexts() -> Option<String> {
+    #[cfg(feature = "use-opencl")]
+    match opencl::OclWrapper::initialize_context() {
+        Ok(device_name) => return Some(device_name),
+        Err(err) => log::error!("OpenCL error: {:?}", err),
+    }
+
+    let adapter_name = wgpu::WgpuWrapper::initialize_context();
+    if adapter_name.is_none() {
+        log::error!("WGPU init error");
+    }
+    adapter_name
+}
diff --git a/src/core/gpu/opencl.rs b/src/core/gpu/opencl.rs
@@ -2,6 +2,7 @@
 // Copyright © 2021-2022 Adrian <adrian.eddy at gmail>
 
 use ocl::*;
+use parking_lot::RwLock;
 
 pub struct OclWrapper {
     kernel: Kernel,
@@ -12,11 +13,17 @@ pub struct OclWrapper {
     params_buf: Buffer<f32>,
 }
 
+/// Pairs an OpenCL device with the context that was built for it, so both can
+/// be stored together in `CONTEXT` and reused by `OclWrapper::new`.
+struct CtxWrapper {
+    device: Device,
+    context: Context,
+}
+
+// Shared OpenCL device/context slot. It starts as `None` and is filled by
+// `OclWrapper::initialize_context`; `OclWrapper::new` reads it under the lock.
+lazy_static::lazy_static! {
+    static ref CONTEXT: RwLock<Option<CtxWrapper>> = RwLock::new(None);
+}
+
 impl OclWrapper {
-    pub fn new(width: usize, height: usize, stride: usize, bytes_per_pixel: usize, output_width: usize, output_height: usize, output_stride: usize, pix_element_count: usize, ocl_names: (&str, &str, &str, &str), bg: nalgebra::Vector4<f32>) -> ocl::Result<Self> {
-
-        if height < 4 || output_height < 4 || stride < 1 { return Err(ocl::BufferCmdError::AlreadyMapped.into()); }
-
+    pub fn initialize_context() -> ocl::Result<String> {
         let platform = Platform::default();
         let device = Device::first(platform)?;
         ::log::info!("OpenCL Platform: {}, Device: {} {}", platform.name()?, device.vendor()?, device.name()?);
@@ -26,60 +33,77 @@ impl OclWrapper {
             .devices(device)
             .build()?;
 
-        let queue = Queue::new(&context, device, None)?;
+        let name = device.name();
 
-        let program = Program::builder()
-            .src(include_str!("opencl_undistort.cl"))
-            .bo(builders::BuildOpt::CmplrDefine { ident: "DATA_TYPE"    .into(), val: ocl_names.0.into() })
-            .bo(builders::BuildOpt::CmplrDefine { ident: "DATA_CONVERT" .into(), val: ocl_names.1.into() })
-            .bo(builders::BuildOpt::CmplrDefine { ident: "DATA_TYPEF"   .into(), val: ocl_names.2.into() })
-            .bo(builders::BuildOpt::CmplrDefine { ident: "DATA_CONVERTF".into(), val: ocl_names.3.into() })
-            .bo(builders::BuildOpt::CmplrDefine { ident: "PIXEL_BYTES"  .into(), val: format!("{}", bytes_per_pixel) })
-            .devices(device)
-            .build(&context)?;
-
-        let source_buffer = Buffer::builder().queue(queue.clone()).len(stride*height)
-            .flags(MemFlags::new().read_only().host_write_only()).build()?;
-
-        let dest_buffer = Buffer::builder().queue(queue.clone()).len(output_stride*output_height)
-            .flags(MemFlags::new().write_only().host_read_only().alloc_host_ptr()).build()?;
-
-        let params_len = 9 * (height + 1);
-        let params_buf = Buffer::<f32>::builder().queue(queue.clone()).flags(MemFlags::new().read_only()).len(params_len).build()?;
-
-        let mut builder = Kernel::builder();
-        unsafe {
-            builder.program(&program).name("undistort_image").queue(queue)
-            .global_work_size((output_width, output_height))
-            .disable_arg_type_check()
-            .arg(&source_buffer)
-            .arg(&dest_buffer)
-            .arg(ocl::prm::Ushort::new(width as u16))
-            .arg(ocl::prm::Ushort::new(height as u16))
-            .arg(ocl::prm::Ushort::new(stride as u16))
-            .arg(ocl::prm::Ushort::new(output_width as u16))
-            .arg(ocl::prm::Ushort::new(output_height as u16))
-            .arg(ocl::prm::Ushort::new(output_stride as u16))
-            .arg(&params_buf)
-            .arg(ocl::prm::Ushort::new(2));
-        }
+        *CONTEXT.write() = Some(CtxWrapper { device, context });
+
+        name
+    }
 
-        match pix_element_count {
-            1 => builder.arg(ocl::prm::Float::new(bg[0])),
-            2 => builder.arg(ocl::prm::Float2::new(bg[0], bg[1])),
-            3 => builder.arg(ocl::prm::Float3::new(bg[0], bg[1], bg[2])),
-            4 => builder.arg(ocl::prm::Float4::new(bg[0], bg[1], bg[2], bg[3])),
-            _ => panic!("Unknown pix_element_count {}", pix_element_count)
-        };
-        let kernel = builder.build()?;
-
-        Ok(Self {
-            pix_element_count,
-            kernel,
-            src: source_buffer,
-            dst: dest_buffer,
-            params_buf,
-        })
+    /// Builds the OpenCL queue, program, buffers and `undistort_image` kernel
+    /// for one frame geometry, using the shared device/context from `CONTEXT`.
+    ///
+    /// The shared context is initialized on demand when it has not been set up yet.
+    ///
+    /// # Errors
+    /// Returns `BufferCmdError::AlreadyMapped` (converted into `ocl::Error`) when
+    /// `height < 4`, `output_height < 4` or `stride < 1`, or when no context is
+    /// stored after initialization. Any error from context initialization or from
+    /// the OpenCL builders is propagated.
+    ///
+    /// # Panics
+    /// Panics when `pix_element_count` is not 1, 2, 3 or 4.
+    pub fn new(width: usize, height: usize, stride: usize, bytes_per_pixel: usize, output_width: usize, output_height: usize, output_stride: usize, pix_element_count: usize, ocl_names: (&str, &str, &str, &str), bg: nalgebra::Vector4<f32>) -> ocl::Result<Self> {
+        let too_small = height < 4 || output_height < 4 || stride < 1;
+        if too_small {
+            return Err(ocl::BufferCmdError::AlreadyMapped.into());
+        }
+
+        // The temporary read guard is released at the end of this statement,
+        // before `initialize_context` takes the write lock.
+        let needs_init = CONTEXT.read().is_none();
+        if needs_init {
+            Self::initialize_context()?;
+        }
+
+        let guard = CONTEXT.read();
+        let ctx = match &*guard {
+            Some(ctx) => ctx,
+            None => return Err(ocl::BufferCmdError::AlreadyMapped.into()),
+        };
+
+        let queue = Queue::new(&ctx.context, ctx.device, None)?;
+
+        // The pixel data types are injected into the kernel source as compiler defines.
+        let program = Program::builder()
+            .src(include_str!("opencl_undistort.cl"))
+            .bo(builders::BuildOpt::CmplrDefine { ident: "DATA_TYPE".into(), val: ocl_names.0.into() })
+            .bo(builders::BuildOpt::CmplrDefine { ident: "DATA_CONVERT".into(), val: ocl_names.1.into() })
+            .bo(builders::BuildOpt::CmplrDefine { ident: "DATA_TYPEF".into(), val: ocl_names.2.into() })
+            .bo(builders::BuildOpt::CmplrDefine { ident: "DATA_CONVERTF".into(), val: ocl_names.3.into() })
+            .bo(builders::BuildOpt::CmplrDefine { ident: "PIXEL_BYTES".into(), val: bytes_per_pixel.to_string() })
+            .devices(ctx.device)
+            .build(&ctx.context)?;
+
+        let src = Buffer::builder()
+            .queue(queue.clone())
+            .len(stride * height)
+            .flags(MemFlags::new().read_only().host_write_only())
+            .build()?;
+
+        let dst = Buffer::builder()
+            .queue(queue.clone())
+            .len(output_stride * output_height)
+            .flags(MemFlags::new().write_only().host_read_only().alloc_host_ptr())
+            .build()?;
+
+        let params_buf = Buffer::<f32>::builder()
+            .queue(queue.clone())
+            .flags(MemFlags::new().read_only())
+            .len(9 * (height + 1))
+            .build()?;
+
+        let mut builder = Kernel::builder();
+        unsafe {
+            // Argument order must match the kernel signature. The dimensions
+            // are passed as 16-bit values, so anything above `u16::MAX` is
+            // truncated by the `as` casts.
+            builder.program(&program).name("undistort_image").queue(queue)
+                .global_work_size((output_width, output_height))
+                .disable_arg_type_check()
+                .arg(&src)
+                .arg(&dst)
+                .arg(ocl::prm::Ushort::new(width as u16))
+                .arg(ocl::prm::Ushort::new(height as u16))
+                .arg(ocl::prm::Ushort::new(stride as u16))
+                .arg(ocl::prm::Ushort::new(output_width as u16))
+                .arg(ocl::prm::Ushort::new(output_height as u16))
+                .arg(ocl::prm::Ushort::new(output_stride as u16))
+                .arg(&params_buf)
+                .arg(ocl::prm::Ushort::new(2));
+        }
+
+        // The background colour is the final kernel argument; its vector width
+        // follows the number of elements per pixel.
+        match pix_element_count {
+            1 => builder.arg(ocl::prm::Float::new(bg[0])),
+            2 => builder.arg(ocl::prm::Float2::new(bg[0], bg[1])),
+            3 => builder.arg(ocl::prm::Float3::new(bg[0], bg[1], bg[2])),
+            4 => builder.arg(ocl::prm::Float4::new(bg[0], bg[1], bg[2], bg[3])),
+            _ => panic!("Unknown pix_element_count {}", pix_element_count)
+        };
+        let kernel = builder.build()?;
+
+        Ok(Self { pix_element_count, kernel, src, dst, params_buf })
     }
 
     pub fn set_background(&mut self, bg: nalgebra::Vector4<f32>) -> ocl::Result<()> {

diff --git a/src/core/gpu/wgpu.rs b/src/core/gpu/wgpu.rs
@@ -4,7 +4,9 @@
 use std::borrow::Cow;
 use bytemuck::Pod;
 use bytemuck::Zeroable;
+use wgpu::Adapter;
 use wgpu::BufferUsages;
+use parking_lot::RwLock;
 
 #[repr(C, align(32))]
 #[derive(Clone, Copy)]
@@ -41,83 +43,107 @@ pub struct WgpuWrapper  {
     globals: Globals
 }
 
-impl WgpuWrapper  {
-    pub fn new(width: usize, height: usize, stride: usize, bytes_per_pixel: usize, output_width: usize, output_height: usize, output_stride: usize, pix_element_count: usize, bg: nalgebra::Vector4<f32>) -> Option<Self> {
-        let params_count = 9 * (height + 1);
-
-        if height < 4 || output_height < 4 || stride < 1 { return None; }
-
-        let in_size = (stride * height) as wgpu::BufferAddress;
-        let out_size = (output_stride * output_height) as wgpu::BufferAddress;
-        let params_size = (params_count * std::mem::size_of::<f32>()) as wgpu::BufferAddress;
+// Shared wgpu adapter slot. It starts as `None` and is filled by
+// `WgpuWrapper::initialize_context`; `WgpuWrapper::new` reads it under the lock.
+lazy_static::lazy_static! {
+    static ref ADAPTER: RwLock<Option<Adapter>> = RwLock::new(None);
+}
 
+impl WgpuWrapper {
+    pub fn initialize_context() -> Option<String> {
         let instance = wgpu::Instance::new(wgpu::Backends::all());
 
         let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions::default()))?;
-
-        let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
-            label: None,
-            features: wgpu::Features::empty(),
-            limits: wgpu::Limits::default(),
-        }, None)).ok()?;
-
         let info = adapter.get_info();
         log::debug!("WGPU adapter: {:?}", &info);
 
-        let shader = device.create_shader_module(&wgpu::ShaderModuleDescriptor {
-            source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(include_str!("wgpu_undistort.wgsl"))),
-            label: None
-        });
+        let name = info.name.clone();
 
-        let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor { size: out_size, usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
-        let in_pixels      = device.create_buffer(&wgpu::BufferDescriptor { size: in_size,  usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
-        let out_pixels     = device.create_buffer(&wgpu::BufferDescriptor { size: out_size, usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC, label: None, mapped_at_creation: false });
-        let params_buffer  = device.create_buffer(&wgpu::BufferDescriptor { size: params_size, usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
+        *ADAPTER.write() = Some(adapter);
 
-        let params2_buffer  = device.create_buffer(&wgpu::BufferDescriptor { size: std::mem::size_of::<Globals>() as u64, usage: BufferUsages::UNIFORM | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
-
-        let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { module: &shader, entry_point: "undistort", label: None, layout: None });
-
-        let bind_group_layout = compute_pipeline.get_bind_group_layout(0);
-        let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
-            label: None,
-            layout: &bind_group_layout,
-            entries: &[
-                wgpu::BindGroupEntry { binding: 0, resource: in_pixels.as_entire_binding() }, 
-                wgpu::BindGroupEntry { binding: 1, resource: params_buffer.as_entire_binding() }, 
-                wgpu::BindGroupEntry { binding: 2, resource: out_pixels.as_entire_binding() },
-                wgpu::BindGroupEntry { binding: 3, resource: params2_buffer.as_entire_binding() },
-            ],
-        });
-        let globals = Globals {
-            width: width as u32,
-            height: height as u32,
-            stride: stride as u32,
-
-            output_width: output_width as u32,
-            output_height: output_height as u32,
-            output_stride: output_stride as u32,
-            bytes_per_pixel: bytes_per_pixel as u32,
-            pix_element_count: pix_element_count as u32,
-            num_params: 2,
-            bg: [bg[0], bg[1], bg[2], bg[3]]
-        };
-
-        Some(Self {
-            device,
-            queue,
-            staging_buffer,
-            out_pixels,
-            in_pixels,
-            params_buffer,
-            params2_buffer,
-            bind_group,
-            compute_pipeline,
-            in_size,
-            out_size,
-            params_size,
-            globals
-        })
+        Some(name)
+    }
+
+    /// Creates the wgpu device, buffers, compute pipeline and bind group for
+    /// one frame geometry, using the shared adapter stored in `ADAPTER`.
+    ///
+    /// The adapter is requested on demand when it has not been initialized yet.
+    ///
+    /// Returns `None` when `height < 4`, `output_height < 4` or `stride < 1`,
+    /// when no adapter is available, or when the device request fails.
+    pub fn new(width: usize, height: usize, stride: usize, bytes_per_pixel: usize, output_width: usize, output_height: usize, output_stride: usize, pix_element_count: usize, bg: nalgebra::Vector4<f32>) -> Option<Self> {
+        if height < 4 || output_height < 4 || stride < 1 {
+            return None;
+        }
+
+        let params_count = 9 * (height + 1);
+
+        let in_size = (stride * height) as wgpu::BufferAddress;
+        let out_size = (output_stride * output_height) as wgpu::BufferAddress;
+        let params_size = (params_count * std::mem::size_of::<f32>()) as wgpu::BufferAddress;
+
+        // The temporary read guard is released at the end of this statement,
+        // before `initialize_context` takes the write lock.
+        let needs_init = ADAPTER.read().is_none();
+        if needs_init {
+            Self::initialize_context();
+        }
+
+        let guard = ADAPTER.read();
+        let adapter = match &*guard {
+            Some(adapter) => adapter,
+            None => return None,
+        };
+
+        let descriptor = wgpu::DeviceDescriptor {
+            label: None,
+            features: wgpu::Features::empty(),
+            limits: wgpu::Limits::default(),
+        };
+        let (device, queue) = pollster::block_on(adapter.request_device(&descriptor, None)).ok()?;
+
+        let info = adapter.get_info();
+        log::debug!("WGPU adapter: {:?}", &info);
+
+        let shader = device.create_shader_module(&wgpu::ShaderModuleDescriptor {
+            source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(include_str!("wgpu_undistort.wgsl"))),
+            label: None
+        });
+
+        // All buffers are unlabeled and unmapped; only size and usage differ.
+        let make_buffer = |size: wgpu::BufferAddress, usage: BufferUsages| {
+            device.create_buffer(&wgpu::BufferDescriptor { size, usage, label: None, mapped_at_creation: false })
+        };
+
+        let staging_buffer = make_buffer(out_size, BufferUsages::MAP_READ | BufferUsages::COPY_DST);
+        let in_pixels = make_buffer(in_size, BufferUsages::STORAGE | BufferUsages::COPY_DST);
+        let out_pixels = make_buffer(out_size, BufferUsages::STORAGE | BufferUsages::COPY_SRC);
+        let params_buffer = make_buffer(params_size, BufferUsages::STORAGE | BufferUsages::COPY_DST);
+        let params2_buffer = make_buffer(std::mem::size_of::<Globals>() as u64, BufferUsages::UNIFORM | BufferUsages::COPY_DST);
+
+        let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { module: &shader, entry_point: "undistort", label: None, layout: None });
+
+        // The pipeline was created with `layout: None`, so the bind group
+        // layout is taken from the pipeline itself.
+        let bind_group_layout = compute_pipeline.get_bind_group_layout(0);
+        let entries = [
+            wgpu::BindGroupEntry { binding: 0, resource: in_pixels.as_entire_binding() },
+            wgpu::BindGroupEntry { binding: 1, resource: params_buffer.as_entire_binding() },
+            wgpu::BindGroupEntry { binding: 2, resource: out_pixels.as_entire_binding() },
+            wgpu::BindGroupEntry { binding: 3, resource: params2_buffer.as_entire_binding() },
+        ];
+        let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: None,
+            layout: &bind_group_layout,
+            entries: &entries,
+        });
+
+        let globals = Globals {
+            width: width as u32,
+            height: height as u32,
+            stride: stride as u32,
+
+            output_width: output_width as u32,
+            output_height: output_height as u32,
+            output_stride: output_stride as u32,
+            bytes_per_pixel: bytes_per_pixel as u32,
+            pix_element_count: pix_element_count as u32,
+            num_params: 2,
+            bg: [bg[0], bg[1], bg[2], bg[3]]
+        };
+
+        Some(Self {
+            device,
+            queue,
+            staging_buffer,
+            out_pixels,
+            in_pixels,
+            params_buffer,
+            params2_buffer,
+            bind_group,
+            compute_pipeline,
+            in_size,
+            out_size,
+            params_size,
+            globals
+        })
     }
 
     pub fn set_background(&mut self, bg: nalgebra::Vector4<f32>) {

diff --git a/src/gyroflow.rs b/src/gyroflow.rs
@@ -133,6 +133,10 @@ fn entry() {
 
     rendering::init().unwrap();
 
+    if let Some(name) = core::gpu::initialize_contexts() {
+        rendering::set_gpu_type_from_name(&name);
+    }
+
     engine.exec();
 }
 

diff --git a/src/rendering/ffmpeg_hw.rs b/src/rendering/ffmpeg_hw.rs
@@ -66,6 +66,18 @@ lazy_static::lazy_static! {
     static ref DEC_DEVICES: Mutex<HashMap<DeviceType, HWDevice>> = Mutex::new(HashMap::new());
 }
 
+/// Creates the CUDA hardware device and stores it in `ENC_DEVICES`, unless an
+/// entry for the CUDA device type is already present.
+///
+/// A failure to create the device is ignored and leaves the map unchanged.
+pub fn initialize_cuda_ctx() {
+    let cuda = ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA;
+    let mut enc_devices = ENC_DEVICES.lock();
+
+    let slot = match enc_devices.entry(cuda) {
+        Entry::Vacant(slot) => slot,
+        _ => return,
+    };
+
+    ::log::debug!("create {:?}", cuda);
+    if let Ok(device) = HWDevice::from_type(cuda) {
+        ::log::debug!("created ok {:?}", cuda);
+        slot.insert(device);
+    }
+}
+
 pub fn supported_gpu_backends() -> Vec<String> {
     let mut ret = Vec::new();
     let mut hw_type = ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_NONE;