Skip to content

Commit

Permalink
Improve the GPU stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
AdrianEddy committed Feb 7, 2022
1 parent 7e2818d commit 6276bc4
Show file tree
Hide file tree
Showing 6 changed files with 242 additions and 126 deletions.
17 changes: 16 additions & 1 deletion src/core/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,19 @@

#[cfg(feature = "use-opencl")]
pub mod opencl;
pub mod wgpu;
pub mod wgpu;

pub fn initialize_contexts() -> Option<String> {
#[cfg(feature = "use-opencl")]
match opencl::OclWrapper::initialize_context() {
Ok(name) => { return Some(name); },
Err(e) => { log::error!("OpenCL error: {:?}", e); }
}

match wgpu::WgpuWrapper::initialize_context() {
Some(name) => { return Some(name); },
None => { log::error!("WGPU init error"); }
}

None
}
136 changes: 80 additions & 56 deletions src/core/gpu/opencl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Copyright © 2021-2022 Adrian <adrian.eddy at gmail>

use ocl::*;
use parking_lot::RwLock;

pub struct OclWrapper {
kernel: Kernel,
Expand All @@ -12,11 +13,17 @@ pub struct OclWrapper {
params_buf: Buffer<f32>,
}

/// OpenCL device together with the context that was built for it.
/// An instance is stored in the global `CONTEXT` by `initialize_context`
/// and borrowed by `OclWrapper::new` to create its queue and program.
struct CtxWrapper {
// Device selected by `initialize_context`.
device: Device,
// Context built for `device`.
context: Context,
}

lazy_static::lazy_static! {
// Shared OpenCL device/context. Starts as `None`; `OclWrapper::initialize_context`
// stores the value, and `OclWrapper::new` reads it (initializing it first if still empty).
static ref CONTEXT: RwLock<Option<CtxWrapper>> = RwLock::new(None);
}

impl OclWrapper {
pub fn new(width: usize, height: usize, stride: usize, bytes_per_pixel: usize, output_width: usize, output_height: usize, output_stride: usize, pix_element_count: usize, ocl_names: (&str, &str, &str, &str), bg: nalgebra::Vector4<f32>) -> ocl::Result<Self> {

if height < 4 || output_height < 4 || stride < 1 { return Err(ocl::BufferCmdError::AlreadyMapped.into()); }

pub fn initialize_context() -> ocl::Result<String> {
let platform = Platform::default();
let device = Device::first(platform)?;
::log::info!("OpenCL Platform: {}, Device: {} {}", platform.name()?, device.vendor()?, device.name()?);
Expand All @@ -26,60 +33,77 @@ impl OclWrapper {
.devices(device)
.build()?;

let queue = Queue::new(&context, device, None)?;
let name = device.name();

let program = Program::builder()
.src(include_str!("opencl_undistort.cl"))
.bo(builders::BuildOpt::CmplrDefine { ident: "DATA_TYPE" .into(), val: ocl_names.0.into() })
.bo(builders::BuildOpt::CmplrDefine { ident: "DATA_CONVERT" .into(), val: ocl_names.1.into() })
.bo(builders::BuildOpt::CmplrDefine { ident: "DATA_TYPEF" .into(), val: ocl_names.2.into() })
.bo(builders::BuildOpt::CmplrDefine { ident: "DATA_CONVERTF".into(), val: ocl_names.3.into() })
.bo(builders::BuildOpt::CmplrDefine { ident: "PIXEL_BYTES" .into(), val: format!("{}", bytes_per_pixel) })
.devices(device)
.build(&context)?;

let source_buffer = Buffer::builder().queue(queue.clone()).len(stride*height)
.flags(MemFlags::new().read_only().host_write_only()).build()?;

let dest_buffer = Buffer::builder().queue(queue.clone()).len(output_stride*output_height)
.flags(MemFlags::new().write_only().host_read_only().alloc_host_ptr()).build()?;

let params_len = 9 * (height + 1);
let params_buf = Buffer::<f32>::builder().queue(queue.clone()).flags(MemFlags::new().read_only()).len(params_len).build()?;

let mut builder = Kernel::builder();
unsafe {
builder.program(&program).name("undistort_image").queue(queue)
.global_work_size((output_width, output_height))
.disable_arg_type_check()
.arg(&source_buffer)
.arg(&dest_buffer)
.arg(ocl::prm::Ushort::new(width as u16))
.arg(ocl::prm::Ushort::new(height as u16))
.arg(ocl::prm::Ushort::new(stride as u16))
.arg(ocl::prm::Ushort::new(output_width as u16))
.arg(ocl::prm::Ushort::new(output_height as u16))
.arg(ocl::prm::Ushort::new(output_stride as u16))
.arg(&params_buf)
.arg(ocl::prm::Ushort::new(2));
}
*CONTEXT.write() = Some(CtxWrapper { device, context });

name
}

match pix_element_count {
1 => builder.arg(ocl::prm::Float::new(bg[0])),
2 => builder.arg(ocl::prm::Float2::new(bg[0], bg[1])),
3 => builder.arg(ocl::prm::Float3::new(bg[0], bg[1], bg[2])),
4 => builder.arg(ocl::prm::Float4::new(bg[0], bg[1], bg[2], bg[3])),
_ => panic!("Unknown pix_element_count {}", pix_element_count)
};
let kernel = builder.build()?;

Ok(Self {
pix_element_count,
kernel,
src: source_buffer,
dst: dest_buffer,
params_buf,
})
pub fn new(width: usize, height: usize, stride: usize, bytes_per_pixel: usize, output_width: usize, output_height: usize, output_stride: usize, pix_element_count: usize, ocl_names: (&str, &str, &str, &str), bg: nalgebra::Vector4<f32>) -> ocl::Result<Self> {
if height < 4 || output_height < 4 || stride < 1 { return Err(ocl::BufferCmdError::AlreadyMapped.into()); }

let context_initialized = CONTEXT.read().is_some();
if !context_initialized { Self::initialize_context()?; }
let lock = CONTEXT.read();
if let Some(ref ctx) = *lock {
let queue = Queue::new(&ctx.context, ctx.device, None)?;

let program = Program::builder()
.src(include_str!("opencl_undistort.cl"))
.bo(builders::BuildOpt::CmplrDefine { ident: "DATA_TYPE" .into(), val: ocl_names.0.into() })
.bo(builders::BuildOpt::CmplrDefine { ident: "DATA_CONVERT" .into(), val: ocl_names.1.into() })
.bo(builders::BuildOpt::CmplrDefine { ident: "DATA_TYPEF" .into(), val: ocl_names.2.into() })
.bo(builders::BuildOpt::CmplrDefine { ident: "DATA_CONVERTF".into(), val: ocl_names.3.into() })
.bo(builders::BuildOpt::CmplrDefine { ident: "PIXEL_BYTES" .into(), val: format!("{}", bytes_per_pixel) })
.devices(ctx.device)
.build(&ctx.context)?;

let source_buffer = Buffer::builder().queue(queue.clone()).len(stride*height)
.flags(MemFlags::new().read_only().host_write_only()).build()?;

let dest_buffer = Buffer::builder().queue(queue.clone()).len(output_stride*output_height)
.flags(MemFlags::new().write_only().host_read_only().alloc_host_ptr()).build()?;

let params_len = 9 * (height + 1);
let params_buf = Buffer::<f32>::builder().queue(queue.clone()).flags(MemFlags::new().read_only()).len(params_len).build()?;

let mut builder = Kernel::builder();
unsafe {
builder.program(&program).name("undistort_image").queue(queue)
.global_work_size((output_width, output_height))
.disable_arg_type_check()
.arg(&source_buffer)
.arg(&dest_buffer)
.arg(ocl::prm::Ushort::new(width as u16))
.arg(ocl::prm::Ushort::new(height as u16))
.arg(ocl::prm::Ushort::new(stride as u16))
.arg(ocl::prm::Ushort::new(output_width as u16))
.arg(ocl::prm::Ushort::new(output_height as u16))
.arg(ocl::prm::Ushort::new(output_stride as u16))
.arg(&params_buf)
.arg(ocl::prm::Ushort::new(2));
}

match pix_element_count {
1 => builder.arg(ocl::prm::Float::new(bg[0])),
2 => builder.arg(ocl::prm::Float2::new(bg[0], bg[1])),
3 => builder.arg(ocl::prm::Float3::new(bg[0], bg[1], bg[2])),
4 => builder.arg(ocl::prm::Float4::new(bg[0], bg[1], bg[2], bg[3])),
_ => panic!("Unknown pix_element_count {}", pix_element_count)
};
let kernel = builder.build()?;

Ok(Self {
pix_element_count,
kernel,
src: source_buffer,
dst: dest_buffer,
params_buf,
})
} else {
Err(ocl::BufferCmdError::AlreadyMapped.into())
}
}

pub fn set_background(&mut self, bg: nalgebra::Vector4<f32>) -> ocl::Result<()> {
Expand Down
162 changes: 94 additions & 68 deletions src/core/gpu/wgpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
use std::borrow::Cow;
use bytemuck::Pod;
use bytemuck::Zeroable;
use wgpu::Adapter;
use wgpu::BufferUsages;
use parking_lot::RwLock;

#[repr(C, align(32))]
#[derive(Clone, Copy)]
Expand Down Expand Up @@ -41,83 +43,107 @@ pub struct WgpuWrapper {
globals: Globals
}

impl WgpuWrapper {
pub fn new(width: usize, height: usize, stride: usize, bytes_per_pixel: usize, output_width: usize, output_height: usize, output_stride: usize, pix_element_count: usize, bg: nalgebra::Vector4<f32>) -> Option<Self> {
let params_count = 9 * (height + 1);

if height < 4 || output_height < 4 || stride < 1 { return None; }

let in_size = (stride * height) as wgpu::BufferAddress;
let out_size = (output_stride * output_height) as wgpu::BufferAddress;
let params_size = (params_count * std::mem::size_of::<f32>()) as wgpu::BufferAddress;
lazy_static::lazy_static! {
// Shared WGPU adapter. Starts as `None`; `WgpuWrapper::initialize_context` stores the
// adapter, and `WgpuWrapper::new` reads it (initializing it first if still empty).
static ref ADAPTER: RwLock<Option<Adapter>> = RwLock::new(None);
}

impl WgpuWrapper {
pub fn initialize_context() -> Option<String> {
let instance = wgpu::Instance::new(wgpu::Backends::all());

let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions::default()))?;

let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
label: None,
features: wgpu::Features::empty(),
limits: wgpu::Limits::default(),
}, None)).ok()?;

let info = adapter.get_info();
log::debug!("WGPU adapter: {:?}", &info);

let shader = device.create_shader_module(&wgpu::ShaderModuleDescriptor {
source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(include_str!("wgpu_undistort.wgsl"))),
label: None
});
let name = info.name.clone();

let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor { size: out_size, usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
let in_pixels = device.create_buffer(&wgpu::BufferDescriptor { size: in_size, usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
let out_pixels = device.create_buffer(&wgpu::BufferDescriptor { size: out_size, usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC, label: None, mapped_at_creation: false });
let params_buffer = device.create_buffer(&wgpu::BufferDescriptor { size: params_size, usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
*ADAPTER.write() = Some(adapter);

let params2_buffer = device.create_buffer(&wgpu::BufferDescriptor { size: std::mem::size_of::<Globals>() as u64, usage: BufferUsages::UNIFORM | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });

let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { module: &shader, entry_point: "undistort", label: None, layout: None });

let bind_group_layout = compute_pipeline.get_bind_group_layout(0);
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bind_group_layout,
entries: &[
wgpu::BindGroupEntry { binding: 0, resource: in_pixels.as_entire_binding() },
wgpu::BindGroupEntry { binding: 1, resource: params_buffer.as_entire_binding() },
wgpu::BindGroupEntry { binding: 2, resource: out_pixels.as_entire_binding() },
wgpu::BindGroupEntry { binding: 3, resource: params2_buffer.as_entire_binding() },
],
});
let globals = Globals {
width: width as u32,
height: height as u32,
stride: stride as u32,

output_width: output_width as u32,
output_height: output_height as u32,
output_stride: output_stride as u32,
bytes_per_pixel: bytes_per_pixel as u32,
pix_element_count: pix_element_count as u32,
num_params: 2,
bg: [bg[0], bg[1], bg[2], bg[3]]
};

Some(Self {
device,
queue,
staging_buffer,
out_pixels,
in_pixels,
params_buffer,
params2_buffer,
bind_group,
compute_pipeline,
in_size,
out_size,
params_size,
globals
})
Some(name)
}

pub fn new(width: usize, height: usize, stride: usize, bytes_per_pixel: usize, output_width: usize, output_height: usize, output_stride: usize, pix_element_count: usize, bg: nalgebra::Vector4<f32>) -> Option<Self> {
let params_count = 9 * (height + 1);

if height < 4 || output_height < 4 || stride < 1 { return None; }

let in_size = (stride * height) as wgpu::BufferAddress;
let out_size = (output_stride * output_height) as wgpu::BufferAddress;
let params_size = (params_count * std::mem::size_of::<f32>()) as wgpu::BufferAddress;

//let instance = wgpu::Instance::new(wgpu::Backends::all());

//let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions::default()))?;
let adapter_initialized = ADAPTER.read().is_some();
if !adapter_initialized { Self::initialize_context(); }
let lock = ADAPTER.read();
if let Some(ref adapter) = *lock {
let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
label: None,
features: wgpu::Features::empty(),
limits: wgpu::Limits::default(),
}, None)).ok()?;

let info = adapter.get_info();
log::debug!("WGPU adapter: {:?}", &info);

let shader = device.create_shader_module(&wgpu::ShaderModuleDescriptor {
source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(include_str!("wgpu_undistort.wgsl"))),
label: None
});

let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor { size: out_size, usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
let in_pixels = device.create_buffer(&wgpu::BufferDescriptor { size: in_size, usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });
let out_pixels = device.create_buffer(&wgpu::BufferDescriptor { size: out_size, usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC, label: None, mapped_at_creation: false });
let params_buffer = device.create_buffer(&wgpu::BufferDescriptor { size: params_size, usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });

let params2_buffer = device.create_buffer(&wgpu::BufferDescriptor { size: std::mem::size_of::<Globals>() as u64, usage: BufferUsages::UNIFORM | BufferUsages::COPY_DST, label: None, mapped_at_creation: false });

let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { module: &shader, entry_point: "undistort", label: None, layout: None });

let bind_group_layout = compute_pipeline.get_bind_group_layout(0);
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: None,
layout: &bind_group_layout,
entries: &[
wgpu::BindGroupEntry { binding: 0, resource: in_pixels.as_entire_binding() },
wgpu::BindGroupEntry { binding: 1, resource: params_buffer.as_entire_binding() },
wgpu::BindGroupEntry { binding: 2, resource: out_pixels.as_entire_binding() },
wgpu::BindGroupEntry { binding: 3, resource: params2_buffer.as_entire_binding() },
],
});
let globals = Globals {
width: width as u32,
height: height as u32,
stride: stride as u32,

output_width: output_width as u32,
output_height: output_height as u32,
output_stride: output_stride as u32,
bytes_per_pixel: bytes_per_pixel as u32,
pix_element_count: pix_element_count as u32,
num_params: 2,
bg: [bg[0], bg[1], bg[2], bg[3]]
};

Some(Self {
device,
queue,
staging_buffer,
out_pixels,
in_pixels,
params_buffer,
params2_buffer,
bind_group,
compute_pipeline,
in_size,
out_size,
params_size,
globals
})
} else {
None
}
}

pub fn set_background(&mut self, bg: nalgebra::Vector4<f32>) {
Expand Down
4 changes: 4 additions & 0 deletions src/gyroflow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ fn entry() {

rendering::init().unwrap();

if let Some(name) = core::gpu::initialize_contexts() {
rendering::set_gpu_type_from_name(&name);
}

engine.exec();
}

Expand Down
12 changes: 12 additions & 0 deletions src/rendering/ffmpeg_hw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,18 @@ lazy_static::lazy_static! {
static ref DEC_DEVICES: Mutex<HashMap<DeviceType, HWDevice>> = Mutex::new(HashMap::new());
}

/// Creates the CUDA hardware device ahead of time and caches it in `ENC_DEVICES`.
///
/// Does nothing when an entry for the CUDA device type already exists. If creating
/// the device fails, the failure is ignored and no entry is inserted.
pub fn initialize_cuda_ctx() {
    let cuda = ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA;
    let mut enc_devices = ENC_DEVICES.lock();

    // Only a missing entry needs work; an existing device is left untouched.
    let slot = match enc_devices.entry(cuda) {
        Entry::Vacant(slot) => slot,
        _ => return,
    };

    ::log::debug!("create {:?}", cuda);
    if let Ok(hw_device) = HWDevice::from_type(cuda) {
        ::log::debug!("created ok {:?}", cuda);
        slot.insert(hw_device);
    }
}

pub fn supported_gpu_backends() -> Vec<String> {
let mut ret = Vec::new();
let mut hw_type = ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_NONE;
Expand Down

0 comments on commit 6276bc4

Please sign in to comment.