This repository has been archived by the owner on Mar 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
mcu.rs
385 lines (335 loc) · 15.9 KB
/
mcu.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
//! Implements routines to decode a MCU
//!
//! # Side notes
//! Yes, I pull in some dubious tricks, like really dubious here, they're not hard to come up
//! but I know they're hard to understand(e.g how I don't allocate space for Cb and Cr
//! channels if output colorspace is grayscale) but bear with me, it's the search for fast software
//! that got me here.
//!
//! # Multithreading
//!
//! This isn't exposed so I can dump all the info here
//!
//! To make multithreading work, we want to break dependency chains but in cool ways.
//! i.e we want to find out where we can forward one section as another one does something.
//!
//! # The algorithm
//! Simply do it per MCU width taking into account sub-sampling ratios
//!
//! 1. Decode an MCU width taking into account how many image channels we have(either Y only or Y,Cb and Cr)
//!
//! 2. After successfully decoding, copy pixels decoded and spawn a thread to handle post processing(IDCT,
//! upsampling and color conversion)
//!
//! 3. After successfully decoding all pixels, join threads.
//!
//! 4. Call it a day,
//!
//! But as easy as this sounds in theory, in practice, it sucks...
//!
//! We essentially have to consider that down-sampled images have a weird MCU arrangement, and for such cases
//! we choose the path of decoding 2 whole MCU heights for horizontal/vertical upsampling and
//! 4 whole MCU heights for horizontal and vertical upsampling, which when expressed in code doesn't look nice.
//!
//! There is also the overhead of synchronization which makes some things annoying.
//!
//! Also there is the overhead of `cloning` and allocating intermediate memory to ensure multithreading is safe.
//! This may make this library almost 3X slower if someone chooses to disable `threadpool` (please don't) feature because
//! we are optimized for the multithreading path.
//!
//! # Scoped ThreadPools
//! Things you don't want to do in the fast path. **Lock da mutex**
//! Things you don't want to have in your code. **Mutex**
//!
//! Multithreading is not everyone's cake because synchronization is like battling with the devil.
//! The default way is a mutex for when threads may write to the same memory location. But in our case we
//! don't write to the same location, so why pay for something not used?
//!
//! In C/C++ land we can just pass mutable chunks to different threads but in Rust don't you know about
//! the borrow checker?...
//!
//! To send different mutable chunks to threads, we use scoped threads which guarantee that the thread
//! won't outlive the data and finally let it compile.
//! This allows us to not use locks during decoding, avoiding that overhead and allowing cleaner,
//! faster code in post processing.
use std::cmp::min;
use std::io::Cursor;
use std::sync::Arc;
use crate::bitstream::BitStream;
use crate::components::{ComponentID, SubSampRatios};
use crate::errors::DecodeErrors;
use crate::marker::Marker;
use crate::worker::post_process;
use crate::Decoder;
/// Number of coefficients in a single DCT block (64, i.e. one 8x8 block).
///
/// Used to size the per-block coefficient buffers in the MCU decode loop.
pub const DCT_BLOCK: usize = 64;
impl Decoder
{
/// Ensure every input component has both a DC and an AC Huffman table.
///
/// This runs once before the decode loop so the loop itself can look the
/// tables up without re-validating them for every block.
///
/// # Errors
/// Returns `DecodeErrors::HuffmanDecode` naming the first component whose
/// table index is out of range or whose table slot is empty. For each
/// component the DC table is checked before the AC table.
fn check_tables(&self) -> Result<(), DecodeErrors>
{
    let component_count = self.input_colorspace.num_components();

    for index in 0..component_count
    {
        let component = &self.components[index];

        // DC table: the index must be in range and the slot must be filled.
        match self.dc_huffman_tables.get(component.dc_huff_table)
        {
            Some(Some(_)) => {}
            Some(None) =>
            {
                return Err(DecodeErrors::HuffmanDecode(format!(
                    "No DC table for component {:?}",
                    component.component_id
                )));
            }
            None =>
            {
                return Err(DecodeErrors::HuffmanDecode(format!(
                    "No Huffman DC table for component {:?} ",
                    component.component_id
                )));
            }
        }

        // AC table: same two-step check.
        match self.ac_huffman_tables.get(component.ac_huff_table)
        {
            Some(Some(_)) => {}
            Some(None) =>
            {
                return Err(DecodeErrors::HuffmanDecode(format!(
                    "No AC table for component {:?}",
                    component.component_id
                )));
            }
            None =>
            {
                return Err(DecodeErrors::HuffmanDecode(format!(
                    "No Huffman AC table for component {:?} ",
                    component.component_id
                )));
            }
        }
    }

    Ok(())
}
/// Decode MCUs and carry out post processing.
///
/// This is the main decoder loop for the library, the hot path.
///
/// Because of this, we pull in some very crazy optimization tricks hence readability is a pinch
/// here.
#[allow(clippy::similar_names)]
#[inline(never)]
#[rustfmt::skip]
pub(crate) fn decode_mcu_ycbcr_baseline(
&mut self, reader: &mut Cursor<Vec<u8>>,
) -> Result<Vec<u8>, DecodeErrors>
{
self.check_component_dimensions()?;
let mut scoped_pools = scoped_threadpool::Pool::new(self.num_threads.unwrap_or( num_cpus::get()) as u32);
info!("Created {} worker threads", scoped_pools.thread_count());
let (mcu_width, mcu_height);
let mut bias = 1;
if self.interleaved
{
// set upsampling functions
self.set_upsampling()?;
if self.sub_sample_ratio == SubSampRatios::H
{
// horizontal sub-sampling.
// Values for horizontal samples end halfway the image and do not complete an MCU width.
// To make it complete we multiply width by 2 and divide mcu_height by 2
mcu_width = self.mcu_x * 2;
mcu_height = self.mcu_y / 2;
} else if self.sub_sample_ratio == SubSampRatios::HV
{
mcu_width = self.mcu_x;
mcu_height = self.mcu_y / 2;
bias = 2;
// V;
} else {
mcu_width = self.mcu_x;
mcu_height = self.mcu_y;
}
} else {
// For non-interleaved images( (1*1) subsampling)
// number of MCU's are the widths (+7 to account for paddings) divided bu 8.
mcu_width = ((self.info.width + 7) / 8) as usize;
mcu_height = ((self.info.height + 7) / 8) as usize;
}
let mut stream = BitStream::new();
// Size of our output image(width*height)
let capacity = usize::from(self.info.width + 7) * usize::from(self.info.height + 7);
let component_capacity = mcu_width * DCT_BLOCK;
// Create an Arc of components to prevent cloning on every MCU width
let global_component = Arc::new(self.components.clone());
// Storage for decoded pixels
let mut global_channel = vec![0; capacity * self.output_colorspace.num_components()];
// things needed for post processing that we can remove out of the loop
let input = self.input_colorspace;
let output = self.output_colorspace;
let idct_func = self.idct_func;
let color_convert_16 = self.color_convert_16;
let width = usize::from(self.width());
let h_max = self.h_max;
let v_max = self.v_max;
// Halfway width size, used for vertical sub-sampling to write |Y2| in the right position.
let width_stride = (component_capacity * self.components[0].vertical_sample * self.components[0].horizontal_sample * bias) >> 1;
let hv_width_stride = width_stride >> 1;
// check dc and AC tables
self.check_tables()?;
let is_hv = self.sub_sample_ratio == SubSampRatios::HV;
// Split output into different blocks each containing enough space for an MCU width
let mut chunks =
global_channel.chunks_exact_mut(width * output.num_components() * 8 * h_max * v_max);
let mut tmp = [0; DCT_BLOCK];
// Argument for scoped threadpools, see file docs.
scoped_pools.scoped::<_, Result<(), DecodeErrors>>(|scope| {
for _ in 0..mcu_height
{
// faster to memset than a later memcpy
// We allocate on every mcu_height since this is sent to a separate
// thread (that's how we're multi-threaded and thread safe).
let mut temporary = [vec![], vec![], vec![]];
for (pos, comp) in self.components.iter().enumerate()
{
// multiply capacity with sampling factor, it should be 1*1 for un-sampled images
// Allocate only needed components.
if min(self.output_colorspace.num_components() - 1, pos) == pos
{
let len = component_capacity * comp.vertical_sample * comp.horizontal_sample * bias;
temporary[pos] = vec![0; len];
}
}
// Bias only affects 4:2:0(chroma quartered) sub-sampled images.
// since we want to fetch two MCU rows before we send it to post process
for v in 0..bias
{
for j in 0..mcu_width
{
// iterate over components
for pos in 0..self.input_colorspace.num_components()
{
let component = &mut self.components[pos];
// Safety:The tables were confirmed to exist in self.check_tables();
let dc_table = unsafe {
self.dc_huffman_tables
.get_unchecked(component.dc_huff_table)
.as_ref()
.unwrap_or_else(|| std::hint::unreachable_unchecked())
};
let ac_table = unsafe {
self.ac_huffman_tables
.get_unchecked(component.ac_huff_table)
.as_ref()
.unwrap_or_else(|| std::hint::unreachable_unchecked())
};
// If image is interleaved iterate over scan components,
// otherwise if it-s non-interleaved, these routines iterate in
// trivial scanline order(Y,Cb,Cr)
for v_samp in 0..component.vertical_sample
{
for h_samp in 0..component.horizontal_sample
{
// only decode needed components
if min(self.output_colorspace.num_components() - 1, pos) == pos
{
// The spec https://www.w3.org/Graphics/JPEG/itu-t81.pdf page 26
// Get position to write
// This is complex, don't even try to understand it. ~author
let is_y =
usize::from(component.component_id == ComponentID::Y);
// This only affects 4:2:0 images.
let y_offset = is_y
* v
* (hv_width_stride
+ (hv_width_stride * (component.vertical_sample - 1)));
let another_stride =
(width_stride * v_samp * usize::from(!is_hv))
+ hv_width_stride * v_samp * usize::from(is_hv);
let yet_another_stride = usize::from(is_hv)
* (width_stride >> 2)
* v
* usize::from(component.component_id != ComponentID::Y);
// offset calculator.
let start = (j * 64 * component.horizontal_sample)
+ (h_samp * 64)
+ another_stride
+ y_offset
+ yet_another_stride;
// Get the location we will be writing to.
// It will always be zero since it's initialized per MCU height.
let tmp: &mut [i16; 64] = temporary.get_mut(pos).unwrap().get_mut(start..start + 64).unwrap().try_into().unwrap();
stream.decode_mcu_block(reader, dc_table, ac_table, tmp, &mut component.dc_pred)?;
}
else
{
// component not needed, decode and discard bits
stream.decode_mcu_block(reader, dc_table, ac_table, &mut tmp, &mut component.dc_pred)?;
}
}
}
self.todo = self.todo.wrapping_sub(1);
// after every interleaved MCU that's a mcu, count down restart markers.
if self.todo == 0
{
self.handle_rst(&mut stream)?;
}
}
}
}
// Clone things, to make multithreading safe
let component = global_component.clone();
let next_chunk = chunks.next().unwrap();
scope.execute(move || {
post_process(&mut temporary, &component,
idct_func, color_convert_16,
input, output, next_chunk,
width);
});
}
//everything is okay
Ok(())
})?;
info!("Finished decoding image");
// remove excess allocation for images.
global_channel.truncate(
usize::from(self.width())
* usize::from(self.height())
* self.output_colorspace.num_components(),
);
return Ok(global_channel);
}
/// Handle the end of a restart interval.
///
/// Re-arms the MCU countdown (`todo`) from `restart_interval`, then inspects the
/// marker the bitstream has recorded, if any:
///
/// * `RST` - reset the bitstream and zero the DC prediction of every component.
/// * `EOI` - silently accepted.
/// * no marker - nothing further to do.
///
/// This routine is shared with mcu_prog.
///
/// # Errors
/// Returns `DecodeErrors::MCUError` when any other marker is pending.
#[cold]
pub(crate) fn handle_rst(&mut self, stream: &mut BitStream) -> Result<(), DecodeErrors>
{
    self.todo = self.restart_interval;

    // Copy the pending marker out first so `stream` is free to be reset below.
    let pending = stream.marker;

    match pending
    {
        // No marker seen, or end of image: nothing to reset.
        None | Some(Marker::EOI) => Ok(()),
        Some(Marker::RST(_)) =>
        {
            // Restart: reset the stream and start DC prediction from zero again.
            stream.reset();

            for component in self.components.iter_mut()
            {
                component.dc_pred = 0;
            }

            Ok(())
        }
        Some(unexpected) => Err(DecodeErrors::MCUError(format!(
            "Marker {:?} found in bitstream, possibly corrupt jpeg",
            unexpected
        ))),
    }
}
}