Merge branch 'master' into dragly/fix-corrupted-range-coding
gendx committed Dec 16, 2019
2 parents b8c9f8f + 4429567 · commit 0ee7b55
Showing 11 changed files with 306 additions and 43 deletions.
6 changes: 6 additions & 0 deletions .travis.yml
@@ -9,3 +9,9 @@ rust:
matrix:
allow_failures:
- rust: nightly
before_script:
- rustup component add rustfmt
script:
- cargo fmt -- --check
- cargo build --verbose
- cargo test --verbose
9 changes: 9 additions & 0 deletions bors.toml
@@ -0,0 +1,9 @@
# List of commit statuses that must pass on the merge commit before it is
# pushed to master.
status = [
"continuous-integration/travis-ci/push",
]

# If set to true, and if the PR branch is on the same repository that bors-ng
# itself is on, the branch will be deleted.
delete_merged_branches = true
36 changes: 24 additions & 12 deletions src/decode/lzma.rs
@@ -4,6 +4,9 @@ use crate::error;
use byteorder::{LittleEndian, ReadBytesExt};
use std::io;

use crate::decompress::Options;
use crate::decompress::UnpackedSize;

pub struct LZMAParams {
// most lc significant bits of previous byte are part of the literal context
lc: u32, // 0..8
@@ -15,7 +18,7 @@ pub struct LZMAParams {
}

impl LZMAParams {
pub fn read_header<R>(input: &mut R) -> error::Result<LZMAParams>
pub fn read_header<R>(input: &mut R, options: &Options) -> error::Result<LZMAParams>
where
R: io::BufRead,
{
@@ -58,17 +61,26 @@ impl LZMAParams {
info!("Dict size: {}", dict_size);

// Unpacked size
let unpacked_size_provided = input.read_u64::<LittleEndian>().or_else(|e| {
Err(error::Error::LZMAError(format!(
"LZMA header too short: {}",
e
)))
})?;
let marker_mandatory: bool = unpacked_size_provided == 0xFFFF_FFFF_FFFF_FFFF;
let unpacked_size = if marker_mandatory {
None
} else {
Some(unpacked_size_provided)
let unpacked_size: Option<u64> = match options.unpacked_size {
UnpackedSize::ReadFromHeader => {
let unpacked_size_provided = input.read_u64::<LittleEndian>().or_else(|e| {
Err(error::Error::LZMAError(format!(
"LZMA header too short: {}",
e
)))
})?;
let marker_mandatory: bool = unpacked_size_provided == 0xFFFF_FFFF_FFFF_FFFF;
if marker_mandatory {
None
} else {
Some(unpacked_size_provided)
}
}
UnpackedSize::ReadHeaderButUseProvided(x) => {
input.read_u64::<LittleEndian>()?;
x
}
UnpackedSize::UseProvided(x) => x,
};

info!("Unpacked size: {:?}", unpacked_size);
1 change: 1 addition & 0 deletions src/decode/mod.rs
@@ -1,6 +1,7 @@
pub mod lzbuffer;
pub mod lzma;
pub mod lzma2;
pub mod options;
pub mod rangecoder;
pub mod util;
pub mod xz;
34 changes: 34 additions & 0 deletions src/decode/options.rs
@@ -0,0 +1,34 @@
#[derive(Clone, Copy, Debug, Default)]
pub struct Options {
/// Defines whether the unpacked size should be read from the header or provided.
/// The default is
/// [`UnpackedSize::ReadFromHeader`](enum.UnpackedSize.html#variant.ReadFromHeader)
pub unpacked_size: UnpackedSize,
}

/// Alternatives for defining the unpacked size of the decoded data
#[derive(Clone, Copy, Debug)]
pub enum UnpackedSize {
/// Assume that the 8 bytes used to specify the unpacked size are present in the header.
/// If the bytes are `0xFFFF_FFFF_FFFF_FFFF`, assume that there is an end-of-payload marker in
/// the file.
/// If not, read the 8 bytes as a little-endian encoded u64.
ReadFromHeader,
/// Assume that there are 8 bytes representing the unpacked size present in the header.
/// Read it, but ignore it and use the provided value instead.
/// If the provided value is `None`, assume that there is an end-of-payload marker in the file.
/// Note that this is a non-standard way of reading LZMA data,
/// but is used by certain libraries such as
/// [OpenCTM](http://openctm.sourceforge.net/).
ReadHeaderButUseProvided(Option<u64>),
/// Assume that the 8 bytes typically used to represent the unpacked size are *not* present in
/// the header. Use the provided value.
/// If the provided value is `None`, assume that there is an end-of-payload marker in the file.
UseProvided(Option<u64>),
}

impl Default for UnpackedSize {
fn default() -> UnpackedSize {
UnpackedSize::ReadFromHeader
}
}
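These variants only control how (and whether) the 8-byte size field is read; the properties byte and the dictionary size are still taken from the header in every case. A minimal decode-side sketch, not part of this diff, assuming the crate is imported as `lzma_rs` and that `compressed` and `expected_len` are supplied by the caller:

```rust
use lzma_rs::decompress::{Options, UnpackedSize};
use std::io::BufReader;

// Decode a stream whose header carries no 8-byte size field, supplying the
// unpacked size out of band (hypothetical helper, illustration only).
fn decode_with_known_size(compressed: &[u8], expected_len: u64) -> Vec<u8> {
    let mut input = BufReader::new(compressed);
    let mut output = Vec::new();
    let options = Options {
        unpacked_size: UnpackedSize::UseProvided(Some(expected_len)),
    };
    lzma_rs::lzma_decompress_with_options(&mut input, &mut output, &options)
        .expect("valid LZMA stream");
    output
}
```

`lzma_decompress_with_options` is the entry point added in `src/lib.rs` later in this commit.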
81 changes: 52 additions & 29 deletions src/encode/dumbencoder.rs
@@ -1,3 +1,4 @@
use crate::compress::{Options, UnpackedSize};
use crate::encode::rangecoder;
use byteorder::{LittleEndian, WriteBytesExt};
use std::io;
@@ -9,6 +10,7 @@ where
rangecoder: rangecoder::RangeEncoder<'a, W>,
literal_probs: [[u16; 0x300]; 8],
is_match: [u16; 4], // true = LZ, false = literal
unpacked_size: UnpackedSize,
}

const LC: u32 = 3;
@@ -19,7 +21,7 @@ impl<'a, W> Encoder<'a, W>
where
W: io::Write,
{
pub fn from_stream(stream: &'a mut W) -> io::Result<Self> {
pub fn from_stream(stream: &'a mut W, options: &Options) -> io::Result<Self> {
let dict_size = 0x800000;

// Properties
@@ -32,13 +34,28 @@ where
stream.write_u32::<LittleEndian>(dict_size)?;

// Unpacked size
info!("Unpacked size: unknown");
stream.write_u64::<LittleEndian>(0xFFFF_FFFF_FFFF_FFFF)?;
match &options.unpacked_size {
UnpackedSize::WriteToHeader(unpacked_size) => {
let value: u64 = match unpacked_size {
None => {
info!("Unpacked size: unknown");
0xFFFF_FFFF_FFFF_FFFF
}
Some(x) => {
info!("Unpacked size: {}", x);
*x
}
};
stream.write_u64::<LittleEndian>(value)?;
}
UnpackedSize::SkipWritingToHeader => {}
};

let encoder = Encoder {
rangecoder: rangecoder::RangeEncoder::new(stream),
literal_probs: [[0x400; 0x300]; 8],
is_match: [0x400; 4],
unpacked_size: options.unpacked_size,
};

Ok(encoder)
@@ -68,33 +85,39 @@ where
}

fn finish(&mut self, input_len: usize) -> io::Result<()> {
// Write end-of-stream marker
let pos_state = input_len & 3;

// Match
self.rangecoder
.encode_bit(&mut self.is_match[pos_state], true)?;
// New distance
self.rangecoder.encode_bit(&mut 0x400, false)?;

// Dummy len, as small as possible (len = 0)
for _ in 0..4 {
self.rangecoder.encode_bit(&mut 0x400, false)?;
}

// Distance marker = 0xFFFFFFFF
// pos_slot = 63
for _ in 0..6 {
self.rangecoder.encode_bit(&mut 0x400, true)?;
}
// num_direct_bits = 30
// result = 3 << 30 = C000_0000
// + 3FFF_FFF0 (26 bits)
// + F ( 4 bits)
for _ in 0..30 {
self.rangecoder.encode_bit(&mut 0x400, true)?;
match self.unpacked_size {
UnpackedSize::SkipWritingToHeader => {}
UnpackedSize::WriteToHeader(Some(_)) => {}
UnpackedSize::WriteToHeader(None) => {
// Write end-of-stream marker
let pos_state = input_len & 3;

// Match
self.rangecoder
.encode_bit(&mut self.is_match[pos_state], true)?;
// New distance
self.rangecoder.encode_bit(&mut 0x400, false)?;

// Dummy len, as small as possible (len = 0)
for _ in 0..4 {
self.rangecoder.encode_bit(&mut 0x400, false)?;
}

// Distance marker = 0xFFFFFFFF
// pos_slot = 63
for _ in 0..6 {
self.rangecoder.encode_bit(&mut 0x400, true)?;
}
// num_direct_bits = 30
// result = 3 << 30 = C000_0000
// + 3FFF_FFF0 (26 bits)
// + F ( 4 bits)
for _ in 0..30 {
self.rangecoder.encode_bit(&mut 0x400, true)?;
}
// = FFFF_FFFF
}
}
// = FFFF_FFFF

// Flush range coder
self.rangecoder.finish()
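The distance written by this end-of-stream marker can be checked independently of the crate: pos_slot = 63 gives a base of 3 << 30 (num_direct_bits = 30), and the comment breaks the remaining 30 one-bits down as 0x3FFF_FFF0 (26 bits) plus 0xF (4 bits). A standalone illustration, not part of the commit:

```rust
// Verify that the end-of-stream marker's distance adds up to 0xFFFF_FFFF.
fn main() {
    let base: u32 = 3 << 30;            // pos_slot = 63 => 3 << num_direct_bits = 0xC000_0000
    let middle_bits: u32 = 0x3FFF_FFF0; // 26 one-bits above the lowest 4 bits
    let low_bits: u32 = 0xF;            // lowest 4 one-bits
    let distance = base + middle_bits + low_bits;
    assert_eq!(distance, 0xFFFF_FFFF);
    println!("end-of-stream distance marker = {:#010X}", distance);
}
```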
1 change: 1 addition & 0 deletions src/encode/mod.rs
@@ -1,5 +1,6 @@
pub mod dumbencoder;
pub mod lzma2;
pub mod options;
mod rangecoder;
mod util;
pub mod xz;
30 changes: 30 additions & 0 deletions src/encode/options.rs
@@ -0,0 +1,30 @@
/// Options for the `lzma_compress` function
#[derive(Clone, Copy, Debug, Default)]
pub struct Options {
/// Defines whether the unpacked size should be written to the header.
/// The default is
/// [`UnpackedSize::WriteToHeader(None)`](enum.UnpackedSize.html#variant.WriteToHeader)
pub unpacked_size: UnpackedSize,
}

/// Alternatives for handling unpacked size
#[derive(Clone, Copy, Debug)]
pub enum UnpackedSize {
/// If the value is `Some(u64)`, write the provided u64 value to the header.
/// There is currently no check in place that verifies that this is the actual number of bytes
/// provided by the input stream.
/// If the value is `None`, write the special `0xFFFF_FFFF_FFFF_FFFF` code to the header,
/// indicating that the unpacked size is unknown.
WriteToHeader(Option<u64>),
/// Do not write anything to the header. The unpacked size needs to be stored elsewhere and
/// provided when reading the file. Note that this is a non-standard way of writing LZMA data,
/// but is used by certain libraries such as
/// [OpenCTM](http://openctm.sourceforge.net/).
SkipWritingToHeader,
}

impl Default for UnpackedSize {
fn default() -> UnpackedSize {
UnpackedSize::WriteToHeader(None)
}
}
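A compress-side sketch, again assuming the crate is imported as `lzma_rs`; the helper and its argument are hypothetical. With `WriteToHeader(Some(len))` the exact size goes into the header and, per the `finish()` change above, no end-of-stream marker is appended, so the header size is what tells the decoder where to stop:

```rust
use lzma_rs::compress::{Options, UnpackedSize};
use std::io::BufReader;

// Compress `data`, writing its exact unpacked size into the LZMA header
// (illustration only, not part of the commit).
fn compress_with_known_size(data: &[u8]) -> Vec<u8> {
    let mut input = BufReader::new(data);
    let mut output = Vec::new();
    let options = Options {
        unpacked_size: UnpackedSize::WriteToHeader(Some(data.len() as u64)),
    };
    lzma_rs::lzma_compress_with_options(&mut input, &mut output, &options)
        .expect("writing to a Vec should not fail");
    output
}
```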
36 changes: 36 additions & 0 deletions src/encode/rangecoder.rs
@@ -96,3 +96,39 @@
self.normalize()
}
}

#[cfg(test)]
mod test {
use super::*;
use crate::decode::rangecoder::RangeDecoder;
use std::io::BufReader;

fn encode_decode(prob_init: u16, bits: &[bool]) {
let mut buf: Vec<u8> = Vec::new();

let mut encoder = RangeEncoder::new(&mut buf);
let mut prob = prob_init;
for &b in bits {
encoder.encode_bit(&mut prob, b).unwrap();
}
encoder.finish().unwrap();

let mut bufread = BufReader::new(buf.as_slice());
let mut decoder = RangeDecoder::new(&mut bufread).unwrap();
let mut prob = prob_init;
for &b in bits {
assert_eq!(decoder.decode_bit(&mut prob).unwrap(), b);
}
assert!(decoder.is_finished_ok().unwrap());
}

#[test]
fn test_encode_decode_zeros() {
encode_decode(0x400, &[false; 10000]);
}

#[test]
fn test_encode_decode_ones() {
encode_decode(0x400, &[true; 10000]);
}
}
31 changes: 29 additions & 2 deletions src/lib.rs
@@ -10,11 +10,29 @@ pub mod error;
use crate::decode::lzbuffer::LZBuffer;
use std::io;

pub mod compress {
pub use crate::encode::options::*;
}

pub mod decompress {
pub use crate::decode::options::*;
}

/// Decompress LZMA data with default [`Options`](decompress/struct.Options.html).
pub fn lzma_decompress<R: io::BufRead, W: io::Write>(
input: &mut R,
output: &mut W,
) -> error::Result<()> {
let params = decode::lzma::LZMAParams::read_header(input)?;
lzma_decompress_with_options(input, output, &decompress::Options::default())
}

/// Decompress LZMA data with the provided options.
pub fn lzma_decompress_with_options<R: io::BufRead, W: io::Write>(
input: &mut R,
output: &mut W,
options: &decompress::Options,
) -> error::Result<()> {
let params = decode::lzma::LZMAParams::read_header(input, options)?;
let mut decoder = decode::lzma::new_circular(output, params)?;
let mut rangecoder = decode::rangecoder::RangeDecoder::new(input).or_else(|e| {
Err(error::Error::LZMAError(format!(
@@ -27,11 +45,20 @@ pub fn lzma_decompress<R: io::BufRead, W: io::Write>(
Ok(())
}

/// Compresses the data with default [`Options`](compress/struct.Options.html).
pub fn lzma_compress<R: io::BufRead, W: io::Write>(
input: &mut R,
output: &mut W,
) -> io::Result<()> {
let encoder = encode::dumbencoder::Encoder::from_stream(output)?;
lzma_compress_with_options(input, output, &compress::Options::default())
}

pub fn lzma_compress_with_options<R: io::BufRead, W: io::Write>(
input: &mut R,
output: &mut W,
options: &compress::Options,
) -> io::Result<()> {
let encoder = encode::dumbencoder::Encoder::from_stream(output, options)?;
encoder.process(input)
}
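Tying the two new entry points together, a round-trip sketch for a stream whose header omits the size field: the field is skipped when encoding (`SkipWritingToHeader`) and the size is supplied explicitly when decoding (`UseProvided`). The crate name `lzma_rs` and the sample data are assumptions, not taken from the diff:

```rust
use lzma_rs::{compress, decompress, lzma_compress_with_options, lzma_decompress_with_options};
use std::io::BufReader;

fn main() {
    let data = b"example payload for a header-less LZMA round trip";

    // Encode without the 8-byte unpacked-size field.
    let mut compressed = Vec::new();
    let encode_options = compress::Options {
        unpacked_size: compress::UnpackedSize::SkipWritingToHeader,
    };
    lzma_compress_with_options(&mut BufReader::new(&data[..]), &mut compressed, &encode_options)
        .expect("compression failed");

    // Decode by providing the unpacked size out of band.
    let mut decompressed = Vec::new();
    let decode_options = decompress::Options {
        unpacked_size: decompress::UnpackedSize::UseProvided(Some(data.len() as u64)),
    };
    lzma_decompress_with_options(
        &mut BufReader::new(compressed.as_slice()),
        &mut decompressed,
        &decode_options,
    )
    .expect("decompression failed");

    assert_eq!(&decompressed[..], &data[..]);
}
```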

