From 017ec9e85ac9a448b2b626f0c44f1d83bdcf26fa Mon Sep 17 00:00:00 2001
From: Con Kolivas
Date: Fri, 5 Nov 2010 23:02:58 +1100
Subject: [PATCH] Move the ram allocation phase into rzip_fd to be able to get
 a more accurate measure of percentage done.

Prevent failure when offset is not a multiple of page size.
Add chunk percentage complete to output.
Tweak output at various verbosities.
Update documentation to reflect improved performance of unlimited mode.
Update benchmark results.
More tidying.
---
 README                |   28 +++++++-
 doc/README.benchmarks |   16 ++---
 main.c                |    5 ++
 rzip.c                |  155 ++++++++++++++++++++++--------------
 stream.c              |   12 ++--
 5 files changed, 126 insertions(+), 90 deletions(-)

diff --git a/README b/README
index ff7a4d7c..d2445cba 100644
--- a/README
+++ b/README
@@ -74,8 +74,7 @@
 a compression window larger than your ramsize, if the file is that large. It
 does this (with the -U option) by implementing one large mmap buffer as per
 normal, and a smaller moving buffer to track which part of the file is
 currently being examined, emulating a much larger single mmapped buffer.
-Unfortunately this mode is 100 times slower once lrzip begins examining the
-ram beyond the larger base window.
+Unfortunately this mode is many times slower.
 
 See the file README.benchmarks in doc/ for performance examples and what kind
 of data lrzip is very good with.
@@ -105,7 +104,26 @@
 Q. I want the absolute maximum compression I can possibly get, what do I do?
 A. Try the command line options -MUz. This will use all available ram and
 ZPAQ compression, and even use a compression window larger than the ram you have.
 Expect serious swapping to occur if your file is larger than your ram and for
-it to take 1000 times longer. A more practical option is just -M.
+it to take many times longer.
+
+Q. How much slower is the unlimited mode?
+A. It depends on two things. First, how much larger than your ram the file is:
+the bigger the difference, the slower it will be. Second, how much redundant
+data there is: the more there is, the slower it will be, but ultimately the
+better the compression. Using the example of a 10GB virtual image on a machine
+with 8GB ram, lrzip would allocate about 5.5GB by default, yet is capable of
+allocating all the ram for the 10GB file in -M mode.
+
+Options	Size		Compress	Decompress
+-l	1793312108	05m13s		3m12s
+-lM	1413268368	04m18s		2m54s
+-lU	1413268368	06m05s		2m54s
+
+As you can see, the -U option gives the same compression in this case as the
+-M option, for about 50% more time. The advantage of using -U is that it will
+work even when the file size cannot be encompassed by -M, though progressively
+more slowly. Why isn't it on by default? Because if the compression window is
+a LOT larger than ram and the data is highly redundant, it can be drastically
+slower.
 
 Q. Can I use your tool for even more compression than lzma offers?
 A. Yes, the rzip preparation of files makes them more compressible by every
@@ -288,6 +306,10 @@
 A. Yes, that's the nature of the compression/decompression mechanism. The jump
 is because the rzip preparation makes the amount of data much smaller than the
 compression backend (lzma) needs to compress.
 
+Q. Why doesn't the percentage counter always get to 100%?
+A. It is quite hard to predict during the rzip phase how long it will take, as
+lots of redundant data will not count towards the percentage.
+
 Q. Tell me about patented compression algorithms, GPL, lawyers and copyright.
 A. No
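The sliding mmap scheme the README describes above can be pictured with a short
standalone sketch. This is not lrzip's actual code: the names sliding_buf and
sliding_get are hypothetical, and all the bookkeeping lrzip does for stdin and
chunk offsets is left out. One large low buffer maps the start of the chunk as
normal, and a small high buffer is remapped on demand over whatever region
beyond it is currently being examined. Note that the offset handed to mmap()
must be a multiple of the page size, which is the failure the commit subject
refers to.

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

struct sliding_buf {
	unsigned char *buf_low;		/* large fixed mapping of the chunk start */
	unsigned char *buf_high;	/* small window remapped on demand */
	int64_t size_low;		/* bytes covered by buf_low */
	int64_t offset_high;		/* chunk offset buf_high currently maps */
	int64_t high_length;		/* size of the high window */
	long page_size;			/* from sysconf(_SC_PAGESIZE) */
	int fd;
};

/* Return a pointer to the byte at chunk offset p, remapping the high
 * buffer whenever p falls outside the region it currently covers. */
static unsigned char *sliding_get(struct sliding_buf *sb, int64_t p)
{
	if (p < sb->size_low)
		return sb->buf_low + p;
	if (p < sb->offset_high || p >= sb->offset_high + sb->high_length) {
		/* mmap offsets must be page aligned */
		int64_t aligned = p - p % sb->page_size;

		if (sb->buf_high)
			munmap(sb->buf_high, sb->high_length);
		sb->buf_high = mmap(NULL, sb->high_length, PROT_READ,
				    MAP_SHARED, sb->fd, aligned);
		if (sb->buf_high == MAP_FAILED) {
			sb->buf_high = NULL;
			return NULL;
		}
		sb->offset_high = aligned;
	}
	return sb->buf_high + (p - sb->offset_high);
}

Every access beyond the low buffer that misses the high window costs a
munmap/mmap pair, which is why unlimited mode gets progressively slower the
further past ram the search ranges.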
diff --git a/doc/README.benchmarks b/doc/README.benchmarks
index 0458d43e..600f0d07 100644
--- a/doc/README.benchmarks
+++ b/doc/README.benchmarks
@@ -1,4 +1,3 @@
-
 The first comparison is that of a linux kernel tarball (2.6.31). In all cases
 the default options were used. 3 other common compression apps were used for
 comparison, 7z which is an excellent all-round lzma based compression app,
@@ -94,9 +93,9 @@ These benchmarks were done on the quad core with version 0.5.1
 Compression	Size		Percentage	Compress Time	Decompress Time
 None		10737418240	100.0
 gzip		2772899756	25.8		05m47.35s	2m46.77s
-bzip2		2704781700	25.2		20m34.269s	7m51.362s
-xz		2272322208	21.2		58m26.829s	4m46.154s
-7z		2242897134	20.9		29m28.152s	6m35.952s
+bzip2		2704781700	25.2		16m15.603s	6m19.718s
+xz		2272322208	21.2		50m58.437s	3m52.734s
+7z		2242897134	20.9		26m36.333s	5m41.198s
 lrzip		1354237684	12.6		29m13.402s	6m55.441s
 lrzip -M	1079528708	10.1		23m44.226s	4m05.461s
 lrzip -l	1793312108	16.7		05m13.246s	3m12.886s
@@ -107,10 +106,11 @@ lrzip -zM	1066902006	9.9		04h07m14s	04h08m
 
 At this end of the spectrum things really start to heat up. The compression
 advantage is massive, with the lzo backend even giving much better results than
-7z, and over a ridiculously short time. What appears to be a big disappointment
-is actually zpaq here which takes more than 8 times longer than lzma for a
-measly .2% improvement. The reason is that most of the advantage here is
-achieved by the rzip first stage since there's a lot of redundant space over
+7z, and in a ridiculously short time. The default lzma backend is slightly
+slower than 7z, but provides a lot more compression. What appears to be a big
+disappointment here is actually zpaq, which takes more than 8 times longer than
+lzma for a measly .2% improvement. The reason is that most of the advantage
+here is achieved by the rzip first stage, since there is a lot of redundant
 space over huge distances on a virtual image. The -M option, which works the
 memory subsystem rather hard, making a noticeable impact on the rest of the
 machine, also does further wonders for the compression and times.
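The point above about the rzip first stage deserves a concrete illustration:
deflate-based compressors like gzip only match within a 32KB window, so two
identical blocks a couple of gigabytes apart get compressed twice, whereas a
long-range first stage can emit the second copy as a short (distance, length)
reference. The following toy program is purely illustrative and uses a naive
byte scan, nothing like lrzip's hashed search:

#include <stdint.h>
#include <stdio.h>

/* Count how many bytes at buf+pos match the data dist bytes earlier.
 * This is the kind of long-range match the rzip stage encodes as a
 * (distance, length) pair instead of a second literal copy. */
static int64_t match_len(const unsigned char *buf, int64_t pos,
			 int64_t dist, int64_t max)
{
	int64_t n = 0;

	while (n < max && buf[pos + n] == buf[pos - dist + n])
		n++;
	return n;
}

int main(void)
{
	static unsigned char buf[4 << 20];	/* 4MB scratch buffer */
	int64_t dist = 2 << 20, i;		/* copies 2MB apart */

	/* Two identical 1MB regions, far beyond a 32KB window */
	for (i = 0; i < (1 << 20); i++)
		buf[i] = buf[i + dist] = (unsigned char)(i % 251);
	printf("second copy encodable as (dist=%lld, len=%lld)\n",
	       (long long)dist, (long long)match_len(buf, dist, dist, 1 << 20));
	return 0;
}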
diff --git a/main.c b/main.c
index c8ef0429..92848cb8 100644
--- a/main.c
+++ b/main.c
@@ -720,6 +720,11 @@ int main(int argc, char *argv[])
 		control.flags |= FLAG_SHOW_PROGRESS;
 	}
 
+	if (UNLIMITED && STDIN) {
+		print_err("Cannot have -U and stdin, unlimited mode disabled.\n");
+		control.flags &= ~FLAG_UNLIMITED;
+	}
+
 	if (argc < 1)
 		control.flags |= FLAG_STDIN;
 
diff --git a/rzip.c b/rzip.c
index ba1c7cb4..30aaa6c0 100644
--- a/rzip.c
+++ b/rzip.c
@@ -71,6 +71,7 @@ struct rzip_state {
 	i64 tag_clean_ptr;
 	i64 last_match;
 	i64 chunk_size;
+	i64 mmap_size;
 	char chunk_bytes;
 	uint32_t cksum;
 	int fd_in, fd_out;
@@ -485,8 +486,7 @@ static void show_distrib(struct rzip_state *st)
 
 static void hash_search(struct rzip_state *st, double pct_base, double pct_multiple)
 {
-	i64 cksum_limit = 0, pct, lastpct=0;
-	i64 p, end;
+	i64 cksum_limit = 0, p, end;
+	int lastpct = 0, last_chunkpct = 0;
 	tag t = 0;
 	struct {
 		i64 p;
@@ -529,9 +529,8 @@ static void hash_search(struct rzip_state *st, double pct_base, double pct_multi
 	t = full_tag(st, p);
 
 	while (p < end) {
-		i64 offset = 0;
-		i64 mlen;
-		i64 reverse;
+		i64 reverse, mlen, offset = 0;
 
 		p++;
 		sb.offset_search = p;
@@ -570,17 +569,17 @@ static void hash_search(struct rzip_state *st, double pct_base, double pct_multi
 			t = full_tag(st, p);
 		}
 
-		if (unlikely(p % 100 == 0)) {
+		if (unlikely(p % 128 == 0)) {
+			int pct, chunk_pct;
+
 			pct = pct_base + (pct_multiple * (100.0 * p) / st->chunk_size);
-			if (pct != lastpct) {
-				struct stat s1, s2;
-
-				fstat(st->fd_in, &s1);
-				fstat(st->fd_out, &s2);
+			chunk_pct = 100 * p / end;
+			if (pct != lastpct || chunk_pct != last_chunkpct) {
 				if (!STDIN)
-					print_progress("%2lld%%\r", pct);
+					print_progress("Total: %2d%% Chunk: %2d%%\r", pct, chunk_pct);
 				lastpct = pct;
+				last_chunkpct = chunk_pct;
 			}
 		}
 
@@ -593,6 +592,7 @@ static void hash_search(struct rzip_state *st, double pct_base, double pct_multi
 		}
 	}
 
+	print_progress("\n");
 	if (MAX_VERBOSE)
 		show_distrib(st);
 
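For clarity, the progress change above reports two figures: the chunk
percentage is simply how far the search has advanced through the current
chunk, while the total percentage scales that position by the fraction of the
whole file this chunk represents (pct_multiple) and adds the part already
completed (pct_base). A self-contained sketch of the arithmetic with made-up
sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t chunk_size = 1000, p;	/* hypothetical 1000 byte chunk */
	double pct_base = 60.0;		/* 60% of the file in earlier chunks */
	double pct_multiple = 0.4;	/* this chunk is the remaining 40% */

	for (p = 0; p <= chunk_size; p += 250) {
		int chunk_pct = 100 * p / chunk_size;
		int pct = pct_base + pct_multiple * (100.0 * p) / chunk_size;

		printf("Total: %2d%% Chunk: %2d%%\n", pct, chunk_pct);
	}
	return 0;
}

Halfway through this chunk it prints "Total: 80% Chunk: 50%", i.e. half of a
chunk that covers the last 40% of the file.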
@@ -667,67 +667,10 @@ static void mmap_stdin(uchar *buf, struct rzip_state *st)
 	}
 }
 
-static const i64 two_gig = 2 * 1000 * 1000 * 1000;
-
 static void init_sliding_mmap(struct rzip_state *st, int fd_in, i64 offset)
 {
-	i64 size = st->chunk_size;
-
-	if (sizeof(long) == 4 && size > two_gig) {
-		print_verbose("Limiting to 2GB due to 32 bit limitations\n");
-		size = two_gig;
-	}
-	sb.orig_offset = offset;
-retry:
-	/* Mmapping anonymously first will tell us how much ram we can use in
-	 * advance and zeroes it which has a defragmenting effect on ram
-	 * before the real read in. We can map a lot more file backed ram than
-	 * anonymous ram so do not do this preallocation in MAXRAM mode. Using
-	 * the larger mmapped window will cause a lot more ram trashing of the
-	 * system so we do not use MAXRAM mode by default. */
-	if (!MAXRAM || STDIN) {
-		sb.buf_low = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-		/* Better to shrink the window to the largest size that works than fail */
-		if (sb.buf_low == MAP_FAILED) {
-			size = size / 10 * 9;
-			size -= size % 4096;
-			if (unlikely(!size))
-				fatal("Unable to mmap any ram\n");
-			goto retry;
-		}
-		print_maxverbose("Succeeded in preallocating %lld sized mmap\n", size);
-		if (!STDIN) {
-			if (unlikely(munmap(sb.buf_low, size)))
-				fatal("Failed to munmap\n");
-		} else
-			st->chunk_size = size;
-	}
-
-	print_verbose("Reading file into mmapped ram...\n");
-	if (!STDIN) {
-		sb.buf_low = (uchar *)mmap(sb.buf_low, size, PROT_READ, MAP_SHARED, fd_in, offset);
-		if (sb.buf_low == MAP_FAILED) {
-			size = size / 10 * 9;
-			size -= size % 4096;
-			if (unlikely(!size))
-				fatal("Unable to mmap any ram\n");
-			goto retry;
-		}
-	} else
-		mmap_stdin(sb.buf_low, st);
-	print_maxverbose("Succeeded in allocating %lld sized mmap\n", size);
-
-	if (size < st->chunk_size) {
-		if (UNLIMITED && !STDIN)
-			print_verbose("File is beyond window size, will proceed in unlimited mode with a sliding_mmap buffer but may be much slower\n");
-		else {
-			print_verbose("Needed to shrink window size to %lld\n", size);
-			st->chunk_size = size;
-		}
-	}
-
 	/* Initialise the high buffer */
-	if (UNLIMITED && !STDIN) {
+	if (UNLIMITED) {
 		sb.high_length = 65536;
 		sb.buf_high = (uchar *)mmap(NULL, sb.high_length, PROT_READ, MAP_SHARED, fd_in, offset);
 		if (unlikely(sb.buf_high == MAP_FAILED))
@@ -737,7 +680,7 @@ static void init_sliding_mmap(struct rzip_state *st, int fd_in, i64 offset)
 	}
 	sb.offset_low = 0;
 	sb.offset_search = 0;
-	sb.size_low = size;
+	sb.size_low = st->mmap_size;
 	sb.orig_size = st->chunk_size;
 	sb.fd = fd_in;
 }
@@ -748,22 +691,31 @@ static void rzip_chunk(struct rzip_state *st, int fd_in, int fd_out, i64 offset,
 		double pct_base, double pct_multiple)
 {
 	init_sliding_mmap(st, fd_in, offset);
+
 	st->ss = open_stream_out(fd_out, NUM_STREAMS, st->chunk_size);
 	if (unlikely(!st->ss))
 		fatal("Failed to open streams in rzip_chunk\n");
+
+	print_verbose("Performing rzip pre-processing phase\n");
 	hash_search(st, pct_base, pct_multiple);
 
+	/* unmap buffer before closing and reallocating streams */
 	if (unlikely(munmap(sb.buf_low, sb.size_low)))
 		fatal("Failed to munmap in rzip_chunk\n");
-	if (UNLIMITED && !STDIN) {
+	if (UNLIMITED) {
 		if (unlikely(munmap(sb.buf_high, sb.size_high)))
 			fatal("Failed to munmap in rzip_chunk\n");
 	}
 
+	if (!NO_COMPRESS)
+		print_verbose("Passing chunk data to backend compressor\n");
 	if (unlikely(close_stream_out(st->ss)))
 		fatal("Failed to flush/close streams in rzip_chunk\n");
 }
 
+/* Needs to be less than 2GB and page aligned on 32 bit machines */
+static const i64 two_gig = (1ull << 31) - 4096;
+
 /* compress a whole file chunks at a time */
 void rzip_fd(int fd_in, int fd_out)
 {
@@ -808,6 +760,8 @@ void rzip_fd(int fd_in, int fd_out)
 		else
 			chunk_window = len;
 	}
+	if (chunk_window < len)
+		chunk_window -= chunk_window % 4096;
 	st->chunk_size = chunk_window;
 
 	st->level = &levels[control.compression_level];
@@ -825,6 +779,7 @@ void rzip_fd(int fd_in, int fd_out)
 
 	while (len > 0 || (STDIN && !st->stdin_eof)) {
 		double pct_base, pct_multiple;
+		i64 offset = s.st_size - len;
 		int bits = 8;
 
 		/* Flushing the dirty data will decrease our chances of
@@ -834,8 +789,62 @@
 		if (last_chunk)
 			print_verbose("Flushing data to disk.\n");
 		fsync(fd_out);
+
 		if (st->chunk_size > len && !STDIN)
 			st->chunk_size = len;
+		st->mmap_size = st->chunk_size;
+		if (sizeof(long) == 4 && st->mmap_size > two_gig) {
+			print_verbose("Limiting to 2GB due to 32 bit limitations\n");
+			st->mmap_size = two_gig;
+		}
+
+		print_maxverbose("Reading file into mmapped ram...\n");
+retry:
+		/* Mmapping anonymously first will tell us how much ram we can use in
+		 * advance and zeroes it which has a defragmenting effect on ram
+		 * before the real read in. */
+		sb.buf_low = mmap(NULL, st->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+		/* Better to shrink the window to the largest size that works than fail */
+		if (sb.buf_low == MAP_FAILED) {
+			st->mmap_size = st->mmap_size / 10 * 9;
+			st->mmap_size -= st->mmap_size % 4096;
+			if (unlikely(!st->mmap_size))
+				fatal("Unable to mmap any ram\n");
+			goto retry;
+		}
+
+		/* Release the probe mapping; in STDIN mode the anonymous
+		 * buffer is kept as the read buffer */
+		if (!STDIN) {
+			if (unlikely(munmap(sb.buf_low, st->mmap_size)))
+				fatal("Failed to munmap\n");
+		}
+
+		if (!MAXRAM) {
+			print_maxverbose("Succeeded in allocating %lld sized mmap\n", st->mmap_size);
+			if (!UNLIMITED)
+				st->chunk_size = st->mmap_size;
+		} else
+			st->mmap_size = st->chunk_size;
+
+		if (!STDIN) {
+			/* Map the file itself for !STDIN mode */
+			sb.buf_low = (uchar *)mmap(sb.buf_low, st->mmap_size, PROT_READ, MAP_SHARED, fd_in, offset);
+			if (sb.buf_low == MAP_FAILED) {
+				if (unlikely(!MAXRAM))
+					fatal("Failed to remap ram\n");
+				st->mmap_size = st->mmap_size / 10 * 9;
+				st->mmap_size -= st->mmap_size % 4096;
+				if (unlikely(!st->mmap_size))
+					fatal("Unable to mmap any ram\n");
+				goto retry;
+			}
+		} else
+			mmap_stdin(sb.buf_low, st);
+
+		if (st->mmap_size < st->chunk_size)
+			print_verbose("Compression window is larger than ram allocated, will proceed in unlimited mode, possibly much slower\n");
+
+		sb.orig_offset = offset;
 		print_maxverbose("Chunk size: %lld\n", st->chunk_size);
 
 		/* Determine the chunk byte width and write it to the file
@@ -874,7 +883,7 @@
 		}
 		last.tv_sec = current.tv_sec;
 		last.tv_usec = current.tv_usec;
-		rzip_chunk(st, fd_in, fd_out, s.st_size - len, pct_base, pct_multiple);
+		rzip_chunk(st, fd_in, fd_out, offset, pct_base, pct_multiple);
 		/* st->chunk_bytes may be shrunk in rzip_chunk */
 		last_chunk = st->chunk_size;
 		len -= st->chunk_size;
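The allocation code moved into rzip_fd() above never gives up on the first
mmap failure: it shrinks the request to 90%, trims it back to a page boundary
and retries. The same strategy in isolation, as a sketch around a hypothetical
alloc_window() helper (not an lrzip function):

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

/* Try to map want bytes of anonymous ram, shrinking by 10% on each
 * failure. Returns the mapped address and stores the achieved size in
 * *got, or returns NULL if not even one page could be mapped. */
static void *alloc_window(int64_t want, int64_t *got)
{
	void *buf;

	while (want > 0) {
		buf = mmap(NULL, want, PROT_READ | PROT_WRITE,
			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
		if (buf != MAP_FAILED) {
			*got = want;
			return buf;
		}
		want = want / 10 * 9;	/* shrink to 90%... */
		want -= want % 4096;	/* ...and keep it page aligned */
	}
	*got = 0;
	return NULL;
}

In the patch itself the anonymous mapping is only a probe: it measures how
much ram is actually available (and zeroes it, which the old comment credits
with a defragmenting effect), after which rzip_fd() unmaps it and maps the
file over the same size, or keeps it as the read buffer when reading from
stdin.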
diff --git a/stream.c b/stream.c
index 88545e0f..c1d982b6 100644
--- a/stream.c
+++ b/stream.c
@@ -308,7 +308,7 @@ static int zpaq_decompress_buf(struct stream *s)
 	s->buf = c_buf;
 
 	if (unlikely((i64)dlen != s->buflen)) {
-		print_err("Inconsistent length after decompression. Got %d bytes, expected %lld\n", dlen, s->buflen);
+		print_err("Inconsistent length after decompression. Got %lld bytes, expected %lld\n", (i64)dlen, s->buflen);
 		return -1;
 	}
 
@@ -380,7 +380,7 @@ static int lzma_decompress_buf(struct stream *s, size_t c_len)
 	c_buf = s->buf;
 	s->buf = malloc(dlen);
 	if (unlikely(!s->buf)) {
-		print_err("Failed to allocate %d bytes for decompression\n", dlen);
+		print_err("Failed to allocate %lld bytes for decompression\n", (i64)dlen);
 		return -1;
 	}
 
@@ -393,7 +393,7 @@
 	}
 
 	if (unlikely((i64)dlen != s->buflen)) {
-		print_err("Inconsistent length after decompression. Got %d bytes, expected %lld\n", dlen, s->buflen);
+		print_err("Inconsistent length after decompression. Got %lld bytes, expected %lld\n", (i64)dlen, s->buflen);
 		return -1;
 	}
 
@@ -410,7 +410,7 @@ static int lzo_decompress_buf(struct stream *s, i64 c_len)
 	c_buf = s->buf;
 	s->buf = malloc(dlen);
 	if (unlikely(!s->buf)) {
-		print_err("Failed to allocate %d bytes for decompression\n", (int)dlen);
+		print_err("Failed to allocate %lu bytes for decompression\n", (unsigned long)dlen);
 		return -1;
 	}
 
@@ -421,7 +421,7 @@
 	}
 
 	if (unlikely((i64)dlen != s->buflen)) {
-		print_err("Inconsistent length after decompression. Got %d bytes, expected %lld\n", (int)dlen, s->buflen);
+		print_err("Inconsistent length after decompression. Got %lu bytes, expected %lld\n", (unsigned long)dlen, s->buflen);
 		return -1;
 	}
 
@@ -492,7 +492,7 @@ static int write_buf(int f, uchar *p, i64 len)
 		return -1;
 	}
 	if (unlikely(ret != (ssize_t)len)) {
-		print_err("Partial write!? asked for %lld bytes but got %d\n", len, ret);
+		print_err("Partial write!? asked for %lld bytes but got %lld\n", len, (i64)ret);
 		return -1;
 	}
 	return 0;
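All of the stream.c hunks fix the same class of bug: passing a 64 bit or
size_t value to a %d conversion, which is undefined behaviour in a variadic
call and prints garbage wherever int is 32 bits. The patch's convention is to
cast to a type of known width and use the matching length modifier; the PRId64
macro from <inttypes.h> is the standard alternative. A minimal illustration
with an arbitrary value:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	int64_t dlen = 5368709120LL;	/* 5GB, does not fit in 32 bits */

	/* printf("%d\n", dlen) here would be undefined behaviour */
	printf("%lld bytes\n", (long long)dlen);	/* the patch's style */
	printf("%" PRId64 " bytes\n", dlen);		/* <inttypes.h> style */
	return 0;
}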