From 017ec9e85ac9a448b2b626f0c44f1d83bdcf26fa Mon Sep 17 00:00:00 2001
From: Con Kolivas
Date: Fri, 5 Nov 2010 23:02:58 +1100
Subject: [PATCH] Move the ram allocation phase into rzip_fd to be able to get
 a more accurate measure of percentage done.

Prevent failure when offset is not a multiple of page size.
Add chunk percentage complete to output.
Tweak output at various verbosities.
Update documentation to reflect improved performance of unlimited mode.
Update benchmark results.
More tidying.
---
 README                |   28 +++++++-
 doc/README.benchmarks |   16 ++---
 main.c                |    5 ++
 rzip.c                |  155 ++++++++++++++++++++++--------------
 stream.c              |   12 ++--
 5 files changed, 126 insertions(+), 90 deletions(-)

diff --git a/README b/README
index ff7a4d7c..d2445cba 100644
--- a/README
+++ b/README
@@ -74,8 +74,7 @@
 a compression window larger than your ramsize, if the file is that large. It
 does this (with the -U option) by implementing one large mmap buffer as per
 normal, and a smaller moving buffer to track which part of the file is
 currently being examined, emulating a much larger single mmapped buffer.
-Unfortunately this mode is 100 times slower once lrzip begins examining the
-ram beyond the larger base window.
+Unfortunately this mode is many times slower.
 
 See the file README.benchmarks in doc/ for performance examples and what kind
 of data lrzip is very good with.
@@ -105,7 +104,26 @@
 Q. I want the absolute maximum compression I can possibly get, what do I do?
 A. Try the command line options -MUz. This will use all available ram and
 ZPAQ compression, and even use a compression window larger than the ram you have.
 Expect serious swapping to occur if your file is larger than your ram and for
-it to take 1000 times longer. A more practical option is just -M.
+it to take many times longer.
+
+Q. How much slower is the unlimited mode?
+A. It depends on two things. First, how much larger than your ram the file is:
+the bigger the difference, the slower it will be. Second, how much redundant
+data there is: the more there is, the slower it will be, but ultimately the
+better the compression. Using the example of a 10GB virtual image on a machine
+with 8GB ram, lrzip would allocate about 5.5GB by default, yet is capable of
+allocating all the ram for the 10GB file in -M mode.
+
+Options	Size		Compress	Decompress
+-l	1793312108	05m13s		3m12s
+-lM	1413268368	04m18s		2m54s
+-lU	1413268368	06m05s		2m54s
+
+As you can see, the -U option gives the same compression in this case as the
+-M option, for about 50% more time. The advantage of using -U is that it will
+work even when the file size cannot be encompassed by -M, though progressively
+more slowly. Why isn't it on by default? Because if the compression window is
+a LOT larger than ram and the data is highly redundant, it can be drastically
+slower.
 
 Q. Can I use your tool for even more compression than lzma offers?
 A. Yes, the rzip preparation of files makes them more compressible by every
@@ -288,6 +306,10 @@
 A. Yes, that's the nature of the compression/decompression mechanism. The jump
 is because the rzip preparation makes the amount of data much smaller than the
 compression backend (lzma) needs to compress.
 
+Q. Why doesn't the percentage counter always get to 100%?
+A. It is quite hard to predict during the rzip phase how long it will take, as
+lots of redundant data will not count towards the percentage.
+
 Q. Tell me about patented compression algorithms, GPL, lawyers and copyright.
 A. No
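The sliding mmap scheme the README describes above can be pictured with a short
standalone sketch. This is not lrzip's actual code: the names sliding_buf and
sliding_get are hypothetical, and all the bookkeeping lrzip does for stdin and
chunk offsets is left out. One large low buffer maps the start of the chunk as
normal, and a small high buffer is remapped on demand over whatever region
beyond it is currently being examined. Note that the offset handed to mmap()
must be a multiple of the page size, which is the failure the commit subject
refers to.

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

struct sliding_buf {
	unsigned char *buf_low;		/* large fixed mapping of the chunk start */
	unsigned char *buf_high;	/* small window remapped on demand */
	int64_t size_low;		/* bytes covered by buf_low */
	int64_t offset_high;		/* chunk offset buf_high currently maps */
	int64_t high_length;		/* size of the high window */
	long page_size;			/* from sysconf(_SC_PAGESIZE) */
	int fd;
};

/* Return a pointer to the byte at chunk offset p, remapping the high
 * buffer whenever p falls outside the region it currently covers. */
static unsigned char *sliding_get(struct sliding_buf *sb, int64_t p)
{
	if (p < sb->size_low)
		return sb->buf_low + p;
	if (p < sb->offset_high || p >= sb->offset_high + sb->high_length) {
		/* mmap offsets must be page aligned */
		int64_t aligned = p - p % sb->page_size;

		if (sb->buf_high)
			munmap(sb->buf_high, sb->high_length);
		sb->buf_high = mmap(NULL, sb->high_length, PROT_READ,
				    MAP_SHARED, sb->fd, aligned);
		if (sb->buf_high == MAP_FAILED) {
			sb->buf_high = NULL;
			return NULL;
		}
		sb->offset_high = aligned;
	}
	return sb->buf_high + (p - sb->offset_high);
}

Every access beyond the low buffer that misses the high window costs a
munmap/mmap pair, which is why unlimited mode gets progressively slower the
further past ram the search ranges.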
diff --git a/doc/README.benchmarks b/doc/README.benchmarks
index 0458d43e..600f0d07 100644
--- a/doc/README.benchmarks
+++ b/doc/README.benchmarks
@@ -1,4 +1,3 @@
-
 The first comparison is that of a linux kernel tarball (2.6.31). In all cases
 the default options were used. 3 other common compression apps were used for
 comparison, 7z which is an excellent all-round lzma based compression app,
@@ -94,9 +93,9 @@ These benchmarks were done on the quad core with version 0.5.1
 Compression	Size		Percentage	Compress Time	Decompress Time
 None		10737418240	100.0
 gzip		2772899756	25.8		05m47.35s	2m46.77s
-bzip2		2704781700	25.2		20m34.269s	7m51.362s
-xz		2272322208	21.2		58m26.829s	4m46.154s
-7z		2242897134	20.9		29m28.152s	6m35.952s
+bzip2		2704781700	25.2		16m15.603s	6m19.718s
+xz		2272322208	21.2		50m58.437s	3m52.734s
+7z		2242897134	20.9		26m36.333s	5m41.198s
 lrzip		1354237684	12.6		29m13.402s	6m55.441s
 lrzip -M	1079528708	10.1		23m44.226s	4m05.461s
 lrzip -l	1793312108	16.7		05m13.246s	3m12.886s
@@ -107,10 +106,11 @@ lrzip -zM	1066902006	9.9		04h07m14s	04h08m
 
 At this end of the spectrum things really start to heat up. The compression
 advantage is massive, with the lzo backend even giving much better results than
-7z, and over a ridiculously short time. What appears to be a big disappointment
-is actually zpaq here which takes more than 8 times longer than lzma for a
-measly .2% improvement. The reason is that most of the advantage here is
-achieved by the rzip first stage since there's a lot of redundant space over
+7z, and in a ridiculously short time. The default lzma backend is slightly
+slower than 7z, but provides a lot more compression. What appears to be a big
+disappointment here is actually zpaq, which takes more than 8 times longer than
+lzma for a measly .2% improvement. The reason is that most of the advantage
+here is achieved by the rzip first stage, since there is a lot of redundant
 space over huge distances on a virtual image. The -M option, which works the
 memory subsystem rather hard, making a noticeable impact on the rest of the
 machine, also does further wonders for the compression and times.
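The point above about the rzip first stage deserves a concrete illustration:
deflate-based compressors like gzip only match within a 32KB window, so two
identical blocks a couple of gigabytes apart get compressed twice, whereas a
long-range first stage can emit the second copy as a short (distance, length)
reference. The following toy program is purely illustrative and uses a naive
byte scan, nothing like lrzip's hashed search:

#include <stdint.h>
#include <stdio.h>

/* Count how many bytes at buf+pos match the data dist bytes earlier.
 * This is the kind of long-range match the rzip stage encodes as a
 * (distance, length) pair instead of a second literal copy. */
static int64_t match_len(const unsigned char *buf, int64_t pos,
			 int64_t dist, int64_t max)
{
	int64_t n = 0;

	while (n < max && buf[pos + n] == buf[pos - dist + n])
		n++;
	return n;
}

int main(void)
{
	static unsigned char buf[4 << 20];	/* 4MB scratch buffer */
	int64_t dist = 2 << 20, i;		/* copies 2MB apart */

	/* Two identical 1MB regions, far beyond a 32KB window */
	for (i = 0; i < (1 << 20); i++)
		buf[i] = buf[i + dist] = (unsigned char)(i % 251);
	printf("second copy encodable as (dist=%lld, len=%lld)\n",
	       (long long)dist, (long long)match_len(buf, dist, dist, 1 << 20));
	return 0;
}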
diff --git a/main.c b/main.c
index c8ef0429..92848cb8 100644
--- a/main.c
+++ b/main.c
@@ -720,6 +720,11 @@ int main(int argc, char *argv[])
 		control.flags |= FLAG_SHOW_PROGRESS;
 	}
 
+	if (UNLIMITED && STDIN) {
+		print_err("Cannot have -U and stdin, unlimited mode disabled.\n");
+		control.flags &= ~FLAG_UNLIMITED;
+	}
+
 	if (argc < 1)
 		control.flags |= FLAG_STDIN;
 
diff --git a/rzip.c b/rzip.c
index ba1c7cb4..30aaa6c0 100644
--- a/rzip.c
+++ b/rzip.c
@@ -71,6 +71,7 @@ struct rzip_state {
 	i64 tag_clean_ptr;
 	i64 last_match;
 	i64 chunk_size;
+	i64 mmap_size;
 	char chunk_bytes;
 	uint32_t cksum;
 	int fd_in, fd_out;
@@ -485,8 +486,7 @@ static void show_distrib(struct rzip_state *st)
 
 static void hash_search(struct rzip_state *st, double pct_base, double pct_multiple)
 {
-	i64 cksum_limit = 0, pct, lastpct=0;
-	i64 p, end;
+	i64 cksum_limit = 0, p, end;
+	int lastpct = 0, last_chunkpct = 0;
 	tag t = 0;
 	struct {
 		i64 p;
@@ -529,9 +529,8 @@ static void hash_search(struct rzip_state *st, double pct_base, double pct_multi
 	t = full_tag(st, p);
 
 	while (p < end) {
-		i64 offset = 0;
-		i64 mlen;
-		i64 reverse;
+		i64 reverse, mlen, offset = 0;
 
 		p++;
 		sb.offset_search = p;
@@ -570,17 +569,17 @@ static void hash_search(struct rzip_state *st, double pct_base, double pct_multi
 			t = full_tag(st, p);
 		}
 
-		if (unlikely(p % 100 == 0)) {
+		if (unlikely(p % 128 == 0)) {
+			int pct, chunk_pct;
+
 			pct = pct_base + (pct_multiple * (100.0 * p) / st->chunk_size);
-			if (pct != lastpct) {
-				struct stat s1, s2;
-
-				fstat(st->fd_in, &s1);
-				fstat(st->fd_out, &s2);
+			chunk_pct = 100 * p / end;
+			if (pct != lastpct || chunk_pct != last_chunkpct) {
 				if (!STDIN)
-					print_progress("%2lld%%\r", pct);
+					print_progress("Total: %2d%% Chunk: %2d%%\r", pct, chunk_pct);
 				lastpct = pct;
+				last_chunkpct = chunk_pct;
 			}
 		}
 
@@ -593,6 +592,7 @@ static void hash_search(struct rzip_state *st, double pct_base, double pct_multi
 		}
 	}
 
+	print_progress("\n");
 	if (MAX_VERBOSE)
 		show_distrib(st);
 
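For clarity, the progress change above reports two figures: the chunk
percentage is simply how far the search has advanced through the current
chunk, while the total percentage scales that position by the fraction of the
whole file this chunk represents (pct_multiple) and adds the part already
completed (pct_base). A self-contained sketch of the arithmetic with made-up
sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t chunk_size = 1000, p;	/* hypothetical 1000 byte chunk */
	double pct_base = 60.0;		/* 60% of the file in earlier chunks */
	double pct_multiple = 0.4;	/* this chunk is the remaining 40% */

	for (p = 0; p <= chunk_size; p += 250) {
		int chunk_pct = 100 * p / chunk_size;
		int pct = pct_base + pct_multiple * (100.0 * p) / chunk_size;

		printf("Total: %2d%% Chunk: %2d%%\n", pct, chunk_pct);
	}
	return 0;
}

Halfway through this chunk it prints "Total: 80% Chunk: 50%", i.e. half of a
chunk that covers the last 40% of the file.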
@@ -667,67 +667,10 @@ static void mmap_stdin(uchar *buf, struct rzip_state *st)
 	}
 }
 
-static const i64 two_gig = 2 * 1000 * 1000 * 1000;
-
 static void init_sliding_mmap(struct rzip_state *st, int fd_in, i64 offset)
 {
-	i64 size = st->chunk_size;
-
-	if (sizeof(long) == 4 && size > two_gig) {
-		print_verbose("Limiting to 2GB due to 32 bit limitations\n");
-		size = two_gig;
-	}
-	sb.orig_offset = offset;
-retry:
-	/* Mmapping anonymously first will tell us how much ram we can use in
-	 * advance and zeroes it which has a defragmenting effect on ram
-	 * before the real read in. We can map a lot more file backed ram than
-	 * anonymous ram so do not do this preallocation in MAXRAM mode. Using
-	 * the larger mmapped window will cause a lot more ram trashing of the
-	 * system so we do not use MAXRAM mode by default. */
-	if (!MAXRAM || STDIN) {
-		sb.buf_low = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-		/* Better to shrink the window to the largest size that works than fail */
-		if (sb.buf_low == MAP_FAILED) {
-			size = size / 10 * 9;
-			size -= size % 4096;
-			if (unlikely(!size))
-				fatal("Unable to mmap any ram\n");
-			goto retry;
-		}
-		print_maxverbose("Succeeded in preallocating %lld sized mmap\n", size);
-		if (!STDIN) {
-			if (unlikely(munmap(sb.buf_low, size)))
-				fatal("Failed to munmap\n");
-		} else
-			st->chunk_size = size;
-	}
-
-	print_verbose("Reading file into mmapped ram...\n");
-	if (!STDIN) {
-		sb.buf_low = (uchar *)mmap(sb.buf_low, size, PROT_READ, MAP_SHARED, fd_in, offset);
-		if (sb.buf_low == MAP_FAILED) {
-			size = size / 10 * 9;
-			size -= size % 4096;
-			if (unlikely(!size))
-				fatal("Unable to mmap any ram\n");
-			goto retry;
-		}
-	} else
-		mmap_stdin(sb.buf_low, st);
-	print_maxverbose("Succeeded in allocating %lld sized mmap\n", size);
-
-	if (size < st->chunk_size) {
-		if (UNLIMITED && !STDIN)
-			print_verbose("File is beyond window size, will proceed in unlimited mode with a sliding_mmap buffer but may be much slower\n");
-		else {
-			print_verbose("Needed to shrink window size to %lld\n", size);
-			st->chunk_size = size;
-		}
-	}
-
 	/* Initialise the high buffer */
-	if (UNLIMITED && !STDIN) {
+	if (UNLIMITED) {
 		sb.high_length = 65536;
 		sb.buf_high = (uchar *)mmap(NULL, sb.high_length, PROT_READ, MAP_SHARED, fd_in, offset);
 		if (unlikely(sb.buf_high == MAP_FAILED))
@@ -737,7 +680,7 @@ static void init_sliding_mmap(struct rzip_state *st, int fd_in, i64 offset)
 	}
 	sb.offset_low = 0;
 	sb.offset_search = 0;
-	sb.size_low = size;
+	sb.size_low = st->mmap_size;
 	sb.orig_size = st->chunk_size;
 	sb.fd = fd_in;
 }
@@ -748,22 +691,31 @@ static void rzip_chunk(struct rzip_state *st, int fd_in, int fd_out, i64 offset,
 		double pct_base, double pct_multiple)
 {
 	init_sliding_mmap(st, fd_in, offset);
+
 	st->ss = open_stream_out(fd_out, NUM_STREAMS, st->chunk_size);
 	if (unlikely(!st->ss))
 		fatal("Failed to open streams in rzip_chunk\n");
+
+	print_verbose("Performing rzip pre-processing phase\n");
 	hash_search(st, pct_base, pct_multiple);
 
+	/* unmap buffer before closing and reallocating streams */
 	if (unlikely(munmap(sb.buf_low, sb.size_low)))
 		fatal("Failed to munmap in rzip_chunk\n");
-	if (UNLIMITED && !STDIN) {
+	if (UNLIMITED) {
 		if (unlikely(munmap(sb.buf_high, sb.size_high)))
 			fatal("Failed to munmap in rzip_chunk\n");
 	}
 
+	if (!NO_COMPRESS)
+		print_verbose("Passing chunk data to backend compressor\n");
 	if (unlikely(close_stream_out(st->ss)))
 		fatal("Failed to flush/close streams in rzip_chunk\n");
 }
 
+/* Needs to be less than 2GB and page aligned on 32 bit machines */
+static const i64 two_gig = (1ull << 31) - 4096;
+
 /* compress a whole file chunks at a time */
 void rzip_fd(int fd_in, int fd_out)
 {
@@ -808,6 +760,8 @@ void rzip_fd(int fd_in, int fd_out)
 		else
 			chunk_window = len;
 	}
+	if (chunk_window < len)
+		chunk_window -= chunk_window % 4096;
 	st->chunk_size = chunk_window;
 
 	st->level = &levels[control.compression_level];
@@ -825,6 +779,7 @@ void rzip_fd(int fd_in, int fd_out)
 
 	while (len > 0 || (STDIN && !st->stdin_eof)) {
 		double pct_base, pct_multiple;
+		i64 offset = s.st_size - len;
 		int bits = 8;
 
 		/* Flushing the dirty data will decrease our chances of
@@ -834,8 +789,62 @@
 		if (last_chunk)
 			print_verbose("Flushing data to disk.\n");
 		fsync(fd_out);
+
 		if (st->chunk_size > len && !STDIN)
 			st->chunk_size = len;
+		st->mmap_size = st->chunk_size;
+		if (sizeof(long) == 4 && st->mmap_size > two_gig) {
+			print_verbose("Limiting to 2GB due to 32 bit limitations\n");
+			st->mmap_size = two_gig;
+		}
+
+		print_maxverbose("Reading file into mmapped ram...\n");
+retry:
+		/* Mmapping anonymously first will tell us how much ram we can use in
+		 * advance and zeroes it which has a defragmenting effect on ram
+		 * before the real read in. */
+		sb.buf_low = mmap(NULL, st->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+		/* Better to shrink the window to the largest size that works than fail */
+		if (sb.buf_low == MAP_FAILED) {
+			st->mmap_size = st->mmap_size / 10 * 9;
+			st->mmap_size -= st->mmap_size % 4096;
+			if (unlikely(!st->mmap_size))
+				fatal("Unable to mmap any ram\n");
+			goto retry;
+		}
+
+		/* Release the probe mapping; in STDIN mode the anonymous
+		 * buffer is kept as the read buffer */
+		if (!STDIN) {
+			if (unlikely(munmap(sb.buf_low, st->mmap_size)))
+				fatal("Failed to munmap\n");
+		}
+
+		if (!MAXRAM) {
+			print_maxverbose("Succeeded in allocating %lld sized mmap\n", st->mmap_size);
+			if (!UNLIMITED)
+				st->chunk_size = st->mmap_size;
+		} else
+			st->mmap_size = st->chunk_size;
+
+		if (!STDIN) {
+			/* Map the file itself for !STDIN mode */
+			sb.buf_low = (uchar *)mmap(sb.buf_low, st->mmap_size, PROT_READ, MAP_SHARED, fd_in, offset);
+			if (sb.buf_low == MAP_FAILED) {
+				if (unlikely(!MAXRAM))
+					fatal("Failed to remap ram\n");
+				st->mmap_size = st->mmap_size / 10 * 9;
+				st->mmap_size -= st->mmap_size % 4096;
+				if (unlikely(!st->mmap_size))
+					fatal("Unable to mmap any ram\n");
+				goto retry;
+			}
+		} else
+			mmap_stdin(sb.buf_low, st);
+
+		if (st->mmap_size < st->chunk_size)
+			print_verbose("Compression window is larger than ram allocated, will proceed in unlimited mode, possibly much slower\n");
+
+		sb.orig_offset = offset;
 		print_maxverbose("Chunk size: %lld\n", st->chunk_size);
 
 		/* Determine the chunk byte width and write it to the file
@@ -874,7 +883,7 @@
 		}
 		last.tv_sec = current.tv_sec;
 		last.tv_usec = current.tv_usec;
-		rzip_chunk(st, fd_in, fd_out, s.st_size - len, pct_base, pct_multiple);
+		rzip_chunk(st, fd_in, fd_out, offset, pct_base, pct_multiple);
 		/* st->chunk_bytes may be shrunk in rzip_chunk */
 		last_chunk = st->chunk_size;
 		len -= st->chunk_size;
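The allocation code moved into rzip_fd() above never gives up on the first
mmap failure: it shrinks the request to 90%, trims it back to a page boundary
and retries. The same strategy in isolation, as a sketch around a hypothetical
alloc_window() helper (not an lrzip function):

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

/* Try to map want bytes of anonymous ram, shrinking by 10% on each
 * failure. Returns the mapped address and stores the achieved size in
 * *got, or returns NULL if not even one page could be mapped. */
static void *alloc_window(int64_t want, int64_t *got)
{
	void *buf;

	while (want > 0) {
		buf = mmap(NULL, want, PROT_READ | PROT_WRITE,
			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
		if (buf != MAP_FAILED) {
			*got = want;
			return buf;
		}
		want = want / 10 * 9;	/* shrink to 90%... */
		want -= want % 4096;	/* ...and keep it page aligned */
	}
	*got = 0;
	return NULL;
}

In the patch itself the anonymous mapping is only a probe: it measures how
much ram is actually available (and zeroes it, which the old comment credits
with a defragmenting effect), after which rzip_fd() unmaps it and maps the
file over the same size, or keeps it as the read buffer when reading from
stdin.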
diff --git a/stream.c b/stream.c
index 88545e0f..c1d982b6 100644
--- a/stream.c
+++ b/stream.c
@@ -308,7 +308,7 @@ static int zpaq_decompress_buf(struct stream *s)
 	s->buf = c_buf;
 
 	if (unlikely((i64)dlen != s->buflen)) {
-		print_err("Inconsistent length after decompression. Got %d bytes, expected %lld\n", dlen, s->buflen);
+		print_err("Inconsistent length after decompression. Got %lld bytes, expected %lld\n", (i64)dlen, s->buflen);
 		return -1;
 	}
 
@@ -380,7 +380,7 @@ static int lzma_decompress_buf(struct stream *s, size_t c_len)
 	c_buf = s->buf;
 	s->buf = malloc(dlen);
 	if (unlikely(!s->buf)) {
-		print_err("Failed to allocate %d bytes for decompression\n", dlen);
+		print_err("Failed to allocate %lld bytes for decompression\n", (i64)dlen);
 		return -1;
 	}
 
@@ -393,7 +393,7 @@
 	}
 
 	if (unlikely((i64)dlen != s->buflen)) {
-		print_err("Inconsistent length after decompression. Got %d bytes, expected %lld\n", dlen, s->buflen);
+		print_err("Inconsistent length after decompression. Got %lld bytes, expected %lld\n", (i64)dlen, s->buflen);
 		return -1;
 	}
 
@@ -410,7 +410,7 @@ static int lzo_decompress_buf(struct stream *s, i64 c_len)
 	c_buf = s->buf;
 	s->buf = malloc(dlen);
 	if (unlikely(!s->buf)) {
-		print_err("Failed to allocate %d bytes for decompression\n", (int)dlen);
+		print_err("Failed to allocate %lu bytes for decompression\n", (unsigned long)dlen);
 		return -1;
 	}
 
@@ -421,7 +421,7 @@
 	}
 
 	if (unlikely((i64)dlen != s->buflen)) {
-		print_err("Inconsistent length after decompression. Got %d bytes, expected %lld\n", (int)dlen, s->buflen);
+		print_err("Inconsistent length after decompression. Got %lu bytes, expected %lld\n", (unsigned long)dlen, s->buflen);
 		return -1;
 	}
 
@@ -492,7 +492,7 @@ static int write_buf(int f, uchar *p, i64 len)
 		return -1;
 	}
 	if (unlikely(ret != (ssize_t)len)) {
-		print_err("Partial write!? asked for %lld bytes but got %d\n", len, ret);
+		print_err("Partial write!? asked for %lld bytes but got %lld\n", len, (i64)ret);
 		return -1;
 	}
 	return 0;
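All of the stream.c hunks fix the same class of bug: passing a 64 bit or
size_t value to a %d conversion, which is undefined behaviour in a variadic
call and prints garbage wherever int is 32 bits. The patch's convention is to
cast to a type of known width and use the matching length modifier; the PRId64
macro from <inttypes.h> is the standard alternative. A minimal illustration
with an arbitrary value:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	int64_t dlen = 5368709120LL;	/* 5GB, does not fit in 32 bits */

	/* printf("%d\n", dlen) here would be undefined behaviour */
	printf("%lld bytes\n", (long long)dlen);	/* the patch's style */
	printf("%" PRId64 " bytes\n", dlen);		/* <inttypes.h> style */
	return 0;
}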