Permalink
Browse files

Merge pull request #141 from nall/master

Support for searching in gzipped files
  • Loading branch information...
2 parents f671937 + 34cec74 commit e80a032562eb1b1629775b669f8631170e1c208b @ggreer committed Feb 9, 2013
Showing with 248 additions and 5 deletions.
  1. +2 −2 Makefile.am
  2. +1 −1 configure.ac
  3. +170 −0 src/decompress.c
  4. +19 −0 src/decompress.h
  5. +8 −1 src/options.c
  6. +1 −0 src/options.h
  7. +35 −1 src/search.c
  8. +2 −0 src/search.h
  9. +10 −0 src/util.h
View
@@ -1,8 +1,8 @@
ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
bin_PROGRAMS = ag
-ag_SOURCES = src/ignore.c src/log.c src/options.c src/print.c src/scandir.c src/search.c src/util.c src/main.c
-ag_LDADD = ${PCRE_LIBS} -lpthread
+ag_SOURCES = src/ignore.c src/log.c src/options.c src/print.c src/scandir.c src/search.c src/util.c src/decompress.c src/main.c
+ag_LDADD = ${PCRE_LIBS} -lz -lpthread
man_MANS = doc/ag.1
View
@@ -16,7 +16,7 @@ PKG_CHECK_MODULES([PCRE], [libpcre])
CFLAGS="$CFLAGS $PCRE_CFLAGS -Wall -Wextra -std=c89 -D_GNU_SOURCE"
LDFLAGS="$LDFLAGS"
-AC_CHECK_HEADERS([pthread.h])
+AC_CHECK_HEADERS([pthread.h, zlib.h])
AC_CHECK_DECL([PCRE_CONFIG_JIT], [AC_DEFINE([USE_PCRE_JIT], [], [Use PCRE JIT])], [], [#include <pcre.h>])
View
@@ -0,0 +1,170 @@
+#include <zlib.h>
+
+#include "decompress.h"
+
+/* Code in decompress_zlib from
+ *
+ * https://raw.github.com/madler/zlib/master/examples/zpipe.c
+ *
+ * zpipe.c: example of proper use of zlib's inflate() and deflate()
+ * Not copyrighted -- provided to the public domain
+ * Version 1.4 11 December 2005 Mark Adler
+ */
+
+
+static void* decompress_zlib(const void* buf, const int buf_len,
+ const char* dir_full_path, int* new_buf_len) {
+ int ret = 0;
+ unsigned char* result = NULL;
+ size_t result_size = 0;
+ size_t pagesize = 0;
+ z_stream stream;
+
+ log_debug("Decompressing zlib file %s", dir_full_path);
+
+ /* allocate inflate state */
+ stream.zalloc = Z_NULL;
+ stream.zfree = Z_NULL;
+ stream.opaque = Z_NULL;
+ stream.avail_in = 0;
+ stream.next_in = Z_NULL;
+
+ /* Add 32 to allow zlib and gzip format detection */
+ if(inflateInit2(&stream, 32 + 15) != Z_OK) {
+ log_err("Unable to initialize zlib: %s", stream.msg);
+ goto error_out;
+ }
+
+ stream.avail_in = buf_len;
+ stream.next_in = (void*)buf;
+
+ pagesize = getpagesize();
+ result_size = ((buf_len + pagesize - 1) & ~(pagesize - 1));
+ do {
+ do {
+ /* Double the buffer size and realloc */
+ result_size *= 2;
+ result = (unsigned char*)realloc(result, result_size * sizeof(unsigned char));
+ if(result == NULL) {
+ log_err("Unable to allocate %d bytes to decompress file %s", result_size * sizeof(unsigned char), dir_full_path);
+ inflateEnd(&stream);
+ goto error_out;
+ }
+
+ stream.avail_out = result_size / 2;
+ stream.next_out = &result[stream.total_out];
+ ret = inflate(&stream, Z_SYNC_FLUSH);
+ log_debug("inflate ret = %d", ret);
+ switch(ret) {
+ case Z_STREAM_ERROR: {
+ log_err("Found stream error while decompressing zlib stream: %s", stream.msg);
+ inflateEnd(&stream);
+ goto error_out;
+ }
+ case Z_NEED_DICT:
+ case Z_DATA_ERROR:
+ case Z_MEM_ERROR: {
+ log_err("Found mem/data error while decompressing zlib stream: %s", stream.msg);
+ inflateEnd(&stream);
+ goto error_out;
+ }
+ }
+ } while(stream.avail_out == 0);
+ } while(ret == Z_OK);
+
+ *new_buf_len = stream.total_out;
+ inflateEnd(&stream);
+
+ if(ret == Z_STREAM_END) {
+ return result;
+ }
+
+error_out:
+ *new_buf_len = 0;
+ return NULL;
+}
+
+static void* decompress_lwz(const void* buf, const int buf_len,
+ const char* dir_full_path, int* new_buf_len) {
+ (void)buf; (void)buf_len;
+ log_err("LWZ (UNIX compress) files not yet supported: %s", dir_full_path);
+ *new_buf_len = 0;
+ return NULL;
+}
+
+static void* decompress_zip(const void* buf, const int buf_len,
+ const char* dir_full_path, int* new_buf_len) {
+ (void)buf; (void)buf_len;
+ log_err("Zip files not yet supported: %s", dir_full_path);
+ *new_buf_len = 0;
+ return NULL;
+}
+
+
+/* This function is very hot. It's called on every file when zip is enabled. */
+void* decompress(const ag_compression_type zip_type, const void* buf, const int buf_len,
+ const char* dir_full_path, int* new_buf_len) {
+
+ switch(zip_type) {
+ case AG_GZIP:
+ return decompress_zlib(buf, buf_len, dir_full_path, new_buf_len);
+ case AG_COMPRESS:
+ return decompress_lwz(buf, buf_len, dir_full_path, new_buf_len);
+ case AG_ZIP:
+ return decompress_zip(buf, buf_len, dir_full_path, new_buf_len);
+ case AG_NO_COMPRESSION:
+ log_err("File %s is not compressed", dir_full_path);
+ break;
+ default:
+ log_err("Unsupported compression type: %d", zip_type);
+ }
+
+ *new_buf_len = 0;
+ return NULL;
+}
+
+
+/* This function is very hot. It's called on every file. */
+ag_compression_type is_zipped(const void* buf, const int buf_len) {
+ /* Zip magic numbers
+ * compressed file: { 0x1F, 0x9B }
+ * http://en.wikipedia.org/wiki/Compress
+ *
+ * gzip file: { 0x1F, 0x8B }
+ * http://www.gzip.org/zlib/rfc-gzip.html#file-format
+ *
+ * zip file: { 0x50, 0x4B, 0x03, 0x04 }
+ * http://www.pkware.com/documents/casestudies/APPNOTE.TXT (Section 4.3)
+ */
+
+ const unsigned char *buf_c = buf;
+
+ if(buf_len == 0) {
+ return AG_NO_COMPRESSION;
+ }
+
+ /* Check for gzip & compress */
+ if(buf_len >= 2) {
+ if(buf_c[0] == 0x1F) {
+ if(buf_c[1] == 0x8B) {
+ log_debug("Found gzip-based stream");
+ return AG_GZIP;
+ } else if(buf_c[1] == 0x9B) {
+ log_debug("Found compress-based stream");
+ return AG_COMPRESS;
+ }
+ }
+ }
+
+ /* Check for zip */
+ if(buf_len >= 4) {
+ if(buf_c[0] == 0x50 && buf_c[1] == 0x4B && buf_c[2] == 0x03 && buf_c[3] == 0x04)
+ {
+ log_debug("Found zip-based stream");
+ return AG_ZIP;
+ }
+ }
+
+ return AG_NO_COMPRESSION;
+}
+
View
@@ -0,0 +1,19 @@
+#ifndef DECOMPRESS_H
+#define DECOMPRESS_H
+
+#include "config.h"
+#include "log.h"
+#include "options.h"
+
+typedef enum {
+ AG_NO_COMPRESSION,
+ AG_GZIP,
+ AG_COMPRESS,
+ AG_ZIP
+} ag_compression_type;
+
+ag_compression_type is_zipped(const void* buf, const int buf_len);
+
+void* decompress(const ag_compression_type zip_type, const void* buf, const int buf_len, const char* dir_full_path, int* new_buf_len);
+#endif
+
View
@@ -73,6 +73,7 @@ Search options:\n\
(.gitigore, .hgignore, .svnignore; still obey .agignore)\n\
-v --invert-match\n\
-w --word-regexp Only match whole words\n\
+-z --search-zip Search contents of compressed (e.g., gzip) files\n\
\n");
}
@@ -186,6 +187,7 @@ void parse_options(int argc, char **argv, char **base_paths[], char **paths[]) {
{ "print-long-lines", no_argument, &opts.print_long_lines, 1 },
{ "recurse", no_argument, NULL, 'r' },
{ "search-binary", no_argument, &opts.search_binary_files, 1 },
+ { "search-zip", no_argument, &opts.search_zip_files, 1 },
{ "search-files", no_argument, &opts.search_stream, 0 },
{ "skip-vcs-ignores", no_argument, NULL, 'U' },
{ "smart-case", no_argument, NULL, 'S' },
@@ -216,14 +218,15 @@ void parse_options(int argc, char **argv, char **base_paths[], char **paths[]) {
group = 0;
}
- while ((ch = getopt_long(argc, argv, "A:aB:C:DG:g:fhiLlm:np:QRrSsvVtuUw", longopts, &opt_index)) != -1) {
+ while ((ch = getopt_long(argc, argv, "A:aB:C:DG:g:fhiLlm:np:QRrSsvVtuUwz", longopts, &opt_index)) != -1) {
switch (ch) {
case 'A':
opts.after = atoi(optarg);
break;
case 'a':
opts.search_all_files = 1;
opts.search_binary_files = 1;
+ opts.search_zip_files = 1;
break;
case 'B':
opts.before = atoi(optarg);
@@ -291,6 +294,7 @@ void parse_options(int argc, char **argv, char **base_paths[], char **paths[]) {
opts.search_all_files = 1;
break;
case 'u':
+ opts.search_zip_files = 1;
opts.search_binary_files = 1;
opts.search_all_files = 1;
opts.search_hidden_files = 1;
@@ -307,6 +311,9 @@ void parse_options(int argc, char **argv, char **base_paths[], char **paths[]) {
case 'w':
opts.word_regexp = 1;
break;
+ case 'z':
+ opts.search_zip_files = 1;
+ break;
case 0: /* Long option */
if (strcmp(longopts[opt_index].name, "ackmate-dir-filter") == 0) {
compile_study(&opts.ackmate_dir_filter, &opts.ackmate_dir_filter_extra, optarg, 0, 0);
View
@@ -49,6 +49,7 @@ typedef struct {
int search_all_files;
int skip_vcs_ignores;
int search_binary_files;
+ int search_zip_files;
int search_hidden_files;
int search_stream; /* true if tail -F blah | ag */
int stats;
View
@@ -1,8 +1,39 @@
#include "search.h"
#include "scandir.h"
+/* Thin wrapper around search_nonzipped_buf (the workhorse). This just checks
+ * a few bytes in the buf to see if it's zipped or not. If it is, decompress in
+ * memory. Otherwise, pass it on through to search_nonzipped_buf
+ */
void search_buf(const char *buf, const int buf_len,
const char *dir_full_path) {
+ char* _buf = (char*)buf;
+ int _buf_len = buf_len;
+ ag_compression_type zip_type = AG_NO_COMPRESSION;
+
+ zip_type = is_zipped((void*)_buf, _buf_len);
+ if (zip_type != AG_NO_COMPRESSION) {
+ if(!opts.search_zip_files) {
+ log_debug("File %s is zipped. Skipping...", dir_full_path);
+ return;
+ }
+ _buf = decompress(zip_type, buf, buf_len, dir_full_path, &_buf_len);
+ if(_buf == NULL || _buf_len == 0) {
+ log_warn("Cannot decompress zipped file %s", dir_full_path);
+ return;
+ }
+ }
+
+ search_nonzipped_buf(_buf, _buf_len, dir_full_path);
+
+ /* Check if this is the _buf we allocated in decompress. If so, free it */
+ if(_buf != buf) {
+ free(_buf);
+ }
+}
+
+void search_nonzipped_buf(const char *buf, const int buf_len,
+ const char *dir_full_path) {
int binary = -1; /* 1 = yes, 0 = no, -1 = don't know */
int buf_offset = 0;
@@ -153,7 +184,10 @@ void search_stream(FILE *stream, const char *path) {
size_t line_cap = 0;
while ((line_len = getline(&line, &line_cap, stream)) > 0) {
- search_buf(line, line_len, path);
+ /* Don't try to determine if this is zipped since it's line based.
+ * TODO: if this moves to being non-zipped based, change to search_buf
+ */
+ search_nonzipped_buf(line, line_len, path);
}
free(line);
View
@@ -39,6 +39,8 @@ pthread_mutex_t work_queue_mtx;
void search_buf(const char *buf, const int buf_len,
const char *dir_full_path);
+void search_nonzipped_buf(const char *buf, const int buf_len,
+ const char *dir_full_path);
void search_stream(FILE *stream, const char *path);
void search_file(const char *file_full_path);
View
@@ -41,6 +41,13 @@ typedef struct {
struct timeval time_end;
} ag_stats;
+typedef enum {
+ AG_NO_COMPRESSION,
+ AG_GZIP,
+ AG_COMPRESS,
+ AG_ZIP
+} ag_compression_type;
+
ag_stats stats;
typedef const char *(*strncmp_fp)(const char*, const char*, const size_t, const size_t, const size_t[]);
@@ -55,6 +62,9 @@ strncmp_fp get_strstr(cli_options opts);
int invert_matches(match matches[], int matches_len, const int buf_len);
void compile_study(pcre **re, pcre_extra **re_extra, char *q, const int pcre_opts, const int study_opts);
+void* decompress(const ag_compression_type zip_type, const void* buf, const int buf_len, const char* dir_full_path, int* new_buf_len);
+ag_compression_type is_zipped(const void* buf, const int buf_len);
+
int is_binary(const void* buf, const int buf_len);
int is_regex(const char* query);
int is_fnmatch(const char* filename);

0 comments on commit e80a032

Please sign in to comment.