diff --git a/Makefile b/Makefile
index 99fbc10..9a3198b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,19 +1,19 @@
build:
mvn clean package
-# cdxj:
-# @echo "creating *.cdxj index files from the local warcs"
-# cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
-# cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
-# cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
+cdxj: build jwarc.jar
+ @echo "creating *.cdxj index files from the local warcs"
+ java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
+ mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj
+ mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj
+
+extract: jwarc.jar
+ @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
+ java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
+ java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
+ java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
+ @echo "hint: python -m json.tool extraction.json"
-# extract:
-# @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
-# warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
-# warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
-# warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
-# @echo "hint: python -m json.tool extraction.json"
-#
# cdx_toolkit:
# @echo demonstrate that we have this entry in the index
# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
@@ -31,15 +31,15 @@ build:
# python ./warcio-iterator.py TEST-000000.extracted.warc.gz
# @echo
#
-# download_collinfo:
-# @echo "downloading collinfo.json so we can find out the crawl name"
-# curl -O https://index.commoncrawl.org/collinfo.json
-#
-# CC-MAIN-2024-22.warc.paths.gz:
-# @echo "downloading the list from s3, requires s3 auth even though it is free"
-# @echo "note that this file should be in the repo"
-# aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
-#
+download_collinfo:
+ @echo "downloading collinfo.json so we can find out the crawl name"
+ curl -o data/collinfo.json https://index.commoncrawl.org/collinfo.json
+
+data/CC-MAIN-2024-22.warc.paths.gz:
+ @echo "downloading the list from s3, requires s3 auth even though it is free"
+ @echo "note that this file should be in the repo"
+ aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > data/CC-MAIN-2024-22.warc.paths.gz
+
# duck_local_files:
# @echo "warning! 300 gigabyte download"
# python duck.py local_files
@@ -52,11 +52,12 @@ build:
# @echo "warning! this might take 1-10 minutes"
# python duck.py cloudfront
#
-get_jwarc:
+
+jwarc.jar:
@echo "downloading JWarc JAR"
curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar
-wreck_the_warc: build get_jwarc
+wreck_the_warc: build jwarc.jar
@echo
@echo we will break and then fix this warc
cp data/whirlwind.warc.gz data/testing.warc.gz
@@ -67,24 +68,24 @@ wreck_the_warc: build get_jwarc
gzip data/testing.warc
@echo
@echo showing the records in the compressed warc - note the offsets of request and response are
- java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
+ java -jar jwarc.jar ls data/testing.warc.gz
@echo
@echo access the request record - failing
- java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
+ java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
@echo
@echo access the response record - failing
- java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 3734 || /usr/bin/true
+ java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
@echo
@echo "now let's do it the right way"
gzip -d data/testing.warc.gz
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz"
@echo
@echo showing the records in the compressed warc - note the skewed offsets of request and response
- java -jar jwarc-0.33.0.jar ls data/testing.warc.gz
+ java -jar jwarc.jar ls data/testing.warc.gz
@echo
@echo access the request record - works
- java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 518 | head
+ java -jar jwarc.jar extract data/testing.warc.gz 518 | head
@echo
@echo access the response record - works
- java -jar jwarc-0.33.0.jar extract data/testing.warc.gz 1027 | head -n 20
+ java -jar jwarc.jar extract data/testing.warc.gz 1027 | head -n 20
@echo
diff --git a/README.md b/README.md
index 3ad5f53..ec77ce9 100644
--- a/README.md
+++ b/README.md
@@ -414,11 +414,78 @@ Feel free to experiment more by looking at other part of the records, or extract
## Task 3: Index the WARC, WET, and WAT
-TBA
+The example WARC files we've been using are tiny and easy to work with. The real WARC files are around a gigabyte in size and contain about 30,000 webpages each. What's more, we have around 24 million of these files! To read all of them, we could iterate, but what if we wanted random access so we could read just one particular record? We do that with an index.
+```mermaid
+flowchart LR
+ warc --> indexer --> cdxj & columnar
+ warc@{shape: cyl}
+ cdxj@{ shape: stored-data}
+ columnar@{ shape: stored-data}
+```
+
+
+We have two versions of the index: the CDX index and the columnar index. The CDX index is useful for looking up single pages, whereas the columnar index is better suited to analytical and bulk queries. We'll look at both in this tour, starting with the CDX index.
+
+### CDX(J) index
+
+The CDX index files are sorted plain-text files, with each line containing information about a single capture in the WARC. Technically, Common Crawl uses CDXJ index files since the information about each capture is formatted as JSON. We'll use CDX and CDXJ interchangeably in this tour for legacy reasons 💅
+
+We can create our own CDXJ index from the local WARCs by running:
+
+```make cdxj```
+
+This uses the JWARC library and, in part, home-cooked code that we wrote to support WET and WAT records, to generate CDXJ index files for our WARC files by running the code below:
+
+
+ Click to view code
+
+```
+creating *.cdxj index files from the local warcs
+java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj
+mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj
+```
+
+
+
+Now look at the `.cdxj` files with `cat data/whirlwind*.cdxj`. You'll see that each file has one entry in the index. The WARC only has the response record indexed, since by default the indexer guesses that you won't ever want to random-access the request or metadata. WET and WAT have the conversion and metadata records indexed (Common Crawl doesn't publish a WET or WAT index, just WARC).
+
+For each of these records, there's one text line in the index - yes, it's a flat file! It starts with a string like `org,wikipedia,an)/wiki/escopete 20240518015810`, followed by a JSON blob. The starting string is the primary key of the index. The first thing is a [SURT](http://crawler.archive.org/articles/user_manual/glossary.html#surt) (Sort-friendly URI Reordering Transform). The big integer is a date, in ISO-8601 format with the delimiters removed.
+
+What is the purpose of this funky format? It's done this way because these flat files (300 gigabytes total per crawl) can be sorted on the primary key using any out-of-core sort utility e.g. the standard Linux `sort`, or one of the Hadoop-based out-of-core sort functions.
+
+The JSON blob has enough information to cleanly isolate the raw data of a single record: it defines which WARC file the record is in, and the byte offset and length of the record within this file. We'll use that in the next section.
## Task 4: Use the CDXJ index to extract a subset of raw content from the local WARC, WET, and WAT
-TBA
+Normally, compressed files aren't random access. However, the WARC files use a trick to make this possible, which is that every record needs to be separately compressed. The `gzip` compression utility supports this, but it's rarely used.
+
+To extract one record from a WARC file, all you need to know is the filename and the offset into the file. If you're reading over the web, then it really helps to know the exact length of the record.
+
+Run:
+
+```make extract```
+
+to run a set of extractions from your local
+`whirlwind.*.gz` files with `JWARC` using the commands below:
+
+
+ Click to view code
+
+```
+creating extraction.* from local warcs, the offset numbers are from the cdxj index
+java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
+java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
+java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
+hint: python -m json.tool extraction.json
+```
+
+
+
+The offset numbers in the Makefile are the same
+ones as in the index. Look at the three output files: `extraction.html`, `extraction.txt`, and `extraction.json` (pretty-print the json with `python -m json.tool extraction.json`).
+
+Notice that we extracted HTML from the WARC, text from WET, and JSON from the WAT (as shown in the different file extensions). This is because the payload in each file type is formatted differently!
## Task 5: Wreck the WARC by compressing it wrong
diff --git a/src/main/java/org/commoncrawl/whirlwind/CdxWriterWithDynamicFiltering.java b/src/main/java/org/commoncrawl/whirlwind/CdxWriterWithDynamicFiltering.java
new file mode 100644
index 0000000..75f7c5a
--- /dev/null
+++ b/src/main/java/org/commoncrawl/whirlwind/CdxWriterWithDynamicFiltering.java
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.commoncrawl.whirlwind;
+
+import org.netpreserve.jwarc.*;
+import org.netpreserve.jwarc.cdx.CdxFormat;
+import org.netpreserve.jwarc.cdx.CdxRequestEncoder;
+import org.netpreserve.jwarc.cdx.CdxWriter;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.net.URI;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+public class CdxWriterWithDynamicFiltering extends CdxWriter {
+ private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyyMMddHHmmss")
+ .withZone(ZoneOffset.UTC);
+
+ private final Writer writer;
+ private CdxFormat format = CdxFormat.CDXJ;
+ private boolean postAppend = false;
+ private Consumer warningHandler;
+ private Predicate recordFilter = null;
+
+ public CdxWriterWithDynamicFiltering(Writer writer) {
+ super(writer);
+ this.writer = writer;
+ }
+
+ @Override
+ public void setFormat(CdxFormat format) {
+ super.setFormat(format);
+ this.format = format;
+ }
+
+ public CdxFormat getFormat() {
+ return this.format;
+ }
+
+ @Override
+ public void setPostAppend(boolean postAppend) {
+ super.setPostAppend(postAppend);
+ this.postAppend = postAppend;
+ }
+
+ @Override
+ public void onWarning(Consumer warningHandler) {
+ super.onWarning(warningHandler);
+ this.warningHandler = warningHandler;
+ }
+
+ @Override
+ public void process(WarcReader reader, String filename) throws IOException {
+
+ if (recordFilter == null) {
+ super.process(reader, filename);
+ return;
+ }
+
+ // Custom processing for filtered record types, since we are filtering, we get
+ // and process
+ // every record here.
+ WarcRecord record = reader.next().orElse(null);
+ while (record != null) {
+ try {
+ String recordType = record.type().toLowerCase();
+
+ long position = reader.position();
+
+ // Handle WarcCaptureRecord types (response, resource, revisit, request)
+ if (record instanceof WarcCaptureRecord) {
+ WarcCaptureRecord capture = (WarcCaptureRecord) record;
+ URI id = record.version().getProtocol().equals("ARC") ? null : record.id();
+
+ // Ensure HTTP header is parsed for revisit records
+ if (record instanceof WarcRevisit && record.contentType().base().equals(MediaType.HTTP)) {
+ ((WarcRevisit) record).http();
+ }
+
+ // Advance to next record to calculate length
+ record = reader.next().orElse(null);
+ long length = reader.position() - position;
+
+ // Skip records without a date
+ if (!capture.headers().first("WARC-Date").isPresent()) {
+ emitWarning(filename, position, "Skipping record due to missing or invalid date");
+ continue;
+ }
+
+ String encodedRequest = null;
+ if (postAppend) {
+ while (encodedRequest == null && record instanceof WarcCaptureRecord
+ && ((WarcCaptureRecord) record).concurrentTo().contains(id)) {
+ if (record instanceof WarcRequest) {
+ HttpRequest httpRequest = ((WarcRequest) record).http();
+ encodedRequest = CdxRequestEncoder.encode(httpRequest);
+ }
+ record = reader.next().orElse(null);
+ }
+ }
+
+ write(capture, filename, position, length, encodedRequest);
+ }
+ // Handle WarcConversion (from WET files) and other WarcTargetRecord types
+ else if (record instanceof WarcTargetRecord) {
+ WarcTargetRecord targetRecord = (WarcTargetRecord) record;
+
+ // Advance to next record to calculate length
+ record = reader.next().orElse(null);
+ long length = reader.position() - position;
+
+ // Skip records without a date
+ if (!targetRecord.headers().first("WARC-Date").isPresent()) {
+ emitWarning(filename, position, "Skipping record due to missing or invalid date");
+ continue;
+ }
+
+ writeTargetRecord(targetRecord, filename, position, length);
+ } else {
+ // Skip non-target records (like warcinfo)
+ record = reader.next().orElse(null);
+ }
+ } catch (ParsingException e) {
+ emitWarning(filename, reader.position(), "ParsingException: " + e.getBaseMessage());
+ record = reader.next().orElse(null);
+ }
+ }
+ }
+
+ @Override
+ public void setRecordFilter(Predicate recordFilter) {
+ super.setRecordFilter(recordFilter);
+ this.recordFilter = recordFilter;
+ }
+
+ /**
+ * Writes a CDXJ record for a WarcTargetRecord (like WarcConversion from WET
+ * files).
+ *
+ * TODO: make it more generic and integrated into jwarc
+ */
+ private void writeTargetRecord(WarcTargetRecord record, String filename, long position, long length)
+ throws IOException {
+ String target = record.target();
+ if (target == null) {
+ emitWarning(filename, position, "Skipping record due to missing target URI");
+ return;
+ }
+
+ // Build CDXJ line: surt timestamp {json}
+ StringBuilder line = new StringBuilder();
+
+ // SURT-formatted URL key
+ String surt = URIs.toNormalizedSurt(target);
+ line.append(escape(surt));
+ line.append(' ');
+
+ // Timestamp
+ String timestamp = DATE_FORMAT.format(record.date());
+ line.append(timestamp);
+ line.append(' ');
+
+ // JSON block
+ line.append('{');
+
+ // URL
+ line.append("\"url\": \"");
+ escapeJsonString(line, target);
+ line.append("\"");
+
+ // MIME type
+ try {
+ if (record.payload().isPresent()) {
+ MediaType mime = record.payload().get().type();
+ if (mime != null) {
+ line.append(", \"mime\": \"");
+ escapeJsonString(line, mime.base().toString());
+ line.append("\"");
+ }
+ }
+ } catch (IOException e) {
+ // Skip mime if payload can't be read
+ }
+
+ // Digest
+ record.payloadDigest().ifPresent(digest -> {
+ line.append(", \"digest\": \"");
+ escapeJsonString(line, digest.raw());
+ line.append("\"");
+ });
+
+ // Filename
+ if (filename != null) {
+ line.append(", \"filename\": \"");
+ escapeJsonString(line, filename);
+ line.append("\"");
+ }
+
+ // Offset
+ line.append(", \"offset\": \"");
+ line.append(position);
+ line.append("\"");
+
+ // Length
+ line.append(", \"length\": \"");
+ line.append(length);
+ line.append("\"");
+
+ line.append('}');
+
+ writer.write(line.toString());
+ writer.write('\n');
+ }
+
+ private void emitWarning(String filename, long position, String message) {
+ if (warningHandler == null)
+ return;
+ warningHandler.accept(filename + " (offset " + position + ") " + message);
+ }
+
+ // Borrowed from org.netpreserve.jwarc.cdx.CdxWriter
+ // TODO: remove duplication
+ private static String escape(String str) {
+ if (str == null)
+ return null;
+ return str.replace(" ", "%20").replace("\n", "%0A").replace("\0", "%00");
+ }
+
+ // Borrowed from org.netpreserve.jwarc.cdx.CdxWriter
+ // TODO: remove duplication
+ private static void escapeJsonString(StringBuilder out, String value) {
+ for (int i = 0; i < value.length(); i++) {
+ char c = value.charAt(i);
+ if (c == '"')
+ out.append("\\\"");
+ else if (c == '\\')
+ out.append("\\\\");
+ else if (c == '\b')
+ out.append("\\b");
+ else if (c == '\f')
+ out.append("\\f");
+ else if (c == '\n')
+ out.append("\\n");
+ else if (c == '\r')
+ out.append("\\r");
+ else if (c == '\t')
+ out.append("\\t");
+ else if (c <= 0x1f) {
+ out.append("\\u00");
+ out.append(Character.forDigit((c & 0xf0) >>> 4, 16));
+ out.append(Character.forDigit(c & 0xf, 16));
+ } else {
+ out.append(c);
+ }
+ }
+ }
+}
diff --git a/src/main/java/org/commoncrawl/whirlwind/CdxjIndexer.java b/src/main/java/org/commoncrawl/whirlwind/CdxjIndexer.java
new file mode 100644
index 0000000..b5c0d6e
--- /dev/null
+++ b/src/main/java/org/commoncrawl/whirlwind/CdxjIndexer.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.commoncrawl.whirlwind;
+
+import org.apache.commons.lang3.StringUtils;
+import org.netpreserve.jwarc.WarcReader;
+import org.netpreserve.jwarc.cdx.CdxFormat;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+public class CdxjIndexer {
+
+ public static void main(String[] args) throws IOException {
+ String inputFile = null;
+ Set recordTypes = null;
+
+ for (int i = 0; i < args.length; i++) {
+ if (args[i].equals("--records") && i + 1 < args.length) {
+ // Support comma-separated record types: --records conversion,metadata
+ String typesArg = args[++i];
+ recordTypes = new HashSet<>(Arrays.asList(typesArg.split(",")));
+ } else if (args[i].equals("--help") || args[i].equals("-h")) {
+ printUsage();
+ System.exit(0);
+ } else if (!args[i].startsWith("-")) {
+ inputFile = args[i];
+ } else {
+ System.err.println("Unknown option: " + args[i]);
+ printUsage();
+ System.exit(1);
+ }
+ }
+
+ if (inputFile == null) {
+ System.err.println("Error: Input file is required");
+ printUsage();
+ System.exit(1);
+ }
+
+ Path requested = Path.of(inputFile).toAbsolutePath().normalize();
+ if (!Files.isRegularFile(requested)) {
+ throw new SecurityException("Invalid WARC path: " + requested);
+ }
+
+ // TODO: Move this into the WarcReader or the process of iterating over the
+ // records
+ if (requested.toString().endsWith("gz") || requested.toString().endsWith("gzip")) {
+ try {
+ ValidateWARC.validateRandomAccessWarcOrFail(requested);
+ } catch (IOException e) {
+ System.err.println("This file is probably not a multi-member gzip but a single gzip file.\n"
+ + "To allow seek, a gzipped WARC must have each record compressed into a single gzip member and concatenated together.\n\n"
+ + "This file is likely still valid and can be fixed by running:\n"
+ + "mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args=\""
+ + inputFile + " " + inputFile.replace(".gz", ".recompressed.gz") + "\"");
+ System.exit(-1);
+ }
+ }
+
+ try (InputStream in = Files.newInputStream(requested);
+ CdxWriterWithDynamicFiltering cdxjWriter = new CdxWriterWithDynamicFiltering(
+ new OutputStreamWriter(System.out));
+ WarcReader reader = new WarcReader(in)) {
+ reader.setLenient(true);
+ cdxjWriter.setFormat(CdxFormat.CDXJ);
+ if (recordTypes != null) {
+ Set unmodifiableRecordTypes = Collections.unmodifiableSet(recordTypes);
+ cdxjWriter.setRecordFilter(
+ record -> unmodifiableRecordTypes.contains(StringUtils.lowerCase(record.type())));
+ }
+ cdxjWriter.process(reader, requested.toString());
+ }
+ }
+
+ private static void printUsage() {
+ System.err.println("Usage: cdxj-indexer [OPTIONS] ");
+ System.err.println();
+ System.err.println("Options:");
+ System.err.println(" --records Comma-separated list of record types to index");
+ System.err.println(" (e.g., conversion, response, metadata)");
+ System.err.println(" --help, -h Show this help message");
+ System.err.println();
+ System.err.println("Examples:");
+ System.err.println(" cdxj-indexer file.warc.gz");
+ System.err.println(" cdxj-indexer --records conversion file.wet.gz");
+ System.err.println(" cdxj-indexer --records response,resource file.warc.gz");
+ }
+}
diff --git a/src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java b/src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java
index dff7a63..4a03072 100644
--- a/src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java
+++ b/src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java
@@ -46,7 +46,7 @@ public static void main(String[] args) throws Exception {
}
- public static int getWarcCompressionInformation(Path inputWarc) throws IllegalArgumentException {
+ public static int getWarcCompressionInformation(Path inputWarc) throws IOException {
final AtomicInteger memberCount = new AtomicInteger(0);
try (InputStream fis = Files.newInputStream(inputWarc);