diff --git a/build.properties b/build.properties
index 66d3353..dcc4b7a 100644
--- a/build.properties
+++ b/build.properties
@@ -10,3 +10,10 @@ hadoop.path=/usr/lib/hadoop
# version ID of this build
commoncrawl.version=1.0
+mvn.ant.task.version=2.1.3
+# Maven dependency download locations
+mvn.repo=https://repo1.maven.org/maven2
+apache.repo=https://repository.apache.org/content/repositories/releases
+mvn.ant.task.url=${mvn.repo}/org/apache/maven/maven-ant-tasks/${mvn.ant.task.version}
+mvn.ant.task.jar=maven-ant-tasks-${mvn.ant.task.version}.jar
+
diff --git a/build.xml b/build.xml
index 93f2a09..103bba7 100644
--- a/build.xml
+++ b/build.xml
@@ -8,23 +8,14 @@
-
+
-
-
-
-
-
-
-
-
-
-
+
@@ -77,11 +68,6 @@
-
-
-
-
-
@@ -107,7 +93,7 @@
-
+
@@ -442,6 +428,7 @@
+
diff --git a/src/main/java/org/commoncrawl/crawl/common/shared/Constants.java b/src/main/java/org/commoncrawl/crawl/common/shared/Constants.java
index b649374..17ada49 100644
--- a/src/main/java/org/commoncrawl/crawl/common/shared/Constants.java
+++ b/src/main/java/org/commoncrawl/crawl/common/shared/Constants.java
@@ -13,5 +13,12 @@ public interface Constants {
public static final String ARCFileHeader_FetchTimeStamp = "x_commoncrawl_FetchTimestamp";
public static final String ARCFileHeader_ContentTruncated = "x-commoncrawl-ContentTruncated";
public static final String ARCFileHeader_SOURCE_IS_GZIPED = "x_commoncrawl_SourceIsGZIP";
+ public static final String ARCFileHeader_ARC_Offset = "x_commoncrawl_ArcOffset";
+ public static final String ARCFileHeader_ARC_Timestamp = "x_commoncrawl_ArcTimestamp";
+ public static final String ARCFileHeader_HostIP = "x_commoncrawl_HostIP";
+ public static final String ARCFileHeader_ARC_MimeType = "x_commoncrawl_ArcMimeType";
+ public static final String ARCFileHeader_ARC_RecordLen = "x_commoncrawl_ArcRecordLen";
+ public static final String ARCFileHeader_ARC_PayloadLen = "x_commoncrawl_ArcPayloadLen";
+
}
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCInputFormat.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCInputFormat.java
index 8a5a4e3..bee632c 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCInputFormat.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCInputFormat.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.hadoop.io;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.IOException;
import java.util.Arrays;
@@ -27,7 +27,6 @@
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
-import org.commoncrawl.util.shared.ArcFileReader;
/**
* A map reduce input format for gzipped ARC files.
@@ -179,7 +178,7 @@ public InputSplit[] getSplits(JobConf job, int ignored) throws IOException {
*/
public RecordReader getRecordReader(InputSplit split, JobConf job,
Reporter reporter) throws IOException {
- return new ARCSplitReader(job, (ARCSplit) split, arcSource, blockSize);
+ return new ARCSplitReader(job, (ARCSplit) split, arcSource, blockSize);
}
/**
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCResource.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCResource.java
index 6b3d19f..acddc97 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCResource.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCResource.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.hadoop.io;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.DataInput;
import java.io.DataOutput;
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSource.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSource.java
index e9fc949..e812353 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSource.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSource.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.hadoop.io;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.IOException;
import java.io.InputStream;
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplit.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplit.java
index 6aadbfa..c007080 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplit.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplit.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.hadoop.io;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.DataInput;
import java.io.DataOutput;
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitCalculator.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitCalculator.java
index 38d3f29..f428dcd 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitCalculator.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitCalculator.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.hadoop.io;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.IOException;
import java.util.Collection;
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitReader.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitReader.java
index 22542e4..3b3dac9 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitReader.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitReader.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.hadoop.io;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@@ -32,7 +32,6 @@
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.protocol.shared.ArcFileItem;
-import org.commoncrawl.util.shared.ArcFileReader;
/**
* A Hadooop {@link RecordReader} for reading {@link ARCSplit}s.
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ArcFileReader.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ArcFileReader.java
index c7c4f84..fd09f93 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ArcFileReader.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ArcFileReader.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.util.shared;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.BufferedReader;
import java.io.EOFException;
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/JetS3tARCSource.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/JetS3tARCSource.java
index fd9adc3..03497cc 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/JetS3tARCSource.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/JetS3tARCSource.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.hadoop.io;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.IOException;
import java.io.InputStream;
diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/LocalARCSource.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/LocalARCSource.java
index a635f87..16a3cc6 100644
--- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/LocalARCSource.java
+++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/LocalARCSource.java
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-package org.commoncrawl.hadoop.io;
+package org.commoncrawl.hadoop.io.deprecated;
import java.io.File;
import java.io.FileInputStream;