From 5e8cf9f831bd22131ea87e273caca92ed765c99e Mon Sep 17 00:00:00 2001 From: Ahad Rana Date: Thu, 17 Jan 2013 09:27:36 -0800 Subject: [PATCH] 1. Fix build.xml to fetch maven ant task properly. 2. Remove external Hadoop dependency. 3. Add deprecated files back in. --- build.properties | 7 +++++++ build.xml | 21 ++++--------------- .../crawl/common/shared/Constants.java | 7 +++++++ .../hadoop/io/deprecated/ARCInputFormat.java | 5 ++--- .../hadoop/io/deprecated/ARCResource.java | 2 +- .../hadoop/io/deprecated/ARCSource.java | 2 +- .../hadoop/io/deprecated/ARCSplit.java | 2 +- .../io/deprecated/ARCSplitCalculator.java | 2 +- .../hadoop/io/deprecated/ARCSplitReader.java | 3 +-- .../hadoop/io/deprecated/ArcFileReader.java | 2 +- .../hadoop/io/deprecated/JetS3tARCSource.java | 2 +- .../hadoop/io/deprecated/LocalARCSource.java | 2 +- 12 files changed, 28 insertions(+), 29 deletions(-) diff --git a/build.properties b/build.properties index 66d3353..dcc4b7a 100644 --- a/build.properties +++ b/build.properties @@ -10,3 +10,10 @@ hadoop.path=/usr/lib/hadoop # version ID of this build commoncrawl.version=1.0 +mvn.ant.task.version=2.1.3 +# Maven dependency download locations +mvn.repo=http://repo1.maven.org/maven2 +apache.repo=https://repository.apache.org/content/repositories/releases +mvn.ant.task.url=${mvn.repo}/org/apache/maven/maven-ant-tasks/${mvn.ant.task.version} +mvn.ant.task.jar=maven-ant-tasks-${mvn.ant.task.version}.jar + diff --git a/build.xml b/build.xml index 93f2a09..103bba7 100644 --- a/build.xml +++ b/build.xml @@ -8,23 +8,14 @@ - + - - - - - - - - - - + @@ -77,11 +68,6 @@ - - - - - @@ -107,7 +93,7 @@ - + @@ -442,6 +428,7 @@ + diff --git a/src/main/java/org/commoncrawl/crawl/common/shared/Constants.java b/src/main/java/org/commoncrawl/crawl/common/shared/Constants.java index b649374..17ada49 100644 --- a/src/main/java/org/commoncrawl/crawl/common/shared/Constants.java +++ b/src/main/java/org/commoncrawl/crawl/common/shared/Constants.java @@ -13,5 +13,12 @@ public interface Constants { public static final String ARCFileHeader_FetchTimeStamp = "x_commoncrawl_FetchTimestamp"; public static final String ARCFileHeader_ContentTruncated = "x-commoncrawl-ContentTruncated"; public static final String ARCFileHeader_SOURCE_IS_GZIPED = "x_commoncrawl_SourceIsGZIP"; + public static final String ARCFileHeader_ARC_Offset = "x_commoncrawl_ArcOffset"; + public static final String ARCFileHeader_ARC_Timestamp = "x_commoncrawl_ArcTimestamp"; + public static final String ARCFileHeader_HostIP = "x_commoncrawl_HostIP"; + public static final String ARCFileHeader_ARC_MimeType = "x_commoncrawl_ArcMimeType"; + public static final String ARCFileHeader_ARC_RecordLen = "x_commoncrawl_ArcRecordLen"; + public static final String ARCFileHeader_ARC_PayloadLen = "x_commoncrawl_ArcPayloadLen"; + } diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCInputFormat.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCInputFormat.java index 8a5a4e3..bee632c 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCInputFormat.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCInputFormat.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.hadoop.io; +package org.commoncrawl.hadoop.io.deprecated; import java.io.IOException; import java.util.Arrays; @@ -27,7 +27,6 @@ import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.ReflectionUtils; -import org.commoncrawl.util.shared.ArcFileReader; /** * A map reduce input format for gzipped ARC files. @@ -179,7 +178,7 @@ public InputSplit[] getSplits(JobConf job, int ignored) throws IOException { */ public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { - return new ARCSplitReader(job, (ARCSplit) split, arcSource, blockSize); + return new ARCSplitReader(job, (ARCSplit) split, arcSource, blockSize); } /** diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCResource.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCResource.java index 6b3d19f..acddc97 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCResource.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCResource.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.hadoop.io; +package org.commoncrawl.hadoop.io.deprecated; import java.io.DataInput; import java.io.DataOutput; diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSource.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSource.java index e9fc949..e812353 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSource.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSource.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.hadoop.io; +package org.commoncrawl.hadoop.io.deprecated; import java.io.IOException; import java.io.InputStream; diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplit.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplit.java index 6aadbfa..c007080 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplit.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplit.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.hadoop.io; +package org.commoncrawl.hadoop.io.deprecated; import java.io.DataInput; import java.io.DataOutput; diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitCalculator.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitCalculator.java index 38d3f29..f428dcd 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitCalculator.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitCalculator.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.hadoop.io; +package org.commoncrawl.hadoop.io.deprecated; import java.io.IOException; import java.util.Collection; diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitReader.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitReader.java index 22542e4..3b3dac9 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitReader.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ARCSplitReader.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.hadoop.io; +package org.commoncrawl.hadoop.io.deprecated; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -32,7 +32,6 @@ import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.util.StringUtils; import org.commoncrawl.protocol.shared.ArcFileItem; -import org.commoncrawl.util.shared.ArcFileReader; /** * A Hadooop {@link RecordReader} for reading {@link ARCSplit}s. diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ArcFileReader.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ArcFileReader.java index c7c4f84..fd09f93 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/ArcFileReader.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/ArcFileReader.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.util.shared; +package org.commoncrawl.hadoop.io.deprecated; import java.io.BufferedReader; import java.io.EOFException; diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/JetS3tARCSource.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/JetS3tARCSource.java index fd9adc3..03497cc 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/JetS3tARCSource.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/JetS3tARCSource.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.hadoop.io; +package org.commoncrawl.hadoop.io.deprecated; import java.io.IOException; import java.io.InputStream; diff --git a/src/main/java/org/commoncrawl/hadoop/io/deprecated/LocalARCSource.java b/src/main/java/org/commoncrawl/hadoop/io/deprecated/LocalARCSource.java index a635f87..16a3cc6 100644 --- a/src/main/java/org/commoncrawl/hadoop/io/deprecated/LocalARCSource.java +++ b/src/main/java/org/commoncrawl/hadoop/io/deprecated/LocalARCSource.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.commoncrawl.hadoop.io; +package org.commoncrawl.hadoop.io.deprecated; import java.io.File; import java.io.FileInputStream;