Permalink
Browse files

Moved hardcoded strings into constants

  • Loading branch information...
1 parent 8462ba9 commit 3e94e961c2ace8f88651aa24bbd9f166092ce58d training committed Dec 28, 2012
Showing with 60 additions and 31 deletions.
  1. +22 −7 src/BoggleDriver.java
  2. +2 −2 src/BoggleMapper.java
  3. +6 −6 src/BoggleWordMapper.java
  4. +30 −16 src/UserDictBloom.java
View
@@ -18,7 +18,20 @@
public class BoggleDriver extends Configured implements Tool {
private static final Logger logger = Logger.getLogger(BoggleDriver.class);
- public static final int MINIMUM_WORD_SIZE = 3;
+ /** The parameter name for the minimum word size to output */
+ public static final String MINIMUM_WORD_SIZE_PARAM = "minimumwordsize";
+
+ /** The default value for the minimum word size to output */
+ public static final int MINIMUM_WORD_SIZE_DEFAULT = 3;
+
+ /** The parameter name for the bloom filter location */
+ public static final String BLOOM_PARAM = "bloompath";
+
+ /** The parameter name for the dictionary location */
+ public static final String DICTIONARY_PARAM = "dictionarypath";
+
+ /** The parameter name for the roll to be serialized */
+ public static final String ROLL_PARAM = "roll";
@Override
public int run(String[] args) throws Exception {
@@ -33,7 +46,10 @@ public int run(String[] args) throws Exception {
String output = args[3];
Configuration configuration = getConf();
- configuration.set("mapreduce.input.lineinputformat.linespermap", "8");
+ // To change how the mappers are created to process the roll,
+ // pass in -D mapreduce.input.lineinputformat.linespermap=0
+ // or in code uncomment:
+ //configuration.set("mapreduce.input.lineinputformat.linespermap", "8");
FileSystem fileSystem = FileSystem.get(configuration);
@@ -49,15 +65,14 @@ public int run(String[] args) throws Exception {
return -1;
}
- configuration.set("bloompath", bloomPath);
- configuration.set("dictionarypath", dictionary);
+ configuration.set(BLOOM_PARAM, bloomPath);
+ configuration.set(DICTIONARY_PARAM, dictionary);
BoggleRoll roll = BoggleRoll.createRoll();
- configuration.set("roll", roll.serialize());
+ configuration.set(ROLL_PARAM, roll.serialize());
writeRollFile(input, fileSystem, roll);
- boolean isDone = false;
int iteration = 0;
long previousWordCount = 0;
@@ -101,7 +116,7 @@ public int run(String[] args) throws Exception {
previousWordCount = currentWordCount;
iteration++;
- } while (!isDone);
+ } while (true);
// Check for words and output to final directory
Job job = new Job(configuration);
View
@@ -22,12 +22,12 @@
public void setup(Context context) throws IOException {
Configuration configuration = context.getConfiguration();
- roll = BoggleRoll.deserialize(configuration.get("roll"));
+ roll = BoggleRoll.deserialize(configuration.get(BoggleDriver.ROLL_PARAM));
FileSystem fileSystem = FileSystem.get(configuration);
bloomFilter = new BloomFilter(UserDictBloom.vectorSize, UserDictBloom.nbHash, UserDictBloom.hashType);
- bloomFilter.readFields(fileSystem.open(new Path(configuration.get("bloompath"))));
+ bloomFilter.readFields(fileSystem.open(new Path(configuration.get(BoggleDriver.BLOOM_PARAM))));
}
@Override
View
@@ -15,18 +15,16 @@
public class BoggleWordMapper extends Mapper<LongWritable, Text, Text, RollGraphWritable> {
private static final Logger logger = Logger.getLogger(BoggleWordMapper.class);
- private BoggleRoll roll;
-
private HashSet<String> words = new HashSet<String>();
+ private int minimumWordSize = 0;
+
@Override
public void setup(Context context) throws IOException {
Configuration configuration = context.getConfiguration();
- roll = BoggleRoll.deserialize(configuration.get("roll"));
-
FileSystem fileSystem = FileSystem.get(configuration);
- FSDataInputStream dict = fileSystem.open(new Path(configuration.get("dictionarypath")));
+ FSDataInputStream dict = fileSystem.open(new Path(configuration.get(BoggleDriver.DICTIONARY_PARAM)));
String line;
@@ -47,6 +45,8 @@ public void setup(Context context) throws IOException {
}
dict.close();
+
+ minimumWordSize = configuration.getInt(BoggleDriver.MINIMUM_WORD_SIZE_PARAM, BoggleDriver.MINIMUM_WORD_SIZE_DEFAULT);
}
@Override
@@ -60,7 +60,7 @@ public void map(LongWritable key, Text value, Context context) throws IOExceptio
if (values.length == 3) {
String charsSoFar = values[0];
- if (charsSoFar.length() >= BoggleDriver.MINIMUM_WORD_SIZE) {
+ if (charsSoFar.length() >= minimumWordSize) {
// See if the word actually appears in the dictionary
if (words.contains(charsSoFar)) {
RollGraphWritable rollGraph = RollGraphWritable.deserialize(values[1] + " " + values[2]);
View
@@ -8,35 +8,49 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;
public class UserDictBloom {
/** The vector size for the Bloom Filter */
- public static final int vectorSize = 1048576;
- /** The number of hashes for the Bloom Filter */
- public static final int nbHash = 3;
- /** The type of hashing to use for the Bloom Filter */
- public static final int hashType = Hash.MURMUR_HASH;
-
+ public static final int vectorSize = 1048576;
+ /** The number of hashes for the Bloom Filter */
+ public static final int nbHash = 3;
+ /** The type of hashing to use for the Bloom Filter */
+ public static final int hashType = Hash.MURMUR_HASH;
+
public static void main(String args[]) {
try {
+ String wordFile = null;
+ String bloomFile = null;
+
+ if (args.length == 2) {
+ wordFile = args[0];
+ bloomFile = args[1];
+ } else if (args.length == 0) {
+ wordFile = "/usr/share/dict/words";
+ bloomFile = "bloom.out";
+ } else {
+ System.out.println("Usage <pathtodictionary> <pathtobloomfile>");
+ }
+
+ System.out.println("Reading dictionary from " + wordFile + " outputting Bloom Filter to " + bloomFile);
+
// Go through every word in the words file
- BufferedReader dict = new BufferedReader(new FileReader("/usr/share/dict/words"));
+ BufferedReader dict = new BufferedReader(new FileReader(wordFile));
String line;
Pattern words = Pattern.compile("[a-z]*");
-
+
BloomFilter bloomFilter = new BloomFilter(vectorSize, nbHash, hashType);
-
+
while ((line = dict.readLine()) != null) {
// Normalize all words to lower case and remove all dashes
line = line.toLowerCase().replace("-", "");
Matcher matcher = words.matcher(line);
-
+
if (matcher.matches()) {
// Add to Bloom Filter breaking up the word along the way
for (int i = 0; i < line.length(); i++) {
@@ -47,16 +61,16 @@ public static void main(String args[]) {
System.out.println("Skipping entry: \"" + line + "\"");
}
}
-
+
dict.close();
-
+
// Write out the Bloom Filter to a file
Configuration configuration = new Configuration();
FileSystem fs = FileSystem.get(configuration);
-
- DataOutputStream outputStream = FileSystem.create(fs, new Path("bloom.out"), FsPermission.getDefault());
+
+ DataOutputStream outputStream = fs.create(new Path(bloomFile));
bloomFilter.write(outputStream);
-
+
outputStream.close();
} catch (IOException e) {
e.printStackTrace();

0 comments on commit 3e94e96

Please sign in to comment.