Permalink
Browse files

Added more output from Bloom filter creation.

  • Loading branch information...
1 parent 568bc41 commit e4eeafb876ab81376d1e5907b92b96fa07593c20 @eljefe6a committed Jan 4, 2013
Showing with 14 additions and 0 deletions.
  1. +14 −0 src/UserDictBloom.java
View
@@ -2,6 +2,7 @@
import java.io.DataOutputStream;
import java.io.FileReader;
import java.io.IOException;
+import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -45,6 +46,10 @@ public static void main(String args[]) {
Pattern words = Pattern.compile("[a-z]*");
BloomFilter bloomFilter = new BloomFilter(VECTOR_SIZE, NBHASH, HASH_TYPE);
+
+ HashSet<String> hashSet = new HashSet<String>();
+ long size = 0;
+ long totalWords = 0;
while ((line = dict.readLine()) != null) {
// Normalize all words to lower case and remove all dashes
@@ -56,11 +61,20 @@ public static void main(String args[]) {
for (int i = 0; i < line.length(); i++) {
String wordPiece = line.substring(0, i + 1);
bloomFilter.add(new Key(wordPiece.getBytes()));
+
+ if (!hashSet.contains(wordPiece)) {
+ hashSet.add(wordPiece);
+ size += wordPiece.length();
+ }
}
+
+ totalWords++;
} else {
System.out.println("Skipping entry: \"" + line + "\"");
}
}
+
+ System.out.println("Total Words:" + totalWords + " Unique Word Partss:" + hashSet.size() + " Size:" + size);
dict.close();

0 comments on commit e4eeafb

Please sign in to comment.