Skip to content
Browse files

Don't try to parse text in binary files

  • Loading branch information...
1 parent ea62f89 commit 7275fd06579b4a234b5ff83f2fe93fbdc6d8e532 @mikecafarella mikecafarella committed Aug 24, 2012
View
2 src/java/com/cloudera/recordbreaker/analyzer/FSCrawler.java
@@ -83,8 +83,6 @@ protected void addSingleFile(File f, boolean isDir, long crawlid) throws IOExcep
Date dateModified = new Date(f.lastModified());
try {
analyzer.insertIntoFiles(f, isDir, owner, fileDateFormat.format(dateModified), crawlid, tgs);
- System.err.println("Done! added " + f);
-
} catch (SQLiteException sle) {
throw new IOException(sle.getMessage());
}
View
9 src/java/com/cloudera/recordbreaker/analyzer/FormatAnalyzer.java
@@ -76,11 +76,14 @@ public DataDescriptor describeData(File f, int maxLines) throws IOException {
// It's not one of the known formats, so apply LearnStructure (and
// SchemaDictionary), then emit the resulting Avro data.
try {
- return new UnknownTextDataDescriptor(f, schemaDbDir, maxLines);
+ boolean isTextData = UnknownTextDataDescriptor.isTextData(f);
+ if (isTextData) {
+ return new UnknownTextDataDescriptor(f, schemaDbDir, maxLines);
+ }
} catch (Exception iex) {
- // If that doesn't work, then give up and call it unstructured
- return new UnstructuredFileDescriptor(f);
}
+ // If the file is not text, or structure learning fails, give up and call it unstructured
+ return new UnstructuredFileDescriptor(f);
}
}
}
View
24 src/java/com/cloudera/recordbreaker/analyzer/UnknownTextDataDescriptor.java
@@ -16,6 +16,8 @@
import java.io.File;
import java.io.IOException;
+import java.io.FileInputStream;
+import java.io.BufferedInputStream;
import java.util.List;
import java.util.ArrayList;
@@ -36,6 +38,28 @@
* @see DataDescriptor
*/
public class UnknownTextDataDescriptor implements DataDescriptor {
+ /**
+ * Test whether the input param is a text file by examining its first k bytes (at most 1024).
+ * If strictly more than 90% of the bytes read have values in 32..127, we assume it's text.
+ * Returns false when no bytes can be read. NOTE(review): the stream opened here is never closed.
+ */
+ final static double asciiThreshold = 0.9; // fraction of counted bytes that must be exceeded (strict >)
+ public static boolean isTextData(File f) throws IOException {
+ BufferedInputStream in = new BufferedInputStream(new FileInputStream(f)); // NOTE(review): no close() in this method -- consider try/finally
+ byte buf[] = new byte[1024];
+ int numBytes = in.read(buf); // a single read; may fill less than the whole buffer
+ if (numBytes < 0) {
+ return false; // end of stream before any byte was read
+ }
+ int numASCIIChars = 0;
+ for (int i = 0; i < numBytes; i++) {
+ if (buf[i] >= 32 && buf[i] < 128) { // byte is signed, so "< 128" always holds; tab/CR/LF (< 32) are not counted
+ numASCIIChars++;
+ }
+ }
+ return ((numASCIIChars / (1.0 * numBytes)) > asciiThreshold); // 1.0 * forces floating-point division
+ }
+
File f;
File schemaDictDir;
File workingAvroFile;
View
1 src/java/com/cloudera/recordbreaker/learnstructure/InferredType.java
@@ -141,7 +141,6 @@ public GenericContainer parse(String str) {
return (GenericContainer) pr.getData();
}
}
- //System.err.println("Blargh!! Failed on " + str);
return null;
}
abstract ParseResult internalParse(String s, Map<String, Integer> targetUnionDecisions, boolean mustConsumeStr);

0 comments on commit 7275fd0

Please sign in to comment.
Something went wrong with that request. Please try again.