Browse files

Limit the amount of time we spend analyzing any one file

  • Loading branch information...
1 parent 39b4826 · commit ea62f8952f9f63236df7967e8a7f26a4f41651ae · @mikecafarella committed Aug 23, 2012
View
4 src/java/com/cloudera/recordbreaker/analyzer/FSAnalyzer.java
@@ -349,8 +349,8 @@ protected Long job(SQLiteConnection db) throws SQLiteException {
/**
* Try to describe the contents of the given file
*/
- DataDescriptor describeData(File f) throws IOException {
- return formatAnalyzer.describeData(f);
+ DataDescriptor describeData(File f, int maxLines) throws IOException {
+ return formatAnalyzer.describeData(f, maxLines);
}
///////////////////////////////////////////////////
View
10 src/java/com/cloudera/recordbreaker/analyzer/FSCrawler.java
@@ -34,7 +34,11 @@
* @author "Michael Cafarella" <mjc@cloudera.com>
***********************************************************/
public class FSCrawler {
+ final static int MAX_ANALYSIS_LINES = 400;
+ final static int MAX_CRAWL_DEPTH = 5;
+
static SimpleDateFormat fileDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+
Hashtable<Long, Thread> pendingCrawls = new Hashtable<Long, Thread>();
Hashtable<Long, CrawlRuntimeStatus> crawlStatusInfo = new Hashtable<Long, CrawlRuntimeStatus>();
FSAnalyzer analyzer;
@@ -56,7 +60,7 @@ protected void addSingleFile(File f, boolean isDir, long crawlid) throws IOExcep
List<TypeGuess> tgs = new ArrayList<TypeGuess>();
if (! isDir) {
- DataDescriptor descriptor = analyzer.describeData(f);
+ DataDescriptor descriptor = analyzer.describeData(f, MAX_ANALYSIS_LINES);
try {
List<SchemaDescriptor> schemas = descriptor.getSchemaDescriptor();
@@ -79,6 +83,8 @@ protected void addSingleFile(File f, boolean isDir, long crawlid) throws IOExcep
Date dateModified = new Date(f.lastModified());
try {
analyzer.insertIntoFiles(f, isDir, owner, fileDateFormat.format(dateModified), crawlid, tgs);
+ System.err.println("Done! added " + f);
+
} catch (SQLiteException sle) {
throw new IOException(sle.getMessage());
}
@@ -115,7 +121,7 @@ protected void recursiveCrawlBuildList(File f, int subdirDepth, long crawlId, Li
*/
public synchronized boolean getStartNonblockingCrawl(final String fsUrl) {
try {
- final int subdirDepth = 3;
+ final int subdirDepth = MAX_CRAWL_DEPTH;
long fsId = analyzer.getCreateFilesystem(fsUrl, true);
if (fsId < 0) {
return false;
View
6 src/java/com/cloudera/recordbreaker/analyzer/FormatAnalyzer.java
@@ -54,7 +54,7 @@ public FormatAnalyzer(File schemaDbDir) {
* @param f a <code>File</code> value
* @return a <code>DataDescriptor</code> value
*/
- public DataDescriptor describeData(File f) throws IOException {
+ public DataDescriptor describeData(File f, int maxLines) throws IOException {
String fname = f.getName();
// Test to see if the file is one of a handful of known structured formats.
if (CSVDataDescriptor.isCSV(f)) {
@@ -76,7 +76,7 @@ public DataDescriptor describeData(File f) throws IOException {
// It's not one of the known formats, so apply LearnStructure (and
// SchemaDictionary), then emit the resulting Avro data.
try {
- return new UnknownTextDataDescriptor(f, schemaDbDir);
+ return new UnknownTextDataDescriptor(f, schemaDbDir, maxLines);
} catch (Exception iex) {
// If that doesn't work, then give up and call it unstructured
return new UnstructuredFileDescriptor(f);
@@ -101,7 +101,7 @@ public static void main(String argv[]) throws IOException {
File schemaDbDir = new File(argv[1]);
FormatAnalyzer fa = new FormatAnalyzer(schemaDbDir);
- DataDescriptor descriptor = fa.describeData(inputFile);
+ DataDescriptor descriptor = fa.describeData(inputFile, -1);
System.err.println("Filename: " + descriptor.getFilename());
System.err.println("Filetype identifier: " + descriptor.getFileTypeIdentifier());
List<SchemaDescriptor> schemas = descriptor.getSchemaDescriptor();
View
4 src/java/com/cloudera/recordbreaker/analyzer/UnknownTextDataDescriptor.java
@@ -45,7 +45,7 @@
/**
* Creates a new <code>UnknownTextDataDescriptor</code>.
*/
- public UnknownTextDataDescriptor(File f, File schemaDictDir) throws IOException {
+ public UnknownTextDataDescriptor(File f, File schemaDictDir, int maxLines) throws IOException {
this.f = f;
this.schemaDictDir = schemaDictDir;
this.workingAvroFile = File.createTempFile("textdesc", "avro", null);
@@ -55,7 +55,7 @@ public UnknownTextDataDescriptor(File f, File schemaDictDir) throws IOException
// 2. Test it against the known database of types.
// 3. Return the top-k types/schemas that we discover, as long as they pass a threshold.
LearnStructure ls = new LearnStructure();
- ls.inferRecordFormat(f, workingSchemaFile, null, null, workingAvroFile, false);
+ ls.inferRecordFormat(f, workingSchemaFile, null, null, workingAvroFile, false, maxLines);
// The most basic schema descriptor is the raw one that captures the anonymous avro file
schemaDescriptors.add(new UnknownTextSchemaDescriptor(workingAvroFile));
View
7 src/java/com/cloudera/recordbreaker/learnstructure/LearnStructure.java
@@ -42,7 +42,7 @@ public LearnStructure() {
/**
*/
- public void inferRecordFormat(File f, File schemaFile, File parseTreeFile, File jsonDataFile, File avroDataFile, boolean verbose) throws IOException {
+ public void inferRecordFormat(File f, File schemaFile, File parseTreeFile, File jsonDataFile, File avroDataFile, boolean verbose, int maxLines) throws IOException {
// Store parse errors and results
List<Integer> unparseableLineNos = new ArrayList<Integer>();
List<String> unparseableStrs = new ArrayList<String>();
@@ -58,6 +58,9 @@ public void inferRecordFormat(File f, File schemaFile, File parseTreeFile, File
String s = in.readLine();
int lineno = 0;
while (s != null) {
+ if (maxLines >= 0 && lineno >= maxLines) {
+ break;
+ }
List<Token.AbstractToken> chunkToks = Tokenizer.tokenize(s);
if (chunkToks != null) {
allChunks.add(chunkToks);
@@ -226,6 +229,6 @@ public static void main(String argv[]) throws IOException {
avroDataFile = new File(outdir, DATA_FILENAME);
}
LearnStructure ls = new LearnStructure();
- ls.inferRecordFormat(f, schemaFile, parseTreeFile, jsonDataFile, avroDataFile, true);
+ ls.inferRecordFormat(f, schemaFile, parseTreeFile, jsonDataFile, avroDataFile, true, -1);
}
}

0 comments on commit ea62f89

Please sign in to comment.