Skip to content

Commit

Permalink
Issue #106 Parser no longer needs full copy of file in memory
Browse files Browse the repository at this point in the history
The parser can read a line at a time now and add it to the StringTree
being built in StringTreeBuilder, and does not require an
ArrayList<String> holding the entire contents of the file in memory
(even temporarily).
  • Loading branch information
Matt Harrah (frizbog) committed Jul 3, 2016
1 parent 2ed14d6 commit d88a00f
Show file tree
Hide file tree
Showing 11 changed files with 65 additions and 47 deletions.
Expand Up @@ -40,8 +40,8 @@ abstract class AbstractEncodingSpecificReader {
* A collection of entire lines (Strings to intern) used when loading into StringTrees - these are the strings that
* appear super-frequently in files. Interning them helps avoid making loads of duplicated copies in the heap.
*/
protected final static List<String> STRINGS_TO_INTERN = Arrays
.asList(new String[] { "3 DATA", "1 BIRT", "1 SEX M", "1 SEX F", "1 DEAT", "1 MARR", "1 BURI", "1 EVEN", "1 RESI" });
protected final static List<String> STRINGS_TO_INTERN = Arrays.asList(new String[] { "3 DATA", "1 BIRT", "1 SEX M", "1 SEX F", "1 DEAT", "1 MARR", "1 BURI",
"1 EVEN", "1 RESI" });

/**
* The stream of bytes to read
Expand Down Expand Up @@ -78,7 +78,7 @@ abstract class AbstractEncodingSpecificReader {
}

/**
* Get the next line of the file.
* Get the next line of the file. Must not return empty strings, or lines that are not left-trimmed.
*
* @return the next line of the file, or null if no more lines to read.
* @throws IOException
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/gedcom4j/io/reader/AnselReader.java
Expand Up @@ -126,7 +126,7 @@ public String nextLine() throws IOException, GedcomParserException {
}

// Check for carriage returns or line feeds - signify EOL
if (currChar == 0x0D || currChar == 0x0A) {
if ((currChar == 0x0D || currChar == 0x0A)) {

// Check for line breaks between combining diacritics and the base characters

Expand Down
23 changes: 10 additions & 13 deletions src/main/java/org/gedcom4j/io/reader/AsciiReader.java
Expand Up @@ -78,24 +78,21 @@ public String nextLine() throws IOException, GedcomParserException {
if (currChar < 0) {
// hit EOF - add final line buffer (last line) and get out
eof = true;
result = lineBuffer.toString();
if (lineBuffer.length() > 0) {
result = lineBuffer.toString();
}
break;
}

// Check for carriage returns - signify EOL
if (currChar == 0x0D) {
result = lineBuffer.toString();
lineBuffer.setLength(0);
break;
// Ignore leading spaces
if (currChar == ' ' && lineBuffer.length() == 0) {
continue;
}

// Check for line feeds - signify EOL (unless prev char was a
// CR)
if (currChar == 0x0A) {
if (lastChar != 0x0D) {
result = lineBuffer.toString();
lineBuffer.setLength(0);
}
// Check for carriage returns or line feeds - signify EOL
if ((currChar == 0x0D || currChar == 0x0A) && lineBuffer.length() > 0) {
result = lineBuffer.toString();
lineBuffer.setLength(0);
break;
}

Expand Down
4 changes: 4 additions & 0 deletions src/main/java/org/gedcom4j/io/reader/GedcomFileReader.java
Expand Up @@ -139,6 +139,10 @@ public String nextLine() throws IOException, GedcomParserException {
throw new ParserCancelledException("File load is cancelled");
}
String result = encodingSpecificReader.nextLine();
/*
* assert (result == null || result.length() > 0 && result.charAt(0) != ' ') : "nextLine() should either return
* null, or a non-empty left-trimmed string";
*/
linesProcessed++;
if (linesProcessed % parser.getReadNotificationRate() == 0 || result == null) {
parser.notifyFileObservers(new FileProgressEvent(this, linesProcessed, result == null));
Expand Down
Expand Up @@ -91,7 +91,9 @@ public String nextLine() throws IOException, GedcomParserException {
// Check for EOF
if (currChar1 < 0 || currChar2 < 0) {
// hit EOF - add final line buffer (last line) and get out
result = lineBuffer.toString();
if (lineBuffer.length() > 0) {
result = lineBuffer.toString();
}
eof = true;
break;
}
Expand Down
Expand Up @@ -93,7 +93,9 @@ public String nextLine() throws IOException, GedcomParserException {
// Check for EOF
if (currChar1 < 0 || currChar2 < 0) {
// hit EOF - add final line buffer (last line) and get out
result = lineBuffer.toString();
if (lineBuffer.length() > 0) {
result = lineBuffer.toString();
}
eof = true;
break;
}
Expand Down
22 changes: 20 additions & 2 deletions src/main/java/org/gedcom4j/parser/GedcomParser.java
Expand Up @@ -2276,8 +2276,26 @@ private void loadUserReference(StringTree st, UserReference u) {
* if there is an error with parsing the data from the stream
*/
private StringTree makeStringTreeFromStream(BufferedInputStream bytes) throws IOException, GedcomParserException {
    /*
     * Stream the input: each line is handed to the StringTreeBuilder as soon as it is read, so no intermediate
     * List<String> holding the whole file is built. (The earlier approach called GedcomFileReader.getLines() and
     * passed the resulting list to StringTreeBuilder.makeStringTreeFromFlatLines(lines).)
     */
    GedcomFileReader gfr = new GedcomFileReader(this, bytes);
    StringTreeBuilder stb = new StringTreeBuilder(this);

    // nextLine() returns null once there are no more lines to read
    String line = gfr.nextLine();
    while (line != null) {
        stb.appendLine(line);
        line = gfr.nextLine();
        // Checked after every read, including the final read that returns null
        if (cancelled) {
            throw new ParserCancelledException("File load/parse is cancelled");
        }
    }
    return stb.getTree();
}

/**
Expand Down
41 changes: 18 additions & 23 deletions src/main/java/org/gedcom4j/parser/StringTreeBuilder.java
@@ -1,6 +1,5 @@
package org.gedcom4j.parser;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
Expand Down Expand Up @@ -49,11 +48,6 @@ static String leftTrim(String line) {
*/
private final StringTree[] lastNodeAtLevel = new StringTree[100];

/**
* The buffered input stream we are reading from
*/
private final BufferedInputStream inputStream;

/**
* A flag indicating whether the current line from the input file begins with a 1-2 digit level number followed by a
* space
Expand Down Expand Up @@ -96,17 +90,24 @@ static String leftTrim(String line) {
* @param parser
* the {@link GedcomParser} this object will be assisting with making a {@link StringTree} for
*
* @param inputStream
* The buffered input stream we are reading from
*/
public StringTreeBuilder(GedcomParser parser, BufferedInputStream inputStream) {
public StringTreeBuilder(GedcomParser parser) {
this.parser = parser;
this.inputStream = inputStream;
treeForWholeFile.level = -1;
getTree().level = -1;
mostRecentlyAdded = null;
lineNum = 0; // Haven't read any lines yet
}

/**
 * Returns the {@link StringTree} that stands for the file as a whole, i.e. the tree this
 * {@link StringTreeBuilder} was created to build.
 *
 * @return the string tree representing the entire file
 */
public StringTree getTree() {
    return this.treeForWholeFile;
}

/**
* Add the supplied line to the right place in the StringTree being built
*
Expand Down Expand Up @@ -145,19 +146,13 @@ void appendLine(String l) throws GedcomParserException {
* if there is an error with parsing the data from the stream
*/
StringTree makeStringTreeFromFlatLines(List<String> lines) throws IOException, GedcomParserException {
treeForWholeFile.level = -1;
getTree().level = -1;
mostRecentlyAdded = null;
try {
while (lineNum < lines.size()) {
String l = leftTrim(lines.get(lineNum));
appendLine(l);
}
} finally {
if (inputStream != null) {
inputStream.close();
}
while (lineNum < lines.size()) {
String l = leftTrim(lines.get(lineNum));
appendLine(l);
}
return treeForWholeFile;
return getTree();
}

/**
Expand All @@ -176,7 +171,7 @@ private void addNewNode() throws GedcomParserException {

StringTree addTo = null;
if (treeForCurrentLine.level == 0) {
addTo = treeForWholeFile;
addTo = getTree();
} else {
addTo = lastNodeAtLevel[treeForCurrentLine.level - 1];
}
Expand Down
Expand Up @@ -33,7 +33,7 @@ public class AsciiReaderListenerTest extends AbstractReaderListenerTest {
* Constructor
*/
public AsciiReaderListenerTest() {
super("sample/willis-ascii.ged", 20037, 41);
super("sample/willis-ascii.ged", 20036, 41);
}

}
Expand Up @@ -33,7 +33,7 @@ public class UnicodeBigEndianReaderListenerTest extends AbstractReaderListenerTe
* Constructor
*/
public UnicodeBigEndianReaderListenerTest() {
super("sample/willis-unicode-bigendian.ged", 20037, 41);
super("sample/willis-unicode-bigendian.ged", 20036, 41);
}

}
Expand Up @@ -33,7 +33,7 @@ public class UnicodeLittleEndianReaderListenerTest extends AbstractReaderListene
* Constructor
*/
public UnicodeLittleEndianReaderListenerTest() {
super("sample/willis-unicode-littleendian.ged", 20037, 41);
super("sample/willis-unicode-littleendian.ged", 20036, 41);
}

}

0 comments on commit d88a00f

Please sign in to comment.