Skip to content

Commit

Permalink
Issue #106 - First chop at AsciiReader
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt Harrah (frizbog) committed Jul 2, 2016
1 parent 3dfa8f5 commit 197638c
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 64 deletions.
97 changes: 38 additions & 59 deletions src/main/java/org/gedcom4j/io/reader/AsciiReader.java
Expand Up @@ -23,21 +23,37 @@

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.gedcom4j.exception.ParserCancelledException;
import org.gedcom4j.io.event.FileProgressEvent;
import org.gedcom4j.exception.GedcomParserException;
import org.gedcom4j.parser.GedcomParser;

/**
* A reader that loads from an input stream and gives back a collection of strings representing the data therein. This
* implementation handles ASCII encoding (1 byte per character, no extended character support).
* A reader that reads an ASCII-encoded file one line at a time.
*
* @author frizbog
*/
class AsciiReader extends AbstractEncodingSpecificReader {

/**
* Are we at the end of file yet?
*/
private boolean eof = false;

/**
* The prior character we read
*/
private int lastChar = -1;

/**
* The current character we've just read
*/
private int currChar = -1;

/**
* The line buffer for the current line
*/
private final StringBuilder lineBuffer = new StringBuilder();

/**
* Constructor
*
Expand All @@ -51,87 +67,50 @@ protected AsciiReader(GedcomParser parser, InputStream byteStream) {
super(parser, byteStream);
}

/**
* {@inheritDoc}
*/
@Override
// NOTE(review): this span comes from a commit diff view. The load() signature and the statements using the local
// variable b and addNonBlankLine(...) appear to be the removed list-building version; the nextLine() signature and
// the statements using currChar and the String result appear to be its replacement. Confirm against the real file.
protected List<String> load() throws IOException, ParserCancelledException {
List<String> result = new ArrayList<String>();
StringBuilder lineBuffer = new StringBuilder();

int lastChar;
int b = -1;
boolean eof = false;

// Reads bytes from byteStream until a line terminator or the end of the stream is seen and returns the buffered
// text. result is still null when the loop is never entered because eof was already set.
public String nextLine() throws IOException, GedcomParserException {
String result = null;
while (!eof) {
// Remember the previous byte so that a LF directly following a CR can be recognised below
lastChar = b;
b = byteStream.read();
lastChar = currChar;
currChar = byteStream.read();

// Check for EOF (read() returned a negative value)
if (b < 0) {
if (currChar < 0) {
// hit EOF - hand back whatever is left in the line buffer (last line) and get out
addNonBlankLine(result, lineBuffer);
eof = true;
result = lineBuffer.toString();
break;
}

// Check for carriage returns - signify EOL
if (b == 0x0D) {
addNonBlankLine(result, lineBuffer);
if (currChar == 0x0D) {
result = lineBuffer.toString();
// Empty the buffer so the next line starts fresh
lineBuffer.setLength(0);
continue;
break;
}

// Check for line feeds - signify EOL (unless prev char was a
// CR)
if (b == 0x0A) {
if (currChar == 0x0A) {
if (lastChar != 0x0D) {
addNonBlankLine(result, lineBuffer);
result = lineBuffer.toString();
lineBuffer.setLength(0);
}
// NOTE(review): if the break below is the live statement, a LF that directly follows a CR exits the loop
// with result still null, so a CRLF-terminated line would be followed by a null return - confirm that
// callers do not treat that null as end of input.
continue;
break;
}

// All other characters in 0x00 to 0x7F range are treated the same,
// regardless of encoding, and added as is
if (b < 0x80) {
lineBuffer.append(Character.valueOf((char) b));
if (currChar < 0x80) {
lineBuffer.append(Character.valueOf((char) currChar));
continue;
}

// If we fell through to here, we have an extended character (value of 0x80 or above)
throw new IOException("Extended characters not supported in ASCII: 0x" + Integer.toHexString(b));
throw new IOException("Extended characters not supported in ASCII: 0x" + Integer.toHexString(currChar));
}
parser.notifyFileObservers(new FileProgressEvent(this, linesRead, true));
return result;
}

/**
 * Appends the buffered line to the list of lines read so far, skipping it when the buffer is empty. The line counter
 * advances whether or not anything was appended, and file observers are notified each time the counter reaches a
 * multiple of the parser's read notification rate.
 *
 * @param result
 *            the list collecting the lines read so far
 * @param lineBuffer
 *            the buffer holding the characters of the line just completed
 * @throws ParserCancelledException
 *             if the parser reports that the load has been cancelled
 */
private void addNonBlankLine(List<String> result, StringBuilder lineBuffer) throws ParserCancelledException {
    // Honor a cancellation request before doing any more work
    if (parser.isCancelled()) {
        throw new ParserCancelledException("File load is cancelled");
    }

    // Blank lines are never stored, but they still count as lines read below
    if (lineBuffer.length() != 0) {
        final String line = lineBuffer.toString();
        // Strings listed in STRINGS_TO_INTERN are stored in their interned form
        result.add(STRINGS_TO_INTERN.contains(line) ? line.intern() : line);
    }

    linesRead++;
    final boolean notificationDue = linesRead % parser.getReadNotificationRate() == 0;
    if (notificationDue) {
        parser.notifyFileObservers(new FileProgressEvent(this, linesRead, false));
    }
}

}
6 changes: 5 additions & 1 deletion src/main/java/org/gedcom4j/io/reader/GedcomFileReader.java
Expand Up @@ -26,6 +26,7 @@
import java.util.List;

import org.gedcom4j.exception.GedcomParserException;
import org.gedcom4j.exception.ParserCancelledException;
import org.gedcom4j.exception.UnsupportedGedcomCharsetException;
import org.gedcom4j.io.event.FileProgressEvent;
import org.gedcom4j.parser.GedcomParser;
Expand Down Expand Up @@ -110,7 +111,7 @@ public List<String> getLines() throws IOException, GedcomParserException {
String s = null;
do {
s = nextLine();
if (s != null) {
if (s != null && s.length() > 0) {
result.add(s);
}
} while (s != null);
Expand All @@ -127,6 +128,9 @@ public List<String> getLines() throws IOException, GedcomParserException {
* if the file is malformed and cannot be processed as a result
*/
public String nextLine() throws IOException, GedcomParserException {
if (parser.isCancelled()) {
throw new ParserCancelledException("File load is cancelled");
}
String result = encodingSpecificReader.nextLine();
linesProcessed++;
if (linesProcessed % parser.getReadNotificationRate() == 0 || result == null) {
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/gedcom4j/parser/StringTreeBuilder.java
Expand Up @@ -28,7 +28,7 @@ static String leftTrim(String line) {
if (line == null) {
return null;
}
if (line == "") {
if (line.length() == 0) {
return "";
}
if (!Character.isWhitespace(line.charAt(0))) {
Expand Down
Expand Up @@ -33,7 +33,7 @@ public class AsciiReaderListenerTest extends AbstractReaderListenerTest {
* Constructor
*/
public AsciiReaderListenerTest() {
// NOTE(review): the two numeric arguments are presumably the counts the base test expects for the sample file;
// this commit view shows the first one changing from 20036 to 20037 - confirm their meaning in
// AbstractReaderListenerTest.
super("sample/willis-ascii.ged", 20036, 41);
super("sample/willis-ascii.ged", 20037, 41);
}

}
12 changes: 10 additions & 2 deletions src/test/java/org/gedcom4j/io/reader/GedcomFileReaderTest.java
Expand Up @@ -29,6 +29,7 @@
import java.util.List;

import org.gedcom4j.exception.GedcomParserException;
import org.gedcom4j.exception.UnsupportedGedcomCharsetException;
import org.gedcom4j.parser.GedcomParser;
import org.junit.Test;

Expand Down Expand Up @@ -177,9 +178,11 @@ public void testAnselLfOnly() throws IOException, GedcomParserException {
*
* @throws IOException
* if the first few bytes of the "file" cannot be read
* @throws UnsupportedGedcomCharsetException
* if the character set is not supported
*/
@Test
public void testFirstNBytes() throws IOException {
public void testFirstNBytes() throws IOException, UnsupportedGedcomCharsetException {
GedcomFileReader gfr = new GedcomFileReader(new GedcomParser(),
new BufferedInputStream(new ByteArrayInputStream(new byte[] { 0x12, 0x34, 0x56, 0x78 })));
// Haven't saved the first chunk yet
Expand Down Expand Up @@ -459,9 +462,11 @@ public void testUtf8LfNoBOM() throws IOException, GedcomParserException {
*/
void testUtf8File(String fileName) throws IOException, FileNotFoundException, GedcomParserException {
FileInputStream fileInputStream = null;
BufferedInputStream bufferedInputStream = null;
try {
fileInputStream = new FileInputStream(fileName);
GedcomFileReader gr = new GedcomFileReader(new GedcomParser(), new BufferedInputStream(fileInputStream));
bufferedInputStream = new BufferedInputStream(fileInputStream);
GedcomFileReader gr = new GedcomFileReader(new GedcomParser(), bufferedInputStream);
List<String> lines = gr.getLines();
assertNotNull(lines);
assertEquals(77, lines.size());
Expand All @@ -481,6 +486,9 @@ void testUtf8File(String fileName) throws IOException, FileNotFoundException, Ge
assertEquals("1 NAME Ellen /\u0141owenst\u0117in/", lines.get(49));
assertEquals("1 NAME Fred /\u00DBlrich/", lines.get(53));
} finally {
if (bufferedInputStream != null) {
bufferedInputStream.close();
}
if (fileInputStream != null) {
fileInputStream.close();
}
Expand Down

0 comments on commit 197638c

Please sign in to comment.