diff --git a/src/main/java/org/gedcom4j/io/reader/AnselReader.java b/src/main/java/org/gedcom4j/io/reader/AnselReader.java index 6045cd6b..1f06d8f4 100644 --- a/src/main/java/org/gedcom4j/io/reader/AnselReader.java +++ b/src/main/java/org/gedcom4j/io/reader/AnselReader.java @@ -64,7 +64,7 @@ class AnselReader extends AbstractEncodingSpecificReader { /** * Are we at the end of file yet? */ - private final boolean eof = false; + private boolean eof = false; /** * Constructor @@ -92,24 +92,18 @@ public String nextLine() throws IOException, GedcomParserException { // Check for EOF if (currChar < 0) { result = getThisLine(); + eof = true; break; } - // Check for carriage returns - signify EOL - if (currChar == 0x0D) { - result = getThisLine(); - lineBufferIdx = 0; - break; - } - - // Check for line feeds - signify EOL (unless prev char was a - // CR) - if (currChar == 0x0A) { - if (lastChar != 0x0D) { + // Check for carriage returns or line feeds - signify EOL + if (currChar == 0x0D || currChar == 0x0A) { + if (lineBufferIdx > 0) { result = getThisLine(); lineBufferIdx = 0; + break; } - break; + continue; } // All other characters are treated the same at this point, @@ -118,10 +112,11 @@ public String nextLine() throws IOException, GedcomParserException { if (lineBufferIdx >= 255) { result = getThisLine(); + lineBufferIdx = 0; insertSyntheticConcTag(); + break; } - continue; } return result; } @@ -146,8 +141,8 @@ private int getCurrentLevelFromLineBuffer() throws GedcomParserException { * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't * put in CONC's on the fly (because we don't know what level we're at) */ - throw new GedcomParserException( - "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + "Can't split automatically."); + throw new GedcomParserException("Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + + "Can't split automatically."); } } else { if (lineBuffer[1] == ' ') { @@ -157,8 +152,8 @@ private int getCurrentLevelFromLineBuffer() throws GedcomParserException { * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't * put in CONC's on the fly (because we don't know what level we're at) */ - throw new GedcomParserException( - "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + "Can't split automatically."); + throw new GedcomParserException("Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + + "Can't split automatically."); } } } else { @@ -166,8 +161,8 @@ private int getCurrentLevelFromLineBuffer() throws GedcomParserException { * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't put in * CONC's on the fly (because we don't know what level we're at) */ - throw new GedcomParserException( - "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. Can't split automatically."); + throw new GedcomParserException("Line " + linesRead + + " exceeds 255 characters and does not begin with a 1 or 2 digit number. Can't split automatically."); } return level; } diff --git a/src/main/java/org/gedcom4j/io/reader/GedcomFileReader.java b/src/main/java/org/gedcom4j/io/reader/GedcomFileReader.java index 2505ccff..3770e496 100644 --- a/src/main/java/org/gedcom4j/io/reader/GedcomFileReader.java +++ b/src/main/java/org/gedcom4j/io/reader/GedcomFileReader.java @@ -155,23 +155,6 @@ long firstNBytes(int n) { return result; } - /** - * Save off a chunk of the beginning of the input stream to memory for easy inspection. The data is loaded into the - * field - * - * @throws IOException - * if the stream of bytes cannot be read. - */ - void saveFirstChunk() throws IOException { - byteStream.mark(FIRST_CHUNK_SIZE); - int read = byteStream.read(firstChunk); - if (read < 0) { - throw new IOException("Unable to read bytes off stream"); - } - byteStream.reset(); - - } - /** * Tries to determined from examining the first 1000 lines/2k of the file if the file is ASCII, ANSEL, or UTF-8 * encoded using a variety of means. @@ -279,4 +262,21 @@ private AbstractEncodingSpecificReader getEncodingSpecificReader() throws IOExce } } + + /** + * Save off a chunk of the beginning of the input stream to memory for easy inspection. The data is loaded into the + * field + * + * @throws IOException + * if the stream of bytes cannot be read. + */ + private void saveFirstChunk() throws IOException { + byteStream.mark(FIRST_CHUNK_SIZE); + int read = byteStream.read(firstChunk); + if (read < 0) { + throw new IOException("Unable to read bytes off stream"); + } + byteStream.reset(); + + } } diff --git a/src/main/java/org/gedcom4j/io/reader/UnicodeBigEndianReader.java b/src/main/java/org/gedcom4j/io/reader/UnicodeBigEndianReader.java index baf8f471..470cb66a 100644 --- a/src/main/java/org/gedcom4j/io/reader/UnicodeBigEndianReader.java +++ b/src/main/java/org/gedcom4j/io/reader/UnicodeBigEndianReader.java @@ -105,23 +105,17 @@ public String nextLine() throws IOException, GedcomParserException { beginningOfFile = false; - // Check for carriage returns - signify EOL - if (currChar1 == 0x00 && currChar2 == 0x0D) { - result = lineBuffer.toString(); - lineBuffer.setLength(0); - break; - } - - // Check for line feeds - signify EOL (unless prev char was a - // CR) - if (currChar1 == 0x00 && currChar2 == 0x0A) { - if (lastChar1 != 0x00 || lastChar2 != 0x0D) { + // Check for carriage returns or line feeds - signify EOL + if ((currChar1 == 0x00 && currChar2 == 0x0D) || (currChar1 == 0x00 && currChar2 == 0x0A)) { + if (lineBuffer.length() > 0) { result = lineBuffer.toString(); lineBuffer.setLength(0); + break; } - break; + continue; } + // Do bit shifting stuff to make the character from the bytes int unicodeChar = currChar1 << 8 | currChar2; lineBuffer.append(Character.valueOf((char) unicodeChar)); } diff --git a/src/main/java/org/gedcom4j/io/reader/UnicodeLittleEndianReader.java b/src/main/java/org/gedcom4j/io/reader/UnicodeLittleEndianReader.java index 4a0fa151..7fde1709 100644 --- a/src/main/java/org/gedcom4j/io/reader/UnicodeLittleEndianReader.java +++ b/src/main/java/org/gedcom4j/io/reader/UnicodeLittleEndianReader.java @@ -107,21 +107,14 @@ public String nextLine() throws IOException, GedcomParserException { beginningOfFile = false; - // Check for carriage returns - signify EOL - if (currChar1 == 0x0D && currChar2 == 0x00) { - result = lineBuffer.toString(); - lineBuffer.setLength(0); - break; - } - - // Check for line feeds - signify EOL (unless prev char was a - // CR) - if (currChar1 == 0x0A && currChar2 == 0x00) { - if (lastChar1 != 0x0D || lastChar2 != 0x00) { + // Check for carriage returns or line feeds - signify EOL + if ((currChar1 == 0x0D && currChar2 == 0x00) || (currChar1 == 0x0A && currChar2 == 0x00)) { + if (lineBuffer.length() > 0) { result = lineBuffer.toString(); lineBuffer.setLength(0); + break; } - break; + continue; } int unicodeChar = currChar2 << 8 | currChar1; diff --git a/src/test/java/org/gedcom4j/io/reader/GedcomFileReaderTest.java b/src/test/java/org/gedcom4j/io/reader/GedcomFileReaderTest.java index 1e201cb0..98874918 100644 --- a/src/test/java/org/gedcom4j/io/reader/GedcomFileReaderTest.java +++ b/src/test/java/org/gedcom4j/io/reader/GedcomFileReaderTest.java @@ -127,11 +127,9 @@ public void testAnselDecodingSingleLine() throws IOException, GedcomParserExcept s = new BufferedInputStream(new ByteArrayInputStream(anselData)); GedcomFileReader gr = new GedcomFileReader(new GedcomParser(), s); - ; - List lines = gr.getLines(); - assertNotNull(lines); - assertFalse(lines.isEmpty()); - assertEquals("0 He\u0141\u0141o", lines.get(0)); + String l = gr.nextLine(); + assertNotNull(l); + assertEquals("0 He\u0141\u0141o", l); } finally { if (s != null) { s.close(); @@ -183,19 +181,14 @@ public void testAnselLfOnly() throws IOException, GedcomParserException { */ @Test public void testFirstNBytes() throws IOException, UnsupportedGedcomCharsetException { - GedcomFileReader gfr = new GedcomFileReader(new GedcomParser(), - new BufferedInputStream(new ByteArrayInputStream(new byte[] { 0x12, 0x34, 0x56, 0x78 }))); + byte[] bytes = new byte[] { '0', ' ', 0x12, 0x34 }; + GedcomFileReader gfr = new GedcomFileReader(new GedcomParser(), new BufferedInputStream(new ByteArrayInputStream(bytes))); // Haven't save the first chunk yet assertNotNull(gfr.firstChunk); - assertEquals(0x0, gfr.firstNBytes(1)); - assertEquals(0x0, gfr.firstNBytes(2)); - assertEquals(0x0, gfr.firstNBytes(3)); - - gfr.saveFirstChunk(); - assertEquals(0x12, gfr.firstNBytes(1)); - assertEquals(0x1234, gfr.firstNBytes(2)); - assertEquals(0x123456, gfr.firstNBytes(3)); - assertEquals(0x12345678, gfr.firstNBytes(4)); + assertEquals(0x30, gfr.firstNBytes(1)); + assertEquals(0x3020, gfr.firstNBytes(2)); + assertEquals(0x302012, gfr.firstNBytes(3)); + assertEquals(0x30201234, gfr.firstNBytes(4)); } /** @@ -460,7 +453,7 @@ public void testUtf8LfNoBOM() throws IOException, GedcomParserException { * @throws GedcomParserException * if the file load was cancelled or had malformed data */ - void testUtf8File(String fileName) throws IOException, FileNotFoundException, GedcomParserException { + private void testUtf8File(String fileName) throws IOException, FileNotFoundException, GedcomParserException { FileInputStream fileInputStream = null; BufferedInputStream bufferedInputStream = null; try { diff --git a/src/test/java/org/gedcom4j/io/reader/LongLineReaderTest.java b/src/test/java/org/gedcom4j/io/reader/LongLineReaderTest.java index 7a3e0d6a..7e440216 100644 --- a/src/test/java/org/gedcom4j/io/reader/LongLineReaderTest.java +++ b/src/test/java/org/gedcom4j/io/reader/LongLineReaderTest.java @@ -66,14 +66,13 @@ public void testAnsel() throws IOException, GedcomParserException { strings.add(s); s = ar.nextLine(); } - assertNotNull(strings); assertEquals("Should say there were 12 lines even though the file only has 11", 12, strings.size()); assertEquals("0 @N1@ NOTE This is an ridiculously long line that exceeds the GEDCOM maximum line length of 255 characters " + "so that we can test whether the readers can properly introduce CONC tags on the fly and keep going as if " + "everything was ok when the file has lines ", strings.get(9)); - assertEquals("1 CONC that are way too long like this one is, even though there are lots of programs that write non-standard GEDCOM files.", - strings.get(10)); + assertEquals("1 CONC that are way too long like this one is, even though there are lots of programs that write non-standard GEDCOM files.", strings + .get(10)); gp = new GedcomParser(); gp.load("sample/superlongline-ansel.ged");