Issue #106 - all but one test passes

frizbog · Jul 3, 2016 · d66c9e6 · d66c9e6
1 parent e6ce913
commit d66c9e6
Show file tree

Hide file tree

Showing 3 changed files with 119 additions and 39 deletions.
diff --git a/src/main/java/org/gedcom4j/io/reader/AnselReader.java b/src/main/java/org/gedcom4j/io/reader/AnselReader.java
@@ -23,6 +23,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Arrays;
 
 import org.gedcom4j.exception.GedcomParserException;
 import org.gedcom4j.io.encoding.AnselHandler;
@@ -36,6 +37,11 @@
  */
 class AnselReader extends AbstractEncodingSpecificReader {
 
+    /**
+     * The byte value at which combining diacritics begin in ANSEL encoding
+     */
+    private static final char ANSEL_DIACRITICS_BEGIN_AT = 0x00E0;
+
     /**
      * Helper class
      */
@@ -52,19 +58,36 @@ class AnselReader extends AbstractEncodingSpecificReader {
     private int currChar = -1;
 
     /**
-     * Last character read
+     * Are we at the end of file yet?
      */
-    private int lastChar;
+    private boolean eof = false;
 
     /**
      * The line buffer
      */
     private final char[] lineBuffer = new char[256];
 
     /**
-     * Are we at the end of file yet?
+     * Prior character read
      */
-    private boolean eof = false;
+    private int oneCharBack = -1;
+
+    /**
+     * Character read two positions before the current character - need this because sometimes there are two
+     * combining diacritical characters
+     */
+    private int twoCharsBack = -1;
+
+    /**
+     * Index into {@link #holdingBin} array
+     */
+    private int holdingBinIdx = 0;
+
+    /**
+     * A holding bin for combining diacritics that are separated from the base character by a line break. This makes it
+     * possible for us to keep the diacritics and the base character together.
+     */
+    private final char[] holdingBin = new char[2];
 
     /**
      * Constructor
@@ -86,7 +109,8 @@ public String nextLine() throws IOException, GedcomParserException {
         }
         String result = null;
         while (!eof) {
-            lastChar = currChar;
+            twoCharsBack = oneCharBack;
+            oneCharBack = currChar;
             currChar = byteStream.read();
 
             // Check for EOF
@@ -96,73 +120,115 @@ public String nextLine() throws IOException, GedcomParserException {
                 break;
             }
 
+            // Ignore leading spaces
+            if (currChar == ' ' && lineBufferIdx == 0) {
+                continue;
+            }
+
             // Check for carriage returns or line feeds - signify EOL
             if (currChar == 0x0D || currChar == 0x0A) {
+
+                // Check for line breaks between combining diacritics and the base characters
+
+                if (oneCharBack >= ANSEL_DIACRITICS_BEGIN_AT) {
+                    if (twoCharsBack >= ANSEL_DIACRITICS_BEGIN_AT) {
+                        /*
+                         * Two diacritics at end of line, already in the lineBuffer, and presumably the base character
+                         * is at the beginning of the next line (after a CONC tag) - store in holding bin
+                         */
+                        holdingBin[holdingBinIdx++] = (char) twoCharsBack;
+                        twoCharsBack = -1; // Keeps from holding characters in reserve repeatedly
+                    }
+                    /*
+                     * One diacritic at end of line, already in the lineBuffer, and presumably the base character is at
+                     * the beginning of the next line (after a CONC tag) - store in holding bin
+                     */
+                    holdingBin[holdingBinIdx++] = (char) oneCharBack;
+                    oneCharBack = -1; // Keeps from holding characters in reserve repeatedly
+                }
+
+                // If we have a line break and contents in the buffer, return the string
                 if (lineBufferIdx > 0) {
                     result = getThisLine();
-                    lineBufferIdx = 0;
                     break;
                 }
+
+                // Otherwise, ignore the extra line break characters
                 continue;
             }
 
-            // All other characters are treated the same at this point,
-            // regardless of encoding, and added as is
-            lineBuffer[lineBufferIdx++] = (char) currChar;
+            // If this is a CONC line, AND if we have held-over diacritics from the previous line, pretend they're here
+            // on the byte stream now
+            if (holdingBinIdx > 0 && isStartOfConcLine()) {
+                lineBuffer[lineBufferIdx++] = holdingBin[0];
+                if (holdingBinIdx > 1) {
+                    lineBuffer[lineBufferIdx++] = holdingBin[1];
+                }
+                holdingBinIdx = 0;
+                holdingBin[0] = ' ';
+                holdingBin[1] = ' ';
 
-            if (lineBufferIdx >= 255) {
+            }
+
+            // Split line if it's too long, but don't split diacritics apart from their base characters
+            if (lineBufferIdx >= 250 && currChar < ANSEL_DIACRITICS_BEGIN_AT) {
                 result = getThisLine();
-                lineBufferIdx = 0;
-                insertSyntheticConcTag();
+                insertSyntheticConcTag(result);
                 break;
             }
 
+            // All other characters are treated the same at this point,
+            // regardless of encoding, and added as is
+            lineBuffer[lineBufferIdx++] = (char) currChar;
+
         }
         return result;
+
     }
 
     /**
-     * Determine what level the current line in the line buffer is
+     * Determine what level was in use on the provided line
+     * 
+     * @param line
+     *            the line to determine the level of
      * 
-     * @return what level the current line in the line buffer is
+     * @return what level the supplied line was
      * @throws GedcomParserException
      *             if the line level can't be determined, because the file doesn't begin with a 1 or 2 digit number
      *             followed by a space.
      */
-    private int getCurrentLevelFromLineBuffer() throws GedcomParserException {
+    private int getLevelFromLine(String line) throws GedcomParserException {
         int level = -1;
-        if (Character.isDigit(lineBuffer[0])) {
-            if (Character.isDigit(lineBuffer[1])) {
-                if (lineBuffer[2] == ' ') {
-                    level = Character.getNumericValue(lineBuffer[0]) * 10 + Character.getNumericValue(lineBuffer[1]);
+        char[] lineChars = line.toCharArray();
+        if (Character.isDigit(lineChars[0])) {
+            if (Character.isDigit(lineChars[1])) {
+                if (lineChars[2] == ' ') {
+                    level = Character.getNumericValue(lineChars[0]) * 10 + Character.getNumericValue(lineChars[1]);
 
                 } else {
                     /*
                      * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't
                      * put in CONC's on the fly (because we don't know what level we're at)
                      */
-                    throw new GedcomParserException("Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. "
-                            + "Can't split automatically.");
+                    throw new GedcomParserException("Line " + linesRead + " does not begin with a 1 or 2 digit number. " + "Can't split automatically.");
                 }
             } else {
-                if (lineBuffer[1] == ' ') {
-                    level = Character.getNumericValue(lineBuffer[0]);
+                if (lineChars[1] == ' ') {
+                    level = Character.getNumericValue(lineChars[0]);
                 } else {
                     /*
                      * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't
                      * put in CONC's on the fly (because we don't know what level we're at)
                      */
-                    throw new GedcomParserException("Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. "
-                            + "Can't split automatically.");
+                    throw new GedcomParserException("Line " + linesRead + " does not begin with a 1 or 2 digit number. " + "Can't split automatically.");
                 }
             }
         } else {
             /*
              * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't put in
              * CONC's on the fly (because we don't know what level we're at)
              */
-            throw new GedcomParserException("Line " + linesRead
-                    + " exceeds 255 characters and does not begin with a 1 or 2 digit number. Can't split automatically.");
+            throw new GedcomParserException("Line " + linesRead + " does not begin with a 1 or 2 digit number. Can't split automatically.");
         }
         return level;
     }
@@ -175,26 +241,27 @@ private int getCurrentLevelFromLineBuffer() throws GedcomParserException {
     private String getThisLine() {
         String result = null;
         if (lineBufferIdx > 0) {
-            String s = new String(lineBuffer).substring(0, lineBufferIdx);
+            String s = new String(lineBuffer).substring(0, lineBufferIdx - holdingBinIdx);
             result = anselHandler.toUtf16(s);
-            if (STRINGS_TO_INTERN.contains(result)) {
-                result = result.intern();
-            }
         }
         linesRead++;
+        Arrays.fill(lineBuffer, ' ');
+        lineBufferIdx = 0;
         return result;
     }
 
     /**
      * Insert synthetic CONC tags into the character buffer as if they had been there the whole time
      * 
+     * @param previousLine
+     *            the previous line
+     * 
      * @throws GedcomParserException
      */
-    private void insertSyntheticConcTag() throws GedcomParserException {
-        int level = getCurrentLevelFromLineBuffer();
+    private void insertSyntheticConcTag(String previousLine) throws GedcomParserException {
+        int level = getLevelFromLine(previousLine);
 
-        lineBufferIdx = 0;
-        parser.warnings.add("Line " + linesRead + " exceeds 255 characters - introducing synthetic CONC tag to split line");
+        parser.warnings.add("Line " + linesRead + " exceeds max length - introducing synthetic CONC tag to split line");
         level++;
         if (level > 9) {
             lineBuffer[lineBufferIdx++] = Character.forDigit(level / 10, 10);
@@ -208,6 +275,19 @@ private void insertSyntheticConcTag() throws GedcomParserException {
         lineBuffer[lineBufferIdx++] = 'N';
         lineBuffer[lineBufferIdx++] = 'C';
         lineBuffer[lineBufferIdx++] = ' ';
+        lineBuffer[lineBufferIdx++] = (char) currChar;
+    }
+
+    /**
+     * Are we at the beginning of the text portion of a CONC line? If so, now would be the time to insert any held-over
+     * characters from the previous line
+     * 
+     * @return true iff we are at the beginning of the text portion of a CONC line
+     */
+    private boolean isStartOfConcLine() {
+        return (lineBufferIdx >= 7 && Character.isDigit(lineBuffer[lineBufferIdx - 7]) && lineBuffer[lineBufferIdx - 6] == ' ' && lineBuffer[lineBufferIdx
+                - 5] == 'C' && lineBuffer[lineBufferIdx - 4] == 'O' && lineBuffer[lineBufferIdx - 3] == 'N' && lineBuffer[lineBufferIdx - 2] == 'C'
+                && lineBuffer[lineBufferIdx - 1] == ' ');
     }
 
 }
diff --git a/src/test/java/org/gedcom4j/io/reader/GedcomFileReaderTest.java b/src/test/java/org/gedcom4j/io/reader/GedcomFileReaderTest.java
@@ -53,7 +53,7 @@ public void testAnselCrlf() throws IOException, GedcomParserException {
          * Some encoded ANSEL data for a file with two lines. Line one consists of a zero, a space, and an uppercase H.
          * Line two consists of a lowercase o. The lines are separated by a CRLF.
          */
-        byte[] anselData = { 0x30, 0x20, 0x48, 0x0D, 0x0A, 0x6F };
+        byte[] anselData = { 0x30, 0x20, 0x48, /* CRLF begin */ 0x0D, 0x0A, /* CRLF end */ 0x6F };
 
         BufferedInputStream s = null;
         try {

diff --git a/src/test/java/org/gedcom4j/io/reader/LongLineReaderTest.java b/src/test/java/org/gedcom4j/io/reader/LongLineReaderTest.java
@@ -70,9 +70,9 @@ public void testAnsel() throws IOException, GedcomParserException {
             assertEquals("Should say there were 12 lines even though the file only has 11", 12, strings.size());
             assertEquals("0 @N1@ NOTE This is an ridiculously long line that exceeds the GEDCOM maximum line length of 255 characters "
                     + "so that we can test whether the readers can properly introduce CONC tags on the fly and keep going as if "
-                    + "everything was ok when the file has lines ", strings.get(9));
-            assertEquals("1 CONC that are way too long like this one is, even though there are lots of programs that write non-standard GEDCOM files.", strings
-                    .get(10));
+                    + "everything was ok when the file has l", strings.get(9));
+            assertEquals("1 CONC ines that are way too long like this one is, even though there are lots of programs that write non-standard GEDCOM files.",
+                    strings.get(10));
 
             gp = new GedcomParser();
             gp.load("sample/superlongline-ansel.ged");