Issue #106 - First chop at AnselReader

frizbog · Jul 2, 2016 · 3dfa8f5 · 3dfa8f5
1 parent 44ce70d
commit 3dfa8f5
Show file tree

Hide file tree

Showing 5 changed files with 184 additions and 153 deletions.
diff --git a/src/main/java/org/gedcom4j/exception/UnsupportedGedcomCharsetException.java b/src/main/java/org/gedcom4j/exception/UnsupportedGedcomCharsetException.java
@@ -26,7 +26,7 @@
  * 
  * @author frizbog1
  */
-public class UnsupportedGedcomCharsetException extends Exception {
+public class UnsupportedGedcomCharsetException extends GedcomParserException {
 
     /**
      * Serial Version UID

diff --git a/src/main/java/org/gedcom4j/io/encoding/AnselHandler.java b/src/main/java/org/gedcom4j/io/encoding/AnselHandler.java
@@ -178,54 +178,6 @@ public List<String> toAnselLines(List<String> utf16Lines) {
         return result;
     }
 
-    /**
-     * Converts a file (list) of ansel lines into utf16 lines
-     *
-     * @param anselLines
-     *            a list of strings, each character of which represents an unconverted ANSEL byte
-     * @return a list of UTF16 strings
-     */
-    public List<String> toUtf16Lines(List<String> anselLines) {
-        List<String> result = new ArrayList<String>();
-        String prevAnsel = null;
-        for (String ansel : anselLines) {
-            /*
-             * If concatenating from the previous line, need to see if the last character on previous line is a
-             * diacritical mark modifying the beginning of this line
-             */
-            if (prevAnsel != null && ansel.length() >= 6 && ansel.substring(2, 6).equals("CONC") && endsWithDiacritical(prevAnsel)) {
-                // Remove the last line we just added - need to adjust it and re-add it - not terribly efficient, but
-                // simpler to code
-                result.remove(result.size() - 1);
-
-                // Strip the leading combining diacritical off previous line
-                char d1 = prevAnsel.charAt(prevAnsel.length() - 1);
-                prevAnsel = prevAnsel.substring(0, prevAnsel.length() - 1);
-                char d2 = 0;
-                if (endsWithDiacritical(prevAnsel)) {
-                    // There was a second diacritical at the end of the line
-                    d2 = prevAnsel.charAt(prevAnsel.length() - 1);
-                    prevAnsel = prevAnsel.substring(0, prevAnsel.length() - 1);
-                }
-                // Re-add the line with the diacriticals removed
-                result.add(toUtf16(prevAnsel));
-                // Insert the diacriticals on the current line so they stay with the character being modified
-                if (d2 == 0) {
-                    ansel = ansel.substring(0, 7) + d1 + ansel.substring(7);
-                } else {
-                    ansel = ansel.substring(0, 7) + d2 + d1 + ansel.substring(7);
-                }
-                // And translate/add it
-                result.add(toUtf16(ansel));
-            } else {
-                // Simpler case - just translate current line
-                result.add(toUtf16(ansel));
-            }
-            prevAnsel = ansel;
-        }
-        return result;
-    }
-
     /**
      * Convert an string of ANSEL bytes to UTF-16
      *
@@ -234,7 +186,7 @@ public List<String> toUtf16Lines(List<String> anselLines) {
      *            string, unconverted to any unicode and without changing the order of characters.
      * @return the UTF16 string representation of the ANSEL data, after translation
      */
-    String toUtf16(String ansel) {
+    public String toUtf16(String ansel) {
         char[] utf16 = new char[512];
         int anselIndex = 0;
         int utfIdx = 0;
@@ -291,6 +243,54 @@ String toUtf16(String ansel) {
         return s;
     }
 
+    /**
+     * Converts a file (list) of ansel lines into utf16 lines
+     *
+     * @param anselLines
+     *            a list of strings, each character of which represents an unconverted ANSEL byte
+     * @return a list of UTF16 strings
+     */
+    public List<String> toUtf16Lines(List<String> anselLines) {
+        List<String> result = new ArrayList<String>();
+        String prevAnsel = null;
+        for (String ansel : anselLines) {
+            /*
+             * If concatenating from the previous line, need to see if the last character on previous line is a
+             * diacritical mark modifying the beginning of this line
+             */
+            if (prevAnsel != null && ansel.length() >= 6 && ansel.substring(2, 6).equals("CONC") && endsWithDiacritical(prevAnsel)) {
+                // Remove the last line we just added - need to adjust it and re-add it - not terribly efficient, but
+                // simpler to code
+                result.remove(result.size() - 1);
+
+                // Strip the leading combining diacritical off previous line
+                char d1 = prevAnsel.charAt(prevAnsel.length() - 1);
+                prevAnsel = prevAnsel.substring(0, prevAnsel.length() - 1);
+                char d2 = 0;
+                if (endsWithDiacritical(prevAnsel)) {
+                    // There was a second diacritical at the end of the line
+                    d2 = prevAnsel.charAt(prevAnsel.length() - 1);
+                    prevAnsel = prevAnsel.substring(0, prevAnsel.length() - 1);
+                }
+                // Re-add the line with the diacriticals removed
+                result.add(toUtf16(prevAnsel));
+                // Insert the diacriticals on the current line so they stay with the character being modified
+                if (d2 == 0) {
+                    ansel = ansel.substring(0, 7) + d1 + ansel.substring(7);
+                } else {
+                    ansel = ansel.substring(0, 7) + d2 + d1 + ansel.substring(7);
+                }
+                // And translate/add it
+                result.add(toUtf16(ansel));
+            } else {
+                // Simpler case - just translate current line
+                result.add(toUtf16(ansel));
+            }
+            prevAnsel = ansel;
+        }
+        return result;
+    }
+
     /**
      * Return true if ANSEL string ends in a combining diacritical
      *

diff --git a/src/main/java/org/gedcom4j/io/reader/AbstractEncodingSpecificReader.java b/src/main/java/org/gedcom4j/io/reader/AbstractEncodingSpecificReader.java
@@ -85,13 +85,14 @@ public void cancel() {
     }
 
     /**
-     * Read all the lines using the appropriate encoding
+     * Get the next line of the file.
      * 
-     * @return all the lines of the input stream
+     * @return the next line of the file, or null if no more lines to read.
      * @throws IOException
-     *             if there is a problem reading the bytes
+     *             if the file cannot be read
      * @throws GedcomParserException
-     *             if the file load is cancelled or fails
+     *             if the file is malformed and cannot be parsed as a GEDCOM file for some reason
      */
-    protected abstract List<String> load() throws IOException, GedcomParserException;
+    public abstract String nextLine() throws IOException, GedcomParserException;
+
 }
diff --git a/src/main/java/org/gedcom4j/io/reader/AnselReader.java b/src/main/java/org/gedcom4j/io/reader/AnselReader.java
@@ -23,13 +23,9 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
 
 import org.gedcom4j.exception.GedcomParserException;
-import org.gedcom4j.exception.ParserCancelledException;
 import org.gedcom4j.io.encoding.AnselHandler;
-import org.gedcom4j.io.event.FileProgressEvent;
 import org.gedcom4j.parser.GedcomParser;
 
 /**
@@ -66,9 +62,9 @@ class AnselReader extends AbstractEncodingSpecificReader {
     private final char[] lineBuffer = new char[256];
 
     /**
-     * The resulting list of strings containing the file data
+     * End-of-file flag. NOTE(review): declared final and initialized to false, so as written it can never become true.
      */
-    List<String> result = new ArrayList<String>();
+    private final boolean eof = false;
 
     /**
      * Constructor
@@ -83,137 +79,154 @@ protected AnselReader(GedcomParser parser, InputStream byteStream) {
         super(parser, byteStream);
     }
 
-    /**
-     * {@inheritDoc}
-     */
     @Override
-    protected List<String> load() throws IOException, GedcomParserException {
-
-        boolean eof = false;
+    public String nextLine() throws IOException, GedcomParserException {
+        if (eof) {
+            return null;
+        }
+        String result = null;
         while (!eof) {
             lastChar = currChar;
             currChar = byteStream.read();
 
             // Check for EOF
             if (currChar < 0) {
-                addNonBlankLine();
+                // Hand back whatever is still buffered, then empty the buffer so that the next call returns
+                // null instead of repeating the final line
+                result = getThisLine();
+                lineBufferIdx = 0;
                 break;
             }
 
             // Check for carriage returns - signify EOL
             if (currChar == 0x0D) {
-                addNonBlankLine();
+                result = getThisLine();
                 lineBufferIdx = 0;
+                // An empty buffer yields null, which nextLine() documents as end of file - skip blank lines
+                if (result != null) {
+                    break;
+                }
                 continue;
             }
 
             // Check for line feeds - signify EOL (unless prev char was a
             // CR)
             if (currChar == 0x0A) {
                 if (lastChar != 0x0D) {
-                    addNonBlankLine();
+                    result = getThisLine();
                     lineBufferIdx = 0;
+                    // An empty buffer yields null, which nextLine() documents as end of file - skip blank lines
+                    if (result != null) {
+                        break;
+                    }
                 }
+                // Either a blank line or the LF half of a CR/LF pair - there is no line to return yet
                 continue;
             }
 
             // All other characters are treated the same at this point,
             // regardless of encoding, and added as is
             lineBuffer[lineBufferIdx++] = (char) currChar;
 
-            forceLineSplitIfNeeded();
+            if (lineBufferIdx >= 255) {
+                // Return the full buffer now; the buffer is re-seeded with a synthetic CONC tag for the rest
+                result = getThisLine();
+                insertSyntheticConcTag();
+                break;
+            }
 
             continue;
         }
-        result = anselHandler.toUtf16Lines(result);
-        parser.notifyFileObservers(new FileProgressEvent(this, linesRead, true));
         return result;
     }
 
     /**
-     * Add line to result if it is not blank. Notify listeners of progress every 100 lines.
+     * Determine what level the current line in the line buffer is
      * 
-     * @throws ParserCancelledException
-     *             if the file load is cancelled
+     * @return what level the current line in the line buffer is
+     * @throws GedcomParserException
+     *             if the line level can't be determined, because the line doesn't begin with a 1 or 2 digit number
+     *             followed by a space.
      */
-    private void addNonBlankLine() throws ParserCancelledException {
-        if (parser.isCancelled()) {
-            throw new ParserCancelledException("File load is cancelled");
+    private int getCurrentLevelFromLineBuffer() throws GedcomParserException {
+        int level = -1;
+        if (Character.isDigit(lineBuffer[0])) {
+            if (Character.isDigit(lineBuffer[1])) {
+                if (lineBuffer[2] == ' ') {
+                    level = Character.getNumericValue(lineBuffer[0]) * 10 + Character.getNumericValue(lineBuffer[1]);
+
+                } else {
+                    /*
+                     * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't
+                     * put in CONC's on the fly (because we don't know what level we're at)
+                     */
+                    throw new GedcomParserException(
+                            "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + "Can't split automatically.");
+                }
+            } else {
+                if (lineBuffer[1] == ' ') {
+                    level = Character.getNumericValue(lineBuffer[0]);
+                } else {
+                    /*
+                     * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't
+                     * put in CONC's on the fly (because we don't know what level we're at)
+                     */
+                    throw new GedcomParserException(
+                            "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + "Can't split automatically.");
+                }
+            }
+        } else {
+            /*
+             * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't put in
+             * CONC's on the fly (because we don't know what level we're at)
+             */
+            throw new GedcomParserException(
+                    "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. Can't split automatically.");
         }
+        return level;
+    }
+
+    /**
+     * Get the current line buffer's contents
+     * 
+     * @return the current line buffer's contents
+     */
+    private String getThisLine() {
+        String result = null;
         if (lineBufferIdx > 0) {
             String s = new String(lineBuffer).substring(0, lineBufferIdx);
-            if (STRINGS_TO_INTERN.contains(s)) {
-                result.add(s.intern());
-            } else {
-                result.add(s);
+            result = anselHandler.toUtf16(s);
+            if (STRINGS_TO_INTERN.contains(result)) {
+                result = result.intern();
             }
         }
         linesRead++;
-        if (linesRead % parser.getReadNotificationRate() == 0) {
-            parser.notifyFileObservers(new FileProgressEvent(this, linesRead, false));
-        }
+        return result;
     }
 
     /**
-     * Force synthetic CONC tags for any line longer than 255 characters
+     * Insert synthetic CONC tags into the character buffer as if they had been there the whole time
      * 
-     * @throws ParserCancelledException
+     * @throws GedcomParserException if the level of the line currently in the buffer cannot be determined
      */
-    private void forceLineSplitIfNeeded() throws ParserCancelledException, GedcomParserException {
-        if (lineBufferIdx >= 255) {
-            int level = -1;
-            addNonBlankLine();
-            if (Character.isDigit(lineBuffer[0])) {
-                if (Character.isDigit(lineBuffer[1])) {
-                    if (lineBuffer[2] == ' ') {
-                        level = Character.getNumericValue(lineBuffer[0]) * 10 + Character.getNumericValue(lineBuffer[1]);
-
-                    } else {
-                        /*
-                         * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we
-                         * can't put in CONC's on the fly (because we don't know what level we're at)
-                         */
-                        throw new GedcomParserException(
-                                "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + "Can't split automatically.");
-                    }
-                } else {
-                    if (lineBuffer[1] == ' ') {
-                        level = Character.getNumericValue(lineBuffer[0]);
-                    } else {
-                        /*
-                         * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we
-                         * can't put in CONC's on the fly (because we don't know what level we're at)
-                         */
-                        throw new GedcomParserException(
-                                "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + "Can't split automatically.");
-                    }
-                }
-            } else {
-                /*
-                 * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't put in
-                 * CONC's on the fly (because we don't know what level we're at)
-                 */
-                throw new GedcomParserException(
-                        "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. Can't split automatically.");
-            }
-
-            lineBufferIdx = 0;
-            parser.warnings.add("Line " + linesRead + " exceeds 255 characters - introducing synthetic CONC tag to split line");
-            level++;
-            if (level > 9) {
-                lineBuffer[lineBufferIdx++] = Character.forDigit(level / 10, 10);
-                lineBuffer[lineBufferIdx++] = Character.forDigit(level % 10, 10);
-            } else {
-                lineBuffer[lineBufferIdx++] = Character.forDigit(level, 10);
-            }
-            lineBuffer[lineBufferIdx++] = ' ';
-            lineBuffer[lineBufferIdx++] = 'C';
-            lineBuffer[lineBufferIdx++] = 'O';
-            lineBuffer[lineBufferIdx++] = 'N';
-            lineBuffer[lineBufferIdx++] = 'C';
-            lineBuffer[lineBufferIdx++] = ' ';
+    private void insertSyntheticConcTag() throws GedcomParserException {
+        int level = getCurrentLevelFromLineBuffer();
+
+        lineBufferIdx = 0;
+        parser.warnings.add("Line " + linesRead + " exceeds 255 characters - introducing synthetic CONC tag to split line");
+        level++;
+        if (level > 9) {
+            lineBuffer[lineBufferIdx++] = Character.forDigit(level / 10, 10);
+            lineBuffer[lineBufferIdx++] = Character.forDigit(level % 10, 10);
+        } else {
+            lineBuffer[lineBufferIdx++] = Character.forDigit(level, 10);
         }
+        lineBuffer[lineBufferIdx++] = ' ';
+        lineBuffer[lineBufferIdx++] = 'C';
+        lineBuffer[lineBufferIdx++] = 'O';
+        lineBuffer[lineBufferIdx++] = 'N';
+        lineBuffer[lineBufferIdx++] = 'C';
+        lineBuffer[lineBufferIdx++] = ' ';
     }
 
 }