Skip to content

Commit

Permalink
Issue #106 - First chop at AnselReader
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt Harrah (frizbog) committed Jul 2, 2016
1 parent 44ce70d commit 3dfa8f5
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 153 deletions.
Expand Up @@ -26,7 +26,7 @@
*
* @author frizbog1
*/
public class UnsupportedGedcomCharsetException extends Exception {
public class UnsupportedGedcomCharsetException extends GedcomParserException {

/**
* Serial Version UID
Expand Down
98 changes: 49 additions & 49 deletions src/main/java/org/gedcom4j/io/encoding/AnselHandler.java
Expand Up @@ -178,54 +178,6 @@ public List<String> toAnselLines(List<String> utf16Lines) {
return result;
}

/**
 * Converts a file (list) of ANSEL lines into UTF-16 lines.
 * <p>
 * When a CONC line follows a line that ends in one or two combining diacriticals, those diacriticals are moved
 * from the end of the previous line to the start of the CONC line's value, so that they stay with the character
 * they modify before each line is translated.
 * </p>
 * <p>
 * CONC lines are recognised with either a one-digit or a two-digit level number. A CONC line that has no value
 * at all (for example <code>1 CONC</code>) gets a delimiting space appended before the diacriticals are inserted,
 * rather than failing with an index error.
 * </p>
 *
 * @param anselLines
 *            a list of strings, each character of which represents an unconverted ANSEL byte
 * @return a list of UTF16 strings
 */
public List<String> toUtf16Lines(List<String> anselLines) {
    List<String> result = new ArrayList<String>();
    String prevAnsel = null;
    for (String ansel : anselLines) {
        /*
         * Locate the CONC tag: offset 2 after a one-digit level, offset 3 after a two-digit level followed by a
         * space. startsWith(prefix, offset) simply returns false when the line is too short.
         */
        int tagStart = -1;
        if (ansel.startsWith("CONC", 2)) {
            tagStart = 2;
        } else if (ansel.startsWith("CONC", 3) && Character.isDigit(ansel.charAt(0)) && Character.isDigit(ansel.charAt(1))
                && ansel.charAt(2) == ' ') {
            tagStart = 3;
        }

        /*
         * If concatenating from the previous line, need to see if the last character on previous line is a
         * diacritical mark modifying the beginning of this line
         */
        if (prevAnsel != null && tagStart >= 0 && endsWithDiacritical(prevAnsel)) {
            // Remove the last line we just added - need to adjust it and re-add it - not terribly efficient, but
            // simpler to code
            result.remove(result.size() - 1);

            // Strip the trailing combining diacritical off previous line
            char d1 = prevAnsel.charAt(prevAnsel.length() - 1);
            prevAnsel = prevAnsel.substring(0, prevAnsel.length() - 1);
            char d2 = 0;
            if (endsWithDiacritical(prevAnsel)) {
                // There was a second diacritical at the end of the line
                d2 = prevAnsel.charAt(prevAnsel.length() - 1);
                prevAnsel = prevAnsel.substring(0, prevAnsel.length() - 1);
            }
            // Re-add the line with the diacriticals removed
            result.add(toUtf16(prevAnsel));

            // The value starts after the tag and its delimiting space ("CONC" is 4 chars + 1 space)
            int valueStart = tagStart + 5;
            String head;
            String tail;
            if (ansel.length() >= valueStart) {
                head = ansel.substring(0, valueStart);
                tail = ansel.substring(valueStart);
            } else {
                // Line ends right after the tag - supply the delimiter so the diacriticals form the value
                head = ansel + ' ';
                tail = "";
            }

            // Insert the diacriticals on the current line so they stay with the character being modified
            StringBuilder adjusted = new StringBuilder(head);
            if (d2 != 0) {
                adjusted.append(d2);
            }
            adjusted.append(d1).append(tail);
            ansel = adjusted.toString();

            // And translate/add it
            result.add(toUtf16(ansel));
        } else {
            // Simpler case - just translate current line
            result.add(toUtf16(ansel));
        }
        // Remember the (possibly adjusted) line: it may itself end in a diacritical for the next CONC line
        prevAnsel = ansel;
    }
    return result;
}

/**
* Convert a string of ANSEL bytes to UTF-16
*
Expand All @@ -234,7 +186,7 @@ public List<String> toUtf16Lines(List<String> anselLines) {
* string, unconverted to any unicode and without changing the order of characters.
* @return the UTF16 string representation of the ANSEL data, after translation
*/
String toUtf16(String ansel) {
public String toUtf16(String ansel) {
char[] utf16 = new char[512];
int anselIndex = 0;
int utfIdx = 0;
Expand Down Expand Up @@ -291,6 +243,54 @@ String toUtf16(String ansel) {
return s;
}

/**
 * Converts a file (list) of ANSEL lines into UTF-16 lines.
 * <p>
 * When a CONC line follows a line that ends in one or two combining diacriticals, those diacriticals are moved
 * from the end of the previous line to the start of the CONC line's value, so that they stay with the character
 * they modify before each line is translated.
 * </p>
 * <p>
 * CONC lines are recognised with either a one-digit or a two-digit level number. A CONC line that has no value
 * at all (for example <code>1 CONC</code>) gets a delimiting space appended before the diacriticals are inserted,
 * rather than failing with an index error.
 * </p>
 *
 * @param anselLines
 *            a list of strings, each character of which represents an unconverted ANSEL byte
 * @return a list of UTF16 strings
 */
public List<String> toUtf16Lines(List<String> anselLines) {
    List<String> result = new ArrayList<String>();
    String prevAnsel = null;
    for (String ansel : anselLines) {
        /*
         * Locate the CONC tag: offset 2 after a one-digit level, offset 3 after a two-digit level followed by a
         * space. startsWith(prefix, offset) simply returns false when the line is too short.
         */
        int tagStart = -1;
        if (ansel.startsWith("CONC", 2)) {
            tagStart = 2;
        } else if (ansel.startsWith("CONC", 3) && Character.isDigit(ansel.charAt(0)) && Character.isDigit(ansel.charAt(1))
                && ansel.charAt(2) == ' ') {
            tagStart = 3;
        }

        /*
         * If concatenating from the previous line, need to see if the last character on previous line is a
         * diacritical mark modifying the beginning of this line
         */
        if (prevAnsel != null && tagStart >= 0 && endsWithDiacritical(prevAnsel)) {
            // Remove the last line we just added - need to adjust it and re-add it - not terribly efficient, but
            // simpler to code
            result.remove(result.size() - 1);

            // Strip the trailing combining diacritical off previous line
            char d1 = prevAnsel.charAt(prevAnsel.length() - 1);
            prevAnsel = prevAnsel.substring(0, prevAnsel.length() - 1);
            char d2 = 0;
            if (endsWithDiacritical(prevAnsel)) {
                // There was a second diacritical at the end of the line
                d2 = prevAnsel.charAt(prevAnsel.length() - 1);
                prevAnsel = prevAnsel.substring(0, prevAnsel.length() - 1);
            }
            // Re-add the line with the diacriticals removed
            result.add(toUtf16(prevAnsel));

            // The value starts after the tag and its delimiting space ("CONC" is 4 chars + 1 space)
            int valueStart = tagStart + 5;
            String head;
            String tail;
            if (ansel.length() >= valueStart) {
                head = ansel.substring(0, valueStart);
                tail = ansel.substring(valueStart);
            } else {
                // Line ends right after the tag - supply the delimiter so the diacriticals form the value
                head = ansel + ' ';
                tail = "";
            }

            // Insert the diacriticals on the current line so they stay with the character being modified
            StringBuilder adjusted = new StringBuilder(head);
            if (d2 != 0) {
                adjusted.append(d2);
            }
            adjusted.append(d1).append(tail);
            ansel = adjusted.toString();

            // And translate/add it
            result.add(toUtf16(ansel));
        } else {
            // Simpler case - just translate current line
            result.add(toUtf16(ansel));
        }
        // Remember the (possibly adjusted) line: it may itself end in a diacritical for the next CONC line
        prevAnsel = ansel;
    }
    return result;
}

/**
* Return true if ANSEL string ends in a combining diacritical
*
Expand Down
Expand Up @@ -85,13 +85,14 @@ public void cancel() {
}

/**
* Read all the lines using the appropriate encoding
* Get the next line of the file.
*
* @return all the lines of the input stream
* @return the next line of the file, or null if no more lines to read.
* @throws IOException
* if there is a problem reading the bytes
* if the file cannot be read
* @throws GedcomParserException
* if the file load is cancelled or fails
* if the file is malformed and cannot be parsed as a GEDCOM file for some reason
*/
protected abstract List<String> load() throws IOException, GedcomParserException;
public abstract String nextLine() throws IOException, GedcomParserException;

}
175 changes: 87 additions & 88 deletions src/main/java/org/gedcom4j/io/reader/AnselReader.java
Expand Up @@ -23,13 +23,9 @@

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.gedcom4j.exception.GedcomParserException;
import org.gedcom4j.exception.ParserCancelledException;
import org.gedcom4j.io.encoding.AnselHandler;
import org.gedcom4j.io.event.FileProgressEvent;
import org.gedcom4j.parser.GedcomParser;

/**
Expand Down Expand Up @@ -66,9 +62,9 @@ class AnselReader extends AbstractEncodingSpecificReader {
private final char[] lineBuffer = new char[256];

/**
* The resulting list of strings containing the file data
* Are we at the end of file yet?
*/
List<String> result = new ArrayList<String>();
private final boolean eof = false;

/**
* Constructor
Expand All @@ -83,137 +79,140 @@ protected AnselReader(GedcomParser parser, InputStream byteStream) {
super(parser, byteStream);
}

/**
* {@inheritDoc}
*/
@Override
protected List<String> load() throws IOException, GedcomParserException {

boolean eof = false;
public String nextLine() throws IOException, GedcomParserException {
if (eof) {
return null;
}
String result = null;
while (!eof) {
lastChar = currChar;
currChar = byteStream.read();

// Check for EOF
if (currChar < 0) {
addNonBlankLine();
result = getThisLine();
break;
}

// Check for carriage returns - signify EOL
if (currChar == 0x0D) {
addNonBlankLine();
result = getThisLine();
lineBufferIdx = 0;
continue;
break;
}

// Check for line feeds - signify EOL (unless prev char was a
// CR)
if (currChar == 0x0A) {
if (lastChar != 0x0D) {
addNonBlankLine();
result = getThisLine();
lineBufferIdx = 0;
}
continue;
break;
}

// All other characters are treated the same at this point,
// regardless of encoding, and added as is
lineBuffer[lineBufferIdx++] = (char) currChar;

forceLineSplitIfNeeded();
if (lineBufferIdx >= 255) {
result = getThisLine();
insertSyntheticConcTag();
}

continue;
}
result = anselHandler.toUtf16Lines(result);
parser.notifyFileObservers(new FileProgressEvent(this, linesRead, true));
return result;
}

/**
* Add line to result if it is not blank. Notify listeners of progress every 100 lines.
* Determine what level the current line in the line buffer is
*
* @throws ParserCancelledException
* if the file load is cancelled
* @return what level the current line in the line buffer is
* @throws GedcomParserException
* if the line level can't be determined, because the line doesn't begin with a 1 or 2 digit number
* followed by a space.
*/
private void addNonBlankLine() throws ParserCancelledException {
if (parser.isCancelled()) {
throw new ParserCancelledException("File load is cancelled");
/**
 * Work out the level number at the start of the line currently held in the line buffer.
 *
 * @return the leading one- or two-digit number, which must be immediately followed by a space
 * @throws GedcomParserException
 *             if the buffer does not begin with a 1 or 2 digit number followed by a space, in which case no
 *             CONC line can be synthesized because the level is unknown
 */
private int getCurrentLevelFromLineBuffer() throws GedcomParserException {
    final char first = lineBuffer[0];
    final char second = lineBuffer[1];

    if (Character.isDigit(first)) {
        // Single-digit level: digit, then the delimiting space
        if (second == ' ') {
            return Character.getNumericValue(first);
        }
        // Two-digit level: digit, digit, then the delimiting space
        if (Character.isDigit(second) && lineBuffer[2] == ' ') {
            return Character.getNumericValue(first) * 10 + Character.getNumericValue(second);
        }
    }

    /*
     * Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't put in
     * CONC's on the fly (because we don't know what level we're at)
     */
    throw new GedcomParserException(
            "Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. Can't split automatically.");
}

/**
* Get the current line buffer's contents
*
* @return the current line buffer's contents
*/
private String getThisLine() {
String result = null;
if (lineBufferIdx > 0) {
String s = new String(lineBuffer).substring(0, lineBufferIdx);
if (STRINGS_TO_INTERN.contains(s)) {
result.add(s.intern());
} else {
result.add(s);
result = anselHandler.toUtf16(s);
if (STRINGS_TO_INTERN.contains(result)) {
result = result.intern();
}
}
linesRead++;
if (linesRead % parser.getReadNotificationRate() == 0) {
parser.notifyFileObservers(new FileProgressEvent(this, linesRead, false));
}
return result;
}

/**
* Force synthetic CONC tags for any line longer than 255 characters
* Insert synthetic CONC tags into the character buffer as if they had been there the whole time
*
* @throws ParserCancelledException
* @throws GedcomParserException
*/
private void forceLineSplitIfNeeded() throws ParserCancelledException, GedcomParserException {
if (lineBufferIdx >= 255) {
int level = -1;
addNonBlankLine();
if (Character.isDigit(lineBuffer[0])) {
if (Character.isDigit(lineBuffer[1])) {
if (lineBuffer[2] == ' ') {
level = Character.getNumericValue(lineBuffer[0]) * 10 + Character.getNumericValue(lineBuffer[1]);

} else {
/*
* Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we
* can't put in CONC's on the fly (because we don't know what level we're at)
*/
throw new GedcomParserException(
"Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + "Can't split automatically.");
}
} else {
if (lineBuffer[1] == ' ') {
level = Character.getNumericValue(lineBuffer[0]);
} else {
/*
* Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we
* can't put in CONC's on the fly (because we don't know what level we're at)
*/
throw new GedcomParserException(
"Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. " + "Can't split automatically.");
}
}
} else {
/*
* Line is too long and doesn't begin with a 1 or 2 digit number followed by a space, so we can't put in
* CONC's on the fly (because we don't know what level we're at)
*/
throw new GedcomParserException(
"Line " + linesRead + " exceeds 255 characters and does not begin with a 1 or 2 digit number. Can't split automatically.");
}

lineBufferIdx = 0;
parser.warnings.add("Line " + linesRead + " exceeds 255 characters - introducing synthetic CONC tag to split line");
level++;
if (level > 9) {
lineBuffer[lineBufferIdx++] = Character.forDigit(level / 10, 10);
lineBuffer[lineBufferIdx++] = Character.forDigit(level % 10, 10);
} else {
lineBuffer[lineBufferIdx++] = Character.forDigit(level, 10);
}
lineBuffer[lineBufferIdx++] = ' ';
lineBuffer[lineBufferIdx++] = 'C';
lineBuffer[lineBufferIdx++] = 'O';
lineBuffer[lineBufferIdx++] = 'N';
lineBuffer[lineBufferIdx++] = 'C';
lineBuffer[lineBufferIdx++] = ' ';
/**
 * Restart the line buffer with a synthetic CONC prefix, one level deeper than the line currently in the buffer,
 * and record a parser warning about the split.
 *
 * @throws GedcomParserException
 *             if the level of the current line cannot be determined from the line buffer
 */
private void insertSyntheticConcTag() throws GedcomParserException {
    // Read the level before the buffer is overwritten; the continuation line sits one level below it
    final int concLevel = getCurrentLevelFromLineBuffer() + 1;

    lineBufferIdx = 0;
    parser.warnings.add("Line " + linesRead + " exceeds 255 characters - introducing synthetic CONC tag to split line");

    // Assemble "<level> CONC " and copy it to the front of the buffer
    final StringBuilder prefix = new StringBuilder(8);
    if (concLevel > 9) {
        prefix.append(Character.forDigit(concLevel / 10, 10));
    }
    prefix.append(Character.forDigit(concLevel % 10, 10));
    prefix.append(" CONC ");

    prefix.getChars(0, prefix.length(), lineBuffer, 0);
    lineBufferIdx = prefix.length();
}

}

0 comments on commit 3dfa8f5

Please sign in to comment.