Permalink
Browse files

Added "hyphens" function to return the hyphen indexes. Added some missing files from previous commit.
  • Loading branch information...
1 parent 2c20b8f commit 93b2f0e3a3ff1d4a0f198c996c5f18fae8326245 @dedeibel committed Jun 12, 2012
View
3 README.txt
@@ -28,6 +28,9 @@ I've included a script called native-build.pl which might help to build
on at least Linux and Mac and 32 and 64 bit, for windows you
can take it as a hint of what to do.
+When building the sources on your own, make sure you have "gawk" installed if
+you run into related errors.
+
* Building the Java API
View
2 build.xml
@@ -171,7 +171,7 @@
<formatter type="plain" usefile="false" /> <!-- to screen -->
<batchtest fork="yes">
<fileset dir="test/src">
- <include name="**/*Test*.java"/>
+ <include name="**/*Test.java"/>
</fileset>
</batchtest>
</junit>
View
1 experiment/buildhyphenate.sh
@@ -1,4 +1,5 @@
echo "building hyphenate ..."
+# If "-lhyphen" is missing, make sure "native-src" has been built
gcc -std=gnu99 -Wall -o hyphenate hyphenate.c -I ../native-src/hyphen-2.8.3 -L ../native-src/hyphen-2.8.3/.libs -lhyphen -ggdb
echo "running hyphenate ..."
./hyphenate hyph_mini_de.dic danke
View
43 experiment/hyphenate.c
@@ -29,15 +29,16 @@ int main(int argc, char** argv) {
printf("So I shall hyphenate this: %s\n", word);
char hword[BUFSIZE];
- char *hyphens = (char *)malloc(word_length + 5);
- char ** rep;
- int * pos;
- int * cut;
-
hword[0] = '\0';
- rep = NULL;
- pos = NULL;
- cut = NULL;
+ char *hyphens = (char *)malloc(word_length + 1);
+
+ // Are these correct?
+ char **rep = (char **)malloc(word_length * sizeof(char*));
+ int * pos = (int*)malloc(word_length * sizeof(int));
+ int * cut = (int*)malloc(word_length * sizeof(int));
+ memset(rep, 0, word_length * sizeof(char*));
+ memset(pos, 0, word_length * sizeof(int));
+ memset(cut, 0, word_length * sizeof(int));
if (hnj_hyphen_hyphenate2(dict, word, word_length, hyphens, hword, &rep, &pos, &cut)) {
free(hyphens);
@@ -47,7 +48,7 @@ int main(int argc, char** argv) {
}
printf("Hyphens: ");
- for (int i = 0; i < word_length; ++i) {
+ for (int i = 0; i < word_length + 1; ++i) {
if (hyphens[i] & 1) {
printf(".-");
}
@@ -58,7 +59,29 @@ int main(int argc, char** argv) {
printf("\n");
printf("Hyphenated word: %s\n", hword);
- // TODO rep and cut etc not set, why?
+ // rep and cut etc. are not set, why? - "Schifffahrt" from the header documentation was used ...
+ printf("rep: ");
+ for (int i = 0; i < word_length; ++i) {
+ if (rep[i] != NULL) {
+ printf("%s", rep[i]);
+ }
+ else {
+ printf("_");
+ }
+ }
+ printf("\n");
+
+ printf("pos: ");
+ for (int i = 0; i < word_length; ++i) {
+ printf("%d", pos[i]);
+ }
+ printf("\n");
+
+ printf("cut: ");
+ for (int i = 0; i < word_length; ++i) {
+ printf("%d", cut[i]);
+ }
+ printf("\n");
free(hyphens);
free(word);
View
170 src/name/benjaminpeter/hyphen/Dictionary.java
@@ -10,6 +10,9 @@
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
import com.sun.jna.Pointer;
import com.sun.jna.ptr.PointerByReference;
@@ -18,8 +21,10 @@
* Class representing a single dictionary.
*/
public class Dictionary {
+
/**
- * The pointer to the hunspell object as returned by the hunspell constructor.
+ * The pointer to the hunspell object as returned by the hunspell
+ * constructor.
*/
private Pointer hunspellDict = null;
@@ -34,56 +39,56 @@
* Creates an instance of the dictionary.
*
* @param hunspellLibrary
- * The hunspell native library inside of class Hyphen
+ * The hunspell native library inside of class Hyphen
* @param baseFileName
- * the base name of the dictionary,
+ * the base name of the dictionary,
* @throws IOException
- * If the dictionary file could not be read
+ * If the dictionary file could not be read
*/
Dictionary(final HyphenLibrary hunspellLibrary, final String baseFileName)
throws IOException {
this.hunspellLibrary = hunspellLibrary;
File dic = new File(baseFileName);
if (!dic.canRead()) {
- throw new FileNotFoundException("The dictionary files " + baseFileName
- + " could not be read");
+ throw new FileNotFoundException("The dictionary files "
+ + baseFileName + " could not be read");
}
hunspellDict = hunspellLibrary.hnj_hyphen_load(dic.toString());
encoding = determineEncoding(dic);
}
private String determineEncoding(final File dic) throws IOException {
- InputStream fis = null;
- InputStreamReader is = null;
- BufferedReader br = null;
-
- try {
- fis = new FileInputStream(dic);
- is = new InputStreamReader(fis);
- br = new BufferedReader(is);
- String line;
- if ((line = br.readLine()) != null) {
- try {
- return Charset.forName(line).name();
- } catch (UnsupportedCharsetException e) {
- System.err.println("Could not determine dic encoding by first line: '"
- + line + "' using latin1.");
- }
- }
- }
- finally {
- if (br != null) {
- br.close();
- }
- if (is != null) {
- is.close();
- }
- if (fis != null) {
- fis.close();
- }
- }
+ InputStream fis = null;
+ InputStreamReader is = null;
+ BufferedReader br = null;
+
+ try {
+ fis = new FileInputStream(dic);
+ is = new InputStreamReader(fis);
+ br = new BufferedReader(is);
+ String line;
+ if ((line = br.readLine()) != null) {
+ try {
+ return Charset.forName(line).name();
+ } catch (UnsupportedCharsetException e) {
+ System.err
+ .println("Could not determine dic encoding by first line: '"
+ + line + "' using latin1.");
+ }
+ }
+ } finally {
+ if (br != null) {
+ br.close();
+ }
+ if (is != null) {
+ is.close();
+ }
+ if (fis != null) {
+ fis.close();
+ }
+ }
return "ISO-8859-1";
}
@@ -98,31 +103,32 @@ public void destroy() {
}
/**
- * Check if a word is spelled correctly
+ * Hyphenate the word. The resulting word has "=" entered where hyphenation
+ * is allowed.
*
* @param word
- * The word to check.
+ * The word to hyphenate.
* @throws HyphenationException
- * Is returned if the hyphenation fails. It is thrown when the C
- * library does not return zero.
+ * Thrown if the hyphenation fails, e.g. when the C library does not
+ * return zero.
*/
public String hyphenate(final String word) throws HyphenationException {
- PointerByReference rep = new PointerByReference();
- PointerByReference pos = new PointerByReference();
- PointerByReference cut = new PointerByReference();
-
try {
/*
- * Case must be converted to lower case before hyphenation. The encoding
- * must also match the dictionary's encoding. And finally we need to
- * create a null terimated C-String.
+ * Case must be converted to lower case before hyphenation. The
+ * encoding must also match the dictionary's encoding. And finally
+ * we need to create a null terminated C-String.
+ */
+ byte[] asciiWord = convertWordToCString(word, encoding);
+ byte[] hyphens = createHyphensBuffer(asciiWord.length);
+ byte[] hyphenated = createHyphenatedBuffer(asciiWord.length);
+ /*
+ * I didn't understand how the "complementary" thing works yet, so
+ * just pass in null for now.
*/
- byte[] asciiWord = stringToBytes(word.toLowerCase(), encoding);
- byte[] hyphens = new byte[asciiWord.length + 1];
- byte[] hyphenated = new byte[asciiWord.length * 2];
-
int success = hunspellLibrary.hnj_hyphen_hyphenate2(hunspellDict,
- asciiWord, asciiWord.length, hyphens, hyphenated, rep, pos, cut);
+ asciiWord, asciiWord.length, hyphens, hyphenated,
+ newPointerRef(), newPointerRef(), newPointerRef());
if (success != 0) {
throw new HyphenationException(
"Hyphenation failed, please check input encoding and stderr output.");
@@ -136,9 +142,67 @@ public String hyphenate(final String word) throws HyphenationException {
}
}
+ /**
+  * Determines the positions of a word at which a hyphen may be inserted.
+  *
+  * The word is lower-cased and converted to the dictionary's encoding
+  * before it is passed to the native hyphenation routine. Only the hyphen
+  * markers are requested; no hyphenated string is produced (null is passed
+  * for that output buffer).
+  *
+  * @param word
+  *            The word to hyphenate.
+  * @return A collection of the indexes of the letters after which a hyphen
+  *         may be inserted. For example "dan=ke" would contain { 2 }.
+  *         NOTE(review): the indexes are positions in the encoded byte
+  *         array; for multi-byte encodings they presumably differ from
+  *         String character indexes - confirm.
+  * @throws HyphenationException
+  *             If the C library does not return zero, or if the
+  *             dictionary's encoding is not supported by this system.
+  */
+ public Collection<Integer> hyphens(final String word)
+         throws HyphenationException {
+     try {
+         /*
+          * The native side works on the lower-cased word in the
+          * dictionary's encoding, so convert before calling it.
+          */
+         byte[] encodedWord = convertWordToCString(word, encoding);
+         byte[] marks = createHyphensBuffer(encodedWord.length);
+
+         /*
+          * The "complementary" output parameters (rep, pos, cut) are not
+          * evaluated here; they only receive fresh NULL references.
+          */
+         PointerByReference rep = newPointerRef();
+         PointerByReference pos = newPointerRef();
+         PointerByReference cut = newPointerRef();
+
+         int status = hunspellLibrary.hnj_hyphen_hyphenate2(hunspellDict,
+                 encodedWord, encodedWord.length, marks, null, rep, pos, cut);
+         if (status != 0) {
+             throw new HyphenationException(
+                     "Hyphenation failed, please check input encoding and stderr output.");
+         }
+
+         // An entry with its lowest bit set marks a hyphenation point.
+         List<Integer> positions = new LinkedList<Integer>();
+         final int limit = encodedWord.length + 1;
+         int index = 0;
+         while (index < limit) {
+             if ((marks[index] & 1) != 0) {
+                 positions.add(index);
+             }
+             index++;
+         }
+         return positions;
+     } catch (UnsupportedEncodingException e) {
+         throw new HyphenationException(
+                 "Hyphenation failed, please check system available encodings.");
+     }
+ }
+
+ private byte[] createHyphenatedBuffer(int length) {
+ return new byte[length * 2];
+ }
+
+ private byte[] createHyphensBuffer(int length) {
+ return new byte[length + 1];
+ }
+
+ private byte[] convertWordToCString(final String word, String encoding)
+ throws UnsupportedEncodingException {
+ return stringToBytes(word.toLowerCase(), encoding);
+ }
+
+ private PointerByReference newPointerRef() {
+ return new PointerByReference(Pointer.NULL);
+ }
+
/*
- * Determine the size of the string, byte is expected to be a zero terminated
- * C-string.
+ * Determine the size of the string, byte is expected to be a zero
+ * terminated C-string.
*/
private int strlen(final byte[] hyphenated) {
int i = 0;
View
40 src/name/benjaminpeter/hyphen/Hyphen.java
@@ -42,7 +42,7 @@ public static Hyphen getInstance() throws UnsatisfiedLinkError,
* directory specified.
*
* @param libDir
- * Optional absolute directory where the native lib can be found.
+ * Optional absolute directory where the native lib can be found.
*/
public static Hyphen getInstance(final String libDir)
throws UnsatisfiedLinkError, UnsupportedOperationException {
@@ -64,14 +64,14 @@ protected void tryLoad(final String libFile)
* Constructor for the library, loads the native lib.
*
* Loading is done in the first of the following three ways that works: 1)
- * Unmodified load in the provided directory. 2) libFile stripped back to the
- * base name (^lib(.*)\.so on unix) 3) The library is searched for in the
- * classpath, extracted to disk and loaded.
+ * Unmodified load in the provided directory. 2) libFile stripped back to
+ * the base name (^lib(.*)\.so on unix) 3) The library is searched for in
+ * the classpath, extracted to disk and loaded.
*
* @param libDir
- * Optional absolute directory where the native lib can be found.
+ * Optional absolute directory where the native lib can be found.
* @throws UnsupportedOperationException
- * if the OS or architecture is simply not supported.
+ * if the OS or architecture is simply not supported.
*/
protected Hyphen(final String libDir) throws UnsatisfiedLinkError,
UnsupportedOperationException {
@@ -82,7 +82,8 @@ protected Hyphen(final String libDir) throws UnsatisfiedLinkError,
HyphenLibrary.class);
} catch (UnsatisfiedLinkError urgh) {
- // Oh dear, the library was not found in the file system, let's try the
+ // Oh dear, the library was not found in the file system, let's try
+ // the
// classpath
libFile = libName();
InputStream is = Hyphen.class.getResourceAsStream("/" + libFile);
@@ -105,7 +106,8 @@ protected Hyphen(final String libDir) throws UnsatisfiedLinkError,
}
} catch (IOException e) {
- throw new Error("Failed to create temporary file for " + libFile, e);
+ throw new Error("Failed to create temporary file for "
+ + libFile, e);
} finally {
try {
is.close();
@@ -130,8 +132,8 @@ public String getLibFile() {
/**
* Calculate the filename of the native hunspell lib. The files have
- * completely different names to allow them to live in the same directory and
- * avoid confusion.
+ * completely different names to allow them to live in the same directory
+ * and avoid confusion.
*/
public static String libName() throws UnsupportedOperationException {
String os = System.getProperty("os.name").toLowerCase();
@@ -182,7 +184,7 @@ public static String libNameBare() throws UnsupportedOperationException {
return "hyphen-linux-x86-64";
}
- } else if (os.startsWith("sunos")) {
+ // } else if (os.startsWith("sunos")) {
// if (arch.equals("sparc")) {
// return "hyphen-sunos-sparc-64";
// }
@@ -201,14 +203,18 @@ public static String libNameBare() throws UnsupportedOperationException {
* Gets an instance of the dictionary.
*
* @param baseFileName
- * the base name of the dictionary, passing /dict/da_DK means that
- * the files /dict/da_DK.dic and /dict/da_DK.aff get loaded
+ * the base name of the dictionary, passing /dict/da_DK means
+ * that the files /dict/da_DK.dic and /dict/da_DK.aff get loaded
* @throws IOException
- * If the dictionary file could not be read
+ * If the dictionary file could not be read
*/
- public Dictionary getDictionary(final String baseFileName) throws IOException {
+ public Dictionary getDictionary(final String baseFileName)
+ throws IOException {
- /* TODO: Detect if the dictionary files have changed and reload if they have */
+ /*
+ * TODO: Detect if the dictionary files have changed and reload if they
+ * have
+ */
if (map.containsKey(baseFileName)) {
return map.get(baseFileName);
} else {
@@ -222,7 +228,7 @@ public Dictionary getDictionary(final String baseFileName) throws IOException {
* Removes a dictionary from the internal cache
*
* @param baseFileName
- * the base name of the dictionary, as passed to getDictionary()
+ * the base name of the dictionary, as passed to getDictionary()
*/
public void destroyDictionary(final String baseFileName) {
if (map.containsKey(baseFileName)) {
View
4 src/name/benjaminpeter/hyphen/HyphenLibrary.java
@@ -16,7 +16,7 @@
* Create the hyphen lib instance
*
* @param fn
- * The hyphenation file path
+ * The hyphenation file path
* @return The hyphen library object
*/
public Pointer hnj_hyphen_load(String fn);
@@ -25,7 +25,7 @@
* Free the hyphen lib
*
* @param dict
- * The hyphen library object returned by Hyphen_load
+ * The hyphen library object returned by Hyphen_load
*/
public void hnj_hyphen_free(Pointer dict);
View
3 src/name/benjaminpeter/hyphen/HyphenMain.java
@@ -19,7 +19,8 @@ public static void main(final String[] args) {
} else if (args.length == 2) {
final String dict = args[0];
final String word = args[1];
- System.err.println("Loading Hyphen, dict: " + dict + " word: " + word);
+ System.err.println("Loading Hyphen, dict: " + dict + " word: "
+ + word);
Dictionary d = Hyphen.getInstance().getDictionary(dict);
System.err.println("Hyphen library and dictionary loaded");
View
4 test/resources/hyph_mini_de.dic
@@ -1,4 +0,0 @@
-ISO8859-1
-dan1ke
-ver1si1che1rung
-müh1le
View
35 test/src/name/benjaminpeter/hyphen/DictionaryTest.java
@@ -0,0 +1,35 @@
+package name.benjaminpeter.hyphen;
+
+import static org.junit.Assert.*;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Test;
+
+/**
+ * Tests for Dictionary.hyphens(String), run against the Latin-1 test
+ * dictionary.
+ */
+public class DictionaryTest {
+
+    /**
+     * "danke" has exactly one hyphenation point, after the letter at
+     * index 2.
+     */
+    @Test
+    public void testHypensSimple() throws UnsatisfiedLinkError,
+            UnsupportedOperationException, IOException, HyphenationException {
+        final Hyphen hyphen = Hyphen.getInstance();
+        final Dictionary dictionary = hyphen
+                .getDictionary(TestConstants.DIC_PATH_LATIN1);
+
+        final Collection<Integer> positions = dictionary.hyphens("danke");
+
+        assertNotNull(positions);
+        assertEquals(1, positions.size());
+        assertEquals(Integer.valueOf(2), positions.iterator().next());
+    }
+
+    /**
+     * "Versicherung" has three hyphenation points: after the letters at
+     * indexes 2, 4 and 7.
+     */
+    @Test
+    public void testHypensMultiple() throws UnsatisfiedLinkError,
+            UnsupportedOperationException, IOException, HyphenationException {
+        final Hyphen hyphen = Hyphen.getInstance();
+        final Dictionary dictionary = hyphen
+                .getDictionary(TestConstants.DIC_PATH_LATIN1);
+
+        final Collection<Integer> positions = dictionary
+                .hyphens("Versicherung");
+
+        assertNotNull(positions);
+        assertEquals(3, positions.size());
+        // The word is hyphenated a second time for the content comparison.
+        final Collection<Integer> expected = Arrays.asList(2, 4, 7);
+        assertTrue(expected.equals(dictionary.hyphens("Versicherung")));
+    }
+
+}
View
16 test/src/name/benjaminpeter/hyphen/HyphenTest.java
@@ -7,12 +7,6 @@
public class HyphenTest {
- private static final String DIC_PATH_LATIN1 = "test/resources/hyph_mini_de_iso.dic";
-
- private static final String DIC_PATH_UTF8 = "test/resources/hyph_mini_de_utf8.dic";
-
- private static final String DIC_PATH_DEFAULT = "test/resources/hyph_mini_de_none.dic";
-
@Test
public void testInstance() {
Hyphen firstInstance = Hyphen.getInstance();
@@ -24,37 +18,37 @@ public void testInstance() {
@Test
public void testGetDictionary() throws HyphenationException, IOException {
Hyphen hyphen = Hyphen.getInstance();
- Dictionary dic = hyphen.getDictionary(DIC_PATH_LATIN1);
+ Dictionary dic = hyphen.getDictionary(TestConstants.DIC_PATH_LATIN1);
assertNotNull(dic);
}
@Test
public void testHyphenateSimple() throws HyphenationException,
UnsatisfiedLinkError, UnsupportedOperationException, IOException {
- Dictionary dic = Hyphen.getInstance().getDictionary(DIC_PATH_LATIN1);
+ Dictionary dic = Hyphen.getInstance().getDictionary(TestConstants.DIC_PATH_LATIN1);
assertEquals("dan=ke", dic.hyphenate("danke"));
assertEquals("ver=si=che=rung", dic.hyphenate("Versicherung"));
}
@Test
public void testHyphenateUmlaut() throws HyphenationException,
UnsatisfiedLinkError, UnsupportedOperationException, IOException {
- Dictionary dic = Hyphen.getInstance().getDictionary(DIC_PATH_LATIN1);
+ Dictionary dic = Hyphen.getInstance().getDictionary(TestConstants.DIC_PATH_LATIN1);
assertEquals("müh=le", dic.hyphenate("Mühle"));
}
@Test
public void testHyphenateUmlautUTF8Dict() throws HyphenationException,
UnsatisfiedLinkError, UnsupportedOperationException, IOException {
- Dictionary dic = Hyphen.getInstance().getDictionary(DIC_PATH_UTF8);
+ Dictionary dic = Hyphen.getInstance().getDictionary(TestConstants.DIC_PATH_UTF8);
assertEquals("müh=le", dic.hyphenate("Mühle"));
}
@Test
public void testHyphenateFallbackEncLatinDict()
throws HyphenationException, UnsatisfiedLinkError,
UnsupportedOperationException, IOException {
- Dictionary dic = Hyphen.getInstance().getDictionary(DIC_PATH_DEFAULT);
+ Dictionary dic = Hyphen.getInstance().getDictionary(TestConstants.DIC_PATH_DEFAULT);
assertEquals("müh=le", dic.hyphenate("Mühle"));
}
}
View
9 test/src/name/benjaminpeter/hyphen/TestConstants.java
@@ -0,0 +1,9 @@
+package name.benjaminpeter.hyphen;
+
+public class TestConstants {
+
+ static final String DIC_PATH_LATIN1 = "test/resources/hyph_mini_de_iso.dic";
+ static final String DIC_PATH_UTF8 = "test/resources/hyph_mini_de_utf8.dic";
+ static final String DIC_PATH_DEFAULT = "test/resources/hyph_mini_de_none.dic";
+
+}

0 comments on commit 93b2f0e

Please sign in to comment.