Fix 'ui' diphthong problem for Latin syllabifier (#754)

* added .cache to gitignore
* fixed macron issue and added unit tests
* add final newline
* added .cache to gitignore
* fixed ui problem
* Cui fix
* remove 2.7 files
cltk · Apr 3, 2018 · b80c0a4 · b80c0a4
1 parent 308755a
commit b80c0a4
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 1 deletion.
diff --git a/cltk/prosody/latin/ScansionConstants.py b/cltk/prosody/latin/ScansionConstants.py
@@ -53,6 +53,8 @@ def __init__(self, unstressed="U", stressed="-", optional_terminal_ending="X", s
         self.DIPTHONGS = ["ae", "au", "ei", "eu", "oe", "ui", "Ui", "uī",
                           # because the last vowel can be accented by position: potuisse
                           "Ae", "Au", "Ei", "Eu", "Oe"]
+        self.UI_EXCEPTIONS = { "cui": ["cui"], "Cui": ["Cui"], "hui": ["hui"], "Hui": ["Hui"], "huic": ["huic"],
+                               "Huic": ["Huic"]}
         self.stress_accent_dict = dict(zip(list(self.VOWELS + self.ACCENTED_VOWELS),
                                            list(self.ACCENTED_VOWELS + self.ACCENTED_VOWELS)))
         self.LIQUIDS = "lmnrLMNR"

diff --git a/cltk/prosody/latin/Syllabifier.py b/cltk/prosody/latin/Syllabifier.py
@@ -31,13 +31,16 @@ def __init__(self, constants=ScansionConstants()):
         self.kw_matcher = re.compile("[kK][w]")
         self.ACCEPTABLE_CHARS = constants.ACCENTED_VOWELS + constants.VOWELS + ' ' \
                                 + constants.CONSONANTS
+        self.diphthongs = [d for d in constants.DIPTHONGS if d not in ["ui", "Ui", "uī"]]
 
     def syllabify(self, words: str) -> list:
         """Parse a Latin word into a list of syllable strings.
         :param words: a string containing one latin word or many words separated by spaces.
         :return: list of string, each representing a syllable.
 
         >>> syllabifier = Syllabifier()
+        >>> print(syllabifier.syllabify("fuit"))
+        ['fu', 'it']
         >>> print(syllabifier.syllabify("libri"))
         ['li', 'bri']
         >>> print(syllabifier.syllabify("contra"))
@@ -93,6 +96,23 @@ def syllabify(self, words: str) -> list:
         ['lin', 'guā']
         >>> print(syllabifier.syllabify("languidus"))
         ['lan', 'gui', 'dus']
+
+        >>> print(syllabifier.syllabify("suis"))
+        ['su', 'is']
+        >>> print(syllabifier.syllabify("habui"))
+        ['ha', 'bu', 'i']
+        >>> print(syllabifier.syllabify("habuit"))
+        ['ha', 'bu', 'it']
+        >>> print(syllabifier.syllabify("qui"))
+        ['qui']
+        >>> print(syllabifier.syllabify("quibus"))
+        ['qui', 'bus']
+        >>> print(syllabifier.syllabify("hui"))
+        ['hui']
+        >>> print(syllabifier.syllabify("cui"))
+        ['cui']
+        >>> print(syllabifier.syllabify("huic"))
+        ['huic']
         """
         cleaned = words.translate(self.remove_punct_map)
         cleaned = cleaned.replace("qu", "kw")
@@ -118,6 +138,7 @@ def syllabify(self, words: str) -> list:
         cleaned = cleaned.replace("guū", "gwū")
         cleaned = cleaned.replace("Guū", "Gwū")
         items = cleaned.strip().split(" ")
+
         for char in cleaned:
             if not char in self.ACCEPTABLE_CHARS:
                 LOG.error("Unsupported character found in %s " % cleaned)
@@ -156,6 +177,10 @@ def _setup(self, word) -> list:
                         self._process(first) + self._process(rest))
                 # a word like pror can happen from elision
                 return StringUtils.remove_blank_spaces(self._process(word))
+        if word in self.constants.UI_EXCEPTIONS.keys():
+            return self.constants.UI_EXCEPTIONS[word]
+
+
         return StringUtils.remove_blank_spaces(self._process(word))
 
     def convert_consonantal_i(self, word) -> str:
@@ -179,7 +204,7 @@ def _process(self, word: str) -> list:
         my_word = " " + word + " "
         letters = list(my_word)
         positions = []
-        for dipth in self.constants.DIPTHONGS:
+        for dipth in self.diphthongs:
             if dipth in my_word:
                 dipth_matcher = re.compile("{}".format(dipth))
                 matches = dipth_matcher.finditer(my_word)
@@ -322,3 +347,4 @@ def get_syllable_count(self, syllables: list) -> int:
         return len(StringUtils.remove_blank_spaces(
             StringUtils.move_consonant_right(tmp_syllables,
                                              self._find_solo_consonant(tmp_syllables))))
+