Skip to content

Commit

Permalink
Fix 'ui' diphthong problem for Latin syllabifier (#754)
Browse files Browse the repository at this point in the history
* added .cache to gitignore

* fixed macron issue and added unit tests

* add final newline

* added .cache to gitignore

* fixed ui problem

* Cui fix

* remove 2.7 files
  • Loading branch information
TylerKirby authored and kylepjohnson committed Apr 3, 2018
1 parent 308755a commit b80c0a4
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
2 changes: 2 additions & 0 deletions cltk/prosody/latin/ScansionConstants.py
Expand Up @@ -53,6 +53,8 @@ def __init__(self, unstressed="U", stressed="-", optional_terminal_ending="X", s
self.DIPTHONGS = ["ae", "au", "ei", "eu", "oe", "ui", "Ui", "uī",
# because the last vowel can be accented by position: potuisse
"Ae", "Au", "Ei", "Eu", "Oe"]
self.UI_EXCEPTIONS = { "cui": ["cui"], "Cui": ["Cui"], "hui": ["hui"], "Hui": ["Hui"], "huic": ["huic"],
"Huic": ["Huic"]}
self.stress_accent_dict = dict(zip(list(self.VOWELS + self.ACCENTED_VOWELS),
list(self.ACCENTED_VOWELS + self.ACCENTED_VOWELS)))
self.LIQUIDS = "lmnrLMNR"
Expand Down
28 changes: 27 additions & 1 deletion cltk/prosody/latin/Syllabifier.py
Expand Up @@ -31,13 +31,16 @@ def __init__(self, constants=ScansionConstants()):
self.kw_matcher = re.compile("[kK][w]")
self.ACCEPTABLE_CHARS = constants.ACCENTED_VOWELS + constants.VOWELS + ' ' \
+ constants.CONSONANTS
self.diphthongs = [d for d in constants.DIPTHONGS if d not in ["ui", "Ui", "uī"]]

def syllabify(self, words: str) -> list:
"""Parse a Latin word into a list of syllable strings.
:param words: a string containing one Latin word or many words separated by spaces.
:return: list of strings, each representing a syllable.
>>> syllabifier = Syllabifier()
>>> print(syllabifier.syllabify("fuit"))
['fu', 'it']
>>> print(syllabifier.syllabify("libri"))
['li', 'bri']
>>> print(syllabifier.syllabify("contra"))
Expand Down Expand Up @@ -93,6 +96,23 @@ def syllabify(self, words: str) -> list:
['lin', 'guā']
>>> print(syllabifier.syllabify("languidus"))
['lan', 'gui', 'dus']
>>> print(syllabifier.syllabify("suis"))
['su', 'is']
>>> print(syllabifier.syllabify("habui"))
['ha', 'bu', 'i']
>>> print(syllabifier.syllabify("habuit"))
['ha', 'bu', 'it']
>>> print(syllabifier.syllabify("qui"))
['qui']
>>> print(syllabifier.syllabify("quibus"))
['qui', 'bus']
>>> print(syllabifier.syllabify("hui"))
['hui']
>>> print(syllabifier.syllabify("cui"))
['cui']
>>> print(syllabifier.syllabify("huic"))
['huic']
"""
cleaned = words.translate(self.remove_punct_map)
cleaned = cleaned.replace("qu", "kw")
Expand All @@ -118,6 +138,7 @@ def syllabify(self, words: str) -> list:
cleaned = cleaned.replace("guū", "gwū")
cleaned = cleaned.replace("Guū", "Gwū")
items = cleaned.strip().split(" ")

for char in cleaned:
if not char in self.ACCEPTABLE_CHARS:
LOG.error("Unsupported character found in %s " % cleaned)
Expand Down Expand Up @@ -156,6 +177,10 @@ def _setup(self, word) -> list:
self._process(first) + self._process(rest))
# a word like pror can happen from elision
return StringUtils.remove_blank_spaces(self._process(word))
if word in self.constants.UI_EXCEPTIONS.keys():
return self.constants.UI_EXCEPTIONS[word]


return StringUtils.remove_blank_spaces(self._process(word))

def convert_consonantal_i(self, word) -> str:
Expand All @@ -179,7 +204,7 @@ def _process(self, word: str) -> list:
my_word = " " + word + " "
letters = list(my_word)
positions = []
for dipth in self.constants.DIPTHONGS:
for dipth in self.diphthongs:
if dipth in my_word:
dipth_matcher = re.compile("{}".format(dipth))
matches = dipth_matcher.finditer(my_word)
Expand Down Expand Up @@ -322,3 +347,4 @@ def get_syllable_count(self, syllables: list) -> int:
return len(StringUtils.remove_blank_spaces(
StringUtils.move_consonant_right(tmp_syllables,
self._find_solo_consonant(tmp_syllables))))

0 comments on commit b80c0a4

Please sign in to comment.