(Latin Tokenizer) Add flexibility and finetuning power over enclitics handling #972

Merged: 5 commits, Mar 11, 2020
39 changes: 32 additions & 7 deletions cltk/tokenize/latin/word.py
@@ -5,7 +5,7 @@
__license__ = 'MIT License.'

import re
from typing import List
from typing import List, Tuple

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

@@ -30,11 +30,20 @@ def __init__(self):
self.sent_tokenizer = PunktSentenceTokenizer(self.punkt_param)
self.word_tokenizer = LatinLanguageVars()

def tokenize(self, text:str) ->List[str]:
def tokenize(self, text: str,
replacements: List[Tuple[str, str]] = REPLACEMENTS,
enclitics_exceptions: List[str] = EXCEPTIONS,
enclitics: List[str] = ENCLITICS
) -> List[str]:
"""
Tokenizer divides the text into a list of substrings

:param text: The string to be tokenized
:param replacements: List of replacements to apply to tokens such as "mecum" -> ["cum", "me"]
:param enclitics_exceptions: List of words that look like they end with an enclitic but do not
:param enclitics: List of enclitics to check for in tokenization

:returns: A list of substrings extracted from the text

>>> toker = WordTokenizer()
@@ -51,6 +60,22 @@ def tokenize(self, text:str) ->List[str]:
>>> toker.tokenize('Dic si audes mihi, bellan videtur specie mulier?')
['Dic', 'si', 'audes', 'mihi', ',', 'bella', '-ne', 'videtur', 'specie', 'mulier', '?']

>>> toker.tokenize("mecum")
['cum', 'me']

You can control how replacements are applied by passing the replacements parameter:

>>> toker.tokenize("mecum", replacements=[(r"mecum", "me cum")])
['me', 'cum']

Or change the enclitics and the enclitics exceptions:
>>> toker.tokenize("atque haec abuterque puerve paterne nihil", enclitics=["que"])
['atque', 'haec', 'abuter', '-que', 'puerve', 'paterne', 'nihil']

>>> toker.tokenize("atque haec abuterque puerve paterne nihil", enclitics=["que", "ve", "ne"],
... enclitics_exceptions=('paterne', 'atque'))
['atque', 'haec', 'abuter', '-que', 'puer', '-ve', 'paterne', 'nihil']

"""

def matchcase(word):
@@ -68,19 +93,19 @@ def replace(matching):

return replace

for replacement in REPLACEMENTS:
for replacement in replacements:
text = re.sub(replacement[0], matchcase(replacement[1]), text, flags=re.IGNORECASE)

sents = self.sent_tokenizer.tokenize(text)
tokens = [] # type: List[str]
tokens = [] # type: List[str]

for sent in sents:
temp_tokens = self.word_tokenizer.word_tokenize(sent)
# Need to check that tokens exist before handling them;
# needed to make stream.readlines work in PlaintextCorpusReader
if temp_tokens:
if temp_tokens[0].endswith('ne'):
if temp_tokens[0].lower() not in WordTokenizer.EXCEPTIONS:
if temp_tokens[0].lower() not in enclitics_exceptions:
temp = [temp_tokens[0][:-2], '-ne']
temp_tokens = temp + temp_tokens[1:]
if temp_tokens[-1].endswith('.'):
@@ -96,8 +121,8 @@ def replace(matching):

for token in tokens:
is_enclitic = False
if token.lower() not in WordTokenizer.EXCEPTIONS:
for enclitic in WordTokenizer.ENCLITICS:
if token.lower() not in enclitics_exceptions:
for enclitic in enclitics:
if token.endswith(enclitic):
if enclitic == 'n':
specific_tokens += [token[:-len(enclitic)]] + ['-ne']
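Taken together, the new keyword arguments make replacement and enclitic handling configurable per call. The short standalone sketch below shows how the tokenizer could be driven with custom lists; the import path is only an assumption based on the file modified in this diff (cltk/tokenize/latin/word.py), and the expected outputs are copied from the doctests above.

# Minimal usage sketch of the keyword arguments added in this PR.
# Assumption: the import path mirrors the modified file; adjust it if the
# installed cltk version exposes WordTokenizer elsewhere.
from cltk.tokenize.latin.word import WordTokenizer

toker = WordTokenizer()

# Default behaviour: the built-in REPLACEMENTS table splits "mecum" into ["cum", "me"].
print(toker.tokenize("mecum"))
# ['cum', 'me']

# A custom replacements list changes how the split is made.
print(toker.tokenize("mecum", replacements=[(r"mecum", "me cum")]))
# ['me', 'cum']

# Restricting enclitics to "que" leaves "puerve" and "paterne" untouched.
print(toker.tokenize("atque haec abuterque puerve paterne nihil",
                     enclitics=["que"]))
# ['atque', 'haec', 'abuter', '-que', 'puerve', 'paterne', 'nihil']

# Adding "paterne" to the exceptions keeps it whole even with "ne" enabled.
print(toker.tokenize("atque haec abuterque puerve paterne nihil",
                     enclitics=["que", "ve", "ne"],
                     enclitics_exceptions=('paterne', 'atque')))
# ['atque', 'haec', 'abuter', '-que', 'puer', '-ve', 'paterne', 'nihil']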