Uses the internal Levenshtein distance (#958)
* Uses the internal Levenshtein distance

* Fixed docs

Co-authored-by: Todd Cook <665389+todd-cook@users.noreply.github.com>
clemsciences and todd-cook committed Jun 7, 2020
1 parent 30eaf18 commit fe31ab7
Showing 8 changed files with 42 additions and 41 deletions.
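In outline, the commit swaps the third-party python-Levenshtein call for CLTK's own static method everywhere the scanners compute an edit distance. A minimal before/after sketch of the call-site change (the imports and the noctis/noctem example come from the diff below; the surrounding lines are illustrative only):

```python
# Before this commit: the external python-Levenshtein package supplied the function.
# from Levenshtein import distance
# edit_distance = distance("noctis", "noctem")

# After: the internal implementation is called as a static method.
from cltk.text_reuse.levenshtein import Levenshtein

edit_distance = Levenshtein.levenshtein_distance("noctis", "noctem")
print(edit_distance)  # 2, per the docstring examples in cltk/text_reuse/levenshtein.py
```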
6 changes: 3 additions & 3 deletions cltk/prosody/latin/hendecasyllable_scanner.py
@@ -7,7 +7,7 @@

import re

- from Levenshtein import distance
+ from cltk.text_reuse.levenshtein import Levenshtein

import cltk.prosody.latin.string_utils as string_utils
from cltk.prosody.latin.verse import Verse
@@ -110,7 +110,7 @@ def scan(self, original_line: str, optional_transform: bool = False) -> Verse:

smoothed = self.correct_invalid_start(verse.scansion)

- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["invalid start"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -120,7 +120,7 @@ def scan(self, original_line: str, optional_transform: bool = False) -> Verse:

smoothed = self.correct_antepenult_chain(verse.scansion)

- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["antepenult chain"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
12 changes: 6 additions & 6 deletions cltk/prosody/latin/hexameter_scanner.py
@@ -14,7 +14,7 @@

import re

- from Levenshtein import distance
+ from cltk.text_reuse.levenshtein import Levenshtein

from cltk.prosody.latin.verse import Verse
from cltk.prosody.latin.metrical_validator import MetricalValidator
@@ -187,7 +187,7 @@ def scan(self, original_line: str, optional_transform: bool = False,
return verse

smoothed = self.correct_inverted_amphibrachs(verse.scansion)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["inverted"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -196,7 +196,7 @@ def scan(self, original_line: str, optional_transform: bool = False,
return self.assign_candidate(verse, verse.scansion)

smoothed = self.correct_first_two_dactyls(verse.scansion)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["invalid start"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -205,7 +205,7 @@ def scan(self, original_line: str, optional_transform: bool = False,
return self.assign_candidate(verse, verse.scansion)

smoothed = self.correct_invalid_fifth_foot(verse.scansion)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["invalid 5th"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -236,7 +236,7 @@ def scan(self, original_line: str, optional_transform: bool = False,

# need to do this again, since the scansion has changed
smoothed = self.correct_inverted_amphibrachs(verse.scansion)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["inverted"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -264,7 +264,7 @@ def scan(self, original_line: str, optional_transform: bool = False,

if dactyl_smoothing:
smoothed = self.correct_dactyl_chain(smoothed)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["dactyl smoothing"]]
verse.scansion = smoothed
if self.metrical_validator.is_valid_hexameter(verse.scansion):
5 changes: 3 additions & 2 deletions cltk/prosody/latin/metrical_validator.py
@@ -6,7 +6,8 @@
from typing import List

from cltk.prosody.latin.scansion_constants import ScansionConstants
- from Levenshtein import distance
+ from cltk.text_reuse.levenshtein import Levenshtein


LOG = logging.getLogger(__name__)
LOG.addHandler(logging.NullHandler())
@@ -212,7 +213,7 @@ def _closest_patterns(self, patterns: List[str], scansion: str) -> List[str]:
pattern = pattern.replace(self.constants.FOOT_SEPARATOR, "")
ending = pattern[-1]
candidate = pattern[:len(pattern) - 1] + self.constants.OPTIONAL_ENDING
- cans = [(distance(candidate, x), x) for x in patterns
+ cans = [(Levenshtein.levenshtein_distance(candidate, x), x) for x in patterns
if len(x) == len(candidate)]
if cans:
cans = sorted(cans, key=lambda tup: tup[0])
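The _closest_patterns hunk above ranks candidate metrical patterns by their edit distance to a candidate scansion. A self-contained sketch of that selection step using the internal class; the pattern strings below are invented for illustration and are not CLTK's real pattern tables:

```python
from cltk.text_reuse.levenshtein import Levenshtein

# Hypothetical patterns and candidate; the real values come from
# MetricalValidator's pattern tables, not from this sketch.
patterns = ["-UU-UU-X", "----UU-X", "-UU----X"]
candidate = "-UU-UU-X"

# Mirrors the hunk above: score equal-length patterns by Levenshtein
# distance to the candidate, then sort so the closest come first.
cans = [(Levenshtein.levenshtein_distance(candidate, x), x)
        for x in patterns if len(x) == len(candidate)]
if cans:
    cans = sorted(cans, key=lambda tup: tup[0])
print(cans)  # [(0, '-UU-UU-X'), (2, '----UU-X'), (2, '-UU----X')]
```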
7 changes: 3 additions & 4 deletions cltk/prosody/latin/pentameter_scanner.py
@@ -7,8 +7,7 @@

import re

- from Levenshtein import distance
-
+ from cltk.text_reuse.levenshtein import Levenshtein
from cltk.prosody.latin.verse import Verse
from cltk.prosody.latin.metrical_validator import MetricalValidator
from cltk.prosody.latin.scansion_constants import ScansionConstants
@@ -127,7 +126,7 @@ def scan(self, original_line: str, optional_transform: bool = False) -> Verse:

smoothed = self.correct_first_two_dactyls(verse.scansion)

- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["invalid start"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -137,7 +136,7 @@ def scan(self, original_line: str, optional_transform: bool = False) -> Verse:

smoothed = self.correct_penultimate_dactyl_chain(verse.scansion)

- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["penultimate dactyl chain"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
3 changes: 2 additions & 1 deletion cltk/tests/test_nlp/test_scansion.py
@@ -9,7 +9,8 @@
from cltk.prosody.latin.hendecasyllable_scanner import HendecasyllableScanner
from cltk.prosody.latin.syllabifier import Syllabifier

- class TestScansionFunctions(unittest.TestCase): # pylint: disable=R0904
+
+ class TestScansionFunctions(unittest.TestCase):
"""Class for unittest"""

def test_hexameter_scanner(self):
4 changes: 2 additions & 2 deletions cltk/tests/test_nlp/test_text_reuse.py
@@ -55,13 +55,13 @@ def test_distance_ratio(self):
def test_levenshtein_distance(self):
"""Test for Levenshtein Distance between two words"""
l = Levenshtein()
- dist = l.Levenshtein_Distance("now grete glorious god through grace of himselven","and the precious prayer of his pris moder")
+ dist = l.levenshtein_distance("now grete glorious god through grace of himselven", "and the precious prayer of his pris moder")
self.assertEqual(dist, 36)

def test_damerau_levenshtein_distance(self):
"""Test for Damerau-Levenshtein Distance between two words"""
l = Levenshtein()
- dist = l.Damerau_Levenshtein_Distance("all haile whose solempne glorious concepcioun","fresche floure in quhom the hevinlie dewe doun fell")
+ dist = l.damerau_levenshtein_distance("all haile whose solempne glorious concepcioun", "fresche floure in quhom the hevinlie dewe doun fell")
self.assertEqual(dist,35)

# Test causing lemmatizer Travis build to fail—figure out what is wrong and restore.
40 changes: 20 additions & 20 deletions cltk/text_reuse/levenshtein.py
@@ -13,7 +13,7 @@ def __init__(self):
return

@staticmethod
- def Levenshtein_Distance(w1, w2):
+ def levenshtein_distance(w1, w2):
"""
Computes Levenshtein Distance between two words
@@ -24,36 +24,37 @@ def Levenshtein_Distance(w1, w2):
Examples:
- >>> Levenshtein.Levenshtein_Distance('noctis', 'noctem')
+ >>> Levenshtein.levenshtein_distance('noctis', 'noctem')
2
- >>> Levenshtein.Levenshtein_Distance('nox', 'nochem')
+ >>> Levenshtein.levenshtein_distance('nox', 'nochem')
4
- >>> Levenshtein.Levenshtein_Distance('orbis', 'robis')
+ >>> Levenshtein.levenshtein_distance('orbis', 'robis')
2
"""
m, n = len(w1), len(w2)
v1 = [i for i in range(n + 1)]
- v2 = [0 for i in range(n + 1)]
+ v2 = [0 for _ in range(n + 1)]

for i in range(m):
v2[0] = i + 1

for j in range(n):
- delCost = v1[j + 1] + 1
- insCost = v2[j] + 1
+ del_cost = v1[j + 1] + 1
+ ins_cost = v2[j] + 1

- subCost = v1[j]
- if w1[i] != w2[j]: subCost += 1
+ sub_cost = v1[j]
+ if w1[i] != w2[j]:
+     sub_cost += 1

- v2[j + 1] = min(delCost, insCost, subCost)
+ v2[j + 1] = min(del_cost, ins_cost, sub_cost)
v1, v2 = v2, v1

return v1[-1]

@staticmethod
- def Damerau_Levenshtein_Distance(w1, w2):
+ def damerau_levenshtein_distance(w1, w2):
"""
Computes Damerau-Levenshtein Distance between two words
@@ -66,15 +67,15 @@ def Damerau_Levenshtein_Distance(w1, w2):
For the most part, Damerau-Levenshtein behaves
identically to Levenshtein:
- >>> Levenshtein.Damerau_Levenshtein_Distance('noctis', 'noctem')
+ >>> Levenshtein.damerau_levenshtein_distance('noctis', 'noctem')
2
- >>> Levenshtein.Levenshtein_Distance('nox', 'nochem')
+ >>> Levenshtein.levenshtein_distance('nox', 'nochem')
4
The strength of DL lies in detecting transposition of characters:
- >>> Levenshtein.Damerau_Levenshtein_Distance('orbis', 'robis')
+ >>> Levenshtein.damerau_levenshtein_distance('orbis', 'robis')
1
"""
@@ -88,7 +89,8 @@ def Damerau_Levenshtein_Distance(w1, w2):
max_dist = len(w1) + len(w2)
mat[0][0] = max_dist

- # Initialize matrix margin to the maximum possible distance (essentially inf) for ease of calculations (avoiding try blocks)
+ # Initialize matrix margin to the maximum possible distance (essentially inf) for ease of calculations
+ # (avoiding try blocks)

for i in range(1, len(w1) + 2):
mat[i][0] = max_dist
@@ -102,7 +104,6 @@ def Damerau_Levenshtein_Distance(w1, w2):
tem = 0

for j in range(2, len(w2) + 2):

k = dam_ar[alph.index(w2[j - 2])]
l = tem

@@ -112,7 +113,7 @@ def Damerau_Levenshtein_Distance(w1, w2):
else:
cost = 1

- # The reccurence relation of DL is identical to that of Levenshtein with the addition of transposition
+ # The recurrence relation of DL is identical to that of Levenshtein with the addition of transposition
mat[i][j] = min(mat[i - 1][j - 1] + cost, mat[i][j - 1] + 1, mat[i - 1][j] + 1,
mat[k - 1][l - 1] + i + j - k - l - 1)

@@ -133,11 +134,10 @@ def ratio(string_a, string_b):
from fuzzywuzzy import fuzz

except ImportError as imp_err: # pragma: no cover
- message = "'fuzzywuzzy' library required for this module: %s. Install with `pip install fuzzywuzzy python-Levenshtein`" % imp_err
+ message = "'fuzzywuzzy' library required for this module: %s. Install with " \
+           "`pip install fuzzywuzzy python-Levenshtein`" % imp_err
logger.error(message)
print(message)
raise ImportError

return fuzz.ratio(string_a, string_b) / 100
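With this commit the two static methods carry the snake_case names exercised in the doctests above; a short usage sketch (the expected values are the ones stated in the docstrings):

```python
from cltk.text_reuse.levenshtein import Levenshtein

# Plain Levenshtein distance: insertions, deletions, substitutions.
print(Levenshtein.levenshtein_distance("noctis", "noctem"))        # 2
print(Levenshtein.levenshtein_distance("orbis", "robis"))          # 2

# Damerau-Levenshtein also counts a transposition of adjacent characters
# as a single edit, so swapping "or" -> "ro" costs only 1.
print(Levenshtein.damerau_levenshtein_distance("orbis", "robis"))  # 1

# ratio() is unchanged and still needs the optional fuzzywuzzy dependency,
# as the ImportError message above notes.
```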


6 changes: 3 additions & 3 deletions docs/multilingual.rst
@@ -582,7 +582,7 @@ You can also calculate the Levenshtein distance of two words, defined as the min

.. code-block:: python
- In [4]: l.Levenshtein_Distance("deaeque", "deaeuqe")
+ In [4]: l.levenshtein_distance("deaeque", "deaeuqe")
Out[4]: 2
@@ -610,10 +610,10 @@ Alternatively, you can also use CLTK's native ``Levenshtein`` class:
In [3]: from cltk.text_reuse.levenshtein import Levenshtein
- In [4]: Levenshtein.Damerau_Levenshtein_Distance("deaeque", "deaque")
+ In [4]: Levenshtein.damerau_levenshtein_distance("deaeque", "deaque")
Out[4]: 1
- In [5]: Levenshtein.Damerau_Levenshtein_Distance("deaeque", "deaeuqe")
+ In [5]: Levenshtein.damerau_levenshtein_distance("deaeque", "deaeuqe")
Out[5]: 1
Needleman-Wunsch Algorithm
