Uses the internal Levenshtein distance (#958)
* Uses the internal Levenshtein distance

* Fixed docs

Co-authored-by: Todd Cook <665389+todd-cook@users.noreply.github.com>
clemsciences and todd-cook committed Jun 7, 2020
1 parent 30eaf18 commit fe31ab7
Showing 8 changed files with 42 additions and 41 deletions.
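In outline, the commit swaps the third-party python-Levenshtein call for CLTK's own static method everywhere the scanners compute an edit distance. A minimal before/after sketch of the call-site change (the imports and the noctis/noctem example come from the diff below; the surrounding lines are illustrative only):

```python
# Before this commit: the external python-Levenshtein package supplied the function.
# from Levenshtein import distance
# edit_distance = distance("noctis", "noctem")

# After: the internal implementation is called as a static method.
from cltk.text_reuse.levenshtein import Levenshtein

edit_distance = Levenshtein.levenshtein_distance("noctis", "noctem")
print(edit_distance)  # 2, per the docstring examples in cltk/text_reuse/levenshtein.py
```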
6 changes: 3 additions & 3 deletions cltk/prosody/latin/hendecasyllable_scanner.py
@@ -7,7 +7,7 @@

import re

- from Levenshtein import distance
+ from cltk.text_reuse.levenshtein import Levenshtein

import cltk.prosody.latin.string_utils as string_utils
from cltk.prosody.latin.verse import Verse
@@ -110,7 +110,7 @@ def scan(self, original_line: str, optional_transform: bool = False) -> Verse:

smoothed = self.correct_invalid_start(verse.scansion)

- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["invalid start"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -120,7 +120,7 @@ def scan(self, original_line: str, optional_transform: bool = False) -> Verse:

smoothed = self.correct_antepenult_chain(verse.scansion)

- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["antepenult chain"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
12 changes: 6 additions & 6 deletions cltk/prosody/latin/hexameter_scanner.py
@@ -14,7 +14,7 @@

import re

- from Levenshtein import distance
+ from cltk.text_reuse.levenshtein import Levenshtein

from cltk.prosody.latin.verse import Verse
from cltk.prosody.latin.metrical_validator import MetricalValidator
@@ -187,7 +187,7 @@ def scan(self, original_line: str, optional_transform: bool = False,
return verse

smoothed = self.correct_inverted_amphibrachs(verse.scansion)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["inverted"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -196,7 +196,7 @@ def scan(self, original_line: str, optional_transform: bool = False,
return self.assign_candidate(verse, verse.scansion)

smoothed = self.correct_first_two_dactyls(verse.scansion)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["invalid start"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -205,7 +205,7 @@ def scan(self, original_line: str, optional_transform: bool = False,
return self.assign_candidate(verse, verse.scansion)

smoothed = self.correct_invalid_fifth_foot(verse.scansion)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["invalid 5th"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -236,7 +236,7 @@ def scan(self, original_line: str, optional_transform: bool = False,

# need to do this again, since the scansion has changed
smoothed = self.correct_inverted_amphibrachs(verse.scansion)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["inverted"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -264,7 +264,7 @@ def scan(self, original_line: str, optional_transform: bool = False,

if dactyl_smoothing:
smoothed = self.correct_dactyl_chain(smoothed)
- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["dactyl smoothing"]]
verse.scansion = smoothed
if self.metrical_validator.is_valid_hexameter(verse.scansion):
5 changes: 3 additions & 2 deletions cltk/prosody/latin/metrical_validator.py
@@ -6,7 +6,8 @@
from typing import List

from cltk.prosody.latin.scansion_constants import ScansionConstants
- from Levenshtein import distance
+ from cltk.text_reuse.levenshtein import Levenshtein


LOG = logging.getLogger(__name__)
LOG.addHandler(logging.NullHandler())
@@ -212,7 +213,7 @@ def _closest_patterns(self, patterns: List[str], scansion: str) -> List[str]:
pattern = pattern.replace(self.constants.FOOT_SEPARATOR, "")
ending = pattern[-1]
candidate = pattern[:len(pattern) - 1] + self.constants.OPTIONAL_ENDING
- cans = [(distance(candidate, x), x) for x in patterns
+ cans = [(Levenshtein.levenshtein_distance(candidate, x), x) for x in patterns
if len(x) == len(candidate)]
if cans:
cans = sorted(cans, key=lambda tup: tup[0])
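The _closest_patterns hunk above ranks candidate metrical patterns by their edit distance to a candidate scansion. A self-contained sketch of that selection step using the internal class; the pattern strings below are invented for illustration and are not CLTK's real pattern tables:

```python
from cltk.text_reuse.levenshtein import Levenshtein

# Hypothetical patterns and candidate; the real values come from
# MetricalValidator's pattern tables, not from this sketch.
patterns = ["-UU-UU-X", "----UU-X", "-UU----X"]
candidate = "-UU-UU-X"

# Mirrors the hunk above: score equal-length patterns by Levenshtein
# distance to the candidate, then sort so the closest come first.
cans = [(Levenshtein.levenshtein_distance(candidate, x), x)
        for x in patterns if len(x) == len(candidate)]
if cans:
    cans = sorted(cans, key=lambda tup: tup[0])
print(cans)  # [(0, '-UU-UU-X'), (2, '----UU-X'), (2, '-UU----X')]
```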
7 changes: 3 additions & 4 deletions cltk/prosody/latin/pentameter_scanner.py
@@ -7,8 +7,7 @@

import re

- from Levenshtein import distance
-
+ from cltk.text_reuse.levenshtein import Levenshtein
from cltk.prosody.latin.verse import Verse
from cltk.prosody.latin.metrical_validator import MetricalValidator
from cltk.prosody.latin.scansion_constants import ScansionConstants
@@ -127,7 +126,7 @@ def scan(self, original_line: str, optional_transform: bool = False) -> Verse:

smoothed = self.correct_first_two_dactyls(verse.scansion)

- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["invalid start"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
@@ -137,7 +136,7 @@ def scan(self, original_line: str, optional_transform: bool = False) -> Verse:

smoothed = self.correct_penultimate_dactyl_chain(verse.scansion)

- if distance(verse.scansion, smoothed) > 0:
+ if Levenshtein.levenshtein_distance(verse.scansion, smoothed) > 0:
verse.scansion_notes += [self.constants.NOTE_MAP["penultimate dactyl chain"]]
verse.scansion = smoothed
stresses += string_utils.differences(verse.scansion, smoothed)
3 changes: 2 additions & 1 deletion cltk/tests/test_nlp/test_scansion.py
@@ -9,7 +9,8 @@
from cltk.prosody.latin.hendecasyllable_scanner import HendecasyllableScanner
from cltk.prosody.latin.syllabifier import Syllabifier

- class TestScansionFunctions(unittest.TestCase): # pylint: disable=R0904
+
+ class TestScansionFunctions(unittest.TestCase):
"""Class for unittest"""

def test_hexameter_scanner(self):
4 changes: 2 additions & 2 deletions cltk/tests/test_nlp/test_text_reuse.py
@@ -55,13 +55,13 @@ def test_distance_ratio(self):
def test_levenshtein_distance(self):
"""Test for Levenshtein Distance between two words"""
l = Levenshtein()
- dist = l.Levenshtein_Distance("now grete glorious god through grace of himselven","and the precious prayer of his pris moder")
+ dist = l.levenshtein_distance("now grete glorious god through grace of himselven", "and the precious prayer of his pris moder")
self.assertEqual(dist, 36)

def test_damerau_levenshtein_distance(self):
"""Test for Damerau-Levenshtein Distance between two words"""
l = Levenshtein()
- dist = l.Damerau_Levenshtein_Distance("all haile whose solempne glorious concepcioun","fresche floure in quhom the hevinlie dewe doun fell")
+ dist = l.damerau_levenshtein_distance("all haile whose solempne glorious concepcioun", "fresche floure in quhom the hevinlie dewe doun fell")
self.assertEqual(dist,35)

# Test causing lemmatizer Travis build to fail—figure out what is wrong and restore.
40 changes: 20 additions & 20 deletions cltk/text_reuse/levenshtein.py
@@ -13,7 +13,7 @@ def __init__(self):
return

@staticmethod
- def Levenshtein_Distance(w1, w2):
+ def levenshtein_distance(w1, w2):
"""
Computes Levenshtein Distance between two words
@@ -24,36 +24,37 @@ def Levenshtein_Distance(w1, w2):
Examples:
- >>> Levenshtein.Levenshtein_Distance('noctis', 'noctem')
+ >>> Levenshtein.levenshtein_distance('noctis', 'noctem')
2
- >>> Levenshtein.Levenshtein_Distance('nox', 'nochem')
+ >>> Levenshtein.levenshtein_distance('nox', 'nochem')
4
- >>> Levenshtein.Levenshtein_Distance('orbis', 'robis')
+ >>> Levenshtein.levenshtein_distance('orbis', 'robis')
2
"""
m, n = len(w1), len(w2)
v1 = [i for i in range(n + 1)]
- v2 = [0 for i in range(n + 1)]
+ v2 = [0 for _ in range(n + 1)]

for i in range(m):
v2[0] = i + 1

for j in range(n):
- delCost = v1[j + 1] + 1
- insCost = v2[j] + 1
+ del_cost = v1[j + 1] + 1
+ ins_cost = v2[j] + 1

- subCost = v1[j]
- if w1[i] != w2[j]: subCost += 1
+ sub_cost = v1[j]
+ if w1[i] != w2[j]:
+     sub_cost += 1

- v2[j + 1] = min(delCost, insCost, subCost)
+ v2[j + 1] = min(del_cost, ins_cost, sub_cost)
v1, v2 = v2, v1

return v1[-1]

@staticmethod
- def Damerau_Levenshtein_Distance(w1, w2):
+ def damerau_levenshtein_distance(w1, w2):
"""
Computes Damerau-Levenshtein Distance between two words
@@ -66,15 +67,15 @@ def Damerau_Levenshtein_Distance(w1, w2):
For the most part, Damerau-Levenshtein behaves
identically to Levenshtein:
- >>> Levenshtein.Damerau_Levenshtein_Distance('noctis', 'noctem')
+ >>> Levenshtein.damerau_levenshtein_distance('noctis', 'noctem')
2
- >>> Levenshtein.Levenshtein_Distance('nox', 'nochem')
+ >>> Levenshtein.levenshtein_distance('nox', 'nochem')
4
The strength of DL lies in detecting transposition of characters:
- >>> Levenshtein.Damerau_Levenshtein_Distance('orbis', 'robis')
+ >>> Levenshtein.damerau_levenshtein_distance('orbis', 'robis')
1
"""
@@ -88,7 +89,8 @@ def Damerau_Levenshtein_Distance(w1, w2):
max_dist = len(w1) + len(w2)
mat[0][0] = max_dist

- # Initialize matrix margin to the maximum possible distance (essentially inf) for ease of calculations (avoiding try blocks)
+ # Initialize matrix margin to the maximum possible distance (essentially inf) for ease of calculations
+ # (avoiding try blocks)

for i in range(1, len(w1) + 2):
mat[i][0] = max_dist
@@ -102,7 +104,6 @@ def Damerau_Levenshtein_Distance(w1, w2):
tem = 0

for j in range(2, len(w2) + 2):

k = dam_ar[alph.index(w2[j - 2])]
l = tem

@@ -112,7 +113,7 @@ def Damerau_Levenshtein_Distance(w1, w2):
else:
cost = 1

- # The reccurence relation of DL is identical to that of Levenshtein with the addition of transposition
+ # The recurrence relation of DL is identical to that of Levenshtein with the addition of transposition
mat[i][j] = min(mat[i - 1][j - 1] + cost, mat[i][j - 1] + 1, mat[i - 1][j] + 1,
mat[k - 1][l - 1] + i + j - k - l - 1)

@@ -133,11 +134,10 @@ def ratio(string_a, string_b):
from fuzzywuzzy import fuzz

except ImportError as imp_err: # pragma: no cover
- message = "'fuzzywuzzy' library required for this module: %s. Install with `pip install fuzzywuzzy python-Levenshtein`" % imp_err
+ message = "'fuzzywuzzy' library required for this module: %s. Install with " \
+           "`pip install fuzzywuzzy python-Levenshtein`" % imp_err
logger.error(message)
print(message)
raise ImportError

return fuzz.ratio(string_a, string_b) / 100
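With this commit the two static methods carry the snake_case names exercised in the doctests above; a short usage sketch (the expected values are the ones stated in the docstrings):

```python
from cltk.text_reuse.levenshtein import Levenshtein

# Plain Levenshtein distance: insertions, deletions, substitutions.
print(Levenshtein.levenshtein_distance("noctis", "noctem"))        # 2
print(Levenshtein.levenshtein_distance("orbis", "robis"))          # 2

# Damerau-Levenshtein also counts a transposition of adjacent characters
# as a single edit, so swapping "or" -> "ro" costs only 1.
print(Levenshtein.damerau_levenshtein_distance("orbis", "robis"))  # 1

# ratio() is unchanged and still needs the optional fuzzywuzzy dependency,
# as the ImportError message above notes.
```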


6 changes: 3 additions & 3 deletions docs/multilingual.rst
@@ -582,7 +582,7 @@ You can also calculate the Levenshtein distance of two words, defined as the min

.. code-block:: python
- In [4]: l.Levenshtein_Distance("deaeque", "deaeuqe")
+ In [4]: l.levenshtein_distance("deaeque", "deaeuqe")
Out[4]: 2
@@ -610,10 +610,10 @@ Alternatively, you can also use CLTK's native ``Levenshtein`` class:
In [3]: from cltk.text_reuse.levenshtein import Levenshtein
- In [4]: Levenshtein.Damerau_Levenshtein_Distance("deaeque", "deaque")
+ In [4]: Levenshtein.damerau_levenshtein_distance("deaeque", "deaque")
Out[4]: 1
- In [5]: Levenshtein.Damerau_Levenshtein_Distance("deaeque", "deaeuqe")
+ In [5]: Levenshtein.damerau_levenshtein_distance("deaeque", "deaeuqe")
Out[5]: 1
Needleman-Wunsch Algorithm
