-
Notifications
You must be signed in to change notification settings - Fork 326
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #147 from lukehollis/text_reuse
Updates for text reuse module in response to issue #130
- Loading branch information
Showing
7 changed files
with
419 additions
and
74 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
"""Test cltk.text_reuse.""" | ||
|
||
__author__ = 'Luke Hollis <lukehollis@gmail.com>' | ||
__license__ = 'MIT License. See LICENSE.' | ||
|
||
import unittest | ||
from cltk.text_reuse.levenshtein import Levenshtein | ||
from cltk.text_reuse.text_reuse import TextReuse | ||
|
||
|
||
demo_verg = """ | ||
tuque o, cui prima frementem | ||
fudit equum magno tellus percussa tridenti, | ||
Neptune; et cultor nemorum, cui pinguia Ceae | ||
ter centum niuei tondent dumeta iuuenci; | ||
ipse nemus linquens patrium saltusque Lycaei | ||
Pan, ouium custos, tua si tibi Maenala curae, | ||
adsis, o Tegeaee, fauens, oleaeque Minerua | ||
inuentrix, uncique puer monstrator aratri, | ||
et teneram ab radice ferens, Siluane, cupressum: | ||
dique deaeque omnes, studium quibus arua tueri, | ||
munera vestra cano. et vos o agrestum praesentia | ||
quique nouas alitis non ullo semine fruges | ||
quique satis largum caelo demittitis imbrem. | ||
""" | ||
|
||
demo_prop = """ | ||
corniger Arcadii vacuam pastoris in aulam | ||
dux aries saturas ipse reduxit oves; | ||
dique deaeque omnes, quibus est tutela per agros, | ||
praebebant vestri verba benigna foci: | ||
'et leporem, quicumque venis, venaberis, hospes, | ||
et si forte meo tramite quaeris avem: | ||
et me Pana tibi comitem de rupe vocato, | ||
sive petes calamo praemia, sive cane.' | ||
at nunc desertis cessant sacraria lucis: | ||
aurum omnes victa iam pietate colunt. | ||
auro pulsa fides, auro venalia iura, | ||
aurum lex sequitur, mox sine lege pudor. | ||
""" | ||
|
||
class TestSequenceFunctions(unittest.TestCase):  # pylint: disable=R0904
    """Unit tests for the text reuse tools (Levenshtein and TextReuse)."""

    def test_distance_ratio(self):
        """Check the Levenshtein ratio computed for two similar lines."""
        verg_line = "dique deaeque omnes, studium quibus arua tueri,"
        prop_line = "dique deaeque omnes, quibus est tutela per agros,"
        measured = Levenshtein().ratio(verg_line, prop_line)
        self.assertEqual(measured, 0.71)

    def test_distance_sentences(self):
        """Check one ratio from the sentence-level comparison of both passages."""
        results = TextReuse().compare_sentences(demo_verg, demo_prop)
        self.assertEqual(results[1][0]['ratio'], 0.39)

    def test_distance_sliding_window(self):
        """Check one ratio from the sliding-window comparison of both passages."""
        results = TextReuse().compare_sliding_window(demo_verg, demo_prop)
        self.assertEqual(results[19][3]['ratio'], 0.64)
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
|
||
""" | ||
A comparison class to help with tracking string comparison values | ||
""" | ||
|
||
from cltk.utils.cltk_logger import logger | ||
|
||
|
||
__author__ = 'Luke Hollis <lukehollis@gmail.com>' | ||
__license__ = 'MIT License. See LICENSE.' | ||
|
||
|
||
class Comparison:
    """Record one string comparison made while looking for text reuse.

    An instance keeps the two compared strings, the ratio computed for them
    and, optionally, a reference (author, work, subwork, text number) for
    each of the two strings.
    """

    def __init__(self, str_a, str_b, distance_ratio):
        """Store the compared strings and the ratio of their comparison.

        :param str_a: str
        :param str_b: str
        :param distance_ratio: float
        """
        self.str_a = str_a
        self.str_b = str_b
        self.ratio = distance_ratio

        # Reference fields keep these placeholder values until set_ref_a /
        # set_ref_b is called.

        # The authors of the compared strings
        self.author_a = ""
        self.author_b = ""

        # The works the compared strings come from
        self.work_a = ""
        self.work_b = ""

        # The subworks the compared strings come from
        self.subwork_a = ""
        self.subwork_b = ""

        # The text numbers of the compared strings,
        # e.g. 10 (for line 10) or 3 (for paragraph 3)
        self.text_n_a = None
        self.text_n_b = None

    def set_ref_a(self, author, work, subwork, text_n):
        """Set the reference values related to the str_a compared string.

        :param author: str
        :param work: str
        :param subwork: str
        :param text_n: str (a string rather than an integer, because
            numbering systems may include both integers and alpha
            characters, e.g. '101b')
        :return: None
        """
        self.author_a = author
        self.work_a = work
        self.subwork_a = subwork
        self.text_n_a = text_n

    def set_ref_b(self, author, work, subwork, text_n):
        """Set the reference values related to the str_b compared string.

        :param author: str
        :param work: str
        :param subwork: str
        :param text_n: str (a string rather than an integer, because
            numbering systems may include both integers and alpha
            characters, e.g. '101b')
        :return: None
        """
        self.author_b = author
        self.work_b = work
        self.subwork_b = subwork
        self.text_n_b = text_n
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
""" | ||
Offer tools for working with the Levenshtein distance algorithm and the distance ratio between strings. | ||
Requirements: | ||
fuzzywuzzy | ||
Good-to-haves: | ||
python-Levenshtein | ||
""" | ||
|
||
import re, string | ||
import unicodedata | ||
from cltk.tokenize.sentence import TokenizeSentence | ||
from cltk.utils.cltk_logger import logger | ||
|
||
try:
    from fuzzywuzzy import fuzz
except ImportError as imp_err:
    # fuzzywuzzy is required by this module: log the failure, then re-raise
    # the original exception so its message and traceback are preserved.
    logger.error("'fuzzywuzzy' library required for this module: %s", imp_err)
    raise
|
||
|
||
__author__ = 'Luke Hollis <lukehollis@gmail.com>' | ||
__license__ = 'MIT License. See LICENSE.' | ||
|
||
|
||
class Levenshtein:
    """Thin wrapper around fuzzywuzzy's Levenshtein distance helpers."""

    def __init__(self):
        """Nothing to set up; the class currently holds no state."""

    @staticmethod
    def ratio(string_a, string_b):
        """Return fuzzywuzzy's ``fuzz.ratio`` for the two strings, divided by 100.

        :param string_a: str
        :param string_b: str
        :return: float
        """
        score = fuzz.ratio(string_a, string_b)
        return score / 100
Oops, something went wrong.