Skip to content

Commit

Permalink
Merge pull request #147 from lukehollis/text_reuse
Browse files Browse the repository at this point in the history
Updates for text reuse module in response to issue #130
  • Loading branch information
kylepjohnson committed Feb 28, 2016
2 parents dbc2aae + 93453e0 commit 5f0982d
Show file tree
Hide file tree
Showing 7 changed files with 419 additions and 74 deletions.
74 changes: 0 additions & 74 deletions cltk/reuse/levenshtein.py

This file was deleted.

61 changes: 61 additions & 0 deletions cltk/tests/test_text_reuse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Test cltk.text_reuse."""

__author__ = 'Luke Hollis <lukehollis@gmail.com>'
__license__ = 'MIT License. See LICENSE.'

import unittest
from cltk.text_reuse.levenshtein import Levenshtein
from cltk.text_reuse.text_reuse import TextReuse


demo_verg = """
tuque o, cui prima frementem
fudit equum magno tellus percussa tridenti,
Neptune; et cultor nemorum, cui pinguia Ceae
ter centum niuei tondent dumeta iuuenci;
ipse nemus linquens patrium saltusque Lycaei
Pan, ouium custos, tua si tibi Maenala curae,
adsis, o Tegeaee, fauens, oleaeque Minerua
inuentrix, uncique puer monstrator aratri,
et teneram ab radice ferens, Siluane, cupressum:
dique deaeque omnes, studium quibus arua tueri,
munera vestra cano. et vos o agrestum praesentia
quique nouas alitis non ullo semine fruges
quique satis largum caelo demittitis imbrem.
"""

demo_prop = """
corniger Arcadii vacuam pastoris in aulam
dux aries saturas ipse reduxit oves;
dique deaeque omnes, quibus est tutela per agros,
praebebant vestri verba benigna foci:
'et leporem, quicumque venis, venaberis, hospes,
et si forte meo tramite quaeris avem:
et me Pana tibi comitem de rupe vocato,
sive petes calamo praemia, sive cane.'
at nunc desertis cessant sacraria lucis:
aurum omnes victa iam pietate colunt.
auro pulsa fides, auro venalia iura,
aurum lex sequitur, mox sine lege pudor.
"""

class TestSequenceFunctions(unittest.TestCase):  # pylint: disable=R0904
    """Unit tests for the text reuse comparison tools."""

    def test_distance_ratio(self):
        """Check the Levenshtein distance ratio computed for two strings."""
        first_line = "dique deaeque omnes, studium quibus arua tueri,"
        second_line = "dique deaeque omnes, quibus est tutela per agros,"
        levenshtein = Levenshtein()
        self.assertEqual(levenshtein.ratio(first_line, second_line), 0.71)

    def test_distance_sentences(self):
        """Check one ratio from the sentence-level comparison of the demo passages."""
        text_reuse = TextReuse()
        results = text_reuse.compare_sentences(demo_verg, demo_prop)
        observed = results[1][0]['ratio']
        self.assertEqual(observed, 0.39)

    def test_distance_sliding_window(self):
        """Check one ratio from the sliding-window comparison of the demo passages."""
        text_reuse = TextReuse()
        results = text_reuse.compare_sliding_window(demo_verg, demo_prop)
        observed = results[19][3]['ratio']
        self.assertEqual(observed, 0.64)
File renamed without changes.
82 changes: 82 additions & 0 deletions cltk/text_reuse/comparison.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@

"""
A comparison class to help with tracking string comparison values
"""

from cltk.utils.cltk_logger import logger


__author__ = 'Luke Hollis <lukehollis@gmail.com>'
__license__ = 'MIT License. See LICENSE.'


class Comparison:
    """A class to increase ease of working with text reuse data.

    Holds the two compared strings, the ratio computed for them, and
    optional citation metadata (author, work, subwork, text number) for
    each side of the comparison.
    """

    def __init__(self, str_a, str_b, distance_ratio):
        """
        Initialize class with compared strings and ratio of comparison
        :param str_a: str
        :param str_b: str
        :param distance_ratio: float
        """
        self.str_a = str_a
        self.str_b = str_b
        self.ratio = distance_ratio

        # The authors related to the compared string values
        self.author_a = ""
        self.author_b = ""

        # The works related to the compared string values
        self.work_a = ""
        self.work_b = ""

        # The subworks related to the compared string values
        self.subwork_a = ""
        self.subwork_b = ""

        # The text numbers related to the compared string values
        # e.g. 10 (for line 10) or 3 (for paragraph 3)
        self.text_n_a = None
        self.text_n_b = None

    def set_ref_a(self, author, work, subwork, text_n):
        """
        Set the reference values related to the str_a compared string
        :param author: str
        :param work: str
        :param subwork: str
        :param text_n: str (a string instead of integer for variations in
            numbering systems that may include integers and alpha
            characters (e.g. '101b'))
        :return: None
        """
        self.author_a = author
        self.work_a = work
        self.subwork_a = subwork
        self.text_n_a = text_n

    def set_ref_b(self, author, work, subwork, text_n):
        """
        Set the reference values related to the str_b compared string
        :param author: str
        :param work: str
        :param subwork: str
        :param text_n: str (a string instead of integer for variations in
            numbering systems that may include integers and alpha
            characters (e.g. '101b'))
        :return: None
        """
        self.author_b = author
        self.work_b = work
        self.subwork_b = subwork
        self.text_n_b = text_n
44 changes: 44 additions & 0 deletions cltk/text_reuse/levenshtein.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Offer tools for working with Levenshtein distance algorithm and distance ratio between strings.
Requirements:
fuzzywuzzy
Good-to-haves:
python-Levenshtein
"""

import re, string
import unicodedata
from cltk.tokenize.sentence import TokenizeSentence
from cltk.utils.cltk_logger import logger

# fuzzywuzzy is a hard requirement of this module: log a helpful message and
# then re-raise the original ImportError so its message and traceback survive.
try:
    from fuzzywuzzy import fuzz
except ImportError as imp_err:
    logger.error("'fuzzywuzzy' library required for this module: %s", imp_err)
    raise


__author__ = 'Luke Hollis <lukehollis@gmail.com>'
__license__ = 'MIT License. See LICENSE.'


class Levenshtein:
    """Thin wrapper around fuzzywuzzy's Levenshtein distance ratio."""

    def __init__(self):
        """Initialize class. No state is stored at present."""

    @staticmethod
    def ratio(string_a, string_b):
        """Return fuzzywuzzy's ratio for the two strings, divided by 100.

        :param string_a: str
        :param string_b: str
        :return: float
        """
        score = fuzz.ratio(string_a, string_b)
        return score / 100

0 comments on commit 5f0982d

Please sign in to comment.