Skip to content

Commit

Permalink
Merge pull request #1145 from lmores/fix/cpredicates
Browse files Browse the repository at this point in the history
Improve cpredicates.pyx
  • Loading branch information
fgregg committed Jan 29, 2023
2 parents 765a6fa + e88470b commit 1f5dfbc
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 25 deletions.
56 changes: 36 additions & 20 deletions dedupe/cpredicates.pyx
Original file line number Diff line number Diff line change
@@ -1,37 +1,54 @@
# cython: c_string_type=unicode, c_string_encoding=utf8, infertypes=True, language_level=3

cpdef list ngrams(basestring field, int n):
"""ngrams returns all unique, contiguous sequences of n characters
"""ngrams returns all contiguous sequences of n characters
of a given field.
:param field: the string to be
:param field: the string to be sequenced
:param n: the number of characters to be included in each gram
usage:
>>> from dedupe.cpredicates import ngrams
>>> ngrams("deduplicate", 3)
('ded', 'edu', 'dup', 'upl', 'pli', 'lic', 'ica', 'cat', 'ate')
['ded', 'edu', 'dup', 'upl', 'pli', 'lic', 'ica', 'cat', 'ate']
"""
cdef unicode ufield = _ustring(field)

cdef int i
cdef int n_char = len(ufield)
cdef int n_grams = n_char - n + 1
cdef list grams = [ufield[i:i+n] for i in range(n_grams)]
return grams


cpdef set unique_ngrams(basestring field, int n):
"""unique_ngrams returns all contiguous unique sequences of n characters
of a given field.
:param field: the string to be sequenced
:param n: the number of characters to be included in each gram
usage:
>>> from dedupe.cpredicates import unique_ngrams
>>> unique_ngrams("mississippi", 2)
{"mi", "is", "ss", "si", "ip", "pp", "pi"}
"""
cdef unicode ufield = _ustring(field)

cdef list grams = []
cdef int i, j
cdef int i
cdef int n_char = len(ufield)
for i in range(n_char):
for j in range(i+n, min(n_char, i+n)+1):
grams.append(ufield[i:j])

cdef int n_grams = n_char - n + 1
cdef set grams = {ufield[i:i+n] for i in range(n_grams)}
return grams


cpdef tuple initials(basestring field, int n):
"""predicate which returns first a tuple containing
the first n chars of a field if and only if the
field contains at least n characters, or an empty
tuple otherwise.
:param field: the string
:type n: int, default None
"""returns a tuple containing the first n chars of a field.
The whole field is returned if n is greater than the field length.
:param field: the string
:type n: int
usage:
>>> initials("dedupe", 7)
('dedupe', )
Expand All @@ -43,7 +60,6 @@ cpdef tuple initials(basestring field, int n):
return (ufield[:n], )



cdef unicode _ustring(basestring s):
if type(s) is unicode:
# fast path for most common case(s)
Expand Down
12 changes: 7 additions & 5 deletions dedupe/predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import dedupe.levenshtein as levenshtein
import dedupe.tfidf as tfidf
from dedupe.cpredicates import initials, ngrams
from dedupe.cpredicates import initials, ngrams, unique_ngrams

if TYPE_CHECKING:
from typing import Any, Callable, Iterable, Mapping, Sequence
Expand Down Expand Up @@ -472,14 +472,16 @@ def fingerprint(field: str) -> tuple[str]:


def oneGramFingerprint(field: str) -> tuple[str]:
return ("".join(sorted(set(ngrams(field.replace(" ", ""), 1)))).strip(),)
return ("".join(sorted(unique_ngrams(field.replace(" ", ""), 1))).strip(),)


def twoGramFingerprint(field: str) -> tuple[str, ...]:
if len(field) > 1:
return (
"".join(
sorted(gram.strip() for gram in set(ngrams(field.replace(" ", ""), 2)))
sorted(
gram.strip() for gram in unique_ngrams(field.replace(" ", ""), 2)
)
),
)
else:
Expand All @@ -488,12 +490,12 @@ def twoGramFingerprint(field: str) -> tuple[str, ...]:

def commonFourGram(field: str) -> set[str]:
"""return 4-grams"""
return set(ngrams(field.replace(" ", ""), 4))
return unique_ngrams(field.replace(" ", ""), 4)


def commonSixGram(field: str) -> set[str]:
"""return 6-grams"""
return set(ngrams(field.replace(" ", ""), 6))
return unique_ngrams(field.replace(" ", ""), 6)


def sameThreeCharStartPredicate(field: str) -> tuple[str]:
Expand Down
176 changes: 176 additions & 0 deletions tests/test_cpredicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import unittest

from dedupe.cpredicates import initials, ngrams, unique_ngrams


class TestCPredicates(unittest.TestCase):
    """Unit tests for ``ngrams``, ``unique_ngrams`` and ``initials``.

    Each test walks a table mapping the gram/prefix length ``n`` to the
    exact expected result.  Cases run inside ``subTest`` so that one failing
    length does not hide the others, and comparisons use ``assertEqual``
    rather than bare ``assert`` so they still run under ``python -O`` and
    report a readable diff on failure.
    """

    def test_ngrams(self):
        """``ngrams`` yields every contiguous n-gram, in order, keeping repeats."""
        expected = {
            1: ["d", "e", "d", "u", "p", "l", "i", "c", "a", "t", "e"],
            2: ["de", "ed", "du", "up", "pl", "li", "ic", "ca", "at", "te"],
            3: ["ded", "edu", "dup", "upl", "pli", "lic", "ica", "cat", "ate"],
            4: ["dedu", "edup", "dupl", "upli", "plic", "lica", "icat", "cate"],
            5: ["dedup", "edupl", "dupli", "uplic", "plica", "licat", "icate"],
            6: ["dedupl", "edupli", "duplic", "uplica", "plicat", "licate"],
            7: ["dedupli", "eduplic", "duplica", "uplicat", "plicate"],
            8: ["deduplic", "eduplica", "duplicat", "uplicate"],
            9: ["deduplica", "eduplicat", "duplicate"],
            10: ["deduplicat", "eduplicate"],
            11: ["deduplicate"],
            # n longer than the field: no grams at all.
            12: [],
            100: [],
        }
        for n, grams in expected.items():
            with self.subTest(n=n):
                self.assertEqual(ngrams("deduplicate", n), grams)

    def test_unique_ngrams(self):
        """``unique_ngrams`` returns the set of distinct contiguous n-grams."""
        expected = {
            1: {"m", "i", "s", "p"},
            2: {"mi", "is", "ss", "si", "ip", "pp", "pi"},
            3: {"mis", "iss", "ssi", "sis", "sip", "ipp", "ppi"},
            4: {"miss", "issi", "ssis", "siss", "ssip", "sipp", "ippi"},
            5: {"missi", "issis", "ssiss", "sissi", "issip", "ssipp", "sippi"},
            6: {"missis", "ississ", "ssissi", "sissip", "issipp", "ssippi"},
            7: {"mississ", "ississi", "ssissip", "sissipp", "issippi"},
            8: {"mississi", "ississip", "ssissipp", "sissippi"},
            9: {"mississip", "ississipp", "ssissippi"},
            10: {"mississipp", "ississippi"},
            11: {"mississippi"},
            # n longer than the field: empty set.
            12: set(),
            100: set(),
        }
        for n, grams in expected.items():
            with self.subTest(n=n):
                self.assertEqual(unique_ngrams("mississippi", n), grams)

    def test_initials(self):
        """``initials`` returns a 1-tuple holding the first ``n`` characters."""
        expected = {
            1: ("d",),
            2: ("de",),
            3: ("ded",),
            4: ("dedu",),
            5: ("dedup",),
            6: ("dedupl",),
            7: ("dedupli",),
            8: ("deduplic",),
            9: ("deduplica",),
            10: ("deduplicat",),
            11: ("deduplicate",),
            # n longer than the field: the whole field is returned.
            12: ("deduplicate",),
            100: ("deduplicate",),
        }
        for n, prefix in expected.items():
            with self.subTest(n=n):
                self.assertEqual(initials("deduplicate", n), prefix)


# Run the tests through unittest's command-line runner when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 1f5dfbc

Please sign in to comment.