-
Notifications
You must be signed in to change notification settings - Fork 543
/
string.py
117 lines (91 loc) · 3.38 KB
/
string.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from typing import Callable, Iterable, Sequence, Type
from affinegap import normalizedAffineGapDistance as affineGap
from highered import CRFEditDistance
from simplecosine.cosine import CosineTextSimilarity
from dedupe import predicates
from dedupe._typing import VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType, indexPredicates
# Single CRFEditDistance instance, built once when this module is imported.
# ShortStringType installs it as the comparator when a definition sets
# "crf" to True.
crfEd = CRFEditDistance()
# Predicate functions shared by the field types in this module:
# ShortStringType extends this tuple with extra predicates, and TextType
# uses it unchanged.
# NOTE(review): ordering may matter to whatever consumes this tuple -- keep
# it stable unless confirmed otherwise.
base_predicates = (
    predicates.wholeFieldPredicate,
    predicates.firstTokenPredicate,
    predicates.firstTwoTokensPredicate,
    predicates.commonIntegerPredicate,
    predicates.nearIntegersPredicate,
    predicates.firstIntegerPredicate,
    predicates.hundredIntegerPredicate,
    predicates.hundredIntegersOddPredicate,
    predicates.alphaNumericPredicate,
    predicates.sameThreeCharStartPredicate,
    predicates.sameFiveCharStartPredicate,
    predicates.sameSevenCharStartPredicate,
    predicates.commonTwoTokens,
    predicates.commonThreeTokens,
    predicates.fingerprint,
    predicates.oneGramFingerprint,
    predicates.twoGramFingerprint,
    predicates.sortedAcronym,
)
class BaseStringType(FieldType):
    """Common base class for the string-like field types in this module.

    After ``FieldType.__init__`` has run, each instance's ``predicates`` is
    extended with whatever ``indexPredicates`` returns for the Levenshtein
    canopy and search predicate classes on this instance's field.
    """

    # NOTE(review): presumably the wrapper FieldType applies to each entry of
    # ``_predicate_functions`` -- confirm against dedupe.variables.base.
    _Predicate = predicates.StringPredicate
    # Empty by default; subclasses in this module override it.
    _predicate_functions: Sequence[Callable[[str], Iterable[str]]] = ()

    def __init__(self, definition: VariableDefinition):
        """Initialise the field and add the Levenshtein index predicates.

        Args:
            definition: Variable definition, forwarded unchanged to
                ``FieldType.__init__``.
        """
        super().__init__(definition)

        levenshtein_classes = (
            predicates.LevenshteinCanopyPredicate,
            predicates.LevenshteinSearchPredicate,
        )
        # Second argument to indexPredicates; presumably the thresholds to
        # build each predicate class with -- confirm in base.
        levenshtein_thresholds = (1, 2, 3, 4)

        # ``self.predicates`` and ``self.field`` are not set in this class;
        # they are expected to exist once the parent initialiser has run.
        self.predicates += indexPredicates(
            levenshtein_classes,
            levenshtein_thresholds,
            self.field,
        )
class ShortStringType(BaseStringType):
    """Field type registered under the name ``"ShortString"``.

    Uses ``base_predicates`` plus six additional predicate functions, the
    n-gram TF-IDF index predicates, and either the CRF edit distance or the
    normalized affine gap distance as its comparator.
    """

    type = "ShortString"

    # The shared predicates first, then the ones added for this type.
    _predicate_functions = (
        *base_predicates,
        predicates.commonFourGram,
        predicates.commonSixGram,
        predicates.tokenFieldPredicate,
        predicates.suffixArray,
        predicates.doubleMetaphone,
        predicates.metaphoneToken,
    )

    _index_predicates: Sequence[Type[predicates.IndexPredicate]] = [
        predicates.TfidfNGramCanopyPredicate,
        predicates.TfidfNGramSearchPredicate,
    ]
    _index_thresholds = (0.2, 0.4, 0.6, 0.8)

    def __init__(self, definition: VariableDefinition):
        """Initialise the field and choose its comparator.

        Args:
            definition: Variable definition. If its ``"crf"`` entry is the
                literal ``True``, the module-level ``crfEd`` instance is used
                as the comparator; otherwise ``affineGap`` is used.
        """
        super().__init__(definition)

        # Identity check on purpose: truthy values other than ``True``
        # (e.g. 1 or "yes") do not switch on the CRF comparator.
        wants_crf = definition.get("crf", False) is True
        self.comparator = crfEd if wants_crf else affineGap  # type: ignore[assignment]
class StringType(ShortStringType):
    """Field type registered under the name ``"String"``.

    Behaves like ``ShortStringType`` except for its index predicates, which
    also include the text-level TF-IDF canopy and search predicates.
    """

    type = "String"

    # The parent's two n-gram TF-IDF predicates, in the same order, followed
    # by the two text TF-IDF predicates.
    _index_predicates = [
        *ShortStringType._index_predicates,
        predicates.TfidfTextCanopyPredicate,
        predicates.TfidfTextSearchPredicate,
    ]
class TextType(BaseStringType):
    """Field type registered under the name ``"Text"``.

    Uses ``base_predicates`` unchanged, the text-level TF-IDF index
    predicates, and a ``CosineTextSimilarity`` comparator built from the
    definition's corpus.
    """

    type = "Text"

    _predicate_functions = base_predicates

    _index_predicates = [
        predicates.TfidfTextCanopyPredicate,
        predicates.TfidfTextSearchPredicate,
    ]
    _index_thresholds = (0.2, 0.4, 0.6, 0.8)

    def __init__(self, definition: VariableDefinition):
        """Initialise the field and build the cosine-similarity comparator.

        Args:
            definition: Variable definition. When it has no ``"corpus"``
                entry, an empty list is stored under that key, so the
                caller's definition object is modified in place.
        """
        super().__init__(definition)

        # Write the default back onto the definition instead of using a
        # throwaway local, so the key is present afterwards.
        if "corpus" not in definition:
            definition["corpus"] = []

        corpus = definition["corpus"]
        self.comparator = CosineTextSimilarity(corpus)  # type: ignore[assignment]
@hookimpl
def register_variable():
    """Return this module's field types, keyed by their ``type`` name.

    Returns:
        A dict mapping ``"ShortString"``, ``"String"`` and ``"Text"`` to
        ``ShortStringType``, ``StringType`` and ``TextType`` respectively.
    """
    field_types = (ShortStringType, StringType, TextType)
    return {field_type.type: field_type for field_type in field_types}