Skip to content

Commit

Permalink
Merge ea2924f into b3efd1e
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Dec 11, 2022
2 parents b3efd1e + ea2924f commit 5c24a54
Show file tree
Hide file tree
Showing 15 changed files with 117 additions and 47 deletions.
22 changes: 15 additions & 7 deletions dedupe/__init__.py
@@ -1,7 +1,15 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)

from dedupe._init import * # noqa
from dedupe.api import ( # noqa: F401
Dedupe,
Gazetteer,
RecordLink,
StaticDedupe,
StaticGazetteer,
StaticRecordLink,
)
from dedupe.convenience import ( # noqa: F401
canonicalize,
console_label,
training_data_dedupe,
training_data_link,
)
from dedupe.serializer import read_training, write_training # noqa: F401
15 changes: 0 additions & 15 deletions dedupe/_init.py

This file was deleted.

41 changes: 31 additions & 10 deletions dedupe/datamodel.py
@@ -1,22 +1,18 @@
from __future__ import annotations

import copyreg
import pkgutil
import importlib
import types
from typing import TYPE_CHECKING, cast

import numpy
import pluggy

import dedupe.variables
import dedupe.hookspecs
from dedupe.variables.base import FieldType as FieldVariable
from dedupe.variables.base import MissingDataType, Variable
from dedupe.variables.interaction import InteractionType

for _, module, _ in pkgutil.iter_modules( # type: ignore
dedupe.variables.__path__, "dedupe.variables."
):
__import__(module)

if TYPE_CHECKING:
from typing import Generator, Iterable, Sequence

Expand All @@ -28,7 +24,26 @@
)
from dedupe.predicates import Predicate

VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k}

# Dotted paths of the modules bundled with dedupe that provide variable
# types. Each one is imported and registered as a plugin by the loop below.
DEFAULT_VARIABLES = [
    "dedupe.variables.base",
    "dedupe.variables.string",
    "dedupe.variables.categorical_type",
    "dedupe.variables.exists",
    "dedupe.variables.exact",
    "dedupe.variables.latlong",
    "dedupe.variables.interaction",
    "dedupe.variables.price",
    "dedupe.variables.set",
]

# Plugin manager for the "dedupe" project; the hook specifications it
# accepts are the ones declared in dedupe.hookspecs.
pm = pluggy.PluginManager("dedupe")
pm.add_hookspecs(dedupe.hookspecs)
# Load plugins advertised through the "dedupe" setuptools entry point group.
# This runs before the bundled modules are registered below.
pm.load_setuptools_entrypoints("dedupe")

# Import every bundled variable module and register it with the plugin
# manager, using the module's dotted path as the plugin name.
for plugin in DEFAULT_VARIABLES:
    mod = importlib.import_module(plugin)
    pm.register(mod, plugin)


class DataModel(object):
Expand All @@ -38,6 +53,7 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]):
variable_definitions = list(variable_definitions)
if not variable_definitions:
raise ValueError("The variable definitions cannot be empty")

all_variables: list[Variable]
self.primary_variables, all_variables = typify_variables(variable_definitions)
self._derived_start = len(all_variables)
Expand Down Expand Up @@ -145,6 +161,11 @@ def __setstate__(self, d):
def typify_variables(
variable_definitions: Iterable[VariableDefinition],
) -> tuple[list[FieldVariable], list[Variable]]:

variable_types = {}
for variable_type in pm.hook.register_variable():
variable_types.update(variable_type)

primary_variables: list[FieldVariable] = []
all_variables: list[Variable] = []
only_custom = True
Expand Down Expand Up @@ -181,11 +202,11 @@ def typify_variables(
]

try:
variable_class = VARIABLE_CLASSES[variable_type]
variable_class = variable_types[variable_type]
except KeyError:
raise KeyError(
"Field type %s not valid. Valid types include %s"
% (definition["type"], ", ".join(VARIABLE_CLASSES))
% (definition["type"], ", ".join(variable_types))
)

variable_object = variable_class(definition)
Expand Down
9 changes: 9 additions & 0 deletions dedupe/hookspecs.py
@@ -0,0 +1,9 @@
import pluggy

# Decorator markers for the "dedupe" plugin project: ``hookimpl`` tags hook
# implementations in plugin modules, ``hookspec`` tags hook specifications
# such as the one declared below.
hookimpl = pluggy.HookimplMarker("dedupe")
hookspec = pluggy.HookspecMarker("dedupe")


@hookspec
def register_variable():
    """Register a variable for use in a datamodel.

    Implementations return a dict that maps a variable's ``type`` name to
    the class implementing it, e.g. ``{ExactType.type: ExactType}``. A
    single implementation may return several entries.
    """
3 changes: 0 additions & 3 deletions dedupe/variables/__init__.py
@@ -1,3 +0,0 @@
from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)
17 changes: 7 additions & 10 deletions dedupe/variables/base.py
Expand Up @@ -3,9 +3,10 @@
from typing import TYPE_CHECKING

from dedupe import predicates
from dedupe.hookspecs import hookimpl

if TYPE_CHECKING:
from typing import Any, ClassVar, Generator, Iterable, Optional, Sequence, Type
from typing import Any, ClassVar, Iterable, Sequence, Type

from dedupe._typing import Comparator, PredicateFunction, VariableDefinition

Expand Down Expand Up @@ -47,15 +48,6 @@ def __getstate__(self) -> dict[str, Any]:

return odict

@classmethod
def all_subclasses(
    cls,
) -> Generator[tuple[Optional[str], Type["Variable"]], None, None]:
    """Yield ``(type_name, subclass)`` for every descendant of ``cls``.

    Descendants are visited depth-first: each direct subclass is yielded
    and is immediately followed by its own descendants. ``type_name`` is
    the subclass's ``type`` attribute, or ``None`` when it has none.
    """
    for child in cls.__subclasses__():
        yield getattr(child, "type", None), child
        # Delegate to the child so its whole subtree is emitted in place.
        yield from child.all_subclasses()


class DerivedType(Variable):
type = "Derived"
Expand Down Expand Up @@ -135,3 +127,8 @@ def indexPredicates(
index_predicates.append(predicate(threshold, field))

return index_predicates


@hookimpl
def register_variable():
    """Expose ``CustomType`` to dedupe's plugin manager.

    Returns a dict mapping ``CustomType.type`` to the ``CustomType`` class.
    """
    exported = (CustomType,)
    return {var_cls.type: var_cls for var_cls in exported}
6 changes: 6 additions & 0 deletions dedupe/variables/categorical_type.py
Expand Up @@ -4,6 +4,7 @@

from dedupe import predicates
from dedupe._typing import PredicateFunction, VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import DerivedType, FieldType


Expand Down Expand Up @@ -36,3 +37,8 @@ def __init__(self, definition: VariableDefinition):

def __len__(self) -> int:
return len(self.higher_vars)


@hookimpl
def register_variable():
    """Expose ``CategoricalType`` to dedupe's plugin manager.

    Returns a dict mapping ``CategoricalType.type`` to the
    ``CategoricalType`` class.
    """
    exported = (CategoricalType,)
    return {var_cls.type: var_cls for var_cls in exported}
6 changes: 6 additions & 0 deletions dedupe/variables/exact.py
@@ -1,6 +1,7 @@
from typing import Any

from dedupe import predicates
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType


Expand All @@ -14,3 +15,8 @@ def comparator(field_1: Any, field_2: Any) -> int:
return 1
else:
return 0


@hookimpl
def register_variable():
    """Expose ``ExactType`` to dedupe's plugin manager.

    Returns a dict mapping ``ExactType.type`` to the ``ExactType`` class.
    """
    exported = (ExactType,)
    return {var_cls.type: var_cls for var_cls in exported}
6 changes: 6 additions & 0 deletions dedupe/variables/exists.py
Expand Up @@ -5,6 +5,7 @@
from categorical import CategoricalComparator

from dedupe._typing import PredicateFunction, VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import DerivedType
from dedupe.variables.categorical_type import CategoricalType

Expand Down Expand Up @@ -37,3 +38,8 @@ def comparator(self, field_1: Any, field_2: Any) -> list[int]:
# This flag tells fieldDistances in dedupe.core to pass
# missing values (None) into the comparator
comparator.missing = True # type: ignore


@hookimpl
def register_variable():
    """Expose ``ExistsType`` to dedupe's plugin manager.

    Returns a dict mapping ``ExistsType.type`` to the ``ExistsType`` class.
    """
    exported = (ExistsType,)
    return {var_cls.type: var_cls for var_cls in exported}
6 changes: 6 additions & 0 deletions dedupe/variables/interaction.py
Expand Up @@ -4,6 +4,7 @@
from typing import Mapping

from dedupe._typing import VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType as FieldVariable
from dedupe.variables.base import Variable

Expand Down Expand Up @@ -77,3 +78,8 @@ def atomicInteractions(
atomic_interactions.append(field)

return atomic_interactions


@hookimpl
def register_variable():
    """Expose ``InteractionType`` to dedupe's plugin manager.

    Returns a dict mapping ``InteractionType.type`` to the
    ``InteractionType`` class.
    """
    exported = (InteractionType,)
    return {var_cls.type: var_cls for var_cls in exported}
6 changes: 6 additions & 0 deletions dedupe/variables/latlong.py
Expand Up @@ -5,6 +5,7 @@
from haversine import haversine

from dedupe import predicates
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType


Expand All @@ -16,3 +17,8 @@ class LatLongType(FieldType):
@staticmethod
def comparator(x: tuple[float, float], y: tuple[float, float]) -> float:
return sqrt(haversine(x, y))


@hookimpl
def register_variable():
    """Expose ``LatLongType`` to dedupe's plugin manager.

    Returns a dict mapping ``LatLongType.type`` to the ``LatLongType`` class.
    """
    exported = (LatLongType,)
    return {var_cls.type: var_cls for var_cls in exported}
6 changes: 6 additions & 0 deletions dedupe/variables/price.py
Expand Up @@ -3,6 +3,7 @@
import numpy

from dedupe import predicates
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType


Expand All @@ -22,3 +23,8 @@ def comparator(price_1: int | float, price_2: int | float) -> float:
return numpy.nan
else:
return abs(numpy.log10(price_1) - numpy.log10(price_2))


@hookimpl
def register_variable():
    """Expose ``PriceType`` to dedupe's plugin manager.

    Returns a dict mapping ``PriceType.type`` to the ``PriceType`` class.
    """
    exported = (PriceType,)
    return {var_cls.type: var_cls for var_cls in exported}
6 changes: 6 additions & 0 deletions dedupe/variables/set.py
Expand Up @@ -2,6 +2,7 @@

from dedupe import predicates
from dedupe._typing import VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType


Expand Down Expand Up @@ -31,3 +32,8 @@ def __init__(self, definition: VariableDefinition):
definition["corpus"] = []

self.comparator = CosineSetSimilarity(definition["corpus"]) # type: ignore[assignment]


@hookimpl
def register_variable():
    """Expose ``SetType`` to dedupe's plugin manager.

    Returns a dict mapping ``SetType.type`` to the ``SetType`` class.
    """
    exported = (SetType,)
    return {var_cls.type: var_cls for var_cls in exported}
10 changes: 10 additions & 0 deletions dedupe/variables/string.py
Expand Up @@ -6,6 +6,7 @@

from dedupe import predicates
from dedupe._typing import VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType, indexPredicates

crfEd = CRFEditDistance()
Expand Down Expand Up @@ -105,3 +106,12 @@ def __init__(self, definition: VariableDefinition):
definition["corpus"] = []

self.comparator = CosineTextSimilarity(definition["corpus"]) # type: ignore[assignment]


@hookimpl
def register_variable():
    """Expose this module's string variable types to dedupe's plugin manager.

    Returns a dict keyed by each class's ``type`` attribute, covering
    ``ShortStringType``, ``StringType`` and ``TextType`` in that order.
    """
    exported = (ShortStringType, StringType, TextType)
    return {var_cls.type: var_cls for var_cls in exported}
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "dedupe"
description = "A python library for accurate and scaleable data deduplication and entity-resolution"
version = "2.0.19"
version = "3.0.0"
readme = "README.md"
requires-python = ">=3.7"
license = {file = "LICENSE"}
Expand Down Expand Up @@ -38,6 +38,7 @@ dependencies = [
"zope.index",
"Levenshtein_search==1.4.5",
"typing_extensions",
"pluggy",
]

[project.urls]
Expand All @@ -51,7 +52,7 @@ MailingList = "https://groups.google.com/forum/#!forum/open-source-deduplication


[build-system]
requires = ["setuptools==63",
requires = ["setuptools",
"wheel",
"cython"]
build-backend = "setuptools.build_meta"
Expand Down

0 comments on commit 5c24a54

Please sign in to comment.