document matchBlocks and bump version, close #338

dedupeio · Feb 26, 2015 · 18305d8 · 18305d8
1 parent 4da149f
commit 18305d8
Show file tree

Hide file tree

Showing 5 changed files with 221 additions and 24 deletions.
diff --git a/docs/common_dedupe_methods.rst b/docs/common_dedupe_methods.rst
@@ -82,6 +82,83 @@
       > blocked_ids = deduper.blocker(data)
       > print list(blocked_ids)
       [('foo:1', 1), ..., ('bar:1', 100)]
+
+
+.. py:method::  matchBlocks(blocks, [threshold=.5])
+
+   Partitions blocked data and returns a list of clusters, where each
+   cluster is a tuple of record ids
+
+   .. code:: python
+
+       clustered_dupes = deduper.matchBlocks(blocked_data, threshold)
+
+   Keyword arguments
+
+   :param list blocks: Sequence of record blocks. Each record block
+		       is a tuple containing records to compare. Each
+		       block should contain two or more records.
+		       Along with each record, there should also be
+		       information on the blocks that cover that
+		       record.
+
+		       For example, if we have three records: 
+
+		       .. code :: python
+		           
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (2, {'name' : 'Pat', 'address' : '123 Main'})
+			  (3, {'name' : 'Sam', 'address' : '123 Main'})
+
+		       and two predicates: "Whole name" and "Whole address".
+		       These predicates will produce the following blocks:
+
+		       .. code :: python
+
+		          # Block 1 (Whole name)
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (2, {'name' : 'Pat', 'address' : '123 Main'})
+
+			  # Block 2 (Whole name)
+			  (3, {'name' : 'Sam', 'address' : '123 Main'})
+
+			  # Block 3 (Whole address)
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (2, {'name' : 'Pat', 'address' : '123 Main'})
+			  (3, {'name' : 'Sam', 'address' : '123 Main'})
+
+		       So, the blocks you feed to matchBlocks should look
+		       like this, after filtering out the singleton block.
+
+		       .. code :: python
+
+		          blocks =((
+			            ((1, {'name' : 'Pat', 'address' : '123 Main'}), set([])),
+			            ((2, {'name' : 'Pat', 'address' : '123 Main'}), set([]))
+				    ), 
+			           (
+				    ((1, {'name' : 'Pat', 'address' : '123 Main'}), set([1])),
+			            ((2, {'name' : 'Pat', 'address' : '123 Main'}), set([1])),
+			            ((3, {'name' : 'Sam', 'address' : '123 Main'}), set([]))
+				    )
+				   )
+			  deduper.matchBlocks(blocks)
+
+		       Within each block, dedupe will compare every
+		       pair of records. This is expensive. Checking to
+		       see if two sets intersect is much cheaper, and
+		       if the block coverage information for two
+		       records does intersect, that means that this
+		       pair of records has been compared in a previous
+		       block, and dedupe will skip comparing this pair
+		       of records again.
+
+   :param float threshold: Number between 0 and 1 (default is .5). We
+			   will only consider record pairs to be
+			   duplicates if their estimated
+			   duplicate likelihood is greater than the
+			   threshold.
+
+			   Lowering the number will increase recall,
+			   raising it will increase precision.
 
 
 
diff --git a/docs/common_gazetteer_methods.rst b/docs/common_gazetteer_methods.rst
@@ -68,9 +68,62 @@
 
        clustered_dupes = deduper.matchBlocks(blocked_data, threshold)
 
-   :param list blocks: Sequence of tuples of records, where each tuple
-		       is a set of records covered by a blocking
-		       predicate.
+   :param list blocks: Sequence of record blocks. Each record block
+		       is a tuple containing two sequences of records,
+		       the records from the messy data set and the
+		       records from the canonical dataset. Within each
+		       block there should be at least one record from
+		       each dataset.  Along with each record, there
+		       should also be information on the blocks that
+		       cover that record.
+
+		       For example, if we have two records from a
+		       messy dataset and one record from a canonical dataset:
+
+		       .. code :: python
+		           
+		          # Messy
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (2, {'name' : 'Sam', 'address' : '123 Main'})
+
+			  # Canonical
+			  (3, {'name' : 'Pat', 'address' : '123 Main'})
+
+		       and two predicates: "Whole name" and "Whole address".
+		       These predicates will produce the following blocks:
+
+		       .. code :: python
+
+		          # Block 1 (Whole name)
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (3, {'name' : 'Pat', 'address' : '123 Main'})
+
+			  # Block 2 (Whole name)
+			  (2, {'name' : 'Sam', 'address' : '123 Main'})
+
+			  # Block 3 (Whole address)
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (2, {'name' : 'Sam', 'address' : '123 Main'})
+			  (3, {'name' : 'Pat', 'address' : '123 Main'})
+
+
+		       So, the blocks you feed to matchBlocks should look
+		       like this, after filtering out the singleton block:
+
+		       .. code :: python
+
+		          blocks =((
+			            [((1, {'name' : 'Pat', 'address' : '123 Main'}), set([]))],
+			            [((3, {'name' : 'Pat', 'address' : '123 Main'}), set([]))]
+				    ), 
+			           (
+				    [((1, {'name' : 'Pat', 'address' : '123 Main'}), set([1])),
+				     ((2, {'name' : 'Sam', 'address' : '123 Main'}), set([]))],
+			            [((3, {'name' : 'Pat', 'address' : '123 Main'}), set([1]))]
+
+				    )
+				   )
+			  linker.matchBlocks(blocks)
 
    :param float threshold: Number between 0 and 1 (default is .5). We
 			   will only consider as duplicates record

diff --git a/docs/common_methods.rst b/docs/common_methods.rst
@@ -41,23 +41,3 @@
    if you care twice as much about recall as you do precision, set
    recall\_weight to 2.
 
-.. py:method::  matchBlocks(blocks, threshold=.5)
-
-   Partitions blocked data and returns a list of clusters, where each
-   cluster is a tuple of record ids
-
-   .. code:: python
-
-       clustered_dupes = deduper.matchBlocks(blocked_data, threshold)
-
-   Keyword arguments
-
-   ``blocks`` Sequence of tuples of records, where each tuple is a set of
-   records covered by a blocking predicate.
-
-   ``threshold`` Number between 0 and 1 (default is .5). We will only
-   consider as duplicates record pairs as duplicates if their estimated
-   duplicate likelihood is greater than the threshold.
-
-   Lowering the number will increase recall, raising it will increase
-   precision.
diff --git a/docs/common_recordlink_methods.rst b/docs/common_recordlink_methods.rst
@@ -46,3 +46,90 @@
 			   Lowering the number will increase
 			   recall, raising it will increase
 			   precision
+
+
+.. py:method::  matchBlocks(blocks, [threshold=.5])
+
+   Partitions blocked data and returns a list of clusters, where each
+   cluster is a tuple of record ids
+
+   .. code:: python
+
+       clustered_dupes = linker.matchBlocks(blocked_data, threshold)
+
+   Keyword arguments
+
+   :param list blocks: Sequence of record blocks. Each record block
+		       is a tuple containing two sequences of records,
+		       the records from the first data set and the
+		       records from the second dataset. Within each
+		       block there should be at least one record from
+		       each dataset.  Along with each record, there
+		       should also be information on the blocks that
+		       cover that record.
+
+		       For example, if we have two records from dataset
+		       A and one record from dataset B: 
+
+		       .. code :: python
+		           
+		          # Dataset A
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (2, {'name' : 'Sam', 'address' : '123 Main'})
+
+			  # Dataset B
+			  (3, {'name' : 'Pat', 'address' : '123 Main'})
+
+		       and two predicates: "Whole name" and "Whole address".
+		       These predicates will produce the following blocks:
+
+		       .. code :: python
+
+		          # Block 1 (Whole name)
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (3, {'name' : 'Pat', 'address' : '123 Main'})
+
+			  # Block 2 (Whole name)
+			  (2, {'name' : 'Sam', 'address' : '123 Main'})
+
+			  # Block 3 (Whole address)
+		          (1, {'name' : 'Pat', 'address' : '123 Main'})
+			  (2, {'name' : 'Sam', 'address' : '123 Main'})
+			  (3, {'name' : 'Pat', 'address' : '123 Main'})
+
+
+		       So, the blocks you feed to matchBlocks should look
+		       like this, after filtering out the singleton block:
+
+		       .. code :: python
+
+		          blocks =((
+			            [((1, {'name' : 'Pat', 'address' : '123 Main'}), set([]))],
+			            [((3, {'name' : 'Pat', 'address' : '123 Main'}), set([]))]
+				    ), 
+			           (
+				    [((1, {'name' : 'Pat', 'address' : '123 Main'}), set([1])),
+				     ((2, {'name' : 'Sam', 'address' : '123 Main'}), set([]))],
+			            [((3, {'name' : 'Pat', 'address' : '123 Main'}), set([1]))]
+
+				    )
+				   )
+			  linker.matchBlocks(blocks)
+
+		       Within each block, dedupe will compare every
+		       pair of records. This is expensive. Checking to
+		       see if two sets intersect is much cheaper, and
+		       if the block coverage information for two
+		       records does intersect, that means that this
+		       pair of records has been compared in a previous
+		       block, and dedupe will skip comparing this pair
+		       of records again.
+
+   :param float threshold: Number between 0 and 1 (default is .5). We
+			   will only consider record pairs to be
+			   duplicates if their estimated
+			   duplicate likelihood is greater than the
+			   threshold.
+
+			   Lowering the number will increase recall,
+			   raising it will increase precision.
+
+
diff --git a/setup.py b/setup.py
@@ -34,7 +34,7 @@
 setup(
     name='dedupe',
     url='https://github.com/datamade/dedupe',
-    version='0.7.7.1.1',
+    version='0.7.7.1.2',
     description='A python library for accurate and scaleable data deduplication and entity-resolution',
     packages=['dedupe', 'dedupe.variables'],
     ext_modules=[Extension('dedupe.cpredicates', ['src/cpredicates.c'])],