Merge pull request #5 from kabirkhan/master

Link records across datasets with prodigy and dedupe.io

Showing 10 changed files with 4,710 additions and 0 deletions.
@@ -0,0 +1,37 @@

# dedupe.io with Prodi.gy

This is a custom recipe for linking records across multiple datasets using the Python [dedupe](https://github.com/dedupeio/dedupe) library.
See https://github.com/dedupeio/dedupe-examples/tree/master/record_linkage_example for an example of linking records with dedupe's console labeler to compare.

## Usage

Install Prodi.gy.

Once Prodigy is installed, you should be able to run the `prodigy` command from
your terminal, either directly or via `python -m`:
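
For example, you can check the install with the built-in `stats` command (any Prodigy command works here; `stats` is just a convenient smoke test):

```bash
prodigy stats
# or, equivalently:
python -m prodigy stats
```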

Install the requirements:

```bash
pip install -r requirements.txt
```

Run with the example datasets:

```bash
python -m prodigy records.link my_dataset --left data/raw_dedupe_abtbuy_abt.csv --right data/raw_dedupe_abtbuy_buy.csv --fields fields.json -F ./link_records.py
```
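
Note that `fields.json` (included below) may use a bracketed `"corpus"` value such as `"[descriptions]"`. This is not a literal corpus but a placeholder that the recipe resolves to a local generator of the same name. A minimal, self-contained sketch of that resolution, using a hypothetical one-item corpus:

```python
# descriptions() here is a hypothetical stand-in for the recipe's generator.
def descriptions():
    yield "sony cdr discs 50 pack"

field = {"field": "description", "type": "Text", "corpus": "[descriptions]"}
func_name = field["corpus"][1:-1]        # "[descriptions]" -> "descriptions"
field["corpus"] = locals()[func_name]()  # swap placeholder for the generator
```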

---
### Annotating

![annotation interface](img/link_records_example.jpg)

In the interface, a row is highlighted green if the field is an exact string match across both datasets; otherwise the row is highlighted red.

If you think the records are duplicates, as they are in the image above, accept; otherwise reject.

When you click the save button, your progress will be updated.

To reach 100% progress, the dedupe library recommends collecting at least 10 positive and 10 negative examples.
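
That 10/10 target comes from the recipe's `get_progress` callback in `link_records.py` (shown below), which averages two capped ratios. A minimal sketch of the formula:

```python
def progress(n_match, n_distinct):
    # Each label class contributes up to 50%, saturating at 10 examples,
    # mirroring get_progress in link_records.py.
    return min(1, min(1, n_match / 10) / 2 + min(1, n_distinct / 10) / 2)

assert progress(10, 10) == 1.0   # 10 positive + 10 negative -> 100%
assert progress(5, 0) == 0.25    # halfway on one class -> 25%
```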

---
### Model training

Once you end the annotation session, a model is batch trained and applied to the rest of your dataset. The recipe writes the records the model thinks should be conflated to a file named `data_matching_output.csv` and saves a copy of the learned dedupe settings to `data_matching_learned_settings`.
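
Based on the writer in `link_records.py`, each output row gains three leading columns: `Cluster ID`, `Link Score` and `source file`. A quick way to inspect the result (the data values shown are hypothetical):

```bash
head -n 2 data_matching_output.csv
# Cluster ID,Link Score,source file,<original columns...>
# 0,0.97,0,<original values...>
```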
contrib/dedupe/link_records/data/raw_dedupe_abtbuy_abt.csv (1,082 additions & 0 deletions; large diff not rendered)
contrib/dedupe/link_records/data/raw_dedupe_abtbuy_buy.csv (1,093 additions & 0 deletions; large diff not rendered)
Binary file not shown.
contrib/dedupe/link_records/data_matching_output.csv (2,174 additions & 0 deletions; large diff not rendered)
@@ -0,0 +1,6 @@

[
    {"field": "title", "type": "String"},
    {"field": "title", "type": "Text", "corpus": "[descriptions]"},
    {"field": "description", "type": "Text", "has missing": true, "corpus": "[descriptions]"},
    {"field": "price", "type": "Price", "has missing": true}
]
@@ -0,0 +1,282 @@

# coding: utf8
from __future__ import unicode_literals

import json
import csv
import re

import dedupe
from unidecode import unidecode
import prodigy
from prodigy.components.db import connect


def unique(seq):
    """Order-preserving de-duplication of an iterable."""
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]


def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.
    """

    column = unidecode(column)
    column = re.sub('\n', ' ', column)
    column = re.sub('-', '', column)
    column = re.sub('/', ' ', column)
    column = re.sub("'", '', column)
    column = re.sub(",", '', column)
    column = re.sub(":", ' ', column)
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column

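# For example (hypothetical input), preProcess normalizes case,
# punctuation and whitespace:
#     preProcess('Sony CD-R Discs,\n50/Pack') -> 'sony cdr discs 50 pack'
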

def readData(filename):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID.
    """

    data_d = {}

    with open(filename) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            clean_row = dict([(k, preProcess(v)) for (k, v) in row.items()])
            if clean_row['price']:
                # Strip the leading currency symbol before parsing.
                clean_row['price'] = float(clean_row['price'][1:])
            data_d[filename + str(i)] = dict(clean_row)

    return data_d

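# Note: the keys combine the source filename with the row index
# (e.g. 'data/raw_dedupe_abtbuy_abt.csv0'); the output writer in on_exit
# below relies on this scheme to look up cluster membership. With
# hypothetical values:
#     readData('data/raw_dedupe_abtbuy_abt.csv')['data/raw_dedupe_abtbuy_abt.csv0']
#     # -> {'title': 'sony cdr discs 50 pack', 'price': 114.0, ...}
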

def record_pairs_stream(linker):  # pragma: no cover
    '''
    Stream the linker's most uncertain record pairs as Prodigy tasks
    for the user to label.
    Argument :
        A dedupe RecordLink object
    '''

    fields = unique(
        field.field
        for field in linker.data_model.primary_fields
    )

    uncertain_pairs = []

    while True:
        if not uncertain_pairs:
            uncertain_pairs = linker.uncertainPairs()

        try:
            a, b = uncertain_pairs.pop()
            stream = []

            # One row per primary field, flagging exact matches so the
            # HTML template can color-code them.
            for field_name in list(a.keys()):
                if field_name in fields:
                    exact_match = a[field_name] == b[field_name]
                    stream.append({
                        'name': field_name,
                        'a_value': a[field_name],
                        'b_value': b[field_name],
                        'exact_match': exact_match,
                        'not_exact_match': not exact_match
                    })
            yield {'fields': stream}
        except IndexError:
            break

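# Each yielded task feeds the mustache sections in record_pairs.html,
# e.g. (hypothetical values):
#     {'fields': [{'name': 'title',
#                  'a_value': 'sony cdr discs', 'b_value': 'sony cdr discs',
#                  'exact_match': True, 'not_exact_match': False}]}
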

def update_linker(linker, examples):
    labeled_pairs = {'distinct': [], 'match': []}

    for e in examples:
        record_a = {}
        record_b = {}
        for field in e['fields']:
            record_a[field['name']] = field['a_value']
            record_b[field['name']] = field['b_value']

        record_pair = (record_a, record_b)

        if e['answer'] == 'accept':
            labeled_pairs['match'].append(record_pair)
        elif e['answer'] == 'reject':
            labeled_pairs['distinct'].append(record_pair)

    linker.markPairs(labeled_pairs)
    return linker


def validate_field(field):
    assert 'field' in field
    assert 'type' in field


# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe('records.link',
                dataset=("The dataset to use", "positional", None, str),
                left_record_file_path=("One of two files to dedupe and conflate across. Will be on the left in the annotation UI", "option", "left", str),
                right_record_file_path=("One of two files to dedupe and conflate across. Will be on the right in the annotation UI", "option", "right", str),
                fields_json_file_path=("The path to a JSON config file for the dedupe fields", "option", "fields", str))
def link_records(dataset, left_record_file_path, right_record_file_path, fields_json_file_path):
    """
    Collect the best possible training data for linking records across
    multiple datasets. This recipe is an example of linking records across
    2 CSV files using the dedupe library.
    """

    db = connect()  # uses the settings in your prodigy.json

    output_file = 'data_matching_output.csv'
    settings_file = 'data_matching_learned_settings'
    training_file = 'data_matching_training.json'

    left_records = readData(left_record_file_path)
    right_records = readData(right_record_file_path)

    def descriptions():
        for source in (left_records, right_records):
            for record in source.values():
                yield record['description']

    with open(fields_json_file_path) as fields_json_file:
        fields = json.load(fields_json_file)

    for field in fields:
        validate_field(field)
        if field['type'] == 'Text' and 'corpus' in field:
            # Resolve a "[name]" corpus placeholder to the local generator
            # of that name, e.g. "[descriptions]" -> descriptions().
            func_name = field['corpus'][1:-1]
            field['corpus'] = locals()[func_name].__call__()

    print('LEN RECORDS:', len(left_records), len(right_records))
    print('MIN SAMPLE:', round(min(len(left_records), len(right_records)) / 2))
    print(fields)

    linker = dedupe.RecordLink(fields)
    # To train the linker, we feed it a sample of records.
    linker.sample(
        left_records,
        right_records,
        round(min(len(left_records) / 2, len(right_records) / 2))
    )

    print('getting examples')
    # If we have training data saved from a previous run of this recipe,
    # load it and mark the pairs with the linker.
    examples = db.get_dataset(dataset)
    if len(examples) > 0:
        linker = update_linker(linker, examples)

    def update(examples, linker=linker):
        print(len(examples))
        linker = update_linker(linker, examples)

    def get_progress(session=0, total=0, loss=0, linker=linker):
        n_match = len(linker.training_pairs['match'])
        n_distinct = len(linker.training_pairs['distinct'])
        # Each class contributes up to 50% of progress, saturating at
        # 10 annotated examples.
        n_match_progress = min(1, (n_match / 10)) / 2
        n_distinct_progress = min(1, (n_distinct / 10)) / 2
        print("Examples Annotated: {0}/10 positive, {1}/10 negative".format(n_match, n_distinct))
        print(n_match_progress, n_distinct_progress)
        progress = min(1, n_match_progress + n_distinct_progress)
        return progress

    def on_exit(controller, linker=linker):
        linker.train()
        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

        print('clustering...')
        linked_records = linker.match(left_records, right_records, 0)
        print('# duplicate sets', len(linked_records))

        # ## Writing Results

        # Write our original data back out to a CSV with a new column called
        # 'Cluster ID' which indicates which records refer to each other.

        cluster_membership = {}
        cluster_id = None
        for cluster_id, (cluster, score) in enumerate(linked_records):
            for record_id in cluster:
                cluster_membership[record_id] = (cluster_id, score)

        # Continue numbering unmatched records after the last cluster ID.
        # (Checking `is not None` avoids reusing ID 0 when exactly one
        # cluster was found.)
        if cluster_id is not None:
            unique_id = cluster_id + 1
        else:
            unique_id = 0

        with open(output_file, 'w') as f:
            writer = csv.writer(f)

            header_unwritten = True

            for fileno, filename in enumerate((left_record_file_path, right_record_file_path)):
                with open(filename) as f_input:
                    reader = csv.reader(f_input)

                    if header_unwritten:
                        heading_row = next(reader)
                        heading_row.insert(0, 'source file')
                        heading_row.insert(0, 'Link Score')
                        heading_row.insert(0, 'Cluster ID')
                        writer.writerow(heading_row)
                        header_unwritten = False
                    else:
                        next(reader)

                    for row_id, row in enumerate(reader):
                        # Keys into cluster_membership mirror the IDs
                        # assigned in readData: filename + row index.
                        cluster_details = cluster_membership.get(filename + str(row_id))
                        if cluster_details is None:
                            # Unmatched record: give it its own cluster ID.
                            cluster_id = unique_id
                            unique_id += 1
                            score = None
                        else:
                            cluster_id, score = cluster_details
                        row.insert(0, fileno)
                        row.insert(0, score)
                        row.insert(0, cluster_id)
                        writer.writerow(row)

        print(cluster_membership)

    stream = record_pairs_stream(linker)

    with open('./record_pairs.html') as template_file:
        html_template = template_file.read()

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'progress': get_progress,
        'on_exit': on_exit,
        'config': {
            'html_template': html_template
        }
    }
@@ -0,0 +1,34 @@

<style>
  table {
    border-collapse: collapse;
  }
  table, td, th {
    border: 1px solid gray;
    vertical-align: top;
  }
  .title {
    text-align: left;
    line-height: 0;
  }
</style>
<h2 class='title'>Link Duplicate Records</h2>
<table>
  <tr>
    <th>Field</th>
    <th>Record A</th>
    <th>Record B</th>
  </tr>
  {{#fields}}
  <tr>
    <td>{{name}}</td>
    {{#exact_match}}
    <td style="background: #DFF0D8">{{a_value}}</td>
    <td style="background: #DFF0D8">{{b_value}}</td>
    {{/exact_match}}
    {{#not_exact_match}}
    <td style="background: #F2DEDE">{{a_value}}</td>
    <td style="background: #F2DEDE">{{b_value}}</td>
    {{/not_exact_match}}
  </tr>
  {{/fields}}
</table>
@@ -0,0 +1,2 @@

dedupe==1.9.4
unidecode==1.0.23