### Dedupe Example with NC Voter Data

In [1]:
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

In [2]:
from future.utils import viewitems

import csv
import collections
import itertools

In [3]:
def preProcess(column):
    """
    Normalise one raw CSV field before it is handed to dedupe.

    The value is transliterated to ASCII with Unidecode, newlines are
    turned into spaces, runs of spaces are collapsed to a single space,
    surrounding whitespace and quote characters are stripped, and the
    result is lower-cased.

    :param column: the raw field text, or ``None`` when the field was
        absent from the row.
    :return: the cleaned text, or ``None`` when the field is missing or
        empty after cleaning.
    """
    # csv.DictReader fills fields that are absent from a short row with
    # None (its default `restval`); treat that as missing data instead of
    # crashing inside unidecode().
    if column is None:
        return None

    column = unidecode(column)
    # Replace newlines *before* collapsing spaces; otherwise "a \nb" would
    # come out as "a  b" with the double space left in place.
    column = re.sub('\n', ' ', column)
    column = re.sub('  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column

In [4]:
def readData(filename):
    """
    Read records from a CSV file into a dictionary keyed by record ID.

    Every field of every row is passed through ``preProcess``.  The key of
    each record is the integer value of the row's raw ``Id`` column.

    :param filename: path of the CSV file; it must have a header row that
        includes an ``Id`` column.
    :return: dict mapping ``int(Id)`` to a dict of ``{column name: cleaned
        value}`` for that row.
    :raises KeyError: if the file has no ``Id`` column.
    :raises ValueError: if an ``Id`` value cannot be converted to ``int``.
    """

    data_d = {}
    # newline='' is what the csv module documentation asks for: it lets the
    # reader do its own newline handling so quoted fields that contain line
    # breaks are parsed correctly.
    with open(filename, newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = {k: preProcess(v) for (k, v) in row.items()}
            # The key is taken from the raw (uncleaned) Id text.
            row_id = int(row['Id'])
            data_d[row_id] = clean_row

    return data_d

In [6]:
# ## Setup

# Directory that holds the input CSV and receives the output CSV.
# Set the DEDUPE_DATA_DIR environment variable to run the notebook on
# another machine; without it the original location is used unchanged.
# os.path.join(..., '') guarantees a trailing separator, because later
# cells build paths with plain `file_path + name` concatenation.
file_path = os.path.join(
    os.environ.get(
        'DEDUPE_DATA_DIR',
        '/Users/gizelleguerra/Documents/fedex_cip/dedupe_NC_exp1/',
    ),
    '',
)
input_file = 'combined_clean.csv'
output_file = 'NCdd1_output.csv'
# NOTE: the settings and training files are bare names, so they are
# resolved against the current working directory, not `file_path`.
settings_file = 'NCdd1_learned_settings'
training_file = 'NCdd1_training.json'

print('importing data ...')
data_d = readData(file_path + input_file)

importing data ...


In [7]:
# ## Load or train the deduper
#
# This cell leaves a ready-to-use `deduper` in the namespace, either by
# loading previously saved settings or by running an interactive training
# session and saving its results.

# If a settings file already exists, we'll just load that and skip training
# (delete `settings_file` to force retraining).
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # ## Training

    # Define the fields dedupe will pay attention to.
    # Every field is flagged 'has missing' because preProcess() turns
    # empty values into None.
    fields = [
        {'field': 'givenname', 'type': 'String', 'has missing': True},
        {'field': 'surname', 'type': 'String', 'has missing': True},
        {'field': 'suburb', 'type': 'String', 'has missing': True},
        {'field': 'postcode', 'type': 'String', 'has missing': True},
        ]

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    # NOTE(review): this call blocks on console input, so the cell cannot
    # run unattended unless the settings file already exists.
    print('starting active labeling...')

    dedupe.console_label(deduper)

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    deduper.train()

    # When finished, save our training to disk
    # (text mode here; the settings file below is written in binary mode).
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

INFO:dedupe.canopy_index:Removing stop word ma
INFO:dedupe.canopy_index:Removing stop word er
INFO:dedupe.canopy_index:Removing stop word ar
INFO:dedupe.canopy_index:Removing stop word on
INFO:dedupe.canopy_index:Removing stop word an
INFO:dedupe.canopy_index:Removing stop word ll
INFO:dedupe.canopy_index:Removing stop word le
INFO:dedupe.canopy_index:Removing stop word en
INFO:dedupe.canopy_index:Removing stop word el
INFO:dedupe.canopy_index:Removing stop word re
INFO:dedupe.canopy_index:Removing stop word in
INFO:dedupe.canopy_index:Removing stop word ar
INFO:dedupe.canopy_index:Removing stop word an
INFO:dedupe.canopy_index:Removing stop word ha
INFO:dedupe.canopy_index:Removing stop word ri
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfNGramCanopyPredicate: (0.8, surname)
givenname : josephine
surname : hinson
suburb : hillsborough
postcode : 27278

givenname : joel
surname : hinson
suburb : salisbury
postcode : 28146

0/10 positive, 0/10 negative
Do these re

starting active labeling...


 n


givenname : robert
surname : cox
suburb : graham
postcode : 27253

givenname : robert
surname : holcombe
suburb : clemmons
postcode : 27012

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : john
surname : webster
suburb : pittsboro
postcode : 27312

givenname : john
surname : reyes
suburb : pittsboro
postcode : 27312

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : kenneth
surname : covington
suburb : raleigh
postcode : 27603

givenname : kenneth
surname : white
suburb : raleigh
postcode : 27613

0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : tremont
surname : taylor
suburb : kittrell
postcode : 27544

givenname : geraldine
surname : taylor
suburb : kittrell
postcode : 27544

0/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : marcus
surname : chavis
suburb : lumberton
postcode : 28360

givenname : donald
surname : chavis
suburb : lumberton
postcode : 28360

0/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : keith
surname : williams
suburb : elizabethcity
postcode : 27909

givenname : keith
surname : williams
suburb : charlotte
postcode : 28208

0/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : kayla
surname : williams
suburb : jacksonville
postcode : 28540

givenname : kay
surname : williams
suburb : greensboro
postcode : 27407

0/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : shanea
surname : brown
suburb : greensaboro
postcode : 27405

givenname : charles
surname : brown
suburb : greensboro
postcode : 27409

0/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : karen
surname : johnson
suburb : denver
postcode : 28037

givenname : darin
surname : johnson
suburb : bessemercity
postcode : 28016

0/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : terrence
surname : williams
suburb : kinston
postcode : 28504

givenname : lawrence
surname : williams
suburb : windsor
postcode : 27983

0/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : blaine
surname : hill
suburb : greensboro
postcode : 27406

givenname : chyami
surname : hill
suburb : greensboro
postcode : 27405

0/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : myra
surname : miller
suburb : huntersville
postcode : 28078

givenname : mary
surname : miller
suburb : newbern
postcode : 28562

0/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : ashton
surname : cole
suburb : kernersville
postcode : 27284

givenname : ashlee
surname : cole
suburb : fayetteville
postcode : 28301

0/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : lia
surname : jones
suburb : hillsborough
postcode : 2778

givenname : michelle
surname : jones
suburb : greensboro
postcode : 27407

0/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : james
surname : brown
suburb : sylva
postcode : 28779

givenname : madison
surname : brown
suburb : chinagrove
postcode : 28023

0/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : christopher
surname : evans
suburb : laurinburg
postcode : 28352

givenname : nina
surname : evans
suburb : ash
postcode : 28420

0/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : joanne
surname : powell
suburb : rockymount
postcode : 27804

givenname : johnny
surname : powell
suburb : columbia
postcode : 27925

0/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : tracie
surname : parker
suburb : wallace
postcode : 28466

givenname : crystal
surname : parker
suburb : dallas
postcode : 28034

0/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : vicki
surname : pappas
suburb : durham
postcode : 27701

givenname : lisa
surname : pappas
suburb : fuquayvarina
postcode : 27526

0/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : johnnie
surname : moore
suburb : pikeville
postcode : 27863

givenname : stanley
surname : moore
suburb : asheville
postcode : 28805

0/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : pollie
surname : williams
suburb : washington
postcode : 27889

givenname : michael
surname : williams
suburb : burlington
postcode : 27215

0/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : samuel
surname : campbell
suburb : graham
postcode : 27253

givenname : carol
surname : campbell
suburb : franklinville
postcode : 27248

0/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : james
surname : ingram
suburb : rockingham
postcode : 28379

givenname : margaret
surname : ingram
suburb : boone
postcode : 28607

0/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : kay
surname : miller
suburb : mocksville
postcode : 27028

givenname : frederick
surname : miller
suburb : council
postcode : 28434

0/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : sanrra
surname : jackson
suburb : kernersville
postcode : 27286

givenname : ray
surname : jackson
suburb : raleigh
postcode : 27617

0/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : william
surname : brown
suburb : gastonia
postcode : 28054

givenname : troy
surname : brown
suburb : winstonsalem
postcode : 27105

0/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : lakeisha
surname : johnson
suburb : elizabethcity
postcode : 27909

givenname : ned
surname : johnson
suburb : supply
postcode : 28462

0/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


givenname : leroy
surname : davis
suburb : newbernl
postcode : 2856

givenname : tyrone
surname : davis
suburb : cary
postcode : 27519

0/10 positive, 28/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.0
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (suffixArray, surname), TfidfTextCanopyPredicate: (0.2, givenname), SimplePredicate: (oneGramFingerprint, suburb))


In [8]:
# ## Clustering

# Ask the trained deduper to group the records: each element of the
# result pairs a tuple of record IDs with their confidence scores
# (that is how the results-writing cell below unpacks it).

print('clustering...')
clustered_dupes = deduper.partition(data_d, threshold=0.5)

print('# duplicate sets', len(clustered_dupes))

clustering...


INFO:dedupe.blocking:10000, 0.3757322 seconds
INFO:dedupe.blocking:20000, 0.6839332 seconds
INFO:dedupe.blocking:30000, 0.9808442 seconds
INFO:dedupe.blocking:40000, 1.2603442 seconds
INFO:dedupe.blocking:50000, 1.5343122 seconds
INFO:dedupe.blocking:60000, 1.8140642 seconds
INFO:dedupe.blocking:70000, 2.0856432 seconds
INFO:dedupe.blocking:80000, 2.3533172 seconds
INFO:dedupe.blocking:90000, 2.6198722 seconds
INFO:dedupe.blocking:100000, 3.0630152 seconds
INFO:dedupe.blocking:110000, 3.7005772 seconds
INFO:dedupe.blocking:120000, 4.0975862 seconds
INFO:dedupe.blocking:130000, 4.3631582 seconds
INFO:dedupe.blocking:140000, 4.6114852 seconds
INFO:dedupe.blocking:150000, 4.8611612 seconds
INFO:dedupe.blocking:160000, 5.1058702 seconds
INFO:dedupe.blocking:170000, 5.3601942 seconds
INFO:dedupe.blocking:180000, 5.6023422 seconds
INFO:dedupe.blocking:190000, 5.8457942 seconds
INFO:dedupe.blocking:200000, 6.0885082 seconds
INFO:dedupe.blocking:210000, 6.3388992 seconds
INFO:dedupe.blocking:2

# duplicate sets 1000000


In [9]:
# ## Writing Results

# Write our original data back out to a CSV with two new leading columns:
# 'Cluster ID', which indicates which records refer to each other, and
# 'confidence_score', the score reported for the record in its cluster.

# Map every record ID to the cluster it was assigned to.
cluster_membership = {}
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        cluster_membership[record_id] = {
            "Cluster ID": cluster_id,
            "confidence_score": score
        }

out_f = file_path + output_file
in_f = file_path + input_file

# Both files are opened with newline='' as the csv module documentation
# requires: on output it stops platforms that translate '\n' from adding
# an extra '\r' to every row, and on input it keeps quoted fields that
# contain line breaks intact.
with open(out_f, 'w', newline='') as f_output, \
        open(in_f, newline='') as f_input:

    reader = csv.DictReader(f_input)
    fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

    writer = csv.DictWriter(f_output, fieldnames=fieldnames)
    writer.writeheader()

    # The rows are re-read from the input file, so the output carries the
    # original (un-preprocessed) field values.
    for row in reader:
        row_id = int(row['Id'])
        row.update(cluster_membership[row_id])
        writer.writerow(row)

### Evaluation

In [19]:
def evaluateDuplicates(found_dupes, true_dupes):
    """
    Print the number of found duplicate pairs, the precision and the recall.

    :param found_dupes: set of duplicate pairs produced by the deduper.
    :param true_dupes: set of duplicate pairs from the reference labelling.
    :return: None; the three metrics are written to stdout, each preceded
        by a label line.

    Precision is printed as ``nan`` when ``found_dupes`` is empty and
    recall as ``nan`` when ``true_dupes`` is empty, because the metric is
    undefined there (previously this raised ``ZeroDivisionError``).
    """
    true_positives = found_dupes.intersection(true_dupes)
    false_positives = found_dupes.difference(true_dupes)

    print('found duplicate')
    print(len(found_dupes))

    print('precision')
    if found_dupes:
        # 1 - FP/found is the same quantity as TP/found.
        print(1 - len(false_positives) / float(len(found_dupes)))
    else:
        print(float('nan'))

    print('recall')
    if true_dupes:
        print(len(true_positives) / float(len(true_dupes)))
    else:
        print(float('nan'))

In [20]:
def dupePairs(filename, rowname):
    """
    Build the set of duplicate pairs implied by a cluster column of a CSV.

    Rows are grouped by the value found in column ``rowname``; every
    unordered pair of ``Id`` values that share a group is returned.

    :param filename: path of a CSV file with a header row containing an
        ``Id`` column and the ``rowname`` column.
    :param rowname: name of the column that holds the cluster label.
    :return: set of ``frozenset`` pairs of ``Id`` strings (the Ids are kept
        as the text read from the file, not converted to int).
    """
    dupe_d = collections.defaultdict(list)

    # newline='' is what the csv module documentation asks for so that
    # quoted fields containing line breaks are parsed correctly.
    with open(filename, newline='') as f:
        reader = csv.DictReader(f, delimiter=',', quotechar='"')
        for row in reader:
            dupe_d[row[rowname]].append(row['Id'])

    # Rows labelled 'x' are dropped as a group — presumably 'x' marks
    # records with no known cluster; confirm against the data.
    dupe_d.pop('x', None)

    dupe_s = set()
    # Only the grouped Ids matter here, so iterate over the values directly;
    # combinations() yields nothing for groups with fewer than two Ids.
    for cluster in dupe_d.values():
        for pair in itertools.combinations(cluster, 2):
            dupe_s.add(frozenset(pair))

    return dupe_s

In [21]:
# Files compared by the evaluation: the deduper's clustered output and the
# reference file carrying the true cluster ids.
# NOTE(review): these are the csv_example file names, not the
# `output_file` written earlier in this notebook — confirm that this is
# the intended pair of files to score.
dedupe_clusters = 'csv_example_output.csv'
manual_clusters = 'csv_example_input_with_true_ids.csv'

dedupe_clusters_fp = file_path + dedupe_clusters
manual_clusters_fp = file_path + manual_clusters

In [22]:
# Turn each file's cluster column into a set of duplicate Id pairs:
# 'Cluster ID' for the deduper output, 'True Id' for the reference file.
test_dupes = dupePairs(dedupe_clusters_fp, 'Cluster ID')
true_dupes = dupePairs(manual_clusters_fp, 'True Id')

In [23]:
# Score the deduper's pairs against the reference pairs.
evaluateDuplicates(found_dupes=test_dupes, true_dupes=true_dupes)

found duplicate
3087
precision
0.98607061872368
recall
0.46065375302663436
