In [1]:
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

In [17]:
from future.utils import viewitems

import csv
import collections
import itertools

### Dedupe Example
source: https://github.com/gizelleguerra/dedupe-examples/tree/master/csv_example

In [2]:
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.
    """
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column

In [3]:
def readData(filename):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID and each value is dict
    """

    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d

In [None]:
if __name__ == '__main__':

    # ## Logging

    # Dedupe uses Python logging to show or suppress verbose output. This
    # code block lets you change the level of loggin on the command
    # line. You don't need it if you don't want that. To enable verbose
    # logging, run `python examples/csv_example/csv_example.py -v`
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    # ## Setup

    input_file = 'csv_example_messy_input.csv'
    output_file = 'csv_example_output.csv'
    settings_file = 'csv_example_learned_settings'
    training_file = 'csv_example_training.json'

    print('importing data ...')
    data_d = readData(input_file)

    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training

        # Define the fields dedupe will pay attention to
        fields = [
            {'field': 'Site name', 'type': 'String'},
            {'field': 'Address', 'type': 'String'},
            {'field': 'Zip', 'type': 'Exact', 'has missing': True},
            {'field': 'Phone', 'type': 'String', 'has missing': True},
            ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.prepare_training(data_d, f)
        else:
            deduper.prepare_training(data_d)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.console_label(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.write_training(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.write_settings(sf)

    # ## Clustering

    # `partition` will return sets of records that dedupe
    # believes are all referring to the same entity.

    print('clustering...')
    clustered_dupes = deduper.partition(data_d, 0.5)

    print('# duplicate sets', len(clustered_dupes))

    # ## Writing Results

    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other.

    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score
            }

    with open(output_file, 'w') as f_output, open(input_file) as f_input:

        reader = csv.DictReader(f_input)
        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            row_id = int(row['Id'])
            row.update(cluster_membership[row_id])
            writer.writerow(row)


In [4]:
# ## Logging

# Dedupe uses Python logging to show or suppress verbose output. This
# code block lets you change the level of loggin on the command
# line. You don't need it if you don't want that. To enable verbose
# logging, run `python examples/csv_example/csv_example.py -v`
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
log_level = logging.WARNING
if opts.verbose:
    if opts.verbose == 1:
        log_level = logging.INFO
    elif opts.verbose >= 2:
        log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)

Usage: ipykernel_launcher.py [options]

ipykernel_launcher.py: error: no such option: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [9]:
# ## Setup
file_path = '/Users/gizelleguerra/Documents/fedex_cip/csv_example/'
input_file = 'csv_example_messy_input.csv'
output_file = 'csv_example_output.csv'
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'

print('importing data ...')
data_d = readData(file_path + input_file)

importing data ...


In [10]:
type(data_d)

dict

In [11]:
# If a settings file already exists, we'll just load that and skip training
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # ## Training

    # Define the fields dedupe will pay attention to
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
        ]

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    dedupe.console_label(deduper)

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    deduper.train()

    # When finished, save our training to disk
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

INFO:dedupe.canopy_index:Removing stop word  s
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, Site name)


starting active labeling...


Site name : chicago youth centers - rachel's learning center #1
Address : 3430 w roosevelt rd
Zip : None
Phone : None

Site name : chicago youth centers rachel's 1
Address : 3430 w roosevelt rd
Zip : 60624
Phone : 5331837

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


 y


Site name : trinity united church of christ - trinity united
Address : 532 w 95th st
Zip : None
Phone : 4883511

Site name : trinity united church of christ trinity ucc
Address : 532 w 95th st
Zip : 60628
Phone : None

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, Site name)
INFO:dedupe.training:SimplePredicate: (alphaNumericPredicate, Site name)
Site name : home of life community dev. corp. home of life just for you (773)-626-8655
Address : 4647 w. washington
Zip : 60644
Phone : None

Site name : home of life
Address : 4647 w washington boulevard
Zip : 60644
Phone : 6268655

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, Address)
Site name : henry booth house young achievers academy
Address : 520 e 79th street
Zip : 60619
Phone : None

Site name : henry booth house young achievers academy
Address : 520 e 79th st.
Zip : 60619
Phone : None

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, Address)
Site name : hearst
Address : 4340 s. lamon
Zip : None
Phone : 5352376

Site name : hearst
Address : 4640 s. lamon
Zip : 60638
Phone : 5352376

4/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


Site name : our lady of tepeyac
Address : 2414 south albany avenue
Zip : 60623
Phone : 2775888

Site name : our lady of tepeyac early childhood center
Address : 2414 s albany avenue
Zip : 60623
Phone : 2775888

5/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, Address)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, Site name)
Site name : chinese american service league chinese american service league child dev ctr
Address : 2141 s tan court
Zip : 60616
Phone : 7910454

Site name : chinese american service league
Address : 2141 south tan court 1st floor
Zip : 60616
Phone : 7910454

6/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfTextCanopyPredicate: (0.6, Address)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, Site name)
Site name : christopher house uptown
Address : 4701 n winthrop
Zip : 60640
Phone : 7694540

Site name : christopher house uptown i/t
Address : 4701 n. winthorp
Zip : 60640
Phone : 7694540

7/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


Site name : el hogar del nino/cuidar - maria freeman
Address : 4312 s california ave
Zip : None
Phone : None

Site name : el hogar del nino/cuidar - el hogar del nino/cuidar- california
Address : 2325 s california ave
Zip : None
Phone : 5231629

8/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameFiveCharStartPredicate, Address)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, Site name)
Site name : marillac social center
Address : 212 south francisco
Zip : 60612
Phone : 7227440

Site name : marillac social center supportive services
Address : 212 s francisco avenue
Zip : 60612
Phone : 7227440

8/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


Site name : centers for new horizons - altgeld gardens ii early learning center
Address : 939 e 132nd street
Zip : 60827
Phone : 4686033

Site name : centers for new horizons - altgeld
Address : 941 e 132nd street
Zip : 60627
Phone : 4683055

9/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfTextCanopyPredicate: (0.6, Address)
INFO:dedupe.training:LevenshteinCanopyPredicate: (2, Address)
Site name : uic children's cent ii west
Address : 1919 w. taylor
Zip : 60612
Phone : 4135326

Site name : uic children's center
Address : 1919 west taylor street (mc 525) room 128
Zip : 60612
Phone : 4135328

10/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


Site name : mary crane
Address : 2905 n. leavitt
Zip : 60618
Phone : 3485528

Site name : mary crane east 0-3
Address : 2974 n. clybourn
Zip : 60618
Phone : 3485528

11/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, Address)
INFO:dedupe.training:LevenshteinCanopyPredicate: (2, Address)
Site name : mary crane league mary crane center (east)
Address : 2974 n clybourn ave
Zip : 60618
Phone : 3485528

Site name : mary crane
Address : 2905 n. leavitt
Zip : 60618
Phone : 3485528

11/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


Site name : henry booth house - love n learn academy
Address : 723 e 75th st
Zip : None
Phone : 7230338

Site name : henry booth house love n learn academy
Address : 723-725 e 75th st.
Zip : 60619
Phone : 7230338

11/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


Site name : easter seals society of metropolitan chicago gilchrist marchman
Address : 1001 w roosevelt rd 606
Zip : 81559
Phone : 4927402

Site name : easter seals society of metropolitan chicago rachel's learning center #1
Address : 3430 w roosevelt rd
Zip : 60624
Phone : 5330444

12/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (firstTokenPredicate, Address)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, Site name)
INFO:dedupe.training:LevenshteinCanopyPredicate: (2, Address)
Site name : abraham lincoln center - king
Address : 4314 s. cottage grove
Zip : None
Phone : 7472310

Site name : abraham lincoln center abraham-lincoln
Address : 3858 s. cottage grove
Zip : 60654
Phone : 2851390

12/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


Site name : centers for new horizons - ida b. wells learning center
Address : 3601 s rhodes st
Zip : None
Phone : 3733640

Site name : centers for new horizons ida b. wells elc
Address : 3641 s. rhodes
Zip : 60653
Phone : 3733640

12/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


Site name : st. paul church of god - chaney ford child care center
Address : 4526 s wabash ave
Zip : None
Phone : 2858721

Site name : st. paul-chaney ford child care
Address : 4526 s. wabash
Zip : 60653
Phone : 2858721

13/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (nearIntegersPredicate, Phone)
INFO:dedupe.training:SimplePredicate: (sameSevenCharStartPredicate, Address)
INFO:dedupe.training:LevenshteinCanopyPredicate: (2, Address)
Site name : carter
Address : 5740 s. michigan ave.
Zip : None
Phone : 5350860

Site name : henry booth house precious little ones
Address : 5327 s michigan ave.
Zip : 60615
Phone : None

14/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
  * (true_distinct + false_distinct)))
INFO:rlr.crossvalidation:optimum alpha: 0.000100, score 0.43389980764142244
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (nearIntegersPredicate, Address), SimplePredicate: (wholeFieldPredicate, Zip), TfidfNGramCanopyPredicate: (0.6, Site name))
INFO:dedupe.training:(SimplePredicate: (tokenFieldPredicate, Phone), TfidfNGramCanopyPredicate: (0.2, Site name), TfidfNGramCanopyPredicate: (0.8, Site name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (2, Address), SimplePredicate: (commonThreeTokens, Site name), TfidfTextCanopyPredicate: (0.6, Site name))


In [12]:
# ## Clustering

# `partition` will return sets of records that dedupe
# believes are all referring to the same entity.

print('clustering...')
clustered_dupes = deduper.partition(data_d, 0.5)

print('# duplicate sets', len(clustered_dupes))

clustering...
# duplicate sets 1784


In [16]:
# ## Writing Results

# Write our original data back out to a CSV with a new column called
# 'Cluster ID' which indicates which records refer to each other.

cluster_membership = {}
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        cluster_membership[record_id] = {
            "Cluster ID": cluster_id,
            "confidence_score": score
        }

out_f = file_path + output_file
in_f = file_path + input_file

with open(out_f, 'w') as f_output, open(in_f) as f_input:

    reader = csv.DictReader(f_input)
    fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

    writer = csv.DictWriter(f_output, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        row_id = int(row['Id'])
        row.update(cluster_membership[row_id])
        writer.writerow(row)

### Evaluation

In [19]:
def evaluateDuplicates(found_dupes, true_dupes):
    true_positives = found_dupes.intersection(true_dupes)
    false_positives = found_dupes.difference(true_dupes)
    uncovered_dupes = true_dupes.difference(found_dupes)

    print('found duplicate')
    print(len(found_dupes))

    print('precision')
    print(1 - len(false_positives) / float(len(found_dupes)))

    print('recall')
    print(len(true_positives) / float(len(true_dupes)))

In [20]:
def dupePairs(filename, rowname) :
    dupe_d = collections.defaultdict(list)

    with open(filename) as f:
        reader = csv.DictReader(f, delimiter=',', quotechar='"')
        for row in reader:
            dupe_d[row[rowname]].append(row['Id'])

    if 'x' in dupe_d :
        del dupe_d['x']

    dupe_s = set([])
    for (unique_id, cluster) in viewitems(dupe_d) :
        if len(cluster) > 1:
            for pair in itertools.combinations(cluster, 2):
                dupe_s.add(frozenset(pair))

    return dupe_s

In [21]:
manual_clusters = 'csv_example_input_with_true_ids.csv'
dedupe_clusters = 'csv_example_output.csv'

manual_clusters_fp = file_path + manual_clusters
dedupe_clusters_fp = file_path + dedupe_clusters

In [22]:
true_dupes = dupePairs(manual_clusters_fp, 'True Id')
test_dupes = dupePairs(dedupe_clusters_fp, 'Cluster ID')

In [23]:
evaluateDuplicates(test_dupes, true_dupes)

found duplicate
3087
precision
0.98607061872368
recall
0.46065375302663436
