In [77]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use dedupe with a comma separated values
(CSV) file. All operations are performed in memory, so it will run very
quickly on datasets up to ~10,000 rows.
We start with a CSV file containing our messy data. In this notebook,
it is a list of addresses (`GT_added.csv`), one record per `test_id`,
which we deduplicate on the `address` field.
The output will be a CSV with our clustered results.
For larger datasets, see our [mysql_example](mysql_example.html)
"""
from future.builtins import next

import csv
import logging
import optparse
import os
import re

import dedupe
import pandas as pd
from unidecode import unidecode

# # ## Logging

# # Dedupe uses Python logging to show or suppress verbose output. This
# # code block lets you change the level of logging on the command
# # line. You don't need it if you don't want that. To enable verbose
# # logging, run `python examples/csv_example/csv_example.py -v`
# optp = optparse.OptionParser()
# optp.add_option('-v', '--verbose', dest='verbose', action='count',
#                 help='Increase verbosity (specify multiple times for more)'
#                 )
# (opts, args) = optp.parse_args()
# log_level = logging.WARNING 
# if opts.verbose:
#     if opts.verbose == 1:
#         log_level = logging.INFO
#     elif opts.verbose >= 2:
#         log_level = logging.DEBUG
# logging.getLogger().setLevel(log_level)



In [78]:
# ## Setup

# Directory holding the input CSV and every artifact this notebook writes.
# Set the DEDUPE_DATA_DIR environment variable to run the notebook against
# another location without editing the code; the default is the original
# hard-coded directory.
DATA_DIR = os.environ.get('DEDUPE_DATA_DIR',
                          '/data/dac/dedupe-project/test/dedupeio')

input_file = os.path.join(DATA_DIR, 'GT_added.csv')
output_file = os.path.join(DATA_DIR, 'csv_example_output.csv')
settings_file = os.path.join(DATA_DIR, 'csv_example_learned_settings')
training_file = os.path.join(DATA_DIR, 'csv_example_training.json')

def preProcess(column):
    """
    Normalize one raw CSV cell so cosmetic differences do not affect matching.

    The value is passed through Unidecode, the literal text 'null' is
    removed, new lines and runs of spaces are reduced to single spaces,
    surrounding whitespace and quote characters are stripped, and the result
    is lower-cased.

    Args:
        column: the raw cell value (text, Python 2 byte string, or None).

    Returns:
        The cleaned string, or None when the input is None or nothing is
        left after cleaning (None marks the value as missing).
    """
    # csv.DictReader fills the fields missing from a short row with None;
    # treat that as missing data instead of crashing inside unidecode.
    if column is None:
        return None
    try:  # python 2/3 string differences: only byte strings have .decode
        column = column.decode('ISO-8859-1')
    except AttributeError:
        pass
    column = unidecode(column)
    # NOTE(review): this is case-sensitive, runs before lower-casing and also
    # matches inside longer words -- confirm that is the intended behavior.
    column = re.sub('null', '', column)
    # Turn new lines into spaces *before* collapsing runs of spaces;
    # otherwise "a \nb" would keep a double space.
    column = re.sub('\n', ' ', column)
    column = re.sub('  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column

def readData(filename):
    """
    Load the CSV at `filename` into a dictionary of cleaned records.

    Every cell is passed through `preProcess`. The returned dictionary maps
    the integer value of each row's 'test_id' column to a dict of that row's
    cleaned column values.
    """
    records = {}
    with open(filename, encoding='ISO-8859-1') as csv_file:
        for raw_row in csv.DictReader(csv_file):
            cleaned = {name: preProcess(value)
                       for name, value in raw_row.items()}
            records[int(raw_row['test_id'])] = cleaned
    return records

# Load and clean every record up front. The resulting dictionary stays in
# memory and is reused below for training, thresholding and clustering.
print('importing data ...')
data_d = readData(input_file)



importing data ...


In [79]:
# Train a new model only when no learned settings are on disk; otherwise
# load the saved settings and skip training entirely.
if not os.path.exists(settings_file):
    # ## Training

    # The data model: the fields dedupe will pay attention to.
    fields = [{'field': 'address', 'type': 'String'}]
    deduper = dedupe.Dedupe(fields)

    # Start from the labels saved by a previous run when there are any.
    # __Note:__ delete the training_file to train from scratch.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as labels_in:
            deduper.prepare_training(data_d, labels_in)
    else:
        deduper.prepare_training(data_d)

    # ## Active learning
    # Dedupe shows the pair of records it is least certain about and asks
    # whether they are duplicates. Answer with the 'y', 'n' and 'u' keys
    # and press 'f' when you are finished.
    print('starting active labeling...')
    dedupe.consoleLabel(deduper)

    # Train on the examples just labeled and learn blocking predicates.
    deduper.train()

    # Keep the labeled examples so a later run can build on them.
    with open(training_file, 'w') as labels_out:
        deduper.writeTraining(labels_out)

    # Keep the learned weights and predicates; while this file exists the
    # whole training branch is skipped on the next run.
    with open(settings_file, 'wb') as settings_out:
        deduper.writeSettings(settings_out)
else:
    print('reading from', settings_file)
    with open(settings_file, 'rb') as settings_in:
        deduper = dedupe.StaticDedupe(settings_in)
        
# Find the threshold that will maximize a weighted average of our
# precision and recall.  We pass a recall weight of 1 here; a recall
# weight of 2 would say we care twice as much about recall as we do
# about precision.
#
# If we had more data, we would not pass in all the blocked data into
# this function but a representative sample.

threshold = deduper.threshold(data_d, recall_weight=1)

# ## Clustering

# `match` returns one entry per group of records that dedupe believes
# refer to the same entity. The code below unpacks each entry into the
# record IDs of the group and a matching sequence of scores.

print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)

print('# duplicate sets', len(clustered_dupes))

# ## Writing Results

# Index every clustered record by its ID so that, when the original data
# is written back out with a 'Cluster ID' column, each row can look up its
# cluster, that cluster's canonical representation and its confidence.

cluster_membership = {}
cluster_id = 0
for cluster_id, (id_set, scores) in enumerate(clustered_dupes):
    canonical_rep = dedupe.canonicalize([data_d[c] for c in id_set])
    cluster_membership.update({
        record_id: {
            "cluster id": cluster_id,
            "canonical representation": canonical_rep,
            "confidence": score,
        }
        for record_id, score in zip(id_set, scores)
    })

# Records outside every cluster get their own IDs, numbered from here on.
singleton_id = cluster_id + 1

# Copy the input CSV to the output CSV, prefixing every row with its
# cluster ID and confidence score and appending the canonical values of
# its cluster. Records that belong to no cluster get a fresh ID each and
# empty confidence / canonical cells.
#
# Both files are opened with newline='' as the csv module requires, so
# line breaks inside quoted fields are copied through unchanged and no
# extra '\r' is written on platforms that use '\r\n' line endings.
with open(output_file, 'w', encoding='ISO-8859-1', newline='') as f_output, \
        open(input_file, encoding='ISO-8859-1', newline='') as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    # Locate the ID column by name -- the same column readData uses as the
    # record key -- before the new leading columns shift the positions.
    id_column = heading_row.index('test_id')

    # Snapshot the canonical column names from the last cluster built above
    # (presumably every canonical representation shares these keys -- the
    # original code relied on that too). With no clusters at all there is
    # no canonical representation and no canonical columns are added.
    canonical_keys = list(canonical_rep.keys()) if cluster_membership else []

    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    heading_row.extend('canonical_' + key for key in canonical_keys)

    writer.writerow(heading_row)

    for row in reader:
        # csv.reader yields [] for blank lines; readData (DictReader)
        # skips them, so skip them here as well instead of crashing.
        if not row:
            continue
        row_id = int(row[id_column])
        membership = cluster_membership.get(row_id)
        if membership is not None:
            canonical_rep = membership["canonical representation"]
            row.insert(0, membership['confidence'])
            row.insert(0, membership["cluster id"])
            # Append the values as they are: the file object already
            # encodes on write, and encoding here would make the csv
            # writer emit the b'...' repr of each value.
            row.extend(canonical_rep[key] for key in canonical_keys)
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            row.extend(None for _ in canonical_keys)
        writer.writerow(row)

address : 235 qiaojiao zhong lu, shi gu village, dongguan, cn 523729

address : 1120 reed dr, monroe, us 45050

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


starting active labeling...


 n


address : xiliuzhi village, qi county, jinzhong city,

address : floor 3-4, no.198-7, tianjin road, weihai torch hi-tech development zone

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


address : 1f, 3rd building, qingxiang road, qinghu community

address : ghai creations c-101 okhla industrial area phase-1

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


address : 266 arbor ct, winchester, us 22602

address : gaohu industrial park.yinglin town

0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


address : 1st floor dekk house, zippora

address : 1st floor dekk house, zippora

0/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


address : no.88 jinli road, dangshan county, , suzhou, cn

address : no.88 jinli road, dangshan county, , suzhou, cn

1/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, address), SimplePredicate: (firstIntegerPredicate, address))
address : shangqiao village group, changbu village, xinxu town, huiyang district

address : shangqiao village group, changbu village, xinxu town, huiyang district

2/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (3, address), SimplePredicate: (wholeFieldPredicate, address))
address : no. 390 zhongxin village, yixing zhoutiezhen, wuxi, cn 214261

address : no. 24 zhengdian village

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


address : no.2 wenlin wennan road, zhutang town, jiangyin city, jiangsu, china.

address : no. 12-8 yungu road, zhutang town, jiangyin, cn

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


address : plot # h7, landhi industrial area, landhi

address : plot # ht/7, landhi industrial area, karachi 75120, pakistan

3/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


address : kawasan industri kujang cikampek jend a yani, , karowang, id 41373

address : kawasan industri kujang cikampek(kikc) jend a yani, , karowang, id 41373

4/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, address), TfidfTextCanopyPredicate: (0.2, address))
address : 235 yuan shan bei road town of changping

address : 235 yuan shan bei da dao, yuan shan bei village, chang ping, dongguan, guangdong, china

5/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


address : chuangye road , chengnan development area, yuan district, luan city, anhui province

address : chuangye rd., chengnan development area, china

6/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


address : no. 21 xiwen road, jiangyin zhutangzhen, wuxi, cn 214416

address : no. 28 xinwei road, jiangyin zhutangzhen, wuxi, cn 214416

7/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, address), TfidfTextCanopyPredicate: (0.4, address))
address : 365, wuzhou, yuhang economic and development zone, hangzhou, cn 311100

address : no. 365, wuzhou road, yuhang economic and development zone, hangzhou, cn 311100

8/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, address), TfidfTextCanopyPredicate: (0.4, address))
address : antiguo camino a la

address : bu yun development area.licheng district .pu tian, fu jian china ste 3 flr 3# 4#, putian, cn 351100

9/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 f


Finished labeling
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.2, address), TfidfTextCanopyPredicate: (0.4, address))
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
  * (true_distinct + false_distinct)))
INFO:rlr.crossvalidation:optimum alpha: 1.000000, score 0.6446501841494627
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.2, address), TfidfTextCanopyPredicate: (0.4, address))
  ('score', 'f4', 1)])
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.902
INFO:dedupe.api:precision: 0.816
INFO:dedupe.api:With threshold: 0.443


clustering...


  ('score', 'f4', 1)])


# duplicate sets 373


In [81]:
# Preview the raw input that was deduplicated above; reuse `input_file`
# rather than repeating the path literal.
pd.read_csv(input_file, encoding='ISO-8859-1')

Unnamed: 0,test_id,id,address
0,26,309972,"SIMA INDUSTRIAL ZONE, HENGLI TOWN, DONGGUAN, G..."
1,58,315787,"TENGURI, BKSP, ASHULIA, SAVAR\nDHAKA BANGLADESH"
2,103,308536,"Plot # 02,Sector 25 Korangi Industrial Area, K..."
3,105,308535,"PLOT # 4, SECTOR # 25, KORANGI INDUSTRIAL AREA..."
4,155,314939,"189 UDYOG VIHAR, PHASE 1, GURGAON-122016"
...,...,...,...
839,292890,1603,"Building 16, Zhengbei Street South, Yuxin, Nan..."
840,316218,1604,"G-28&29, SECTOR 6, NOIDA"
841,314977,1605,"No 235Q, 3rd Phase , Bommasandra Industrial Zo..."
842,300836,1606,"4F, NO27 EAST GUGONG ROND"
