In [10]:
            #############################################################
            #        *********** Prototype 1 *********                  #
            #            Eliodor Ednalson Guy Mirlin                    #
            #                                                           #
            #   Internship : Deduplication using machine Learning       #      
            #                 Orchestra Network                         #
            #############################################################

#This first prototype is using the dedupe library from python to deduplicate a CSV file giving in input

# Input : a CSV file with many duplicate records 
# output : another CSV file generated from the system which has the clustered deduplicate detection results 

#Import of packages et libraries : 
from builtins import next
import dedupe
import os
import csv
import re
from unidecode import unidecode
import pandas as pd

In [11]:
# setting up of the system : input, output, settings, training
#Input : The CSV filename which contains the duplicated records
#output : The CSV file generated from the system which list the cluters of the deduplicate detection
#system : this file contains the data model ie the format of the data before the training process
# training : this file in .Json save the training set before starting the clustering process

input_file = 'restaurant.csv'
output_file = 'deduplicationRestaurant1.csv'
settings_file = 'settings'
training_file = 'training.json'


In [12]:
# This function preProcess is using for Data cleaning by using Unidecode. 
#Quotes,empty lines and unapropriate spaces are ignored

def preProcess(column):
    
    try : # python 2/3 string differences
        column = column.decode('utf8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column

In [13]:
def readData(filename):
    """
    Read in the data from a CSV file and created a dictionary of records, 
    where the key is a unique record ID and each value is dict
    """

    data_d = {}
    with open(filename, 'rU') as f:
        reader = csv.DictReader(f)
        ide = 0
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = ide
            data_d[row_id] = dict(clean_row)
            ide= ide + 1

    return data_d

In [14]:
print('Import and listing ... \n')
data_d = readData(input_file)

dframe = pd.read_table('restaurant.csv', sep=',')
dframe

Import and listing ... 



  


Unnamed: 0,NAME,ADDRESS,PHONE,CUISINE
0,Apple Pan The,10801 W. Pico Blvd. West LA,310-475-3585,American
1,Arnie Morton's of Chicago,435 S. La Cienega Blvd. Los Angeles,310-246-1501,Steakhouses
2,Art's Deli,12224 Ventura Blvd. Studio City,818-762-1221,Delis
3,Asahi Ramen,2027 Sawtelle Blvd. West LA,310-479-2231,Noodle Shops
4,Baja Fresh,3345 Kimber Dr. Westlake Village,805-498-4049,Mexican
5,Bel-Air Hotel,701 Stone Canyon Rd. Bel Air,310-472-1211,Californian
6,Belvedere The,9882 Little Santa Monica Blvd. Beverly Hills,310-788-2306,Pacific New Wave
7,Benita's Frites,1433 Third St. Promenade Santa Monica,310-458-2889,Fast Food
8,Bernard's,515 S. Olive St. Los Angeles,213-612-1580,Continental
9,Bistro 45,45 S. Mentor Ave. Pasadena,818-795-2478,Californian


In [15]:
# If a settings file already exists, we'll just load that and skip training
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # ## Training

    # Define the fields it will pay attention to
    fields = [
        {'field' : 'NAME', 'type': 'String'},
        ]

    # Created a new object and passed our data model to it.
    deduper = dedupe.Dedupe(fields)

    # For training, I feed it a sample of records.
    deduper.sample(data_d)

      # If we have training data saved from a previous run,
  # look for it and load it in.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

In [16]:
# ## Active learning
    # will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
print('starting active labeling...')

dedupe.consoleLabel(deduper)

    # Using the examples we just labeled, train it and learn
    # blocking predicates
deduper.train()

    # save our training to disk
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)

NAME : cafe des artistes

NAME : cafe des artistes

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


starting active labeling...
y


NAME : ocean star

NAME : oceana

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, NAME), SimplePredicate: (wholeFieldPredicate, NAME))
NAME : brothers

NAME : broiler

1/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : bone's restaurant

NAME : bone's

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


NAME : second avenue deli

NAME : second avenue deli

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.2, NAME), TfidfTextCanopyPredicate: (0.8, NAME))
NAME : local nochol

NAME : None

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


NAME : aquavit

NAME : aqua

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : joe's

NAME : joe's shanghai

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : ritz-carlton dining room (buckhead)

NAME : dining room ritz-carlton buckhead

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


NAME : bel-air hotel

NAME : hotel bel-air

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


NAME : jack sprat's grill

NAME : judson grill

5/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : fifty seven fifty seven

NAME : les celebrites

5/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : coco loco

NAME : splendido embarcadero

5/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : carnegie deli

NAME : carnegie deli

5/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


NAME : la caravelle

NAME : la caravelle

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


NAME : greenwood's

NAME : greens

7/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : coyote cafe (las vegas)

NAME : coyote cafe

7/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


NAME : sofi

NAME : sofia fabulous pizza

8/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, NAME), TfidfTextCanopyPredicate: (0.6, NAME))
NAME : rainbow room

NAME : rainbow restaurant

8/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : ritz-carlton cafe (buckhead)

NAME : ritz-carlton restaurant

8/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


NAME : aquavit

NAME : aqua

9/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : pacifica

NAME : pacific pan pacific hotel

9/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


NAME : california pizza kitchen

NAME : None

9/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.004802585034933624
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, NAME), TfidfTextCanopyPredicate: (0.6, NAME))


In [17]:
# Find the threshold that will maximize a weighted average of our
# precision and recall.  When we set the recall weight to 2, we are
# saying we care twice as much about recall as we do precision.
#
# If we had more data, we would not pass in all the blocked data into
# this function but a representative sample.

threshold = deduper.threshold(data_d, recall_weight=1)

# ## Clustering

# `match` will return sets of record IDs that it
# believes are all referring to the same entity.

print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)

print('# duplicate sets', len(clustered_dupes))

# ## Writing Results

# Write our original data back out to a CSV with a new column called 
# 'Cluster ID' which indicates which records refer to each other.

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output, open(input_file , 'rU') as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)
    ide = 0
    for row in reader:
        row_id = ide
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)
        ide= ide + 1

INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 1.000
INFO:dedupe.api:precision: 0.446
INFO:dedupe.api:With threshold: 0.322


clustering...
# duplicate sets 117




In [9]:
print("Results of the deduplicated detection after training")
dframe = pd.read_table('Deduplicationoutput3.csv', sep=',')
dframe.sort_values(by=['Cluster ID'])

Results of the deduplicated detection after training


Unnamed: 0,Cluster ID,confidence_score,ID,Date_received,Product,Sub-product,Issue,Sub-issue,Consumer_complaint_narrative,Company_public_response,...,canonical_State,canonical_ZIP_code,canonical_Tags,canonical_Consumer_consent_provided,canonical_Submitted_via,canonical_Date_sent_to_company,canonical_Company_response_to_consumer,canonical_Timely_response,canonical_Consumer_disputed,canonical_Complaint_ID
0,0,0.897717,1,2016-05-18,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,Company has responded to the consumer and the ...,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
174,0,0.897717,175,2013-02-16,Bank account or service,Checking account,"Account opening, closing, or management",,,,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
168,0,0.897717,169,2012-04-27,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
593,0,0.897717,594,2014-05-04,Mortgage,VA mortgage,"Loan servicing, payments, escrow account",,,,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
600,0,0.897717,601,2013-03-30,Consumer Loan,Personal line of credit,Managing the line of credit,,,,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
148,0,0.897717,149,2012-05-17,Bank account or service,Savings account,"Account opening, closing, or management",,,,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
618,0,0.897717,619,2016-05-17,Mortgage,FHA mortgage,"Loan modification,collection,foreclosure",,,Company has responded to the consumer and the ...,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
140,0,0.897717,141,2012-12-19,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
623,0,0.897717,624,2015-01-14,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
635,0,0.897717,636,2015-02-23,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,,,...,b'ca',b'30004',b'older american',b'n/a',b'web',b'2013-01-11',b'closed',b'yes',b'no',b'10647'
