# Example of Deduplication

In [30]:
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib.
# Importing it explicitly keeps this cell runnable on a fresh kernel.
try:
    from importlib import reload
except ImportError:  # Python 2: the builtin `reload` is used instead
    pass

# pandas is used by the data-loading cells below (`pd.read_csv`, `pd.read_excel`).
import pandas as pd

import deduplication as dep

# Re-import the local module so that edits to it are picked up without
# restarting the kernel.
reload(dep);

## 1. Loading Data

### Loading sample data

In [31]:
# Both sample files are pipe-separated, use their first column as the index,
# and have `duns` / `postalcode` read as strings rather than numbers.
df_input_records = pd.read_csv(
    'df_impairs.csv',
    sep='|',
    index_col=0,
    dtype={'duns': str, 'postalcode': str},
)
df_target_records = pd.read_csv(
    'df_pairs.csv',
    sep='|',
    index_col=0,
    dtype={'duns': str, 'postalcode': str},
)
# Show a few random input rows as a sanity check.
df_input_records.sample(3)

Unnamed: 0_level_0,name,duns,city,postalcode,street,country_code
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
661a0913-d82f-4186-b191-58a93c06ba0c,phoenix contact hmi-ipc,,filderstadt,70794,29 kurze str,DE
774bcf85-7a03-4a6d-9ec3-f5474b5e996d,aviasport sa,464418029.0,tres cantos,28760,11 calle almazara,ES
efb815f9-ea51-4728-babb-fdd9c294981d,pro-idee gmbh co kg,312865546.0,aachen,52070,gut-dmme-str,DE


### Cleaning that data

In [32]:
from preprocessing import clean_db

# Assign each cleaned frame back to its own name. The previous loop
# (`for x in [...]: x = clean_db(x)`) only rebound the loop variable, so the
# value returned by clean_db was thrown away for both tables.
# NOTE(review): assumes clean_db returns the cleaned DataFrame, as the original
# `x = clean_db(x)` implies -- confirm against preprocessing.clean_db.
df_input_records = clean_db(df_input_records)
df_target_records = clean_db(df_target_records)
df_target_records.sample(3)

Unnamed: 0_level_0,name,duns,city,postalcode,street,country_code,name_wostopwords,street_wostopwords,name_acronym,postalcode_1stdigit,postalcode_2digits,name_len,hasairbusname,isbigcity
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
b537e8e0-1a92-4694-8ef7-c43c4b7a208f,jamara modelltechnik,322899675.0,aichstetten,88317,5 am lauerbhl,DE,jamara modelltechnik,lauerbhl 5 am,jm,8,88,20,0,0
e8695ac0-8ef8-4bd1-bbfb-36009497d10a,cadilac laser gmbh,,albstadt,72459,herderstr,DE,cadilac laser,herder,clg,7,72,18,0,0
941ec1ab-0b44-4a66-abd6-69eead48b1ca,airtanker services ltd,,carterton,ox18,airtanker hub raf brize norton,GB,airtanker,raf airtanker hub norton brize,asl,o,ox,22,0,0


# Machine Learning - based deduplication

## Creating a training table for the decision model

### Creating a side-by-side comparison table for manual labelling
For example, the pairs to label can be taken from the results of a rule-based decision model (see the rule-based section below).

### Loading a supervised learning table

In [33]:
# Manually labelled pairs: source index, target index and the true label.
supervised_learning = pd.read_excel('supervised_table.xlsx')

# Only the first `nix` labelled pairs are used.
nix = 1000
s_inputs = supervised_learning['ix_source'].head(nix)
s_targets = supervised_learning['ix_target'].head(nix)
s_true = supervised_learning['y_true'].head(nix)

### Creating the training table

In [34]:
# The trainer model only declares which similarity scores to compute,
# grouped by comparison type (fuzzy / exact / token / acronym).
dummymodel = dep.TrainerModel(
    scoredict={
        'fuzzy': ['name', 'name_wostopwords',
                  'street', 'street_wostopwords',
                  'city'],
        'exact': ['duns', 'country_code'],
        'token': ['name', 'name_wostopwords',
                  'street', 'street_wostopwords'],
        'acronym': ['name', 'name_wostopwords'],
    }
)
sur = dep.Suricate(
    input_records=df_input_records,
    target_records=df_target_records,
    model=dummymodel,
)

# Score every labelled (input, target) pair, then replace missing scores by -1.
# Alternative kept from the original notebook (not run here):
#   x2 = sur.chain_build_labelled_table(inputs=s_inputs, targets=s_targets)
training_table = sur.build_training_table(
    inputs=s_inputs,
    targets=s_targets,
    y_true=s_true,
)
training_table = training_table.fillna(-1)

# Class balance of the labels, followed by a few random rows.
print(training_table['y_true'].value_counts())
training_table.sample(3)

0    916
1     84
Name: y_true, dtype: int64


Unnamed: 0,city_fuzzyscore,country_code_exactscore,duns_exactscore,name_acronymscore,name_fuzzyscore,name_tokenscore,name_wostopwords_acronymscore,name_wostopwords_fuzzyscore,name_wostopwords_tokenscore,street_fuzzyscore,street_tokenscore,street_wostopwords_fuzzyscore,street_wostopwords_tokenscore,y_true
565,0.12,1,0,0.0,0.41,0.333333,-1.0,0.24,0.0,0.45,0.0,0.39,0.0,0
209,0.31,1,-1,0.0,0.49,0.333333,-1.0,0.32,0.0,0.52,0.0,0.38,0.0,0
188,0.0,1,-1,0.0,0.5,0.333333,-1.0,0.11,0.0,0.44,0.0,0.32,0.0,0


### Train the decision model

In [35]:
# The label is taken positionally from the LAST column of the training table;
# every column before it is used as a feature.
y_train = training_table.iloc[:, -1].astype(int)
X_train = training_table.iloc[:, :-1].astype(float)

In [36]:
# Instantiate the machine-learning evaluation model and fit it on the
# feature matrix (X_train) and label vector (y_train) prepared above.
# NOTE(review): the precision/recall printed by this cell are reported
# "on training data", so they do not measure generalisation.
evaluator=dep.MLEvaluationModel()
evaluator.fit(X=X_train,y=y_train)

shape of training table  (1000, 13)
number of positives in table 84
precision score on training data: 1.0
recall score on training data: 1.0
time elapsed 3.607531 seconds


### Adding filtering rules to speed up the process (Optional)

Filter on records that exactly match the country code, or that match the DUNS number.

In [37]:
# Pre-filtering rules passed to Suricate, keyed by aggregation mode:
# 'all' lists the country code column, 'any' lists the duns column.
filterdict = {
    'all': ['country_code'],
    'any': ['duns'],
}

From those filtered records, keep the records that have a roughly similar name or address, or that share the same DUNS number.

In [38]:
# Intermediate score thresholds passed to Suricate; 'aggfunc' is set to 'any'.
intermediate_thresholds = {
    'name_wostopwords_fuzzyscore': 0.6,
    'street_wostopwords_fuzzyscore': 0.6,
    'duns_exactscore': 1.0,
    'aggfunc': 'any',
}

## Launching the deduplication

In [39]:
# Linkage object combining both record tables, the optional filtering rules
# and the trained decision model.
sur = dep.Suricate(
    input_records=df_input_records,
    target_records=df_target_records,
    filterdict=filterdict,
    intermediate_thresholds=intermediate_thresholds,
    model=evaluator,
)

### Possibility 1: return only good matches (for run mode)

In [40]:
# Run the linkage with default settings, then build a side-by-side table
# showing the selected columns and scores for each returned pair.
res = sur.start_linkage()
df = sur.format_results(
    res,
    display=['name', 'street', 'duns', 'country_code'],
    fuzzyscorecols=['name', 'street'],
    exactscorecols=['duns'],
)
df.sample(5)

starting deduplication at 2018-02-06 09:41:13.010887
1 of 10 inputs records deduplicated | found 0 of 1 max possible matches | time elapsed 0.253161 s
2 of 10 inputs records deduplicated | found 0 of 1 max possible matches | time elapsed 0.231388 s
3 of 10 inputs records deduplicated | found 0 of 1 max possible matches | time elapsed 0.250355 s
4 of 10 inputs records deduplicated | found 1 of 1 max possible matches | time elapsed 0.244689 s
5 of 10 inputs records deduplicated | found 1 of 1 max possible matches | time elapsed 0.22009 s
6 of 10 inputs records deduplicated | found 1 of 1 max possible matches | time elapsed 0.246758 s
7 of 10 inputs records deduplicated | found 1 of 1 max possible matches | time elapsed 0.259941 s
8 of 10 inputs records deduplicated | found 0 of 1 max possible matches | time elapsed 0.238129 s
9 of 10 inputs records deduplicated | found 1 of 1 max possible matches | time elapsed 0.198179 s
10 of 10 inputs records deduplicated | found 1 of 1 max possible m

Unnamed: 0,ix_source,ix_target,name_source,name_target,country_code_source,country_code_target,street_source,street_target,duns_source,duns_target,name_fuzzyscore,street_fuzzyscore,avg_fuzzyscore,duns_exactscore,n_exactmatches
1,ff973ba5-ab42-42e0-8244-6aa82de46691,c3200b89-b646-4b61-affb-76023e5915ef,acm,acm,FR,FR,9 rue de la gare,9 rue de la gare,380071407.0,,1.0,1.0,1.0,,0
3,e2a2da69-3aa4-44e2-ae5b-4bbd2cbdc238,c906f2e3-bd4c-4785-9d60-95f19579a04c,botschaft afghanistan,botschaft afghanistan,DE,DE,3 taunusstraaye,3 taunusstr,,,1.0,0.85,0.925,,0
2,6097f4c6-8515-41fb-b5e5-549c81140848,f704e1f0-b240-4461-a741-b41b5c30b476,bildungswerk der wirtschaft hamburg,bildungswerk der wirtschaft hamburg,DE,DE,10 kapstadtring,10 kapstadtring,,,1.0,1.0,1.0,,0
4,21c09dde-cff3-4c45-a2f1-46a99e6e1587,1b932c6a-3719-4f78-ba6e-c4ffdf0cc344,hatfield and dawson consulting,hatfield and dawson consulting,US,US,greenwood ave n,greenwood ave n,99615556.0,,1.0,1.0,1.0,,0
0,5ff704ee-399e-4fbd-b604-51b6ced944dd,af8133f8-361f-494e-92dc-ab3c72637d56,berlinzeppelin,berlinzeppelin,DE,DE,4 rottweiler str,4 rottweiler str,,,1.0,1.0,1.0,,0


### Possibility 2: return a probability vector to build a supervised learning table

In [41]:
# Ask for up to 5 candidate matches per input record, together with the
# probability attached to each candidate.
res = sur.start_linkage(n_matches_max=5, with_proba=True)
df = sur.format_results(
    res,
    with_proba=True,
    display=['name', 'street', 'duns', 'country_code'],
    fuzzyscorecols=['name', 'street'],
    exactscorecols=['duns'],
)
df.sample(3)

starting deduplication at 2018-02-06 09:41:15.472593
1 of 10 inputs records deduplicated | found 1 of 5 max possible matches | time elapsed 0.301977 s
2 of 10 inputs records deduplicated | found 1 of 5 max possible matches | time elapsed 0.501333 s
3 of 10 inputs records deduplicated | found 2 of 5 max possible matches | time elapsed 0.729465 s
4 of 10 inputs records deduplicated | found 1 of 5 max possible matches | time elapsed 0.944464 s
5 of 10 inputs records deduplicated | found 1 of 5 max possible matches | time elapsed 1.160897 s
6 of 10 inputs records deduplicated | found 1 of 5 max possible matches | time elapsed 1.376027 s
7 of 10 inputs records deduplicated | found 1 of 5 max possible matches | time elapsed 1.662456 s
8 of 10 inputs records deduplicated | found 1 of 5 max possible matches | time elapsed 1.987794 s
9 of 10 inputs records deduplicated | found 1 of 5 max possible matches | time elapsed 2.213327 s
10 of 10 inputs records deduplicated | found 1 of 5 max possible 

Unnamed: 0,ix_source,ix_target,y_proba,name_source,name_target,country_code_source,country_code_target,street_source,street_target,duns_source,duns_target,name_fuzzyscore,street_fuzzyscore,avg_fuzzyscore,duns_exactscore,n_exactmatches
10,31ea5edd-2c80-40ca-85ca-b6768a941d1e,7ff2b1d2-bf47-4b26-849a-0d150fca7b66,1.0,h media,h media,BE,BE,329 heistraat,329 heistraat,372377817,,1.0,1.0,1.0,,0.0
0,c204c7b7-66fd-4e55-99a4-abf647dd6b3c,7a17d0c9-5992-43db-9580-c1c6cf16cdbc,0.187,l amphitryon restaurant,alter ego 31,FR,FR,chemin de gramont,chemin de gramont,779097252,779097252.0,0.29,1.0,0.645,1.0,1.0
5,ff973ba5-ab42-42e0-8244-6aa82de46691,c3200b89-b646-4b61-affb-76023e5915ef,1.0,acm,acm,FR,FR,9 rue de la gare,9 rue de la gare,380071407,,1.0,1.0,1.0,,0.0


In [42]:
# Write the formatted candidate pairs (including their probabilities) to an
# Excel file -- presumably for manual labelling into a new supervised table; confirm.
df.to_excel('supervised2.xlsx')

# Rule-based deduplication

It works the same as above, but instead of training a model, you hard-code some rules.

In [43]:
# Minimum token score above which a pair is accepted, per score column.
hard_threshold = {'name_tokenscore': 0.7,
                  'street_tokenscore': 0.7}
hard_cols = list(hard_threshold.keys())


def hardcodedfunc(r):
    """Rule-based decision for one row of similarity scores.

    Parameters
    ----------
    r : pandas.Series
        Row containing at least the columns listed in ``hard_cols``.
        Missing scores are treated as 0.

    Returns
    -------
    int
        1 if at least one score is strictly greater than its threshold in
        ``hard_threshold``, 0 otherwise.
    """
    r = r.fillna(0)
    for k in hard_cols:
        if r[k] > hard_threshold[k]:
            return 1
    # No threshold exceeded: reject the pair. The previous `for ... else`
    # branch returned 1 here, so every pair was accepted.
    return 0

# Wrap the hand-written rule in an evaluation model that reads only the
# thresholded score columns.
rule_based_model = dep.FuncEvaluationModel(
    used_cols=hard_cols,
    eval_func=hardcodedfunc,
)

# Same linkage setup as the ML-based run, with the rule-based model as the
# decision model.
sur = dep.Suricate(
    input_records=df_input_records,
    target_records=df_target_records,
    filterdict=filterdict,
    intermediate_thresholds=intermediate_thresholds,
    model=rule_based_model,
)