In [6]:
import dedupe
import json
from cytoolz import *
from copy import copy
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse
from unicodedata import normalize
from sklearn.neural_network import MLPClassifier

In [5]:
fields = [{'field': 'description', 'variable name': 'description', 'type': 'String'},
          {'field': 'price', 'variable name': 'price', 'type': 'Price'},
          {'type': 'Interaction', 'interaction variables': ['description', 'price']}]


def tokenize_url(url):
    path = urlparse(url).path
    path = path.replace('/', ' ')
    path = path.replace('-', ' ')
    return word_tokenize(path)


def remove_accents(input_str):
    return normalize('NFKD', input_str).encode('ascii', 'ignore').decode()


def sanitize_string(input_str):
    pattern = r'[?|&|!|@|#|;|*|~|(|)|´|^|\r|\n|\t]'
    price_pattern = r'[\d]+\.[\d]+\.[\d]+'
    
    if re.match(price_pattern, input_str):
        sep = input_str.rsplit('.')
        return '{}.{}'.format(sep[0], sep[1])
    
    return clean_string(re.sub(pattern, r'', remove_accents(input_str.lower())))


def clean_string(input_str):
    unuseful = set(['confira', 'compre', 'oferta', 'novo',
                    'preco', 'agora', 'melhores', 'aqui',
                    'aproveite', 'menor', 'maior', 'encontra',
                    'site', 'condicoes', 'ofertas', 'imbativeis',
                    'vendas', 'online', 'novo', 'nova', 'tecnologia',
                    'precos', 'pagamento', 'melhor', 'veja', 'encontre',
                    'menores', 'vem', 'venha', 'ver'])

    names = set(['walmart', 'walmart.com', 'shoptime', 'shoptime.com',
                 'pontofrio', 'pontofrio.com', 'magazineluiza', 'magazineluiza.com',
                 'magazine luiza', 'americanas', 'americanas.com', 'submarino',
                 'submarino.com', 'extra', 'extra.com.br', 'casas bahia', 'casasbahia.com.br'])

    stopwords = [remove_accents(s) for s in nltk.corpus.stopwords.words('portuguese')]
    repl = list(concat([unuseful, names, stopwords]))
    tk = word_tokenize(input_str)
    return ' '.join([word for word in tk if word not in repl])


def preprocess(item):
    return {'description': sanitize_string(item['description']), 'price': float(sanitize_string(item['price']))}

In [7]:
with open('../data/walmart/buscape.json') as f:
    walmart = [preprocess(dissoc(item, 'image')) for item in json.loads(f.read())['products']]

with open('../data/submarino/buscape.json') as f:
    submarino = [preprocess(dissoc(item, 'image')) for item in json.loads(f.read())['products']]

In [8]:
p = {str(i): walmart[i] for i in range(len(walmart))}
w = {str(i): submarino[i] for i in range(len(submarino))}

In [11]:
linker = dedupe.RecordLink(fields)
linker.sample(p, w)
dedupe.consoleLabel(linker)

description : game micoach by adidas - ps3
price : 56.99

description : capinha celular nba note 4 milwaukee bucks - nbae14 3984861
price : 59.9

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : bateria camera digital samsung pl50
price : 86.0

description : bateria notebook hp pavilion dm3 - 1116 polimero li - po 10.8v 11.1v / / preta / / 5200mah 2142825
price : 270.9

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : banheira 500 litros - nautika
price : 137.52

description : banheira 230 litros - nautika 2206703
price : 71.06

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : farol - arteb - amarok 2010 diante - lado passageiro - cada unidade - 0160754
price : 556.0

description : farol - arteb - amarok 2010 diante - lado passageiro - cada unidade - 0160754 3816829
price : 556.0

0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : terminal direcao diametro pino 14,6 x 225 mm direito hilux gi
price : 132.95

description : terminal direcao diametro pino 14,6 x 225 mm direito hilux gi 3796373
price : 132.95

1/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cozinha compacta damasco itatiaia coimbra bege / branco laca
price : 778.48

description : cozinha compacta damasco itatiaia coimbra bege / branco laca 2557202
price : 779.87

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escapamento turbo esportivo pro tork honda biz 125 2010 + brinde
price : 121.22

description : escapamento turbo esportivo pro tork honda biz 100 1998 2005 + brinde 3582460
price : 124.46

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : par farol daylight hyundai ix35 2009 2010 2011 2012 foco duplo mascara negra
price : 1919.2

description : par farol daylight l200 triton 2007 2008 2009 2010 2011 2012 foco duplo mascara negra led 484664
price : 1760.0

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escapamento turbo esportivo pro tork honda biz 125 2010 + brinde
price : 121.22

description : escapamento turbo esportivo pro tork honda biz 100 1998 2005 + brinde 3582460
price : 124.46

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cozinha compacta 5141 sicilia siena moveis argila / amarelo gema
price : 2499.9

description : cozinha compacta 5143 sicilia siena moveis argila / amarelo gema 2555451
price : 1499.89

3/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escapamento turbo esportivo pro tork honda biz 100 1998 2005 + brinde
price : 121.22

description : escapamento turbo esportivo pro tork honda biz 100 1998 2005 + brinde 3582460
price : 124.46

3/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escapamento turbo esportivo pro tork honda biz 125 2010 + brinde
price : 121.22

description : escapamento turbo esportivo pro tork honda biz 125 2010 + brinde 3582467
price : 124.46

4/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : protetor radiador triumph tiger explorer 1200 / xc 2016
price : 401.0

description : protetor radiador triumph tiger explorer 1200 / xc 2016 3807296
price : 401.0

5/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cozinha compacta 5141 sicilia siena moveis argila / amarelo gema
price : 2499.9

description : cozinha compacta 5144 sicilia siena moveis argila / amarelo gema 2555456
price : 1499.89

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : puff angel redondo collor impermealizado pes palito amarelo - lymdecor
price : 199.5

description : puff angel redondo collor impermealizado pes palito amarelo lymdecor 3301986
price : 199.5

6/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : guarda roupa 6 portas 2 gavetas astral 100 mdp rodapes castanho / avela touch - demobile
price : 639.95

description : guarda roupa 6 portas 2 gavetas astral 100 % mdp rodapes castanho / avela touch - demobile - 201,00 x 165,00 x 47,00 3503333
price : 639.94

7/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo mesa 70 70 heineken 4 cadeiras pamplona mel - moveis parana
price : 1272.9

description : jogo mesa 70 70 heineken 4 cadeiras pamplona mel - moveis parana 3489124
price : 1339.87

8/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : conjunto sala jantar mesa 6 cadeiras megan madesa rustic / perola
price : 959.9

description : conjunto sala jantar mesa 6 cadeiras celeny madesa rustic / floral hibiscos 2306804
price : 599.9

9/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escapamento esportivo 788 aco pro tork star 50 traxx + brinde
price : 155.09

description : escapamento esportivo 788 aco pro tork speed 150 dafra + brinde cromado 3582767
price : 159.24

9/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : banqueta courissimo giratoria regulagem altura modelo umix 801c vermelho - mix u
price : 339.9

description : banqueta courissimo giratoria regulagem altura umix 801c vermelha 3757764
price : 285.08

9/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : comoda 3 gavetas nature branco fosco / eco wood
price : 598.98

description : comoda 3 gavetas nature branco fosco / eco wood 3067539
price : 598.98

10/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : conjunto sala jantar mesa 6 cadeiras gales madesa rustic / perola
price : 649.9

description : conjunto sala jantar mesa 4 cadeiras dijon madesa rustic / floral hibiscos 2485074
price : 759.9

11/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escapamento turbo esportivo pro tork honda biz 100 1998 2005 + brinde
price : 121.22

description : escapamento turbo esportivo pro tork honda biz 125 2010 + brinde 3582467
price : 124.46

11/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [12]:
linker.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000100
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4,

In [16]:
m = linker.match(p,w, 0)

INFO:dedupe.canopy_index:Removing stop word li
INFO:dedupe.canopy_index:Removing stop word la
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8, description)
INFO:dedupe.api:0 records
INFO:dedupe.blocking:10000, 8.8622902 seconds


In [17]:
m

[(('4013', '1424'), 0.99999994),
 (('14085', '2342'), 0.98305237),
 (('12936', '1965'), 0.89569336),
 (('9895', '976'), 0.10485038),
 (('9890', '2791'), 0.030447919),
 (('8699', '2358'), 9.981847e-05)]

In [18]:
for i in m:
    print(p[i[0][0]], w[i[0][1]])

{'description': 'lanterna traseira saveiro g3 2000 2001 2002 2003 2004 2005 bicolor 484482', 'price': 63.15} {'description': 'par farol daylight golf 99 2000 2001 2002 2003 2004 2005 2006 led mascara cromada', 'price': 878.4}
{'description': 'kit coifa homocinetica cambio esquerdo rolamento golf , golf , golf , polo classic 1.8 3714147', 'price': 31.04} {'description': 'kit coifa homocinetica cambio esquerdo rolamento golf , golf , golf , polo classic 1.8', 'price': 31.04}
{'description': 'pneu linglong 235 / 60r16 tl 100h crosswind hp010 3740579', 'price': 481.48} {'description': 'pneu linglong 235 / 60r16 tl 100h crosswind hp010', 'price': 416.56}
{'description': 'kit farol mascara cromada sprinter cdi 2003 2004 2005 2006 2007 2008 2009 2010 2011 grade 489263', 'price': 445.39} {'description': 'central multimidia honda fit 2003 2004 2005 2006 2007 2008', 'price': 1567.99}
{'description': 'par farol peugeot 206 99 2000 2001 2002 2003 2004 2005 2006 2007 2008 black foco duplo angel eye

In [13]:
linker.threshold(p,w, recall_weight=0.5)

INFO:dedupe.canopy_index:Removing stop word ia
INFO:dedupe.canopy_index:Removing stop word en
INFO:dedupe.canopy_index:Removing stop word ra
INFO:dedupe.canopy_index:Removing stop word or
INFO:dedupe.canopy_index:Removing stop word ta
INFO:dedupe.canopy_index:Removing stop word do
INFO:dedupe.canopy_index:Removing stop word iv
INFO:dedupe.canopy_index:Removing stop word ro
INFO:dedupe.canopy_index:Removing stop word de
INFO:dedupe.canopy_index:Removing stop word an
INFO:dedupe.canopy_index:Removing stop word ca
INFO:dedupe.canopy_index:Removing stop word ac
INFO:dedupe.canopy_index:Removing stop word te
INFO:dedupe.canopy_index:Removing stop word to
INFO:dedupe.canopy_index:Removing stop word el
INFO:dedupe.canopy_index:Removing stop word co
INFO:dedupe.canopy_index:Removing stop word es
INFO:dedupe.canopy_index:Removing stop word nt
INFO:dedupe.canopy_index:Removing stop word ei
INFO:dedupe.canopy_index:Removing stop word ma
INFO:dedupe.canopy_index:Removing stop word re
INFO:dedupe.c

0.89569336

In [19]:
linker2 = dedupe.RecordLink(fields)
linker2.classifier = MLPClassifier()
linker2.sample(p, w)
dedupe.consoleLabel(linker2)

description : short auslander destroyed
price : 74.99

description : capa transparente personalizada exclusiva samsung galaxy j7 sm - j700f dachshund - tp70 3176298
price : 24.9

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : livro - new sprinkles 2 : activity pad - 9786070604379
price : 44.99

description : cha dez 354245
price : 26.5

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cozinha compacta 5141 sicilia siena moveis argila / amarelo gema
price : 2499.9

description : cozinha compacta 5143 sicilia siena moveis argila / amarelo gema 2555451
price : 1499.89

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : farol automotive imports vw gol - saveiro - parati g3 2000 - 2005 lado passageiro
price : 188.99

description : farol automotive imports vw gol - saveiro - parati g3 2000 - 2005 lado passageiro 3805005
price : 188.99

0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : terminal direcao diametro pino 14,6 x 225 mm direito hilux gi
price : 132.95

description : terminal direcao diametro pino 14,6 x 225 mm direito hilux gi 3796373
price : 132.95

1/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cozinha compacta damasco itatiaia coimbra bege / branco laca
price : 778.48

description : cozinha compacta damasco itatiaia coimbra bege / branco laca 2557202
price : 779.87

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : banheira 500 litros - nautika
price : 137.52

description : banheira 230 litros - nautika 2206703
price : 71.06

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : conjunto sala jantar mesa 6 cadeiras gales madesa rustic / perola
price : 649.9

description : conjunto sala jantar mesa 4 cadeiras lexy madesa rustic / floral hibiscos 2306754
price : 399.9

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : piscina inflavel 4600
price : 249.0

description : piscina inflavel 3700l 270x70cm bel lazer 1922454
price : 221.9

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda roupa 6 portas 2 gavetas astral 100 mdp rodapes castanho / avela touch - demobile
price : 639.95

description : guarda roupa 6 portas 2 gavetas astral 100 % mdp rodapes castanho / avela touch - demobile - 201,00 x 165,00 x 47,00 3503333
price : 639.94

3/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escapamento turbo esportivo pro tork honda biz 100 1998 2005 + brinde
price : 121.22

description : escapamento turbo esportivo pro tork honda biz 100 1998 2005 + brinde 3582460
price : 124.46

4/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo mesa 70 70 heineken 4 cadeiras pamplona mel - moveis parana
price : 1272.9

description : jogo mesa 70 70 heineken 2 cadeiras pamplona mel - moveis parana 3489118
price : 939.89

5/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : puff angel redondo collor impermealizado pes palito amarelo - lymdecor
price : 199.5

description : puff angel redondo collor impermealizado pes palito amarelo lymdecor 3301986
price : 199.5

5/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : grade moldura - choque audi a3 sportback 09 10 11 12 furo preto
price : 139.9

description : grade moldura - choque audi a3 sportback 09 10 11 12 furo preto lado esquerdo motorista 3709431
price : 139.9

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cozinha compacta 5141 sicilia siena moveis argila / amarelo gema
price : 2499.9

description : cozinha compacta 5141 sicilia siena moveis argila / vermelho scarlet 2555442
price : 2499.9

7/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo mesa redonda brahma 4 cadeiras bahia mel - moveis parana
price : 1041.1

description : jogo mesa redonda heineken 4 cadeiras bahia mel - moveis parana 3489157
price : 1095.9

8/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


p


description : cozinha compacta 5141 sicilia siena moveis argila / amarelo gema
price : 2499.9

description : cozinha compacta 5141 sicilia siena moveis argila / vermelho scarlet 2555442
price : 2499.9

7/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : jogo mesa redonda brahma 4 cadeiras bahia mel - moveis parana
price : 1041.1

description : jogo mesa redonda heineken 4 cadeiras bahia mel - moveis parana 3489157
price : 1095.9

7/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo mesa redonda brahma 4 cadeiras bahia mel - moveis parana
price : 1041.1

description : jogo mesa redonda brahma 4 cadeiras camboriu mel moveis parana 3568654
price : 1095.9

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : capacete corazza air ultra safe
price : 199.2

description : capacete corazza air ultra safe 1830869
price : 199.2

8/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo mesa redonda heineken 4 cadeiras camboriu mel - moveis parana
price : 1041.1

description : jogo mesa redonda heineken 4 cadeiras bahia mel - moveis parana 3489157
price : 1095.9

9/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : kit coifa homocinetica daewoo espero
price : 35.0

description : kit coifa homocinetica cambio esquerdo rolamento golf , golf , golf , polo classic 1.8 3714147
price : 31.04

10/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : banco pequeno naturalle branco - tramontina
price : 80.99

description : banco pequeno naturalle branco in casa tramontina 3774992
price : 111.48

10/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escapamento esportivo 788 aco pro tork honda fan 125 2014 2015 + brinde
price : 155.09

description : escapamento esportivo 788 aco pro tork honda titan 150 esd 2004 2008 + brinde cromado 3582741
price : 159.24

11/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cozinha compacta 5141 sicilia siena moveis argila / amarelo gema
price : 2499.9

description : cozinha compacta 5142 sicilia siena moveis argila 2556669
price : 1799.9

11/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : kit piscina inflavel bestway 10.179l filtro capa bomba
price : 899.9

description : kit piscina inflavel bestway 3.700ls filtro capa bomba forro 220 2179512
price : 499.9

11/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : mesa centro duo - cinza
price : 949.0

description : mesa centro duo 2 gavetas natural amarelo maxima 2884394
price : 967.8

11/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : kit 2 fornos 3 portas fellicci master kd25 carvalho blanche
price : 393.28

description : kit 2 fornos 3 portas fellicci master kd25 carvalho blanche 3573674
price : 419.0

11/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : comoda 3 gavetas nature branco fosco / eco wood
price : 598.98

description : comoda 3 gavetas nature branco fosco / eco wood 3067539
price : 598.98

12/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo anel toyota corolla 1.8 16v 7afe
price : 120.0

description : jogo anel toyota corolla 1.6 / 1.8 16v 02 / ... 3719711
price : 108.0

13/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [20]:
linker2.train()

INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: Tfid

In [21]:
m2 = linker.match(p,w, 0)

INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8, description)
INFO:dedupe.api:0 records
INFO:dedupe.blocking:10000, 8.7587402 seconds


In [22]:
for i in m2:
    print(p[i[0][0]], w[i[0][1]])

{'description': 'lanterna traseira saveiro g3 2000 2001 2002 2003 2004 2005 bicolor 484482', 'price': 63.15} {'description': 'par farol daylight golf 99 2000 2001 2002 2003 2004 2005 2006 led mascara cromada', 'price': 878.4}
{'description': 'kit coifa homocinetica cambio esquerdo rolamento golf , golf , golf , polo classic 1.8 3714147', 'price': 31.04} {'description': 'kit coifa homocinetica cambio esquerdo rolamento golf , golf , golf , polo classic 1.8', 'price': 31.04}
{'description': 'pneu linglong 235 / 60r16 tl 100h crosswind hp010 3740579', 'price': 481.48} {'description': 'pneu linglong 235 / 60r16 tl 100h crosswind hp010', 'price': 416.56}
{'description': 'kit farol mascara cromada sprinter cdi 2003 2004 2005 2006 2007 2008 2009 2010 2011 grade 489263', 'price': 445.39} {'description': 'central multimidia honda fit 2003 2004 2005 2006 2007 2008', 'price': 1567.99}
{'description': 'par farol peugeot 206 99 2000 2001 2002 2003 2004 2005 2006 2007 2008 black foco duplo angel eye