In [24]:
import dedupe
import json
from cytoolz import *
from copy import copy
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse
from unicodedata import normalize
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [2]:
# Variable definitions handed to dedupe.RecordLink further down.
fields = [
    # Free-text comparison of the product description.
    {
        'field': 'description',
        'variable name': 'description',
        'type': 'String',
    },
    # Comparison of the numeric price.
    {
        'field': 'price',
        'variable name': 'price',
        'type': 'Price',
    },
    # Interaction term; it refers to the two 'variable name' values above.
    {
        'type': 'Interaction',
        'interaction variables': ['description', 'price'],
    },
]


def tokenize_url(url):
    """Split the path component of ``url`` into word tokens.

    Only ``urlparse(url).path`` is used. Every '/' and '-' in it is replaced
    by a space and the result is passed to ``word_tokenize``.
    """
    spaced_path = urlparse(url).path.replace('/', ' ').replace('-', ' ')
    return word_tokenize(spaced_path)


def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is NFKD-normalized, encoded to ASCII with ``errors='ignore'``
    (so anything that is not ASCII after normalization is dropped) and
    decoded back to ``str``.
    """
    decomposed = normalize('NFKD', input_str)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode()


def sanitize_string(input_str):
    """Normalize a raw string (used by ``preprocess`` for descriptions and prices).

    * If the string *starts with* ``digits.digits.digits`` it is split on '.'
      and only the first two parts are returned, joined by a dot.
    * Otherwise it is lower-cased, passed through ``remove_accents``, stripped
      of every character listed in ``pattern`` and handed to ``clean_string``.
    """
    price_pattern = r'[\d]+\.[\d]+\.[\d]+'
    # Character class: every listed character is removed. Because '|' sits
    # inside the brackets it is a literal, so pipe characters are removed too.
    pattern = r'[?|&|!|@|#|;|*|~|(|)|´|^|\r|\n|\t]'

    # re.match anchors at the start only, so any text after the three-part
    # number still takes this branch.
    # NOTE(review): '1.399.90' yields '1.399' here. If the three-part form is
    # a thousands-separated price, this is not the intended value; confirm
    # the input format.
    if re.match(price_pattern, input_str) is not None:
        parts = input_str.split('.')
        return '.'.join(parts[:2])

    ascii_lower = remove_accents(input_str.lower())
    without_symbols = re.sub(pattern, '', ascii_lower)
    return clean_string(without_symbols)


# Promotional filler words removed from descriptions (lower-case, no accents).
_UNUSEFUL_WORDS = frozenset([
    'confira', 'compre', 'oferta', 'novo',
    'preco', 'agora', 'melhores', 'aqui',
    'aproveite', 'menor', 'maior', 'encontra',
    'site', 'condicoes', 'ofertas', 'imbativeis',
    'vendas', 'online', 'novo', 'nova', 'tecnologia',
    'precos', 'pagamento', 'melhor', 'veja', 'encontre',
    'menores', 'vem', 'venha', 'ver',
])

# Retailer names removed from descriptions. Entries containing a space span
# several tokens and are matched as token sequences in clean_string.
_STORE_NAMES = frozenset([
    'walmart', 'walmart.com', 'shoptime', 'shoptime.com',
    'pontofrio', 'pontofrio.com', 'magazineluiza', 'magazineluiza.com',
    'magazine luiza', 'americanas', 'americanas.com', 'submarino',
    'submarino.com', 'extra', 'extra.com.br', 'casas bahia', 'casasbahia.com.br',
])

# Filled on the first clean_string call so the NLTK stopword corpus is read
# once instead of once per cleaned string.
_CLEAN_STRING_CACHE = {}


def _clean_string_filters():
    """Return ``(blocked_tokens, blocked_phrases)``, building them on first use."""
    if not _CLEAN_STRING_CACHE:
        stopwords = {remove_accents(s) for s in nltk.corpus.stopwords.words('portuguese')}
        _CLEAN_STRING_CACHE['tokens'] = frozenset(_UNUSEFUL_WORDS | _STORE_NAMES | stopwords)
        _CLEAN_STRING_CACHE['phrases'] = tuple(
            tuple(name.split()) for name in sorted(_STORE_NAMES) if ' ' in name
        )
    return _CLEAN_STRING_CACHE['tokens'], _CLEAN_STRING_CACHE['phrases']


def clean_string(input_str):
    """Drop filler words, retailer names and Portuguese stopwords from ``input_str``.

    The string is tokenized with ``word_tokenize``; the surviving tokens are
    returned joined by single spaces. Comparison is exact, so the caller is
    expected to pass lower-cased, accent-free text (``sanitize_string`` does).

    Multi-word retailer names such as 'casas bahia' are removed when their
    tokens appear consecutively. A single-token membership test can never
    match them, so previously they were left in the output.

    Raises whatever ``nltk.corpus.stopwords.words`` raises when the stopword
    corpus is unavailable.
    """
    blocked_tokens, blocked_phrases = _clean_string_filters()
    tokens = word_tokenize(input_str)

    kept = []
    pos = 0
    while pos < len(tokens):
        # Length of the blocked phrase starting at this position, or 0 if none.
        phrase_len = next(
            (len(phrase) for phrase in blocked_phrases
             if tuple(tokens[pos:pos + len(phrase)]) == phrase),
            0,
        )
        if phrase_len:
            pos += phrase_len
            continue
        if tokens[pos] not in blocked_tokens:
            kept.append(tokens[pos])
        pos += 1
    return ' '.join(kept)


def preprocess(item):
    """Build the two-field record used for linkage from one raw product entry.

    Reads ``item['description']`` and ``item['price']`` and runs both through
    ``sanitize_string``; the sanitized price is converted to ``float``.
    No other key of ``item`` is read.
    """
    clean_description = sanitize_string(item['description'])
    numeric_price = float(sanitize_string(item['price']))
    return {'description': clean_description, 'price': numeric_price}

In [3]:
# Load the Walmart product dump and normalize each entry with preprocess().
# The encoding is passed explicitly so the JSON text is decoded as UTF-8
# regardless of the platform's default encoding.
with open('../data/walmart/buscape.json', encoding='utf-8') as f:
    walmart = [preprocess(dissoc(item, 'image')) for item in json.load(f)['products']]

In [4]:
# Load the second product dump and normalize each entry with preprocess().
# NOTE(review): the variable is called `submarino` but the file read is the
# americanas dump; confirm which store is intended.
# The encoding is passed explicitly so the JSON text is decoded as UTF-8
# regardless of the platform's default encoding.
with open('../data/americanas/buscape.json', encoding='utf-8') as f:
    submarino = [preprocess(dissoc(item, 'image')) for item in json.load(f)['products']]

# Key each record by its list position (as a string): p holds the walmart
# records, w holds the submarino records.
p = {str(i): record for i, record in enumerate(walmart)}
w = {str(i): record for i, record in enumerate(submarino)}

In [11]:
# Training a logistic-regression linker. SGDClassifier with loss='log' fits a
# logistic-regression model by stochastic gradient descent; it is not a
# support vector machine (that would be loss='hinge').
linker = dedupe.RecordLink(fields)
# Replace the linker's classifier with the scikit-learn estimator.
linker.classifier = SGDClassifier(loss='log')
linker.sample(p, w)
# Starts the interactive labeling session whose transcript follows this cell.
dedupe.consoleLabel(linker)

description : cadeira lis eames pp revestida tecido cinza
price : 448.95

description : cadeira jantar toda revestida couro ecologico or - 4401 or design branco 3513822
price : 269.0

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : saida triya drapeada
price : 179.99

description : cortina esmeralda quarto sala 3,00m x 2,80m avela palha 3687606
price : 179.99

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escapamento turbo esportivo pro tork honda titan 150 2004 2008 esd + brinde
price : 121.22

description : escapamento turbo esportivo pro tork honda titan 150 2004 2008 esd + brinde 3582479
price : 124.46

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : conjunto sala jantar mesa 6 cadeiras madesa marselha tabaco / suede perola bege / marrom
price : 1399.9

description : conjunto sala jantar mesa 6 cadeiras madesa marselha tabaco / suede perola bege / marrom 2345660
price : 1399.9

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : vara pesca telescopica vitoria 4 m 4008 marine sports
price : 33.9

description : vara pesca telescopica vitoria 1.80 m 1804 marine sports 2185255
price : 15.9

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : aplique cromado tampa combustivel gol 95 / 98 ser auto - 13002
price : 17.86

description : aplique cromado tampa combustivel fox 2003 1317908
price : 20.9

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : mesa diana madeira summa - tramontina
price : 449.82

description : mesa diana madeira summa tramontina 3736078
price : 461.76

2/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : moldura interruptor vidro redonda simples porta dianteira traseira cinza universal uso geral
price : 12.01

description : moldura interruptor vidro redonda simples porta dianteira traseira preto universal uso geral 3679570
price : 12.01

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo mesa 70 70 heineken 2 cadeiras santa fe mel - moveis parana
price : 892.9

description : jogo mesa 70 70 heineken 4 cadeiras pamplona mel - moveis parana 3489124
price : 1339.87

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : bolsa termica ice cooler 24 litros cinza - mor
price : 65.9

description : bolsa termica ice cooler 24 litros 01 divisoria bolso lateral mor 476362
price : 71.9

4/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cadeira 1717 cromada 04 unidades cacau carraro
price : 602.0

description : cadeira 1709 cromada 04 unidades napa cacau carraro 2897171
price : 695.63

5/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : mesa bar alta cor stain jatoba
price : 608.4

description : cadeira bar alta cor stain jatoba 15644 3168782
price : 445.36

5/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : balcao pia 3 portas 2 gavetas elis tampo branco / amendoa - glamy
price : 417.58

description : balcao pia 3 portas 2 gavetas elis tampo branco / gold - glamy - 83,00 x 120,00 x 51,00 3507856
price : 395.0

5/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : mesa lateral kenzo 4152 santana - knr moveis
price : 269.12

description : mesa lateral kenzo 4152 santana knr moveis 3839115
price : 269.12

5/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo 2 cadeiras fixa c - 694 premium preto
price : 542.9

description : jogo 2 cadeiras fixa c - 724 premium preto 3825317
price : 412.9

6/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : interface desbloqueio tela vw golf 2014 2016 faaftech ft - video - lvds aud3
price : 1679.0

description : interface desbloqueio tela vw golf 2014 2016 faaftech ft video lvds aud3 3796527
price : 1678.98

6/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cadeira relic preto
price : 329.99

description : cadeira relic preta 2955881
price : 353.58

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cadeira 026 cromada 04 unidades branca carraro
price : 359.59

description : cadeira 357 cromada 04 unidades branca carraro 3897508
price : 645.51

8/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cadeira 190 cromada 04 unidades fantasia branco carraro
price : 635.76

description : cadeira 1717 cromada 02 unidades fantasia branco carraro 2897222
price : 280.0

8/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : pinhao 13 dentes yamaha dt200 l ano 1985 - vaz
price : 49.74

description : pinhao 13 dentes yamaha dt200 l ano 1985 vaz 3784919
price : 54.26

8/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : espelho auxiliar redondo universal 75mm
price : 8.15

description : espelho auxiliar redondo universal 50mm 4024925
price : 6.08

9/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : rack teto thule wingbar aluminio subaru impreza , 4 - p sedan , 07 - 11
price : 1814.0

description : rack teto thule wingbar aluminio subaru impreza 4 - p sedan 07 - 11 4095956
price : 1813.98

9/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo 2 cadeiras fixa c - 728 premium preto
price : 489.9

description : jogo 2 cadeiras fixa c - 694 premium preto 3825366
price : 542.9

10/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cadeira 1717 cromada 04 unidades branca carraro
price : 602.0

description : cadeira 357 cromada 04 unidades branca carraro 3897508
price : 645.51

10/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo 2 cadeiras fixa c - 694 premium preto
price : 542.9

description : jogo 2 cadeiras fixa c - 728 premium preto 3825249
price : 489.9

10/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : molinete altima 4000 sistema balanceado marine sports
price : 101.9

description : molinete altima 5000 sistema balanceado marine sports 2186391
price : 145.9

10/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : molinete pesca force 4000 3 rolamentos marine sports
price : 70.9

description : molinete pesca force 2000 3 rolamentos marine sports 2187249
price : 63.9

10/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : kit amortecedor dianteiro axios br10004402513 c3 03 / partir 2003 cada 01 unidade
price : 27.62

description : kit amortecedor dianteiro axios br10004402513 c3 03 / partir 2003 cada 01 unidade 3862590
price : 27.62

10/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : kit completo suspensao rosca tebao molas preparadas c4 09 14 pallas 08 13 hatch 04 14 v
price : 799.9

description : kit completo suspensao rosca tebao molas preparadas c4 09 14 pallas 08 13 hatch 04 14 v 3922882
price : 799.9

11/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo 2 cadeiras fixa c - 694 premium branco
price : 542.9

description : jogo 2 cadeiras fixa c - 694 premium branco 3825311
price : 542.9

12/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : bucha caixa direcao npm npm - 25.202.a4 ford escort / verona 1.6 93 / 96 central cada unidade
price : 11.9

description : bucha caixa direcao npm npm - 25.202.a4 ford escort / verona 1.6 93 / 96 central cada 01 unidade 3873298
price : 11.22

13/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : canaleta alternativo opala diplomata 1985 1990 4 portas preta - jogo
price : 128.0

description : canaleta alternativo opala diplomata 1985 1990 4 portas preta jogo 3889373
price : 128.0

14/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo 2 cadeiras fixa c - 728 premium preto
price : 489.9

description : jogo 2 cadeiras fixa c - 695 premium preto 3825339
price : 517.9

15/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [14]:
# Train `linker` after the console labeling session above; the
# dedupe.blocking log lines that follow are emitted by this call.
linker.train()

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: Tfidf

In [21]:
# Link the two record sets with the trained `linker`.
# NOTE(review): the third positional argument (0) is presumably the score
# threshold, which would let every candidate pair through; confirm against
# the installed dedupe version.
m = linker.match(p,w, 0)

INFO:dedupe.canopy_index:Removing stop word ir
INFO:dedupe.canopy_index:Removing stop word ch
INFO:dedupe.canopy_index:Removing stop word ci
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:10000, 3.8836272 seconds
INFO:dedupe.api:0 records
INFO:dedupe.blocking:10000, 5.7429342 seconds


In [23]:
# Print each linked pair side by side. entry[0] holds the two record keys:
# the first indexes p, the second indexes w.
for entry in m:
    key_pair = entry[0]
    print(p[key_pair[0]], w[key_pair[1]])

{'description': 'guarnicao bomba combustivel courier 97 / 00 , ecosport 03 , ka 08 , fiesta 96 / 06 , focus 00 , new fiesta 12 dsc 3824908', 'price': 17.19} {'description': 'guarnicao bomba combustivel courier 97 / 00 , ecosport 03 , ka 08 , fiesta 96 / 06 , focus 00 , new', 'price': 17.19}
{'description': 'pneu dunlop 235 / 75r15 104s grandtrek at3 3740001', 'price': 676.24} {'description': 'pneu dunlop 235 / 75r15 104s grandtrek at3', 'price': 676.24}
{'description': 'pneu nexen 235 / 70r16 106s roat pro ra8 3739851', 'price': 647.03} {'description': 'pneu nexen 235 / 70r16 106s roat pro ra8', 'price': 647.03}
{'description': 'kit coifa homocinetica cambio esquerdo rolamento golf , golf , golf , polo classic 1.8 3714147', 'price': 31.04} {'description': 'kit coifa homocinetica cambio direito rolamento golf , golf , golf , polo classic 1.8', 'price': 30.54}
{'description': 'cadeado abus 585 75 upgrip chain preto 584589 2699610', 'price': 200.88} {'description': 'cadeado abus 585 / 75 

In [25]:
# training naive bayes classifier
# Second linker over the same field definition, with scikit-learn's
# MultinomialNB set as the classifier.
linker2 = dedupe.RecordLink(fields)
linker2.classifier = MultinomialNB()
linker2.sample(p, w)
# Starts the interactive labeling session whose transcript follows this cell.
dedupe.consoleLabel(linker2)

description : cd medo - ruas
price : 21.99

description : corneta redonda 1 hl11 / 25 prata / preta jbl 3654627
price : 71.84

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : interface volante renault symbol 2009 2014 via infra vermelho new ft - rn - ir
price : 279.0

description : interface volante toyota corolla 2009 2016 via infra vermelho ft ir 3795994
price : 279.0

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo 2 cadeiras fixa c - 728 premium branco
price : 489.9

description : jogo 2 cadeiras fixa c - 728 premium branco 3825323
price : 489.9

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : kit completo suspensao rosca tebao molas preparadas fiesta 1996 2002 fiesta street 2001 2
price : 799.9

description : kit completo suspensao rosca tebao molas preparadas fiesta 1996 2002 fiesta street 2001 2 3922952
price : 799.9

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : mesa diana madeira summa - tramontina
price : 449.82

description : mesa diana madeira summa tramontina 3736078
price : 461.76

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : som automotivo mp3 player
price : 115.9

description : som automotivo usb p2 sd bluetooth multilaser p3319 4069554
price : 140.5

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : princesas jogo cha - toyng
price : 56.62

description : par borracha multi uso premium porta preto borracha celta , corsa , ipanema , kadett , montana , monza , omega , prisma , classe , gol , saveiro 3763253
price : 104.52

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : suporte gopro tubo
price : 179.0

description : suporte pratos talheres cod . 6659 3667066
price : 259.25

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : kit completo suspensao rosca tebao molas preparadas corsa wagon 1997 2002
price : 799.9

description : kit completo suspensao rosca tebao molas preparadas meriva 2002 2012 3922606
price : 799.9

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : maquina vidro mecanica , manual porta dianteira esquerdo bandeirante
price : 63.1

description : maquina vidro mecanica , manual porta dianteira esquerdo ford f12000 , f14000 , f16000 , f250 , f , 350 , f400098 3763390
price : 174.5

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : anel pistao kit agrale 27.5 - kmp 1.00mm
price : 51.98

description : anel pistao kit agrale 27.5 kmp 1.00mm 1324554
price : 57.76

3/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : espelho auxiliar redondo universal 75mm
price : 8.15

description : espelho auxiliar redondo universal 50mm 4024925
price : 6.08

4/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : molinete pesca prisma 2000 5 rolamentos marine sports
price : 61.9

description : molinete pesca laguna 1000 5 rolamentos marine sports 2185802
price : 47.9

4/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : canaleta alternativo opala diplomata 1985 1990 4 portas preta - jogo
price : 128.0

description : canaleta alternativo opala diplomata 1985 1990 4 portas preta jogo 3889373
price : 128.0

4/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : molinete pesca force 3000 3 rolamentos marine sports
price : 68.9

description : molinete pesca force 2000 3 rolamentos marine sports 2187249
price : 63.9

5/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo 2 cadeiras fixa c - 694 premium preto
price : 542.9

description : jogo 2 cadeiras fixa c - 728 premium preto 3825249
price : 489.9

5/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : molinete ms carretel aluminio xt - 2000 marine sports
price : 97.9

description : molinete ms carretel aluminio xt - 4000 marine sports 2187281
price : 90.9

5/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : conjunto sala jantar mesa 6 cadeiras madesa marselha tabaco / suede perola bege / marrom
price : 1399.9

description : conjunto sala jantar mesa 6 cadeiras madesa marselha tabaco / suede perola bege / marrom 2345660
price : 1399.9

5/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : kit completo suspensao rosca tebao molas preparadas c4 09 14 pallas 08 13 hatch 04 14 v
price : 799.9

description : kit completo suspensao rosca tebao molas preparadas c4 09 14 pallas 08 13 hatch 04 14 v 3922882
price : 799.9

6/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : kit amortecedor dianteiro axios br10004402513 c3 03 / partir 2003 cada 01 unidade
price : 27.62

description : kit amortecedor dianteiro axios br10004402513 c3 03 / partir 2003 cada 01 unidade 3862590
price : 27.62

7/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo 2 cadeiras fixa c - 694 premium preto
price : 542.9

description : jogo 2 cadeiras fixa c - 695 premium preto 3825339
price : 517.9

8/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo 2 cadeiras fixa c - 724 premium branco
price : 412.9

description : jogo 2 cadeiras fixa c - 694 premium branco 3825311
price : 542.9

8/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo mesa 70 70 brahma 4 cadeiras camboriu mel - moveis parana
price : 1082.9

description : jogo mesa 70 70 brahma 2 cadeiras camboriu mel - moveis parana 3489098
price : 839.9

8/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo mesa redonda heineken 2 cadeiras ipanema mel - moveis parana
price : 826.4

description : jogo mesa redonda heineken 2 cadeiras ipanema mel - moveis parana 3489152
price : 869.89

8/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : calco macaneta externa porta dianteira direito scania serie iv , serie v98
price : 150.48

description : calco macaneta externa porta dianteira direito scania serie iv , serie v98 3743143
price : 150.48

9/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : punisher nemesis justiceiro funko pop marvel
price : 95.9

description : punisher nemesis justiceiro funko pop marvel 3378497
price : 102.9

10/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : bucha caixa direcao npm npm - 25.202.a4 ford escort / verona 1.6 93 / 96 central cada unidade
price : 11.9

description : bucha caixa direcao npm npm - 25.202.a4 ford escort / verona 1.6 93 / 96 central cada 01 unidade 3873298
price : 11.22

11/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : moldura interruptor vidro redonda simples porta dianteira traseira cinza universal uso geral
price : 12.01

description : moldura interruptor vidro redonda simples porta dianteira traseira preto universal uso geral 3679570
price : 12.01

12/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escrivaninha madeira mdf laqueado 5 gavetas vintage - maxima
price : 1599.0

description : escrivaninha madeira mdf laqueado 5 gavetas vintage maxima bege 3513896
price : 1599.0

13/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [30]:
# Train `linker2` after its console labeling session above; the
# dedupe.blocking log lines that follow are emitted by this call.
linker2.train()

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: Tfidf

In [64]:
# Link the two record sets with the trained `linker2`.
# NOTE(review): the third positional argument (0.5) is presumably the score
# threshold; confirm against the installed dedupe version.
m2 = linker2.match(p,w, 0.5)

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:10000, 1.7377132 seconds
INFO:dedupe.api:0 records
INFO:dedupe.api:100 records
INFO:dedupe.blocking:10000, 0.7797472 seconds
INFO:dedupe.api:200 records


In [65]:
# Print each pair linked by linker2 side by side. entry[0] holds the two
# record keys: the first indexes p, the second indexes w.
for entry in m2:
    key_pair = entry[0]
    print(p[key_pair[0]], w[key_pair[1]])

{'description': 'pneu hifly aro 16 265 70 r16 112h ht601 3731016', 'price': 517.0} {'description': 'pneu aro 16 265 / 70 r16 comforser cf2000 112h', 'price': 503.0}
{'description': 'capacete corazza air ultra safe 1830869', 'price': 199.2} {'description': 'capacete ultra safe corazza air laranja', 'price': 199.16}
{'description': 'tampo balcao coocktop 80x52x10 preto mia coccina art in moveis 1566760', 'price': 94.9} {'description': 'tampo espelho balcao 1,20m art in moveis mia coccina preto', 'price': 96.9}
{'description': 'boneca basica - monster high - assustadora barreira corais - toralei - mattel 2480747', 'price': 89.99} {'description': 'monster high boneca basica cleo nile - mattel', 'price': 89.99}
{'description': 'mesa diana summa tramontina 3736170', 'price': 438.96} {'description': 'mesa diana madeira summa - tramontina', 'price': 449.82}
{'description': 'pneu aderenza aro 17 205 / 50 r17 93w perform 3735939', 'price': 328.0} {'description': 'pneu aro 17 205 / 50 r17 comfors

In [29]:
# Training a multi-layer perceptron classifier (scikit-learn MLPClassifier
# with its default settings). This is not naive Bayes; that is linker2.
linker3 = dedupe.RecordLink(fields)
linker3.classifier = MLPClassifier()
linker3.sample(p, w)
# Starts the interactive labeling session whose transcript follows this cell.
dedupe.consoleLabel(linker3)

description : mesa lateral marley branco
price : 362.9

description : ponteira harley davidson softail fat boy 3 baixo customer fat boy 2006 atual 3067735
price : 1241.89

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : molas esportivas red coil montana 1.8 003 diante
price : 545.0

description : molas esportivas macaulay - ford fiesta 2003 diante 2095143
price : 449.0

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : retrovisor palio strada g3 04 05 06 07 controle 2 portas preto
price : 44.9

description : retrovisor palio strada g3 04 05 06 07 controle 2 portas preto lado direito + lado esquerdo 3783074
price : 89.8

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : kit completo suspensao rosca tebao molas preparadas fiesta 1996 2002 fiesta street 2001 2
price : 799.9

description : kit completo suspensao rosca tebao molas preparadas fiesta 1996 2002 fiesta street 2001 2 3922952
price : 799.9

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : conjunto sala jantar mesa 6 cadeiras madesa marselha tabaco / suede perola bege / marrom
price : 1399.9

description : conjunto sala jantar mesa 6 cadeiras madesa marselha tabaco / suede perola bege / marrom 2345660
price : 1399.9

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : molinete pesca serena 3000 3 rolamentos marine sports
price : 64.9

description : molinete pesca force 4000 3 rolamentos marine sports 2185798
price : 70.9

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa modulado 3pt branco
price : 729.99

description : guarda - roupa 3 portas correr flex color sofia pes espelho branco preto branco perfil moveis 219,00 x 191,40 x 53,00 3084656
price : 1115.1

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : mesa diana madeira summa - tramontina
price : 449.82

description : mesa diana madeira summa tramontina 3736078
price : 461.76

2/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : moldura interruptor vidro redonda simples porta dianteira traseira cinza universal uso geral
price : 12.01

description : moldura interruptor vidro redonda simples porta dianteira traseira preto universal uso geral 3679570
price : 12.01

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cadeira 026 cromada 04 unidades branca carraro
price : 359.59

description : cadeira 154 cromada 04 unidades napa branca carraro 3897504
price : 356.75

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo 2 cadeiras fixa c - 694 premium branco
price : 542.9

description : jogo 2 cadeiras fixa c - 724 premium branco 3825259
price : 412.9

3/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : kit completo suspensao rosca tebao molas preparadas c4 09 14 pallas 08 13 hatch 04 14 v
price : 799.9

description : kit completo suspensao rosca tebao molas preparadas c4 09 14 pallas 08 13 hatch 04 14 v 3922882
price : 799.9

3/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : kit amortecedor dianteiro axios br10004402513 c3 03 / partir 2003 cada 01 unidade
price : 27.62

description : kit amortecedor dianteiro axios br10004402513 c3 03 / partir 2003 cada 01 unidade 3862590
price : 27.62

4/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : aplicador pretinho mandala
price : 3.49

description : aplicador pretinho mandala 1592710
price : 3.49

5/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : molinete elite 500 3 rolos esferas marine sports
price : 35.9

description : molinete elite 3000 3 rolos esferas marine sports 2185573
price : 42.9

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : jogo tabuleiro temporada compras - mix8 611188
price : 39.9

description : jogo tabuleiro temporada compras mix8 611188 3595531
price : 39.9

6/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo 2 cadeiras fixa c - 724 premium preto
price : 412.9

description : jogo 2 cadeiras fixa c - 694 premium preto 3825366
price : 542.9

7/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : interruptor luz re 3rho 4489 ford
price : 59.2

description : interruptor luz re 3rho 4453 ford 3781767
price : 150.03

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cadeira relic preto
price : 329.99

description : cadeira relic preta 2955881
price : 353.58

7/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : bucha caixa direcao npm npm - 25.202.a4 ford escort / verona 1.6 93 / 96 central cada unidade
price : 11.9

description : bucha caixa direcao npm npm - 25.202.a4 ford escort / verona 1.6 93 / 96 central cada 01 unidade 3873298
price : 11.22

8/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : canaleta alternativo opala diplomata 1985 1990 4 portas preta - jogo
price : 128.0

description : canaleta alternativo opala diplomata 1985 1990 4 portas preta jogo 3889373
price : 128.0

9/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cadeado abus 585 / 75 upgrip chain preto 584589
price : 200.88

description : cadeado abus 585 75 upgrip chain preto 584589 2699610
price : 200.88

10/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : kit reparo amortecedor dianteiro coxim batente guarda po rolamento fiestai
price : 418.02

description : kit reparo amortecedor dianteiro coxim batente guarda po rolamento corolla 3751044
price : 588.2

11/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [33]:
# Fit linker3 on the labelled pairs it currently holds.
# NOTE(review): presumably those labels come from the interactive labelling
# session whose transcript ends just above this cell - confirm, since the
# cell that started that session is not visible here.
linker3.train()

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: Tfidf

In [68]:
# Link the two record sets with linker3, using a hand-picked score cutoff
# of 0.75 rather than a value computed by linker3.threshold().
m3 = linker3.match(p, w, threshold=0.75)

INFO:dedupe.blocking:10000, 0.8191732 seconds
INFO:dedupe.api:0 records
INFO:dedupe.api:100 records
INFO:dedupe.api:200 records
INFO:dedupe.api:300 records
INFO:dedupe.api:400 records
INFO:dedupe.api:500 records
INFO:dedupe.api:600 records
INFO:dedupe.api:700 records
INFO:dedupe.api:800 records
INFO:dedupe.api:900 records
INFO:dedupe.api:1000 records
INFO:dedupe.api:1100 records
INFO:dedupe.api:1200 records
INFO:dedupe.api:1300 records
INFO:dedupe.api:1400 records
INFO:dedupe.api:1500 records
INFO:dedupe.api:1600 records
INFO:dedupe.api:1700 records
INFO:dedupe.api:1800 records
INFO:dedupe.api:1900 records
INFO:dedupe.api:2000 records
INFO:dedupe.api:2100 records
INFO:dedupe.api:2200 records
INFO:dedupe.api:2300 records
INFO:dedupe.api:2400 records
INFO:dedupe.api:2500 records
INFO:dedupe.api:2600 records
INFO:dedupe.api:2700 records
INFO:dedupe.api:2800 records
INFO:dedupe.api:2900 records
INFO:dedupe.api:3000 records
INFO:dedupe.api:3100 records
INFO:dedupe.api:3200 records
INFO:dedu

In [69]:
# Print the two linked records side by side for every match in m3.
# Each match is ((id_in_p, id_in_w), score); the score is not shown here.
for (p_id, w_id), _score in m3:
    print(p[p_id], w[w_id])

{'description': 'kit completo suspensao rosca tebao molas preparadas fiesta 1996 2002 fiesta street 2001 2 3922952', 'price': 799.9} {'description': 'kit completo suspensao rosca tebao molas preparadas fiesta 1996 2002 fiesta street 2001 2', 'price': 799.9}
{'description': 'conjunto sala jantar mesa 6 cadeiras madesa marselha tabaco / suede perola bege / marrom 2345660', 'price': 1399.9} {'description': 'conjunto sala jantar mesa 6 cadeiras madesa marselha tabaco / suede perola bege / marrom', 'price': 1399.9}
{'description': 'kit completo suspensao rosca tebao molas preparadas c4 09 14 pallas 08 13 hatch 04 14 v 3922882', 'price': 799.9} {'description': 'kit completo suspensao rosca tebao molas preparadas c4 09 14 pallas 08 13 hatch 04 14 v', 'price': 799.9}
{'description': 'kit amortecedor dianteiro axios br10004402513 c3 03 / partir 2003 cada 01 unidade 3862590', 'price': 27.62} {'description': 'kit amortecedor dianteiro axios br10004402513 c3 03 / partir 2003 cada 01 unidade', 'pri

In [46]:
# Ask linker for a score cutoff over p and w; recall_weight=0.5 is the
# precision/recall trade-off setting passed to dedupe.
linker.threshold(p, w, recall_weight=0.5)

INFO:dedupe.canopy_index:Removing stop word el
INFO:dedupe.canopy_index:Removing stop word an
INFO:dedupe.canopy_index:Removing stop word de
INFO:dedupe.canopy_index:Removing stop word or
INFO:dedupe.canopy_index:Removing stop word me
INFO:dedupe.canopy_index:Removing stop word am
INFO:dedupe.canopy_index:Removing stop word gi
INFO:dedupe.canopy_index:Removing stop word ca
INFO:dedupe.canopy_index:Removing stop word pi
INFO:dedupe.canopy_index:Removing stop word ai
INFO:dedupe.canopy_index:Removing stop word em
INFO:dedupe.canopy_index:Removing stop word is
INFO:dedupe.canopy_index:Removing stop word in
INFO:dedupe.canopy_index:Removing stop word se
INFO:dedupe.canopy_index:Removing stop word 85
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:10000, 3.6636972 seconds
INFO:dedupe.api:0 records
INFO:dedupe.blocking:10000, 5.6282202 seconds
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 1.000
INFO:dedupe.api:pr

0.99767953

In [44]:
# Ask linker2 for a score cutoff over p and w; recall_weight=0.5 is the
# precision/recall trade-off setting passed to dedupe.
linker2.threshold(p, w, recall_weight=0.5)

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:10000, 1.7508222 seconds
INFO:dedupe.api:0 records
INFO:dedupe.api:100 records
INFO:dedupe.blocking:10000, 0.7619642 seconds
INFO:dedupe.api:200 records
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.939
INFO:dedupe.api:precision: 0.501
INFO:dedupe.api:With threshold: 0.443


0.44262093

In [45]:
# Ask linker3 for a score cutoff over p and w; recall_weight=0.5 is the
# precision/recall trade-off setting passed to dedupe.
linker3.threshold(p, w, recall_weight=0.5)

INFO:dedupe.blocking:10000, 0.8412732 seconds
INFO:dedupe.api:0 records
INFO:dedupe.api:100 records
INFO:dedupe.api:200 records
INFO:dedupe.api:300 records
INFO:dedupe.api:400 records
INFO:dedupe.api:500 records
INFO:dedupe.api:600 records
INFO:dedupe.api:700 records
INFO:dedupe.api:800 records
INFO:dedupe.api:900 records
INFO:dedupe.api:1000 records
INFO:dedupe.api:1100 records
INFO:dedupe.api:1200 records
INFO:dedupe.api:1300 records
INFO:dedupe.api:1400 records
INFO:dedupe.api:1500 records
INFO:dedupe.api:1600 records
INFO:dedupe.api:1700 records
INFO:dedupe.api:1800 records
INFO:dedupe.api:1900 records
INFO:dedupe.api:2000 records
INFO:dedupe.api:2100 records
INFO:dedupe.api:2200 records
INFO:dedupe.api:2300 records
INFO:dedupe.api:2400 records
INFO:dedupe.api:2500 records
INFO:dedupe.api:2600 records
INFO:dedupe.api:2700 records
INFO:dedupe.api:2800 records
INFO:dedupe.api:2900 records
INFO:dedupe.api:3000 records
INFO:dedupe.api:3100 records
INFO:dedupe.api:3200 records
INFO:dedu

0.59180754

In [66]:
# Display the matches stored in m2; entries render as ((id, id), score).
# NOTE(review): m2 is assigned in a cell outside this view - presumably the
# linker2 counterpart of m3; confirm before relying on it.
m2

[(('3362', '10589'), 0.57063758),
 (('12378', '6567'), 0.57016164),
 (('12509', '3609'), 0.56956112),
 (('11562', '4750'), 0.55734891),
 (('7205', '6391'), 0.55020803),
 (('3384', '10415'), 0.54823756),
 (('5313', '6222'), 0.54748064),
 (('11806', '1447'), 0.54457325),
 (('12158', '14180'), 0.5438655),
 (('6225', '5239'), 0.54313618),
 (('3912', '14842'), 0.54190528),
 (('2458', '10753'), 0.54150987),
 (('5330', '6242'), 0.54139906),
 (('14149', '1720'), 0.54095662),
 (('9664', '13222'), 0.53880292),
 (('11151', '9799'), 0.53826517),
 (('1495', '9977'), 0.53778887),
 (('860', '13588'), 0.5364548),
 (('1485', '9986'), 0.53617513),
 (('9654', '13225'), 0.5359323),
 (('2673', '1949'), 0.53439832),
 (('13068', '2352'), 0.53374159),
 (('3872', '12007'), 0.53220034),
 (('2467', '10770'), 0.5317288),
 (('12138', '14177'), 0.53163075),
 (('11071', '2887'), 0.53066498),
 (('14513', '13603'), 0.52955908),
 (('10404', '2125'), 0.5281083),
 (('7173', '10623'), 0.52781904),
 (('9659', '13223'), 0.5

In [67]:
# Display the matches stored in m3 (the linker3.match result); entries
# render as ((id, id), score).
m3

[(('2656', '3078'), 0.80469275),
 (('1503', '9998'), 0.80459869),
 (('2663', '3075'), 0.80450255),
 (('15267', '14646'), 0.80387741),
 (('11970', '12262'), 0.80302405),
 (('2662', '3074'), 0.80275083),
 (('1654', '11598'), 0.80196816),
 (('3881', '12000'), 0.80074382),
 (('3886', '11992'), 0.80074382),
 (('987', '12225'), 0.80054122),
 (('3875', '6352'), 0.80054122),
 (('3076', '656'), 0.80033189),
 (('3872', '11999'), 0.80033189),
 (('2746', '6929'), 0.79989183),
 (('9368', '3790'), 0.79989183),
 (('3868', '6349'), 0.79917222),
 (('3874', '11993'), 0.79917222),
 (('3873', '11988'), 0.79891479),
 (('3885', '6351'), 0.79891479),
 (('3889', '6356'), 0.79891479),
 (('15025', '7624'), 0.79834563),
 (('3870', '11989'), 0.79778296),
 (('5138', '13480'), 0.79608852),
 (('5149', '13477'), 0.79608852),
 (('5150', '13478'), 0.79608852),
 (('5135', '13482'), 0.79570395),
 (('5145', '13481'), 0.79570395),
 (('5147', '9903'), 0.79570395),
 (('12941', '4932'), 0.79570395),
 (('12930', '10587'), 0.79