In [15]:
import dedupe
import json
from cytoolz import *
from copy import copy
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse
from unicodedata import normalize

In [16]:
# Comparison schema handed to dedupe.RecordLink below.
# 'description' is listed twice on purpose, once as a 'String' variable and
# once as a 'Text' variable, so two comparators run over the same text
# (see dedupe's variable-definition docs for what each type computes).
# Every variable is declared with 'has missing': True.
fields = [
    {'field': field_name, 'type': variable_type, 'has missing': True}
    for field_name, variable_type in (
        ('description', 'String'),
        ('description', 'Text'),
        ('price', 'Price'),
    )
]


def tokenize_url(url):
    """Tokenize the path component of ``url``.

    Slashes and hyphens in the path are turned into spaces before the
    result is handed to ``word_tokenize``; scheme, host, query string and
    fragment are ignored.

    Returns whatever ``word_tokenize`` returns for the spaced-out path.
    """
    spaced_path = urlparse(url).path
    for separator in ('/', '-'):
        spaced_path = spaced_path.replace(separator, ' ')
    return word_tokenize(spaced_path)


def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is NFKD-normalized so accented letters split into a base
    letter plus combining marks; every character that still is not ASCII
    after that (the marks, and symbols with no ASCII decomposition) is
    dropped.
    """
    decomposed = normalize('NFKD', input_str)
    ascii_only = decomposed.encode('ascii', errors='ignore')
    return ascii_only.decode()


def sanitize_string(input_str):
    """Normalize raw text before it is filtered by ``clean_string``.

    Steps, in order: lower-case, strip accents (``remove_accents``), turn
    carriage returns / newlines / tabs into spaces, delete a fixed set of
    punctuation characters, then drop noise tokens via ``clean_string``.

    input_str: the raw string (must support ``.lower()``).
    Returns the cleaned, space-joined string produced by ``clean_string``.
    """
    text = remove_accents(input_str.lower())
    # Line breaks and tabs separate words. Deleting them outright (as the
    # single combined pattern used to) glues the neighbouring words
    # together, e.g. "kit\nreparo" -> "kitreparo"; map them to a space so
    # the tokenizer still sees two words.
    text = re.sub(r'[\r\n\t]', ' ', text)
    # Same punctuation set as before. '|' is listed because the original
    # character class contained it literally, so pipes were always removed.
    text = re.sub(r'[?|&!@#;*~()´^]', '', text)
    return clean_string(text)


def _removable_tokens():
    """Return the frozenset of tokens that ``clean_string`` discards.

    The set is built on the first call and cached on the function object,
    so the NLTK stop-word corpus is read once instead of once per cleaned
    string.
    """
    cached = getattr(_removable_tokens, '_cache', None)
    if cached is None:
        # Marketing filler words (already accent-free and lower-case).
        unuseful = {'confira', 'compre', 'oferta', 'novo',
                    'preco', 'agora', 'melhores', 'aqui',
                    'aproveite', 'menor', 'maior', 'encontra',
                    'site', 'condicoes', 'ofertas', 'imbativeis',
                    'vendas', 'online', 'nova', 'tecnologia',
                    'precos', 'pagamento', 'melhor', 'veja', 'encontre',
                    'menores', 'vem', 'venha', 'ver'}

        # Retailer names and domains.
        # NOTE(review): 'magazine luiza' and 'casas bahia' contain a space,
        # and filtering is done token by token, so these two entries can
        # never match a single token. They are kept so the set is unchanged;
        # confirm whether phrase-level removal was intended.
        names = {'walmart', 'walmart.com', 'shoptime', 'shoptime.com',
                 'pontofrio', 'pontofrio.com', 'magazineluiza', 'magazineluiza.com',
                 'magazine luiza', 'americanas', 'americanas.com', 'submarino',
                 'submarino.com', 'extra', 'extra.com.br', 'casas bahia', 'casasbahia.com.br'}

        # Stop words are accent-stripped so they can match accent-stripped input.
        stopwords = {remove_accents(s) for s in nltk.corpus.stopwords.words('portuguese')}

        cached = frozenset(unuseful | names | stopwords)
        _removable_tokens._cache = cached
    return cached


def clean_string(input_str):
    """Tokenize ``input_str`` and drop filler words, retailer names and
    Portuguese stop words.

    input_str: text to filter; the word lists are lower-case and
        accent-free, so the input is expected to be normalized the same way.
    Returns the surviving tokens joined by single spaces (an empty string
    when nothing survives).
    Raises whatever ``nltk`` raises if the stop-word corpus or tokenizer
    data is not installed.
    """
    blocked = _removable_tokens()
    # Set membership keeps the per-token check O(1); the previous list made
    # every lookup scan several hundred entries.
    return ' '.join(word for word in word_tokenize(input_str) if word not in blocked)


def preprocess(item):
    """Reduce a raw product record to the two fields used for linkage.

    item: mapping with 'description' and 'price' entries, both strings.
    Returns a dict with the sanitized description and the price converted
    to ``float`` (the price text goes through ``sanitize_string`` first).
    Raises ``ValueError`` if the sanitized price is not a valid float.
    """
    description = sanitize_string(item['description'])
    price_text = sanitize_string(item['price'])
    return {'description': description, 'price': float(price_text)}

In [17]:
def _load_products(path):
    """Read a product dump and return its preprocessed records.

    path: JSON file whose top-level object has a 'products' list.
    Returns a list with one ``preprocess`` result per product, in file order.
    """
    # Decode explicitly as UTF-8: without it open() falls back to the
    # platform's locale encoding, which can mangle the accented
    # Portuguese text on machines whose default is not UTF-8.
    with open(path, encoding='utf-8') as f:
        products = json.load(f)['products']
    # The 'image' key is dropped before preprocessing, as before.
    return [preprocess(dissoc(item, 'image')) for item in products]


# NOTE(review): the variable names do not match the directories they are
# loaded from ('walmart' <- americanas, 'submarino' <- magazineluiza).
# The names are kept because later cells refer to them; confirm which
# stores these really are.
walmart = _load_products('../data/americanas/buscape.json')
submarino = _load_products('../data/magazineluiza/buscape.json')

In [18]:
# dedupe takes each dataset as a dict of records keyed by id; the id used
# here is the record's position in its list, as a string.
p = {str(position): record for position, record in enumerate(walmart)}
w = {str(position): record for position, record in enumerate(submarino)}

In [19]:
# Create the record-linkage model from the comparison schema in `fields`.
linker = dedupe.RecordLink(fields)
# Prepare candidate pairs across the two datasets for the labeling step.
# NOTE(review): no random seed is set in this notebook, so the pairs that
# get presented below presumably differ from run to run — confirm.
linker.sample(p, w)
# Interactive: prints one pair at a time and reads y/n/u/f answers from the
# console (the transcript is the output of this cell). This step cannot be
# reproduced by "Run All" without a person answering the prompts.
dedupe.consoleLabel(linker)

description : kit reparo piscina 106400 . 0824175
price : 14.9

description : kit reparo balanca traseira peugeot 206 207 1.4 1.6 todos barra estabilizadora
price : 105.9

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : tapete aviao - castor 8372374
price : 59.9

description : tapete personalizado nissan x - terra 2009 2012 preto 5 pcs + trava seguranca
price : 59.9

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escova dental colgate smiles 5+ anos 8306039
price : 17.19

description : escova dental colgate smiles 5 anos
price : 14.99

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escova dental closeup ultra reach media 1 unidade 8253885
price : 16.47

description : escova dental close up ultra reach media
price : 14.44

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : comoda infantil 4 gavetas bb 500 bco . 0838507
price : 269.99

description : criado infantil 4 gavetas
price : 420.0

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : adesivo soleira resinada ranger 13 16 cabine dupla 4 pecas preto cromo 8588438
price : 65.9

description : adesivo soleira resinada gol g3 g4 g5 g6 00 16 4 pecas preto cromo
price : 49.9

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escova dental closeup ultra reach media 1 unidade 8253885
price : 16.47

description : escova dental close up ultra reach macia
price : 14.44

2/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escova dental closeup ultra reach macia 1 unidade 8207840
price : 16.47

description : escova dental close up ultra reach macia
price : 14.44

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cama solteiro cama auxiliar cm bco / bco / pto . 1232793
price : 449.1

description : cama solteiro vital
price : 254.2

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : calca motocross pro tork insane 4 amarela roxa trilha - tamanho 42 8507281
price : 126.88

description : calca motocross pro tork insane 4 vermelha cinza trilha
price : 106.9

4/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : calca motocross pro tork insane 4 amarela roxa trilha - tamanho 54 8558621
price : 126.88

description : calca motocross pro tork insane 3 azul trilha
price : 99.9

5/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : kit emblema resinado chevrolet dourado fosco fundo dourado 10,5x3,7cm 25 pecas 8503538
price : 114.75

description : kit emblema resinado chevrolet dourado fosco borda cromada 9,5x3,5cm 10 pecas
price : 54.9

6/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : camisa motocross pro tork insane 5 branco preto trilha - tamanho gg 8561531
price : 38.9

description : calca motocross pro tork insane 3 azul trilha
price : 99.9

6/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : emblema chevrolet gravata dourada modelo corsa sedan 2010 2011 porta - malas 9131031
price : 14.9

description : emblema chevrolet gravata dourada 5,3x2,0cm
price : 3.9

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : teclado gamer multimidia g910 orion spark 920 - 006385 . 2170532
price : 1022.91

description : teclado gamer logitech g910 orion spark fio
price : 1209.99

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escova dental colgate twister ultra completo 3 unidades 8332615
price : 15.35

description : escova dental colgate dr. rabbit
price : 3.49

6/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : kit cabelo manteiga la bella liss shampoo 500ml , mascara 950g leave - in 150g 8365894
price : 148.7

description : kit sobrevivencia
price : 229.9

6/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : calca motocross pro tork insane 4 amarela roxa trilha - tamanho 42 8507281
price : 126.88

description : calca motocross pro tork connect spice trilha
price : 264.9

6/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : calca motocross pro tork insane 4 amarela roxa trilha - tamanho 40 8535169
price : 126.88

description : calca motocross pro tork connect spice trilha
price : 264.9

7/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : adesivo soleira resinada c3 03 17 4 pecas preto cromo 8507409
price : 42.9

description : adesivo soleira resinada gol g3 g4 g5 g6 00 16 4 pecas preto cromo
price : 49.9

8/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : dvd player trc dvd 170 karaoke usb dvd 170 . 2152667
price : 129.0

description : dvd - amarcord
price : 32.99

8/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : lampada bulbo led - 12w - e27 - branco frio - bivolt 9250797
price : 15.2

description : lampada h4 24v 70w
price : 12.5

8/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : vibrador gasolina concreto , potencia 5,5 hp , motor ge 550 - v55 - g 8300391
price : 859.0

description : motor gasolina nagano nmg65
price : 969.99

8/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : dvd player philips dvp2880x / 78 divx ultra hdmi dvp2880x / 78 . 1907580
price : 169.0

description : dvd alma pudor
price : 47.99

8/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escova dental colgate smiles 5+ anos 8306039
price : 17.19

description : escova dental colgate dr. rabbit
price : 3.49

8/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : lampada super led multilaser hb4 12v 30w 6200k - au831 - neutro 9149788
price : 143.91

description : kit lampada super led h13 12v 32w 6000k
price : 105.0

8/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escova dental oral - b 3d white advantage 2 unidades 8379904
price : 17.09

description : escova dental oral - b indicator plus 40 - 2 unidades
price : 12.99

8/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : dvd player philips dvp2880x / 78 divx ultra hdmi dvp2880x / 78 . 1907580
price : 169.0

description : dvd - amarcord
price : 32.99

8/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : vela ngk dcpr8eix bmw / ducati / harley / ktm iridium 8285994
price : 59.8

description : vela ngk cr9ehix9 6216
price : 59.8

8/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa infantils 2 portas fofinho art in moveis branco 8368822
price : 319.89

description : guarda roupa infantil rubi 4 portas - canaa
price : 1195.2

8/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa solteiro 4 portas 3 gavetas 150533 . 2167001
price : 432.9

description : guarda roupa solteiro 2 portas 3 gavetas thor moveis horizonte freijo dourado
price : 1099.9

8/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escova dental oral - b complete 40 2 unidades 8212826
price : 10.91

description : escova dental oral - b indicator plus 40 - 2 unidades
price : 12.99

8/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : radio portatil am / fm 20 faixas c / entrada auxiliar bd 112 ba . 2152164
price : 152.1

description : sos agua
price : 32.9

9/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : camisa motocross pro tork insane 5 branco preto trilha - tamanho gg 8561531
price : 38.9

description : camisa jett tattooed - pro tork
price : 43.9

9/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa casal 8 portas 6 gavetas 120780 . 2167034
price : 1169.99

description : guarda roupa casal 3 portas 6 gavetas natus moveis horizonte canela
price : 1599.9

9/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa casal 8 portas 6 gavetas 120700 . 2167033
price : 1169.99

description : guarda roupa casal 3 portas 6 gavetas natus moveis horizonte canela
price : 1599.9

9/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : camisa motocross pro tork insane 100 preta trilha - tamanho m 8505172
price : 40.9

description : camisa pro tork insane 3 azul motocross
price : 39.9

9/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : guarda - roupa casal 3 portas correr 3 gavetas 4013 - 166 . 2145710
price : 1079.99

description : ponte modulada casal 3 portas nobre
price : 548.86

10/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa / comoda compacto 2 portas fofinho rp5040 li art in moveis lilas 8298672
price : 469.9

description : guarda - roupa comoda quarto compacto rp5040 branco / rosa - art in moveis
price : 579.9

10/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : guarda - roupa casal 5 portas 4 gavetas 101883 . 2167004
price : 622.03

description : guarda - roupa casal santos andira star 5 portas 3 gavetas espelho - branco opcao rosa
price : 499.9

11/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : sofa retratil reclinavel 2 lugares suede cor 592 azul . 1219204
price : 1259.1

description : sofa retratil 2 lugares tauri suede liso prata
price : 1189.0

11/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : colchao solteiro americanflex espuma d28 88x188cm 5012807 . 2161338
price : 461.69

description : colchao solteiro espuma certificada d28 78x188x14
price : 349.0

11/10 positive, 28/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : par palhetas limpador - brisa bosch aerotwin plus bmw m 6 coupe f 13 2012 diante - ap26 / ap17 8262901
price : 164.9

description : palhetas limpador parabrisa bosch aerotwin plus ap19m + ap24m jaguar xf - 2010
price : 139.9

11/10 positive, 29/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : sofa retratil 3 lugares suede verona verona 3l 1707 . 1219195
price : 899.91

description : sofa retratil 3 lugares tauri suede liso preto
price : 1669.0

11/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa casal 6 portas 6 gavetas 149983 . 2161675
price : 647.1

description : guarda roupa casal 6 portas 8 gavetas espelho coliseu moveis horizonte canela
price : 1899.9

11/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa casal 6 portas 6 gavetas 149983 . 2161675
price : 647.1

description : guarda roupa casal 6 portas 8 gavetas espelho coliseu moveis horizonte branco
price : 1899.9

11/10 positive, 32/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : par palhetas limpador - brisa bosch aerotwin plus peugeot 508 2011 diante - ap26 / ap20 8382610
price : 164.9

description : palhetas limpador parabrisa bosch aerotwin plus ap19m + ap24m jaguar xf - 2010
price : 139.9

11/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : calca trek fish grafite g guepardo 8574438
price : 208.0

description : calca thorre mista
price : 219.99

11/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : calca trek fish grafite g guepardo 8574438
price : 208.0

description : calca triya malha
price : 179.99

11/10 positive, 35/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : camisa motocross pro tork jett hi - vis verde neon - tamanho gg 8535815
price : 51.9

description : calca motocross jett hi - vis rosa neon pro tork
price : 181.9

11/10 positive, 36/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : camisa motocross pro tork jett hi - vis verde neon - tamanho g 8593374
price : 51.9

description : calca motocross jett hi - vis rosa neon pro tork
price : 181.9

11/10 positive, 37/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : sofa retratil 2 lugares suede tetra t20 - 474 . 2145645
price : 839.61

description : sofa retratil 2 lugares tauri suede liso preto
price : 1189.0

11/10 positive, 38/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : beliche madeira macica moveis lencois 078 / imb . 1235025
price : 323.99

description : bau madeira macica moveis neppel neppel
price : 308.95

11/10 positive, 39/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda roupa casal 6 portas 3 gavetas caju henn malbec 8270409
price : 579.9

description : guarda roupa casal 6 portas 8 gavetas espelho coliseu moveis horizonte freijo dourado
price : 1899.9

11/10 positive, 40/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa casal athenas 6 portas 4 gavetas 16554 . 1234973
price : 2043.0

description : guarda roupa infantil rubi 4 portas - canaa
price : 1195.2

11/10 positive, 41/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [20]:
# Fit the linker on the pairs labeled in the previous cell. The log output
# shows a cross-validated alpha search followed by the blocking predicates
# that were considered.
linker.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4,

In [24]:
# Link records in p to records in w. The third positional argument (0.2)
# is presumably the minimum match score — TODO confirm against the
# installed dedupe version; the lowest score in the result below is ~0.22.
m = linker.match(p,w, 0.2)

INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.api:0 records
INFO:dedupe.api:100 records


In [25]:
# Show the linked pairs; each entry is ((key into p, key into w), score).
m

[(('8700', '2351'), 0.91932768),
 (('8706', '2353'), 0.81043434),
 (('8707', '2364'), 0.81043434),
 (('8576', '3940'), 0.61180764),
 (('8578', '3942'), 0.57256919),
 (('3564', '6227'), 0.50662416),
 (('8577', '3945'), 0.47610179),
 (('2943', '3863'), 0.46175286),
 (('2028', '6945'), 0.4521119),
 (('8703', '2356'), 0.43386456),
 (('7308', '466'), 0.43194285),
 (('2032', '6943'), 0.35962871),
 (('8130', '802'), 0.26454091),
 (('3565', '6226'), 0.23791718),
 (('3571', '6212'), 0.21998759)]

In [26]:
# Print both records of every linked pair side by side (score is not shown).
for (p_key, w_key), _score in m:
    print(p[p_key], w[w_key])

{'description': 'escova dental colgate smiles 5 anos', 'price': 14.99} {'description': 'escova dental colgate smiles 5+ anos 8306039', 'price': 17.19}
{'description': 'escova dental close up ultra reach media', 'price': 14.44} {'description': 'escova dental closeup ultra reach media 1 unidade 8253885', 'price': 16.47}
{'description': 'escova dental close up ultra reach macia', 'price': 14.44} {'description': 'escova dental closeup ultra reach macia 1 unidade 8207840', 'price': 16.47}
{'description': 'calca motocross pro tork insane 4 vermelha cinza trilha', 'price': 106.9} {'description': 'calca motocross pro tork insane 4 amarela roxa trilha - tamanho 38 8522268', 'price': 126.88}
{'description': 'calca motocross pro tork insane 3 azul trilha', 'price': 99.9} {'description': 'calca motocross pro tork insane 4 amarela roxa trilha - tamanho 40 8535169', 'price': 126.88}
{'description': 'guarda - roupa infantil selena 4 portas 2 gavetas branco - carolina baby', 'price': 1014.9} {'descrip

In [5]:
# Number of records loaded from '../data/americanas/buscape.json'.
# NOTE(review): this cell's execution count (In [5]) is lower than the
# cells above it, so it was run out of order; re-run after a kernel restart.
len(walmart)

9035

In [6]:
# Number of records loaded from '../data/magazineluiza/buscape.json'.
# NOTE(review): executed out of order as well (In [6]).
len(submarino)

8970