In [2]:
import dedupe
import json
from cytoolz import *
from copy import copy
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse
from unicodedata import normalize

In [15]:
# Variable definitions handed to dedupe.RecordLink further down: one entry per
# compared record field, plus an interaction term that references the two
# entries by their 'variable name'.
fields = [
    {
        'field': 'description',
        'variable name': 'description',
        'type': 'Text',
    },
    {
        'field': 'price',
        'variable name': 'price',
        'type': 'Price',
    },
    {
        'type': 'Interaction',
        'interaction variables': ['description', 'price'],
    },
]


def tokenize_url(url):
    """Split the path component of ``url`` into word tokens.

    Only the path returned by ``urlparse`` is used. Slashes and hyphens in it
    are replaced by spaces before the text is passed to ``word_tokenize``.

    Args:
        url: URL string to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize`` for the path.
    """
    url_path = urlparse(url).path
    # Treat both path separators and hyphens as word boundaries.
    for separator in ('/', '-'):
        url_path = url_path.replace(separator, ' ')
    return word_tokenize(url_path)


def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is decomposed with Unicode NFKD normalization and then encoded
    to ASCII with errors ignored, so every code point that has no ASCII
    representation after decomposition (combining marks included) is dropped.

    Args:
        input_str: Text to convert.

    Returns:
        An ASCII-only ``str``.
    """
    decomposed = normalize('NFKD', input_str)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode()


def sanitize_string(input_str):
    """Normalize ``input_str`` and strip noise characters and filler words.

    Steps, in order:
      1. lower-case the text and reduce it to ASCII via ``remove_accents``;
      2. replace carriage returns, newlines and tabs with a space;
      3. delete the characters ``? | & ! @ # ; * ~ ( ) ^``;
      4. pass the result to ``clean_string``.

    Args:
        input_str: Raw text to sanitize.

    Returns:
        The value returned by ``clean_string`` for the normalized text.
    """
    ascii_text = remove_accents(input_str.lower())

    # Line breaks and tabs separate words. Deleting them outright glued the
    # neighbouring words together ("foo\nbar" -> "foobar"), so they become
    # spaces instead and the tokenizer in clean_string still sees two words.
    spaced = re.sub(r'[\r\n\t]', ' ', ascii_text)

    # Inside a character class '|' is a literal pipe, not alternation, so it is
    # listed once to keep pipes being stripped. The acute accent the previous
    # pattern also listed is omitted: the text is ASCII-only at this point, so
    # it could never match.
    stripped = re.sub(r'[?|&!@#;*~()^]', '', spaced)

    return clean_string(stripped)


_REMOVABLE_WORDS = None  # lazily-built cache, filled by _removable_words()


def _removable_words():
    """Return the cached frozenset of tokens that ``clean_string`` discards."""
    global _REMOVABLE_WORDS
    if _REMOVABLE_WORDS is None:
        unuseful = {'confira', 'compre', 'oferta', 'novo',
                    'preco', 'agora', 'melhores', 'aqui',
                    'aproveite', 'menor', 'maior', 'encontra',
                    'site', 'condicoes', 'ofertas', 'imbativeis',
                    'vendas', 'online', 'nova', 'tecnologia',
                    'precos', 'pagamento', 'melhor', 'veja', 'encontre',
                    'menores', 'vem', 'venha', 'ver'}

        # NOTE(review): 'magazine luiza' and 'casas bahia' contain a space and
        # are compared against single tokens below, so they presumably never
        # match - confirm whether phrase removal was intended.
        names = {'walmart', 'walmart.com', 'shoptime', 'shoptime.com',
                 'pontofrio', 'pontofrio.com', 'magazineluiza', 'magazineluiza.com',
                 'magazine luiza', 'americanas', 'americanas.com', 'submarino',
                 'submarino.com', 'extra', 'extra.com.br', 'casas bahia', 'casasbahia.com.br'}

        # Stop words are de-accented so they compare equal to text that has
        # already been through remove_accents.
        stopwords = {remove_accents(s) for s in nltk.corpus.stopwords.words('portuguese')}

        _REMOVABLE_WORDS = frozenset(unuseful | names | stopwords)
    return _REMOVABLE_WORDS


def clean_string(input_str):
    """Drop filler words, store names and Portuguese stop words from a string.

    The text is split with ``word_tokenize``; every token found in the
    removable vocabulary is discarded and the remaining tokens are joined with
    single spaces.

    The vocabulary is built once and cached as a frozenset. The earlier version
    re-read the stop-word corpus and rebuilt a list on every call, then did a
    linear membership test per token.

    Args:
        input_str: Text to filter. The comparison is exact, so callers are
            expected to pass lower-cased, de-accented text.

    Returns:
        The surviving tokens joined by a single space.
    """
    removable = _removable_words()
    return ' '.join(word for word in word_tokenize(input_str) if word not in removable)


def preprocess(item):
    """Build the two-field record used for linkage from a raw product dict.

    Args:
        item: Mapping with at least the keys ``'description'`` and ``'price'``.
            Both values are passed through ``sanitize_string``.

    Returns:
        A dict with the sanitized ``'description'`` text and the sanitized
        ``'price'`` converted to ``float``.

    Raises:
        KeyError: If either key is missing from ``item``.
        ValueError: If the sanitized price cannot be parsed by ``float``.
    """
    description = sanitize_string(item['description'])
    price = float(sanitize_string(item['price']))
    return {'description': description, 'price': price}

In [4]:
def load_products(path):
    """Read a product dump and return its preprocessed records.

    Args:
        path: Path to a JSON file whose top-level object has a ``'products'``
            list of product dicts.

    Returns:
        A list with one ``preprocess`` result per product; the ``'image'`` key
        is dropped from each product before preprocessing.
    """
    # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # relying on the platform's locale default, which can mangle accented text.
    with open(path, encoding='utf-8') as f:
        products = json.load(f)['products']
    return [preprocess(dissoc(item, 'image')) for item in products]


# NOTE(review): the variable names do not match the directories they are loaded
# from (walmart <- americanas, submarino <- magazineluiza). The names are kept
# because later cells reference them.
walmart = load_products('../data/americanas/buscape.json')
submarino = load_products('../data/magazineluiza/buscape.json')

In [16]:
# Key every record by its list position rendered as a string, giving the two
# id -> record mappings that are passed to the linker below.
p = {str(position): record for position, record in enumerate(walmart)}
w = {str(position): record for position, record in enumerate(submarino)}

In [17]:
# Create a record-linkage model over the variable definitions in `fields`.
linker = dedupe.RecordLink(fields)
# Give the linker both keyed datasets; presumably this draws the candidate
# pairs that are shown for labeling below - confirm against the dedupe docs.
linker.sample(p, w)
# Interactive step: prompts on the console for y/n/u/f/p answers per pair.
# NOTE(review): the labels come from manual keyboard input, so this cell cannot
# be reproduced by Restart & Run All.
dedupe.consoleLabel(linker)

description : kit relacao transmissao suzuki katana 125 09 - 43z x 14z - c / corrente 428h x 116l - premium 8295437
price : 74.9

description : pastilha freio - nakata - pajero sport 2002 2008 - traseiro - jogo - nkf - 1232p
price : 74.9

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : sofa retratil reclinavel chaise 2 lugares suede porsche . 2133868
price : 2645.91

description : sofa somopar adventure retratil reclinavel 2 lugares suede amassado marrom claro
price : 771.67

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escova dental oral - b complete 40 2 unidades 8212826
price : 10.91

description : escova dental oral - b indicator plus 40 - 2 unidades
price : 12.99

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escova dental colgate smiles 5+ anos 8306039
price : 17.19

description : escova dental colgate smiles 5 anos
price : 14.99

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : guarda - roupa infantil 4 portas 2 gavetas 10297 . 2137096
price : 1169.99

description : guarda - roupa infantil selena 4 portas 2 gavetas branco - carolina baby
price : 1014.9

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : guarda - roupa casal 10 portas 3 gavetas 18490 - 85 . 2128143
price : 359.99

description : guarda - roupa casal santos andira star 5 portas 3 gavetas espelho - branco opcao rosa
price : 499.9

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : sofa 3 lugares suede amsterda s31 . 1250197
price : 842.63

description : sofa 3 lugares brilhante suede prata
price : 709.99

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : calca motocross pro tork insane 4 amarela roxa trilha - tamanho 54 8558621
price : 126.88

description : calca motocross pro tork insane 4 vermelha cinza trilha
price : 106.9

2/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : calca motocross pro tork insane 4 amarela roxa trilha - tamanho 42 8507281
price : 126.88

description : calca motocross pro tork insane 4 vermelha cinza trilha
price : 106.9

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : calca motocross pro tork insane 4 amarela roxa trilha - tamanho 40 8535169
price : 126.88

description : calca motocross pro tork insane 4 vermelha cinza trilha
price : 106.9

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : adesivo soleira resinada corolla 15 16 4 pecas cromado preto 8569897
price : 53.9

description : adesivo soleira resinada gol g3 g4 g5 g6 00 16 4 pecas preto cromo
price : 49.9

5/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : radio portatil am / fm 12 faixas rp - 69 rp 69 . 2152182
price : 233.1

description : livro - estatuto crianca adolescente comentado - 9788530969745
price : 206.15

5/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : teclado usb fio - kb - 11bk - preto c3tech 9223212
price : 23.3

description : cabo fechadura - palio 2001 2011 - 2 portas - lado passageiro - cada unidade
price : 23.3

5/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : teclado gamer usb fio - kg - 10 - preto c3tech 9166131
price : 33.9

description : cabo embreagem - fania - / c / d - 10 - 1979 1984 - 768 mm - cada unidade - 30 - 101
price : 27.7

5/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : escova dental closeup ultra reach media 1 unidade 8253885
price : 16.47

description : escova dental close up ultra reach media
price : 14.44

5/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : escova dental closeup ultra reach macia 1 unidade 8207840
price : 16.47

description : escova dental close up ultra reach macia
price : 14.44

6/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : par palhetas limpador - brisa bosch aerotwin plus mercedes - benz classe m 164 2005 2011 - ap28 / ap21 8341497
price : 164.9

description : par palheta limpador mercedes - benz classe cabriolet 2010 - 2012 dyna - s65a / s60a - 65cm / 60cm
price : 199.9

7/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : guarda - roupa casal 3 portas correr 3 gavetas 3956 - 84 . 2145706
price : 989.99

description : guarda - roupa 3 portas correr vitoria espelho branco / preto - at . home by madesa
price : 989.99

7/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa casal 3 portas correr 3 gavetas 4013 - 166 . 2145710
price : 1079.99

description : guarda - roupa 3 portas correr vitoria espelho branco / preto - at . home by madesa
price : 989.99

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : kit refil escova eletrica pro - saude flossaction - 6 unidades 8343552
price : 209.7

description : kit 6 unidades refil escova eletrica precision clean oral - b
price : 119.99

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : guarda - roupa casal 3 portas correr 3 gavetas 3956 - 166 . 2145707
price : 1099.99

description : guarda - roupa 3 portas correr vitoria espelho branco / preto - at . home by madesa
price : 989.99

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : camisa motocross pro tork jett hi - vis verde neon - tamanho gg 8535815
price : 51.9

description : calca motocross jett hi - vis rosa neon pro tork
price : 181.9

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : camisa motocross pro tork jett hi - vis verde neon - tamanho g 8593374
price : 51.9

description : calca motocross jett hi - vis rosa neon pro tork
price : 181.9

8/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


p


description : camisa motocross pro tork jett hi - vis verde neon - tamanho gg 8535815
price : 51.9

description : calca motocross jett hi - vis rosa neon pro tork
price : 181.9

7/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : camisa motocross pro tork jett hi - vis verde neon - tamanho g 8593374
price : 51.9

description : calca motocross jett hi - vis rosa neon pro tork
price : 181.9

7/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : calca motocross pro tork insane 4 amarela roxa trilha - tamanho 46 8516151
price : 126.88

description : calca motocross pro tork insane 3 azul trilha
price : 99.9

7/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : camisa motocross pro tork insane 5 branco preto trilha - tamanho xgg 8559880
price : 38.9

description : camisa pro tork insane 3 azul motocross
price : 39.9

8/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : anel folheado ouro detalhe perola - unica - aro 15 8292126
price : 59.8

description : grade - choque - alternativo - stilo 2008 diante - furo milha - aro prata - lad ...
price : 61.2

9/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : sofa retratil reclinavel 3 lugares suede cor 592 azul . 1219205
price : 1799.1

description : sofa retratil 3 lugares tauri suede liso preto
price : 1669.0

9/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : teclado gamer multimidia g910 orion spark 920 - 006385 . 2170532
price : 1022.91

description : teclado gamer logitech g910 orion spark fio
price : 1209.99

10/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cama baba paris branco brilho - unico 8319756
price : 569.8

description : cama baba vitta branco - moveis matic
price : 522.68

10/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


p


description : teclado gamer multimidia g910 orion spark 920 - 006385 . 2170532
price : 1022.91

description : teclado gamer logitech g910 orion spark fio
price : 1209.99

10/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


y


description : cama baba paris branco brilho - unico 8319756
price : 569.8

description : cama baba vitta branco - moveis matic
price : 522.68

11/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [19]:
# Fit the linker; runs after the console labeling session above, so it
# presumably learns from those labels - confirm against the dedupe docs.
linker.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.001000
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, des

In [20]:
# Link records of `p` against records of `w`.
# NOTE(review): 0.2 is presumably the minimum match score to keep (the lowest
# score displayed below is ~0.218) - confirm against the installed dedupe
# version and consider naming this constant.
m = linker.match(p,w, 0.2)

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.api:0 records


In [22]:
# Display the matches: each entry is ((key in p, key in w), score).
m

[(('8703', '2356'), 0.96166432),
 (('8700', '2351'), 0.95724398),
 (('3564', '6227'), 0.87211442),
 (('8130', '802'), 0.8031795),
 (('8576', '3940'), 0.71475679),
 (('8577', '3945'), 0.6980688),
 (('8560', '3853'), 0.69237101),
 (('8706', '2353'), 0.69050813),
 (('8707', '2364'), 0.69050813),
 (('3571', '6212'), 0.62375528),
 (('8592', '3941'), 0.51010996),
 (('2032', '6944'), 0.48204303),
 (('5947', '8945'), 0.46984631),
 (('1785', '792'), 0.45069176),
 (('1869', '701'), 0.4317129),
 (('8578', '3942'), 0.42345086),
 (('7308', '466'), 0.41837117),
 (('3565', '6226'), 0.28748187),
 (('5946', '8946'), 0.28189543),
 (('3553', '6228'), 0.27771178),
 (('1787', '6213'), 0.27527407),
 (('1796', '6214'), 0.27527407),
 (('1341', '7807'), 0.23714177),
 (('2232', '803'), 0.23364666),
 (('3569', '788'), 0.22465377),
 (('1803', '6215'), 0.21821894)]

In [23]:
# Show the two linked records side by side for every match in `m`.
for match in m:
    key_pair = match[0]  # (key into p, key into w); match[1] is the score
    left_record = p[key_pair[0]]
    right_record = w[key_pair[1]]
    print(left_record, right_record)

{'description': 'escova dental oral - b indicator plus 40 - 2 unidades', 'price': 12.99} {'description': 'escova dental oral - b complete 40 2 unidades 8212826', 'price': 10.91}
{'description': 'escova dental colgate smiles 5 anos', 'price': 14.99} {'description': 'escova dental colgate smiles 5+ anos 8306039', 'price': 17.19}
{'description': 'guarda - roupa infantil selena 4 portas 2 gavetas branco - carolina baby', 'price': 1014.9} {'description': 'guarda - roupa infantil 4 portas 2 gavetas 10297 . 2137096', 'price': 1169.99}
{'description': 'guarda - roupa comoda quarto compacto rp5040 branco / rosa - art in moveis', 'price': 579.9} {'description': 'guarda - roupa / comoda compacto 2 portas fofinho rp5040 li art in moveis lilas 8298672', 'price': 469.9}
{'description': 'calca motocross pro tork insane 4 vermelha cinza trilha', 'price': 106.9} {'description': 'calca motocross pro tork insane 4 amarela roxa trilha - tamanho 38 8522268', 'price': 126.88}
{'description': 'camisa pro tor

In [5]:
# Number of preprocessed records in `walmart`.
# NOTE(review): this cell's execution count (5) is lower than the cells above
# it, so the notebook was not run top to bottom.
len(walmart)

9035

In [6]:
# Number of preprocessed records in `submarino`.
len(submarino)

8970