In [33]:
import dedupe
import json
from cytoolz import *
from copy import copy
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse
from unicodedata import normalize

In [34]:
# Variable definitions later passed to dedupe.RecordLink: one entry per record
# key, each giving the key name, its dedupe variable type, and a flag saying
# the value may be missing.
fields = [
    {'field': field_name, 'type': field_type, 'has missing': True}
    for field_name, field_type in (('description', 'String'), ('price', 'Price'))
]


def tokenize_url(url):
    """Split the path component of ``url`` into word tokens.

    Only ``urlparse(url).path`` is used, so the scheme, host, query string
    and fragment do not contribute tokens. Every ``/`` and ``-`` in the path
    is turned into a space before the text is handed to ``word_tokenize``.
    """
    separators_to_spaces = str.maketrans('/-', '  ')
    spaced_path = urlparse(url).path.translate(separators_to_spaces)
    return word_tokenize(spaced_path)


def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is decomposed with Unicode NFKD normalisation, which separates
    base letters from their combining marks; encoding to ASCII with
    ``'ignore'`` then drops every character that has no ASCII form.
    """
    decomposed = normalize('NFKD', input_str)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode()


def sanitize_string(input_str):
    """Normalise a raw text value before it is compared with other records.

    Steps, in order:

    1. lower-case the text and strip accents with ``remove_accents``;
    2. turn every run of carriage returns, newlines and tabs into one space;
    3. delete the punctuation characters ``? & ! @ # ; * ~ ( ) ^ |`` and the
       acute-accent character;
    4. pass the result to ``clean_string``.

    :param input_str: the text to normalise (must support ``.lower()``).
    :return: whatever ``clean_string`` returns for the normalised text.
    """
    text = remove_accents(input_str.lower())
    # Line breaks and tabs separate words, so they are replaced by a space;
    # simply deleting them would glue the neighbouring words into one token.
    text = re.sub(r'[\r\n\t]+', ' ', text)
    # Inside a character class '|' is a literal character, not alternation,
    # so it is listed once on purpose: pipes are removed along with the rest.
    text = re.sub(r'[?&!@#;*~()´^|]', '', text)
    return clean_string(text)


# Promotional filler words (lower-case, accent-free) dropped from descriptions.
_UNUSEFUL_WORDS = frozenset([
    'confira', 'compre', 'oferta', 'novo',
    'preco', 'agora', 'melhores', 'aqui',
    'aproveite', 'menor', 'maior', 'encontra',
    'site', 'condicoes', 'ofertas', 'imbativeis',
    'vendas', 'online', 'nova', 'tecnologia',
    'precos', 'pagamento', 'melhor', 'veja', 'encontre',
    'menores', 'vem', 'venha', 'ver',
])

# Retailer names and domains dropped from descriptions.
_STORE_NAMES = frozenset([
    'walmart', 'walmart.com', 'shoptime', 'shoptime.com',
    'pontofrio', 'pontofrio.com', 'magazineluiza', 'magazineluiza.com',
    'magazine luiza', 'americanas', 'americanas.com', 'submarino',
    'submarino.com', 'extra', 'extra.com.br', 'casas bahia', 'casasbahia.com.br',
])

# A name containing a space can never equal a single token, so such names are
# matched as runs of consecutive tokens instead.
_STORE_NAME_PHRASES = [tuple(name.split()) for name in sorted(_STORE_NAMES) if ' ' in name]

# Holds the combined removal set once it has been built (see _removable_tokens).
_removable_cache = {}


def _removable_tokens():
    """Return the set of single tokens to drop, building it on first use."""
    if 'tokens' not in _removable_cache:
        stopwords = {remove_accents(s) for s in nltk.corpus.stopwords.words('portuguese')}
        _removable_cache['tokens'] = _UNUSEFUL_WORDS | _STORE_NAMES | stopwords
    return _removable_cache['tokens']


def _drop_phrases(tokens, phrases):
    """Return ``tokens`` without any consecutive run equal to one of ``phrases``."""
    kept = []
    pos = 0
    while pos < len(tokens):
        for phrase in phrases:
            if tuple(tokens[pos:pos + len(phrase)]) == phrase:
                pos += len(phrase)
                break
        else:
            kept.append(tokens[pos])
            pos += 1
    return kept


def clean_string(input_str):
    """Remove filler words, retailer names and Portuguese stopwords.

    ``input_str`` is tokenised with ``word_tokenize``. Multi-word retailer
    names (e.g. 'magazine luiza') are removed as token sequences first; then
    every remaining token found in the removal set is dropped. The surviving
    tokens are joined with single spaces.

    The removal set is built once and reused, and membership is tested
    against a set, so the cost per token does not grow with the word lists.

    :param input_str: text to clean; the word lists are lower-case and
        accent-free, so the input should already be normalised the same way.
    :return: the cleaned text, or an empty string if nothing survives.
    """
    removable = _removable_tokens()
    tokens = _drop_phrases(word_tokenize(input_str), _STORE_NAME_PHRASES)
    return ' '.join(token for token in tokens if token not in removable)


def _clean_description(raw_description):
    """Return the sanitised description, or None when it is absent or empties out."""
    if raw_description is None:
        return None
    return sanitize_string(raw_description) or None


def _parse_price(raw_price):
    """Return the price as a float, or None when it is absent or blank."""
    if raw_price is None:
        return None
    # Already-numeric prices need no text clean-up (bool is excluded on purpose).
    if isinstance(raw_price, (int, float)) and not isinstance(raw_price, bool):
        return float(raw_price)
    cleaned = sanitize_string(raw_price)
    if not cleaned:
        return None
    # A non-empty value that is not a number still raises ValueError, so
    # malformed data is reported instead of being silently discarded.
    return float(cleaned)


def preprocess(item):
    """Build the two-field record used for linkage from a raw product dict.

    The field definitions mark both 'description' and 'price' as possibly
    missing, so an absent or blank value is returned as ``None`` instead of
    raising.

    :param item: mapping that may contain 'description' (text) and 'price'
        (text or number).
    :return: ``{'description': str or None, 'price': float or None}``.
    :raises ValueError: if a non-blank price cannot be parsed as a float.
    """
    return {'description': _clean_description(item.get('description')),
            'price': _parse_price(item.get('price'))}

In [35]:
def load_products(path):
    """Read a product dump and return its preprocessed records.

    The file must contain a JSON object whose 'products' key holds a list of
    product dicts; each one is passed through ``preprocess``.

    :param path: location of the JSON file.
    :return: list with one preprocessed record per product, in file order.
    """
    # The encoding is stated explicitly so accented text is decoded the same
    # way on every platform instead of depending on the locale default.
    with open(path, encoding='utf-8') as f:
        products = json.load(f)['products']
    # preprocess picks out only the keys it needs, so no other key has to be
    # stripped from the product dicts beforehand.
    return [preprocess(item) for item in products]


walmart = load_products('../data/walmart/buscape.json')
submarino = load_products('../data/submarino/buscape.json')

In [36]:
# Key every record by its list position, as a string; these dicts are what
# the linker receives. Note the naming: `p` holds the Walmart records and `w`
# holds the Submarino records.
p = {str(position): record for position, record in enumerate(walmart)}
w = {str(position): record for position, record in enumerate(submarino)}

In [37]:
# Create a record-linkage model from the field definitions in `fields`.
linker = dedupe.RecordLink(fields)
# Presumably draws candidate record pairs from the two datasets for the
# labelling step below; confirm against the dedupe documentation.
linker.sample(p, w)
# Interactive step: shows record pairs on the console and waits for a
# y/n/u/f answer for each, so this cell cannot be re-run unattended.
dedupe.consoleLabel(linker)

description : livro - design grafico integracao autodesk 3ds max 2010 adobe photoshop cs4 extended - aprenda recursos fundamentais forma pratica visual - 9788536502601
price : 109.6

description : brinco prata zirconias 11564 4064732
price : 89.0

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : blusa sacada estampada
price : 129.99

description : boneca princesas disney - mini princesa amigo - costuras cinderela b5333 - hasbro 2621211
price : 59.99

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : bico injetor gm vectra 8v 2.2 1998 2003 monza kadett mpfi gasolina 1991 1996 s10 blazer 2.2 199
price : 140.45

description : bico injetor gm vectra 8v 2.2 1998 2003 monza kadett mpfi gasolina 1991 1996 s10 blazer 2.2 1996 1997 3171632
price : 140.44

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : fantasia borboleta basic
price : 75.99

description : fantasia borboleta basic arca noe 1882969
price : 79.99

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cozinha completa maia 02 palmeira moveis jacarta / fume
price : 1099.9

description : cozinha completa maia 02 palmeira moveis jacarta preto 3578309
price : 1099.9

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : plataforma mod . sport dianteira - mirage 150 - jjcustom
price : 393.21

description : plataforma mod . laser dianteira mirage 150 jjcustom 2873675
price : 323.04

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : cozinha completa maia 02 palmeira moveis jacarta / esmeralda
price : 1099.9

description : cozinha completa maia 05 palmeira moveis jacarta esmeralda 3578314
price : 1699.88

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : cozinha completa maia 06 palmeira moveis jacarta / preto
price : 1699.9

description : cozinha completa maia 02 palmeira moveis jacarta preto 3578309
price : 1099.9

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : cozinha completa maia 05 palmeira moveis jacarta / preto
price : 1699.9

description : cozinha completa maia 02 palmeira moveis jacarta preto 3578309
price : 1099.9

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : sissy bar destacavel cav . fantasma - triumph thunderbird 1700cc - jjcustom
price : 894.47

description : sissy bar destacavel cav . fantasma triumph thunderbird 1700cc jjcustom 2873376
price : 894.47

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cozinha completa maia 05 palmeira moveis jacarta / esmeralda
price : 1699.9

description : cozinha completa maia 05 palmeira moveis jacarta esmeralda 3578314
price : 1699.88

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cozinha compacta tulipa madesa branco / tirol
price : 969.9

description : cozinha compacta sandra elis madesa branco / tirol 1644802
price : 1499.89

4/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : sissy bar destacavel cav . fantasma - mirage 650 - jjcustom
price : 881.99

description : sissy bar destacavel cav . fantasma hd dyna super glide 1600 2008 2014 jjcustom 2873316
price : 894.47

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : fantasia arara
price : 128.24

description : fantasia bruxa alegria infantil 2065983
price : 49.99

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : plataforma mod . sport traseira fixo - mirage 150 - jjcustom
price : 393.21

description : plataforma mod . sport traseira fixo mirage 250 jjcustom 2872729
price : 393.21

4/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : fantasia thundercats - lion adulto
price : 170.99

description : fantasia thundercats lion adulto 1900609
price : 179.99

4/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : poltrona decorativa sala escritorio tilla sued preto - jm estofados
price : 299.0

description : poltrona decorativa sala escritorio tilla sued amarelo - jm estofados 2475857
price : 299.0

5/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : plataforma mod . sport dianteira - mirage 150 - jjcustom
price : 393.21

description : plataforma mod . sport traseira fixo mirage 250 jjcustom 2872729
price : 393.21

6/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : guarda - roupa espelho sparta 6 portas carioca moveis gris / preto
price : 929.0

description : guarda - roupa espelho bahia 3 portas carioca moveis branco / branco 1603824
price : 829.9

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : plataforma mod . sport traseira fixo - mirage 150 - jjcustom
price : 393.21

description : plataforma mod . sport traseira fixo boulevard m800 jjcustom 2872699
price : 393.21

6/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : bico injetor vw gol 1.0 16v
price : 89.65

description : bico injetor fiat uno 1.0 palio 1.0 1.6 8v 16v fire gasolina 3171614
price : 115.94

6/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : fantasia arara
price : 128.24

description : fantasia caipira joao heat girls 2323704
price : 89.99

6/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : sissy bar destacavel cav . fantasma - mirage 650 - jjcustom
price : 881.99

description : sissy bar destacavel cav . fantasma hd 883 / xl1200 jjcustom 2873299
price : 881.99

6/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cd pepinho
price : 27.99

description : fio flexivel crank cabo bateria sonorizacao profissional 6mm 50 metros vermelho cristal 2851105
price : 92.9

6/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : fantasia indiana
price : 85.49

description : fantasia thundercats lion adulto 1900609
price : 179.99

6/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : fantasia can can luxo
price : 47.49

description : fantasia vampiro luxo adulto 2118596
price : 89.9

6/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : fantasia dragao
price : 179.54

description : fantasia the flintstones betty adulto 2083621
price : 79.9

6/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : capacete fechado shark vision - r2 blank blk preto
price : 1799.9

description : capacete fechado shark vision - r2 cisor kwr 3829208
price : 1999.9

6/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : regata canal estampa
price : 64.99

description : regata feminina estampada camisaria colombo 2248481
price : 15.99

7/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : capacete tsw mtb elite
price : 173.9

description : capacete cross pro tork liberty mx pro branco azul brilhante viseira 3833855
price : 99.9

7/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : kit 4 pneus bridgestone b250 175 / 70r14 84t
price : 1007.44

description : kit 4 pneus aro 17 bridgestone 225 / 50r17 94v turanza er300 ecopia 2593797
price : 1869.12

7/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : livro - abra
price : 23.48

description : cronometro profissional c / 10 memorias , relogio , alarme bussola - vollo vl - 512 1570768
price : 74.9

7/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : capacete fechado zeus 811 evo plasma black red fundo branco preto vermelho
price : 479.9

description : capacete zeus 811 evo plasma al5 2697063
price : 328.98

7/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : fantasia odalisca
price : 85.49

description : fantasia caipira lastex festa junina 1898464
price : 54.99

8/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : protetor motor carenagem honda transalp 700
price : 600.51

description : protetor motor / carenagem honda nc700 scam spto 061 2874115
price : 746.28

8/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : santo antonio ranger 2013 2016 bepo cromado grade vigia
price : 814.54

description : santo antonio ranger bepo cromado elevac grade vigia 2585835
price : 889.54

9/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : sissy bar destacavel cav . fantasma - shadow 750 2010 - jjcustom
price : 818.99

description : sissy bar destacavel cav . fantasma hd dyna super glide 1600 2008 2014 jjcustom 2873316
price : 894.47

10/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : camisa fox - 180 race
price : 139.9

description : farol palio siena strada g3 03 04 05 06 07 08 09 10 11 12 13 14 15 mascara cromada foco duplo lado direito passageiro 3867679
price : 139.9

10/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [38]:
# Train the linker; the log output shows dedupe cross-validating for alpha
# and listing the blocking predicates it considers.
linker.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfNGramSearchPredicate: (0.8,

In [39]:
# Link the records in `p` with those in `w`. The third argument (0) is
# presumably the score threshold, which would explain why the result also
# contains low-scoring pairs; confirm against the dedupe documentation.
m = linker.match(p,w, 0)

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.api:0 records
INFO:dedupe.api:100 records


In [40]:
# Show the matches; each entry has the form ((key in p, key in w), score).
m

[(('3999', '3944'), 0.79641318),
 (('3787', '1531'), 0.75663102),
 (('738', '302'), 0.69547784),
 (('2872', '795'), 0.67398232),
 (('2869', '797'), 0.66353774),
 (('4001', '3951'), 0.62680578),
 (('3510', '2253'), 0.60751826),
 (('3800', '1537'), 0.54060543),
 (('3491', '2262'), 0.51300508),
 (('3803', '1519'), 0.40277994),
 (('2849', '2269'), 0.40034854),
 (('3488', '2266'), 0.36752567),
 (('2887', '1217'), 0.36571729),
 (('3489', '2252'), 0.33275783),
 (('2861', '1454'), 0.33249581),
 (('3497', '2254'), 0.32417876),
 (('3809', '1523'), 0.30562451),
 (('4541', '3552'), 0.30531472),
 (('3503', '2247'), 0.2779274),
 (('4559', '3144'), 0.25304466),
 (('3502', '2259'), 0.25260404),
 (('3496', '2263'), 0.25094247),
 (('3507', '2250'), 0.2412139),
 (('2797', '2618'), 0.23926325),
 (('741', '1909'), 0.2219941),
 (('1494', '1923'), 0.22037077),
 (('3495', '2265'), 0.21026333),
 (('3798', '1521'), 0.20818588),
 (('4555', '3152'), 0.17429905),
 (('3792', '1518'), 0.17184785),
 (('3085', '3877')

In [41]:
# For every match, print the linked record from `p` next to the one from `w`.
for match in m:
    p_key, w_key = match[0][0], match[0][1]
    print(p[p_key], w[w_key])

{'description': 'bico injetor gm vectra 8v 2.2 1998 2003 monza kadett mpfi gasolina 1991 1996 s10 blazer 2.2 1996 1997 3171632', 'price': 140.44} {'description': 'bico injetor gm vectra 8v 2.2 1998 2003 monza kadett mpfi gasolina 1991 1996 s10 blazer 2.2 199', 'price': 140.45}
{'description': 'fantasia borboleta basic arca noe 1882969', 'price': 79.99} {'description': 'fantasia borboleta basic', 'price': 75.99}
{'description': 'poltrona decorativa sala escritorio tilla sued amarelo - jm estofados 2475857', 'price': 299.0} {'description': 'poltrona decorativa sala escritorio tilla sued preto - jm estofados', 'price': 299.0}
{'description': 'cozinha completa maia 05 palmeira moveis jacarta esmeralda 3578314', 'price': 1699.88} {'description': 'cozinha completa maia 05 palmeira moveis jacarta / esmeralda', 'price': 1699.9}
{'description': 'cozinha completa maia 02 palmeira moveis jacarta preto 3578309', 'price': 1099.9} {'description': 'cozinha completa maia 02 palmeira moveis jacarta / p

In [42]:
# Number of preprocessed records loaded from the Walmart file.
len(walmart)

4569

In [43]:
# Number of preprocessed records loaded from the Submarino file.
len(submarino)

4527