In [12]:
import dedupe
import json
from cytoolz import *
from copy import copy
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse
from unicodedata import normalize

In [13]:
# Variable definitions handed to dedupe.RecordLink further down: one entry
# per compared field plus an interaction term combining the two.
fields = [
    {
        'field': 'description',
        'variable name': 'description',
        'type': 'Text',
    },
    {
        'field': 'price',
        'variable name': 'price',
        'type': 'Price',
    },
    {
        'type': 'Interaction',
        'interaction variables': ['description', 'price'],
    },
]


def tokenize_url(url):
    """Tokenize the path component of a URL.

    Only the path is used (scheme, host, query and fragment are discarded
    by ``urlparse(...).path``). Every ``/`` and ``-`` in the path is turned
    into a space before the result is passed to ``word_tokenize``.

    Args:
        url: URL string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for the spaced-out path.
    """
    separators_to_spaces = str.maketrans('/-', '  ')
    spaced_path = urlparse(url).path.translate(separators_to_spaces)
    return word_tokenize(spaced_path)


def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The string is NFKD-normalized so accented letters split into a base
    letter plus combining marks, then encoded to ASCII with ``'ignore'``,
    which silently drops the combining marks and any other character that
    has no ASCII representation.

    Args:
        input_str: Text to strip of accents.

    Returns:
        The ASCII-only version of the text.
    """
    decomposed = normalize('NFKD', input_str)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode()


def sanitize_string(input_str):
    """Normalize a raw description or price string.

    Two paths:

    * If the WHOLE string (ignoring trailing whitespace) has the shape
      ``digits.digits.digits``, it is treated as a price and only the first
      two dot-separated components are returned, joined by a dot
      (``'12.34.56'`` -> ``'12.34'``).
      NOTE(review): which component is "extra" depends on the source data
      format; the first-two-components rule is kept from the original code
      and should be confirmed against the raw JSON.
    * Otherwise the text is lower-cased, stripped of accents, stripped of
      the punctuation/control characters in ``pattern`` and finally passed
      through ``clean_string``.

    The price test uses ``re.fullmatch`` rather than ``re.match``: the
    latter only anchors at the start, so a description that merely begins
    with a dotted number (e.g. ``'2.1.0 home theater'``) used to be reduced
    to ``'2.1'`` and lose all of its text.

    Args:
        input_str: Raw string taken from a product record.

    Returns:
        The sanitized string.
    """
    # Characters removed from free text. The '|' separators sit inside the
    # character class, so literal pipes are removed as well.
    pattern = r'[?|&|!|@|#|;|*|~|(|)|´|^|\r|\n|\t]'
    price_pattern = r'[\d]+\.[\d]+\.[\d]+'

    # Allow trailing whitespace so values such as '12.34.56\n' still count
    # as prices, exactly as they did before.
    if re.fullmatch(price_pattern + r'\s*', input_str):
        parts = input_str.split('.')
        return '{}.{}'.format(parts[0], parts[1])

    return clean_string(re.sub(pattern, r'', remove_accents(input_str.lower())))


# Marketing filler words removed from descriptions (already accent-free).
_UNUSEFUL_WORDS = frozenset([
    'confira', 'compre', 'oferta', 'novo',
    'preco', 'agora', 'melhores', 'aqui',
    'aproveite', 'menor', 'maior', 'encontra',
    'site', 'condicoes', 'ofertas', 'imbativeis',
    'vendas', 'online', 'novo', 'nova', 'tecnologia',
    'precos', 'pagamento', 'melhor', 'veja', 'encontre',
    'menores', 'vem', 'venha', 'ver'])

# Store names removed from descriptions.
# NOTE(review): 'magazine luiza' and 'casas bahia' contain a space and are
# compared against single tokens below, so they presumably never match;
# left unchanged to keep the output identical - confirm the intent.
_STORE_NAMES = frozenset([
    'walmart', 'walmart.com', 'shoptime', 'shoptime.com',
    'pontofrio', 'pontofrio.com', 'magazineluiza', 'magazineluiza.com',
    'magazine luiza', 'americanas', 'americanas.com', 'submarino',
    'submarino.com', 'extra', 'extra.com.br', 'casas bahia', 'casasbahia.com.br'])

# Lazily built union of the two sets above and the accent-stripped
# Portuguese stopwords; filled on the first clean_string call.
_removable_cache = None


def _removable_words():
    """Return the frozenset of tokens to drop, building it on first use."""
    global _removable_cache
    if _removable_cache is None:
        stopwords = [remove_accents(s) for s in nltk.corpus.stopwords.words('portuguese')]
        _removable_cache = frozenset(concat([_UNUSEFUL_WORDS, _STORE_NAMES, stopwords]))
    return _removable_cache


def clean_string(input_str):
    """Drop filler words, store names and Portuguese stopwords from a string.

    The input is split with ``word_tokenize`` and every token that appears
    in the removal vocabulary is discarded; the surviving tokens are joined
    with single spaces.

    The vocabulary used to be rebuilt (including a fresh load of the NLTK
    stopword list) and searched as a list on every call; it is now built
    once and held as a frozenset, which gives the same output with
    constant-time membership tests.

    Args:
        input_str: Text to filter; callers in this file pass it already
            lower-cased and accent-free.

    Returns:
        The remaining tokens joined by single spaces.
    """
    removable = _removable_words()
    return ' '.join(word for word in word_tokenize(input_str) if word not in removable)


def preprocess(item):
    """Build the record used for linkage from a raw product dict.

    Args:
        item: Mapping with at least ``'description'`` and ``'price'`` keys,
            both of which are run through ``sanitize_string``.

    Returns:
        A new dict with the sanitized ``'description'`` string and the
        sanitized ``'price'`` converted to ``float``.

    Raises:
        KeyError: If either key is missing from ``item``.
        ValueError: If the sanitized price cannot be parsed by ``float``.
    """
    cleaned_description = sanitize_string(item['description'])
    numeric_price = float(sanitize_string(item['price']))
    return {'description': cleaned_description, 'price': numeric_price}

In [14]:
# Load and preprocess the first product list.
# NOTE(review): the variable is named `walmart` but the file read here is the
# casasbahia dump; the name is kept because later cells refer to it.
# The encoding is given explicitly so accented text decodes the same way on
# every platform instead of depending on the locale default.
with open('../data/casasbahia/buscape.json', encoding='utf-8') as f:
    walmart = [preprocess(dissoc(item, 'image')) for item in json.load(f)['products']]

In [15]:
# Load and preprocess the second product list.
# NOTE(review): the variable is named `submarino` but the file read here is
# the shoptime dump; the name is kept because later cells refer to it.
# The encoding is given explicitly so accented text decodes the same way on
# every platform instead of depending on the locale default.
with open('../data/shoptime/buscape.json', encoding='utf-8') as f:
    submarino = [preprocess(dissoc(item, 'image')) for item in json.load(f)['products']]

# Key each record by its list position (as a string); these two dicts are
# what the linker cells below receive.
p = {str(i): record for i, record in enumerate(walmart)}
w = {str(i): record for i, record in enumerate(submarino)}

In [42]:
# Create the record-linkage model from the `fields` definitions, let it
# sample from the two keyed record dicts (presumably candidate pairs for
# labelling - confirm in the dedupe docs), then start the interactive console
# labelling session whose y/n/u/f prompts appear in the output below.
# NOTE(review): labelling needs a person at the keyboard, so this cell cannot
# be re-run unattended.
linker = dedupe.RecordLink(fields)
linker.sample(p, w)
dedupe.consoleLabel(linker)

description : rack retro 65 olivar rustik carvalho
price : 309.99

description : mesa centro retangular retro 45 olivar rustik carvalho / amarelo
price : 194.9

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


description : rack retro 65 olivar rustik carvalho
price : 309.99

description : mesa centro retangular retro 45 olivar rustik carvalho / turquesa
price : 194.9

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : tatame eva 1cm 4 pecas azul 40100014 - mor
price : 84.78

description : tatame eva 1cm 4 pecas azul 40100014 - mor
price : 84.78

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : machadinha tomahawk machado taue kh518
price : 134.29

description : machadinha tomahawk machado taue kh518
price : 134.29

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : caneta limpeza lentes fotograficas
price : 59.06

description : caneta limpeza lentes fotograficas
price : 58.51

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : quebra - cabeca hot cars 100 pecas - grow
price : 32.07

description : quebra - cabeca grandao hot wheels - 120 pecas - toyster
price : 55.0

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : mesa lateral berlim espelhada conj . 2 unid
price : 1661.9

description : mesa lateral verde - phorman
price : 425.54

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : boneca acessorios - disney princesas - princesa bela pet - mimo
price : 69.99

description : quebra - cabeca 3d - moana - disney - estrela
price : 69.99

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : rack versatile - branco / vermelho
price : 653.33

description : rack versatile - branco / vermelho
price : 596.77

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : jogo utensilios aco inox 4 pecas oslo tramontina
price : 177.87

description : jogo utensilios aco inox 4 pecas oslo - tramontina
price : 168.51

4/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : comoda infantil 4 gavetas doce sonho - qmovi branco
price : 256.12

description : comoda infantil 4 gavetas doce sonho branco - qmovi
price : 184.9

5/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : berco mini cama sol carolina baby branco brilho
price : 432.99

description : berco mini cama carolina baby cleo acetinado
price : 455.39

6/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : rack - vision - artely rovere / amendoa
price : 314.0

description : cozinha compacta franciele 3 pecas - poliman branco / rovere / amendoa
price : 299.0

6/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : almofada pop - art 42x42 - ar04 - atelier valverde - ar . 04
price : 35.0

description : livro - under their thumb - bom garoto misturou rolling stones sobreviveu contar - 9788520923528
price : 37.99

6/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : boneco pets - vida secreta bichos - max articulado
price : 49.99

description : boneco pets - vida secreta bichos - max som movimento
price : 137.42

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : capacete urban azul bi061 - atrio
price : 63.54

description : capacete urban tamanho m azul bi061 - atrio
price : 51.29

6/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : cuscuzeira aluminio polido tamanho 18cm - panemax
price : 26.9

description : cuscuzeira aluminio polido tamanho 18 - panemax
price : 25.52

7/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : mega bloks call of duty mattel brutus
price : 37.9

description : mega bloks call of duty patrulheiros icaro - mattel
price : 58.49

8/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


description : quebra - cabeca - 2000 pecas - disney - marvel - herois viloes - toyster
price : 111.9

description : livro - fazenda : livro adesivos quebra - cabeca - 9781848693630
price : 54.9

8/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : saleiro porcelana 80 ml copacabana - 42256521 - germer - gmr 009
price : 10.88

description : xicara cha porcelana branca 215 ml pires plissan - 44593820 - germer - gmr 056
price : 20.45

8/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : boneco max steel - modo turbo - mattel
price : 39.99

description : boneco max steel max cacador zumbi - mattel
price : 39.99

8/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : mesa bistro opzione alta 0428 - 0 - 256 maxima verniz
price : 689.9

description : mesa bistro opzione alta 0428 - 0 - 884 maxima verniz imbuia / verde musgo
price : 614.9

8/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : boneco articulado batman - 30 cm - liga justica - coringa - mattel
price : 49.99

description : boneco articulado - 30 cm - batman vs superman - aquaman - mattel
price : 49.99

9/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : puff banqueta round 3 pes madeira corano rosa stay puff
price : 219.99

description : puff banqueta round 3 pes madeira corano rosa - markine mobilier
price : 232.9

9/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : mesa bistro opzione alta 0428 - 0 - 128 maxima verniz
price : 689.9

description : mesa bistro opzione alta 0428 - 0 - 128 maxima verniz imbuia / lilas
price : 614.9

10/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : lego city demolition escavadora caminhao - 311 pecas
price : 339.0

description : lego city demolition escavadora caminhao
price : 349.99

11/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : guarda - roupa sonho encantado 2 portas 2 gavetas - henn
price : 518.8

description : guarda - roupa 2 portas 3 gavetas exclusive branco m126 - 05 henn
price : 589.9

12/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : cadeira 0951 - 0 - 301 dumon maxima cacau / rosa bebe
price : 404.9

description : minibar canyon 0682 - 0 - 301 maxima cacau / rosa bebe
price : 849.9

12/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [44]:
# Train the linker on the pairs labelled in the console session above; the
# log below reports the cross-validated alpha and the blocking predicates
# that were considered.
linker.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.010000
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, des

In [53]:
# Score cut-off for accepting a link. It is the value returned by
# linker.threshold(p, w, recall_weight=0.1) in this notebook (0.89040464),
# rounded to two decimals; named so the origin of the number is visible.
MATCH_THRESHOLD = 0.89
m = linker.match(p, w, MATCH_THRESHOLD)

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:10000, 2.0933752 seconds
INFO:dedupe.api:0 records
INFO:dedupe.blocking:10000, 0.6216122 seconds


In [54]:
# Number of entries returned by linker.match for the chosen threshold.
len(m)

11

In [55]:
# Show each linked pair side by side. The first element of every entry in
# `m` holds the two record keys: position 0 indexes `p`, position 1 indexes `w`.
for link in m:
    key_pair = link[0]
    left_record = p[key_pair[0]]
    right_record = w[key_pair[1]]
    print(left_record, right_record)

{'description': 'tatame eva 1cm 4 pecas azul 40100014 - mor', 'price': 84.78} {'description': 'tatame eva 1cm 4 pecas azul 40100014 - mor', 'price': 84.78}
{'description': 'machadinha tomahawk machado taue kh518', 'price': 134.29} {'description': 'machadinha tomahawk machado taue kh518', 'price': 134.29}
{'description': 'caneta limpeza lentes fotograficas', 'price': 59.06} {'description': 'caneta limpeza lentes fotograficas', 'price': 58.51}
{'description': 'rack versatile - branco / vermelho', 'price': 653.33} {'description': 'rack versatile - branco / vermelho', 'price': 596.77}
{'description': 'comoda infantil 4 gavetas doce sonho - qmovi branco', 'price': 256.12} {'description': 'comoda infantil 4 gavetas doce sonho branco - qmovi', 'price': 184.9}
{'description': 'blocos encaixe 10 1 transforca lider 593 pecas xalingo', 'price': 201.2} {'description': 'blocos encaixe 10 1 transforca lider 593 pecas - xalingo', 'price': 190.61}
{'description': 'jogo utensilios aco inox 4 pecas oslo

In [16]:
# Number of preprocessed records in the first list (the one read from
# ../data/casasbahia/buscape.json).
len(walmart)

10792

In [17]:
# Number of preprocessed records in the second list (the one read from
# ../data/shoptime/buscape.json).
len(submarino)

11050

In [52]:
# Ask the trained linker for a score threshold with recall_weight=0.1
# (presumably favouring precision over recall - confirm in the dedupe docs).
# The returned value (0.89040464 in the output below) is the source of the
# 0.89 cut-off passed to linker.match.
linker.threshold(p,w, recall_weight=0.1)

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:10000, 2.1180942 seconds
INFO:dedupe.api:0 records
INFO:dedupe.blocking:10000, 0.6363432 seconds
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.185
INFO:dedupe.api:precision: 0.933
INFO:dedupe.api:With threshold: 0.890


0.89040464