In [1]:
import dedupe
import json
from cytoolz import *
from copy import copy
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse
from unicodedata import normalize
from sklearn.linear_model import SGDClassifier

In [2]:
# Variable definitions handed to dedupe.RecordLink: the product description
# compared as a string, the price compared as a price, and an interaction
# term that combines the two.
fields = [
    {
        'field': 'description',
        'variable name': 'description',
        'type': 'String',
    },
    {
        'field': 'price',
        'variable name': 'price',
        'type': 'Price',
    },
    {
        'type': 'Interaction',
        'interaction variables': ['description', 'price'],
    },
]


def tokenize_url(url):
    """Return the word tokens found in the path component of ``url``.

    Every ``/`` and ``-`` in the path is replaced by a space before the
    path is handed to NLTK's ``word_tokenize``.
    """
    spaced_path = urlparse(url).path
    for separator in ('/', '-'):
        spaced_path = spaced_path.replace(separator, ' ')
    return word_tokenize(spaced_path)


def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is NFKD-decomposed so that accented letters split into a base
    letter plus combining marks; every character that is still not ASCII
    after decomposition is then dropped.
    """
    decomposed = normalize('NFKD', input_str)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('ascii')


def sanitize_string(input_str):
    """Normalise a raw description or price string.

    Two cases are handled:

    * A value made only of three or more dot-separated digit groups
      (e.g. ``'1.299.00'``) is treated as a price whose last group is the
      decimal part and whose earlier dots are thousands separators; it is
      returned as a single-dot number string (``'1299.00'``).
      NOTE(review): this assumes the feed writes thousands with dots, as in
      ``1.299.00`` -- confirm against the raw data.
    * Anything else is lower-cased, stripped of accents, has the punctuation
      and control characters in ``pattern`` deleted, and is passed through
      ``clean_string``.

    Args:
        input_str: the raw string taken from a product record.

    Returns:
        The normalised string.
    """
    # Characters deleted from descriptions. Inside [...] the pipe is a plain
    # character, so it is listed once instead of between every entry.
    pattern = r'[?|&!@#;*~()´^\r\n\t]'
    price_pattern = r'\d+(?:\.\d+){2,}'

    # fullmatch (not match) so that a description that merely *starts* with
    # something like "1.2.3" is not mistaken for a price and truncated.
    candidate = input_str.strip()
    if re.fullmatch(price_pattern, candidate):
        # Split on the LAST dot only: everything before it is the integer
        # part (with grouping dots removed), everything after is the cents.
        whole, _, cents = candidate.rpartition('.')
        return '{}.{}'.format(whole.replace('.', ''), cents)

    return clean_string(re.sub(pattern, r'', remove_accents(input_str.lower())))


# Filled on first use by _removal_vocabulary(); keeps the stop-word corpus
# from being re-read and re-normalised for every record.
_CLEAN_STRING_CACHE = {}


def _removal_vocabulary():
    """Return (single-word blacklist, compiled multi-word store-name pattern), built once."""
    if not _CLEAN_STRING_CACHE:
        unuseful = {'confira', 'compre', 'oferta', 'novo',
                    'preco', 'agora', 'melhores', 'aqui',
                    'aproveite', 'menor', 'maior', 'encontra',
                    'site', 'condicoes', 'ofertas', 'imbativeis',
                    'vendas', 'online', 'nova', 'tecnologia',
                    'precos', 'pagamento', 'melhor', 'veja', 'encontre',
                    'menores', 'vem', 'venha', 'ver'}

        names = {'walmart', 'walmart.com', 'shoptime', 'shoptime.com',
                 'pontofrio', 'pontofrio.com', 'magazineluiza', 'magazineluiza.com',
                 'magazine luiza', 'americanas', 'americanas.com', 'submarino',
                 'submarino.com', 'extra', 'extra.com.br', 'casas bahia', 'casasbahia.com.br'}

        stopwords = {remove_accents(s) for s in nltk.corpus.stopwords.words('portuguese')}

        # Names containing a space can never equal one token, so they are
        # removed from the raw text with a regex instead.
        phrases = sorted((name for name in names if ' ' in name), key=len, reverse=True)
        single_word_names = {name for name in names if ' ' not in name}

        _CLEAN_STRING_CACHE['tokens'] = frozenset(unuseful | single_word_names | stopwords)
        _CLEAN_STRING_CACHE['phrases'] = re.compile(
            r'\b(?:{})\b'.format('|'.join(re.escape(phrase) for phrase in phrases)))
    return _CLEAN_STRING_CACHE['tokens'], _CLEAN_STRING_CACHE['phrases']


def clean_string(input_str):
    """Drop marketing filler, store names and Portuguese stop words from ``input_str``.

    Multi-word store names are first removed from the text; the remainder is
    tokenised with ``word_tokenize`` and every token found in the blacklist
    is discarded. The surviving tokens are joined with single spaces.

    Args:
        input_str: text to clean; the blacklist entries are lower-case and
            accent-free, so the text should already be in that form.

    Returns:
        The cleaned text as a single space-separated string.
    """
    blacklist, store_phrases = _removal_vocabulary()
    without_store_phrases = store_phrases.sub(' ', input_str)
    tokens = word_tokenize(without_store_phrases)
    # Set membership keeps the per-token check constant-time.
    return ' '.join(word for word in tokens if word not in blacklist)


def preprocess(item):
    """Build the record used for linking from a raw product ``item``.

    Both the ``description`` and ``price`` values go through
    ``sanitize_string``; the sanitised price is additionally converted to
    ``float``. Only those two keys appear in the returned dict.
    """
    clean_description = sanitize_string(item['description'])
    numeric_price = float(sanitize_string(item['price']))
    return {'description': clean_description, 'price': numeric_price}

In [3]:
# NOTE(review): despite its name, `walmart` holds the records read from the
# Casas Bahia file below; the name is kept because later cells refer to it.
# The encoding is given explicitly so accented text decodes the same way
# regardless of the platform's default locale.
with open('../data/casasbahia/buscape.json', encoding='utf-8') as f:
    # The 'image' key is dropped from each product before preprocessing.
    walmart = [preprocess(dissoc(item, 'image')) for item in json.load(f)['products']]

In [4]:
# NOTE(review): despite its name, `submarino` holds the records read from the
# Shoptime file below; the name is kept because later cells refer to it.
# The encoding is given explicitly so accented text decodes the same way
# regardless of the platform's default locale.
with open('../data/shoptime/buscape.json', encoding='utf-8') as f:
    # The 'image' key is dropped from each product before preprocessing.
    submarino = [preprocess(dissoc(item, 'image')) for item in json.load(f)['products']]

# Key every record by its list position (as a string); these two dicts are
# what the linker cells receive as the two datasets.
p = {str(i): record for i, record in enumerate(walmart)}
w = {str(i): record for i, record in enumerate(submarino)}

In [None]:
linker = dedupe.RecordLink(fields)
# The replacement classifier must expose predict_proba for pair scoring.
# SGDClassifier's default hinge loss does not provide it, so a loss that
# does is selected; 'modified_huber' is accepted by old and new
# scikit-learn releases alike (logistic loss is the alternative, spelled
# 'log' or 'log_loss' depending on the installed version).
# random_state pins the shuffling so repeated runs train the same model.
linker.classifier = SGDClassifier(loss='modified_huber', random_state=42)
linker.sample(p, w)
# Interactive step: asks for labels in the console, so the trained model
# depends on the answers typed in.
dedupe.consoleLabel(linker)

description : aviao super wings bello change up yw710200 intek
price : 174.99

description : aviao super wings - bello change up 8006 - 4 - intek
price : 134.85

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


u


description : boneco pets - vida secreta bichos - max articulado
price : 49.99

description : boneco pets - vida secreta bichos - max som movimento
price : 137.42

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : mega bloks call of duty mattel brutus
price : 37.9

description : mega bloks call of duty patrulheiros icaro - mattel
price : 58.49

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


description : tatame eva 1cm 4 pecas azul 40100014 - mor
price : 84.78

description : tatame eva 1cm 4 pecas azul 40100014 - mor
price : 84.78

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


description : machadinha tomahawk machado taue kh518
price : 134.29

description : machadinha tomahawk machado taue kh518
price : 134.29

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


In [44]:
# Fit the linker using the pairs labelled in the console session of the
# earlier dedupe.consoleLabel cell.
linker.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.010000
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (4, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (3, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (2, description)
INFO:dedupe.blocking:Canopy: LevenshteinSearchPredicate: (1, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.2, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.4, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.6, des

In [53]:
# Link the two datasets. The 0.89 cut-off is the (rounded) value returned by
# the linker.threshold(p, w, recall_weight=0.1) cell, which was executed
# just before this one (In [52]) even though it appears later in the file.
m = linker.match(p,w, 0.89)

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:10000, 2.0933752 seconds
INFO:dedupe.api:0 records
INFO:dedupe.blocking:10000, 0.6216122 seconds


In [54]:
# Number of linked pairs returned by linker.match.
len(m)

11

In [55]:
# Print both sides of every linked pair: the first element of each entry in
# `m` carries the id into `p` followed by the id into `w`.
for link in m:
    id_pair = link[0]
    print(p[id_pair[0]], w[id_pair[1]])

{'description': 'tatame eva 1cm 4 pecas azul 40100014 - mor', 'price': 84.78} {'description': 'tatame eva 1cm 4 pecas azul 40100014 - mor', 'price': 84.78}
{'description': 'machadinha tomahawk machado taue kh518', 'price': 134.29} {'description': 'machadinha tomahawk machado taue kh518', 'price': 134.29}
{'description': 'caneta limpeza lentes fotograficas', 'price': 59.06} {'description': 'caneta limpeza lentes fotograficas', 'price': 58.51}
{'description': 'rack versatile - branco / vermelho', 'price': 653.33} {'description': 'rack versatile - branco / vermelho', 'price': 596.77}
{'description': 'comoda infantil 4 gavetas doce sonho - qmovi branco', 'price': 256.12} {'description': 'comoda infantil 4 gavetas doce sonho branco - qmovi', 'price': 184.9}
{'description': 'blocos encaixe 10 1 transforca lider 593 pecas xalingo', 'price': 201.2} {'description': 'blocos encaixe 10 1 transforca lider 593 pecas - xalingo', 'price': 190.61}
{'description': 'jogo utensilios aco inox 4 pecas oslo

In [16]:
# Record count of the list loaded from ../data/casasbahia/buscape.json
# (the variable is named `walmart`, but that is the file it was read from).
len(walmart)

10792

In [17]:
# Record count of the list loaded from ../data/shoptime/buscape.json
# (the variable is named `submarino`, but that is the file it was read from).
len(submarino)

11050

In [52]:
# Ask the linker for a score cut-off with recall weighted at 0.1 relative to
# precision. The value it returned in the recorded run (~0.89) is the one
# hard-coded in the linker.match call; this cell ran before that call
# (In [52] vs In [53]) although it sits below it in the notebook.
linker.threshold(p,w, recall_weight=0.1)

INFO:dedupe.blocking:Canopy: TfidfTextSearchPredicate: (0.8, description)
INFO:dedupe.blocking:10000, 2.1180942 seconds
INFO:dedupe.api:0 records
INFO:dedupe.blocking:10000, 0.6363432 seconds
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.185
INFO:dedupe.api:precision: 0.933
INFO:dedupe.api:With threshold: 0.890


0.89040464