Instale o dedupe
- pip install unicode
- pip install future
- pip install dedupe

# Passo 1: Leitura dos dados

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Silence Python warnings and suppress log records up to INFO level while the
# data is being loaded and cleaned.
import logging
import warnings

warnings.simplefilter('ignore')
logging.disable(level=logging.INFO)

In [3]:
# Load the restaurant records, skipping blank lines in the CSV file.
df = pd.read_csv(
    "restaurant.csv",
    skip_blank_lines=True,
)
df.shape

(881, 6)

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,name,addr,city,phone,type,cluster
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,310/246-1501,american,0
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,310-246-1501,steakhouses,0
2,arnie morton,435 s. la cienega boulevard,los angeles,310-246-1501,steakhouses,0
3,art's delicatessen,12224 ventura blvd.,studio city,818/762-1221,american,1
4,art's deli,12224 ventura blvd.,studio city,818-762-1221,delis,1


### Vamos remover a coluna cluster para realizar a clusterização dos dados

In [5]:
# Drop the `cluster` column before deduplicating. Rebinding `df` (instead of
# using `inplace=True`) together with `errors='ignore'` keeps this cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['cluster'], errors='ignore')

### Descrição dos dados

In [6]:
# The `null_counts` keyword was removed from `DataFrame.info` in pandas 2.0
# (renamed to `show_counts`). Non-null counts are printed by default for a
# frame this small, so omitting the keyword works on old and new pandas alike.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 5 columns):
name     881 non-null object
addr     881 non-null object
city     881 non-null object
phone    881 non-null object
type     880 non-null object
dtypes: object(5)
memory usage: 34.5+ KB


In [7]:
# Replace every missing value with an empty string.
df = df.fillna(value='')

# Passo 2: Limpeza dos dados

### Limpeza dos dados

In [8]:
import re

print("Remove irrelevant separators:")
# Matches any character that is not a lowercase letter, a digit or whitespace.
irrelevant_regex = re.compile(r'[^a-z0-9\s]')


print("Remove multi-spaces:")
# Matches runs of two or more whitespace characters.
multispace_regex = re.compile(r'\s\s+')

def assign_no_symbols_name(df):
    """Return a copy of ``df`` whose ``name`` column has symbols removed.

    Every character other than ``a-z``, ``0-9`` and whitespace is replaced by
    a space (so names are expected to be lowercase already), then runs of
    whitespace are collapsed into a single space. The input frame is not
    modified.
    """
    # regex=True is passed explicitly: pandas 2.x defaults to regex=False and
    # rejects a compiled pattern in that mode.
    return df.assign(
        name=df['name']
             .str.replace(irrelevant_regex, ' ', regex=True)
             .str.replace(multispace_regex, ' ', regex=True))

# Apply the symbol clean-up to the names and preview the result.
df = assign_no_symbols_name(df)
df.head()

Remove irrelevant separators:
Remove multi-spaces:


Unnamed: 0,name,addr,city,phone,type
0,arnie morton s of chicago,435 s. la cienega blv.,los angeles,310/246-1501,american
1,arnie morton s of chicago,435 s. la cienega blvd.,los angeles,310-246-1501,steakhouses
2,arnie morton,435 s. la cienega boulevard,los angeles,310-246-1501,steakhouses
3,art s delicatessen,12224 ventura blvd.,studio city,818/762-1221,american
4,art s deli,12224 ventura blvd.,studio city,818-762-1221,delis


### Análise de stopwords

In [9]:
import pprint
from collections import Counter

# Count every whitespace-separated word across all names; the most frequent
# ones are candidates for stop words.
word_counts = Counter()
for restaurant_name in df["name"]:
    word_counts.update(restaurant_name.split())

possible_stopwords = word_counts.most_common(20)
pprint.pprint(possible_stopwords)

[('s', 184),
 ('cafe', 81),
 ('the', 39),
 ('grill', 32),
 ('restaurant', 26),
 ('la', 24),
 ('le', 20),
 ('house', 20),
 ('bar', 19),
 ('of', 15),
 ('bistro', 13),
 ('room', 11),
 ('kitchen', 10),
 ('deli', 9),
 ('club', 9),
 ('and', 9),
 ('ritz', 9),
 ('carlton', 9),
 ('on', 8),
 ('buckhead', 8)]


### Removendo possíveis stop words

In [10]:
def assign_cleaned_name(df):
    """Return a copy of ``df`` with common stop words removed from ``name``.

    Stop words are matched as whole words only. The whitespace left behind is
    collapsed with the module-level ``multispace_regex`` and the result is
    stripped of leading and trailing whitespace. The input frame is not
    modified.
    """
    restaurant_stopwords = {
        's', 'the', 'la', 'le', 'of', 'and', 'on'}
    # Sorted so the generated pattern is identical on every run.
    restaurant_stopwords_regex = r'\b(?:{})\b'.format(
        '|'.join(sorted(restaurant_stopwords)))
    # regex=True is required: pandas 2.x treats a plain string pattern as a
    # literal by default, which would silently remove nothing.
    return df.assign(
        name=df['name']
             .str.replace(restaurant_stopwords_regex, '', regex=True)
             .str.replace(multispace_regex, ' ', regex=True)
             .str.strip())

# Remove the stop words from the names and preview the result.
df = assign_cleaned_name(df)
df.head()

Unnamed: 0,name,addr,city,phone,type
0,arnie morton chicago,435 s. la cienega blv.,los angeles,310/246-1501,american
1,arnie morton chicago,435 s. la cienega blvd.,los angeles,310-246-1501,steakhouses
2,arnie morton,435 s. la cienega boulevard,los angeles,310-246-1501,steakhouses
3,art delicatessen,12224 ventura blvd.,studio city,818/762-1221,american
4,art deli,12224 ventura blvd.,studio city,818-762-1221,delis


# Passo 3: Aplicação do Dedupe

## Usando o Dedupe

- Active Learning Classification

In [11]:
# Lift the logging restriction so log records are emitted again from here on.
import logging

logging.disable(level=logging.NOTSET)

In [12]:
# Field definitions handed to dedupe. Each entry uses the column name both as
# 'field' and as 'variable name'; only `addr` is declared without
# 'has missing'.
fields = [
    {'field': column, 'variable name': column, 'type': kind, **extra}
    for column, kind, extra in (
        ('name', 'ShortString', {'has missing': True}),
        ('addr', 'String', {}),
        ('city', 'ShortString', {'has missing': True}),
        ('type', 'ShortString', {'has missing': True}),
    )
]

Iniciando a instância ``Dedupe``: carregamos as configurações salvas, se existirem; caso contrário, criamos uma nova instância para treinar:

In [13]:
import os
import dedupe

settings_filename = 'dedupe-settings.pickle'

# Build a fresh, trainable Dedupe when no settings file exists; otherwise
# load the previously saved settings into a StaticDedupe.
if not os.path.exists(settings_filename):
    deduper = dedupe.Dedupe(fields, num_cores=4)
else:
    with open(settings_filename, 'rb') as settings_file:
        deduper = dedupe.StaticDedupe(settings_file, num_cores=4)

INFO:dedupe.api:((SimplePredicate: (commonTwoTokens, city), SimplePredicate: (wholeFieldPredicate, name)), (SimplePredicate: (wholeFieldPredicate, name), SimplePredicate: (wholeFieldPredicate, type)))


Adaptando o formato dos dados para o formato do ``Dedupe``:

In [14]:
# One dict per row, keyed by the DataFrame index.
data_for_dedupe = df.to_dict('index')
for record in data_for_dedupe.values():
    # dedupe expects missing values to be None. NaNs were already replaced by
    # '' earlier in the notebook, so blank strings are converted as well;
    # otherwise the fields declared with 'has missing' never see a missing
    # value.
    for k, v in record.items():
        is_nan = isinstance(v, float) and np.isnan(v)
        is_blank = isinstance(v, str) and not v.strip()
        if is_nan or is_blank:
            record[k] = None

Aqui estamos usando uma instância ``Dedupe`` que treinamos antes.
Vamos verificar como foram a entrada e a saída do treinamento:

In [15]:
training_input_output = 'training-input-output.txt'

# Show the first 114 lines of the recorded training session, if the
# transcript file is present; the trailing '...' is always printed.
if os.path.exists(training_input_output):
    with open(training_input_output) as transcript:
        first_lines = transcript.read().split('\n')[:114]
    print('\n'.join(first_lines))
print('...')

...


In [16]:
# Only a trainable deduper goes through the interactive session below; a
# StaticDedupe was built from saved settings and is used as-is.
if not isinstance(deduper, dedupe.StaticDedupe):
    training_filename = 'dedupe-slides-training.json'

    deduper.sample(data_for_dedupe)

    # Feed in the training file from a previous session, when there is one.
    if os.path.exists(training_filename):
        with open(training_filename) as training_file:
            deduper.readTraining(training_file)

    # Interactive labelling in the console.
    dedupe.consoleLabel(deduper)

    # Write the training data out before training.
    with open(training_filename, 'w') as training_file:
        deduper.writeTraining(training_file)

    deduper.train()

    # Save the settings so later runs can take the StaticDedupe path.
    with open(settings_filename, 'wb') as settings_file:
        deduper.writeSettings(settings_file)

Você pode verificar o treinamento completo em training-input-output.txt.

Se você quiser treiná-lo novamente, execute `rm dedupe-settings.pickle dedupe-slides-training.json` e rode de novo toda a seção de Active Learning.

Após o treinamento, podemos ver quais ``blocking predicates`` (regras de indexação) o Dedupe aprendeu com nossas informações de treinamento.
É bom fazer isso para verificar se treinamos o suficiente:

In [17]:
# Blocking predicates currently held by the deduper.
deduper.predicates

((SimplePredicate: (commonTwoTokens, city),
  SimplePredicate: (wholeFieldPredicate, name)),
 (SimplePredicate: (wholeFieldPredicate, name),
  SimplePredicate: (wholeFieldPredicate, type)))

In [18]:
# All predicates the data model exposes for the configured fields.
deduper.data_model.predicates()

{ExistsPredicate: (Exists, city),
 ExistsPredicate: (Exists, name),
 ExistsPredicate: (Exists, type),
 LevenshteinCanopyPredicate: (1, addr),
 LevenshteinCanopyPredicate: (1, city),
 LevenshteinCanopyPredicate: (1, name),
 LevenshteinCanopyPredicate: (1, type),
 LevenshteinCanopyPredicate: (2, addr),
 LevenshteinCanopyPredicate: (2, city),
 LevenshteinCanopyPredicate: (2, name),
 LevenshteinCanopyPredicate: (2, type),
 LevenshteinCanopyPredicate: (3, addr),
 LevenshteinCanopyPredicate: (3, city),
 LevenshteinCanopyPredicate: (3, name),
 LevenshteinCanopyPredicate: (3, type),
 LevenshteinCanopyPredicate: (4, addr),
 LevenshteinCanopyPredicate: (4, city),
 LevenshteinCanopyPredicate: (4, name),
 LevenshteinCanopyPredicate: (4, type),
 SimplePredicate: (alphaNumericPredicate, addr),
 SimplePredicate: (alphaNumericPredicate, city),
 SimplePredicate: (alphaNumericPredicate, name),
 SimplePredicate: (alphaNumericPredicate, type),
 SimplePredicate: (commonFourGram, addr),
 SimplePredicate: (c

Para prosseguir com a deduplication, calculamos o ``clustering threshold``

In [None]:
import itertools

threshold = deduper.threshold(data_for_dedupe, recall_weight=1)
clustered_dupes = deduper.match(data_for_dedupe, threshold)

# Each entry of clustered_dupes unpacks into a cluster plus a second value
# that is not needed here (explained later). Every pair of records inside a
# cluster is collected as a found duplicate pair.
dedupe_found_pairs_set = {
    tuple(pair)
    for cluster, _unused in clustered_dupes
    for pair in itertools.combinations(cluster, 2)
}



### Exercício

- **Passo 1**: Leitura
- **Passo 2**: Limpeza dos dados
- **Passo 3**: Aplicação do Dedupe para gerar clusters em um novo conjunto de dados

# Passo 1

# Leitura dos dados fake

In [3]:
# Load the mock data set used in the exercise.
df_test = pd.read_csv('MOCK_DATA.csv')
df_test.shape

(500, 7)

In [5]:
# Preview the first rows of the mock data.
df_test.head()

Unnamed: 0,id,first_name,last_name,email,product,Fraud,price
0,1,Gan,Renachowski,grenachowski0@ifeng.com,"Plate - Foam, Bread And Butter",True,$775.18
1,2,Deane,Telford,dtelford1@about.me,Ecolab Silver Fusion,True,$461.56
2,3,Clarice,Bleby,cbleby2@dedecms.com,Pepper - Chilli Seeds Mild,False,$482.56
3,4,Berri,Dovington,bdovington3@mac.com,Cheese - Camembert,False,$799.53
4,5,Jacinda,,jfigin4@wiley.com,Beef - Tender Tips,True,$867.33


In [6]:
# Number of rows for each value of the `Fraud` column.
df_test['Fraud'].value_counts()

True     268
False    232
Name: Fraud, dtype: int64

# Passo 2

# Limpeza dos dados fake

# Passo 3

##### Aplicação do dedupe nos dados Fake.
###### - Gere clusters dos clientes

### Referências:
    
-    https://www.cs.utexas.edu/users/ml/riddle/data.html
-    https://github.com/vintasoftware/deduplication-slides       

# Final